{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6920914137408983,
"eval_steps": 100,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023069713791363275,
"grad_norm": 70.983255945064,
"learning_rate": 4e-09,
"logits/chosen": -1.6907414197921753,
"logits/rejected": -1.6978764533996582,
"logps/chosen": -135.08778381347656,
"logps/rejected": -140.00140380859375,
"loss": 0.6978,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.010493194684386253,
"rewards/margins": 0.006632559932768345,
"rewards/rejected": 0.003860633820295334,
"step": 2
},
{
"epoch": 0.004613942758272655,
"grad_norm": 79.97063682582153,
"learning_rate": 8e-09,
"logits/chosen": -1.6330227851867676,
"logits/rejected": -1.7231806516647339,
"logps/chosen": -197.88365173339844,
"logps/rejected": -218.62255859375,
"loss": 0.6925,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.008352389559149742,
"rewards/margins": -0.00352578517049551,
"rewards/rejected": 0.011878175660967827,
"step": 4
},
{
"epoch": 0.006920914137408983,
"grad_norm": 78.73277588414827,
"learning_rate": 1.1999999999999998e-08,
"logits/chosen": -1.7628690004348755,
"logits/rejected": -1.6921380758285522,
"logps/chosen": -181.12741088867188,
"logps/rejected": -177.64956665039062,
"loss": 0.6919,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.005539673380553722,
"rewards/margins": -0.012507464736700058,
"rewards/rejected": 0.006967790424823761,
"step": 6
},
{
"epoch": 0.00922788551654531,
"grad_norm": 84.94475101410946,
"learning_rate": 1.6e-08,
"logits/chosen": -1.6862337589263916,
"logits/rejected": -1.6957104206085205,
"logps/chosen": -229.57574462890625,
"logps/rejected": -308.63421630859375,
"loss": 0.6949,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.019691964611411095,
"rewards/margins": 0.022208284586668015,
"rewards/rejected": -0.0025163227692246437,
"step": 8
},
{
"epoch": 0.011534856895681638,
"grad_norm": 78.53713189911603,
"learning_rate": 2e-08,
"logits/chosen": -1.72577965259552,
"logits/rejected": -1.72530198097229,
"logps/chosen": -182.21597290039062,
"logps/rejected": -197.15383911132812,
"loss": 0.6876,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.00015587342204526067,
"rewards/margins": -0.00128166563808918,
"rewards/rejected": 0.0011257934384047985,
"step": 10
},
{
"epoch": 0.013841828274817966,
"grad_norm": 60.639642502259356,
"learning_rate": 2.3999999999999997e-08,
"logits/chosen": -1.5887395143508911,
"logits/rejected": -1.7574589252471924,
"logps/chosen": -121.71543884277344,
"logps/rejected": -164.58782958984375,
"loss": 0.6915,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.004625019151717424,
"rewards/margins": 0.00861622579395771,
"rewards/rejected": -0.003991207107901573,
"step": 12
},
{
"epoch": 0.016148799653954292,
"grad_norm": 86.96054524483631,
"learning_rate": 2.8000000000000003e-08,
"logits/chosen": -1.5507514476776123,
"logits/rejected": -1.5499210357666016,
"logps/chosen": -147.94631958007812,
"logps/rejected": -200.87417602539062,
"loss": 0.6926,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.013226826675236225,
"rewards/margins": 0.021001461893320084,
"rewards/rejected": -0.00777463661506772,
"step": 14
},
{
"epoch": 0.01845577103309062,
"grad_norm": 77.14253037311525,
"learning_rate": 3.2e-08,
"logits/chosen": -1.6721559762954712,
"logits/rejected": -1.7068090438842773,
"logps/chosen": -157.89622497558594,
"logps/rejected": -199.2628631591797,
"loss": 0.6951,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.0034806535113602877,
"rewards/margins": 0.006839222740381956,
"rewards/rejected": -0.010319876484572887,
"step": 16
},
{
"epoch": 0.020762742412226948,
"grad_norm": 78.17299743340894,
"learning_rate": 3.6e-08,
"logits/chosen": -1.6556451320648193,
"logits/rejected": -1.7276983261108398,
"logps/chosen": -135.32223510742188,
"logps/rejected": -158.1257781982422,
"loss": 0.6884,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.01184301357716322,
"rewards/margins": 0.023063620552420616,
"rewards/rejected": -0.011220606043934822,
"step": 18
},
{
"epoch": 0.023069713791363276,
"grad_norm": 77.30701284634611,
"learning_rate": 4e-08,
"logits/chosen": -1.735813856124878,
"logits/rejected": -1.789333701133728,
"logps/chosen": -157.4953155517578,
"logps/rejected": -186.05862426757812,
"loss": 0.693,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.004054094199091196,
"rewards/margins": 0.010992627590894699,
"rewards/rejected": -0.006938533391803503,
"step": 20
},
{
"epoch": 0.025376685170499604,
"grad_norm": 77.95778126726918,
"learning_rate": 4.4e-08,
"logits/chosen": -1.5402836799621582,
"logits/rejected": -1.5884689092636108,
"logps/chosen": -133.27554321289062,
"logps/rejected": -170.48594665527344,
"loss": 0.6944,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.009935155510902405,
"rewards/margins": 0.019674377515912056,
"rewards/rejected": -0.009739222005009651,
"step": 22
},
{
"epoch": 0.02768365654963593,
"grad_norm": 78.69027024475307,
"learning_rate": 4.799999999999999e-08,
"logits/chosen": -1.4641175270080566,
"logits/rejected": -1.6491130590438843,
"logps/chosen": -139.53375244140625,
"logps/rejected": -193.21438598632812,
"loss": 0.6945,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.010036014020442963,
"rewards/margins": 0.005420446861535311,
"rewards/rejected": -0.015456462278962135,
"step": 24
},
{
"epoch": 0.02999062792877226,
"grad_norm": 82.14181039722058,
"learning_rate": 5.2e-08,
"logits/chosen": -1.7350623607635498,
"logits/rejected": -1.6639574766159058,
"logps/chosen": -161.62164306640625,
"logps/rejected": -160.58840942382812,
"loss": 0.6926,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0011872733011841774,
"rewards/margins": -0.006157425232231617,
"rewards/rejected": 0.0049701533280313015,
"step": 26
},
{
"epoch": 0.032297599307908584,
"grad_norm": 73.66413843287592,
"learning_rate": 5.6000000000000005e-08,
"logits/chosen": -1.4280009269714355,
"logits/rejected": -1.6318210363388062,
"logps/chosen": -131.8518524169922,
"logps/rejected": -166.72335815429688,
"loss": 0.6964,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.01029287837445736,
"rewards/margins": -0.014272996224462986,
"rewards/rejected": 0.003980117850005627,
"step": 28
},
{
"epoch": 0.034604570687044915,
"grad_norm": 86.40461346238433,
"learning_rate": 6e-08,
"logits/chosen": -1.6838908195495605,
"logits/rejected": -1.7304034233093262,
"logps/chosen": -124.49476623535156,
"logps/rejected": -140.77841186523438,
"loss": 0.6931,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0041793473064899445,
"rewards/margins": -0.0030185298528522253,
"rewards/rejected": 0.007197877857834101,
"step": 30
},
{
"epoch": 0.03691154206618124,
"grad_norm": 79.23808836252253,
"learning_rate": 6.4e-08,
"logits/chosen": -1.5962588787078857,
"logits/rejected": -1.549019455909729,
"logps/chosen": -194.4256591796875,
"logps/rejected": -237.36117553710938,
"loss": 0.7003,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.0013054789742454886,
"rewards/margins": -0.018302934244275093,
"rewards/rejected": 0.016997454687952995,
"step": 32
},
{
"epoch": 0.03921851344531757,
"grad_norm": 74.59682312018288,
"learning_rate": 6.8e-08,
"logits/chosen": -1.740609884262085,
"logits/rejected": -1.6540827751159668,
"logps/chosen": -157.9348907470703,
"logps/rejected": -158.6222686767578,
"loss": 0.6889,
"rewards/accuracies": 0.78125,
"rewards/chosen": 0.01421279925853014,
"rewards/margins": 0.03194922208786011,
"rewards/rejected": -0.017736420035362244,
"step": 34
},
{
"epoch": 0.041525484824453895,
"grad_norm": 79.71071945309694,
"learning_rate": 7.2e-08,
"logits/chosen": -1.5709240436553955,
"logits/rejected": -1.6920911073684692,
"logps/chosen": -173.94007873535156,
"logps/rejected": -219.2288818359375,
"loss": 0.6962,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.011220686137676239,
"rewards/margins": -0.008418139070272446,
"rewards/rejected": -0.0028025463689118624,
"step": 36
},
{
"epoch": 0.04383245620359023,
"grad_norm": 86.21753265595521,
"learning_rate": 7.599999999999999e-08,
"logits/chosen": -1.7426936626434326,
"logits/rejected": -1.5694864988327026,
"logps/chosen": -159.00730895996094,
"logps/rejected": -146.18772888183594,
"loss": 0.6971,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.01584451086819172,
"rewards/margins": -0.012521232478320599,
"rewards/rejected": -0.003323277225717902,
"step": 38
},
{
"epoch": 0.04613942758272655,
"grad_norm": 76.87714834723751,
"learning_rate": 8e-08,
"logits/chosen": -1.7129234075546265,
"logits/rejected": -1.6838488578796387,
"logps/chosen": -191.7474822998047,
"logps/rejected": -162.81466674804688,
"loss": 0.6986,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0020668436773121357,
"rewards/margins": -0.008048251271247864,
"rewards/rejected": 0.010115095414221287,
"step": 40
},
{
"epoch": 0.04844639896186288,
"grad_norm": 71.62490665795943,
"learning_rate": 8.4e-08,
"logits/chosen": -1.7202041149139404,
"logits/rejected": -1.7000675201416016,
"logps/chosen": -157.0576934814453,
"logps/rejected": -186.76138305664062,
"loss": 0.6889,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.0023848186247050762,
"rewards/margins": 0.002543981885537505,
"rewards/rejected": -0.00015916326083242893,
"step": 42
},
{
"epoch": 0.05075337034099921,
"grad_norm": 81.75722048702342,
"learning_rate": 8.8e-08,
"logits/chosen": -1.5299909114837646,
"logits/rejected": -1.5974533557891846,
"logps/chosen": -158.53753662109375,
"logps/rejected": -179.9112548828125,
"loss": 0.6924,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.006203922443091869,
"rewards/margins": 0.02108878269791603,
"rewards/rejected": -0.014884857460856438,
"step": 44
},
{
"epoch": 0.05306034172013553,
"grad_norm": 78.9572151515279,
"learning_rate": 9.2e-08,
"logits/chosen": -1.6293164491653442,
"logits/rejected": -1.6444960832595825,
"logps/chosen": -152.41412353515625,
"logps/rejected": -170.09869384765625,
"loss": 0.6925,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.010927575640380383,
"rewards/margins": 0.0014875519555062056,
"rewards/rejected": 0.009440025314688683,
"step": 46
},
{
"epoch": 0.05536731309927186,
"grad_norm": 81.84986800244135,
"learning_rate": 9.599999999999999e-08,
"logits/chosen": -1.7789497375488281,
"logits/rejected": -1.7547317743301392,
"logps/chosen": -214.71987915039062,
"logps/rejected": -240.7999725341797,
"loss": 0.6889,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0025361552834510803,
"rewards/margins": 0.006769699975848198,
"rewards/rejected": -0.004233543295413256,
"step": 48
},
{
"epoch": 0.05767428447840819,
"grad_norm": 76.76668544313137,
"learning_rate": 1e-07,
"logits/chosen": -1.491020917892456,
"logits/rejected": -1.5447454452514648,
"logps/chosen": -201.1317901611328,
"logps/rejected": -267.04248046875,
"loss": 0.6974,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.005572921596467495,
"rewards/margins": 0.01456289179623127,
"rewards/rejected": -0.02013581432402134,
"step": 50
},
{
"epoch": 0.05998125585754452,
"grad_norm": 84.23078533480681,
"learning_rate": 1.04e-07,
"logits/chosen": -1.6120061874389648,
"logits/rejected": -1.7649461030960083,
"logps/chosen": -183.1412353515625,
"logps/rejected": -282.4610290527344,
"loss": 0.695,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.02553856186568737,
"rewards/margins": -0.038517288863658905,
"rewards/rejected": 0.012978724204003811,
"step": 52
},
{
"epoch": 0.06228822723668084,
"grad_norm": 92.0173388072239,
"learning_rate": 1.08e-07,
"logits/chosen": -1.4607347249984741,
"logits/rejected": -1.625195026397705,
"logps/chosen": -152.8062744140625,
"logps/rejected": -219.1183624267578,
"loss": 0.6938,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.012677345424890518,
"rewards/margins": -0.032261885702610016,
"rewards/rejected": 0.01958453841507435,
"step": 54
},
{
"epoch": 0.06459519861581717,
"grad_norm": 79.32388836348366,
"learning_rate": 1.1200000000000001e-07,
"logits/chosen": -1.6375060081481934,
"logits/rejected": -1.5966099500656128,
"logps/chosen": -223.85108947753906,
"logps/rejected": -256.30072021484375,
"loss": 0.699,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.023084305226802826,
"rewards/margins": -0.02949170023202896,
"rewards/rejected": 0.006407391745597124,
"step": 56
},
{
"epoch": 0.0669021699949535,
"grad_norm": 69.72321013611293,
"learning_rate": 1.1599999999999999e-07,
"logits/chosen": -1.5106103420257568,
"logits/rejected": -1.596940517425537,
"logps/chosen": -188.92218017578125,
"logps/rejected": -275.0534973144531,
"loss": 0.6941,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.017192328348755836,
"rewards/margins": 0.03240702301263809,
"rewards/rejected": -0.015214694663882256,
"step": 58
},
{
"epoch": 0.06920914137408983,
"grad_norm": 72.64787237644839,
"learning_rate": 1.2e-07,
"logits/chosen": -1.667373776435852,
"logits/rejected": -1.6947064399719238,
"logps/chosen": -103.05116271972656,
"logps/rejected": -151.50807189941406,
"loss": 0.6928,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.00641946354880929,
"rewards/margins": 0.032484300434589386,
"rewards/rejected": -0.026064833626151085,
"step": 60
},
{
"epoch": 0.07151611275322615,
"grad_norm": 75.24112563737103,
"learning_rate": 1.24e-07,
"logits/chosen": -1.5305185317993164,
"logits/rejected": -1.6095894575119019,
"logps/chosen": -154.5998077392578,
"logps/rejected": -176.33970642089844,
"loss": 0.6965,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.00010838371235877275,
"rewards/margins": 0.006697986274957657,
"rewards/rejected": -0.006589602679014206,
"step": 62
},
{
"epoch": 0.07382308413236248,
"grad_norm": 70.73819169103417,
"learning_rate": 1.28e-07,
"logits/chosen": -1.6058859825134277,
"logits/rejected": -1.6498842239379883,
"logps/chosen": -151.62286376953125,
"logps/rejected": -169.5069122314453,
"loss": 0.6927,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.003975578583776951,
"rewards/margins": 0.006644865497946739,
"rewards/rejected": -0.0026692876126617193,
"step": 64
},
{
"epoch": 0.0761300555114988,
"grad_norm": 79.80905942384086,
"learning_rate": 1.32e-07,
"logits/chosen": -1.566676139831543,
"logits/rejected": -1.635036587715149,
"logps/chosen": -213.95590209960938,
"logps/rejected": -268.84747314453125,
"loss": 0.6878,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0030877357348799706,
"rewards/margins": -0.004700601100921631,
"rewards/rejected": 0.0016128652496263385,
"step": 66
},
{
"epoch": 0.07843702689063514,
"grad_norm": 68.290418726697,
"learning_rate": 1.36e-07,
"logits/chosen": -1.625745177268982,
"logits/rejected": -1.7014200687408447,
"logps/chosen": -191.14842224121094,
"logps/rejected": -222.51779174804688,
"loss": 0.6954,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.0007452260470017791,
"rewards/margins": 0.00873138289898634,
"rewards/rejected": -0.007986157201230526,
"step": 68
},
{
"epoch": 0.08074399826977147,
"grad_norm": 79.51788387191074,
"learning_rate": 1.3999999999999998e-07,
"logits/chosen": -1.5817363262176514,
"logits/rejected": -1.6993348598480225,
"logps/chosen": -131.7422637939453,
"logps/rejected": -162.98304748535156,
"loss": 0.6941,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.0074032689444720745,
"rewards/margins": -0.010667804628610611,
"rewards/rejected": 0.003264536615461111,
"step": 70
},
{
"epoch": 0.08305096964890779,
"grad_norm": 83.37639460390962,
"learning_rate": 1.44e-07,
"logits/chosen": -1.5910240411758423,
"logits/rejected": -1.6661615371704102,
"logps/chosen": -181.304931640625,
"logps/rejected": -209.23526000976562,
"loss": 0.6943,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.0042681945487856865,
"rewards/margins": -0.02247396856546402,
"rewards/rejected": 0.018205774948000908,
"step": 72
},
{
"epoch": 0.08535794102804412,
"grad_norm": 85.04605551266509,
"learning_rate": 1.48e-07,
"logits/chosen": -1.5876970291137695,
"logits/rejected": -1.7304015159606934,
"logps/chosen": -146.5454559326172,
"logps/rejected": -182.179931640625,
"loss": 0.6891,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.011930807493627071,
"rewards/margins": -0.013042710721492767,
"rewards/rejected": 0.0011119036935269833,
"step": 74
},
{
"epoch": 0.08766491240718045,
"grad_norm": 71.97159401776734,
"learning_rate": 1.5199999999999998e-07,
"logits/chosen": -1.6893718242645264,
"logits/rejected": -1.6752575635910034,
"logps/chosen": -163.57489013671875,
"logps/rejected": -162.34803771972656,
"loss": 0.6906,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.0007063052617013454,
"rewards/margins": 0.002567308023571968,
"rewards/rejected": -0.0018610022962093353,
"step": 76
},
{
"epoch": 0.08997188378631678,
"grad_norm": 80.09910611023234,
"learning_rate": 1.56e-07,
"logits/chosen": -1.612238883972168,
"logits/rejected": -1.5296804904937744,
"logps/chosen": -143.39723205566406,
"logps/rejected": -165.65318298339844,
"loss": 0.6871,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.009035947732627392,
"rewards/margins": 0.025550464168190956,
"rewards/rejected": -0.01651451550424099,
"step": 78
},
{
"epoch": 0.0922788551654531,
"grad_norm": 70.85997602518799,
"learning_rate": 1.6e-07,
"logits/chosen": -1.6167306900024414,
"logits/rejected": -1.720908522605896,
"logps/chosen": -137.2986602783203,
"logps/rejected": -246.95404052734375,
"loss": 0.693,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.006022963672876358,
"rewards/margins": 0.008436123840510845,
"rewards/rejected": -0.0024131599348038435,
"step": 80
},
{
"epoch": 0.09458582654458943,
"grad_norm": 73.43468397188529,
"learning_rate": 1.6399999999999999e-07,
"logits/chosen": -1.7178070545196533,
"logits/rejected": -1.7651526927947998,
"logps/chosen": -154.39617919921875,
"logps/rejected": -187.47491455078125,
"loss": 0.6985,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.014386245980858803,
"rewards/margins": -0.011840756982564926,
"rewards/rejected": -0.002545490860939026,
"step": 82
},
{
"epoch": 0.09689279792372577,
"grad_norm": 82.20956953972079,
"learning_rate": 1.68e-07,
"logits/chosen": -1.5902974605560303,
"logits/rejected": -1.604806900024414,
"logps/chosen": -127.57711029052734,
"logps/rejected": -146.8506317138672,
"loss": 0.6887,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.006446457467973232,
"rewards/margins": 0.008786877617239952,
"rewards/rejected": -0.015233333222568035,
"step": 84
},
{
"epoch": 0.09919976930286209,
"grad_norm": 76.58342600254258,
"learning_rate": 1.7199999999999998e-07,
"logits/chosen": -1.6294573545455933,
"logits/rejected": -1.6458450555801392,
"logps/chosen": -248.47845458984375,
"logps/rejected": -246.7737579345703,
"loss": 0.6895,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.009375479072332382,
"rewards/margins": 0.025924015790224075,
"rewards/rejected": -0.03529949486255646,
"step": 86
},
{
"epoch": 0.10150674068199841,
"grad_norm": 73.05738349044326,
"learning_rate": 1.76e-07,
"logits/chosen": -1.84279465675354,
"logits/rejected": -1.7476646900177002,
"logps/chosen": -153.08554077148438,
"logps/rejected": -154.7803497314453,
"loss": 0.6909,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.002370176836848259,
"rewards/margins": -0.0033320121001452208,
"rewards/rejected": 0.0009618350304663181,
"step": 88
},
{
"epoch": 0.10381371206113474,
"grad_norm": 77.95992122604585,
"learning_rate": 1.8e-07,
"logits/chosen": -1.6355715990066528,
"logits/rejected": -1.6984450817108154,
"logps/chosen": -169.7340087890625,
"logps/rejected": -185.59031677246094,
"loss": 0.6929,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.01826353184878826,
"rewards/margins": -0.008648518472909927,
"rewards/rejected": -0.009615011513233185,
"step": 90
},
{
"epoch": 0.10612068344027106,
"grad_norm": 74.70608110655593,
"learning_rate": 1.84e-07,
"logits/chosen": -1.5153872966766357,
"logits/rejected": -1.5389485359191895,
"logps/chosen": -214.6402130126953,
"logps/rejected": -224.4317626953125,
"loss": 0.6877,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.00619399081915617,
"rewards/margins": 0.011179441586136818,
"rewards/rejected": -0.004985451698303223,
"step": 92
},
{
"epoch": 0.1084276548194074,
"grad_norm": 74.73028912491218,
"learning_rate": 1.88e-07,
"logits/chosen": -1.5655710697174072,
"logits/rejected": -1.5568063259124756,
"logps/chosen": -170.8540496826172,
"logps/rejected": -195.4169158935547,
"loss": 0.6887,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.016467105597257614,
"rewards/margins": 0.011973596177995205,
"rewards/rejected": -0.028440698981285095,
"step": 94
},
{
"epoch": 0.11073462619854373,
"grad_norm": 81.65007402302682,
"learning_rate": 1.9199999999999997e-07,
"logits/chosen": -1.7869484424591064,
"logits/rejected": -1.7626042366027832,
"logps/chosen": -208.46002197265625,
"logps/rejected": -256.5970458984375,
"loss": 0.6846,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.009895925410091877,
"rewards/margins": 0.011812476441264153,
"rewards/rejected": -0.021708402782678604,
"step": 96
},
{
"epoch": 0.11304159757768005,
"grad_norm": 77.09604224965807,
"learning_rate": 1.9599999999999998e-07,
"logits/chosen": -1.6421016454696655,
"logits/rejected": -1.5873386859893799,
"logps/chosen": -176.68853759765625,
"logps/rejected": -188.64544677734375,
"loss": 0.69,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.01306343637406826,
"rewards/margins": 0.021781262010335922,
"rewards/rejected": -0.03484470024704933,
"step": 98
},
{
"epoch": 0.11534856895681637,
"grad_norm": 77.37699461097638,
"learning_rate": 2e-07,
"logits/chosen": -1.5747566223144531,
"logits/rejected": -1.5757074356079102,
"logps/chosen": -146.1053009033203,
"logps/rejected": -177.65733337402344,
"loss": 0.6926,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.01261211559176445,
"rewards/margins": 0.005542438477277756,
"rewards/rejected": -0.018154552206397057,
"step": 100
},
{
"epoch": 0.11534856895681637,
"eval_logits/chosen": -1.592301368713379,
"eval_logits/rejected": -1.4917248487472534,
"eval_logps/chosen": -185.32534790039062,
"eval_logps/rejected": -150.51693725585938,
"eval_loss": 0.6938029527664185,
"eval_rewards/accuracies": 0.4000000059604645,
"eval_rewards/chosen": -0.014318165369331837,
"eval_rewards/margins": -0.014164167456328869,
"eval_rewards/rejected": -0.00015399709809571505,
"eval_runtime": 22.8572,
"eval_samples_per_second": 4.375,
"eval_steps_per_second": 1.094,
"step": 100
},
{
"epoch": 0.11765554033595271,
"grad_norm": 83.18929756458088,
"learning_rate": 1.9999925887938156e-07,
"logits/chosen": -1.553455114364624,
"logits/rejected": -1.6009831428527832,
"logps/chosen": -171.79664611816406,
"logps/rejected": -223.1472930908203,
"loss": 0.6923,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.0011008224682882428,
"rewards/margins": 0.010834511369466782,
"rewards/rejected": -0.009733689948916435,
"step": 102
},
{
"epoch": 0.11996251171508904,
"grad_norm": 73.85193862190509,
"learning_rate": 1.9999703552851146e-07,
"logits/chosen": -1.7583006620407104,
"logits/rejected": -1.714582920074463,
"logps/chosen": -209.88302612304688,
"logps/rejected": -255.11888122558594,
"loss": 0.6913,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.030152078717947006,
"rewards/margins": -0.011741320602595806,
"rewards/rejected": -0.018410757184028625,
"step": 104
},
{
"epoch": 0.12226948309422536,
"grad_norm": 73.38727733625704,
"learning_rate": 1.9999332998034512e-07,
"logits/chosen": -1.6966747045516968,
"logits/rejected": -1.6100220680236816,
"logps/chosen": -160.12281799316406,
"logps/rejected": -167.38145446777344,
"loss": 0.686,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.008994976989924908,
"rewards/margins": 0.011080076918005943,
"rewards/rejected": -0.020075054839253426,
"step": 106
},
{
"epoch": 0.12457645447336169,
"grad_norm": 79.80728796393491,
"learning_rate": 1.9998814228980768e-07,
"logits/chosen": -1.6656932830810547,
"logits/rejected": -1.7435060739517212,
"logps/chosen": -156.0963897705078,
"logps/rejected": -208.7286376953125,
"loss": 0.6956,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.01158140879124403,
"rewards/margins": -0.003068419173359871,
"rewards/rejected": -0.008512990549206734,
"step": 108
},
{
"epoch": 0.126883425852498,
"grad_norm": 71.572270187751,
"learning_rate": 1.9998147253379324e-07,
"logits/chosen": -1.7250394821166992,
"logits/rejected": -1.720632791519165,
"logps/chosen": -143.606201171875,
"logps/rejected": -164.64126586914062,
"loss": 0.6911,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.00029725395143032074,
"rewards/margins": 0.004560052417218685,
"rewards/rejected": -0.004857306368649006,
"step": 110
},
{
"epoch": 0.12919039723163434,
"grad_norm": 76.7363281665151,
"learning_rate": 1.999733208111637e-07,
"logits/chosen": -1.680725336074829,
"logits/rejected": -1.7269576787948608,
"logps/chosen": -141.92416381835938,
"logps/rejected": -163.63902282714844,
"loss": 0.6878,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.013096440583467484,
"rewards/margins": 0.013841900043189526,
"rewards/rejected": -0.026938341557979584,
"step": 112
},
{
"epoch": 0.13149736861077066,
"grad_norm": 71.23095597503095,
"learning_rate": 1.9996368724274726e-07,
"logits/chosen": -1.7746036052703857,
"logits/rejected": -1.651440978050232,
"logps/chosen": -201.5678253173828,
"logps/rejected": -208.96060180664062,
"loss": 0.6814,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.020835982635617256,
"rewards/margins": 0.0012882971204817295,
"rewards/rejected": -0.02212427742779255,
"step": 114
},
{
"epoch": 0.133804339989907,
"grad_norm": 78.35820889872501,
"learning_rate": 1.999525719713366e-07,
"logits/chosen": -1.6184967756271362,
"logits/rejected": -1.6276531219482422,
"logps/chosen": -138.03579711914062,
"logps/rejected": -156.24818420410156,
"loss": 0.6893,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.024172522127628326,
"rewards/margins": -0.007286247797310352,
"rewards/rejected": -0.0168862733989954,
"step": 116
},
{
"epoch": 0.13611131136904334,
"grad_norm": 73.30378065799947,
"learning_rate": 1.9993997516168685e-07,
"logits/chosen": -1.5095572471618652,
"logits/rejected": -1.4317773580551147,
"logps/chosen": -168.31259155273438,
"logps/rejected": -181.01010131835938,
"loss": 0.6946,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.02184070646762848,
"rewards/margins": -0.0038399603217840195,
"rewards/rejected": -0.01800074614584446,
"step": 118
},
{
"epoch": 0.13841828274817966,
"grad_norm": 76.67682193401895,
"learning_rate": 1.9992589700051315e-07,
"logits/chosen": -1.6505416631698608,
"logits/rejected": -1.6528055667877197,
"logps/chosen": -163.4833221435547,
"logps/rejected": -173.32627868652344,
"loss": 0.691,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.037532731890678406,
"rewards/margins": -0.001648992532864213,
"rewards/rejected": -0.035883739590644836,
"step": 120
},
{
"epoch": 0.14072525412731599,
"grad_norm": 79.25219710625622,
"learning_rate": 1.9991033769648782e-07,
"logits/chosen": -1.6732072830200195,
"logits/rejected": -1.6914747953414917,
"logps/chosen": -192.20941162109375,
"logps/rejected": -249.57064819335938,
"loss": 0.6785,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02337605319917202,
"rewards/margins": 0.03376854211091995,
"rewards/rejected": -0.05714459717273712,
"step": 122
},
{
"epoch": 0.1430322255064523,
"grad_norm": 74.1794316582206,
"learning_rate": 1.9989329748023723e-07,
"logits/chosen": -1.6055612564086914,
"logits/rejected": -1.6374058723449707,
"logps/chosen": -150.5140838623047,
"logps/rejected": -178.90463256835938,
"loss": 0.6838,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.03467298671603203,
"rewards/margins": 0.017992481589317322,
"rewards/rejected": -0.05266546458005905,
"step": 124
},
{
"epoch": 0.14533919688558863,
"grad_norm": 76.65585990685855,
"learning_rate": 1.9987477660433854e-07,
"logits/chosen": -1.6969408988952637,
"logits/rejected": -1.7563108205795288,
"logps/chosen": -142.4885711669922,
"logps/rejected": -210.4172821044922,
"loss": 0.689,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.024329736828804016,
"rewards/margins": 0.004793995060026646,
"rewards/rejected": -0.029123730957508087,
"step": 126
},
{
"epoch": 0.14764616826472496,
"grad_norm": 78.42733649878224,
"learning_rate": 1.998547753433158e-07,
"logits/chosen": -1.6231815814971924,
"logits/rejected": -1.4993540048599243,
"logps/chosen": -248.5255584716797,
"logps/rejected": -283.0628662109375,
"loss": 0.6867,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.022915348410606384,
"rewards/margins": 0.04015136882662773,
"rewards/rejected": -0.06306671351194382,
"step": 128
},
{
"epoch": 0.14995313964386128,
"grad_norm": 81.34489884736102,
"learning_rate": 1.9983329399363594e-07,
"logits/chosen": -1.696123719215393,
"logits/rejected": -1.5894306898117065,
"logps/chosen": -157.25205993652344,
"logps/rejected": -169.28765869140625,
"loss": 0.6888,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02339826337993145,
"rewards/margins": 0.029402071610093117,
"rewards/rejected": -0.05280033499002457,
"step": 130
},
{
"epoch": 0.1522601110229976,
"grad_norm": 71.86675386697708,
"learning_rate": 1.998103328737044e-07,
"logits/chosen": -1.614111304283142,
"logits/rejected": -1.668984055519104,
"logps/chosen": -169.32870483398438,
"logps/rejected": -184.28652954101562,
"loss": 0.694,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.032653309404850006,
"rewards/margins": -0.012304544448852539,
"rewards/rejected": -0.020348764955997467,
"step": 132
},
{
"epoch": 0.15456708240213396,
"grad_norm": 81.85224601265395,
"learning_rate": 1.9978589232386034e-07,
"logits/chosen": -1.715609073638916,
"logits/rejected": -1.7786422967910767,
"logps/chosen": -167.58688354492188,
"logps/rejected": -199.66270446777344,
"loss": 0.6864,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03811431676149368,
"rewards/margins": 0.017543859779834747,
"rewards/rejected": -0.05565817654132843,
"step": 134
},
{
"epoch": 0.15687405378127028,
"grad_norm": 73.7622516118343,
"learning_rate": 1.9975997270637168e-07,
"logits/chosen": -1.6321560144424438,
"logits/rejected": -1.7015608549118042,
"logps/chosen": -159.351318359375,
"logps/rejected": -176.723876953125,
"loss": 0.6859,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02714327722787857,
"rewards/margins": 0.021138466894626617,
"rewards/rejected": -0.04828174412250519,
"step": 136
},
{
"epoch": 0.1591810251604066,
"grad_norm": 71.96760531715911,
"learning_rate": 1.997325744054297e-07,
"logits/chosen": -1.5530474185943604,
"logits/rejected": -1.5373188257217407,
"logps/chosen": -158.63812255859375,
"logps/rejected": -204.0651092529297,
"loss": 0.6845,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.01808898150920868,
"rewards/margins": 0.017602307721972466,
"rewards/rejected": -0.035691291093826294,
"step": 138
},
{
"epoch": 0.16148799653954293,
"grad_norm": 73.93273755038686,
"learning_rate": 1.9970369782714328e-07,
"logits/chosen": -1.522450566291809,
"logits/rejected": -1.635149598121643,
"logps/chosen": -142.74459838867188,
"logps/rejected": -149.5770721435547,
"loss": 0.6894,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.025994691997766495,
"rewards/margins": -0.006413338705897331,
"rewards/rejected": -0.019581351429224014,
"step": 140
},
{
"epoch": 0.16379496791867926,
"grad_norm": 79.55861212361464,
"learning_rate": 1.99673343399533e-07,
"logits/chosen": -1.525217056274414,
"logits/rejected": -1.5952740907669067,
"logps/chosen": -116.86170959472656,
"logps/rejected": -175.00779724121094,
"loss": 0.6832,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.020393915474414825,
"rewards/margins": 0.032093390822410583,
"rewards/rejected": -0.05248731002211571,
"step": 142
},
{
"epoch": 0.16610193929781558,
"grad_norm": 83.97535534931897,
"learning_rate": 1.9964151157252466e-07,
"logits/chosen": -1.6767423152923584,
"logits/rejected": -1.6693997383117676,
"logps/chosen": -207.50936889648438,
"logps/rejected": -216.3134002685547,
"loss": 0.6789,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03636579588055611,
"rewards/margins": 0.03607642278075218,
"rewards/rejected": -0.07244221866130829,
"step": 144
},
{
"epoch": 0.1684089106769519,
"grad_norm": 73.85111113100436,
"learning_rate": 1.996082028179428e-07,
"logits/chosen": -1.4807971715927124,
"logits/rejected": -1.4092496633529663,
"logps/chosen": -168.455078125,
"logps/rejected": -172.4587860107422,
"loss": 0.689,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.06219344958662987,
"rewards/margins": 0.011872416362166405,
"rewards/rejected": -0.07406586408615112,
"step": 146
},
{
"epoch": 0.17071588205608823,
"grad_norm": 80.66954314511047,
"learning_rate": 1.9957341762950344e-07,
"logits/chosen": -1.5618644952774048,
"logits/rejected": -1.67661452293396,
"logps/chosen": -114.58411407470703,
"logps/rejected": -158.1700439453125,
"loss": 0.6807,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.04578070342540741,
"rewards/margins": 0.029419898986816406,
"rewards/rejected": -0.07520060241222382,
"step": 148
},
{
"epoch": 0.17302285343522458,
"grad_norm": 71.43623661516392,
"learning_rate": 1.9953715652280706e-07,
"logits/chosen": -1.6976016759872437,
"logits/rejected": -1.6299835443496704,
"logps/chosen": -228.1553497314453,
"logps/rejected": -214.32037353515625,
"loss": 0.6848,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.06526876986026764,
"rewards/margins": -0.002617661375552416,
"rewards/rejected": -0.06265110522508621,
"step": 150
},
{
"epoch": 0.1753298248143609,
"grad_norm": 85.31030453694517,
"learning_rate": 1.9949942003533064e-07,
"logits/chosen": -1.7211732864379883,
"logits/rejected": -1.720245122909546,
"logps/chosen": -138.57936096191406,
"logps/rejected": -158.134521484375,
"loss": 0.6829,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.042685676366090775,
"rewards/margins": -0.006229763850569725,
"rewards/rejected": -0.0364559069275856,
"step": 152
},
{
"epoch": 0.17763679619349723,
"grad_norm": 75.37656195588123,
"learning_rate": 1.9946020872642006e-07,
"logits/chosen": -1.602712631225586,
"logits/rejected": -1.5105926990509033,
"logps/chosen": -152.95616149902344,
"logps/rejected": -252.92359924316406,
"loss": 0.6848,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.05936397612094879,
"rewards/margins": 0.023188650608062744,
"rewards/rejected": -0.08255261927843094,
"step": 154
},
{
"epoch": 0.17994376757263356,
"grad_norm": 74.43256656918146,
"learning_rate": 1.9941952317728147e-07,
"logits/chosen": -1.6266837120056152,
"logits/rejected": -1.5794254541397095,
"logps/chosen": -154.70660400390625,
"logps/rejected": -171.6692657470703,
"loss": 0.688,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0499938428401947,
"rewards/margins": 0.022745870053768158,
"rewards/rejected": -0.07273972034454346,
"step": 156
},
{
"epoch": 0.18225073895176988,
"grad_norm": 75.91057680239186,
"learning_rate": 1.993773639909728e-07,
"logits/chosen": -1.49541437625885,
"logits/rejected": -1.6966127157211304,
"logps/chosen": -165.41343688964844,
"logps/rejected": -208.544189453125,
"loss": 0.6768,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.02937229909002781,
"rewards/margins": 0.061954449862241745,
"rewards/rejected": -0.0913267433643341,
"step": 158
},
{
"epoch": 0.1845577103309062,
"grad_norm": 79.53079693796676,
"learning_rate": 1.99333731792395e-07,
"logits/chosen": -1.5714216232299805,
"logits/rejected": -1.543687343597412,
"logps/chosen": -153.09767150878906,
"logps/rejected": -177.41847229003906,
"loss": 0.684,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.0551866851747036,
"rewards/margins": 0.03179415315389633,
"rewards/rejected": -0.08698083460330963,
"step": 160
},
{
"epoch": 0.18686468171004253,
"grad_norm": 73.91686194821585,
"learning_rate": 1.9928862722828242e-07,
"logits/chosen": -1.7037162780761719,
"logits/rejected": -1.675144076347351,
"logps/chosen": -153.01358032226562,
"logps/rejected": -175.93673706054688,
"loss": 0.6781,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.023145336657762527,
"rewards/margins": 0.06261962652206421,
"rewards/rejected": -0.08576496690511703,
"step": 162
},
{
"epoch": 0.18917165308917885,
"grad_norm": 76.57698818122466,
"learning_rate": 1.9924205096719357e-07,
"logits/chosen": -1.5918736457824707,
"logits/rejected": -1.4768625497817993,
"logps/chosen": -196.1853485107422,
"logps/rejected": -179.04530334472656,
"loss": 0.6692,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.04375737905502319,
"rewards/margins": 0.0569818913936615,
"rewards/rejected": -0.10073927044868469,
"step": 164
},
{
"epoch": 0.19147862446831518,
"grad_norm": 77.81644602396882,
"learning_rate": 1.9919400369950097e-07,
"logits/chosen": -1.4722576141357422,
"logits/rejected": -1.540255069732666,
"logps/chosen": -205.6660614013672,
"logps/rejected": -248.9489288330078,
"loss": 0.6786,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.043852321803569794,
"rewards/margins": 0.0406915545463562,
"rewards/rejected": -0.084543876349926,
"step": 166
},
{
"epoch": 0.19378559584745153,
"grad_norm": 75.44269350923808,
"learning_rate": 1.9914448613738103e-07,
"logits/chosen": -1.529039740562439,
"logits/rejected": -1.5173804759979248,
"logps/chosen": -202.2668914794922,
"logps/rejected": -226.52684020996094,
"loss": 0.6763,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07004385441541672,
"rewards/margins": 0.021601226180791855,
"rewards/rejected": -0.09164508432149887,
"step": 168
},
{
"epoch": 0.19609256722658785,
"grad_norm": 76.79086636181425,
"learning_rate": 1.9909349901480347e-07,
"logits/chosen": -1.610205888748169,
"logits/rejected": -1.622270107269287,
"logps/chosen": -152.17263793945312,
"logps/rejected": -153.09571838378906,
"loss": 0.6826,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.049044616520404816,
"rewards/margins": 0.020372100174427032,
"rewards/rejected": -0.06941672414541245,
"step": 170
},
{
"epoch": 0.19839953860572418,
"grad_norm": 65.6162753738054,
"learning_rate": 1.990410430875205e-07,
"logits/chosen": -1.6482963562011719,
"logits/rejected": -1.6161506175994873,
"logps/chosen": -131.635986328125,
"logps/rejected": -142.7827606201172,
"loss": 0.6691,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.028512008488178253,
"rewards/margins": 0.06365156173706055,
"rewards/rejected": -0.0921635702252388,
"step": 172
},
{
"epoch": 0.2007065099848605,
"grad_norm": 90.14268267387868,
"learning_rate": 1.9898711913305547e-07,
"logits/chosen": -1.5566825866699219,
"logits/rejected": -1.6173129081726074,
"logps/chosen": -174.24017333984375,
"logps/rejected": -181.01629638671875,
"loss": 0.6766,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.05642353743314743,
"rewards/margins": 0.03944730758666992,
"rewards/rejected": -0.09587083756923676,
"step": 174
},
{
"epoch": 0.20301348136399683,
"grad_norm": 76.41450508388954,
"learning_rate": 1.9893172795069142e-07,
"logits/chosen": -1.5998440980911255,
"logits/rejected": -1.6545708179473877,
"logps/chosen": -156.8663330078125,
"logps/rejected": -159.1892547607422,
"loss": 0.6915,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.05183999985456467,
"rewards/margins": 0.0032500806264579296,
"rewards/rejected": -0.055090077221393585,
"step": 176
},
{
"epoch": 0.20532045274313315,
"grad_norm": 87.56584383747318,
"learning_rate": 1.988748703614594e-07,
"logits/chosen": -1.6627997159957886,
"logits/rejected": -1.6540586948394775,
"logps/chosen": -155.84368896484375,
"logps/rejected": -186.67495727539062,
"loss": 0.6755,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.03810984268784523,
"rewards/margins": 0.029699210077524185,
"rewards/rejected": -0.06780905276536942,
"step": 178
},
{
"epoch": 0.20762742412226948,
"grad_norm": 70.4422512373765,
"learning_rate": 1.9881654720812592e-07,
"logits/chosen": -1.5361154079437256,
"logits/rejected": -1.610466480255127,
"logps/chosen": -115.56837463378906,
"logps/rejected": -142.57766723632812,
"loss": 0.6805,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.02763887494802475,
"rewards/margins": 0.0444108322262764,
"rewards/rejected": -0.07204970717430115,
"step": 180
},
{
"epoch": 0.2099343955014058,
"grad_norm": 71.9817063853092,
"learning_rate": 1.9875675935518094e-07,
"logits/chosen": -1.547518014907837,
"logits/rejected": -1.5500645637512207,
"logps/chosen": -226.55401611328125,
"logps/rejected": -206.99913024902344,
"loss": 0.6836,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.09524687379598618,
"rewards/margins": 0.004950803238898516,
"rewards/rejected": -0.1001976728439331,
"step": 182
},
{
"epoch": 0.21224136688054213,
"grad_norm": 84.4720159255109,
"learning_rate": 1.9869550768882454e-07,
"logits/chosen": -1.5599523782730103,
"logits/rejected": -1.5133187770843506,
"logps/chosen": -182.1778564453125,
"logps/rejected": -241.0075225830078,
"loss": 0.6671,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.057929787784814835,
"rewards/margins": 0.0736684501171112,
"rewards/rejected": -0.13159823417663574,
"step": 184
},
{
"epoch": 0.21454833825967848,
"grad_norm": 73.76638464490958,
"learning_rate": 1.9863279311695428e-07,
"logits/chosen": -1.4902362823486328,
"logits/rejected": -1.55423903465271,
"logps/chosen": -219.845703125,
"logps/rejected": -273.73712158203125,
"loss": 0.6773,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07643218338489532,
"rewards/margins": 0.07768993079662323,
"rewards/rejected": -0.15412212908267975,
"step": 186
},
{
"epoch": 0.2168553096388148,
"grad_norm": 68.86254513199266,
"learning_rate": 1.985686165691514e-07,
"logits/chosen": -1.704699993133545,
"logits/rejected": -1.6342915296554565,
"logps/chosen": -120.14341735839844,
"logps/rejected": -114.1607666015625,
"loss": 0.6819,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03834118694067001,
"rewards/margins": -0.00312834233045578,
"rewards/rejected": -0.03521284461021423,
"step": 188
},
{
"epoch": 0.21916228101795113,
"grad_norm": 77.39475970610044,
"learning_rate": 1.9850297899666707e-07,
"logits/chosen": -1.6166346073150635,
"logits/rejected": -1.666224479675293,
"logps/chosen": -138.47250366210938,
"logps/rejected": -183.75860595703125,
"loss": 0.6784,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.059065092355012894,
"rewards/margins": 0.0348266139626503,
"rewards/rejected": -0.09389171749353409,
"step": 190
},
{
"epoch": 0.22146925239708745,
"grad_norm": 79.62622809714712,
"learning_rate": 1.9843588137240855e-07,
"logits/chosen": -1.4786595106124878,
"logits/rejected": -1.5819900035858154,
"logps/chosen": -156.80630493164062,
"logps/rejected": -225.65887451171875,
"loss": 0.6727,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.06785481423139572,
"rewards/margins": 0.046741731464862823,
"rewards/rejected": -0.11459654569625854,
"step": 192
},
{
"epoch": 0.22377622377622378,
"grad_norm": 71.61908905732174,
"learning_rate": 1.9836732469092446e-07,
"logits/chosen": -1.7382750511169434,
"logits/rejected": -1.7238702774047852,
"logps/chosen": -135.97625732421875,
"logps/rejected": -134.9537353515625,
"loss": 0.6751,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07399953901767731,
"rewards/margins": -0.0150267593562603,
"rewards/rejected": -0.05897277966141701,
"step": 194
},
{
"epoch": 0.2260831951553601,
"grad_norm": 77.39998086488785,
"learning_rate": 1.982973099683902e-07,
"logits/chosen": -1.6806734800338745,
"logits/rejected": -1.7225916385650635,
"logps/chosen": -139.36279296875,
"logps/rejected": -160.1461639404297,
"loss": 0.6592,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.05896836146712303,
"rewards/margins": 0.05167616903781891,
"rewards/rejected": -0.11064451932907104,
"step": 196
},
{
"epoch": 0.22839016653449642,
"grad_norm": 71.42176978474515,
"learning_rate": 1.982258382425928e-07,
"logits/chosen": -1.53923499584198,
"logits/rejected": -1.5509108304977417,
"logps/chosen": -145.9883575439453,
"logps/rejected": -173.95388793945312,
"loss": 0.6819,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.06814471632242203,
"rewards/margins": 0.05520961806178093,
"rewards/rejected": -0.12335430830717087,
"step": 198
},
{
"epoch": 0.23069713791363275,
"grad_norm": 65.61388633290626,
"learning_rate": 1.9815291057291578e-07,
"logits/chosen": -1.5758477449417114,
"logits/rejected": -1.6140058040618896,
"logps/chosen": -105.85581970214844,
"logps/rejected": -122.97176361083984,
"loss": 0.679,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.06785964965820312,
"rewards/margins": 0.01952926628291607,
"rewards/rejected": -0.08738891780376434,
"step": 200
},
{
"epoch": 0.23069713791363275,
"eval_logits/chosen": -1.5610222816467285,
"eval_logits/rejected": -1.462950587272644,
"eval_logps/chosen": -186.2912139892578,
"eval_logps/rejected": -151.6300048828125,
"eval_loss": 0.6922155618667603,
"eval_rewards/accuracies": 0.5600000023841858,
"eval_rewards/chosen": -0.11090204119682312,
"eval_rewards/margins": 0.0005596327828243375,
"eval_rewards/rejected": -0.11146167665719986,
"eval_runtime": 21.7555,
"eval_samples_per_second": 4.597,
"eval_steps_per_second": 1.149,
"step": 200
},
{
"epoch": 0.23300410929276907,
"grad_norm": 69.78727065961301,
"learning_rate": 1.9807852804032302e-07,
"logits/chosen": -1.4734337329864502,
"logits/rejected": -1.491389513015747,
"logps/chosen": -154.9561767578125,
"logps/rejected": -204.73797607421875,
"loss": 0.6705,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.046838290989398956,
"rewards/margins": 0.11147616803646088,
"rewards/rejected": -0.15831446647644043,
"step": 202
},
{
"epoch": 0.23531108067190543,
"grad_norm": 82.59595988703826,
"learning_rate": 1.980026917473432e-07,
"logits/chosen": -1.5889283418655396,
"logits/rejected": -1.7049566507339478,
"logps/chosen": -174.74598693847656,
"logps/rejected": -223.11842346191406,
"loss": 0.6737,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.05181250721216202,
"rewards/margins": 0.09021350741386414,
"rewards/rejected": -0.14202602207660675,
"step": 204
},
{
"epoch": 0.23761805205104175,
"grad_norm": 67.38723201323596,
"learning_rate": 1.9792540281805298e-07,
"logits/chosen": -1.4892499446868896,
"logits/rejected": -1.517817497253418,
"logps/chosen": -140.6378631591797,
"logps/rejected": -161.47348022460938,
"loss": 0.6682,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.08649797737598419,
"rewards/margins": 0.03692768141627312,
"rewards/rejected": -0.12342565506696701,
"step": 206
},
{
"epoch": 0.23992502343017807,
"grad_norm": 73.06374890842753,
"learning_rate": 1.9784666239806089e-07,
"logits/chosen": -1.5101206302642822,
"logits/rejected": -1.5768136978149414,
"logps/chosen": -164.27035522460938,
"logps/rejected": -203.25975036621094,
"loss": 0.6718,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.06570874899625778,
"rewards/margins": 0.09578941017389297,
"rewards/rejected": -0.16149815917015076,
"step": 208
},
{
"epoch": 0.2422319948093144,
"grad_norm": 75.60130708995507,
"learning_rate": 1.9776647165448983e-07,
"logits/chosen": -1.5699687004089355,
"logits/rejected": -1.520723581314087,
"logps/chosen": -188.6508331298828,
"logps/rejected": -217.18365478515625,
"loss": 0.6708,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08720759302377701,
"rewards/margins": 0.02055392973124981,
"rewards/rejected": -0.10776151716709137,
"step": 210
},
{
"epoch": 0.24453896618845072,
"grad_norm": 76.08847895651103,
"learning_rate": 1.9768483177596006e-07,
"logits/chosen": -1.5900119543075562,
"logits/rejected": -1.6237448453903198,
"logps/chosen": -143.37664794921875,
"logps/rejected": -166.52413940429688,
"loss": 0.6689,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0496542751789093,
"rewards/margins": 0.06798863410949707,
"rewards/rejected": -0.11764290928840637,
"step": 212
},
{
"epoch": 0.24684593756758705,
"grad_norm": 85.55002048517719,
"learning_rate": 1.9760174397257153e-07,
"logits/chosen": -1.5799341201782227,
"logits/rejected": -1.5739325284957886,
"logps/chosen": -187.42578125,
"logps/rejected": -227.91946411132812,
"loss": 0.6871,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.11932831257581711,
"rewards/margins": 0.00022871512919664383,
"rewards/rejected": -0.11955701559782028,
"step": 214
},
{
"epoch": 0.24915290894672337,
"grad_norm": 79.3908422125236,
"learning_rate": 1.97517209475886e-07,
"logits/chosen": -1.578735589981079,
"logits/rejected": -1.7003827095031738,
"logps/chosen": -147.41778564453125,
"logps/rejected": -185.31602478027344,
"loss": 0.6678,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.07496561855077744,
"rewards/margins": 0.0857120081782341,
"rewards/rejected": -0.16067762672901154,
"step": 216
},
{
"epoch": 0.2514598803258597,
"grad_norm": 78.5386783375159,
"learning_rate": 1.9743122953890854e-07,
"logits/chosen": -1.5871162414550781,
"logits/rejected": -1.5231672525405884,
"logps/chosen": -174.3480682373047,
"logps/rejected": -196.07998657226562,
"loss": 0.6548,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.07399033010005951,
"rewards/margins": 0.05520808696746826,
"rewards/rejected": -0.12919840216636658,
"step": 218
},
{
"epoch": 0.253766851704996,
"grad_norm": 80.06682238716625,
"learning_rate": 1.9734380543606927e-07,
"logits/chosen": -1.643662452697754,
"logits/rejected": -1.6430741548538208,
"logps/chosen": -200.37826538085938,
"logps/rejected": -207.41136169433594,
"loss": 0.6815,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.07183612138032913,
"rewards/margins": 0.07066242396831512,
"rewards/rejected": -0.14249853789806366,
"step": 220
},
{
"epoch": 0.25607382308413235,
"grad_norm": 73.47957521005397,
"learning_rate": 1.972549384632043e-07,
"logits/chosen": -1.5852243900299072,
"logits/rejected": -1.736151099205017,
"logps/chosen": -167.52194213867188,
"logps/rejected": -218.90122985839844,
"loss": 0.6604,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0951998233795166,
"rewards/margins": 0.027591748163104057,
"rewards/rejected": -0.12279157340526581,
"step": 222
},
{
"epoch": 0.25838079446326867,
"grad_norm": 79.05067580811021,
"learning_rate": 1.9716462993753655e-07,
"logits/chosen": -1.476207971572876,
"logits/rejected": -1.5456207990646362,
"logps/chosen": -288.57379150390625,
"logps/rejected": -338.8498840332031,
"loss": 0.6567,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.17371979355812073,
"rewards/margins": 0.15009327232837677,
"rewards/rejected": -0.3238130807876587,
"step": 224
},
{
"epoch": 0.260687765842405,
"grad_norm": 67.9414989656304,
"learning_rate": 1.9707288119765622e-07,
"logits/chosen": -1.5781480073928833,
"logits/rejected": -1.569219708442688,
"logps/chosen": -124.80656433105469,
"logps/rejected": -141.52476501464844,
"loss": 0.6732,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.12547817826271057,
"rewards/margins": 0.03562304750084877,
"rewards/rejected": -0.16110120713710785,
"step": 226
},
{
"epoch": 0.2629947372215413,
"grad_norm": 78.11670530443735,
"learning_rate": 1.9697969360350095e-07,
"logits/chosen": -1.6346409320831299,
"logits/rejected": -1.565224051475525,
"logps/chosen": -178.9912109375,
"logps/rejected": -190.82681274414062,
"loss": 0.6661,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.10180149972438812,
"rewards/margins": 0.05022910237312317,
"rewards/rejected": -0.1520306020975113,
"step": 228
},
{
"epoch": 0.2653017086006777,
"grad_norm": 68.3329236115507,
"learning_rate": 1.968850685363357e-07,
"logits/chosen": -1.7000384330749512,
"logits/rejected": -1.7287462949752808,
"logps/chosen": -199.75430297851562,
"logps/rejected": -241.5220947265625,
"loss": 0.6556,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.09640266001224518,
"rewards/margins": 0.09287622570991516,
"rewards/rejected": -0.18927887082099915,
"step": 230
},
{
"epoch": 0.267608679979814,
"grad_norm": 82.26094651176318,
"learning_rate": 1.9678900739873226e-07,
"logits/chosen": -1.677142858505249,
"logits/rejected": -1.6745737791061401,
"logps/chosen": -170.59425354003906,
"logps/rejected": -181.05661010742188,
"loss": 0.6694,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.1134408637881279,
"rewards/margins": 0.036339692771434784,
"rewards/rejected": -0.14978057146072388,
"step": 232
},
{
"epoch": 0.26991565135895035,
"grad_norm": 78.36181831181821,
"learning_rate": 1.966915116145484e-07,
"logits/chosen": -1.4915921688079834,
"logits/rejected": -1.523095726966858,
"logps/chosen": -155.88290405273438,
"logps/rejected": -164.45022583007812,
"loss": 0.6567,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0988837480545044,
"rewards/margins": 0.09765380620956421,
"rewards/rejected": -0.196537584066391,
"step": 234
},
{
"epoch": 0.2722226227380867,
"grad_norm": 83.64468364737313,
"learning_rate": 1.965925826289068e-07,
"logits/chosen": -1.6482906341552734,
"logits/rejected": -1.6469372510910034,
"logps/chosen": -185.45001220703125,
"logps/rejected": -208.0437469482422,
"loss": 0.6708,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0873761773109436,
"rewards/margins": 0.06748253107070923,
"rewards/rejected": -0.15485870838165283,
"step": 236
},
{
"epoch": 0.274529594117223,
"grad_norm": 74.16820675500993,
"learning_rate": 1.964922219081738e-07,
"logits/chosen": -1.764983057975769,
"logits/rejected": -1.7145969867706299,
"logps/chosen": -223.3017578125,
"logps/rejected": -218.1916046142578,
"loss": 0.6555,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11543229222297668,
"rewards/margins": 0.12305162101984024,
"rewards/rejected": -0.23848390579223633,
"step": 238
},
{
"epoch": 0.2768365654963593,
"grad_norm": 75.00121800473413,
"learning_rate": 1.9639043093993727e-07,
"logits/chosen": -1.5264173746109009,
"logits/rejected": -1.4717910289764404,
"logps/chosen": -178.43338012695312,
"logps/rejected": -188.60101318359375,
"loss": 0.6481,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.09335748851299286,
"rewards/margins": 0.026005972176790237,
"rewards/rejected": -0.1193634569644928,
"step": 240
},
{
"epoch": 0.27914353687549565,
"grad_norm": 64.46972140040022,
"learning_rate": 1.9628721123298492e-07,
"logits/chosen": -1.6837042570114136,
"logits/rejected": -1.6980068683624268,
"logps/chosen": -161.4723663330078,
"logps/rejected": -171.20248413085938,
"loss": 0.6609,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.11237211525440216,
"rewards/margins": 0.049555521458387375,
"rewards/rejected": -0.16192764043807983,
"step": 242
},
{
"epoch": 0.28145050825463197,
"grad_norm": 66.85001786944659,
"learning_rate": 1.961825643172819e-07,
"logits/chosen": -1.5771496295928955,
"logits/rejected": -1.5039366483688354,
"logps/chosen": -158.33685302734375,
"logps/rejected": -160.24057006835938,
"loss": 0.6701,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.14522022008895874,
"rewards/margins": 0.04340605437755585,
"rewards/rejected": -0.1886262595653534,
"step": 244
},
{
"epoch": 0.2837574796337683,
"grad_norm": 76.39303352760503,
"learning_rate": 1.9607649174394787e-07,
"logits/chosen": -1.4101349115371704,
"logits/rejected": -1.4513871669769287,
"logps/chosen": -147.43826293945312,
"logps/rejected": -182.31005859375,
"loss": 0.6596,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.08130650967359543,
"rewards/margins": 0.0953046903014183,
"rewards/rejected": -0.17661119997501373,
"step": 246
},
{
"epoch": 0.2860644510129046,
"grad_norm": 84.03579410656208,
"learning_rate": 1.959689950852343e-07,
"logits/chosen": -1.6520403623580933,
"logits/rejected": -1.6739228963851929,
"logps/chosen": -172.19305419921875,
"logps/rejected": -184.803466796875,
"loss": 0.6669,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.16027307510375977,
"rewards/margins": 0.02343956008553505,
"rewards/rejected": -0.1837126612663269,
"step": 248
},
{
"epoch": 0.28837142239204094,
"grad_norm": 78.31280460476106,
"learning_rate": 1.9586007593450095e-07,
"logits/chosen": -1.568188190460205,
"logits/rejected": -1.586582064628601,
"logps/chosen": -169.95675659179688,
"logps/rejected": -188.78858947753906,
"loss": 0.6779,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.1373595893383026,
"rewards/margins": 0.02867070771753788,
"rewards/rejected": -0.16603030264377594,
"step": 250
},
{
"epoch": 0.29067839377117727,
"grad_norm": 77.82835801736759,
"learning_rate": 1.957497359061924e-07,
"logits/chosen": -1.5796047449111938,
"logits/rejected": -1.5543608665466309,
"logps/chosen": -191.53219604492188,
"logps/rejected": -220.70361328125,
"loss": 0.6393,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.15552642941474915,
"rewards/margins": 0.09192191064357758,
"rewards/rejected": -0.24744835495948792,
"step": 252
},
{
"epoch": 0.2929853651503136,
"grad_norm": 81.6835137198976,
"learning_rate": 1.956379766358141e-07,
"logits/chosen": -1.5779876708984375,
"logits/rejected": -1.504298448562622,
"logps/chosen": -218.59942626953125,
"logps/rejected": -230.2102813720703,
"loss": 0.6635,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.14433127641677856,
"rewards/margins": 0.08938172459602356,
"rewards/rejected": -0.23371298611164093,
"step": 254
},
{
"epoch": 0.2952923365294499,
"grad_norm": 74.03670184892391,
"learning_rate": 1.9552479977990798e-07,
"logits/chosen": -1.6765474081039429,
"logits/rejected": -1.643741488456726,
"logps/chosen": -185.69444274902344,
"logps/rejected": -199.7008819580078,
"loss": 0.676,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14419934153556824,
"rewards/margins": 0.016861233860254288,
"rewards/rejected": -0.16106057167053223,
"step": 256
},
{
"epoch": 0.29759930790858624,
"grad_norm": 79.12914884219855,
"learning_rate": 1.954102070160281e-07,
"logits/chosen": -1.6632733345031738,
"logits/rejected": -1.6073827743530273,
"logps/chosen": -149.79641723632812,
"logps/rejected": -174.7237091064453,
"loss": 0.6638,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.10197722166776657,
"rewards/margins": 0.07854845374822617,
"rewards/rejected": -0.18052567541599274,
"step": 258
},
{
"epoch": 0.29990627928772257,
"grad_norm": 80.451175401901,
"learning_rate": 1.9529420004271567e-07,
"logits/chosen": -1.5313125848770142,
"logits/rejected": -1.5560095310211182,
"logps/chosen": -207.1497802734375,
"logps/rejected": -222.1211395263672,
"loss": 0.6407,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.16675114631652832,
"rewards/margins": 0.1034877672791481,
"rewards/rejected": -0.2702389061450958,
"step": 260
},
{
"epoch": 0.3022132506668589,
"grad_norm": 66.85531080853532,
"learning_rate": 1.9517678057947382e-07,
"logits/chosen": -1.6430004835128784,
"logits/rejected": -1.597357153892517,
"logps/chosen": -135.1138153076172,
"logps/rejected": -132.63619995117188,
"loss": 0.6618,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.11752544343471527,
"rewards/margins": 0.03515633940696716,
"rewards/rejected": -0.15268178284168243,
"step": 262
},
{
"epoch": 0.3045202220459952,
"grad_norm": 80.22448615221649,
"learning_rate": 1.9505795036674232e-07,
"logits/chosen": -1.6319184303283691,
"logits/rejected": -1.4991899728775024,
"logps/chosen": -217.16680908203125,
"logps/rejected": -245.2107696533203,
"loss": 0.6523,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.18451935052871704,
"rewards/margins": 0.11840308457612991,
"rewards/rejected": -0.30292242765426636,
"step": 264
},
{
"epoch": 0.3068271934251316,
"grad_norm": 69.95576974969472,
"learning_rate": 1.9493771116587156e-07,
"logits/chosen": -1.5522364377975464,
"logits/rejected": -1.5948469638824463,
"logps/chosen": -113.81831359863281,
"logps/rejected": -155.99346923828125,
"loss": 0.6551,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.08541279286146164,
"rewards/margins": 0.14459526538848877,
"rewards/rejected": -0.230008065700531,
"step": 266
},
{
"epoch": 0.3091341648042679,
"grad_norm": 75.25744246014891,
"learning_rate": 1.9481606475909656e-07,
"logits/chosen": -1.500025749206543,
"logits/rejected": -1.5494239330291748,
"logps/chosen": -125.84722900390625,
"logps/rejected": -164.76669311523438,
"loss": 0.6526,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.08980143815279007,
"rewards/margins": 0.17133310437202454,
"rewards/rejected": -0.2611345648765564,
"step": 268
},
{
"epoch": 0.31144113618340424,
"grad_norm": 77.53434606640313,
"learning_rate": 1.9469301294951057e-07,
"logits/chosen": -1.6267601251602173,
"logits/rejected": -1.5587116479873657,
"logps/chosen": -172.08139038085938,
"logps/rejected": -181.32717895507812,
"loss": 0.6596,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.1692037582397461,
"rewards/margins": 0.051171936094760895,
"rewards/rejected": -0.2203756868839264,
"step": 270
},
{
"epoch": 0.31374810756254057,
"grad_norm": 74.84897771580975,
"learning_rate": 1.9456855756103816e-07,
"logits/chosen": -1.5624661445617676,
"logits/rejected": -1.6530312299728394,
"logps/chosen": -147.84597778320312,
"logps/rejected": -174.6589813232422,
"loss": 0.6707,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13743659853935242,
"rewards/margins": 0.05220307409763336,
"rewards/rejected": -0.18963965773582458,
"step": 272
},
{
"epoch": 0.3160550789416769,
"grad_norm": 71.99044362933952,
"learning_rate": 1.9444270043840852e-07,
"logits/chosen": -1.6625701189041138,
"logits/rejected": -1.5914949178695679,
"logps/chosen": -147.29147338867188,
"logps/rejected": -129.6570587158203,
"loss": 0.6831,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.21893730759620667,
"rewards/margins": -0.0283275805413723,
"rewards/rejected": -0.19060972332954407,
"step": 274
},
{
"epoch": 0.3183620503208132,
"grad_norm": 75.21160091684173,
"learning_rate": 1.9431544344712772e-07,
"logits/chosen": -1.4378788471221924,
"logits/rejected": -1.3864963054656982,
"logps/chosen": -147.2783660888672,
"logps/rejected": -177.4646759033203,
"loss": 0.6472,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.11688334494829178,
"rewards/margins": 0.11695411056280136,
"rewards/rejected": -0.23383745551109314,
"step": 276
},
{
"epoch": 0.32066902169994954,
"grad_norm": 72.07898599790276,
"learning_rate": 1.9418678847345146e-07,
"logits/chosen": -1.5210872888565063,
"logits/rejected": -1.5768458843231201,
"logps/chosen": -164.58419799804688,
"logps/rejected": -213.6575469970703,
"loss": 0.6664,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.12316928803920746,
"rewards/margins": 0.07401876151561737,
"rewards/rejected": -0.19718804955482483,
"step": 278
},
{
"epoch": 0.32297599307908587,
"grad_norm": 67.17404804695744,
"learning_rate": 1.9405673742435676e-07,
"logits/chosen": -1.5087511539459229,
"logits/rejected": -1.5612874031066895,
"logps/chosen": -142.5220947265625,
"logps/rejected": -195.3551483154297,
"loss": 0.6718,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.14393991231918335,
"rewards/margins": 0.11825156211853027,
"rewards/rejected": -0.2621914744377136,
"step": 280
},
{
"epoch": 0.3252829644582222,
"grad_norm": 81.75237089659649,
"learning_rate": 1.939252922275139e-07,
"logits/chosen": -1.6113684177398682,
"logits/rejected": -1.520400047302246,
"logps/chosen": -215.8910675048828,
"logps/rejected": -227.26637268066406,
"loss": 0.6556,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.2654890716075897,
"rewards/margins": 0.09542025625705719,
"rewards/rejected": -0.3609093129634857,
"step": 282
},
{
"epoch": 0.3275899358373585,
"grad_norm": 65.02297736065502,
"learning_rate": 1.937924548312578e-07,
"logits/chosen": -1.6812703609466553,
"logits/rejected": -1.7281326055526733,
"logps/chosen": -130.5011749267578,
"logps/rejected": -195.49452209472656,
"loss": 0.6431,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.12183240056037903,
"rewards/margins": 0.12844915688037872,
"rewards/rejected": -0.25028154253959656,
"step": 284
},
{
"epoch": 0.32989690721649484,
"grad_norm": 75.13263031113792,
"learning_rate": 1.9365822720455912e-07,
"logits/chosen": -1.4847445487976074,
"logits/rejected": -1.4161133766174316,
"logps/chosen": -154.5245361328125,
"logps/rejected": -203.3861541748047,
"loss": 0.6537,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.16228517889976501,
"rewards/margins": 0.12002203613519669,
"rewards/rejected": -0.2823072075843811,
"step": 286
},
{
"epoch": 0.33220387859563116,
"grad_norm": 78.41024724428831,
"learning_rate": 1.935226113369951e-07,
"logits/chosen": -1.686346173286438,
"logits/rejected": -1.6542606353759766,
"logps/chosen": -172.25059509277344,
"logps/rejected": -199.93182373046875,
"loss": 0.6469,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.12491661310195923,
"rewards/margins": 0.12406705319881439,
"rewards/rejected": -0.24898366630077362,
"step": 288
},
{
"epoch": 0.3345108499747675,
"grad_norm": 74.32689822052723,
"learning_rate": 1.9338560923872006e-07,
"logits/chosen": -1.5119750499725342,
"logits/rejected": -1.524541974067688,
"logps/chosen": -159.21376037597656,
"logps/rejected": -237.09561157226562,
"loss": 0.6455,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1832338273525238,
"rewards/margins": 0.2291288673877716,
"rewards/rejected": -0.4123626947402954,
"step": 290
},
{
"epoch": 0.3368178213539038,
"grad_norm": 77.04623177811982,
"learning_rate": 1.9324722294043556e-07,
"logits/chosen": -1.6212831735610962,
"logits/rejected": -1.5947524309158325,
"logps/chosen": -187.361572265625,
"logps/rejected": -187.34519958496094,
"loss": 0.6585,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.24990221858024597,
"rewards/margins": 0.07834864407777786,
"rewards/rejected": -0.3282508850097656,
"step": 292
},
{
"epoch": 0.33912479273304014,
"grad_norm": 83.55231847560428,
"learning_rate": 1.9310745449336044e-07,
"logits/chosen": -1.58076012134552,
"logits/rejected": -1.5445674657821655,
"logps/chosen": -192.48617553710938,
"logps/rejected": -215.64193725585938,
"loss": 0.6418,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1813565194606781,
"rewards/margins": 0.1242499127984047,
"rewards/rejected": -0.3056064546108246,
"step": 294
},
{
"epoch": 0.34143176411217646,
"grad_norm": 73.20320061572161,
"learning_rate": 1.929663059692002e-07,
"logits/chosen": -1.477115273475647,
"logits/rejected": -1.5140092372894287,
"logps/chosen": -154.4539794921875,
"logps/rejected": -214.9960174560547,
"loss": 0.6894,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2464270293712616,
"rewards/margins": 0.08608925342559814,
"rewards/rejected": -0.33251628279685974,
"step": 296
},
{
"epoch": 0.3437387354913128,
"grad_norm": 82.85464536249332,
"learning_rate": 1.928237794601165e-07,
"logits/chosen": -1.5687949657440186,
"logits/rejected": -1.6849851608276367,
"logps/chosen": -140.14784240722656,
"logps/rejected": -234.17706298828125,
"loss": 0.6525,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.1264043152332306,
"rewards/margins": 0.24397864937782288,
"rewards/rejected": -0.3703829348087311,
"step": 298
},
{
"epoch": 0.34604570687044917,
"grad_norm": 65.65777237412837,
"learning_rate": 1.9267987707869604e-07,
"logits/chosen": -1.4391192197799683,
"logits/rejected": -1.4724018573760986,
"logps/chosen": -153.69284057617188,
"logps/rejected": -173.3372039794922,
"loss": 0.6486,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.16697266697883606,
"rewards/margins": 0.13511566817760468,
"rewards/rejected": -0.30208835005760193,
"step": 300
},
{
"epoch": 0.34604570687044917,
"eval_logits/chosen": -1.5305781364440918,
"eval_logits/rejected": -1.4347938299179077,
"eval_logps/chosen": -187.96263122558594,
"eval_logps/rejected": -153.34820556640625,
"eval_loss": 0.678679347038269,
"eval_rewards/accuracies": 0.6399999856948853,
"eval_rewards/chosen": -0.278046578168869,
"eval_rewards/margins": 0.005234198644757271,
"eval_rewards/rejected": -0.28328076004981995,
"eval_runtime": 21.7114,
"eval_samples_per_second": 4.606,
"eval_steps_per_second": 1.151,
"step": 300
},
{
"epoch": 0.3483526782495855,
"grad_norm": 69.96196416042814,
"learning_rate": 1.9253460095791922e-07,
"logits/chosen": -1.5020473003387451,
"logits/rejected": -1.4953689575195312,
"logps/chosen": -106.53646087646484,
"logps/rejected": -165.1669158935547,
"loss": 0.6546,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.15904603898525238,
"rewards/margins": 0.06554871797561646,
"rewards/rejected": -0.22459478676319122,
"step": 302
},
{
"epoch": 0.3506596496287218,
"grad_norm": 74.69729400373957,
"learning_rate": 1.9238795325112868e-07,
"logits/chosen": -1.636529803276062,
"logits/rejected": -1.6348826885223389,
"logps/chosen": -140.86441040039062,
"logps/rejected": -174.48370361328125,
"loss": 0.6433,
"rewards/accuracies": 0.84375,
"rewards/chosen": -0.12615619599819183,
"rewards/margins": 0.20733490586280823,
"rewards/rejected": -0.3334910571575165,
"step": 304
},
{
"epoch": 0.35296662100785814,
"grad_norm": 84.17293540044481,
"learning_rate": 1.9223993613199713e-07,
"logits/chosen": -1.6913816928863525,
"logits/rejected": -1.6646835803985596,
"logps/chosen": -152.25997924804688,
"logps/rejected": -171.05575561523438,
"loss": 0.6514,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.11823489516973495,
"rewards/margins": 0.18948128819465637,
"rewards/rejected": -0.3077161908149719,
"step": 306
},
{
"epoch": 0.35527359238699446,
"grad_norm": 83.6870493511653,
"learning_rate": 1.9209055179449537e-07,
"logits/chosen": -1.517793893814087,
"logits/rejected": -1.6404225826263428,
"logps/chosen": -91.36832427978516,
"logps/rejected": -134.06529235839844,
"loss": 0.6551,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.10601670295000076,
"rewards/margins": 0.14076808094978333,
"rewards/rejected": -0.24678479135036469,
"step": 308
},
{
"epoch": 0.3575805637661308,
"grad_norm": 64.57674968550867,
"learning_rate": 1.9193980245285966e-07,
"logits/chosen": -1.4689788818359375,
"logits/rejected": -1.3954423666000366,
"logps/chosen": -143.7101287841797,
"logps/rejected": -169.8336181640625,
"loss": 0.6402,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.16834121942520142,
"rewards/margins": 0.08874449878931046,
"rewards/rejected": -0.25708574056625366,
"step": 310
},
{
"epoch": 0.3598875351452671,
"grad_norm": 81.4185321584637,
"learning_rate": 1.9178769034155887e-07,
"logits/chosen": -1.6560229063034058,
"logits/rejected": -1.7177590131759644,
"logps/chosen": -144.23033142089844,
"logps/rejected": -166.01162719726562,
"loss": 0.6303,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.19495287537574768,
"rewards/margins": 0.08614547550678253,
"rewards/rejected": -0.281098335981369,
"step": 312
},
{
"epoch": 0.36219450652440344,
"grad_norm": 70.47869326950462,
"learning_rate": 1.9163421771526151e-07,
"logits/chosen": -1.5131672620773315,
"logits/rejected": -1.548357367515564,
"logps/chosen": -146.3427734375,
"logps/rejected": -159.85092163085938,
"loss": 0.6536,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1731819212436676,
"rewards/margins": 0.1254611313343048,
"rewards/rejected": -0.29864302277565,
"step": 314
},
{
"epoch": 0.36450147790353976,
"grad_norm": 79.69549984021036,
"learning_rate": 1.914793868488021e-07,
"logits/chosen": -1.512197732925415,
"logits/rejected": -1.4396047592163086,
"logps/chosen": -97.64339447021484,
"logps/rejected": -117.3057632446289,
"loss": 0.6579,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1673259437084198,
"rewards/margins": 0.045555103570222855,
"rewards/rejected": -0.21288102865219116,
"step": 316
},
{
"epoch": 0.3668084492826761,
"grad_norm": 82.99383875929993,
"learning_rate": 1.9132320003714754e-07,
"logits/chosen": -1.5376619100570679,
"logits/rejected": -1.5551142692565918,
"logps/chosen": -207.0707244873047,
"logps/rejected": -242.56712341308594,
"loss": 0.6439,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.24572816491127014,
"rewards/margins": 0.16944444179534912,
"rewards/rejected": -0.41517263650894165,
"step": 318
},
{
"epoch": 0.3691154206618124,
"grad_norm": 78.2099765504223,
"learning_rate": 1.9116565959536327e-07,
"logits/chosen": -1.4779236316680908,
"logits/rejected": -1.4861027002334595,
"logps/chosen": -193.60748291015625,
"logps/rejected": -232.04690551757812,
"loss": 0.6534,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.16232052445411682,
"rewards/margins": 0.13388732075691223,
"rewards/rejected": -0.29620781540870667,
"step": 320
},
{
"epoch": 0.37142239204094873,
"grad_norm": 74.80406821040707,
"learning_rate": 1.9100676785857857e-07,
"logits/chosen": -1.6256941556930542,
"logits/rejected": -1.5659886598587036,
"logps/chosen": -170.6388702392578,
"logps/rejected": -198.07733154296875,
"loss": 0.6395,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.17732584476470947,
"rewards/margins": 0.1462487280368805,
"rewards/rejected": -0.32357457280158997,
"step": 322
},
{
"epoch": 0.37372936342008506,
"grad_norm": 81.93843569632895,
"learning_rate": 1.9084652718195236e-07,
"logits/chosen": -1.5257925987243652,
"logits/rejected": -1.4617056846618652,
"logps/chosen": -208.795166015625,
"logps/rejected": -243.7969970703125,
"loss": 0.6648,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.2373400181531906,
"rewards/margins": 0.16046729683876038,
"rewards/rejected": -0.3978073298931122,
"step": 324
},
{
"epoch": 0.3760363347992214,
"grad_norm": 68.63199696676665,
"learning_rate": 1.9068493994063798e-07,
"logits/chosen": -1.4899076223373413,
"logits/rejected": -1.5616645812988281,
"logps/chosen": -133.66110229492188,
"logps/rejected": -236.15924072265625,
"loss": 0.6245,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.15444569289684296,
"rewards/margins": 0.2277567982673645,
"rewards/rejected": -0.38220247626304626,
"step": 326
},
{
"epoch": 0.3783433061783577,
"grad_norm": 77.96696778978115,
"learning_rate": 1.905220085297482e-07,
"logits/chosen": -1.5441091060638428,
"logits/rejected": -1.6405153274536133,
"logps/chosen": -204.56991577148438,
"logps/rejected": -610.9658203125,
"loss": 0.6369,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.25125816464424133,
"rewards/margins": 0.27758753299713135,
"rewards/rejected": -0.5288456678390503,
"step": 328
},
{
"epoch": 0.38065027755749403,
"grad_norm": 70.94819657566394,
"learning_rate": 1.9035773536431955e-07,
"logits/chosen": -1.5916917324066162,
"logits/rejected": -1.529220461845398,
"logps/chosen": -137.5714111328125,
"logps/rejected": -160.11544799804688,
"loss": 0.628,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.20854628086090088,
"rewards/margins": 0.11146115511655807,
"rewards/rejected": -0.32000741362571716,
"step": 330
},
{
"epoch": 0.38295724893663036,
"grad_norm": 74.31467840644032,
"learning_rate": 1.901921228792766e-07,
"logits/chosen": -1.5668599605560303,
"logits/rejected": -1.6017038822174072,
"logps/chosen": -253.0677947998047,
"logps/rejected": -266.9024658203125,
"loss": 0.6419,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.2701232433319092,
"rewards/margins": 0.1171327605843544,
"rewards/rejected": -0.387255996465683,
"step": 332
},
{
"epoch": 0.3852642203157667,
"grad_norm": 80.19418315617096,
"learning_rate": 1.9002517352939596e-07,
"logits/chosen": -1.538657784461975,
"logits/rejected": -1.4902359247207642,
"logps/chosen": -151.844482421875,
"logps/rejected": -182.43423461914062,
"loss": 0.6542,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.20499791204929352,
"rewards/margins": 0.14708584547042847,
"rewards/rejected": -0.3520837724208832,
"step": 334
},
{
"epoch": 0.38757119169490306,
"grad_norm": 78.45881437768317,
"learning_rate": 1.898568897892697e-07,
"logits/chosen": -1.502273440361023,
"logits/rejected": -1.567176342010498,
"logps/chosen": -149.17568969726562,
"logps/rejected": -218.93869018554688,
"loss": 0.6324,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.21270516514778137,
"rewards/margins": 0.24096481502056122,
"rewards/rejected": -0.4536699950695038,
"step": 336
},
{
"epoch": 0.3898781630740394,
"grad_norm": 69.72871536048268,
"learning_rate": 1.8968727415326882e-07,
"logits/chosen": -1.595134973526001,
"logits/rejected": -1.6751508712768555,
"logps/chosen": -112.13485717773438,
"logps/rejected": -138.27838134765625,
"loss": 0.6302,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11406655609607697,
"rewards/margins": 0.13377144932746887,
"rewards/rejected": -0.24783800542354584,
"step": 338
},
{
"epoch": 0.3921851344531757,
"grad_norm": 66.47735099680594,
"learning_rate": 1.8951632913550623e-07,
"logits/chosen": -1.6112767457962036,
"logits/rejected": -1.5350615978240967,
"logps/chosen": -212.4505615234375,
"logps/rejected": -239.0753173828125,
"loss": 0.621,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.12918683886528015,
"rewards/margins": 0.254965603351593,
"rewards/rejected": -0.3841524124145508,
"step": 340
},
{
"epoch": 0.39449210583231203,
"grad_norm": 81.17863346925296,
"learning_rate": 1.8934405726979945e-07,
"logits/chosen": -1.4070253372192383,
"logits/rejected": -1.4879088401794434,
"logps/chosen": -166.3784942626953,
"logps/rejected": -204.57489013671875,
"loss": 0.6395,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.31329959630966187,
"rewards/margins": 0.13568538427352905,
"rewards/rejected": -0.4489849805831909,
"step": 342
},
{
"epoch": 0.39679907721144836,
"grad_norm": 72.25844304700202,
"learning_rate": 1.8917046110963314e-07,
"logits/chosen": -1.6808464527130127,
"logits/rejected": -1.6618741750717163,
"logps/chosen": -184.7408905029297,
"logps/rejected": -213.8212127685547,
"loss": 0.6414,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.1948232203722,
"rewards/margins": 0.18943095207214355,
"rewards/rejected": -0.3842541575431824,
"step": 344
},
{
"epoch": 0.3991060485905847,
"grad_norm": 69.12287284056892,
"learning_rate": 1.8899554322812116e-07,
"logits/chosen": -1.677032470703125,
"logits/rejected": -1.6319351196289062,
"logps/chosen": -114.67143249511719,
"logps/rejected": -125.2265625,
"loss": 0.6256,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.18165619671344757,
"rewards/margins": 0.17791113257408142,
"rewards/rejected": -0.3595673143863678,
"step": 346
},
{
"epoch": 0.401413019969721,
"grad_norm": 68.82861341006546,
"learning_rate": 1.8881930621796846e-07,
"logits/chosen": -1.531043291091919,
"logits/rejected": -1.4552069902420044,
"logps/chosen": -172.90670776367188,
"logps/rejected": -228.29833984375,
"loss": 0.6321,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.21518906950950623,
"rewards/margins": 0.16281384229660034,
"rewards/rejected": -0.37800291180610657,
"step": 348
},
{
"epoch": 0.40371999134885733,
"grad_norm": 79.01675049183694,
"learning_rate": 1.8864175269143273e-07,
"logits/chosen": -1.628811001777649,
"logits/rejected": -1.5073944330215454,
"logps/chosen": -162.4159393310547,
"logps/rejected": -173.65521240234375,
"loss": 0.6361,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.17217856645584106,
"rewards/margins": 0.20255069434642792,
"rewards/rejected": -0.3747292459011078,
"step": 350
},
{
"epoch": 0.40602696272799366,
"grad_norm": 80.14358020089544,
"learning_rate": 1.8846288528028552e-07,
"logits/chosen": -1.2868863344192505,
"logits/rejected": -1.4563894271850586,
"logps/chosen": -176.4993438720703,
"logps/rejected": -219.99745178222656,
"loss": 0.6388,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.34355729818344116,
"rewards/margins": 0.19085751473903656,
"rewards/rejected": -0.5344148278236389,
"step": 352
},
{
"epoch": 0.40833393410713,
"grad_norm": 72.34750725400806,
"learning_rate": 1.8828270663577336e-07,
"logits/chosen": -1.5702780485153198,
"logits/rejected": -1.6198755502700806,
"logps/chosen": -135.76097106933594,
"logps/rejected": -133.5688018798828,
"loss": 0.6593,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.28700345754623413,
"rewards/margins": 0.014538988471031189,
"rewards/rejected": -0.3015424311161041,
"step": 354
},
{
"epoch": 0.4106409054862663,
"grad_norm": 71.70524840332104,
"learning_rate": 1.8810121942857845e-07,
"logits/chosen": -1.5310659408569336,
"logits/rejected": -1.547040343284607,
"logps/chosen": -137.63137817382812,
"logps/rejected": -175.15028381347656,
"loss": 0.6293,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1476406753063202,
"rewards/margins": 0.20084424316883087,
"rewards/rejected": -0.34848493337631226,
"step": 356
},
{
"epoch": 0.41294787686540263,
"grad_norm": 77.60677795627835,
"learning_rate": 1.8791842634877896e-07,
"logits/chosen": -1.546626091003418,
"logits/rejected": -1.6076010465621948,
"logps/chosen": -136.61058044433594,
"logps/rejected": -187.11056518554688,
"loss": 0.6506,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2092825025320053,
"rewards/margins": 0.11802927404642105,
"rewards/rejected": -0.32731181383132935,
"step": 358
},
{
"epoch": 0.41525484824453895,
"grad_norm": 76.22986147865214,
"learning_rate": 1.8773433010580933e-07,
"logits/chosen": -1.5016052722930908,
"logits/rejected": -1.6018908023834229,
"logps/chosen": -129.33348083496094,
"logps/rejected": -151.12342834472656,
"loss": 0.627,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1853492707014084,
"rewards/margins": 0.10909079760313034,
"rewards/rejected": -0.2944400906562805,
"step": 360
},
{
"epoch": 0.4175618196236753,
"grad_norm": 71.86807271397895,
"learning_rate": 1.8754893342842e-07,
"logits/chosen": -1.5751183032989502,
"logits/rejected": -1.4908232688903809,
"logps/chosen": -187.5486602783203,
"logps/rejected": -194.04296875,
"loss": 0.6223,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.27427998185157776,
"rewards/margins": 0.1835474967956543,
"rewards/rejected": -0.45782750844955444,
"step": 362
},
{
"epoch": 0.4198687910028116,
"grad_norm": 70.36519300815779,
"learning_rate": 1.8736223906463695e-07,
"logits/chosen": -1.6419646739959717,
"logits/rejected": -1.6212923526763916,
"logps/chosen": -165.32421875,
"logps/rejected": -171.27830505371094,
"loss": 0.6154,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.21126417815685272,
"rewards/margins": 0.188466876745224,
"rewards/rejected": -0.3997310400009155,
"step": 364
},
{
"epoch": 0.4221757623819479,
"grad_norm": 70.09468918933095,
"learning_rate": 1.8717424978172102e-07,
"logits/chosen": -1.3921918869018555,
"logits/rejected": -1.469792127609253,
"logps/chosen": -167.81964111328125,
"logps/rejected": -210.77825927734375,
"loss": 0.6308,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.2520577609539032,
"rewards/margins": 0.21120049059391022,
"rewards/rejected": -0.463258296251297,
"step": 366
},
{
"epoch": 0.42448273376108425,
"grad_norm": 83.57733506311956,
"learning_rate": 1.8698496836612691e-07,
"logits/chosen": -1.494173288345337,
"logits/rejected": -1.5522290468215942,
"logps/chosen": -163.31491088867188,
"logps/rejected": -189.11239624023438,
"loss": 0.6605,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.2657621204853058,
"rewards/margins": 0.16207075119018555,
"rewards/rejected": -0.42783284187316895,
"step": 368
},
{
"epoch": 0.4267897051402206,
"grad_norm": 81.29498139829452,
"learning_rate": 1.8679439762346184e-07,
"logits/chosen": -1.5649724006652832,
"logits/rejected": -1.6319153308868408,
"logps/chosen": -208.2643585205078,
"logps/rejected": -215.9363555908203,
"loss": 0.6724,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.27036455273628235,
"rewards/margins": 0.1651400327682495,
"rewards/rejected": -0.43550461530685425,
"step": 370
},
{
"epoch": 0.42909667651935696,
"grad_norm": 76.18451864107462,
"learning_rate": 1.8660254037844388e-07,
"logits/chosen": -1.4427084922790527,
"logits/rejected": -1.5188959836959839,
"logps/chosen": -171.85968017578125,
"logps/rejected": -233.1151580810547,
"loss": 0.629,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.27071186900138855,
"rewards/margins": 0.2559873163700104,
"rewards/rejected": -0.5266991853713989,
"step": 372
},
{
"epoch": 0.4314036478984933,
"grad_norm": 82.63010621157098,
"learning_rate": 1.8640939947486023e-07,
"logits/chosen": -1.5887802839279175,
"logits/rejected": -1.355837106704712,
"logps/chosen": -242.5066375732422,
"logps/rejected": -230.2034912109375,
"loss": 0.6329,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.3870730698108673,
"rewards/margins": 0.15506887435913086,
"rewards/rejected": -0.5421419143676758,
"step": 374
},
{
"epoch": 0.4337106192776296,
"grad_norm": 59.14499379914714,
"learning_rate": 1.8621497777552505e-07,
"logits/chosen": -1.420657992362976,
"logits/rejected": -1.4776450395584106,
"logps/chosen": -127.46673583984375,
"logps/rejected": -184.2600860595703,
"loss": 0.5869,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.15772147476673126,
"rewards/margins": 0.3883221745491028,
"rewards/rejected": -0.5460436344146729,
"step": 376
},
{
"epoch": 0.43601759065676593,
"grad_norm": 76.51933767322383,
"learning_rate": 1.8601927816223695e-07,
"logits/chosen": -1.3575465679168701,
"logits/rejected": -1.3156774044036865,
"logps/chosen": -218.0836944580078,
"logps/rejected": -228.03778076171875,
"loss": 0.6557,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4051874279975891,
"rewards/margins": 0.143568217754364,
"rewards/rejected": -0.5487555861473083,
"step": 378
},
{
"epoch": 0.43832456203590225,
"grad_norm": 61.424133205634206,
"learning_rate": 1.8582230353573624e-07,
"logits/chosen": -1.4618622064590454,
"logits/rejected": -1.4945478439331055,
"logps/chosen": -95.66145324707031,
"logps/rejected": -135.7235870361328,
"loss": 0.6206,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1784054934978485,
"rewards/margins": 0.23733605444431305,
"rewards/rejected": -0.415741503238678,
"step": 380
},
{
"epoch": 0.4406315334150386,
"grad_norm": 64.92661329207279,
"learning_rate": 1.8562405681566214e-07,
"logits/chosen": -1.5636019706726074,
"logits/rejected": -1.5756021738052368,
"logps/chosen": -201.42442321777344,
"logps/rejected": -188.35606384277344,
"loss": 0.6289,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3109050691127777,
"rewards/margins": 0.10487519204616547,
"rewards/rejected": -0.415780246257782,
"step": 382
},
{
"epoch": 0.4429385047941749,
"grad_norm": 83.39366061705226,
"learning_rate": 1.854245409405092e-07,
"logits/chosen": -1.6649830341339111,
"logits/rejected": -1.5097665786743164,
"logps/chosen": -217.35536193847656,
"logps/rejected": -223.5187225341797,
"loss": 0.6113,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.2543387711048126,
"rewards/margins": 0.2463696151971817,
"rewards/rejected": -0.5007083415985107,
"step": 384
},
{
"epoch": 0.4452454761733112,
"grad_norm": 74.49558456416251,
"learning_rate": 1.852237588675841e-07,
"logits/chosen": -1.582183599472046,
"logits/rejected": -1.7068113088607788,
"logps/chosen": -162.75521850585938,
"logps/rejected": -220.6885986328125,
"loss": 0.5992,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.21387754380702972,
"rewards/margins": 0.31847310066223145,
"rewards/rejected": -0.5323505997657776,
"step": 386
},
{
"epoch": 0.44755244755244755,
"grad_norm": 72.0795411450381,
"learning_rate": 1.850217135729614e-07,
"logits/chosen": -1.605985164642334,
"logits/rejected": -1.5858122110366821,
"logps/chosen": -196.78073120117188,
"logps/rejected": -213.26580810546875,
"loss": 0.6034,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.44325143098831177,
"rewards/margins": 0.07666480541229248,
"rewards/rejected": -0.5199161767959595,
"step": 388
},
{
"epoch": 0.4498594189315839,
"grad_norm": 72.48651390442274,
"learning_rate": 1.8481840805143987e-07,
"logits/chosen": -1.5632058382034302,
"logits/rejected": -1.5244344472885132,
"logps/chosen": -127.80747985839844,
"logps/rejected": -152.81256103515625,
"loss": 0.6163,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1298586130142212,
"rewards/margins": 0.42240971326828003,
"rewards/rejected": -0.5522683262825012,
"step": 390
},
{
"epoch": 0.4521663903107202,
"grad_norm": 74.34299341635638,
"learning_rate": 1.8461384531649773e-07,
"logits/chosen": -1.4820444583892822,
"logits/rejected": -1.605046033859253,
"logps/chosen": -105.68638610839844,
"logps/rejected": -156.26785278320312,
"loss": 0.6202,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1893598437309265,
"rewards/margins": 0.2589360773563385,
"rewards/rejected": -0.4482958912849426,
"step": 392
},
{
"epoch": 0.4544733616898565,
"grad_norm": 76.36773452235572,
"learning_rate": 1.844080284002482e-07,
"logits/chosen": -1.5065568685531616,
"logits/rejected": -1.5656404495239258,
"logps/chosen": -158.7242889404297,
"logps/rejected": -228.84844970703125,
"loss": 0.6139,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.251006543636322,
"rewards/margins": 0.21102304756641388,
"rewards/rejected": -0.46202951669692993,
"step": 394
},
{
"epoch": 0.45678033306899285,
"grad_norm": 71.03674812873284,
"learning_rate": 1.8420096035339452e-07,
"logits/chosen": -1.5289005041122437,
"logits/rejected": -1.527197003364563,
"logps/chosen": -200.40029907226562,
"logps/rejected": -212.3697967529297,
"loss": 0.6187,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.2883009612560272,
"rewards/margins": 0.30317747592926025,
"rewards/rejected": -0.5914784073829651,
"step": 396
},
{
"epoch": 0.4590873044481292,
"grad_norm": 81.19707296013529,
"learning_rate": 1.8399264424518465e-07,
"logits/chosen": -1.494114875793457,
"logits/rejected": -1.4553757905960083,
"logps/chosen": -173.10043334960938,
"logps/rejected": -222.2396240234375,
"loss": 0.5955,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.3878926932811737,
"rewards/margins": 0.3027462959289551,
"rewards/rejected": -0.6906389594078064,
"step": 398
},
{
"epoch": 0.4613942758272655,
"grad_norm": 89.13135103863338,
"learning_rate": 1.8378308316336582e-07,
"logits/chosen": -1.618680715560913,
"logits/rejected": -1.5578938722610474,
"logps/chosen": -191.10128784179688,
"logps/rejected": -280.5110778808594,
"loss": 0.6411,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4683380126953125,
"rewards/margins": 0.19769813120365143,
"rewards/rejected": -0.6660361289978027,
"step": 400
},
{
"epoch": 0.4613942758272655,
"eval_logits/chosen": -1.4853571653366089,
"eval_logits/rejected": -1.3932629823684692,
"eval_logps/chosen": -189.0384521484375,
"eval_logps/rejected": -156.24160766601562,
"eval_loss": 0.654194176197052,
"eval_rewards/accuracies": 0.6800000071525574,
"eval_rewards/chosen": -0.38562828302383423,
"eval_rewards/margins": 0.18699264526367188,
"eval_rewards/rejected": -0.5726209282875061,
"eval_runtime": 26.5299,
"eval_samples_per_second": 3.769,
"eval_steps_per_second": 0.942,
"step": 400
},
{
"epoch": 0.4637012472064018,
"grad_norm": 69.21606890792003,
"learning_rate": 1.8357228021413883e-07,
"logits/chosen": -1.5431230068206787,
"logits/rejected": -1.7365866899490356,
"logps/chosen": -147.3966827392578,
"logps/rejected": -170.9712371826172,
"loss": 0.6581,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.30663323402404785,
"rewards/margins": 0.11269617080688477,
"rewards/rejected": -0.4193294048309326,
"step": 402
},
{
"epoch": 0.46600821858553815,
"grad_norm": 78.7990153253576,
"learning_rate": 1.8336023852211194e-07,
"logits/chosen": -1.5721492767333984,
"logits/rejected": -1.4822769165039062,
"logps/chosen": -148.9419403076172,
"logps/rejected": -158.44668579101562,
"loss": 0.609,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.27455994486808777,
"rewards/margins": 0.3990754187107086,
"rewards/rejected": -0.6736353039741516,
"step": 404
},
{
"epoch": 0.4683151899646745,
"grad_norm": 67.81492283153628,
"learning_rate": 1.8314696123025453e-07,
"logits/chosen": -1.6370363235473633,
"logits/rejected": -1.5174671411514282,
"logps/chosen": -145.17050170898438,
"logps/rejected": -142.74551391601562,
"loss": 0.6312,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.28109437227249146,
"rewards/margins": 0.2069387137889862,
"rewards/rejected": -0.48803308606147766,
"step": 406
},
{
"epoch": 0.47062216134381085,
"grad_norm": 78.2843593072173,
"learning_rate": 1.8293245149985053e-07,
"logits/chosen": -1.5488444566726685,
"logits/rejected": -1.4798938035964966,
"logps/chosen": -161.83570861816406,
"logps/rejected": -162.7615509033203,
"loss": 0.6484,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.2718978822231293,
"rewards/margins": 0.15639187395572662,
"rewards/rejected": -0.4282897710800171,
"step": 408
},
{
"epoch": 0.4729291327229472,
"grad_norm": 73.10449012391845,
"learning_rate": 1.827167125104517e-07,
"logits/chosen": -1.4978845119476318,
"logits/rejected": -1.4839560985565186,
"logps/chosen": -148.445556640625,
"logps/rejected": -161.85986328125,
"loss": 0.6481,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.27761712670326233,
"rewards/margins": 0.09577606618404388,
"rewards/rejected": -0.3733932077884674,
"step": 410
},
{
"epoch": 0.4752361041020835,
"grad_norm": 77.23312704566136,
"learning_rate": 1.8249974745983021e-07,
"logits/chosen": -1.4896149635314941,
"logits/rejected": -1.4279950857162476,
"logps/chosen": -136.3888397216797,
"logps/rejected": -184.14625549316406,
"loss": 0.6186,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3546374440193176,
"rewards/margins": 0.3140718638896942,
"rewards/rejected": -0.6687093377113342,
"step": 412
},
{
"epoch": 0.4775430754812198,
"grad_norm": 65.58481770102698,
"learning_rate": 1.822815595639316e-07,
"logits/chosen": -1.4790016412734985,
"logits/rejected": -1.525940179824829,
"logps/chosen": -162.99288940429688,
"logps/rejected": -190.2974853515625,
"loss": 0.6112,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.36069726943969727,
"rewards/margins": 0.20576652884483337,
"rewards/rejected": -0.5664637684822083,
"step": 414
},
{
"epoch": 0.47985004686035615,
"grad_norm": 68.7972400850831,
"learning_rate": 1.820621520568268e-07,
"logits/chosen": -1.5574984550476074,
"logits/rejected": -1.4820420742034912,
"logps/chosen": -178.15878295898438,
"logps/rejected": -191.66177368164062,
"loss": 0.6,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.314214825630188,
"rewards/margins": 0.32970941066741943,
"rewards/rejected": -0.6439242362976074,
"step": 416
},
{
"epoch": 0.4821570182394925,
"grad_norm": 77.22458475405976,
"learning_rate": 1.8184152819066434e-07,
"logits/chosen": -1.5454033613204956,
"logits/rejected": -1.5681257247924805,
"logps/chosen": -206.4539031982422,
"logps/rejected": -221.17599487304688,
"loss": 0.6395,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4876091778278351,
"rewards/margins": 0.06031504273414612,
"rewards/rejected": -0.5479242205619812,
"step": 418
},
{
"epoch": 0.4844639896186288,
"grad_norm": 69.59230881656185,
"learning_rate": 1.8161969123562217e-07,
"logits/chosen": -1.54752516746521,
"logits/rejected": -1.5821384191513062,
"logps/chosen": -182.0235137939453,
"logps/rejected": -163.29364013671875,
"loss": 0.6107,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3480142056941986,
"rewards/margins": 0.3120378255844116,
"rewards/rejected": -0.6600520610809326,
"step": 420
},
{
"epoch": 0.4867709609977651,
"grad_norm": 68.29468448816121,
"learning_rate": 1.813966444798591e-07,
"logits/chosen": -1.513810634613037,
"logits/rejected": -1.4666978120803833,
"logps/chosen": -204.99462890625,
"logps/rejected": -204.5595245361328,
"loss": 0.6143,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3375055491924286,
"rewards/margins": 0.3794183135032654,
"rewards/rejected": -0.7169238328933716,
"step": 422
},
{
"epoch": 0.48907793237690145,
"grad_norm": 73.69015362328696,
"learning_rate": 1.8117239122946611e-07,
"logits/chosen": -1.3477180004119873,
"logits/rejected": -1.4509586095809937,
"logps/chosen": -118.67777252197266,
"logps/rejected": -176.48667907714844,
"loss": 0.6192,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.3034321069717407,
"rewards/margins": 0.12479298561811447,
"rewards/rejected": -0.4282251298427582,
"step": 424
},
{
"epoch": 0.49138490375603777,
"grad_norm": 78.31541581493791,
"learning_rate": 1.809469348084174e-07,
"logits/chosen": -1.459653377532959,
"logits/rejected": -1.5776402950286865,
"logps/chosen": -159.45347595214844,
"logps/rejected": -189.2720489501953,
"loss": 0.6554,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.37468722462654114,
"rewards/margins": 0.1383470892906189,
"rewards/rejected": -0.5130342841148376,
"step": 426
},
{
"epoch": 0.4936918751351741,
"grad_norm": 130.5379676824635,
"learning_rate": 1.8072027855852095e-07,
"logits/chosen": -1.4528967142105103,
"logits/rejected": -1.423844814300537,
"logps/chosen": -172.85316467285156,
"logps/rejected": -215.22189331054688,
"loss": 0.6639,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.41784724593162537,
"rewards/margins": 0.3192124366760254,
"rewards/rejected": -0.7370596528053284,
"step": 428
},
{
"epoch": 0.4959988465143104,
"grad_norm": 63.21984381769687,
"learning_rate": 1.8049242583936918e-07,
"logits/chosen": -1.5084190368652344,
"logits/rejected": -1.4574109315872192,
"logps/chosen": -165.896484375,
"logps/rejected": -227.423828125,
"loss": 0.5893,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.25652381777763367,
"rewards/margins": 0.47441697120666504,
"rewards/rejected": -0.7309407591819763,
"step": 430
},
{
"epoch": 0.49830581789344675,
"grad_norm": 71.69590925642426,
"learning_rate": 1.802633800282891e-07,
"logits/chosen": -1.516315221786499,
"logits/rejected": -1.6526371240615845,
"logps/chosen": -229.77777099609375,
"logps/rejected": -292.7660827636719,
"loss": 0.5979,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3780279755592346,
"rewards/margins": 0.49888893961906433,
"rewards/rejected": -0.8769169449806213,
"step": 432
},
{
"epoch": 0.5006127892725831,
"grad_norm": 72.54608833334152,
"learning_rate": 1.8003314452029213e-07,
"logits/chosen": -1.5792149305343628,
"logits/rejected": -1.550574779510498,
"logps/chosen": -226.616455078125,
"logps/rejected": -228.4210205078125,
"loss": 0.6046,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5066580176353455,
"rewards/margins": 0.34013134241104126,
"rewards/rejected": -0.8467893600463867,
"step": 434
},
{
"epoch": 0.5029197606517194,
"grad_norm": 73.04169645370872,
"learning_rate": 1.7980172272802395e-07,
"logits/chosen": -1.5109785795211792,
"logits/rejected": -1.499125361442566,
"logps/chosen": -154.92233276367188,
"logps/rejected": -175.07643127441406,
"loss": 0.5817,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.25602594017982483,
"rewards/margins": 0.5013114809989929,
"rewards/rejected": -0.7573373913764954,
"step": 436
},
{
"epoch": 0.5052267320308558,
"grad_norm": 69.05059334922119,
"learning_rate": 1.7956911808171373e-07,
"logits/chosen": -1.561600923538208,
"logits/rejected": -1.5301151275634766,
"logps/chosen": -217.26930236816406,
"logps/rejected": -240.7093048095703,
"loss": 0.6151,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.46973368525505066,
"rewards/margins": 0.2093038558959961,
"rewards/rejected": -0.6790375113487244,
"step": 438
},
{
"epoch": 0.507533703409992,
"grad_norm": 74.68873536524164,
"learning_rate": 1.793353340291235e-07,
"logits/chosen": -1.3198765516281128,
"logits/rejected": -1.4805912971496582,
"logps/chosen": -175.9479217529297,
"logps/rejected": -226.83265686035156,
"loss": 0.6134,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5549490451812744,
"rewards/margins": 0.23202911019325256,
"rewards/rejected": -0.7869781851768494,
"step": 440
},
{
"epoch": 0.5098406747891284,
"grad_norm": 73.37532376774183,
"learning_rate": 1.7910037403549692e-07,
"logits/chosen": -1.4717934131622314,
"logits/rejected": -1.5461549758911133,
"logps/chosen": -159.91883850097656,
"logps/rejected": -204.87376403808594,
"loss": 0.6459,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4326345920562744,
"rewards/margins": 0.22945694625377655,
"rewards/rejected": -0.6620914936065674,
"step": 442
},
{
"epoch": 0.5121476461682647,
"grad_norm": 69.28741446430803,
"learning_rate": 1.7886424158350782e-07,
"logits/chosen": -1.5604138374328613,
"logits/rejected": -1.663907766342163,
"logps/chosen": -158.54408264160156,
"logps/rejected": -192.7698516845703,
"loss": 0.5921,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3090921640396118,
"rewards/margins": 0.3891502916812897,
"rewards/rejected": -0.6982424855232239,
"step": 444
},
{
"epoch": 0.5144546175474011,
"grad_norm": 77.66154968693108,
"learning_rate": 1.7862694017320886e-07,
"logits/chosen": -1.3435657024383545,
"logits/rejected": -1.3843066692352295,
"logps/chosen": -174.62672424316406,
"logps/rejected": -288.0128173828125,
"loss": 0.6145,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4857187271118164,
"rewards/margins": 0.4818662703037262,
"rewards/rejected": -0.9675850868225098,
"step": 446
},
{
"epoch": 0.5167615889265373,
"grad_norm": 86.0701716220196,
"learning_rate": 1.7838847332197937e-07,
"logits/chosen": -1.4369436502456665,
"logits/rejected": -1.5111709833145142,
"logps/chosen": -193.0187225341797,
"logps/rejected": -258.660400390625,
"loss": 0.6179,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.4460400640964508,
"rewards/margins": 0.4027029871940613,
"rewards/rejected": -0.8487430810928345,
"step": 448
},
{
"epoch": 0.5190685603056737,
"grad_norm": 84.40844346826594,
"learning_rate": 1.7814884456447335e-07,
"logits/chosen": -1.5306761264801025,
"logits/rejected": -1.4944154024124146,
"logps/chosen": -195.49612426757812,
"logps/rejected": -222.01425170898438,
"loss": 0.6006,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.2904947102069855,
"rewards/margins": 0.5166550874710083,
"rewards/rejected": -0.8071498870849609,
"step": 450
},
{
"epoch": 0.52137553168481,
"grad_norm": 86.3712126774886,
"learning_rate": 1.7790805745256703e-07,
"logits/chosen": -1.3275847434997559,
"logits/rejected": -1.38175630569458,
"logps/chosen": -136.90707397460938,
"logps/rejected": -184.36331176757812,
"loss": 0.6767,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.44699156284332275,
"rewards/margins": 0.12617343664169312,
"rewards/rejected": -0.5731649398803711,
"step": 452
},
{
"epoch": 0.5236825030639464,
"grad_norm": 66.61833278109548,
"learning_rate": 1.7766611555530635e-07,
"logits/chosen": -1.6141921281814575,
"logits/rejected": -1.5151243209838867,
"logps/chosen": -156.77407836914062,
"logps/rejected": -154.7230682373047,
"loss": 0.5733,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.3759933114051819,
"rewards/margins": 0.17464786767959595,
"rewards/rejected": -0.5506411790847778,
"step": 454
},
{
"epoch": 0.5259894744430826,
"grad_norm": 69.26758309677136,
"learning_rate": 1.774230224588538e-07,
"logits/chosen": -1.3204282522201538,
"logits/rejected": -1.4286822080612183,
"logps/chosen": -152.52542114257812,
"logps/rejected": -232.16189575195312,
"loss": 0.5494,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4260653853416443,
"rewards/margins": 0.5102941989898682,
"rewards/rejected": -0.9363595247268677,
"step": 456
},
{
"epoch": 0.528296445822219,
"grad_norm": 81.07739462727531,
"learning_rate": 1.771787817664356e-07,
"logits/chosen": -1.508811116218567,
"logits/rejected": -1.5395921468734741,
"logps/chosen": -134.4735565185547,
"logps/rejected": -166.41592407226562,
"loss": 0.6351,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.49481019377708435,
"rewards/margins": 0.1262877732515335,
"rewards/rejected": -0.6210979223251343,
"step": 458
},
{
"epoch": 0.5306034172013554,
"grad_norm": 86.01343093557993,
"learning_rate": 1.769333970982879e-07,
"logits/chosen": -1.518664836883545,
"logits/rejected": -1.3482635021209717,
"logps/chosen": -173.78538513183594,
"logps/rejected": -160.53573608398438,
"loss": 0.5857,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.49463319778442383,
"rewards/margins": 0.202806293964386,
"rewards/rejected": -0.6974395513534546,
"step": 460
},
{
"epoch": 0.5329103885804917,
"grad_norm": 85.16027410016599,
"learning_rate": 1.766868720916035e-07,
"logits/chosen": -1.359481930732727,
"logits/rejected": -1.3029265403747559,
"logps/chosen": -134.05616760253906,
"logps/rejected": -134.0654754638672,
"loss": 0.6487,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4239296019077301,
"rewards/margins": 0.03123108297586441,
"rewards/rejected": -0.4551607072353363,
"step": 462
},
{
"epoch": 0.535217359959628,
"grad_norm": 84.5629811685175,
"learning_rate": 1.7643921040047766e-07,
"logits/chosen": -1.6018937826156616,
"logits/rejected": -1.6816954612731934,
"logps/chosen": -237.3992919921875,
"logps/rejected": -253.08688354492188,
"loss": 0.597,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.6288573741912842,
"rewards/margins": 0.15610165894031525,
"rewards/rejected": -0.7849590182304382,
"step": 464
},
{
"epoch": 0.5375243313387643,
"grad_norm": 80.72916842158041,
"learning_rate": 1.7619041569585418e-07,
"logits/chosen": -1.4444328546524048,
"logits/rejected": -1.4673030376434326,
"logps/chosen": -170.2801971435547,
"logps/rejected": -214.7718963623047,
"loss": 0.6181,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.564181923866272,
"rewards/margins": 0.2302751988172531,
"rewards/rejected": -0.7944571375846863,
"step": 466
},
{
"epoch": 0.5398313027179007,
"grad_norm": 76.00828750498393,
"learning_rate": 1.759404916654707e-07,
"logits/chosen": -1.4668854475021362,
"logits/rejected": -1.421462059020996,
"logps/chosen": -360.7674560546875,
"logps/rejected": -301.1515197753906,
"loss": 0.6139,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6432144641876221,
"rewards/margins": 0.3255874514579773,
"rewards/rejected": -0.9688019156455994,
"step": 468
},
{
"epoch": 0.542138274097037,
"grad_norm": 75.00038820917719,
"learning_rate": 1.756894420138043e-07,
"logits/chosen": -1.5766559839248657,
"logits/rejected": -1.656800627708435,
"logps/chosen": -216.8627471923828,
"logps/rejected": -270.90850830078125,
"loss": 0.615,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4949862062931061,
"rewards/margins": 0.4039486050605774,
"rewards/rejected": -0.8989347815513611,
"step": 470
},
{
"epoch": 0.5444452454761733,
"grad_norm": 86.17675092820859,
"learning_rate": 1.754372704620164e-07,
"logits/chosen": -1.4618090391159058,
"logits/rejected": -1.5533053874969482,
"logps/chosen": -202.59561157226562,
"logps/rejected": -221.70413208007812,
"loss": 0.6478,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.44822004437446594,
"rewards/margins": 0.28794264793395996,
"rewards/rejected": -0.7361626625061035,
"step": 472
},
{
"epoch": 0.5467522168553096,
"grad_norm": 72.36150215283246,
"learning_rate": 1.7518398074789774e-07,
"logits/chosen": -1.4804517030715942,
"logits/rejected": -1.5212501287460327,
"logps/chosen": -195.58935546875,
"logps/rejected": -247.99276733398438,
"loss": 0.553,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.44707149267196655,
"rewards/margins": 0.6286894679069519,
"rewards/rejected": -1.0757609605789185,
"step": 474
},
{
"epoch": 0.549059188234446,
"grad_norm": 73.94947964279808,
"learning_rate": 1.7492957662581294e-07,
"logits/chosen": -1.3577089309692383,
"logits/rejected": -1.4486963748931885,
"logps/chosen": -133.3319091796875,
"logps/rejected": -188.2812957763672,
"loss": 0.6001,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.34889039397239685,
"rewards/margins": 0.3021068871021271,
"rewards/rejected": -0.6509972214698792,
"step": 476
},
{
"epoch": 0.5513661596135823,
"grad_norm": 74.0047644626624,
"learning_rate": 1.7467406186664473e-07,
"logits/chosen": -1.5747010707855225,
"logits/rejected": -1.5058567523956299,
"logps/chosen": -216.6630401611328,
"logps/rejected": -223.66598510742188,
"loss": 0.6345,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5371094346046448,
"rewards/margins": 0.3996596932411194,
"rewards/rejected": -0.9367691874504089,
"step": 478
},
{
"epoch": 0.5536731309927186,
"grad_norm": 50.915761396824145,
"learning_rate": 1.7441744025773834e-07,
"logits/chosen": -1.4014126062393188,
"logits/rejected": -1.569306492805481,
"logps/chosen": -156.43629455566406,
"logps/rejected": -228.84625244140625,
"loss": 0.5975,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.399608850479126,
"rewards/margins": 0.29513585567474365,
"rewards/rejected": -0.6947447061538696,
"step": 480
},
{
"epoch": 0.5559801023718549,
"grad_norm": 80.40246802194461,
"learning_rate": 1.74159715602845e-07,
"logits/chosen": -1.49760103225708,
"logits/rejected": -1.4302232265472412,
"logps/chosen": -152.4906005859375,
"logps/rejected": -165.43942260742188,
"loss": 0.6511,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4252295196056366,
"rewards/margins": 0.12136977910995483,
"rewards/rejected": -0.5465993285179138,
"step": 482
},
{
"epoch": 0.5582870737509913,
"grad_norm": 70.56990492477674,
"learning_rate": 1.739008917220659e-07,
"logits/chosen": -1.4919289350509644,
"logits/rejected": -1.5267033576965332,
"logps/chosen": -187.85191345214844,
"logps/rejected": -220.8524169921875,
"loss": 0.5689,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5196070671081543,
"rewards/margins": 0.3590528666973114,
"rewards/rejected": -0.8786599636077881,
"step": 484
},
{
"epoch": 0.5605940451301276,
"grad_norm": 78.98020718967784,
"learning_rate": 1.7364097245179527e-07,
"logits/chosen": -1.599880337715149,
"logits/rejected": -1.5224246978759766,
"logps/chosen": -196.72555541992188,
"logps/rejected": -213.14309692382812,
"loss": 0.5892,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5108906030654907,
"rewards/margins": 0.1902090609073639,
"rewards/rejected": -0.701099693775177,
"step": 486
},
{
"epoch": 0.5629010165092639,
"grad_norm": 75.35371757401214,
"learning_rate": 1.733799616446637e-07,
"logits/chosen": -1.4978597164154053,
"logits/rejected": -1.5102261304855347,
"logps/chosen": -186.15167236328125,
"logps/rejected": -226.00375366210938,
"loss": 0.6112,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.43081170320510864,
"rewards/margins": 0.36774906516075134,
"rewards/rejected": -0.7985607385635376,
"step": 488
},
{
"epoch": 0.5652079878884002,
"grad_norm": 75.43303696622675,
"learning_rate": 1.7311786316948108e-07,
"logits/chosen": -1.418121337890625,
"logits/rejected": -1.4920923709869385,
"logps/chosen": -179.17889404296875,
"logps/rejected": -229.40098571777344,
"loss": 0.5938,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6558996438980103,
"rewards/margins": 0.27045130729675293,
"rewards/rejected": -0.9263509511947632,
"step": 490
},
{
"epoch": 0.5675149592675366,
"grad_norm": 71.0686050492484,
"learning_rate": 1.7285468091117904e-07,
"logits/chosen": -1.4989047050476074,
"logits/rejected": -1.4156945943832397,
"logps/chosen": -153.10214233398438,
"logps/rejected": -172.13262939453125,
"loss": 0.5901,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4824844300746918,
"rewards/margins": 0.44079095125198364,
"rewards/rejected": -0.9232754707336426,
"step": 492
},
{
"epoch": 0.569821930646673,
"grad_norm": 67.99918941849218,
"learning_rate": 1.7259041877075352e-07,
"logits/chosen": -1.430630087852478,
"logits/rejected": -1.3989218473434448,
"logps/chosen": -209.73452758789062,
"logps/rejected": -254.0313720703125,
"loss": 0.5729,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5363369584083557,
"rewards/margins": 0.5890082120895386,
"rewards/rejected": -1.125345230102539,
"step": 494
},
{
"epoch": 0.5721289020258092,
"grad_norm": 78.40754956054191,
"learning_rate": 1.7232508066520698e-07,
"logits/chosen": -1.5510261058807373,
"logits/rejected": -1.5487847328186035,
"logps/chosen": -211.16983032226562,
"logps/rejected": -240.33824157714844,
"loss": 0.5772,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4531714916229248,
"rewards/margins": 0.2688879370689392,
"rewards/rejected": -0.7220594882965088,
"step": 496
},
{
"epoch": 0.5744358734049456,
"grad_norm": 61.990430466819326,
"learning_rate": 1.7205867052749023e-07,
"logits/chosen": -1.363396167755127,
"logits/rejected": -1.3964465856552124,
"logps/chosen": -147.12242126464844,
"logps/rejected": -180.23667907714844,
"loss": 0.6459,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5750865340232849,
"rewards/margins": 0.11157172918319702,
"rewards/rejected": -0.6866582632064819,
"step": 498
},
{
"epoch": 0.5767428447840819,
"grad_norm": 76.0573953537264,
"learning_rate": 1.717911923064442e-07,
"logits/chosen": -1.5747530460357666,
"logits/rejected": -1.4509817361831665,
"logps/chosen": -181.61216735839844,
"logps/rejected": -153.97573852539062,
"loss": 0.6012,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.5273740887641907,
"rewards/margins": 0.1454104781150818,
"rewards/rejected": -0.6727845668792725,
"step": 500
},
{
"epoch": 0.5767428447840819,
"eval_logits/chosen": -1.440444827079773,
"eval_logits/rejected": -1.3533989191055298,
"eval_logps/chosen": -191.4648895263672,
"eval_logps/rejected": -158.6099395751953,
"eval_loss": 0.636239767074585,
"eval_rewards/accuracies": 0.6800000071525574,
"eval_rewards/chosen": -0.628268837928772,
"eval_rewards/margins": 0.18118661642074585,
"eval_rewards/rejected": -0.809455394744873,
"eval_runtime": 37.9799,
"eval_samples_per_second": 2.633,
"eval_steps_per_second": 0.658,
"step": 500
},
{
"epoch": 0.5790498161632183,
"grad_norm": 77.78220283215643,
"learning_rate": 1.7152264996674135e-07,
"logits/chosen": -1.4428610801696777,
"logits/rejected": -1.2872042655944824,
"logps/chosen": -184.39501953125,
"logps/rejected": -238.38723754882812,
"loss": 0.5953,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6475786566734314,
"rewards/margins": 0.2779845893383026,
"rewards/rejected": -0.9255632758140564,
"step": 502
},
{
"epoch": 0.5813567875423545,
"grad_norm": 93.29916680291039,
"learning_rate": 1.71253047488827e-07,
"logits/chosen": -1.4898688793182373,
"logits/rejected": -1.5620332956314087,
"logps/chosen": -178.47802734375,
"logps/rejected": -205.5224609375,
"loss": 0.6703,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5680350065231323,
"rewards/margins": 0.18766377866268158,
"rewards/rejected": -0.7556988000869751,
"step": 504
},
{
"epoch": 0.5836637589214909,
"grad_norm": 77.19105499219319,
"learning_rate": 1.7098238886886024e-07,
"logits/chosen": -1.4835506677627563,
"logits/rejected": -1.5302045345306396,
"logps/chosen": -203.8736114501953,
"logps/rejected": -228.69265747070312,
"loss": 0.5951,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.47867119312286377,
"rewards/margins": 0.22942683100700378,
"rewards/rejected": -0.7080979943275452,
"step": 506
},
{
"epoch": 0.5859707303006272,
"grad_norm": 67.4261860354,
"learning_rate": 1.7071067811865473e-07,
"logits/chosen": -1.4649958610534668,
"logits/rejected": -1.4145183563232422,
"logps/chosen": -199.42066955566406,
"logps/rejected": -235.40292358398438,
"loss": 0.5368,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.4195340573787689,
"rewards/margins": 0.551209032535553,
"rewards/rejected": -0.9707430601119995,
"step": 508
},
{
"epoch": 0.5882777016797636,
"grad_norm": 87.85240065033273,
"learning_rate": 1.7043791926561932e-07,
"logits/chosen": -1.5964919328689575,
"logits/rejected": -1.561856746673584,
"logps/chosen": -201.67276000976562,
"logps/rejected": -234.04359436035156,
"loss": 0.651,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6162290573120117,
"rewards/margins": 0.4439167082309723,
"rewards/rejected": -1.0601458549499512,
"step": 510
},
{
"epoch": 0.5905846730588998,
"grad_norm": 62.42968300457303,
"learning_rate": 1.7016411635269815e-07,
"logits/chosen": -1.4615092277526855,
"logits/rejected": -1.4488492012023926,
"logps/chosen": -151.2560577392578,
"logps/rejected": -176.4474334716797,
"loss": 0.609,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.33995571732521057,
"rewards/margins": 0.2483442723751068,
"rewards/rejected": -0.5882999897003174,
"step": 512
},
{
"epoch": 0.5928916444380362,
"grad_norm": 74.39629379240114,
"learning_rate": 1.6988927343831091e-07,
"logits/chosen": -1.5747379064559937,
"logits/rejected": -1.4773468971252441,
"logps/chosen": -198.891845703125,
"logps/rejected": -210.0729522705078,
"loss": 0.61,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.47531554102897644,
"rewards/margins": 0.47791624069213867,
"rewards/rejected": -0.9532317519187927,
"step": 514
},
{
"epoch": 0.5951986158171725,
"grad_norm": 70.19350216590036,
"learning_rate": 1.6961339459629266e-07,
"logits/chosen": -1.4481630325317383,
"logits/rejected": -1.4714566469192505,
"logps/chosen": -190.8370361328125,
"logps/rejected": -242.71621704101562,
"loss": 0.5872,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5172877907752991,
"rewards/margins": 0.48140281438827515,
"rewards/rejected": -0.998690664768219,
"step": 516
},
{
"epoch": 0.5975055871963089,
"grad_norm": 73.75535823993799,
"learning_rate": 1.6933648391583328e-07,
"logits/chosen": -1.531792163848877,
"logits/rejected": -1.4680547714233398,
"logps/chosen": -144.9717559814453,
"logps/rejected": -172.87686157226562,
"loss": 0.6006,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.3757992386817932,
"rewards/margins": 0.35130438208580017,
"rewards/rejected": -0.7271036505699158,
"step": 518
},
{
"epoch": 0.5998125585754451,
"grad_norm": 69.85303523035323,
"learning_rate": 1.6905854550141714e-07,
"logits/chosen": -1.5805073976516724,
"logits/rejected": -1.5384862422943115,
"logps/chosen": -171.9115753173828,
"logps/rejected": -169.82862854003906,
"loss": 0.5875,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.5081273317337036,
"rewards/margins": 0.2863667607307434,
"rewards/rejected": -0.794494092464447,
"step": 520
},
{
"epoch": 0.6021195299545815,
"grad_norm": 69.03602758187714,
"learning_rate": 1.6877958347276197e-07,
"logits/chosen": -1.4844419956207275,
"logits/rejected": -1.4906061887741089,
"logps/chosen": -149.6005859375,
"logps/rejected": -163.59097290039062,
"loss": 0.6013,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.42841285467147827,
"rewards/margins": 0.30834630131721497,
"rewards/rejected": -0.7367592453956604,
"step": 522
},
{
"epoch": 0.6044265013337178,
"grad_norm": 80.75337933099041,
"learning_rate": 1.6849960196475805e-07,
"logits/chosen": -1.5245236158370972,
"logits/rejected": -1.5345442295074463,
"logps/chosen": -148.5638885498047,
"logps/rejected": -178.37429809570312,
"loss": 0.5909,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3656730651855469,
"rewards/margins": 0.3520704507827759,
"rewards/rejected": -0.7177435159683228,
"step": 524
},
{
"epoch": 0.6067334727128542,
"grad_norm": 79.6488573037571,
"learning_rate": 1.682186051274067e-07,
"logits/chosen": -1.4462357759475708,
"logits/rejected": -1.4616801738739014,
"logps/chosen": -144.83853149414062,
"logps/rejected": -191.320556640625,
"loss": 0.5847,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6087457537651062,
"rewards/margins": 0.3239368498325348,
"rewards/rejected": -0.9326826930046082,
"step": 526
},
{
"epoch": 0.6090404440919904,
"grad_norm": 82.53815106903608,
"learning_rate": 1.6793659712575895e-07,
"logits/chosen": -1.5642480850219727,
"logits/rejected": -1.4599685668945312,
"logps/chosen": -215.29837036132812,
"logps/rejected": -199.14767456054688,
"loss": 0.5928,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5695382356643677,
"rewards/margins": 0.271673321723938,
"rewards/rejected": -0.8412115573883057,
"step": 528
},
{
"epoch": 0.6113474154711268,
"grad_norm": 86.53571512694035,
"learning_rate": 1.676535821398537e-07,
"logits/chosen": -1.3208836317062378,
"logits/rejected": -1.3146097660064697,
"logps/chosen": -189.41128540039062,
"logps/rejected": -232.5477294921875,
"loss": 0.6013,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.654186487197876,
"rewards/margins": 0.4602148234844208,
"rewards/rejected": -1.1144013404846191,
"step": 530
},
{
"epoch": 0.6136543868502632,
"grad_norm": 70.64851504723866,
"learning_rate": 1.6736956436465573e-07,
"logits/chosen": -1.3590030670166016,
"logits/rejected": -1.4608113765716553,
"logps/chosen": -148.809326171875,
"logps/rejected": -203.59759521484375,
"loss": 0.5861,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.496415913105011,
"rewards/margins": 0.31767329573631287,
"rewards/rejected": -0.814089298248291,
"step": 532
},
{
"epoch": 0.6159613582293995,
"grad_norm": 73.57136513502368,
"learning_rate": 1.6708454800999366e-07,
"logits/chosen": -1.4504910707473755,
"logits/rejected": -1.4983229637145996,
"logps/chosen": -166.2091522216797,
"logps/rejected": -206.8488311767578,
"loss": 0.6153,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.49555644392967224,
"rewards/margins": 0.3523869812488556,
"rewards/rejected": -0.8479433655738831,
"step": 534
},
{
"epoch": 0.6182683296085358,
"grad_norm": 67.83021038753246,
"learning_rate": 1.667985373004974e-07,
"logits/chosen": -1.4747323989868164,
"logits/rejected": -1.3922568559646606,
"logps/chosen": -159.47254943847656,
"logps/rejected": -177.21884155273438,
"loss": 0.5691,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.2918567657470703,
"rewards/margins": 0.5216075778007507,
"rewards/rejected": -0.8134642839431763,
"step": 536
},
{
"epoch": 0.6205753009876721,
"grad_norm": 75.55693314924734,
"learning_rate": 1.6651153647553567e-07,
"logits/chosen": -1.6021491289138794,
"logits/rejected": -1.6126930713653564,
"logps/chosen": -165.55172729492188,
"logps/rejected": -197.1583251953125,
"loss": 0.5986,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.505136251449585,
"rewards/margins": 0.2592867612838745,
"rewards/rejected": -0.7644230127334595,
"step": 538
},
{
"epoch": 0.6228822723668085,
"grad_norm": 74.57237448077612,
"learning_rate": 1.6622354978915304e-07,
"logits/chosen": -1.3560292720794678,
"logits/rejected": -1.4895740747451782,
"logps/chosen": -152.60386657714844,
"logps/rejected": -200.48497009277344,
"loss": 0.5976,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.450514554977417,
"rewards/margins": 0.42979568243026733,
"rewards/rejected": -0.8803102374076843,
"step": 540
},
{
"epoch": 0.6251892437459448,
"grad_norm": 76.07758708375029,
"learning_rate": 1.6593458151000687e-07,
"logits/chosen": -1.418495535850525,
"logits/rejected": -1.5285032987594604,
"logps/chosen": -174.468017578125,
"logps/rejected": -212.58534240722656,
"loss": 0.6021,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4992409944534302,
"rewards/margins": 0.357663631439209,
"rewards/rejected": -0.8569046854972839,
"step": 542
},
{
"epoch": 0.6274962151250811,
"grad_norm": 67.61668250943133,
"learning_rate": 1.6564463592130426e-07,
"logits/chosen": -1.6000475883483887,
"logits/rejected": -1.5714551210403442,
"logps/chosen": -129.46788024902344,
"logps/rejected": -137.58729553222656,
"loss": 0.6027,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4155838191509247,
"rewards/margins": 0.31966376304626465,
"rewards/rejected": -0.7352475523948669,
"step": 544
},
{
"epoch": 0.6298031865042174,
"grad_norm": 67.37831547087359,
"learning_rate": 1.6535371732073823e-07,
"logits/chosen": -1.5627467632293701,
"logits/rejected": -1.4833993911743164,
"logps/chosen": -115.5599594116211,
"logps/rejected": -121.90804290771484,
"loss": 0.5859,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.286516010761261,
"rewards/margins": 0.36314332485198975,
"rewards/rejected": -0.6496593356132507,
"step": 546
},
{
"epoch": 0.6321101578833538,
"grad_norm": 79.67037148877638,
"learning_rate": 1.650618300204242e-07,
"logits/chosen": -1.4731521606445312,
"logits/rejected": -1.5530614852905273,
"logps/chosen": -218.06552124023438,
"logps/rejected": -257.6269226074219,
"loss": 0.6104,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7696484923362732,
"rewards/margins": 0.28321802616119385,
"rewards/rejected": -1.0528665781021118,
"step": 548
},
{
"epoch": 0.63441712926249,
"grad_norm": 67.9423797863854,
"learning_rate": 1.6476897834683618e-07,
"logits/chosen": -1.4056189060211182,
"logits/rejected": -1.4078246355056763,
"logps/chosen": -147.92111206054688,
"logps/rejected": -188.60968017578125,
"loss": 0.6018,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5256268978118896,
"rewards/margins": 0.4678364396095276,
"rewards/rejected": -0.9934633374214172,
"step": 550
},
{
"epoch": 0.6367241006416264,
"grad_norm": 68.15375283996126,
"learning_rate": 1.644751666407424e-07,
"logits/chosen": -1.2929272651672363,
"logits/rejected": -1.3170608282089233,
"logps/chosen": -207.3567352294922,
"logps/rejected": -262.3974609375,
"loss": 0.5823,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7716534733772278,
"rewards/margins": 0.6446899771690369,
"rewards/rejected": -1.4163434505462646,
"step": 552
},
{
"epoch": 0.6390310720207627,
"grad_norm": 71.41650018580867,
"learning_rate": 1.6418039925714115e-07,
"logits/chosen": -1.3858839273452759,
"logits/rejected": -1.3953114748001099,
"logps/chosen": -160.35096740722656,
"logps/rejected": -186.47933959960938,
"loss": 0.5559,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5581396222114563,
"rewards/margins": 0.3457927703857422,
"rewards/rejected": -0.9039323329925537,
"step": 554
},
{
"epoch": 0.6413380433998991,
"grad_norm": 76.78836475295354,
"learning_rate": 1.6388468056519612e-07,
"logits/chosen": -1.4668548107147217,
"logits/rejected": -1.4067307710647583,
"logps/chosen": -212.10546875,
"logps/rejected": -193.7842254638672,
"loss": 0.5721,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.618504524230957,
"rewards/margins": 0.36426225304603577,
"rewards/rejected": -0.9827668070793152,
"step": 556
},
{
"epoch": 0.6436450147790354,
"grad_norm": 66.95864858123714,
"learning_rate": 1.6358801494817172e-07,
"logits/chosen": -1.4181556701660156,
"logits/rejected": -1.409440279006958,
"logps/chosen": -139.5923309326172,
"logps/rejected": -183.9441375732422,
"loss": 0.5663,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.42550671100616455,
"rewards/margins": 0.626122236251831,
"rewards/rejected": -1.0516289472579956,
"step": 558
},
{
"epoch": 0.6459519861581717,
"grad_norm": 88.18680458715171,
"learning_rate": 1.6329040680336805e-07,
"logits/chosen": -1.468677282333374,
"logits/rejected": -1.5043675899505615,
"logps/chosen": -161.72213745117188,
"logps/rejected": -206.85214233398438,
"loss": 0.572,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5167573690414429,
"rewards/margins": 0.36671191453933716,
"rewards/rejected": -0.8834693431854248,
"step": 560
},
{
"epoch": 0.648258957537308,
"grad_norm": 71.84112642036989,
"learning_rate": 1.6299186054205575e-07,
"logits/chosen": -1.5098912715911865,
"logits/rejected": -1.4657700061798096,
"logps/chosen": -177.00067138671875,
"logps/rejected": -190.06985473632812,
"loss": 0.5365,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.3948441743850708,
"rewards/margins": 0.5432202816009521,
"rewards/rejected": -0.9380643963813782,
"step": 562
},
{
"epoch": 0.6505659289164444,
"grad_norm": 77.21845596596229,
"learning_rate": 1.6269238058941067e-07,
"logits/chosen": -1.5354855060577393,
"logits/rejected": -1.4872441291809082,
"logps/chosen": -220.86279296875,
"logps/rejected": -242.259765625,
"loss": 0.6141,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.5020161867141724,
"rewards/margins": 0.3912605345249176,
"rewards/rejected": -0.8932766914367676,
"step": 564
},
{
"epoch": 0.6528729002955808,
"grad_norm": 77.14842839642075,
"learning_rate": 1.6239197138444807e-07,
"logits/chosen": -1.4313609600067139,
"logits/rejected": -1.4305431842803955,
"logps/chosen": -99.62786865234375,
"logps/rejected": -128.8907928466797,
"loss": 0.5895,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.1888483613729477,
"rewards/margins": 0.4503237307071686,
"rewards/rejected": -0.6391721367835999,
"step": 566
},
{
"epoch": 0.655179871674717,
"grad_norm": 62.79374975719681,
"learning_rate": 1.6209063737995714e-07,
"logits/chosen": -1.4637759923934937,
"logits/rejected": -1.4549309015274048,
"logps/chosen": -144.82948303222656,
"logps/rejected": -185.9346466064453,
"loss": 0.5515,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.44154876470565796,
"rewards/margins": 0.37137869000434875,
"rewards/rejected": -0.8129273653030396,
"step": 568
},
{
"epoch": 0.6574868430538534,
"grad_norm": 77.33084496555169,
"learning_rate": 1.6178838304243472e-07,
"logits/chosen": -1.491298794746399,
"logits/rejected": -1.5582300424575806,
"logps/chosen": -193.7870635986328,
"logps/rejected": -242.5855712890625,
"loss": 0.5723,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.5082133412361145,
"rewards/margins": 0.6296249628067017,
"rewards/rejected": -1.1378382444381714,
"step": 570
},
{
"epoch": 0.6597938144329897,
"grad_norm": 67.02472308421605,
"learning_rate": 1.6148521285201927e-07,
"logits/chosen": -1.4817756414413452,
"logits/rejected": -1.402366042137146,
"logps/chosen": -154.45765686035156,
"logps/rejected": -178.16561889648438,
"loss": 0.5564,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3961385488510132,
"rewards/margins": 0.5840703248977661,
"rewards/rejected": -0.9802089333534241,
"step": 572
},
{
"epoch": 0.6621007858121261,
"grad_norm": 73.0106659319347,
"learning_rate": 1.6118113130242432e-07,
"logits/chosen": -1.4550271034240723,
"logits/rejected": -1.4115763902664185,
"logps/chosen": -221.6585235595703,
"logps/rejected": -195.1796417236328,
"loss": 0.5774,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.8219617009162903,
"rewards/margins": 0.16280440986156464,
"rewards/rejected": -0.9847662448883057,
"step": 574
},
{
"epoch": 0.6644077571912623,
"grad_norm": 77.31259598468839,
"learning_rate": 1.6087614290087206e-07,
"logits/chosen": -1.4929287433624268,
"logits/rejected": -1.4764537811279297,
"logps/chosen": -230.29653930664062,
"logps/rejected": -284.22412109375,
"loss": 0.5818,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6301875114440918,
"rewards/margins": 0.7476638555526733,
"rewards/rejected": -1.3778512477874756,
"step": 576
},
{
"epoch": 0.6667147285703987,
"grad_norm": 69.04855850678052,
"learning_rate": 1.605702521680263e-07,
"logits/chosen": -1.3067015409469604,
"logits/rejected": -1.338529348373413,
"logps/chosen": -147.36080932617188,
"logps/rejected": -193.80665588378906,
"loss": 0.5757,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.6234080791473389,
"rewards/margins": 0.39194294810295105,
"rewards/rejected": -1.0153510570526123,
"step": 578
},
{
"epoch": 0.669021699949535,
"grad_norm": 81.45402825293101,
"learning_rate": 1.6026346363792565e-07,
"logits/chosen": -1.4524238109588623,
"logits/rejected": -1.3550243377685547,
"logps/chosen": -187.0885772705078,
"logps/rejected": -177.09780883789062,
"loss": 0.6058,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7711231708526611,
"rewards/margins": 0.17797166109085083,
"rewards/rejected": -0.9490947127342224,
"step": 580
},
{
"epoch": 0.6713286713286714,
"grad_norm": 65.47602685653504,
"learning_rate": 1.5995578185791616e-07,
"logits/chosen": -1.387951374053955,
"logits/rejected": -1.3309695720672607,
"logps/chosen": -158.39202880859375,
"logps/rejected": -186.85105895996094,
"loss": 0.5825,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.48583418130874634,
"rewards/margins": 0.503716230392456,
"rewards/rejected": -0.9895503520965576,
"step": 582
},
{
"epoch": 0.6736356427078076,
"grad_norm": 76.89288613284735,
"learning_rate": 1.596472113885841e-07,
"logits/chosen": -1.4493763446807861,
"logits/rejected": -1.4876127243041992,
"logps/chosen": -180.78541564941406,
"logps/rejected": -220.08172607421875,
"loss": 0.5822,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5832819938659668,
"rewards/margins": 0.494464248418808,
"rewards/rejected": -1.0777461528778076,
"step": 584
},
{
"epoch": 0.675942614086944,
"grad_norm": 82.2690699212878,
"learning_rate": 1.5933775680368822e-07,
"logits/chosen": -1.4559937715530396,
"logits/rejected": -1.5102128982543945,
"logps/chosen": -169.15960693359375,
"logps/rejected": -176.64280700683594,
"loss": 0.6272,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.5040290355682373,
"rewards/margins": 0.27444028854370117,
"rewards/rejected": -0.7784693241119385,
"step": 586
},
{
"epoch": 0.6782495854660803,
"grad_norm": 76.21062906880101,
"learning_rate": 1.5902742269009194e-07,
"logits/chosen": -1.348806381225586,
"logits/rejected": -1.293540358543396,
"logps/chosen": -135.5105438232422,
"logps/rejected": -156.5147705078125,
"loss": 0.5875,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.5231513977050781,
"rewards/margins": 0.4782097041606903,
"rewards/rejected": -1.0013611316680908,
"step": 588
},
{
"epoch": 0.6805565568452167,
"grad_norm": 75.50192821178838,
"learning_rate": 1.5871621364769553e-07,
"logits/chosen": -1.5168403387069702,
"logits/rejected": -1.4424357414245605,
"logps/chosen": -183.81605529785156,
"logps/rejected": -171.45872497558594,
"loss": 0.6035,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.7719120979309082,
"rewards/margins": 0.2601196765899658,
"rewards/rejected": -1.0320318937301636,
"step": 590
},
{
"epoch": 0.6828635282243529,
"grad_norm": 84.93892075040027,
"learning_rate": 1.5840413428936766e-07,
"logits/chosen": -1.3720101118087769,
"logits/rejected": -1.391021490097046,
"logps/chosen": -171.98031616210938,
"logps/rejected": -176.23892211914062,
"loss": 0.599,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.7516859769821167,
"rewards/margins": 0.21854539215564728,
"rewards/rejected": -0.9702314138412476,
"step": 592
},
{
"epoch": 0.6851704996034893,
"grad_norm": 66.70595859312724,
"learning_rate": 1.5809118924087733e-07,
"logits/chosen": -1.4547669887542725,
"logits/rejected": -1.430787205696106,
"logps/chosen": -177.32481384277344,
"logps/rejected": -208.61553955078125,
"loss": 0.6102,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5358410477638245,
"rewards/margins": 0.26219645142555237,
"rewards/rejected": -0.7980375289916992,
"step": 594
},
{
"epoch": 0.6874774709826256,
"grad_norm": 82.62176636567787,
"learning_rate": 1.5777738314082511e-07,
"logits/chosen": -1.4137248992919922,
"logits/rejected": -1.404469609260559,
"logps/chosen": -164.01600646972656,
"logps/rejected": -184.97645568847656,
"loss": 0.6472,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5918564200401306,
"rewards/margins": 0.21411672234535217,
"rewards/rejected": -0.8059731721878052,
"step": 596
},
{
"epoch": 0.689784442361762,
"grad_norm": 72.16505210857706,
"learning_rate": 1.5746272064057439e-07,
"logits/chosen": -1.3921738862991333,
"logits/rejected": -1.3382896184921265,
"logps/chosen": -199.48634338378906,
"logps/rejected": -226.77871704101562,
"loss": 0.5858,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.5180130004882812,
"rewards/margins": 0.4014572501182556,
"rewards/rejected": -0.9194702506065369,
"step": 598
},
{
"epoch": 0.6920914137408983,
"grad_norm": 78.66776375616931,
"learning_rate": 1.5714720640418247e-07,
"logits/chosen": -1.511127233505249,
"logits/rejected": -1.5256671905517578,
"logps/chosen": -182.10826110839844,
"logps/rejected": -198.63510131835938,
"loss": 0.618,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.6393432021141052,
"rewards/margins": 0.16456884145736694,
"rewards/rejected": -0.8039120435714722,
"step": 600
},
{
"epoch": 0.6920914137408983,
"eval_logits/chosen": -1.4086966514587402,
"eval_logits/rejected": -1.3254387378692627,
"eval_logps/chosen": -191.96621704101562,
"eval_logps/rejected": -160.9102325439453,
"eval_loss": 0.6056262850761414,
"eval_rewards/accuracies": 0.7200000286102295,
"eval_rewards/chosen": -0.6784057021141052,
"eval_rewards/margins": 0.3610783815383911,
"eval_rewards/rejected": -1.0394840240478516,
"eval_runtime": 37.022,
"eval_samples_per_second": 2.701,
"eval_steps_per_second": 0.675,
"step": 600
}
],
"logging_steps": 2,
"max_steps": 1732,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}