just1nseo's picture
Model save
a92020c verified
raw
history blame
87.9 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 1065,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"dpo_losses": 0.6931471824645996,
"epoch": 0.0,
"grad_norm": 1.6032202352154772,
"learning_rate": 4.672897196261682e-08,
"logits/chosen": -3.0016818046569824,
"logits/rejected": -2.8469698429107666,
"logps/chosen": -650.2908325195312,
"logps/rejected": -359.48583984375,
"loss": 0.6931,
"positive_losses": 0.0,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"dpo_losses": 0.6927387714385986,
"epoch": 0.03,
"grad_norm": 14.052093588804325,
"learning_rate": 4.6728971962616824e-07,
"logits/chosen": -2.9367923736572266,
"logits/rejected": -2.819260835647583,
"logps/chosen": -254.90475463867188,
"logps/rejected": -170.36068725585938,
"loss": 0.6989,
"positive_losses": 0.033258650451898575,
"rewards/accuracies": 0.5555555820465088,
"rewards/chosen": 0.0008218331495299935,
"rewards/margins": 0.0008189052459783852,
"rewards/margins_max": 0.0018548837397247553,
"rewards/margins_min": -0.00021707323321606964,
"rewards/margins_std": 0.0014650949742645025,
"rewards/rejected": 2.927754849224584e-06,
"step": 10
},
{
"dpo_losses": 0.6923267245292664,
"epoch": 0.06,
"grad_norm": 1.844492423373157,
"learning_rate": 9.345794392523365e-07,
"logits/chosen": -2.7079358100891113,
"logits/rejected": -2.7515180110931396,
"logps/chosen": -306.1308898925781,
"logps/rejected": -241.56021118164062,
"loss": 0.6932,
"positive_losses": 0.012112426571547985,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.006594317965209484,
"rewards/margins": 0.0016433143755421042,
"rewards/margins_max": 0.002823440358042717,
"rewards/margins_min": 0.00046318816021084785,
"rewards/margins_std": 0.001668950542807579,
"rewards/rejected": 0.004951003938913345,
"step": 20
},
{
"dpo_losses": 0.6899991631507874,
"epoch": 0.08,
"grad_norm": 2.1897418931727595,
"learning_rate": 1.4018691588785047e-06,
"logits/chosen": -2.904411792755127,
"logits/rejected": -2.816619396209717,
"logps/chosen": -358.5197448730469,
"logps/rejected": -251.15103149414062,
"loss": 0.6896,
"positive_losses": 0.0,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.02147643454372883,
"rewards/margins": 0.006321606691926718,
"rewards/margins_max": 0.01196110900491476,
"rewards/margins_min": 0.000682103622239083,
"rewards/margins_std": 0.007975460961461067,
"rewards/rejected": 0.015154826454818249,
"step": 30
},
{
"dpo_losses": 0.6862105131149292,
"epoch": 0.11,
"grad_norm": 1.7300257406359418,
"learning_rate": 1.869158878504673e-06,
"logits/chosen": -2.8441336154937744,
"logits/rejected": -2.7715249061584473,
"logps/chosen": -327.30523681640625,
"logps/rejected": -313.1446228027344,
"loss": 0.6864,
"positive_losses": 0.02085266076028347,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.03371895104646683,
"rewards/margins": 0.013976506888866425,
"rewards/margins_max": 0.017818700522184372,
"rewards/margins_min": 0.010134311392903328,
"rewards/margins_std": 0.0054336837492883205,
"rewards/rejected": 0.019742444157600403,
"step": 40
},
{
"dpo_losses": 0.6820067167282104,
"epoch": 0.14,
"grad_norm": 9.347589322785899,
"learning_rate": 2.3364485981308413e-06,
"logits/chosen": -2.795854091644287,
"logits/rejected": -2.720963954925537,
"logps/chosen": -217.7622833251953,
"logps/rejected": -171.39205932617188,
"loss": 0.6779,
"positive_losses": 0.0,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.0513346791267395,
"rewards/margins": 0.022588390856981277,
"rewards/margins_max": 0.03625725582242012,
"rewards/margins_min": 0.008919527754187584,
"rewards/margins_std": 0.019330691546201706,
"rewards/rejected": 0.028746291995048523,
"step": 50
},
{
"dpo_losses": 0.6643597483634949,
"epoch": 0.17,
"grad_norm": 2.37274745731943,
"learning_rate": 2.8037383177570094e-06,
"logits/chosen": -2.7788054943084717,
"logits/rejected": -2.710609197616577,
"logps/chosen": -256.30633544921875,
"logps/rejected": -233.06576538085938,
"loss": 0.6666,
"positive_losses": 0.0,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.08375100791454315,
"rewards/margins": 0.05946110561490059,
"rewards/margins_max": 0.08825884014368057,
"rewards/margins_min": 0.030663389712572098,
"rewards/margins_std": 0.040726132690906525,
"rewards/rejected": 0.024289902299642563,
"step": 60
},
{
"dpo_losses": 0.6566643714904785,
"epoch": 0.2,
"grad_norm": 1.6634540430479345,
"learning_rate": 3.2710280373831774e-06,
"logits/chosen": -2.635437488555908,
"logits/rejected": -2.678208351135254,
"logps/chosen": -283.38287353515625,
"logps/rejected": -209.6460418701172,
"loss": 0.6558,
"positive_losses": 0.0,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.11453696340322495,
"rewards/margins": 0.07607836276292801,
"rewards/margins_max": 0.13534289598464966,
"rewards/margins_min": 0.01681383326649666,
"rewards/margins_std": 0.08381269872188568,
"rewards/rejected": 0.03845860809087753,
"step": 70
},
{
"dpo_losses": 0.6309095025062561,
"epoch": 0.23,
"grad_norm": 1.7989959804157094,
"learning_rate": 3.738317757009346e-06,
"logits/chosen": -2.9159035682678223,
"logits/rejected": -2.8235018253326416,
"logps/chosen": -335.8651123046875,
"logps/rejected": -286.46331787109375,
"loss": 0.6397,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13743606209754944,
"rewards/margins": 0.1305120289325714,
"rewards/margins_max": 0.18503351509571075,
"rewards/margins_min": 0.07599054276943207,
"rewards/margins_std": 0.07710503041744232,
"rewards/rejected": 0.0069240378215909,
"step": 80
},
{
"dpo_losses": 0.6185696721076965,
"epoch": 0.25,
"grad_norm": 9.307634759665634,
"learning_rate": 4.205607476635514e-06,
"logits/chosen": -2.6819961071014404,
"logits/rejected": -2.7166359424591064,
"logps/chosen": -211.7088623046875,
"logps/rejected": -203.97885131835938,
"loss": 0.6145,
"positive_losses": 0.035182952880859375,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.15242353081703186,
"rewards/margins": 0.16033907234668732,
"rewards/margins_max": 0.24873778223991394,
"rewards/margins_min": 0.07194037735462189,
"rewards/margins_std": 0.12501463294029236,
"rewards/rejected": -0.007915569469332695,
"step": 90
},
{
"dpo_losses": 0.6138414144515991,
"epoch": 0.28,
"grad_norm": 2.169680467803253,
"learning_rate": 4.6728971962616825e-06,
"logits/chosen": -2.783569812774658,
"logits/rejected": -2.812309741973877,
"logps/chosen": -288.1591796875,
"logps/rejected": -341.5180969238281,
"loss": 0.6275,
"positive_losses": 0.8350906372070312,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.1585826575756073,
"rewards/margins": 0.1714317500591278,
"rewards/margins_max": 0.2515793442726135,
"rewards/margins_min": 0.09128417074680328,
"rewards/margins_std": 0.11334581673145294,
"rewards/rejected": -0.012849109247326851,
"step": 100
},
{
"epoch": 0.28,
"eval_dpo_losses": 0.6742300391197205,
"eval_logits/chosen": -2.7527217864990234,
"eval_logits/rejected": -2.71140456199646,
"eval_logps/chosen": -276.58984375,
"eval_logps/rejected": -254.9810333251953,
"eval_loss": 0.8539575338363647,
"eval_positive_losses": 1.6940749883651733,
"eval_rewards/accuracies": 0.60317462682724,
"eval_rewards/chosen": 0.08631354570388794,
"eval_rewards/margins": 0.04429732263088226,
"eval_rewards/margins_max": 0.21467885375022888,
"eval_rewards/margins_min": -0.10308819264173508,
"eval_rewards/margins_std": 0.14203837513923645,
"eval_rewards/rejected": 0.042016226798295975,
"eval_runtime": 285.3929,
"eval_samples_per_second": 7.008,
"eval_steps_per_second": 0.221,
"step": 100
},
{
"dpo_losses": 0.5535503029823303,
"epoch": 0.31,
"grad_norm": 2.2536984881767905,
"learning_rate": 4.999879018839288e-06,
"logits/chosen": -2.7111623287200928,
"logits/rejected": -2.6175503730773926,
"logps/chosen": -252.84732055664062,
"logps/rejected": -252.4491729736328,
"loss": 0.5736,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2623223662376404,
"rewards/margins": 0.32247892022132874,
"rewards/margins_max": 0.4974708557128906,
"rewards/margins_min": 0.14748699963092804,
"rewards/margins_std": 0.24747595191001892,
"rewards/rejected": -0.060156505554914474,
"step": 110
},
{
"dpo_losses": 0.5708788633346558,
"epoch": 0.34,
"grad_norm": 1.8718792057149318,
"learning_rate": 4.99772856836941e-06,
"logits/chosen": -2.873108148574829,
"logits/rejected": -2.8189544677734375,
"logps/chosen": -373.77386474609375,
"logps/rejected": -337.38922119140625,
"loss": 0.5727,
"positive_losses": 0.22691193222999573,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.21501663327217102,
"rewards/margins": 0.27264389395713806,
"rewards/margins_max": 0.36815184354782104,
"rewards/margins_min": 0.17713597416877747,
"rewards/margins_std": 0.13506858050823212,
"rewards/rejected": -0.057627253234386444,
"step": 120
},
{
"dpo_losses": 0.5159657001495361,
"epoch": 0.37,
"grad_norm": 1.9587224479056975,
"learning_rate": 4.992892309373227e-06,
"logits/chosen": -2.7587242126464844,
"logits/rejected": -2.689577341079712,
"logps/chosen": -311.52978515625,
"logps/rejected": -274.8511047363281,
"loss": 0.5718,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24945905804634094,
"rewards/margins": 0.41450828313827515,
"rewards/margins_max": 0.5637356638908386,
"rewards/margins_min": 0.26528093218803406,
"rewards/margins_std": 0.21103934943675995,
"rewards/rejected": -0.1650492250919342,
"step": 130
},
{
"dpo_losses": 0.5120642185211182,
"epoch": 0.39,
"grad_norm": 35.2981995380076,
"learning_rate": 4.985375442281969e-06,
"logits/chosen": -2.725268602371216,
"logits/rejected": -2.7174267768859863,
"logps/chosen": -270.7826843261719,
"logps/rejected": -248.8843536376953,
"loss": 0.5953,
"positive_losses": 0.0,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.25090181827545166,
"rewards/margins": 0.42299699783325195,
"rewards/margins_max": 0.5964112877845764,
"rewards/margins_min": 0.24958273768424988,
"rewards/margins_std": 0.2452448159456253,
"rewards/rejected": -0.1720951795578003,
"step": 140
},
{
"dpo_losses": 0.4928362965583801,
"epoch": 0.42,
"grad_norm": 11.90624935921094,
"learning_rate": 4.9751860499858175e-06,
"logits/chosen": -2.72652530670166,
"logits/rejected": -2.7453625202178955,
"logps/chosen": -301.97021484375,
"logps/rejected": -276.3653259277344,
"loss": 0.5758,
"positive_losses": 0.42821502685546875,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.19235308468341827,
"rewards/margins": 0.47807592153549194,
"rewards/margins_max": 0.6831300854682922,
"rewards/margins_min": 0.2730218172073364,
"rewards/margins_std": 0.2899903357028961,
"rewards/rejected": -0.2857228219509125,
"step": 150
},
{
"dpo_losses": 0.4960567355155945,
"epoch": 0.45,
"grad_norm": 15.847210447883002,
"learning_rate": 4.962335089142376e-06,
"logits/chosen": -2.81313157081604,
"logits/rejected": -2.735961675643921,
"logps/chosen": -244.3223419189453,
"logps/rejected": -264.59417724609375,
"loss": 0.559,
"positive_losses": 0.0,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.25210094451904297,
"rewards/margins": 0.46666574478149414,
"rewards/margins_max": 0.6729411482810974,
"rewards/margins_min": 0.26039019227027893,
"rewards/margins_std": 0.2917175889015198,
"rewards/rejected": -0.21456477046012878,
"step": 160
},
{
"dpo_losses": 0.4443618357181549,
"epoch": 0.48,
"grad_norm": 2.921537165567133,
"learning_rate": 4.946836378394967e-06,
"logits/chosen": -2.8487417697906494,
"logits/rejected": -2.7233359813690186,
"logps/chosen": -293.14263916015625,
"logps/rejected": -265.21044921875,
"loss": 0.4792,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.32291096448898315,
"rewards/margins": 0.6394142508506775,
"rewards/margins_max": 0.8530386686325073,
"rewards/margins_min": 0.42578983306884766,
"rewards/margins_std": 0.30211058259010315,
"rewards/rejected": -0.3165033161640167,
"step": 170
},
{
"dpo_losses": 0.4956347942352295,
"epoch": 0.51,
"grad_norm": 23.196576832752985,
"learning_rate": 4.928706583513441e-06,
"logits/chosen": -2.7180655002593994,
"logits/rejected": -2.674361228942871,
"logps/chosen": -249.37704467773438,
"logps/rejected": -410.07391357421875,
"loss": 0.5836,
"positive_losses": 1.074639916419983,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.17418113350868225,
"rewards/margins": 0.47780531644821167,
"rewards/margins_max": 0.6538316011428833,
"rewards/margins_min": 0.3017791211605072,
"rewards/margins_std": 0.24893875420093536,
"rewards/rejected": -0.3036242425441742,
"step": 180
},
{
"dpo_losses": 0.4040610194206238,
"epoch": 0.54,
"grad_norm": 2.662026262000448,
"learning_rate": 4.907965199473471e-06,
"logits/chosen": -2.6723411083221436,
"logits/rejected": -2.5524985790252686,
"logps/chosen": -320.6319274902344,
"logps/rejected": -257.9935302734375,
"loss": 0.5582,
"positive_losses": 1.0019195079803467,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.3839383125305176,
"rewards/margins": 0.7386760711669922,
"rewards/margins_max": 0.9635330438613892,
"rewards/margins_min": 0.51381915807724,
"rewards/margins_std": 0.31799572706222534,
"rewards/rejected": -0.3547378182411194,
"step": 190
},
{
"dpo_losses": 0.4472725987434387,
"epoch": 0.56,
"grad_norm": 23.173407948282012,
"learning_rate": 4.884634529493591e-06,
"logits/chosen": -2.8709282875061035,
"logits/rejected": -2.7968573570251465,
"logps/chosen": -255.41879272460938,
"logps/rejected": -237.76406860351562,
"loss": 0.599,
"positive_losses": 0.0,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.2910212576389313,
"rewards/margins": 0.6229863166809082,
"rewards/margins_max": 0.8043031692504883,
"rewards/margins_min": 0.4416695535182953,
"rewards/margins_std": 0.25642070174217224,
"rewards/rejected": -0.3319651484489441,
"step": 200
},
{
"epoch": 0.56,
"eval_dpo_losses": 0.6560041308403015,
"eval_logits/chosen": -2.7841696739196777,
"eval_logits/rejected": -2.738633632659912,
"eval_logps/chosen": -291.0660095214844,
"eval_logps/rejected": -275.99658203125,
"eval_loss": 1.9206839799880981,
"eval_positive_losses": 12.58076000213623,
"eval_rewards/accuracies": 0.6388888955116272,
"eval_rewards/chosen": -0.05844784155488014,
"eval_rewards/margins": 0.10969138890504837,
"eval_rewards/margins_max": 0.4903210401535034,
"eval_rewards/margins_min": -0.25554272532463074,
"eval_rewards/margins_std": 0.33160677552223206,
"eval_rewards/rejected": -0.1681392341852188,
"eval_runtime": 284.4185,
"eval_samples_per_second": 7.032,
"eval_steps_per_second": 0.222,
"step": 200
},
{
"dpo_losses": 0.45512253046035767,
"epoch": 0.59,
"grad_norm": 7.436654691984327,
"learning_rate": 4.858739661052539e-06,
"logits/chosen": -2.5205092430114746,
"logits/rejected": -2.5804672241210938,
"logps/chosen": -240.2886962890625,
"logps/rejected": -298.3849182128906,
"loss": 0.5267,
"positive_losses": 0.4501487612724304,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.38874852657318115,
"rewards/margins": 0.6847165822982788,
"rewards/margins_max": 1.0578687191009521,
"rewards/margins_min": 0.31156447529792786,
"rewards/margins_std": 0.5277167558670044,
"rewards/rejected": -0.29596805572509766,
"step": 210
},
{
"dpo_losses": 0.4267025589942932,
"epoch": 0.62,
"grad_norm": 12.252545150739026,
"learning_rate": 4.830308438912687e-06,
"logits/chosen": -2.901047945022583,
"logits/rejected": -2.776557207107544,
"logps/chosen": -341.5310363769531,
"logps/rejected": -316.5777282714844,
"loss": 0.5828,
"positive_losses": 1.5781867504119873,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.28965142369270325,
"rewards/margins": 0.6728664040565491,
"rewards/margins_max": 0.8954153060913086,
"rewards/margins_min": 0.4503174424171448,
"rewards/margins_std": 0.3147316873073578,
"rewards/rejected": -0.38321495056152344,
"step": 220
},
{
"dpo_losses": 0.42703738808631897,
"epoch": 0.65,
"grad_norm": 2.1766995421765545,
"learning_rate": 4.799371435178544e-06,
"logits/chosen": -2.821802854537964,
"logits/rejected": -2.777765989303589,
"logps/chosen": -321.39501953125,
"logps/rejected": -376.64483642578125,
"loss": 0.5028,
"positive_losses": 1.304276466369629,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30395936965942383,
"rewards/margins": 0.7244865298271179,
"rewards/margins_max": 1.1590335369110107,
"rewards/margins_min": 0.2899397909641266,
"rewards/margins_std": 0.6145419478416443,
"rewards/rejected": -0.42052727937698364,
"step": 230
},
{
"dpo_losses": 0.4363502860069275,
"epoch": 0.68,
"grad_norm": 13.650828107929078,
"learning_rate": 4.765961916422575e-06,
"logits/chosen": -2.7546634674072266,
"logits/rejected": -2.707695722579956,
"logps/chosen": -219.1737518310547,
"logps/rejected": -330.49444580078125,
"loss": 0.5883,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29220908880233765,
"rewards/margins": 0.6587773561477661,
"rewards/margins_max": 0.8973654508590698,
"rewards/margins_min": 0.42018923163414,
"rewards/margins_std": 0.33741456270217896,
"rewards/rejected": -0.36656829714775085,
"step": 240
},
{
"dpo_losses": 0.40715283155441284,
"epoch": 0.7,
"grad_norm": 5.403626615804181,
"learning_rate": 4.730115807913627e-06,
"logits/chosen": -2.786029577255249,
"logits/rejected": -2.656646490097046,
"logps/chosen": -316.26605224609375,
"logps/rejected": -292.4571838378906,
"loss": 0.4798,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.36320579051971436,
"rewards/margins": 0.7509908676147461,
"rewards/margins_max": 0.927462100982666,
"rewards/margins_min": 0.5745195150375366,
"rewards/margins_std": 0.24956803023815155,
"rewards/rejected": -0.3877849876880646,
"step": 250
},
{
"dpo_losses": 0.45335307717323303,
"epoch": 0.73,
"grad_norm": 29.921037643309493,
"learning_rate": 4.691871654986485e-06,
"logits/chosen": -2.8433797359466553,
"logits/rejected": -2.7910611629486084,
"logps/chosen": -240.71328735351562,
"logps/rejected": -260.13897705078125,
"loss": 0.5549,
"positive_losses": 1.8418042659759521,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.2628856301307678,
"rewards/margins": 0.6381598711013794,
"rewards/margins_max": 0.8531384468078613,
"rewards/margins_min": 0.42318135499954224,
"rewards/margins_std": 0.3040255904197693,
"rewards/rejected": -0.37527427077293396,
"step": 260
},
{
"dpo_losses": 0.45805755257606506,
"epoch": 0.76,
"grad_norm": 3.6869955202700884,
"learning_rate": 4.651270581594054e-06,
"logits/chosen": -2.8275113105773926,
"logits/rejected": -2.726349353790283,
"logps/chosen": -264.3140869140625,
"logps/rejected": -256.37506103515625,
"loss": 0.5553,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.38106483221054077,
"rewards/margins": 0.6026363372802734,
"rewards/margins_max": 0.8345470428466797,
"rewards/margins_min": 0.37072569131851196,
"rewards/margins_std": 0.3279712498188019,
"rewards/rejected": -0.22157149016857147,
"step": 270
},
{
"dpo_losses": 0.46088677644729614,
"epoch": 0.79,
"grad_norm": 11.384071170544201,
"learning_rate": 4.6083562460867545e-06,
"logits/chosen": -2.7374978065490723,
"logits/rejected": -2.705930233001709,
"logps/chosen": -292.6180114746094,
"logps/rejected": -295.0760803222656,
"loss": 0.6126,
"positive_losses": 1.0429108142852783,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.2991539239883423,
"rewards/margins": 0.611792802810669,
"rewards/margins_max": 0.8936999440193176,
"rewards/margins_min": 0.32988566160202026,
"rewards/margins_std": 0.39867693185806274,
"rewards/rejected": -0.3126388192176819,
"step": 280
},
{
"dpo_losses": 0.5089690685272217,
"epoch": 0.82,
"grad_norm": 2.7173946115224865,
"learning_rate": 4.563174794266684e-06,
"logits/chosen": -2.875331163406372,
"logits/rejected": -2.819256544113159,
"logps/chosen": -263.9188232421875,
"logps/rejected": -286.82647705078125,
"loss": 0.593,
"positive_losses": 1.377386450767517,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.2715073823928833,
"rewards/margins": 0.4865007996559143,
"rewards/margins_max": 0.7703573703765869,
"rewards/margins_min": 0.20264430344104767,
"rewards/margins_std": 0.40143370628356934,
"rewards/rejected": -0.2149934470653534,
"step": 290
},
{
"dpo_losses": 0.44498148560523987,
"epoch": 0.85,
"grad_norm": 8.551683262439843,
"learning_rate": 4.5157748097670125e-06,
"logits/chosen": -2.9059486389160156,
"logits/rejected": -2.793186902999878,
"logps/chosen": -319.2405090332031,
"logps/rejected": -338.54998779296875,
"loss": 0.4901,
"positive_losses": 0.0022247314918786287,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.35988515615463257,
"rewards/margins": 0.6180437803268433,
"rewards/margins_max": 0.8190226554870605,
"rewards/margins_min": 0.41706475615501404,
"rewards/margins_std": 0.2842271625995636,
"rewards/rejected": -0.2581585943698883,
"step": 300
},
{
"epoch": 0.85,
"eval_dpo_losses": 0.6506758332252502,
"eval_logits/chosen": -2.7854835987091064,
"eval_logits/rejected": -2.7329776287078857,
"eval_logps/chosen": -303.7291564941406,
"eval_logps/rejected": -289.5481872558594,
"eval_loss": 2.8066518306732178,
"eval_positive_losses": 22.214069366455078,
"eval_rewards/accuracies": 0.6388888955116272,
"eval_rewards/chosen": -0.18507955968379974,
"eval_rewards/margins": 0.11857547610998154,
"eval_rewards/margins_max": 0.47240880131721497,
"eval_rewards/margins_min": -0.25752344727516174,
"eval_rewards/margins_std": 0.32571399211883545,
"eval_rewards/rejected": -0.3036550283432007,
"eval_runtime": 284.7873,
"eval_samples_per_second": 7.023,
"eval_steps_per_second": 0.221,
"step": 300
},
{
"dpo_losses": 0.43647676706314087,
"epoch": 0.87,
"grad_norm": 2.6162448381062275,
"learning_rate": 4.466207261809989e-06,
"logits/chosen": -2.9903199672698975,
"logits/rejected": -2.7902731895446777,
"logps/chosen": -293.12274169921875,
"logps/rejected": -296.422119140625,
"loss": 0.6852,
"positive_losses": 0.944580078125,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.31041496992111206,
"rewards/margins": 0.6650521755218506,
"rewards/margins_max": 0.992100715637207,
"rewards/margins_min": 0.338003545999527,
"rewards/margins_std": 0.4625166058540344,
"rewards/rejected": -0.35463717579841614,
"step": 310
},
{
"dpo_losses": 0.4618608057498932,
"epoch": 0.9,
"grad_norm": 10.483471531606499,
"learning_rate": 4.414525450399713e-06,
"logits/chosen": -2.8283543586730957,
"logits/rejected": -2.7349746227264404,
"logps/chosen": -286.9427185058594,
"logps/rejected": -262.766845703125,
"loss": 0.527,
"positive_losses": 0.8719180822372437,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.32706111669540405,
"rewards/margins": 0.5900775790214539,
"rewards/margins_max": 0.8372209668159485,
"rewards/margins_min": 0.34293434023857117,
"rewards/margins_std": 0.34951338171958923,
"rewards/rejected": -0.2630165219306946,
"step": 320
},
{
"dpo_losses": 0.40510478615760803,
"epoch": 0.93,
"grad_norm": 20.600609246290738,
"learning_rate": 4.360784949008615e-06,
"logits/chosen": -2.9669108390808105,
"logits/rejected": -2.8032517433166504,
"logps/chosen": -316.91192626953125,
"logps/rejected": -283.3198547363281,
"loss": 0.515,
"positive_losses": 0.5270363092422485,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.45585203170776367,
"rewards/margins": 0.8136453628540039,
"rewards/margins_max": 1.1761964559555054,
"rewards/margins_min": 0.45109423995018005,
"rewards/margins_std": 0.512724757194519,
"rewards/rejected": -0.35779333114624023,
"step": 330
},
{
"dpo_losses": 0.47730112075805664,
"epoch": 0.96,
"grad_norm": 2.164730368336074,
"learning_rate": 4.30504354481929e-06,
"logits/chosen": -2.79738450050354,
"logits/rejected": -2.7073657512664795,
"logps/chosen": -230.3443145751953,
"logps/rejected": -234.2275390625,
"loss": 0.496,
"positive_losses": 2.5491890907287598,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.2501987814903259,
"rewards/margins": 0.5649263858795166,
"rewards/margins_max": 0.820625901222229,
"rewards/margins_min": 0.3092268109321594,
"rewards/margins_std": 0.36161375045776367,
"rewards/rejected": -0.3147276043891907,
"step": 340
},
{
"dpo_losses": 0.3645946681499481,
"epoch": 0.99,
"grad_norm": 19.210785792660445,
"learning_rate": 4.247361176585904e-06,
"logits/chosen": -2.791806697845459,
"logits/rejected": -2.676161289215088,
"logps/chosen": -352.7079162597656,
"logps/rejected": -353.04425048828125,
"loss": 0.584,
"positive_losses": 1.5420730113983154,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3225085139274597,
"rewards/margins": 0.8690497279167175,
"rewards/margins_max": 1.1524405479431152,
"rewards/margins_min": 0.5856587886810303,
"rewards/margins_std": 0.4007752537727356,
"rewards/rejected": -0.546541154384613,
"step": 350
},
{
"dpo_losses": 0.44031819701194763,
"epoch": 1.01,
"grad_norm": 3.4688322040336876,
"learning_rate": 4.187799870182038e-06,
"logits/chosen": -2.756261110305786,
"logits/rejected": -2.6450822353363037,
"logps/chosen": -273.16424560546875,
"logps/rejected": -231.5010986328125,
"loss": 0.4573,
"positive_losses": 0.19403228163719177,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.35820913314819336,
"rewards/margins": 0.6459983587265015,
"rewards/margins_max": 0.8495124578475952,
"rewards/margins_min": 0.4424843192100525,
"rewards/margins_std": 0.28781232237815857,
"rewards/rejected": -0.2877892851829529,
"step": 360
},
{
"dpo_losses": 0.3558691143989563,
"epoch": 1.04,
"grad_norm": 71.7335292506231,
"learning_rate": 4.1264236719042365e-06,
"logits/chosen": -2.6822152137756348,
"logits/rejected": -2.662559986114502,
"logps/chosen": -320.59442138671875,
"logps/rejected": -317.09295654296875,
"loss": 0.4251,
"positive_losses": 0.42721253633499146,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.45041507482528687,
"rewards/margins": 0.9831393957138062,
"rewards/margins_max": 1.3708398342132568,
"rewards/margins_min": 0.5954390168190002,
"rewards/margins_std": 0.5482910871505737,
"rewards/rejected": -0.5327242612838745,
"step": 370
},
{
"dpo_losses": 0.3393256664276123,
"epoch": 1.07,
"grad_norm": 4.337233890868313,
"learning_rate": 4.063298579603001e-06,
"logits/chosen": -2.7261626720428467,
"logits/rejected": -2.5453438758850098,
"logps/chosen": -265.1933288574219,
"logps/rejected": -244.08682250976562,
"loss": 0.3984,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4042983055114746,
"rewards/margins": 0.999946117401123,
"rewards/margins_max": 1.2659950256347656,
"rewards/margins_min": 0.7338972091674805,
"rewards/margins_std": 0.3762499690055847,
"rewards/rejected": -0.5956477522850037,
"step": 380
},
{
"dpo_losses": 0.25663647055625916,
"epoch": 1.1,
"grad_norm": 35.28065142871338,
"learning_rate": 3.998492471715272e-06,
"logits/chosen": -2.7409512996673584,
"logits/rejected": -2.752206325531006,
"logps/chosen": -314.38153076171875,
"logps/rejected": -423.4803161621094,
"loss": 0.5701,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4437999129295349,
"rewards/margins": 1.3619416952133179,
"rewards/margins_max": 1.7605613470077515,
"rewards/margins_min": 0.9633218050003052,
"rewards/margins_std": 0.5637335181236267,
"rewards/rejected": -0.9181416630744934,
"step": 390
},
{
"dpo_losses": 0.3513553738594055,
"epoch": 1.13,
"grad_norm": 4.052912034886079,
"learning_rate": 3.932075034274723e-06,
"logits/chosen": -2.73002552986145,
"logits/rejected": -2.6879513263702393,
"logps/chosen": -205.73922729492188,
"logps/rejected": -290.40057373046875,
"loss": 0.4414,
"positive_losses": 0.4541704058647156,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3537348806858063,
"rewards/margins": 0.9853051900863647,
"rewards/margins_max": 1.2278480529785156,
"rewards/margins_min": 0.7427625060081482,
"rewards/margins_std": 0.34300726652145386,
"rewards/rejected": -0.6315703988075256,
"step": 400
},
{
"epoch": 1.13,
"eval_dpo_losses": 0.6385828852653503,
"eval_logits/chosen": -2.71909236907959,
"eval_logits/rejected": -2.670318365097046,
"eval_logps/chosen": -299.07989501953125,
"eval_logps/rejected": -291.5615539550781,
"eval_loss": 2.6622352600097656,
"eval_positive_losses": 20.927839279174805,
"eval_rewards/accuracies": 0.6746031641960144,
"eval_rewards/chosen": -0.13858698308467865,
"eval_rewards/margins": 0.18520160019397736,
"eval_rewards/margins_max": 0.6970763802528381,
"eval_rewards/margins_min": -0.37488874793052673,
"eval_rewards/margins_std": 0.4832788407802582,
"eval_rewards/rejected": -0.323788583278656,
"eval_runtime": 283.8974,
"eval_samples_per_second": 7.045,
"eval_steps_per_second": 0.222,
"step": 400
},
{
"dpo_losses": 0.38113099336624146,
"epoch": 1.15,
"grad_norm": 1.9724463020625589,
"learning_rate": 3.864117685978339e-06,
"logits/chosen": -2.816284656524658,
"logits/rejected": -2.7134735584259033,
"logps/chosen": -242.77761840820312,
"logps/rejected": -272.8990173339844,
"loss": 0.4468,
"positive_losses": 4.795651912689209,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.2276880443096161,
"rewards/margins": 0.8958386182785034,
"rewards/margins_max": 1.3816872835159302,
"rewards/margins_min": 0.40999001264572144,
"rewards/margins_std": 0.6870937943458557,
"rewards/rejected": -0.6681506037712097,
"step": 410
},
{
"dpo_losses": 0.33210596442222595,
"epoch": 1.18,
"grad_norm": 3.4285412656501766,
"learning_rate": 3.794693501389861e-06,
"logits/chosen": -2.8275389671325684,
"logits/rejected": -2.7307045459747314,
"logps/chosen": -293.709716796875,
"logps/rejected": -331.89312744140625,
"loss": 0.4087,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4260416030883789,
"rewards/margins": 1.108737587928772,
"rewards/margins_max": 1.603941559791565,
"rewards/margins_min": 0.613533616065979,
"rewards/margins_std": 0.7003240585327148,
"rewards/rejected": -0.6826959848403931,
"step": 420
},
{
"dpo_losses": 0.34472885727882385,
"epoch": 1.21,
"grad_norm": 2.846847523997402,
"learning_rate": 3.7238771323626822e-06,
"logits/chosen": -2.7846486568450928,
"logits/rejected": -2.6524085998535156,
"logps/chosen": -342.40692138671875,
"logps/rejected": -332.17010498046875,
"loss": 0.5622,
"positive_losses": 4.125036239624023,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.3678087592124939,
"rewards/margins": 1.078407883644104,
"rewards/margins_max": 1.4884113073349,
"rewards/margins_min": 0.6684045195579529,
"rewards/margins_std": 0.5798323154449463,
"rewards/rejected": -0.7105990648269653,
"step": 430
},
{
"dpo_losses": 0.32919952273368835,
"epoch": 1.24,
"grad_norm": 109.42335538231772,
"learning_rate": 3.651744727766676e-06,
"logits/chosen": -2.7272467613220215,
"logits/rejected": -2.66713285446167,
"logps/chosen": -210.4514617919922,
"logps/rejected": -259.1316833496094,
"loss": 0.4028,
"positive_losses": 0.3457130491733551,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.36259937286376953,
"rewards/margins": 1.016867995262146,
"rewards/margins_max": 1.3856614828109741,
"rewards/margins_min": 0.6480745077133179,
"rewards/margins_std": 0.5215528607368469,
"rewards/rejected": -0.6542686223983765,
"step": 440
},
{
"dpo_losses": 0.31118613481521606,
"epoch": 1.27,
"grad_norm": 38.9440234553471,
"learning_rate": 3.57837385160529e-06,
"logits/chosen": -2.659485340118408,
"logits/rejected": -2.6188011169433594,
"logps/chosen": -273.7745361328125,
"logps/rejected": -349.734619140625,
"loss": 0.4823,
"positive_losses": 2.6830811500549316,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.3455764055252075,
"rewards/margins": 1.166411280632019,
"rewards/margins_max": 1.5491106510162354,
"rewards/margins_min": 0.7837120890617371,
"rewards/margins_std": 0.5412184596061707,
"rewards/rejected": -0.8208349347114563,
"step": 450
},
{
"dpo_losses": 0.24196143448352814,
"epoch": 1.3,
"grad_norm": 2.8256101438878116,
"learning_rate": 3.503843399610941e-06,
"logits/chosen": -2.6595611572265625,
"logits/rejected": -2.6660475730895996,
"logps/chosen": -322.4607849121094,
"logps/rejected": -492.70068359375,
"loss": 0.4169,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5088067650794983,
"rewards/margins": 1.4272974729537964,
"rewards/margins_max": 1.8303813934326172,
"rewards/margins_min": 1.0242136716842651,
"rewards/margins_std": 0.5700467824935913,
"rewards/rejected": -0.9184908866882324,
"step": 460
},
{
"dpo_losses": 0.32115817070007324,
"epoch": 1.32,
"grad_norm": 5.028579287398997,
"learning_rate": 3.4282335144083985e-06,
"logits/chosen": -2.567282199859619,
"logits/rejected": -2.616426706314087,
"logps/chosen": -219.5450439453125,
"logps/rejected": -303.61566162109375,
"loss": 0.537,
"positive_losses": 2.556870937347412,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.25023719668388367,
"rewards/margins": 1.0798122882843018,
"rewards/margins_max": 1.3413639068603516,
"rewards/margins_min": 0.8182605504989624,
"rewards/margins_std": 0.36988988518714905,
"rewards/rejected": -0.8295750617980957,
"step": 470
},
{
"dpo_losses": 0.2937307357788086,
"epoch": 1.35,
"grad_norm": 70.59458692509646,
"learning_rate": 3.351625499337395e-06,
"logits/chosen": -2.821207284927368,
"logits/rejected": -2.655557155609131,
"logps/chosen": -336.3492126464844,
"logps/rejected": -360.6393127441406,
"loss": 0.4803,
"positive_losses": 4.53096866607666,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.37952059507369995,
"rewards/margins": 1.2218748331069946,
"rewards/margins_max": 1.5149763822555542,
"rewards/margins_min": 0.9287732243537903,
"rewards/margins_std": 0.4145084023475647,
"rewards/rejected": -0.8423541784286499,
"step": 480
},
{
"dpo_losses": 0.38940221071243286,
"epoch": 1.38,
"grad_norm": 2.687569639501308,
"learning_rate": 3.2741017310271056e-06,
"logits/chosen": -2.6762735843658447,
"logits/rejected": -2.549715280532837,
"logps/chosen": -201.81640625,
"logps/rejected": -277.5948791503906,
"loss": 0.4423,
"positive_losses": 0.7856195569038391,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2850914001464844,
"rewards/margins": 0.8952637910842896,
"rewards/margins_max": 1.2249016761779785,
"rewards/margins_min": 0.5656259655952454,
"rewards/margins_std": 0.46617835760116577,
"rewards/rejected": -0.6101723909378052,
"step": 490
},
{
"dpo_losses": 0.3359260559082031,
"epoch": 1.41,
"grad_norm": 8.068773298281624,
"learning_rate": 3.195745570816532e-06,
"logits/chosen": -2.582794189453125,
"logits/rejected": -2.5295655727386475,
"logps/chosen": -293.511962890625,
"logps/rejected": -310.9229736328125,
"loss": 0.4651,
"positive_losses": 1.6097240447998047,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.3676465153694153,
"rewards/margins": 1.0873607397079468,
"rewards/margins_max": 1.359550952911377,
"rewards/margins_min": 0.8151704668998718,
"rewards/margins_std": 0.38493508100509644,
"rewards/rejected": -0.7197142243385315,
"step": 500
},
{
"epoch": 1.41,
"eval_dpo_losses": 0.6384106874465942,
"eval_logits/chosen": -2.7216532230377197,
"eval_logits/rejected": -2.6714365482330322,
"eval_logps/chosen": -298.51165771484375,
"eval_logps/rejected": -292.0330505371094,
"eval_loss": 2.6646323204040527,
"eval_positive_losses": 20.608970642089844,
"eval_rewards/accuracies": 0.6626983880996704,
"eval_rewards/chosen": -0.13290439546108246,
"eval_rewards/margins": 0.1955995112657547,
"eval_rewards/margins_max": 0.7628427743911743,
"eval_rewards/margins_min": -0.3882632255554199,
"eval_rewards/margins_std": 0.5195400714874268,
"eval_rewards/rejected": -0.32850393652915955,
"eval_runtime": 285.1068,
"eval_samples_per_second": 7.015,
"eval_steps_per_second": 0.221,
"step": 500
},
{
"dpo_losses": 0.33750054240226746,
"epoch": 1.44,
"grad_norm": 5.205737491948956,
"learning_rate": 3.116641275116018e-06,
"logits/chosen": -2.409104108810425,
"logits/rejected": -2.434281349182129,
"logps/chosen": -200.69908142089844,
"logps/rejected": -388.02001953125,
"loss": 0.398,
"positive_losses": 1.0130329132080078,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.26239317655563354,
"rewards/margins": 1.0276142358779907,
"rewards/margins_max": 1.2639634609222412,
"rewards/margins_min": 0.7912648916244507,
"rewards/margins_std": 0.3342483639717102,
"rewards/rejected": -0.7652209997177124,
"step": 510
},
{
"dpo_losses": 0.2813549041748047,
"epoch": 1.46,
"grad_norm": 81.57789306373847,
"learning_rate": 3.0368739048062956e-06,
"logits/chosen": -2.748539447784424,
"logits/rejected": -2.641331672668457,
"logps/chosen": -305.63671875,
"logps/rejected": -331.99383544921875,
"loss": 0.5374,
"positive_losses": 10.878652572631836,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30826514959335327,
"rewards/margins": 1.2961227893829346,
"rewards/margins_max": 1.768711805343628,
"rewards/margins_min": 0.823533833026886,
"rewards/margins_std": 0.6683418154716492,
"rewards/rejected": -0.9878576397895813,
"step": 520
},
{
"dpo_losses": 0.2712605893611908,
"epoch": 1.49,
"grad_norm": 75.79271498324394,
"learning_rate": 2.956529233772492e-06,
"logits/chosen": -2.689558744430542,
"logits/rejected": -2.6852006912231445,
"logps/chosen": -292.9363098144531,
"logps/rejected": -357.29400634765625,
"loss": 0.3968,
"positive_losses": 0.0,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.4060862958431244,
"rewards/margins": 1.2946475744247437,
"rewards/margins_max": 1.633283019065857,
"rewards/margins_min": 0.9560121297836304,
"rewards/margins_std": 0.47890281677246094,
"rewards/rejected": -0.8885613679885864,
"step": 530
},
{
"dpo_losses": 0.27980148792266846,
"epoch": 1.52,
"grad_norm": 56.497845904041995,
"learning_rate": 2.8756936566714317e-06,
"logits/chosen": -2.7521424293518066,
"logits/rejected": -2.6638569831848145,
"logps/chosen": -310.28753662109375,
"logps/rejected": -327.8934020996094,
"loss": 0.5646,
"positive_losses": 1.8035399913787842,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4047401547431946,
"rewards/margins": 1.3088172674179077,
"rewards/margins_max": 1.7437057495117188,
"rewards/margins_min": 0.8739286661148071,
"rewards/margins_std": 0.6150254011154175,
"rewards/rejected": -0.9040770530700684,
"step": 540
},
{
"dpo_losses": 0.30083730816841125,
"epoch": 1.55,
"grad_norm": 4.003119682647961,
"learning_rate": 2.794454096031429e-06,
"logits/chosen": -2.722224235534668,
"logits/rejected": -2.6790289878845215,
"logps/chosen": -281.0094299316406,
"logps/rejected": -354.3661804199219,
"loss": 0.387,
"positive_losses": 0.4849150776863098,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3756260275840759,
"rewards/margins": 1.2488583326339722,
"rewards/margins_max": 1.8290369510650635,
"rewards/margins_min": 0.668679416179657,
"rewards/margins_std": 0.820496678352356,
"rewards/rejected": -0.8732322454452515,
"step": 550
},
{
"dpo_losses": 0.2974298894405365,
"epoch": 1.58,
"grad_norm": 4.054485926091979,
"learning_rate": 2.71289790878446e-06,
"logits/chosen": -2.6345105171203613,
"logits/rejected": -2.6252238750457764,
"logps/chosen": -266.069580078125,
"logps/rejected": -428.830322265625,
"loss": 0.4149,
"positive_losses": 0.9539718627929688,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.36717483401298523,
"rewards/margins": 1.2800363302230835,
"rewards/margins_max": 1.847495675086975,
"rewards/margins_min": 0.7125769257545471,
"rewards/margins_std": 0.8025087118148804,
"rewards/rejected": -0.9128614664077759,
"step": 560
},
{
"dpo_losses": 0.2434779852628708,
"epoch": 1.61,
"grad_norm": 13.695790466916172,
"learning_rate": 2.6311127923312156e-06,
"logits/chosen": -2.7691166400909424,
"logits/rejected": -2.570652723312378,
"logps/chosen": -357.65362548828125,
"logps/rejected": -422.05902099609375,
"loss": 0.3595,
"positive_losses": 1.6939789056777954,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.35474830865859985,
"rewards/margins": 1.4161592721939087,
"rewards/margins_max": 1.7364327907562256,
"rewards/margins_min": 1.0958856344223022,
"rewards/margins_std": 0.4529353678226471,
"rewards/rejected": -1.061410903930664,
"step": 570
},
{
"dpo_losses": 0.26757892966270447,
"epoch": 1.63,
"grad_norm": 45.10124515413211,
"learning_rate": 2.549186690240057e-06,
"logits/chosen": -2.7345547676086426,
"logits/rejected": -2.6686453819274902,
"logps/chosen": -254.34683227539062,
"logps/rejected": -315.84857177734375,
"loss": 0.5253,
"positive_losses": 0.2569518983364105,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.43774762749671936,
"rewards/margins": 1.3818461894989014,
"rewards/margins_max": 1.815768837928772,
"rewards/margins_min": 0.9479236602783203,
"rewards/margins_std": 0.6136592626571655,
"rewards/rejected": -0.9440986514091492,
"step": 580
},
{
"dpo_losses": 0.28964871168136597,
"epoch": 1.66,
"grad_norm": 7.214863048583421,
"learning_rate": 2.4672076976812548e-06,
"logits/chosen": -2.6155965328216553,
"logits/rejected": -2.465445041656494,
"logps/chosen": -330.9356994628906,
"logps/rejected": -382.1274719238281,
"loss": 0.4009,
"positive_losses": 0.20948180556297302,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.42689600586891174,
"rewards/margins": 1.3260236978530884,
"rewards/margins_max": 1.9126968383789062,
"rewards/margins_min": 0.7393506169319153,
"rewards/margins_std": 0.8296809196472168,
"rewards/rejected": -0.8991276025772095,
"step": 590
},
{
"dpo_losses": 0.3019997179508209,
"epoch": 1.69,
"grad_norm": 2.5656935168095365,
"learning_rate": 2.3852639666982218e-06,
"logits/chosen": -2.696664571762085,
"logits/rejected": -2.6669843196868896,
"logps/chosen": -210.6801300048828,
"logps/rejected": -339.8411560058594,
"loss": 0.5269,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4121875762939453,
"rewards/margins": 1.2055537700653076,
"rewards/margins_max": 1.5584628582000732,
"rewards/margins_min": 0.8526442646980286,
"rewards/margins_std": 0.49908918142318726,
"rewards/rejected": -0.793366014957428,
"step": 600
},
{
"epoch": 1.69,
"eval_dpo_losses": 0.6337167024612427,
"eval_logits/chosen": -2.653167724609375,
"eval_logits/rejected": -2.6025989055633545,
"eval_logps/chosen": -326.8940734863281,
"eval_logps/rejected": -323.9284362792969,
"eval_loss": 5.016211986541748,
"eval_positive_losses": 46.1312141418457,
"eval_rewards/accuracies": 0.6626983880996704,
"eval_rewards/chosen": -0.4167284667491913,
"eval_rewards/margins": 0.2307295948266983,
"eval_rewards/margins_max": 0.8626330494880676,
"eval_rewards/margins_min": -0.4616139829158783,
"eval_rewards/margins_std": 0.5963027477264404,
"eval_rewards/rejected": -0.647458016872406,
"eval_runtime": 284.3544,
"eval_samples_per_second": 7.033,
"eval_steps_per_second": 0.222,
"step": 600
},
{
"dpo_losses": 0.3822602331638336,
"epoch": 1.72,
"grad_norm": 4.499258828055022,
"learning_rate": 2.303443611417584e-06,
"logits/chosen": -2.5053551197052,
"logits/rejected": -2.452122449874878,
"logps/chosen": -285.8536682128906,
"logps/rejected": -345.1878662109375,
"loss": 0.5838,
"positive_losses": 7.787275791168213,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.20293152332305908,
"rewards/margins": 0.9711725115776062,
"rewards/margins_max": 1.5771162509918213,
"rewards/margins_min": 0.36522871255874634,
"rewards/margins_std": 0.8569338917732239,
"rewards/rejected": -0.7682409286499023,
"step": 610
},
{
"dpo_losses": 0.2892194390296936,
"epoch": 1.75,
"grad_norm": 5.081357332154091,
"learning_rate": 2.2218346133000264e-06,
"logits/chosen": -2.5583109855651855,
"logits/rejected": -2.4557156562805176,
"logps/chosen": -241.0048370361328,
"logps/rejected": -288.3791809082031,
"loss": 0.4921,
"positive_losses": 4.499431610107422,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3376957178115845,
"rewards/margins": 1.2491505146026611,
"rewards/margins_max": 1.6783252954483032,
"rewards/margins_min": 0.8199755549430847,
"rewards/margins_std": 0.606944739818573,
"rewards/rejected": -0.9114546775817871,
"step": 620
},
{
"dpo_losses": 0.28637608885765076,
"epoch": 1.77,
"grad_norm": 31.375745057762174,
"learning_rate": 2.140524726533792e-06,
"logits/chosen": -2.611680030822754,
"logits/rejected": -2.492157459259033,
"logps/chosen": -342.9209899902344,
"logps/rejected": -305.1431884765625,
"loss": 0.381,
"positive_losses": 1.477830171585083,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.5035167336463928,
"rewards/margins": 1.3139656782150269,
"rewards/margins_max": 1.746787428855896,
"rewards/margins_min": 0.8811438679695129,
"rewards/margins_std": 0.6121026277542114,
"rewards/rejected": -0.8104490041732788,
"step": 630
},
{
"dpo_losses": 0.27914196252822876,
"epoch": 1.8,
"grad_norm": 56.714432514737815,
"learning_rate": 2.059601383672566e-06,
"logits/chosen": -2.6837282180786133,
"logits/rejected": -2.669649600982666,
"logps/chosen": -205.0702362060547,
"logps/rejected": -292.3086853027344,
"loss": 0.6023,
"positive_losses": 3.721278429031372,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2986445426940918,
"rewards/margins": 1.2236577272415161,
"rewards/margins_max": 1.4498487710952759,
"rewards/margins_min": 0.9974665641784668,
"rewards/margins_std": 0.31988245248794556,
"rewards/rejected": -0.9250132441520691,
"step": 640
},
{
"dpo_losses": 0.3276744782924652,
"epoch": 1.83,
"grad_norm": 108.41037124625116,
"learning_rate": 1.9791516016192214e-06,
"logits/chosen": -2.7006583213806152,
"logits/rejected": -2.657177686691284,
"logps/chosen": -219.15249633789062,
"logps/rejected": -298.5721130371094,
"loss": 0.3902,
"positive_losses": 0.02580871619284153,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2983975410461426,
"rewards/margins": 1.0909839868545532,
"rewards/margins_max": 1.5546290874481201,
"rewards/margins_min": 0.6273389458656311,
"rewards/margins_std": 0.6556931138038635,
"rewards/rejected": -0.7925864458084106,
"step": 650
},
{
"dpo_losses": 0.37573254108428955,
"epoch": 1.86,
"grad_norm": 4.49512981087327,
"learning_rate": 1.8992618880565039e-06,
"logits/chosen": -2.4442310333251953,
"logits/rejected": -2.430908679962158,
"logps/chosen": -247.6465301513672,
"logps/rejected": -270.6328125,
"loss": 0.673,
"positive_losses": 9.613517761230469,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.23618540167808533,
"rewards/margins": 1.0627477169036865,
"rewards/margins_max": 1.7770532369613647,
"rewards/margins_min": 0.3484421372413635,
"rewards/margins_std": 1.0101807117462158,
"rewards/rejected": -0.8265622854232788,
"step": 660
},
{
"dpo_losses": 0.2606434226036072,
"epoch": 1.89,
"grad_norm": 16.900887081181846,
"learning_rate": 1.8200181484252888e-06,
"logits/chosen": -2.728989601135254,
"logits/rejected": -2.65732741355896,
"logps/chosen": -339.34649658203125,
"logps/rejected": -414.9603576660156,
"loss": 0.3802,
"positive_losses": 0.0,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.46093645691871643,
"rewards/margins": 1.4545660018920898,
"rewards/margins_max": 1.9622220993041992,
"rewards/margins_min": 0.9469099044799805,
"rewards/margins_std": 0.7179341316223145,
"rewards/rejected": -0.9936296343803406,
"step": 670
},
{
"dpo_losses": 0.22198085486888885,
"epoch": 1.92,
"grad_norm": 41.51526627620706,
"learning_rate": 1.7415055935504234e-06,
"logits/chosen": -2.705850601196289,
"logits/rejected": -2.6019129753112793,
"logps/chosen": -284.8177795410156,
"logps/rejected": -411.76708984375,
"loss": 0.4159,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.37507936358451843,
"rewards/margins": 1.5223026275634766,
"rewards/margins_max": 1.767469048500061,
"rewards/margins_min": 1.2771363258361816,
"rewards/margins_std": 0.34671759605407715,
"rewards/rejected": -1.1472233533859253,
"step": 680
},
{
"dpo_losses": 0.3486565351486206,
"epoch": 1.94,
"grad_norm": 138.4896910648948,
"learning_rate": 1.6638086480134954e-06,
"logits/chosen": -2.577733039855957,
"logits/rejected": -2.557359218597412,
"logps/chosen": -144.18289184570312,
"logps/rejected": -205.9375762939453,
"loss": 0.4276,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3236793875694275,
"rewards/margins": 1.0879504680633545,
"rewards/margins_max": 1.6922643184661865,
"rewards/margins_min": 0.4836367070674896,
"rewards/margins_std": 0.85462886095047,
"rewards/rejected": -0.7642711400985718,
"step": 690
},
{
"dpo_losses": 0.24665436148643494,
"epoch": 1.97,
"grad_norm": 22.11936611709013,
"learning_rate": 1.5870108593710473e-06,
"logits/chosen": -2.422232151031494,
"logits/rejected": -2.351428508758545,
"logps/chosen": -301.96270751953125,
"logps/rejected": -312.5522766113281,
"loss": 0.3513,
"positive_losses": 0.03521118313074112,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5125211477279663,
"rewards/margins": 1.4875143766403198,
"rewards/margins_max": 1.8352491855621338,
"rewards/margins_min": 1.139779806137085,
"rewards/margins_std": 0.4917708933353424,
"rewards/rejected": -0.9749932289123535,
"step": 700
},
{
"epoch": 1.97,
"eval_dpo_losses": 0.6398608684539795,
"eval_logits/chosen": -2.631686210632324,
"eval_logits/rejected": -2.5807785987854004,
"eval_logps/chosen": -326.29583740234375,
"eval_logps/rejected": -325.2173156738281,
"eval_loss": 4.895449161529541,
"eval_positive_losses": 45.593257904052734,
"eval_rewards/accuracies": 0.6626983880996704,
"eval_rewards/chosen": -0.41074639558792114,
"eval_rewards/margins": 0.24960003793239594,
"eval_rewards/margins_max": 0.9743701815605164,
"eval_rewards/margins_min": -0.5254129767417908,
"eval_rewards/margins_std": 0.6826153993606567,
"eval_rewards/rejected": -0.6603464484214783,
"eval_runtime": 284.0532,
"eval_samples_per_second": 7.041,
"eval_steps_per_second": 0.222,
"step": 700
},
{
"dpo_losses": 0.32716676592826843,
"epoch": 2.0,
"grad_norm": 24.97077759875969,
"learning_rate": 1.511194808315853e-06,
"logits/chosen": -2.5247268676757812,
"logits/rejected": -2.486575126647949,
"logps/chosen": -229.55859375,
"logps/rejected": -268.9668273925781,
"loss": 0.4163,
"positive_losses": 0.5168693661689758,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.28237053751945496,
"rewards/margins": 1.146689772605896,
"rewards/margins_max": 1.594560146331787,
"rewards/margins_min": 0.6988194584846497,
"rewards/margins_std": 0.6333842873573303,
"rewards/rejected": -0.8643192052841187,
"step": 710
},
{
"dpo_losses": 0.21982404589653015,
"epoch": 2.03,
"grad_norm": 1.38353688722549,
"learning_rate": 1.4364420198778662e-06,
"logits/chosen": -2.7155685424804688,
"logits/rejected": -2.609267234802246,
"logps/chosen": -343.7250061035156,
"logps/rejected": -450.3816833496094,
"loss": 0.3634,
"positive_losses": 4.519556999206543,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.456549733877182,
"rewards/margins": 1.6569738388061523,
"rewards/margins_max": 2.1707637310028076,
"rewards/margins_min": 1.143183946609497,
"rewards/margins_std": 0.7266086935997009,
"rewards/rejected": -1.200424075126648,
"step": 720
},
{
"dpo_losses": 0.25513142347335815,
"epoch": 2.06,
"grad_norm": 3.318839287140489,
"learning_rate": 1.3628328757603243e-06,
"logits/chosen": -2.6959056854248047,
"logits/rejected": -2.5843894481658936,
"logps/chosen": -267.92010498046875,
"logps/rejected": -357.7880554199219,
"loss": 0.2684,
"positive_losses": 0.038549043238162994,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.31957200169563293,
"rewards/margins": 1.4021821022033691,
"rewards/margins_max": 1.6910970211029053,
"rewards/margins_min": 1.113266944885254,
"rewards/margins_std": 0.4085877537727356,
"rewards/rejected": -1.082610011100769,
"step": 730
},
{
"dpo_losses": 0.30432650446891785,
"epoch": 2.08,
"grad_norm": 36.95329512381558,
"learning_rate": 1.2904465279052725e-06,
"logits/chosen": -2.634579658508301,
"logits/rejected": -2.56650710105896,
"logps/chosen": -284.7083740234375,
"logps/rejected": -317.93389892578125,
"loss": 0.4788,
"positive_losses": 3.9446158409118652,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.22743673622608185,
"rewards/margins": 1.2028728723526,
"rewards/margins_max": 1.6675021648406982,
"rewards/margins_min": 0.7382434606552124,
"rewards/margins_std": 0.6570851802825928,
"rewards/rejected": -0.9754360914230347,
"step": 740
},
{
"dpo_losses": 0.2779385447502136,
"epoch": 2.11,
"grad_norm": 5.125527517570657,
"learning_rate": 1.219360813381446e-06,
"logits/chosen": -2.462111234664917,
"logits/rejected": -2.498530387878418,
"logps/chosen": -159.8828887939453,
"logps/rejected": -236.5124053955078,
"loss": 0.2882,
"positive_losses": 0.18086472153663635,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3097127676010132,
"rewards/margins": 1.3083655834197998,
"rewards/margins_max": 1.592053771018982,
"rewards/margins_min": 1.0246771574020386,
"rewards/margins_std": 0.4011960029602051,
"rewards/rejected": -0.9986528158187866,
"step": 750
},
{
"dpo_losses": 0.22775745391845703,
"epoch": 2.14,
"grad_norm": 18.30294185818396,
"learning_rate": 1.1496521706860392e-06,
"logits/chosen": -2.651033401489258,
"logits/rejected": -2.537503242492676,
"logps/chosen": -291.1076354980469,
"logps/rejected": -382.2750244140625,
"loss": 0.3201,
"positive_losses": 2.4156768321990967,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.36460763216018677,
"rewards/margins": 1.5596258640289307,
"rewards/margins_max": 1.959398627281189,
"rewards/margins_min": 1.1598527431488037,
"rewards/margins_std": 0.5653643012046814,
"rewards/rejected": -1.1950181722640991,
"step": 760
},
{
"dpo_losses": 0.21675769984722137,
"epoch": 2.17,
"grad_norm": 21.74451175593295,
"learning_rate": 1.0813955575503588e-06,
"logits/chosen": -2.604640483856201,
"logits/rejected": -2.5890743732452393,
"logps/chosen": -301.3707580566406,
"logps/rejected": -381.50506591796875,
"loss": 0.3818,
"positive_losses": 0.353890985250473,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4118216633796692,
"rewards/margins": 1.6504104137420654,
"rewards/margins_max": 2.0086562633514404,
"rewards/margins_min": 1.2921648025512695,
"rewards/margins_std": 0.5066360235214233,
"rewards/rejected": -1.2385889291763306,
"step": 770
},
{
"dpo_losses": 0.24907536804676056,
"epoch": 2.2,
"grad_norm": 5.9813942121541706,
"learning_rate": 1.0146643703377488e-06,
"logits/chosen": -2.734790325164795,
"logits/rejected": -2.537445306777954,
"logps/chosen": -292.0768127441406,
"logps/rejected": -332.3907775878906,
"loss": 0.4576,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5042055249214172,
"rewards/margins": 1.5501843690872192,
"rewards/margins_max": 2.1402950286865234,
"rewards/margins_min": 0.9600737690925598,
"rewards/margins_std": 0.8345423936843872,
"rewards/rejected": -1.0459789037704468,
"step": 780
},
{
"dpo_losses": 0.20471492409706116,
"epoch": 2.23,
"grad_norm": 2.116074836272933,
"learning_rate": 9.495303651204496e-07,
"logits/chosen": -2.611013889312744,
"logits/rejected": -2.5461339950561523,
"logps/chosen": -319.31951904296875,
"logps/rejected": -404.64886474609375,
"loss": 0.4666,
"positive_losses": 0.0,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.3257637321949005,
"rewards/margins": 1.60427725315094,
"rewards/margins_max": 1.9402239322662354,
"rewards/margins_min": 1.2683299779891968,
"rewards/margins_std": 0.4751007556915283,
"rewards/rejected": -1.2785133123397827,
"step": 790
},
{
"dpo_losses": 0.2155081331729889,
"epoch": 2.25,
"grad_norm": 12.840834237921664,
"learning_rate": 8.860635805202616e-07,
"logits/chosen": -2.615548610687256,
"logits/rejected": -2.5271685123443604,
"logps/chosen": -304.5693054199219,
"logps/rejected": -362.62225341796875,
"loss": 0.2795,
"positive_losses": 0.01874256134033203,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4078141152858734,
"rewards/margins": 1.58090341091156,
"rewards/margins_max": 1.9335031509399414,
"rewards/margins_min": 1.2283036708831787,
"rewards/margins_std": 0.49865132570266724,
"rewards/rejected": -1.1730893850326538,
"step": 800
},
{
"epoch": 2.25,
"eval_dpo_losses": 0.6266348958015442,
"eval_logits/chosen": -2.604722738265991,
"eval_logits/rejected": -2.554541826248169,
"eval_logps/chosen": -324.4103088378906,
"eval_logps/rejected": -327.570556640625,
"eval_loss": 4.769333839416504,
"eval_positive_losses": 43.908966064453125,
"eval_rewards/accuracies": 0.682539701461792,
"eval_rewards/chosen": -0.3918909430503845,
"eval_rewards/margins": 0.29198840260505676,
"eval_rewards/margins_max": 1.0657094717025757,
"eval_rewards/margins_min": -0.5265500545501709,
"eval_rewards/margins_std": 0.7165747284889221,
"eval_rewards/rejected": -0.6838793158531189,
"eval_runtime": 284.6208,
"eval_samples_per_second": 7.027,
"eval_steps_per_second": 0.221,
"step": 800
},
{
"dpo_losses": 0.25243309140205383,
"epoch": 2.28,
"grad_norm": 71.28169182787225,
"learning_rate": 8.24332262395994e-07,
"logits/chosen": -2.6843011379241943,
"logits/rejected": -2.6510274410247803,
"logps/chosen": -252.87222290039062,
"logps/rejected": -349.50506591796875,
"loss": 0.3457,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3779425024986267,
"rewards/margins": 1.4469493627548218,
"rewards/margins_max": 2.0055932998657227,
"rewards/margins_min": 0.8883053660392761,
"rewards/margins_std": 0.7900420427322388,
"rewards/rejected": -1.0690069198608398,
"step": 810
},
{
"dpo_losses": 0.25832101702690125,
"epoch": 2.31,
"grad_norm": 245.8246008336025,
"learning_rate": 7.644027904586587e-07,
"logits/chosen": -2.637300968170166,
"logits/rejected": -2.5708765983581543,
"logps/chosen": -227.47787475585938,
"logps/rejected": -322.4635925292969,
"loss": 0.5117,
"positive_losses": 4.7760443687438965,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2738664150238037,
"rewards/margins": 1.4220813512802124,
"rewards/margins_max": 1.896773338317871,
"rewards/margins_min": 0.9473894238471985,
"rewards/margins_std": 0.6713159084320068,
"rewards/rejected": -1.1482150554656982,
"step": 820
},
{
"dpo_losses": 0.16726334393024445,
"epoch": 2.34,
"grad_norm": 1.7593306703555782,
"learning_rate": 7.06339606893347e-07,
"logits/chosen": -2.6265785694122314,
"logits/rejected": -2.5026650428771973,
"logps/chosen": -399.26031494140625,
"logps/rejected": -387.8680419921875,
"loss": 0.2112,
"positive_losses": 0.06428833305835724,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5203009247779846,
"rewards/margins": 1.8566944599151611,
"rewards/margins_max": 2.1375911235809326,
"rewards/margins_min": 1.5757976770401,
"rewards/margins_std": 0.3972480893135071,
"rewards/rejected": -1.3363934755325317,
"step": 830
},
{
"dpo_losses": 0.2129584103822708,
"epoch": 2.37,
"grad_norm": 28.81303382097675,
"learning_rate": 6.502051470645149e-07,
"logits/chosen": -2.721235513687134,
"logits/rejected": -2.5673904418945312,
"logps/chosen": -341.94073486328125,
"logps/rejected": -413.451171875,
"loss": 0.3816,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.34823325276374817,
"rewards/margins": 1.6308362483978271,
"rewards/margins_max": 1.947928786277771,
"rewards/margins_min": 1.313744068145752,
"rewards/margins_std": 0.4484362006187439,
"rewards/rejected": -1.282603144645691,
"step": 840
},
{
"dpo_losses": 0.204869344830513,
"epoch": 2.39,
"grad_norm": 1.8754374311724713,
"learning_rate": 5.960597723792194e-07,
"logits/chosen": -2.610276937484741,
"logits/rejected": -2.4925060272216797,
"logps/chosen": -280.25665283203125,
"logps/rejected": -387.3306579589844,
"loss": 0.429,
"positive_losses": 4.123325824737549,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3918009400367737,
"rewards/margins": 1.684704065322876,
"rewards/margins_max": 2.1113224029541016,
"rewards/margins_min": 1.2580856084823608,
"rewards/margins_std": 0.6033294796943665,
"rewards/rejected": -1.2929030656814575,
"step": 850
},
{
"dpo_losses": 0.18849320709705353,
"epoch": 2.42,
"grad_norm": 2.415129688011,
"learning_rate": 5.43961705380465e-07,
"logits/chosen": -2.5959548950195312,
"logits/rejected": -2.5745034217834473,
"logps/chosen": -274.2839660644531,
"logps/rejected": -413.91650390625,
"loss": 0.3573,
"positive_losses": 0.0,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.4205331802368164,
"rewards/margins": 1.8768619298934937,
"rewards/margins_max": 2.31927490234375,
"rewards/margins_min": 1.4344491958618164,
"rewards/margins_std": 0.6256662011146545,
"rewards/rejected": -1.4563289880752563,
"step": 860
},
{
"dpo_losses": 0.24235720932483673,
"epoch": 2.45,
"grad_norm": 2.200547137281921,
"learning_rate": 4.939669671404871e-07,
"logits/chosen": -2.5770421028137207,
"logits/rejected": -2.521031618118286,
"logps/chosen": -251.25564575195312,
"logps/rejected": -441.2269592285156,
"loss": 0.4093,
"positive_losses": 5.246364116668701,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2414274662733078,
"rewards/margins": 1.4953609704971313,
"rewards/margins_max": 1.9941730499267578,
"rewards/margins_min": 0.9965487718582153,
"rewards/margins_std": 0.7054268717765808,
"rewards/rejected": -1.2539334297180176,
"step": 870
},
{
"dpo_losses": 0.2292724847793579,
"epoch": 2.48,
"grad_norm": 33.76430360961392,
"learning_rate": 4.461293170212644e-07,
"logits/chosen": -2.6965794563293457,
"logits/rejected": -2.543576717376709,
"logps/chosen": -292.703125,
"logps/rejected": -368.0157775878906,
"loss": 0.3654,
"positive_losses": 5.510960578918457,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29951637983322144,
"rewards/margins": 1.5260313749313354,
"rewards/margins_max": 1.979828119277954,
"rewards/margins_min": 1.0722346305847168,
"rewards/margins_std": 0.6417653560638428,
"rewards/rejected": -1.2265150547027588,
"step": 880
},
{
"dpo_losses": 0.15858207643032074,
"epoch": 2.51,
"grad_norm": 5.727775081054632,
"learning_rate": 4.005001948670606e-07,
"logits/chosen": -2.694242238998413,
"logits/rejected": -2.595343828201294,
"logps/chosen": -382.9683532714844,
"logps/rejected": -468.80157470703125,
"loss": 0.463,
"positive_losses": 0.11419792473316193,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5407828092575073,
"rewards/margins": 1.968488097190857,
"rewards/margins_max": 2.390479564666748,
"rewards/margins_min": 1.546496868133545,
"rewards/margins_std": 0.5967859029769897,
"rewards/rejected": -1.4277052879333496,
"step": 890
},
{
"dpo_losses": 0.18081924319267273,
"epoch": 2.54,
"grad_norm": 6.1887239729076455,
"learning_rate": 3.571286656911377e-07,
"logits/chosen": -2.6035306453704834,
"logits/rejected": -2.4794845581054688,
"logps/chosen": -310.08013916015625,
"logps/rejected": -408.18426513671875,
"loss": 0.3544,
"positive_losses": 2.6008810997009277,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4102245271205902,
"rewards/margins": 1.8388206958770752,
"rewards/margins_max": 2.2580726146698,
"rewards/margins_min": 1.4195688962936401,
"rewards/margins_std": 0.5929116606712341,
"rewards/rejected": -1.4285962581634521,
"step": 900
},
{
"epoch": 2.54,
"eval_dpo_losses": 0.6314364075660706,
"eval_logits/chosen": -2.586303472518921,
"eval_logits/rejected": -2.535871744155884,
"eval_logps/chosen": -332.5704345703125,
"eval_logps/rejected": -335.68133544921875,
"eval_loss": 5.36396598815918,
"eval_positive_losses": 51.33633804321289,
"eval_rewards/accuracies": 0.670634925365448,
"eval_rewards/chosen": -0.47349241375923157,
"eval_rewards/margins": 0.29149433970451355,
"eval_rewards/margins_max": 1.078196406364441,
"eval_rewards/margins_min": -0.5344981551170349,
"eval_rewards/margins_std": 0.72893226146698,
"eval_rewards/rejected": -0.7649868130683899,
"eval_runtime": 284.4452,
"eval_samples_per_second": 7.031,
"eval_steps_per_second": 0.221,
"step": 900
},
{
"dpo_losses": 0.20350190997123718,
"epoch": 2.56,
"grad_norm": 1.9747106750644823,
"learning_rate": 3.1606136691612555e-07,
"logits/chosen": -2.7836008071899414,
"logits/rejected": -2.5904271602630615,
"logps/chosen": -345.66265869140625,
"logps/rejected": -354.3844299316406,
"loss": 0.2637,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5323250889778137,
"rewards/margins": 1.6922286748886108,
"rewards/margins_max": 2.0907249450683594,
"rewards/margins_min": 1.2937328815460205,
"rewards/margins_std": 0.5635584592819214,
"rewards/rejected": -1.159903883934021,
"step": 910
},
{
"dpo_losses": 0.22471606731414795,
"epoch": 2.59,
"grad_norm": 11.212944207381554,
"learning_rate": 2.773424582247844e-07,
"logits/chosen": -2.5793869495391846,
"logits/rejected": -2.4063210487365723,
"logps/chosen": -291.543701171875,
"logps/rejected": -320.06353759765625,
"loss": 0.6166,
"positive_losses": 8.062161445617676,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.38504648208618164,
"rewards/margins": 1.6606292724609375,
"rewards/margins_max": 2.1240899562835693,
"rewards/margins_min": 1.1971690654754639,
"rewards/margins_std": 0.655431866645813,
"rewards/rejected": -1.275583028793335,
"step": 920
},
{
"dpo_losses": 0.20711331069469452,
"epoch": 2.62,
"grad_norm": 190.13690585667476,
"learning_rate": 2.410135740750821e-07,
"logits/chosen": -2.6692299842834473,
"logits/rejected": -2.5721378326416016,
"logps/chosen": -300.9726257324219,
"logps/rejected": -399.5599365234375,
"loss": 0.4929,
"positive_losses": 3.012037754058838,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.3912240266799927,
"rewards/margins": 1.6907918453216553,
"rewards/margins_max": 2.0557963848114014,
"rewards/margins_min": 1.3257873058319092,
"rewards/margins_std": 0.5161946415901184,
"rewards/rejected": -1.2995678186416626,
"step": 930
},
{
"dpo_losses": 0.3051915466785431,
"epoch": 2.65,
"grad_norm": 189.05899790144875,
"learning_rate": 2.0711377893064182e-07,
"logits/chosen": -2.639585256576538,
"logits/rejected": -2.488219738006592,
"logps/chosen": -312.65863037109375,
"logps/rejected": -305.5802917480469,
"loss": 0.4531,
"positive_losses": 3.7967441082000732,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2920045554637909,
"rewards/margins": 1.2456369400024414,
"rewards/margins_max": 1.7247259616851807,
"rewards/margins_min": 0.7665479183197021,
"rewards/margins_std": 0.6775342226028442,
"rewards/rejected": -0.9536323547363281,
"step": 940
},
{
"dpo_losses": 0.2704788148403168,
"epoch": 2.68,
"grad_norm": 15.556722893889498,
"learning_rate": 1.756795252547111e-07,
"logits/chosen": -2.588268518447876,
"logits/rejected": -2.501624584197998,
"logps/chosen": -225.04928588867188,
"logps/rejected": -314.3163757324219,
"loss": 0.4599,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3465590476989746,
"rewards/margins": 1.4794371128082275,
"rewards/margins_max": 1.9303573369979858,
"rewards/margins_min": 1.0285165309906006,
"rewards/margins_std": 0.6376978158950806,
"rewards/rejected": -1.1328779458999634,
"step": 950
},
{
"dpo_losses": 0.23120097815990448,
"epoch": 2.7,
"grad_norm": 3.6975387738343986,
"learning_rate": 1.4674461431281013e-07,
"logits/chosen": -2.7935328483581543,
"logits/rejected": -2.7169508934020996,
"logps/chosen": -246.69778442382812,
"logps/rejected": -358.2559509277344,
"loss": 0.3766,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3893265724182129,
"rewards/margins": 1.5332249402999878,
"rewards/margins_max": 2.008496046066284,
"rewards/margins_min": 1.0579537153244019,
"rewards/margins_std": 0.6721349954605103,
"rewards/rejected": -1.143898367881775,
"step": 960
},
{
"dpo_losses": 0.18317696452140808,
"epoch": 2.73,
"grad_norm": 25.967042428441264,
"learning_rate": 1.2034015982622243e-07,
"logits/chosen": -2.68410587310791,
"logits/rejected": -2.5668373107910156,
"logps/chosen": -320.6241760253906,
"logps/rejected": -454.39849853515625,
"loss": 0.3194,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4021673798561096,
"rewards/margins": 1.9114774465560913,
"rewards/margins_max": 2.451068878173828,
"rewards/margins_min": 1.3718855381011963,
"rewards/margins_std": 0.763097882270813,
"rewards/rejected": -1.5093098878860474,
"step": 970
},
{
"dpo_losses": 0.2419268637895584,
"epoch": 2.76,
"grad_norm": 19.29396011638503,
"learning_rate": 9.649455451539419e-08,
"logits/chosen": -2.555974006652832,
"logits/rejected": -2.4670310020446777,
"logps/chosen": -218.39334106445312,
"logps/rejected": -300.92254638671875,
"loss": 0.4254,
"positive_losses": 4.289657115936279,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.25288599729537964,
"rewards/margins": 1.518112301826477,
"rewards/margins_max": 1.9375699758529663,
"rewards/margins_min": 1.098654866218567,
"rewards/margins_std": 0.5932024717330933,
"rewards/rejected": -1.2652263641357422,
"step": 980
},
{
"dpo_losses": 0.20426790416240692,
"epoch": 2.79,
"grad_norm": 145.7358684722982,
"learning_rate": 7.523343956923196e-08,
"logits/chosen": -2.7547340393066406,
"logits/rejected": -2.6413354873657227,
"logps/chosen": -303.62115478515625,
"logps/rejected": -412.58782958984375,
"loss": 0.406,
"positive_losses": 1.1116502285003662,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4799574315547943,
"rewards/margins": 1.7528730630874634,
"rewards/margins_max": 2.2721505165100098,
"rewards/margins_min": 1.2335954904556274,
"rewards/margins_std": 0.734369158744812,
"rewards/rejected": -1.2729156017303467,
"step": 990
},
{
"dpo_losses": 0.2937398850917816,
"epoch": 2.82,
"grad_norm": 147.9672419405728,
"learning_rate": 5.657967707312195e-08,
"logits/chosen": -2.519782543182373,
"logits/rejected": -2.54045033454895,
"logps/chosen": -236.8069610595703,
"logps/rejected": -393.01373291015625,
"loss": 0.545,
"positive_losses": 6.547940254211426,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24184127151966095,
"rewards/margins": 1.3283250331878662,
"rewards/margins_max": 1.8528366088867188,
"rewards/margins_min": 0.8038133382797241,
"rewards/margins_std": 0.7417714595794678,
"rewards/rejected": -1.0864837169647217,
"step": 1000
},
{
"epoch": 2.82,
"eval_dpo_losses": 0.6312186121940613,
"eval_logits/chosen": -2.5872504711151123,
"eval_logits/rejected": -2.5366668701171875,
"eval_logps/chosen": -330.9984436035156,
"eval_logps/rejected": -333.99945068359375,
"eval_loss": 5.222360134124756,
"eval_positive_losses": 49.98057556152344,
"eval_rewards/accuracies": 0.6626983880996704,
"eval_rewards/chosen": -0.4577721953392029,
"eval_rewards/margins": 0.29039543867111206,
"eval_rewards/margins_max": 1.0717767477035522,
"eval_rewards/margins_min": -0.533184289932251,
"eval_rewards/margins_std": 0.724482536315918,
"eval_rewards/rejected": -0.7481676340103149,
"eval_runtime": 284.8086,
"eval_samples_per_second": 7.022,
"eval_steps_per_second": 0.221,
"step": 1000
},
{
"dpo_losses": 0.2392820119857788,
"epoch": 2.85,
"grad_norm": 87.50201169562474,
"learning_rate": 4.055332542531959e-08,
"logits/chosen": -2.7165019512176514,
"logits/rejected": -2.6357262134552,
"logps/chosen": -229.1401824951172,
"logps/rejected": -371.04571533203125,
"loss": 0.5645,
"positive_losses": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.34021449089050293,
"rewards/margins": 1.4760607481002808,
"rewards/margins_max": 1.7028331756591797,
"rewards/margins_min": 1.2492884397506714,
"rewards/margins_std": 0.3207046389579773,
"rewards/rejected": -1.1358463764190674,
"step": 1010
},
{
"dpo_losses": 0.2636774182319641,
"epoch": 2.87,
"grad_norm": 9.243316710391014,
"learning_rate": 2.7171617768147472e-08,
"logits/chosen": -2.5805556774139404,
"logits/rejected": -2.4946963787078857,
"logps/chosen": -200.70706176757812,
"logps/rejected": -348.9754638671875,
"loss": 0.4736,
"positive_losses": 4.167427062988281,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.226848766207695,
"rewards/margins": 1.4039170742034912,
"rewards/margins_max": 1.8958208560943604,
"rewards/margins_min": 0.9120131731033325,
"rewards/margins_std": 0.6956570148468018,
"rewards/rejected": -1.1770681142807007,
"step": 1020
},
{
"dpo_losses": 0.26111191511154175,
"epoch": 2.9,
"grad_norm": 3.7450554356463743,
"learning_rate": 1.6448943457189616e-08,
"logits/chosen": -2.5760231018066406,
"logits/rejected": -2.58748197555542,
"logps/chosen": -268.255126953125,
"logps/rejected": -387.044921875,
"loss": 0.2968,
"positive_losses": 1.4128901958465576,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.34100010991096497,
"rewards/margins": 1.478846788406372,
"rewards/margins_max": 2.0194473266601562,
"rewards/margins_min": 0.9382462501525879,
"rewards/margins_std": 0.7645247578620911,
"rewards/rejected": -1.1378467082977295,
"step": 1030
},
{
"dpo_losses": 0.24359698593616486,
"epoch": 2.93,
"grad_norm": 187.10263101103095,
"learning_rate": 8.39683258841123e-09,
"logits/chosen": -2.5231451988220215,
"logits/rejected": -2.408517360687256,
"logps/chosen": -264.57916259765625,
"logps/rejected": -332.7992248535156,
"loss": 0.3754,
"positive_losses": 0.07207755744457245,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4693472981452942,
"rewards/margins": 1.5425517559051514,
"rewards/margins_max": 1.9166587591171265,
"rewards/margins_min": 1.1684446334838867,
"rewards/margins_std": 0.52906733751297,
"rewards/rejected": -1.0732043981552124,
"step": 1040
},
{
"dpo_losses": 0.20071451365947723,
"epoch": 2.96,
"grad_norm": 3.6062297906425043,
"learning_rate": 3.0239435998430376e-09,
"logits/chosen": -2.645131826400757,
"logits/rejected": -2.5101490020751953,
"logps/chosen": -282.20855712890625,
"logps/rejected": -383.17950439453125,
"loss": 0.3373,
"positive_losses": 1.7565370798110962,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.41964656114578247,
"rewards/margins": 1.6733496189117432,
"rewards/margins_max": 2.073215961456299,
"rewards/margins_min": 1.2734830379486084,
"rewards/margins_std": 0.5654967427253723,
"rewards/rejected": -1.253702998161316,
"step": 1050
},
{
"dpo_losses": 0.22902190685272217,
"epoch": 2.99,
"grad_norm": 3.308213249224383,
"learning_rate": 3.3605396115826695e-10,
"logits/chosen": -2.394101142883301,
"logits/rejected": -2.4773335456848145,
"logps/chosen": -165.21871948242188,
"logps/rejected": -327.92352294921875,
"loss": 0.3158,
"positive_losses": 1.1246204376220703,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.30173832178115845,
"rewards/margins": 1.4864323139190674,
"rewards/margins_max": 1.822080373764038,
"rewards/margins_min": 1.150783896446228,
"rewards/margins_std": 0.4746781885623932,
"rewards/rejected": -1.1846938133239746,
"step": 1060
},
{
"epoch": 3.0,
"step": 1065,
"total_flos": 0.0,
"train_loss": 0.48024289137880566,
"train_runtime": 8933.1726,
"train_samples_per_second": 1.907,
"train_steps_per_second": 0.119
}
],
"logging_steps": 10,
"max_steps": 1065,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}