just1nseo's picture
Model save
0fe756b verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 355,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"dpo_losses": 0.6931471824645996,
"epoch": 0.0,
"grad_norm": 1.6018567815095135,
"learning_rate": 1.3888888888888888e-07,
"logits/chosen": -2.861618995666504,
"logits/rejected": -2.8205904960632324,
"logps/chosen": -271.06011962890625,
"logps/rejected": -211.1704559326172,
"loss": 0.6931,
"positive_losses": 0.0,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"dpo_losses": 0.6928361654281616,
"epoch": 0.03,
"grad_norm": 14.098492351037597,
"learning_rate": 1.3888888888888892e-06,
"logits/chosen": -2.8340628147125244,
"logits/rejected": -2.7916715145111084,
"logps/chosen": -324.87408447265625,
"logps/rejected": -274.8518371582031,
"loss": 0.6969,
"positive_losses": 0.03656284138560295,
"rewards/accuracies": 0.5138888955116272,
"rewards/chosen": 0.001762597355991602,
"rewards/margins": 0.0006246823468245566,
"rewards/margins_max": 0.0034460597671568394,
"rewards/margins_min": -0.002478615380823612,
"rewards/margins_std": 0.002669532783329487,
"rewards/rejected": 0.0011379148345440626,
"step": 10
},
{
"dpo_losses": 0.6901537775993347,
"epoch": 0.06,
"grad_norm": 1.829780676576113,
"learning_rate": 2.7777777777777783e-06,
"logits/chosen": -2.7248008251190186,
"logits/rejected": -2.7065372467041016,
"logps/chosen": -291.9751892089844,
"logps/rejected": -214.52914428710938,
"loss": 0.69,
"positive_losses": 0.00235748291015625,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.01850745640695095,
"rewards/margins": 0.006009287666529417,
"rewards/margins_max": 0.013369890861213207,
"rewards/margins_min": -0.0006899007130414248,
"rewards/margins_std": 0.006301888730376959,
"rewards/rejected": 0.01249817106872797,
"step": 20
},
{
"dpo_losses": 0.6790497303009033,
"epoch": 0.08,
"grad_norm": 2.096661038575657,
"learning_rate": 4.166666666666667e-06,
"logits/chosen": -2.8153939247131348,
"logits/rejected": -2.7460672855377197,
"logps/chosen": -298.10052490234375,
"logps/rejected": -229.7678680419922,
"loss": 0.677,
"positive_losses": 0.0,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.05605363845825195,
"rewards/margins": 0.02858993411064148,
"rewards/margins_max": 0.058357615023851395,
"rewards/margins_min": 0.004640273749828339,
"rewards/margins_std": 0.02467900700867176,
"rewards/rejected": 0.027463700622320175,
"step": 30
},
{
"dpo_losses": 0.6675597429275513,
"epoch": 0.11,
"grad_norm": 1.7320035926217752,
"learning_rate": 4.998060489154965e-06,
"logits/chosen": -2.8310070037841797,
"logits/rejected": -2.751425266265869,
"logps/chosen": -268.48809814453125,
"logps/rejected": -222.01107788085938,
"loss": 0.6662,
"positive_losses": 0.054492950439453125,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.08996561169624329,
"rewards/margins": 0.05272960662841797,
"rewards/margins_max": 0.1101265698671341,
"rewards/margins_min": 0.003616312053054571,
"rewards/margins_std": 0.048521898686885834,
"rewards/rejected": 0.03723599761724472,
"step": 40
},
{
"dpo_losses": 0.6397972106933594,
"epoch": 0.14,
"grad_norm": 9.583890638870626,
"learning_rate": 4.976275538042932e-06,
"logits/chosen": -2.7891061305999756,
"logits/rejected": -2.7175135612487793,
"logps/chosen": -262.20794677734375,
"logps/rejected": -231.79653930664062,
"loss": 0.6446,
"positive_losses": 0.0,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.13362163305282593,
"rewards/margins": 0.11281381547451019,
"rewards/margins_max": 0.23626498878002167,
"rewards/margins_min": 0.022470083087682724,
"rewards/margins_std": 0.0988926962018013,
"rewards/rejected": 0.02080780453979969,
"step": 50
},
{
"dpo_losses": 0.6110584139823914,
"epoch": 0.17,
"grad_norm": 2.0747443213986694,
"learning_rate": 4.93049306999712e-06,
"logits/chosen": -2.7118520736694336,
"logits/rejected": -2.6753315925598145,
"logps/chosen": -296.9767150878906,
"logps/rejected": -263.8233947753906,
"loss": 0.628,
"positive_losses": 0.011554336175322533,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.16662926971912384,
"rewards/margins": 0.17714819312095642,
"rewards/margins_max": 0.30765318870544434,
"rewards/margins_min": 0.05318903177976608,
"rewards/margins_std": 0.11578011512756348,
"rewards/rejected": -0.010518952272832394,
"step": 60
},
{
"dpo_losses": 0.6022371053695679,
"epoch": 0.2,
"grad_norm": 1.5871888283763238,
"learning_rate": 4.861156761634014e-06,
"logits/chosen": -2.7271430492401123,
"logits/rejected": -2.6688759326934814,
"logps/chosen": -303.47613525390625,
"logps/rejected": -236.2406463623047,
"loss": 0.6175,
"positive_losses": 0.19450588524341583,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.19385087490081787,
"rewards/margins": 0.19984133541584015,
"rewards/margins_max": 0.4134605824947357,
"rewards/margins_min": 0.04761160537600517,
"rewards/margins_std": 0.16880682110786438,
"rewards/rejected": -0.00599044980481267,
"step": 70
},
{
"dpo_losses": 0.5768495798110962,
"epoch": 0.23,
"grad_norm": 1.804849988880195,
"learning_rate": 4.7689385491773934e-06,
"logits/chosen": -2.738285779953003,
"logits/rejected": -2.684203863143921,
"logps/chosen": -300.8853454589844,
"logps/rejected": -292.05633544921875,
"loss": 0.6017,
"positive_losses": 0.328561395406723,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.20385125279426575,
"rewards/margins": 0.26062771677970886,
"rewards/margins_max": 0.4970013201236725,
"rewards/margins_min": 0.05170217156410217,
"rewards/margins_std": 0.2058703452348709,
"rewards/rejected": -0.056776486337184906,
"step": 80
},
{
"dpo_losses": 0.5672236084938049,
"epoch": 0.25,
"grad_norm": 2.184742961229221,
"learning_rate": 4.654732116743193e-06,
"logits/chosen": -2.6370556354522705,
"logits/rejected": -2.601066827774048,
"logps/chosen": -252.70535278320312,
"logps/rejected": -203.89418029785156,
"loss": 0.5769,
"positive_losses": 0.07196970283985138,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.2328944206237793,
"rewards/margins": 0.2819642424583435,
"rewards/margins_max": 0.514846682548523,
"rewards/margins_min": 0.09985215216875076,
"rewards/margins_std": 0.19404996931552887,
"rewards/rejected": -0.049069829285144806,
"step": 90
},
{
"dpo_losses": 0.5702880620956421,
"epoch": 0.28,
"grad_norm": 2.550586173059517,
"learning_rate": 4.5196442356717526e-06,
"logits/chosen": -2.6703598499298096,
"logits/rejected": -2.6374478340148926,
"logps/chosen": -264.9583740234375,
"logps/rejected": -273.49615478515625,
"loss": 0.6232,
"positive_losses": 1.2302151918411255,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.16453364491462708,
"rewards/margins": 0.27762115001678467,
"rewards/margins_max": 0.5491287708282471,
"rewards/margins_min": 0.05581303685903549,
"rewards/margins_std": 0.22483690083026886,
"rewards/rejected": -0.113087497651577,
"step": 100
},
{
"epoch": 0.28,
"eval_dpo_losses": 0.6656126976013184,
"eval_logits/chosen": -2.67258620262146,
"eval_logits/rejected": -2.6360833644866943,
"eval_logps/chosen": -280.30804443359375,
"eval_logps/rejected": -261.0971984863281,
"eval_loss": 1.1412982940673828,
"eval_positive_losses": 4.261031627655029,
"eval_rewards/accuracies": 0.6230000257492065,
"eval_rewards/chosen": 0.04285382851958275,
"eval_rewards/margins": 0.06803657114505768,
"eval_rewards/margins_max": 0.40864306688308716,
"eval_rewards/margins_min": -0.22808942198753357,
"eval_rewards/margins_std": 0.2094314992427826,
"eval_rewards/rejected": -0.02518274076282978,
"eval_runtime": 429.2755,
"eval_samples_per_second": 4.659,
"eval_steps_per_second": 0.291,
"step": 100
},
{
"dpo_losses": 0.5097740888595581,
"epoch": 0.31,
"grad_norm": 6.336382416368574,
"learning_rate": 4.364984038837727e-06,
"logits/chosen": -2.742903709411621,
"logits/rejected": -2.654869318008423,
"logps/chosen": -349.24517822265625,
"logps/rejected": -304.54730224609375,
"loss": 0.543,
"positive_losses": 0.44344156980514526,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.25360527634620667,
"rewards/margins": 0.43474069237709045,
"rewards/margins_max": 0.7704421281814575,
"rewards/margins_min": 0.1366521120071411,
"rewards/margins_std": 0.2834155559539795,
"rewards/rejected": -0.1811354160308838,
"step": 110
},
{
"dpo_losses": 0.518837571144104,
"epoch": 0.34,
"grad_norm": 2.194144050007341,
"learning_rate": 4.192250333880045e-06,
"logits/chosen": -2.7281386852264404,
"logits/rejected": -2.670868396759033,
"logps/chosen": -321.75982666015625,
"logps/rejected": -280.87091064453125,
"loss": 0.5524,
"positive_losses": 0.46012669801712036,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.26626402139663696,
"rewards/margins": 0.4130307137966156,
"rewards/margins_max": 0.7945607900619507,
"rewards/margins_min": 0.14706461131572723,
"rewards/margins_std": 0.2963123917579651,
"rewards/rejected": -0.14676669239997864,
"step": 120
},
{
"dpo_losses": 0.4917011260986328,
"epoch": 0.37,
"grad_norm": 1.7534787479023215,
"learning_rate": 4.0031170782990214e-06,
"logits/chosen": -2.711912155151367,
"logits/rejected": -2.634033441543579,
"logps/chosen": -353.554443359375,
"logps/rejected": -320.6388244628906,
"loss": 0.5518,
"positive_losses": 0.8977662920951843,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.2880980372428894,
"rewards/margins": 0.4901772439479828,
"rewards/margins_max": 0.8924927711486816,
"rewards/margins_min": 0.1499636471271515,
"rewards/margins_std": 0.3346417546272278,
"rewards/rejected": -0.20207922160625458,
"step": 130
},
{
"dpo_losses": 0.4866393208503723,
"epoch": 0.39,
"grad_norm": 21.27134583914694,
"learning_rate": 3.7994171571810756e-06,
"logits/chosen": -2.6895060539245605,
"logits/rejected": -2.6512811183929443,
"logps/chosen": -291.05548095703125,
"logps/rejected": -294.4687805175781,
"loss": 0.5718,
"positive_losses": 0.2207096517086029,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.2735855281352997,
"rewards/margins": 0.5197780132293701,
"rewards/margins_max": 1.003483772277832,
"rewards/margins_min": 0.1269286870956421,
"rewards/margins_std": 0.3979441523551941,
"rewards/rejected": -0.24619252979755402,
"step": 140
},
{
"dpo_losses": 0.5046078562736511,
"epoch": 0.42,
"grad_norm": 3.3011186957688583,
"learning_rate": 3.5831246207606597e-06,
"logits/chosen": -2.6959190368652344,
"logits/rejected": -2.658679962158203,
"logps/chosen": -264.2646179199219,
"logps/rejected": -234.5491180419922,
"loss": 0.5366,
"positive_losses": 0.490040123462677,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.24420371651649475,
"rewards/margins": 0.45886701345443726,
"rewards/margins_max": 0.8680801391601562,
"rewards/margins_min": 0.1154303103685379,
"rewards/margins_std": 0.34930768609046936,
"rewards/rejected": -0.2146632969379425,
"step": 150
},
{
"dpo_losses": 0.48088502883911133,
"epoch": 0.45,
"grad_norm": 2.135658014816511,
"learning_rate": 3.3563355539546795e-06,
"logits/chosen": -2.665548801422119,
"logits/rejected": -2.6138901710510254,
"logps/chosen": -274.263427734375,
"logps/rejected": -260.50518798828125,
"loss": 0.5724,
"positive_losses": 0.9731669425964355,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.262834370136261,
"rewards/margins": 0.5239533185958862,
"rewards/margins_max": 0.9884392023086548,
"rewards/margins_min": 0.15575796365737915,
"rewards/margins_std": 0.3754872977733612,
"rewards/rejected": -0.26111894845962524,
"step": 160
},
{
"dpo_losses": 0.4504636824131012,
"epoch": 0.48,
"grad_norm": 3.940043763048366,
"learning_rate": 3.121247763262235e-06,
"logits/chosen": -2.708754777908325,
"logits/rejected": -2.657917022705078,
"logps/chosen": -297.7489013671875,
"logps/rejected": -327.0563049316406,
"loss": 0.4813,
"positive_losses": 0.03098602220416069,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.3187271058559418,
"rewards/margins": 0.6266334652900696,
"rewards/margins_max": 1.0517089366912842,
"rewards/margins_min": 0.17500966787338257,
"rewards/margins_std": 0.3909396231174469,
"rewards/rejected": -0.3079063296318054,
"step": 170
},
{
"dpo_losses": 0.4588772654533386,
"epoch": 0.51,
"grad_norm": 8.823245159881209,
"learning_rate": 2.8801394778833475e-06,
"logits/chosen": -2.6968963146209717,
"logits/rejected": -2.6140356063842773,
"logps/chosen": -305.4325866699219,
"logps/rejected": -326.99798583984375,
"loss": 0.5468,
"positive_losses": 0.8232825994491577,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.2646820843219757,
"rewards/margins": 0.5928131937980652,
"rewards/margins_max": 1.0361554622650146,
"rewards/margins_min": 0.28750157356262207,
"rewards/margins_std": 0.33570224046707153,
"rewards/rejected": -0.32813113927841187,
"step": 180
},
{
"dpo_losses": 0.45539379119873047,
"epoch": 0.54,
"grad_norm": 3.517893013000186,
"learning_rate": 2.6353472714635443e-06,
"logits/chosen": -2.6537580490112305,
"logits/rejected": -2.5634191036224365,
"logps/chosen": -287.6109619140625,
"logps/rejected": -265.6959228515625,
"loss": 0.5435,
"positive_losses": 0.9886103868484497,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.34349915385246277,
"rewards/margins": 0.6255816221237183,
"rewards/margins_max": 1.1905597448349,
"rewards/margins_min": 0.168921560049057,
"rewards/margins_std": 0.453277051448822,
"rewards/rejected": -0.2820824980735779,
"step": 190
},
{
"dpo_losses": 0.44315657019615173,
"epoch": 0.56,
"grad_norm": 27.976402148032502,
"learning_rate": 2.3892434184240536e-06,
"logits/chosen": -2.7400636672973633,
"logits/rejected": -2.662397623062134,
"logps/chosen": -309.39691162109375,
"logps/rejected": -299.7530212402344,
"loss": 0.5625,
"positive_losses": 0.9616166353225708,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.30135902762413025,
"rewards/margins": 0.6429153084754944,
"rewards/margins_max": 1.131412148475647,
"rewards/margins_min": 0.17879006266593933,
"rewards/margins_std": 0.4260264039039612,
"rewards/rejected": -0.34155628085136414,
"step": 200
},
{
"epoch": 0.56,
"eval_dpo_losses": 0.6469283699989319,
"eval_logits/chosen": -2.678022623062134,
"eval_logits/rejected": -2.6380200386047363,
"eval_logps/chosen": -286.4236145019531,
"eval_logps/rejected": -272.83990478515625,
"eval_loss": 1.7185667753219604,
"eval_positive_losses": 9.667731285095215,
"eval_rewards/accuracies": 0.6420000195503235,
"eval_rewards/chosen": -0.018302178010344505,
"eval_rewards/margins": 0.1243075579404831,
"eval_rewards/margins_max": 0.6361650228500366,
"eval_rewards/margins_min": -0.3433184325695038,
"eval_rewards/margins_std": 0.32774004340171814,
"eval_rewards/rejected": -0.14260973036289215,
"eval_runtime": 428.2243,
"eval_samples_per_second": 4.67,
"eval_steps_per_second": 0.292,
"step": 200
},
{
"dpo_losses": 0.4354400634765625,
"epoch": 0.59,
"grad_norm": 23.522369776083625,
"learning_rate": 2.1442129043167877e-06,
"logits/chosen": -2.6434009075164795,
"logits/rejected": -2.6138339042663574,
"logps/chosen": -286.7272033691406,
"logps/rejected": -291.8896789550781,
"loss": 0.513,
"positive_losses": 0.665066123008728,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.33457106351852417,
"rewards/margins": 0.6892200708389282,
"rewards/margins_max": 1.2269551753997803,
"rewards/margins_min": 0.18099449574947357,
"rewards/margins_std": 0.46556130051612854,
"rewards/rejected": -0.35464900732040405,
"step": 210
},
{
"dpo_losses": 0.4387238025665283,
"epoch": 0.62,
"grad_norm": 11.92404423048434,
"learning_rate": 1.9026303129961049e-06,
"logits/chosen": -2.7612462043762207,
"logits/rejected": -2.664234161376953,
"logps/chosen": -319.7461853027344,
"logps/rejected": -306.0053405761719,
"loss": 0.5894,
"positive_losses": 1.1452913284301758,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.33710065484046936,
"rewards/margins": 0.6538791656494141,
"rewards/margins_max": 1.1509373188018799,
"rewards/margins_min": 0.19225715100765228,
"rewards/margins_std": 0.4403897225856781,
"rewards/rejected": -0.3167785704135895,
"step": 220
},
{
"dpo_losses": 0.44511428475379944,
"epoch": 0.65,
"grad_norm": 2.419282473127918,
"learning_rate": 1.66683681459314e-06,
"logits/chosen": -2.773876428604126,
"logits/rejected": -2.67607045173645,
"logps/chosen": -339.04718017578125,
"logps/rejected": -293.1225891113281,
"loss": 0.4763,
"positive_losses": 0.6133368611335754,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.32628515362739563,
"rewards/margins": 0.6365767121315002,
"rewards/margins_max": 1.125410795211792,
"rewards/margins_min": 0.21782192587852478,
"rewards/margins_std": 0.4051855504512787,
"rewards/rejected": -0.3102915287017822,
"step": 230
},
{
"dpo_losses": 0.4544529318809509,
"epoch": 0.68,
"grad_norm": 13.447116267552904,
"learning_rate": 1.4391174773015836e-06,
"logits/chosen": -2.7197587490081787,
"logits/rejected": -2.649749279022217,
"logps/chosen": -302.6105041503906,
"logps/rejected": -321.8402404785156,
"loss": 0.692,
"positive_losses": 2.48455810546875,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.22186538577079773,
"rewards/margins": 0.6085190773010254,
"rewards/margins_max": 1.1415433883666992,
"rewards/margins_min": 0.23370866477489471,
"rewards/margins_std": 0.41311854124069214,
"rewards/rejected": -0.38665369153022766,
"step": 240
},
{
"dpo_losses": 0.45861634612083435,
"epoch": 0.7,
"grad_norm": 5.111403689556549,
"learning_rate": 1.2216791228457778e-06,
"logits/chosen": -2.716823101043701,
"logits/rejected": -2.640800952911377,
"logps/chosen": -280.11114501953125,
"logps/rejected": -281.67138671875,
"loss": 0.4992,
"positive_losses": 0.6084854006767273,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.31169968843460083,
"rewards/margins": 0.6179708242416382,
"rewards/margins_max": 1.2185614109039307,
"rewards/margins_min": 0.1615341305732727,
"rewards/margins_std": 0.4740964472293854,
"rewards/rejected": -0.30627113580703735,
"step": 250
},
{
"dpo_losses": 0.4628082811832428,
"epoch": 0.73,
"grad_norm": 2.699692592075128,
"learning_rate": 1.0166289402331391e-06,
"logits/chosen": -2.7728962898254395,
"logits/rejected": -2.684753894805908,
"logps/chosen": -263.36126708984375,
"logps/rejected": -289.21661376953125,
"loss": 0.5624,
"positive_losses": 0.9304378628730774,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.28732261061668396,
"rewards/margins": 0.5901791453361511,
"rewards/margins_max": 1.102694034576416,
"rewards/margins_min": 0.17682021856307983,
"rewards/margins_std": 0.4229150712490082,
"rewards/rejected": -0.30285659432411194,
"step": 260
},
{
"dpo_losses": 0.4588424265384674,
"epoch": 0.76,
"grad_norm": 2.4735784513371377,
"learning_rate": 8.259540650444736e-07,
"logits/chosen": -2.717153787612915,
"logits/rejected": -2.662932872772217,
"logps/chosen": -278.75482177734375,
"logps/rejected": -291.56866455078125,
"loss": 0.5853,
"positive_losses": 0.9098857641220093,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.30360764265060425,
"rewards/margins": 0.5942984223365784,
"rewards/margins_max": 1.0322821140289307,
"rewards/margins_min": 0.21275146305561066,
"rewards/margins_std": 0.36198341846466064,
"rewards/rejected": -0.2906908392906189,
"step": 270
},
{
"dpo_losses": 0.4629085958003998,
"epoch": 0.79,
"grad_norm": 13.451546074592132,
"learning_rate": 6.515023221586722e-07,
"logits/chosen": -2.6962451934814453,
"logits/rejected": -2.6575076580047607,
"logps/chosen": -274.9664001464844,
"logps/rejected": -304.9722595214844,
"loss": 0.5625,
"positive_losses": 1.4465850591659546,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.2849060893058777,
"rewards/margins": 0.60865718126297,
"rewards/margins_max": 1.1329301595687866,
"rewards/margins_min": 0.1755952537059784,
"rewards/margins_std": 0.4414794445037842,
"rewards/rejected": -0.3237510919570923,
"step": 280
},
{
"dpo_losses": 0.47258663177490234,
"epoch": 0.82,
"grad_norm": 2.654477953260434,
"learning_rate": 4.949643185335288e-07,
"logits/chosen": -2.707307815551758,
"logits/rejected": -2.652792453765869,
"logps/chosen": -259.1030578613281,
"logps/rejected": -292.6324462890625,
"loss": 0.6149,
"positive_losses": 1.7202523946762085,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.27813172340393066,
"rewards/margins": 0.5642735958099365,
"rewards/margins_max": 1.0385398864746094,
"rewards/margins_min": 0.12702254951000214,
"rewards/margins_std": 0.4158683717250824,
"rewards/rejected": -0.28614187240600586,
"step": 290
},
{
"dpo_losses": 0.4324049949645996,
"epoch": 0.85,
"grad_norm": 11.591501845708454,
"learning_rate": 3.578570595810274e-07,
"logits/chosen": -2.7821717262268066,
"logits/rejected": -2.6995315551757812,
"logps/chosen": -309.7518310546875,
"logps/rejected": -320.70916748046875,
"loss": 0.4748,
"positive_losses": 0.8444260358810425,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3676120638847351,
"rewards/margins": 0.6803697943687439,
"rewards/margins_max": 1.199285864830017,
"rewards/margins_min": 0.21351738274097443,
"rewards/margins_std": 0.4415613114833832,
"rewards/rejected": -0.312757670879364,
"step": 300
},
{
"epoch": 0.85,
"eval_dpo_losses": 0.6448404788970947,
"eval_logits/chosen": -2.715327739715576,
"eval_logits/rejected": -2.6732916831970215,
"eval_logps/chosen": -284.4527587890625,
"eval_logps/rejected": -271.32244873046875,
"eval_loss": 1.6048117876052856,
"eval_positive_losses": 8.706162452697754,
"eval_rewards/accuracies": 0.6470000147819519,
"eval_rewards/chosen": 0.0014067561132833362,
"eval_rewards/margins": 0.12884218990802765,
"eval_rewards/margins_max": 0.6374967098236084,
"eval_rewards/margins_min": -0.34605804085731506,
"eval_rewards/margins_std": 0.3295030891895294,
"eval_rewards/rejected": -0.12743544578552246,
"eval_runtime": 428.2498,
"eval_samples_per_second": 4.67,
"eval_steps_per_second": 0.292,
"step": 300
},
{
"dpo_losses": 0.45941466093063354,
"epoch": 0.87,
"grad_norm": 2.6085680781835205,
"learning_rate": 2.4150924791035037e-07,
"logits/chosen": -2.774445056915283,
"logits/rejected": -2.673360824584961,
"logps/chosen": -267.74237060546875,
"logps/rejected": -243.88473510742188,
"loss": 0.5697,
"positive_losses": 1.3653801679611206,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.30073457956314087,
"rewards/margins": 0.5973426699638367,
"rewards/margins_max": 1.1060882806777954,
"rewards/margins_min": 0.18351522088050842,
"rewards/margins_std": 0.4086340069770813,
"rewards/rejected": -0.2966081500053406,
"step": 310
},
{
"dpo_losses": 0.45310109853744507,
"epoch": 0.9,
"grad_norm": 10.060071948421735,
"learning_rate": 1.4704840690808658e-07,
"logits/chosen": -2.738978385925293,
"logits/rejected": -2.680860757827759,
"logps/chosen": -279.5138854980469,
"logps/rejected": -293.9893493652344,
"loss": 0.5692,
"positive_losses": 1.6892318725585938,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.2875928282737732,
"rewards/margins": 0.6207860708236694,
"rewards/margins_max": 1.124011754989624,
"rewards/margins_min": 0.14557920396327972,
"rewards/margins_std": 0.44626301527023315,
"rewards/rejected": -0.33319321274757385,
"step": 320
},
{
"dpo_losses": 0.42673492431640625,
"epoch": 0.93,
"grad_norm": 9.476085880429812,
"learning_rate": 7.538995394063996e-08,
"logits/chosen": -2.8187005519866943,
"logits/rejected": -2.7311813831329346,
"logps/chosen": -318.88360595703125,
"logps/rejected": -302.66058349609375,
"loss": 0.5314,
"positive_losses": 0.5069873929023743,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.35436224937438965,
"rewards/margins": 0.7115713953971863,
"rewards/margins_max": 1.23550546169281,
"rewards/margins_min": 0.2139424830675125,
"rewards/margins_std": 0.4558965563774109,
"rewards/rejected": -0.35720914602279663,
"step": 330
},
{
"dpo_losses": 0.4437997341156006,
"epoch": 0.96,
"grad_norm": 2.682118994824555,
"learning_rate": 2.722832907015971e-08,
"logits/chosen": -2.6981847286224365,
"logits/rejected": -2.6440398693084717,
"logps/chosen": -266.6497802734375,
"logps/rejected": -282.98199462890625,
"loss": 0.5024,
"positive_losses": 0.9627658724784851,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.3319571018218994,
"rewards/margins": 0.6500804424285889,
"rewards/margins_max": 1.2494922876358032,
"rewards/margins_min": 0.25120097398757935,
"rewards/margins_std": 0.4507668614387512,
"rewards/rejected": -0.31812337040901184,
"step": 340
},
{
"dpo_losses": 0.4518283009529114,
"epoch": 0.99,
"grad_norm": 5.762126574549782,
"learning_rate": 3.030265255329623e-09,
"logits/chosen": -2.6820361614227295,
"logits/rejected": -2.6376953125,
"logps/chosen": -285.1527404785156,
"logps/rejected": -317.6675720214844,
"loss": 0.5059,
"positive_losses": 0.9290813207626343,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": 0.2980636656284332,
"rewards/margins": 0.6102195978164673,
"rewards/margins_max": 1.0686355829238892,
"rewards/margins_min": 0.20541608333587646,
"rewards/margins_std": 0.38572338223457336,
"rewards/rejected": -0.31215590238571167,
"step": 350
},
{
"epoch": 1.0,
"step": 355,
"total_flos": 0.0,
"train_loss": 0.5743894765074824,
"train_runtime": 4311.1014,
"train_samples_per_second": 1.317,
"train_steps_per_second": 0.082
}
],
"logging_steps": 10,
"max_steps": 355,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}