chchen's picture
End of training
d04f516 verified
{
"best_metric": 0.8434417247772217,
"best_model_checkpoint": "saves/Mistral-7B-Instruct-v0.3/lora/orpo-salt/checkpoint-1500",
"epoch": 2.9969690846635686,
"eval_steps": 500,
"global_step": 1854,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01616488179430188,
"grad_norm": 8.316998481750488,
"learning_rate": 4.999648198770648e-06,
"logits/chosen": -2.9437620639801025,
"logits/rejected": -2.991391658782959,
"logps/chosen": -1.0850014686584473,
"logps/rejected": -1.700299620628357,
"loss": 1.1424,
"odds_ratio_loss": 0.5736632347106934,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.10850014537572861,
"rewards/margins": 0.06152981519699097,
"rewards/rejected": -0.17002995312213898,
"sft_loss": 1.0850014686584473,
"step": 10
},
{
"epoch": 0.03232976358860376,
"grad_norm": 11.040986061096191,
"learning_rate": 4.998578646361359e-06,
"logits/chosen": -2.942950963973999,
"logits/rejected": -2.972404956817627,
"logps/chosen": -1.057308316230774,
"logps/rejected": -1.3991749286651611,
"loss": 1.1175,
"odds_ratio_loss": 0.6021074056625366,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.10573084652423859,
"rewards/margins": 0.03418666869401932,
"rewards/rejected": -0.1399175077676773,
"sft_loss": 1.057308316230774,
"step": 20
},
{
"epoch": 0.04849464538290564,
"grad_norm": 6.759947776794434,
"learning_rate": 4.996791614004449e-06,
"logits/chosen": -2.967661142349243,
"logits/rejected": -2.988191604614258,
"logps/chosen": -0.9923480749130249,
"logps/rejected": -1.4220160245895386,
"loss": 1.0537,
"odds_ratio_loss": 0.6132165789604187,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.09923480451107025,
"rewards/margins": 0.042966801673173904,
"rewards/rejected": -0.14220160245895386,
"sft_loss": 0.9923480749130249,
"step": 30
},
{
"epoch": 0.06465952717720752,
"grad_norm": 7.137161731719971,
"learning_rate": 4.994287614855618e-06,
"logits/chosen": -2.920475721359253,
"logits/rejected": -2.9866600036621094,
"logps/chosen": -1.0413509607315063,
"logps/rejected": -1.3548152446746826,
"loss": 1.1081,
"odds_ratio_loss": 0.6675835847854614,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.10413509607315063,
"rewards/margins": 0.03134642913937569,
"rewards/rejected": -0.13548150658607483,
"sft_loss": 1.0413509607315063,
"step": 40
},
{
"epoch": 0.0808244089715094,
"grad_norm": 5.560673236846924,
"learning_rate": 4.991067367951343e-06,
"logits/chosen": -3.0365490913391113,
"logits/rejected": -3.02650785446167,
"logps/chosen": -1.0141496658325195,
"logps/rejected": -1.311621904373169,
"loss": 1.0767,
"odds_ratio_loss": 0.6250823140144348,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.10141497850418091,
"rewards/margins": 0.02974722720682621,
"rewards/rejected": -0.13116219639778137,
"sft_loss": 1.0141496658325195,
"step": 50
},
{
"epoch": 0.09698929076581128,
"grad_norm": 2.7108840942382812,
"learning_rate": 4.987131798002389e-06,
"logits/chosen": -2.9634835720062256,
"logits/rejected": -2.984647274017334,
"logps/chosen": -0.9159129858016968,
"logps/rejected": -1.1770719289779663,
"loss": 0.9847,
"odds_ratio_loss": 0.6881905198097229,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.09159130603075027,
"rewards/margins": 0.026115888729691505,
"rewards/rejected": -0.11770719289779663,
"sft_loss": 0.9159129858016968,
"step": 60
},
{
"epoch": 0.11315417256011315,
"grad_norm": 10.522608757019043,
"learning_rate": 4.982482035128285e-06,
"logits/chosen": -2.909318208694458,
"logits/rejected": -2.938032388687134,
"logps/chosen": -0.9783811569213867,
"logps/rejected": -1.2935713529586792,
"loss": 1.0458,
"odds_ratio_loss": 0.6742582321166992,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.09783812612295151,
"rewards/margins": 0.03151901811361313,
"rewards/rejected": -0.12935714423656464,
"sft_loss": 0.9783811569213867,
"step": 70
},
{
"epoch": 0.12931905435441504,
"grad_norm": 5.515076160430908,
"learning_rate": 4.9771194145328e-06,
"logits/chosen": -2.9371273517608643,
"logits/rejected": -2.9489827156066895,
"logps/chosen": -0.8305400013923645,
"logps/rejected": -1.074385643005371,
"loss": 0.8918,
"odds_ratio_loss": 0.612562358379364,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08305400609970093,
"rewards/margins": 0.024384554475545883,
"rewards/rejected": -0.10743856430053711,
"sft_loss": 0.8305400013923645,
"step": 80
},
{
"epoch": 0.1454839361487169,
"grad_norm": 4.674871444702148,
"learning_rate": 4.971045476120532e-06,
"logits/chosen": -2.9331607818603516,
"logits/rejected": -2.950925350189209,
"logps/chosen": -0.828619122505188,
"logps/rejected": -1.0910576581954956,
"loss": 0.892,
"odds_ratio_loss": 0.6336237192153931,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08286191523075104,
"rewards/margins": 0.02624385617673397,
"rewards/rejected": -0.10910576581954956,
"sft_loss": 0.828619122505188,
"step": 90
},
{
"epoch": 0.1616488179430188,
"grad_norm": 1.4944851398468018,
"learning_rate": 4.964261964054713e-06,
"logits/chosen": -2.902466297149658,
"logits/rejected": -2.9229366779327393,
"logps/chosen": -0.8629521131515503,
"logps/rejected": -1.1251227855682373,
"loss": 0.9278,
"odds_ratio_loss": 0.6480029821395874,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.0862952247262001,
"rewards/margins": 0.02621707320213318,
"rewards/rejected": -0.11251229047775269,
"sft_loss": 0.8629521131515503,
"step": 100
},
{
"epoch": 0.17781369973732067,
"grad_norm": 2.9030275344848633,
"learning_rate": 4.956770826256372e-06,
"logits/chosen": -2.957075595855713,
"logits/rejected": -2.9636242389678955,
"logps/chosen": -0.8815444707870483,
"logps/rejected": -1.0787910223007202,
"loss": 0.9485,
"odds_ratio_loss": 0.6699932813644409,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08815445005893707,
"rewards/margins": 0.019724659621715546,
"rewards/rejected": -0.10787911713123322,
"sft_loss": 0.8815444707870483,
"step": 110
},
{
"epoch": 0.19397858153162256,
"grad_norm": 2.7190232276916504,
"learning_rate": 4.94857421384497e-06,
"logits/chosen": -2.9349093437194824,
"logits/rejected": -2.958707332611084,
"logps/chosen": -0.8882864117622375,
"logps/rejected": -1.1156352758407593,
"loss": 0.9568,
"odds_ratio_loss": 0.6850704550743103,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08882863819599152,
"rewards/margins": 0.022734878584742546,
"rewards/rejected": -0.11156351864337921,
"sft_loss": 0.8882864117622375,
"step": 120
},
{
"epoch": 0.21014346332592443,
"grad_norm": 2.794067859649658,
"learning_rate": 4.939674480520701e-06,
"logits/chosen": -2.9200732707977295,
"logits/rejected": -2.971010446548462,
"logps/chosen": -0.9047578573226929,
"logps/rejected": -1.1047561168670654,
"loss": 0.971,
"odds_ratio_loss": 0.6627197861671448,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.09047579020261765,
"rewards/margins": 0.019999820739030838,
"rewards/rejected": -0.11047561466693878,
"sft_loss": 0.9047578573226929,
"step": 130
},
{
"epoch": 0.2263083451202263,
"grad_norm": 2.3947913646698,
"learning_rate": 4.930074181888613e-06,
"logits/chosen": -2.9679551124572754,
"logits/rejected": -3.003051280975342,
"logps/chosen": -0.8538404703140259,
"logps/rejected": -1.0635920763015747,
"loss": 0.9144,
"odds_ratio_loss": 0.6060706377029419,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0853840559720993,
"rewards/margins": 0.020975153893232346,
"rewards/rejected": -0.10635919868946075,
"sft_loss": 0.8538404703140259,
"step": 140
},
{
"epoch": 0.2424732269145282,
"grad_norm": 1.2783300876617432,
"learning_rate": 4.91977607472475e-06,
"logits/chosen": -2.9910409450531006,
"logits/rejected": -3.0015668869018555,
"logps/chosen": -0.8571484684944153,
"logps/rejected": -1.0444114208221436,
"loss": 0.9202,
"odds_ratio_loss": 0.6310030221939087,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.08571484684944153,
"rewards/margins": 0.018726304173469543,
"rewards/rejected": -0.10444115102291107,
"sft_loss": 0.8571484684944153,
"step": 150
},
{
"epoch": 0.2586381087088301,
"grad_norm": 1.8696929216384888,
"learning_rate": 4.908783116184534e-06,
"logits/chosen": -2.924990177154541,
"logits/rejected": -2.9346060752868652,
"logps/chosen": -0.8140287399291992,
"logps/rejected": -1.0593068599700928,
"loss": 0.8729,
"odds_ratio_loss": 0.5890231132507324,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08140286803245544,
"rewards/margins": 0.024527812376618385,
"rewards/rejected": -0.10593068599700928,
"sft_loss": 0.8140287399291992,
"step": 160
},
{
"epoch": 0.27480299050313195,
"grad_norm": 1.7881205081939697,
"learning_rate": 4.897098462953598e-06,
"logits/chosen": -3.010953426361084,
"logits/rejected": -3.0108227729797363,
"logps/chosen": -0.8341928720474243,
"logps/rejected": -1.1377969980239868,
"loss": 0.8988,
"odds_ratio_loss": 0.6464797258377075,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08341928571462631,
"rewards/margins": 0.030360404402017593,
"rewards/rejected": -0.1137797012925148,
"sft_loss": 0.8341928720474243,
"step": 170
},
{
"epoch": 0.2909678722974338,
"grad_norm": 1.495703935623169,
"learning_rate": 4.884725470341331e-06,
"logits/chosen": -2.979276180267334,
"logits/rejected": -3.003962755203247,
"logps/chosen": -0.8357053995132446,
"logps/rejected": -1.0890777111053467,
"loss": 0.894,
"odds_ratio_loss": 0.5833606123924255,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.08357055485248566,
"rewards/margins": 0.025337230414152145,
"rewards/rejected": -0.1089077740907669,
"sft_loss": 0.8357053995132446,
"step": 180
},
{
"epoch": 0.3071327540917357,
"grad_norm": 12.648097038269043,
"learning_rate": 4.871667691317377e-06,
"logits/chosen": -3.0223159790039062,
"logits/rejected": -3.021965742111206,
"logps/chosen": -0.9438085556030273,
"logps/rejected": -0.9988954663276672,
"loss": 1.0229,
"odds_ratio_loss": 0.7914139628410339,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.09438085556030273,
"rewards/margins": 0.005508692469447851,
"rewards/rejected": -0.09988953918218613,
"sft_loss": 0.9438085556030273,
"step": 190
},
{
"epoch": 0.3232976358860376,
"grad_norm": 10.996000289916992,
"learning_rate": 4.857928875491392e-06,
"logits/chosen": -2.9932854175567627,
"logits/rejected": -2.9929189682006836,
"logps/chosen": -0.7825512886047363,
"logps/rejected": -0.9704290628433228,
"loss": 0.8476,
"odds_ratio_loss": 0.65040123462677,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07825513184070587,
"rewards/margins": 0.018787771463394165,
"rewards/rejected": -0.09704291075468063,
"sft_loss": 0.7825512886047363,
"step": 200
},
{
"epoch": 0.33946251768033947,
"grad_norm": 2.8774170875549316,
"learning_rate": 4.843512968036314e-06,
"logits/chosen": -2.942992925643921,
"logits/rejected": -2.9653749465942383,
"logps/chosen": -0.7782715559005737,
"logps/rejected": -0.9667309522628784,
"loss": 0.8401,
"odds_ratio_loss": 0.6183562874794006,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07782715559005737,
"rewards/margins": 0.018845947459340096,
"rewards/rejected": -0.09667309373617172,
"sft_loss": 0.7782715559005737,
"step": 210
},
{
"epoch": 0.35562739947464134,
"grad_norm": 1.0692508220672607,
"learning_rate": 4.828424108555486e-06,
"logits/chosen": -3.0436058044433594,
"logits/rejected": -3.0385215282440186,
"logps/chosen": -0.9878608584403992,
"logps/rejected": -1.2349365949630737,
"loss": 1.0531,
"odds_ratio_loss": 0.6525717377662659,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.09878608584403992,
"rewards/margins": 0.024707583710551262,
"rewards/rejected": -0.12349365651607513,
"sft_loss": 0.9878608584403992,
"step": 220
},
{
"epoch": 0.3717922812689432,
"grad_norm": 0.9835062026977539,
"learning_rate": 4.812666629893957e-06,
"logits/chosen": -3.0219979286193848,
"logits/rejected": -3.0635628700256348,
"logps/chosen": -0.8264273405075073,
"logps/rejected": -0.9492254257202148,
"loss": 0.8963,
"odds_ratio_loss": 0.6982277631759644,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08264274150133133,
"rewards/margins": 0.012279799208045006,
"rewards/rejected": -0.09492253512144089,
"sft_loss": 0.8264273405075073,
"step": 230
},
{
"epoch": 0.3879571630632451,
"grad_norm": 2.8942973613739014,
"learning_rate": 4.796245056894273e-06,
"logits/chosen": -2.9647586345672607,
"logits/rejected": -3.0110132694244385,
"logps/chosen": -0.8473427891731262,
"logps/rejected": -1.0194810628890991,
"loss": 0.9194,
"odds_ratio_loss": 0.7206528782844543,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.08473427593708038,
"rewards/margins": 0.01721382327377796,
"rewards/rejected": -0.10194810479879379,
"sft_loss": 0.8473427891731262,
"step": 240
},
{
"epoch": 0.404122044857547,
"grad_norm": 1.4256747961044312,
"learning_rate": 4.779164105097148e-06,
"logits/chosen": -3.019832134246826,
"logits/rejected": -3.0398221015930176,
"logps/chosen": -0.7970795631408691,
"logps/rejected": -1.0736204385757446,
"loss": 0.859,
"odds_ratio_loss": 0.6194061040878296,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07970795035362244,
"rewards/margins": 0.02765408717095852,
"rewards/rejected": -0.1073620468378067,
"sft_loss": 0.7970795631408691,
"step": 250
},
{
"epoch": 0.42028692665184886,
"grad_norm": 2.344681739807129,
"learning_rate": 4.761428679387373e-06,
"logits/chosen": -3.003972291946411,
"logits/rejected": -3.0536952018737793,
"logps/chosen": -0.8031284213066101,
"logps/rejected": -0.9748827815055847,
"loss": 0.8698,
"odds_ratio_loss": 0.666685163974762,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.0803128331899643,
"rewards/margins": 0.017175443470478058,
"rewards/rejected": -0.09748829156160355,
"sft_loss": 0.8031284213066101,
"step": 260
},
{
"epoch": 0.4364518084461507,
"grad_norm": 3.5396459102630615,
"learning_rate": 4.7430438725853515e-06,
"logits/chosen": -2.9764037132263184,
"logits/rejected": -3.0066990852355957,
"logps/chosen": -0.8188526034355164,
"logps/rejected": -1.1920969486236572,
"loss": 0.8795,
"odds_ratio_loss": 0.6065649390220642,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08188526332378387,
"rewards/margins": 0.03732443228363991,
"rewards/rejected": -0.11920969188213348,
"sft_loss": 0.8188526034355164,
"step": 270
},
{
"epoch": 0.4526166902404526,
"grad_norm": 2.1748006343841553,
"learning_rate": 4.724014963984669e-06,
"logits/chosen": -3.0459182262420654,
"logits/rejected": -3.0646400451660156,
"logps/chosen": -0.8359659910202026,
"logps/rejected": -1.090301513671875,
"loss": 0.9006,
"odds_ratio_loss": 0.6466442942619324,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.0835966020822525,
"rewards/margins": 0.02543354593217373,
"rewards/rejected": -0.10903014987707138,
"sft_loss": 0.8359659910202026,
"step": 280
},
{
"epoch": 0.4687815720347545,
"grad_norm": 4.898100852966309,
"learning_rate": 4.704347417836116e-06,
"logits/chosen": -3.0006985664367676,
"logits/rejected": -3.0678975582122803,
"logps/chosen": -0.7437621355056763,
"logps/rejected": -0.9980353116989136,
"loss": 0.8091,
"odds_ratio_loss": 0.6530498266220093,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07437621057033539,
"rewards/margins": 0.025427332147955894,
"rewards/rejected": -0.09980354458093643,
"sft_loss": 0.7437621355056763,
"step": 290
},
{
"epoch": 0.4849464538290564,
"grad_norm": 1.6706643104553223,
"learning_rate": 4.684046881778603e-06,
"logits/chosen": -3.0016372203826904,
"logits/rejected": -3.0139174461364746,
"logps/chosen": -0.7875638008117676,
"logps/rejected": -0.9269906282424927,
"loss": 0.8522,
"odds_ratio_loss": 0.6463567614555359,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07875639200210571,
"rewards/margins": 0.013942673802375793,
"rewards/rejected": -0.0926990658044815,
"sft_loss": 0.7875638008117676,
"step": 300
},
{
"epoch": 0.5011113356233583,
"grad_norm": 2.1470396518707275,
"learning_rate": 4.663119185217409e-06,
"logits/chosen": -3.0117690563201904,
"logits/rejected": -3.0633795261383057,
"logps/chosen": -0.779082715511322,
"logps/rejected": -1.0186303853988647,
"loss": 0.8398,
"odds_ratio_loss": 0.6070552468299866,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07790827751159668,
"rewards/margins": 0.023954764008522034,
"rewards/rejected": -0.10186304897069931,
"sft_loss": 0.779082715511322,
"step": 310
},
{
"epoch": 0.5172762174176602,
"grad_norm": 1.40939199924469,
"learning_rate": 4.641570337650232e-06,
"logits/chosen": -3.0548150539398193,
"logits/rejected": -3.0848946571350098,
"logps/chosen": -0.7415497303009033,
"logps/rejected": -0.982310950756073,
"loss": 0.8005,
"odds_ratio_loss": 0.589560866355896,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07415496557950974,
"rewards/margins": 0.024076132103800774,
"rewards/rejected": -0.09823110699653625,
"sft_loss": 0.7415497303009033,
"step": 320
},
{
"epoch": 0.533441099211962,
"grad_norm": 1.4589684009552002,
"learning_rate": 4.61940652694154e-06,
"logits/chosen": -2.9786767959594727,
"logits/rejected": -3.0444042682647705,
"logps/chosen": -0.8274497985839844,
"logps/rejected": -1.0422160625457764,
"loss": 0.8954,
"odds_ratio_loss": 0.6792970895767212,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08274497836828232,
"rewards/margins": 0.021476630121469498,
"rewards/rejected": -0.10422160476446152,
"sft_loss": 0.8274497985839844,
"step": 330
},
{
"epoch": 0.5496059810062639,
"grad_norm": 1.7346688508987427,
"learning_rate": 4.596634117545689e-06,
"logits/chosen": -3.0816709995269775,
"logits/rejected": -3.0875000953674316,
"logps/chosen": -0.8030570149421692,
"logps/rejected": -1.0277900695800781,
"loss": 0.8675,
"odds_ratio_loss": 0.6442909836769104,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08030570298433304,
"rewards/margins": 0.022473318502306938,
"rewards/rejected": -0.10277901589870453,
"sft_loss": 0.8030570149421692,
"step": 340
},
{
"epoch": 0.5657708628005658,
"grad_norm": 1.8591201305389404,
"learning_rate": 4.573259648679335e-06,
"logits/chosen": -3.0649001598358154,
"logits/rejected": -3.029323101043701,
"logps/chosen": -0.7972906231880188,
"logps/rejected": -1.030458688735962,
"loss": 0.8615,
"odds_ratio_loss": 0.6421998739242554,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07972906529903412,
"rewards/margins": 0.02331680618226528,
"rewards/rejected": -0.10304586589336395,
"sft_loss": 0.7972906231880188,
"step": 350
},
{
"epoch": 0.5819357445948676,
"grad_norm": 6.8644022941589355,
"learning_rate": 4.549289832443663e-06,
"logits/chosen": -3.0592093467712402,
"logits/rejected": -3.0790326595306396,
"logps/chosen": -0.7847403287887573,
"logps/rejected": -1.0331987142562866,
"loss": 0.8515,
"odds_ratio_loss": 0.6680801510810852,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.0784740298986435,
"rewards/margins": 0.024845842272043228,
"rewards/rejected": -0.10331986844539642,
"sft_loss": 0.7847403287887573,
"step": 360
},
{
"epoch": 0.5981006263891695,
"grad_norm": 2.649265766143799,
"learning_rate": 4.524731551896978e-06,
"logits/chosen": -3.0282368659973145,
"logits/rejected": -3.0504040718078613,
"logps/chosen": -0.7369459271430969,
"logps/rejected": -0.8965708017349243,
"loss": 0.801,
"odds_ratio_loss": 0.6401507258415222,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07369460165500641,
"rewards/margins": 0.015962477773427963,
"rewards/rejected": -0.08965708315372467,
"sft_loss": 0.7369459271430969,
"step": 370
},
{
"epoch": 0.6142655081834714,
"grad_norm": 3.4245967864990234,
"learning_rate": 4.4995918590781925e-06,
"logits/chosen": -3.061760425567627,
"logits/rejected": -3.0728158950805664,
"logps/chosen": -0.7757102251052856,
"logps/rejected": -0.9465000033378601,
"loss": 0.8422,
"odds_ratio_loss": 0.6651015281677246,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.07757101953029633,
"rewards/margins": 0.017078977078199387,
"rewards/rejected": -0.09464999288320541,
"sft_loss": 0.7757102251052856,
"step": 380
},
{
"epoch": 0.6304303899777733,
"grad_norm": 1.3795065879821777,
"learning_rate": 4.473877972981797e-06,
"logits/chosen": -3.0188069343566895,
"logits/rejected": -3.024099588394165,
"logps/chosen": -0.7883812189102173,
"logps/rejected": -1.0172218084335327,
"loss": 0.8495,
"odds_ratio_loss": 0.6112133860588074,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07883813232183456,
"rewards/margins": 0.02288406528532505,
"rewards/rejected": -0.10172219574451447,
"sft_loss": 0.7883812189102173,
"step": 390
},
{
"epoch": 0.6465952717720752,
"grad_norm": 1.795720100402832,
"learning_rate": 4.447597277484894e-06,
"logits/chosen": -2.9778666496276855,
"logits/rejected": -3.0176868438720703,
"logps/chosen": -0.743531346321106,
"logps/rejected": -0.9195922017097473,
"loss": 0.8054,
"odds_ratio_loss": 0.6191025972366333,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07435314357280731,
"rewards/margins": 0.017606090754270554,
"rewards/rejected": -0.09195923060178757,
"sft_loss": 0.743531346321106,
"step": 400
},
{
"epoch": 0.6627601535663771,
"grad_norm": 1.847349762916565,
"learning_rate": 4.42075731922687e-06,
"logits/chosen": -3.0738515853881836,
"logits/rejected": -3.080390453338623,
"logps/chosen": -0.8688371777534485,
"logps/rejected": -1.0517194271087646,
"loss": 0.9327,
"odds_ratio_loss": 0.6381778120994568,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08688371628522873,
"rewards/margins": 0.01828821375966072,
"rewards/rejected": -0.10517191886901855,
"sft_loss": 0.8688371777534485,
"step": 410
},
{
"epoch": 0.6789250353606789,
"grad_norm": 5.425457954406738,
"learning_rate": 4.3933658054423465e-06,
"logits/chosen": -3.0373263359069824,
"logits/rejected": -3.0470852851867676,
"logps/chosen": -0.7733573913574219,
"logps/rejected": -1.0374972820281982,
"loss": 0.8332,
"odds_ratio_loss": 0.5986987352371216,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07733573764562607,
"rewards/margins": 0.026413992047309875,
"rewards/rejected": -0.10374973714351654,
"sft_loss": 0.7733573913574219,
"step": 420
},
{
"epoch": 0.6950899171549808,
"grad_norm": 2.5281944274902344,
"learning_rate": 4.365430601748003e-06,
"logits/chosen": -3.036982536315918,
"logits/rejected": -3.0820653438568115,
"logps/chosen": -0.8373786807060242,
"logps/rejected": -0.9543370008468628,
"loss": 0.9048,
"odds_ratio_loss": 0.673850953578949,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.08373787254095078,
"rewards/margins": 0.011695821769535542,
"rewards/rejected": -0.09543369710445404,
"sft_loss": 0.8373786807060242,
"step": 430
},
{
"epoch": 0.7112547989492827,
"grad_norm": 6.225044250488281,
"learning_rate": 4.336959729883925e-06,
"logits/chosen": -3.0365397930145264,
"logits/rejected": -3.063159942626953,
"logps/chosen": -0.7677688598632812,
"logps/rejected": -0.8747943043708801,
"loss": 0.8394,
"odds_ratio_loss": 0.7163954973220825,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.0767768919467926,
"rewards/margins": 0.010702535510063171,
"rewards/rejected": -0.08747942745685577,
"sft_loss": 0.7677688598632812,
"step": 440
},
{
"epoch": 0.7274196807435845,
"grad_norm": 4.238840103149414,
"learning_rate": 4.307961365410118e-06,
"logits/chosen": -3.031554698944092,
"logits/rejected": -3.0537819862365723,
"logps/chosen": -0.7852329015731812,
"logps/rejected": -0.9492766261100769,
"loss": 0.8479,
"odds_ratio_loss": 0.6268799901008606,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07852329313755035,
"rewards/margins": 0.01640438288450241,
"rewards/rejected": -0.09492767602205276,
"sft_loss": 0.7852329015731812,
"step": 450
},
{
"epoch": 0.7435845625378864,
"grad_norm": 3.3165500164031982,
"learning_rate": 4.278443835358854e-06,
"logits/chosen": -3.0518264770507812,
"logits/rejected": -3.045757293701172,
"logps/chosen": -0.7719421982765198,
"logps/rejected": -1.0236573219299316,
"loss": 0.831,
"odds_ratio_loss": 0.5902360081672668,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0771942138671875,
"rewards/margins": 0.025171533226966858,
"rewards/rejected": -0.10236574709415436,
"sft_loss": 0.7719421982765198,
"step": 460
},
{
"epoch": 0.7597494443321883,
"grad_norm": 2.330195426940918,
"learning_rate": 4.248415615843523e-06,
"logits/chosen": -3.079732656478882,
"logits/rejected": -3.0858983993530273,
"logps/chosen": -0.7835872769355774,
"logps/rejected": -0.9195672273635864,
"loss": 0.8504,
"odds_ratio_loss": 0.6679321527481079,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0783587247133255,
"rewards/margins": 0.01359798014163971,
"rewards/rejected": -0.0919567197561264,
"sft_loss": 0.7835872769355774,
"step": 470
},
{
"epoch": 0.7759143261264903,
"grad_norm": 6.524634838104248,
"learning_rate": 4.217885329624666e-06,
"logits/chosen": -3.0687060356140137,
"logits/rejected": -3.066584348678589,
"logps/chosen": -0.7517032623291016,
"logps/rejected": -0.9625965356826782,
"loss": 0.8127,
"odds_ratio_loss": 0.6098276376724243,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07517032325267792,
"rewards/margins": 0.021089335903525352,
"rewards/rejected": -0.09625966101884842,
"sft_loss": 0.7517032623291016,
"step": 480
},
{
"epoch": 0.7920792079207921,
"grad_norm": 3.629946231842041,
"learning_rate": 4.186861743633911e-06,
"logits/chosen": -3.0519471168518066,
"logits/rejected": -3.0878560543060303,
"logps/chosen": -0.7691044807434082,
"logps/rejected": -1.0053441524505615,
"loss": 0.8348,
"odds_ratio_loss": 0.6568228006362915,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07691045105457306,
"rewards/margins": 0.023623958230018616,
"rewards/rejected": -0.10053440183401108,
"sft_loss": 0.7691044807434082,
"step": 490
},
{
"epoch": 0.808244089715094,
"grad_norm": 1.9105418920516968,
"learning_rate": 4.155353766456497e-06,
"logits/chosen": -3.1221230030059814,
"logits/rejected": -3.1099045276641846,
"logps/chosen": -0.8151519894599915,
"logps/rejected": -0.9320052862167358,
"loss": 0.8803,
"odds_ratio_loss": 0.6510958671569824,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08151519298553467,
"rewards/margins": 0.011685335077345371,
"rewards/rejected": -0.09320052713155746,
"sft_loss": 0.8151519894599915,
"step": 500
},
{
"epoch": 0.808244089715094,
"eval_logits/chosen": -3.052025556564331,
"eval_logits/rejected": -3.0746355056762695,
"eval_logps/chosen": -0.7961810827255249,
"eval_logps/rejected": -0.9834145307540894,
"eval_loss": 0.8619003891944885,
"eval_odds_ratio_loss": 0.6571925282478333,
"eval_rewards/accuracies": 0.5654545426368713,
"eval_rewards/chosen": -0.07961811125278473,
"eval_rewards/margins": 0.018723346292972565,
"eval_rewards/rejected": -0.0983414575457573,
"eval_runtime": 369.3504,
"eval_samples_per_second": 2.978,
"eval_sft_loss": 0.7961810827255249,
"eval_steps_per_second": 1.489,
"step": 500
},
{
"epoch": 0.8244089715093958,
"grad_norm": 2.1485707759857178,
"learning_rate": 4.123370445773134e-06,
"logits/chosen": -3.0945208072662354,
"logits/rejected": -3.1061959266662598,
"logps/chosen": -0.788953423500061,
"logps/rejected": -0.8694869875907898,
"loss": 0.8606,
"odds_ratio_loss": 0.7168292999267578,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07889534533023834,
"rewards/margins": 0.008053350262343884,
"rewards/rejected": -0.0869487002491951,
"sft_loss": 0.788953423500061,
"step": 510
},
{
"epoch": 0.8405738533036977,
"grad_norm": 5.462285041809082,
"learning_rate": 4.090920965761906e-06,
"logits/chosen": -3.0212631225585938,
"logits/rejected": -3.031066417694092,
"logps/chosen": -0.8095367550849915,
"logps/rejected": -0.968643844127655,
"loss": 0.8752,
"odds_ratio_loss": 0.656153678894043,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.0809536725282669,
"rewards/margins": 0.015910711139440536,
"rewards/rejected": -0.09686438739299774,
"sft_loss": 0.8095367550849915,
"step": 520
},
{
"epoch": 0.8567387350979996,
"grad_norm": 4.947661399841309,
"learning_rate": 4.058014644460991e-06,
"logits/chosen": -3.0334737300872803,
"logits/rejected": -3.049567937850952,
"logps/chosen": -0.756322979927063,
"logps/rejected": -0.8999664187431335,
"loss": 0.8179,
"odds_ratio_loss": 0.6152733564376831,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07563230395317078,
"rewards/margins": 0.014364344999194145,
"rewards/rejected": -0.08999665081501007,
"sft_loss": 0.756322979927063,
"step": 530
},
{
"epoch": 0.8729036168923014,
"grad_norm": 2.2244179248809814,
"learning_rate": 4.024660931092939e-06,
"logits/chosen": -3.0092921257019043,
"logits/rejected": -3.0213623046875,
"logps/chosen": -0.7882963418960571,
"logps/rejected": -0.9846014976501465,
"loss": 0.8497,
"odds_ratio_loss": 0.6144701838493347,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07882963120937347,
"rewards/margins": 0.019630510360002518,
"rewards/rejected": -0.0984601378440857,
"sft_loss": 0.7882963418960571,
"step": 540
},
{
"epoch": 0.8890684986866033,
"grad_norm": 1.6228642463684082,
"learning_rate": 3.990869403351272e-06,
"logits/chosen": -3.051035165786743,
"logits/rejected": -3.073690176010132,
"logps/chosen": -0.794242262840271,
"logps/rejected": -1.0482288599014282,
"loss": 0.8507,
"odds_ratio_loss": 0.564966082572937,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.07942423224449158,
"rewards/margins": 0.025398656725883484,
"rewards/rejected": -0.10482288897037506,
"sft_loss": 0.794242262840271,
"step": 550
},
{
"epoch": 0.9052333804809052,
"grad_norm": 3.156888008117676,
"learning_rate": 3.956649764650206e-06,
"logits/chosen": -3.1107125282287598,
"logits/rejected": -3.121273994445801,
"logps/chosen": -0.7970541715621948,
"logps/rejected": -0.9795141220092773,
"loss": 0.864,
"odds_ratio_loss": 0.6697722673416138,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.07970540225505829,
"rewards/margins": 0.018245995044708252,
"rewards/rejected": -0.09795141220092773,
"sft_loss": 0.7970541715621948,
"step": 560
},
{
"epoch": 0.9213982622752072,
"grad_norm": 4.841604232788086,
"learning_rate": 3.92201184133826e-06,
"logits/chosen": -3.082477331161499,
"logits/rejected": -3.0972495079040527,
"logps/chosen": -0.7547809481620789,
"logps/rejected": -0.9688172340393066,
"loss": 0.8159,
"odds_ratio_loss": 0.6109867095947266,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07547809928655624,
"rewards/margins": 0.021403620019555092,
"rewards/rejected": -0.09688171744346619,
"sft_loss": 0.7547809481620789,
"step": 570
},
{
"epoch": 0.937563144069509,
"grad_norm": 2.3844194412231445,
"learning_rate": 3.886965579876572e-06,
"logits/chosen": -3.114271640777588,
"logits/rejected": -3.1176934242248535,
"logps/chosen": -0.7549653053283691,
"logps/rejected": -0.858431339263916,
"loss": 0.8214,
"odds_ratio_loss": 0.664365828037262,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.07549652457237244,
"rewards/margins": 0.010346608236432076,
"rewards/rejected": -0.08584313094615936,
"sft_loss": 0.7549653053283691,
"step": 580
},
{
"epoch": 0.9537280258638109,
"grad_norm": 1.683606743812561,
"learning_rate": 3.851521043982716e-06,
"logits/chosen": -3.1139140129089355,
"logits/rejected": -3.134669780731201,
"logps/chosen": -0.793258011341095,
"logps/rejected": -0.9314570426940918,
"loss": 0.8603,
"odds_ratio_loss": 0.670341432094574,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07932581007480621,
"rewards/margins": 0.013819903135299683,
"rewards/rejected": -0.0931456983089447,
"sft_loss": 0.793258011341095,
"step": 590
},
{
"epoch": 0.9698929076581128,
"grad_norm": 6.090215682983398,
"learning_rate": 3.81568841174086e-06,
"logits/chosen": -3.0772290229797363,
"logits/rejected": -3.1182470321655273,
"logps/chosen": -0.7526463270187378,
"logps/rejected": -0.9828107953071594,
"loss": 0.8172,
"odds_ratio_loss": 0.6451513171195984,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07526463270187378,
"rewards/margins": 0.023016449064016342,
"rewards/rejected": -0.09828107804059982,
"sft_loss": 0.7526463270187378,
"step": 600
},
{
"epoch": 0.9860577894524146,
"grad_norm": 1.9422472715377808,
"learning_rate": 3.7794779726790664e-06,
"logits/chosen": -3.064382314682007,
"logits/rejected": -3.09804105758667,
"logps/chosen": -0.736395537853241,
"logps/rejected": -0.9441590309143066,
"loss": 0.798,
"odds_ratio_loss": 0.6159018278121948,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07363955676555634,
"rewards/margins": 0.02077634632587433,
"rewards/rejected": -0.09441590309143066,
"sft_loss": 0.736395537853241,
"step": 610
},
{
"epoch": 1.0022226712467166,
"grad_norm": 3.0195484161376953,
"learning_rate": 3.7429001248146096e-06,
"logits/chosen": -3.0517828464508057,
"logits/rejected": -3.0764060020446777,
"logps/chosen": -0.7535146474838257,
"logps/rejected": -0.9733519554138184,
"loss": 0.8118,
"odds_ratio_loss": 0.582946240901947,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07535146921873093,
"rewards/margins": 0.02198372781276703,
"rewards/rejected": -0.09733519703149796,
"sft_loss": 0.7535146474838257,
"step": 620
},
{
"epoch": 1.0183875530410185,
"grad_norm": 1.4307211637496948,
"learning_rate": 3.7059653716681227e-06,
"logits/chosen": -3.0821685791015625,
"logits/rejected": -3.0729098320007324,
"logps/chosen": -0.8158019185066223,
"logps/rejected": -1.0031102895736694,
"loss": 0.8821,
"odds_ratio_loss": 0.6627525091171265,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08158019185066223,
"rewards/margins": 0.01873084530234337,
"rewards/rejected": -0.1003110408782959,
"sft_loss": 0.8158019185066223,
"step": 630
},
{
"epoch": 1.0345524348353203,
"grad_norm": 1.925041675567627,
"learning_rate": 3.668684319247463e-06,
"logits/chosen": -3.11029052734375,
"logits/rejected": -3.133237600326538,
"logps/chosen": -0.7127649188041687,
"logps/rejected": -0.9635750651359558,
"loss": 0.7692,
"odds_ratio_loss": 0.5642341375350952,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07127650082111359,
"rewards/margins": 0.02508101798593998,
"rewards/rejected": -0.09635750949382782,
"sft_loss": 0.7127649188041687,
"step": 640
},
{
"epoch": 1.0507173166296222,
"grad_norm": 1.3039898872375488,
"learning_rate": 3.6310676730021373e-06,
"logits/chosen": -3.1486330032348633,
"logits/rejected": -3.1563305854797363,
"logps/chosen": -0.7767339944839478,
"logps/rejected": -0.9257136583328247,
"loss": 0.8402,
"odds_ratio_loss": 0.6346315145492554,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07767340540885925,
"rewards/margins": 0.014897963032126427,
"rewards/rejected": -0.09257137030363083,
"sft_loss": 0.7767339944839478,
"step": 650
},
{
"epoch": 1.066882198423924,
"grad_norm": 8.102341651916504,
"learning_rate": 3.593126234749178e-06,
"logits/chosen": -3.1005399227142334,
"logits/rejected": -3.1532058715820312,
"logps/chosen": -0.8182880282402039,
"logps/rejected": -0.9324063062667847,
"loss": 0.8843,
"odds_ratio_loss": 0.6604392528533936,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08182881772518158,
"rewards/margins": 0.011411817744374275,
"rewards/rejected": -0.0932406336069107,
"sft_loss": 0.8182880282402039,
"step": 660
},
{
"epoch": 1.083047080218226,
"grad_norm": 2.537015438079834,
"learning_rate": 3.554870899571343e-06,
"logits/chosen": -3.116610050201416,
"logits/rejected": -3.1441800594329834,
"logps/chosen": -0.7676142454147339,
"logps/rejected": -0.9440106153488159,
"loss": 0.8313,
"odds_ratio_loss": 0.6365249156951904,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07676141709089279,
"rewards/margins": 0.0176396407186985,
"rewards/rejected": -0.09440106153488159,
"sft_loss": 0.7676142454147339,
"step": 670
},
{
"epoch": 1.0992119620125278,
"grad_norm": 6.732462406158447,
"learning_rate": 3.5163126526885373e-06,
"logits/chosen": -3.077462673187256,
"logits/rejected": -3.1246635913848877,
"logps/chosen": -0.7416545152664185,
"logps/rejected": -0.9575145840644836,
"loss": 0.8024,
"odds_ratio_loss": 0.607850193977356,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.0741654485464096,
"rewards/margins": 0.02158600650727749,
"rewards/rejected": -0.09575144946575165,
"sft_loss": 0.7416545152664185,
"step": 680
},
{
"epoch": 1.1153768438068297,
"grad_norm": 1.995318055152893,
"learning_rate": 3.4774625663033484e-06,
"logits/chosen": -3.088719367980957,
"logits/rejected": -3.1156630516052246,
"logps/chosen": -0.7460827231407166,
"logps/rejected": -0.913016140460968,
"loss": 0.8092,
"odds_ratio_loss": 0.6309585571289062,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07460827380418777,
"rewards/margins": 0.01669333688914776,
"rewards/rejected": -0.09130160510540009,
"sft_loss": 0.7460827231407166,
"step": 690
},
{
"epoch": 1.1315417256011315,
"grad_norm": 1.8286596536636353,
"learning_rate": 3.4383317964216067e-06,
"logits/chosen": -3.0861454010009766,
"logits/rejected": -3.1318280696868896,
"logps/chosen": -0.7198506593704224,
"logps/rejected": -0.8674876093864441,
"loss": 0.7881,
"odds_ratio_loss": 0.6827356219291687,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07198506593704224,
"rewards/margins": 0.014763685874640942,
"rewards/rejected": -0.08674876391887665,
"sft_loss": 0.7198506593704224,
"step": 700
},
{
"epoch": 1.1477066073954334,
"grad_norm": 3.1164824962615967,
"learning_rate": 3.398931579648877e-06,
"logits/chosen": -3.1039557456970215,
"logits/rejected": -3.141571521759033,
"logps/chosen": -0.7915211915969849,
"logps/rejected": -1.108933687210083,
"loss": 0.8534,
"odds_ratio_loss": 0.6187322735786438,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07915211468935013,
"rewards/margins": 0.031741250306367874,
"rewards/rejected": -0.1108933687210083,
"sft_loss": 0.7915211915969849,
"step": 710
},
{
"epoch": 1.1638714891897353,
"grad_norm": 1.4664254188537598,
"learning_rate": 3.359273229963813e-06,
"logits/chosen": -3.1003873348236084,
"logits/rejected": -3.103529214859009,
"logps/chosen": -0.730610191822052,
"logps/rejected": -0.8717561960220337,
"loss": 0.7957,
"odds_ratio_loss": 0.6507007479667664,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0730610117316246,
"rewards/margins": 0.014114594087004662,
"rewards/rejected": -0.08717561513185501,
"sft_loss": 0.730610191822052,
"step": 720
},
{
"epoch": 1.1800363709840371,
"grad_norm": 1.3891639709472656,
"learning_rate": 3.319368135469285e-06,
"logits/chosen": -3.1091885566711426,
"logits/rejected": -3.150235652923584,
"logps/chosen": -0.7716542482376099,
"logps/rejected": -1.0244488716125488,
"loss": 0.837,
"odds_ratio_loss": 0.6535542011260986,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07716542482376099,
"rewards/margins": 0.025279458612203598,
"rewards/rejected": -0.10244487226009369,
"sft_loss": 0.7716542482376099,
"step": 730
},
{
"epoch": 1.196201252778339,
"grad_norm": 3.498404026031494,
"learning_rate": 3.279227755122228e-06,
"logits/chosen": -3.0913896560668945,
"logits/rejected": -3.114501476287842,
"logps/chosen": -0.7233365178108215,
"logps/rejected": -1.0984933376312256,
"loss": 0.778,
"odds_ratio_loss": 0.5463576912879944,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.07233365625143051,
"rewards/margins": 0.037515684962272644,
"rewards/rejected": -0.10984933376312256,
"sft_loss": 0.7233365178108215,
"step": 740
},
{
"epoch": 1.2123661345726409,
"grad_norm": 1.6691545248031616,
"learning_rate": 3.2388636154431417e-06,
"logits/chosen": -3.1425840854644775,
"logits/rejected": -3.175088405609131,
"logps/chosen": -0.7860497236251831,
"logps/rejected": -1.0689184665679932,
"loss": 0.846,
"odds_ratio_loss": 0.5996078252792358,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07860498130321503,
"rewards/margins": 0.028286874294281006,
"rewards/rejected": -0.10689185559749603,
"sft_loss": 0.7860497236251831,
"step": 750
},
{
"epoch": 1.2285310163669427,
"grad_norm": 1.6291383504867554,
"learning_rate": 3.198287307206192e-06,
"logits/chosen": -3.0734639167785645,
"logits/rejected": -3.0943312644958496,
"logps/chosen": -0.7356687784194946,
"logps/rejected": -0.9488394856452942,
"loss": 0.7952,
"odds_ratio_loss": 0.5951117277145386,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07356687635183334,
"rewards/margins": 0.021317074075341225,
"rewards/rejected": -0.09488394856452942,
"sft_loss": 0.7356687784194946,
"step": 760
},
{
"epoch": 1.2446958981612446,
"grad_norm": 1.6178934574127197,
"learning_rate": 3.157510482110856e-06,
"logits/chosen": -3.143188238143921,
"logits/rejected": -3.1431174278259277,
"logps/chosen": -0.7597110271453857,
"logps/rejected": -0.9578113555908203,
"loss": 0.8251,
"odds_ratio_loss": 0.6543140411376953,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0759711042046547,
"rewards/margins": 0.019810039550065994,
"rewards/rejected": -0.09578114002943039,
"sft_loss": 0.7597110271453857,
"step": 770
},
{
"epoch": 1.2608607799555465,
"grad_norm": 1.6571840047836304,
"learning_rate": 3.116544849436077e-06,
"logits/chosen": -3.084550142288208,
"logits/rejected": -3.089481830596924,
"logps/chosen": -0.804486095905304,
"logps/rejected": -1.1236674785614014,
"loss": 0.8661,
"odds_ratio_loss": 0.6160328388214111,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08044860512018204,
"rewards/margins": 0.03191814199090004,
"rewards/rejected": -0.11236675083637238,
"sft_loss": 0.804486095905304,
"step": 780
},
{
"epoch": 1.2770256617498483,
"grad_norm": 2.3182828426361084,
"learning_rate": 3.0754021726778848e-06,
"logits/chosen": -3.086887836456299,
"logits/rejected": -3.093517780303955,
"logps/chosen": -0.7223183512687683,
"logps/rejected": -1.0067367553710938,
"loss": 0.7785,
"odds_ratio_loss": 0.5618979334831238,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.07223184406757355,
"rewards/margins": 0.028441840782761574,
"rewards/rejected": -0.10067367553710938,
"sft_loss": 0.7223183512687683,
"step": 790
},
{
"epoch": 1.2931905435441502,
"grad_norm": 1.3017044067382812,
"learning_rate": 3.0340942661714463e-06,
"logits/chosen": -3.1351680755615234,
"logits/rejected": -3.134371280670166,
"logps/chosen": -0.796164333820343,
"logps/rejected": -0.9728119969367981,
"loss": 0.8595,
"odds_ratio_loss": 0.6329259276390076,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07961643487215042,
"rewards/margins": 0.017664765939116478,
"rewards/rejected": -0.09728120267391205,
"sft_loss": 0.796164333820343,
"step": 800
},
{
"epoch": 1.3093554253384523,
"grad_norm": 2.427229166030884,
"learning_rate": 2.992632991698512e-06,
"logits/chosen": -3.082369327545166,
"logits/rejected": -3.110403060913086,
"logps/chosen": -0.7753532528877258,
"logps/rejected": -0.9790387153625488,
"loss": 0.8377,
"odds_ratio_loss": 0.6236809492111206,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07753531634807587,
"rewards/margins": 0.0203685499727726,
"rewards/rejected": -0.09790387749671936,
"sft_loss": 0.7753532528877258,
"step": 810
},
{
"epoch": 1.3255203071327541,
"grad_norm": 1.9767731428146362,
"learning_rate": 2.9510302550812537e-06,
"logits/chosen": -3.089890480041504,
"logits/rejected": -3.138611078262329,
"logps/chosen": -0.6810489892959595,
"logps/rejected": -1.002256155014038,
"loss": 0.7382,
"odds_ratio_loss": 0.571026623249054,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.06810488551855087,
"rewards/margins": 0.032120734453201294,
"rewards/rejected": -0.10022562742233276,
"sft_loss": 0.6810489892959595,
"step": 820
},
{
"epoch": 1.341685188927056,
"grad_norm": 4.014182090759277,
"learning_rate": 2.9092980027634325e-06,
"logits/chosen": -3.095803737640381,
"logits/rejected": -3.1307053565979004,
"logps/chosen": -0.7113646268844604,
"logps/rejected": -0.9739853143692017,
"loss": 0.773,
"odds_ratio_loss": 0.6160944700241089,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.0711364597082138,
"rewards/margins": 0.02626206912100315,
"rewards/rejected": -0.0973985344171524,
"sft_loss": 0.7113646268844604,
"step": 830
},
{
"epoch": 1.3578500707213579,
"grad_norm": 2.0923309326171875,
"learning_rate": 2.867448218379927e-06,
"logits/chosen": -3.092233419418335,
"logits/rejected": -3.1379733085632324,
"logps/chosen": -0.8104713559150696,
"logps/rejected": -0.9551480412483215,
"loss": 0.8783,
"odds_ratio_loss": 0.6786799430847168,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.08104713261127472,
"rewards/margins": 0.014467673376202583,
"rewards/rejected": -0.09551481157541275,
"sft_loss": 0.8104713559150696,
"step": 840
},
{
"epoch": 1.3740149525156597,
"grad_norm": 5.772988319396973,
"learning_rate": 2.825492919315559e-06,
"logits/chosen": -3.1081454753875732,
"logits/rejected": -3.1527466773986816,
"logps/chosen": -0.8419367671012878,
"logps/rejected": -0.9567692875862122,
"loss": 0.9094,
"odds_ratio_loss": 0.6746194958686829,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08419367671012878,
"rewards/margins": 0.011483250185847282,
"rewards/rejected": -0.09567694365978241,
"sft_loss": 0.8419367671012878,
"step": 850
},
{
"epoch": 1.3901798343099616,
"grad_norm": 3.8201489448547363,
"learning_rate": 2.7834441532542482e-06,
"logits/chosen": -3.1463775634765625,
"logits/rejected": -3.1600661277770996,
"logps/chosen": -0.7513538599014282,
"logps/rejected": -0.964932918548584,
"loss": 0.8093,
"odds_ratio_loss": 0.579144299030304,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07513538748025894,
"rewards/margins": 0.021357912570238113,
"rewards/rejected": -0.09649328887462616,
"sft_loss": 0.7513538599014282,
"step": 860
},
{
"epoch": 1.4063447161042635,
"grad_norm": 2.0883562564849854,
"learning_rate": 2.74131399471945e-06,
"logits/chosen": -3.1237385272979736,
"logits/rejected": -3.1525537967681885,
"logps/chosen": -0.7669543027877808,
"logps/rejected": -0.9222270250320435,
"loss": 0.8335,
"odds_ratio_loss": 0.6657834053039551,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07669542729854584,
"rewards/margins": 0.0155272725969553,
"rewards/rejected": -0.09222270548343658,
"sft_loss": 0.7669543027877808,
"step": 870
},
{
"epoch": 1.4225095978985653,
"grad_norm": 2.7775161266326904,
"learning_rate": 2.6991145416068947e-06,
"logits/chosen": -3.078185796737671,
"logits/rejected": -3.1287853717803955,
"logps/chosen": -0.7748882174491882,
"logps/rejected": -0.8965023159980774,
"loss": 0.8403,
"odds_ratio_loss": 0.6543062925338745,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07748880982398987,
"rewards/margins": 0.012161416001617908,
"rewards/rejected": -0.0896502360701561,
"sft_loss": 0.7748882174491882,
"step": 880
},
{
"epoch": 1.4386744796928672,
"grad_norm": 1.0541467666625977,
"learning_rate": 2.6568579117106143e-06,
"logits/chosen": -3.077782392501831,
"logits/rejected": -3.0923101902008057,
"logps/chosen": -0.7235719561576843,
"logps/rejected": -0.9604678153991699,
"loss": 0.7832,
"odds_ratio_loss": 0.5962098240852356,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07235720008611679,
"rewards/margins": 0.02368958666920662,
"rewards/rejected": -0.09604678303003311,
"sft_loss": 0.7235719561576843,
"step": 890
},
{
"epoch": 1.454839361487169,
"grad_norm": 1.1944150924682617,
"learning_rate": 2.6145562392432544e-06,
"logits/chosen": -3.139462947845459,
"logits/rejected": -3.1433637142181396,
"logps/chosen": -0.7596802115440369,
"logps/rejected": -0.9312537312507629,
"loss": 0.8259,
"odds_ratio_loss": 0.6623841524124146,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07596802711486816,
"rewards/margins": 0.01715734973549843,
"rewards/rejected": -0.09312538057565689,
"sft_loss": 0.7596802115440369,
"step": 900
},
{
"epoch": 1.471004243281471,
"grad_norm": 3.4694266319274902,
"learning_rate": 2.5722216713516682e-06,
"logits/chosen": -3.0813755989074707,
"logits/rejected": -3.128629207611084,
"logps/chosen": -0.7107754945755005,
"logps/rejected": -0.9305357933044434,
"loss": 0.7723,
"odds_ratio_loss": 0.6156551837921143,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07107754051685333,
"rewards/margins": 0.021976038813591003,
"rewards/rejected": -0.09305357933044434,
"sft_loss": 0.7107754945755005,
"step": 910
},
{
"epoch": 1.4871691250757728,
"grad_norm": 4.237662315368652,
"learning_rate": 2.5298663646288064e-06,
"logits/chosen": -3.1253597736358643,
"logits/rejected": -3.1492881774902344,
"logps/chosen": -0.7484380602836609,
"logps/rejected": -0.9847918748855591,
"loss": 0.8097,
"odds_ratio_loss": 0.6130812168121338,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.07484380900859833,
"rewards/margins": 0.023635383695364,
"rewards/rejected": -0.09847918897867203,
"sft_loss": 0.7484380602836609,
"step": 920
},
{
"epoch": 1.503334006870075,
"grad_norm": 2.1491661071777344,
"learning_rate": 2.487502481622879e-06,
"logits/chosen": -3.0927679538726807,
"logits/rejected": -3.110395908355713,
"logps/chosen": -0.7894285321235657,
"logps/rejected": -0.9456027150154114,
"loss": 0.8507,
"odds_ratio_loss": 0.6129187345504761,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.07894285023212433,
"rewards/margins": 0.01561742089688778,
"rewards/rejected": -0.09456028044223785,
"sft_loss": 0.7894285321235657,
"step": 930
},
{
"epoch": 1.5194988886643768,
"grad_norm": 1.948228359222412,
"learning_rate": 2.4451421873448253e-06,
"logits/chosen": -3.0993218421936035,
"logits/rejected": -3.1382908821105957,
"logps/chosen": -0.7644907832145691,
"logps/rejected": -0.9289671778678894,
"loss": 0.8309,
"odds_ratio_loss": 0.6645576357841492,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07644907385110855,
"rewards/margins": 0.016447637230157852,
"rewards/rejected": -0.0928967148065567,
"sft_loss": 0.7644907832145691,
"step": 940
},
{
"epoch": 1.5356637704586786,
"grad_norm": 5.180652141571045,
"learning_rate": 2.40279764577506e-06,
"logits/chosen": -3.1537530422210693,
"logits/rejected": -3.1876587867736816,
"logps/chosen": -0.7837322354316711,
"logps/rejected": -0.9118620157241821,
"loss": 0.8488,
"odds_ratio_loss": 0.6507243514060974,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07837323099374771,
"rewards/margins": 0.012812974862754345,
"rewards/rejected": -0.09118620306253433,
"sft_loss": 0.7837322354316711,
"step": 950
},
{
"epoch": 1.5518286522529805,
"grad_norm": 3.1721439361572266,
"learning_rate": 2.3604810163705242e-06,
"logits/chosen": -3.117772102355957,
"logits/rejected": -3.1582953929901123,
"logps/chosen": -0.6980777978897095,
"logps/rejected": -0.9280182719230652,
"loss": 0.756,
"odds_ratio_loss": 0.5792473554611206,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.06980777531862259,
"rewards/margins": 0.02299405261874199,
"rewards/rejected": -0.09280182421207428,
"sft_loss": 0.6980777978897095,
"step": 960
},
{
"epoch": 1.5679935340472824,
"grad_norm": 2.930034637451172,
"learning_rate": 2.3182044505730364e-06,
"logits/chosen": -3.102292537689209,
"logits/rejected": -3.1203739643096924,
"logps/chosen": -0.6645774245262146,
"logps/rejected": -0.8846859931945801,
"loss": 0.7224,
"odds_ratio_loss": 0.5782453417778015,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.06645774096250534,
"rewards/margins": 0.02201084978878498,
"rewards/rejected": -0.08846859633922577,
"sft_loss": 0.6645774245262146,
"step": 970
},
{
"epoch": 1.5841584158415842,
"grad_norm": 4.4542083740234375,
"learning_rate": 2.275980088319941e-06,
"logits/chosen": -3.1215856075286865,
"logits/rejected": -3.1170594692230225,
"logps/chosen": -0.718813955783844,
"logps/rejected": -0.8550910949707031,
"loss": 0.7869,
"odds_ratio_loss": 0.6806875467300415,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07188138365745544,
"rewards/margins": 0.013627724722027779,
"rewards/rejected": -0.08550911396741867,
"sft_loss": 0.718813955783844,
"step": 980
},
{
"epoch": 1.600323297635886,
"grad_norm": 3.0541982650756836,
"learning_rate": 2.2338200545580577e-06,
"logits/chosen": -3.0836963653564453,
"logits/rejected": -3.1206369400024414,
"logps/chosen": -0.710333526134491,
"logps/rejected": -0.9507579803466797,
"loss": 0.7727,
"odds_ratio_loss": 0.623470664024353,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.07103335857391357,
"rewards/margins": 0.02404244802892208,
"rewards/rejected": -0.09507580101490021,
"sft_loss": 0.710333526134491,
"step": 990
},
{
"epoch": 1.616488179430188,
"grad_norm": 1.4722503423690796,
"learning_rate": 2.191736455761947e-06,
"logits/chosen": -3.127290964126587,
"logits/rejected": -3.1501638889312744,
"logps/chosen": -0.6768070459365845,
"logps/rejected": -0.8489478826522827,
"loss": 0.7341,
"odds_ratio_loss": 0.5731813311576843,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.06768069416284561,
"rewards/margins": 0.017214089632034302,
"rewards/rejected": -0.08489479124546051,
"sft_loss": 0.6768070459365845,
"step": 1000
},
{
"epoch": 1.616488179430188,
"eval_logits/chosen": -3.095952272415161,
"eval_logits/rejected": -3.1193559169769287,
"eval_logps/chosen": -0.7794692516326904,
"eval_logps/rejected": -0.9804279208183289,
"eval_loss": 0.8449718356132507,
"eval_odds_ratio_loss": 0.655025839805603,
"eval_rewards/accuracies": 0.5672727227210999,
"eval_rewards/chosen": -0.07794692367315292,
"eval_rewards/margins": 0.020095879212021828,
"eval_rewards/rejected": -0.098042793571949,
"eval_runtime": 367.195,
"eval_samples_per_second": 2.996,
"eval_sft_loss": 0.7794692516326904,
"eval_steps_per_second": 1.498,
"step": 1000
},
{
"epoch": 1.6326530612244898,
"grad_norm": 2.0673484802246094,
"learning_rate": 2.1497413764574673e-06,
"logits/chosen": -3.119420051574707,
"logits/rejected": -3.1228458881378174,
"logps/chosen": -0.7603039145469666,
"logps/rejected": -1.017571210861206,
"loss": 0.8184,
"odds_ratio_loss": 0.5807704329490662,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07603039592504501,
"rewards/margins": 0.025726735591888428,
"rewards/rejected": -0.10175712406635284,
"sft_loss": 0.7603039145469666,
"step": 1010
},
{
"epoch": 1.6488179430187917,
"grad_norm": 2.212749719619751,
"learning_rate": 2.1078468757516395e-06,
"logits/chosen": -3.0987765789031982,
"logits/rejected": -3.1337482929229736,
"logps/chosen": -0.7031766772270203,
"logps/rejected": -0.9084986448287964,
"loss": 0.762,
"odds_ratio_loss": 0.5880716443061829,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07031767070293427,
"rewards/margins": 0.020532192662358284,
"rewards/rejected": -0.0908498615026474,
"sft_loss": 0.7031766772270203,
"step": 1020
},
{
"epoch": 1.6649828248130936,
"grad_norm": 1.7040510177612305,
"learning_rate": 2.0660649838698145e-06,
"logits/chosen": -3.1251206398010254,
"logits/rejected": -3.140993595123291,
"logps/chosen": -0.7490108013153076,
"logps/rejected": -0.9883764982223511,
"loss": 0.8102,
"odds_ratio_loss": 0.6117704510688782,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07490108907222748,
"rewards/margins": 0.023936569690704346,
"rewards/rejected": -0.09883765131235123,
"sft_loss": 0.7490108013153076,
"step": 1030
},
{
"epoch": 1.6811477066073954,
"grad_norm": 2.5308990478515625,
"learning_rate": 2.0244076987011284e-06,
"logits/chosen": -3.156313419342041,
"logits/rejected": -3.149254560470581,
"logps/chosen": -0.7889419794082642,
"logps/rejected": -0.9795150756835938,
"loss": 0.8507,
"odds_ratio_loss": 0.6177965998649597,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07889419794082642,
"rewards/margins": 0.019057301804423332,
"rewards/rejected": -0.0979515090584755,
"sft_loss": 0.7889419794082642,
"step": 1040
},
{
"epoch": 1.6973125884016973,
"grad_norm": 1.713526725769043,
"learning_rate": 1.982886982353251e-06,
"logits/chosen": -3.1534323692321777,
"logits/rejected": -3.143846035003662,
"logps/chosen": -0.7670282125473022,
"logps/rejected": -1.0469207763671875,
"loss": 0.8306,
"odds_ratio_loss": 0.635661244392395,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07670283317565918,
"rewards/margins": 0.027989249676465988,
"rewards/rejected": -0.10469207912683487,
"sft_loss": 0.7670282125473022,
"step": 1050
},
{
"epoch": 1.7134774701959992,
"grad_norm": 2.0344197750091553,
"learning_rate": 1.941514757717392e-06,
"logits/chosen": -3.1048598289489746,
"logits/rejected": -3.1271190643310547,
"logps/chosen": -0.7640289068222046,
"logps/rejected": -1.0232574939727783,
"loss": 0.8194,
"odds_ratio_loss": 0.5533130764961243,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07640289515256882,
"rewards/margins": 0.025922849774360657,
"rewards/rejected": -0.10232573747634888,
"sft_loss": 0.7640289068222046,
"step": 1060
},
{
"epoch": 1.729642351990301,
"grad_norm": 4.370427131652832,
"learning_rate": 1.9003029050445953e-06,
"logits/chosen": -3.0902037620544434,
"logits/rejected": -3.1260688304901123,
"logps/chosen": -0.7717964053153992,
"logps/rejected": -0.9319893717765808,
"loss": 0.8345,
"odds_ratio_loss": 0.6267774701118469,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07717963308095932,
"rewards/margins": 0.016019299626350403,
"rewards/rejected": -0.09319894015789032,
"sft_loss": 0.7717964053153992,
"step": 1070
},
{
"epoch": 1.745807233784603,
"grad_norm": 2.206437110900879,
"learning_rate": 1.8592632585342523e-06,
"logits/chosen": -3.108377456665039,
"logits/rejected": -3.137510299682617,
"logps/chosen": -0.730410635471344,
"logps/rejected": -0.9709598422050476,
"loss": 0.7903,
"odds_ratio_loss": 0.599312424659729,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07304105907678604,
"rewards/margins": 0.02405492775142193,
"rewards/rejected": -0.09709598869085312,
"sft_loss": 0.730410635471344,
"step": 1080
},
{
"epoch": 1.7619721155789048,
"grad_norm": 3.54589581489563,
"learning_rate": 1.8184076029358527e-06,
"logits/chosen": -3.0873734951019287,
"logits/rejected": -3.0916082859039307,
"logps/chosen": -0.7298410534858704,
"logps/rejected": -0.8651703000068665,
"loss": 0.797,
"odds_ratio_loss": 0.6711704134941101,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07298411428928375,
"rewards/margins": 0.01353292353451252,
"rewards/rejected": -0.08651703596115112,
"sft_loss": 0.7298410534858704,
"step": 1090
},
{
"epoch": 1.7781369973732066,
"grad_norm": 1.9253672361373901,
"learning_rate": 1.7777476701649318e-06,
"logits/chosen": -3.0890607833862305,
"logits/rejected": -3.1127114295959473,
"logps/chosen": -0.7377376556396484,
"logps/rejected": -0.9456714391708374,
"loss": 0.7993,
"odds_ratio_loss": 0.6157868504524231,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07377376407384872,
"rewards/margins": 0.02079339325428009,
"rewards/rejected": -0.09456716477870941,
"sft_loss": 0.7377376556396484,
"step": 1100
},
{
"epoch": 1.7943018791675085,
"grad_norm": 2.634758949279785,
"learning_rate": 1.7372951359341925e-06,
"logits/chosen": -3.1294898986816406,
"logits/rejected": -3.1275429725646973,
"logps/chosen": -0.7182799577713013,
"logps/rejected": -0.897103488445282,
"loss": 0.7819,
"odds_ratio_loss": 0.6362147331237793,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.07182799279689789,
"rewards/margins": 0.01788235269486904,
"rewards/rejected": -0.08971034735441208,
"sft_loss": 0.7182799577713013,
"step": 1110
},
{
"epoch": 1.8104667609618104,
"grad_norm": 4.11432409286499,
"learning_rate": 1.6970616164007547e-06,
"logits/chosen": -3.109255075454712,
"logits/rejected": -3.1157066822052,
"logps/chosen": -0.7095004320144653,
"logps/rejected": -0.9157026410102844,
"loss": 0.7723,
"odds_ratio_loss": 0.6284032464027405,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07095004618167877,
"rewards/margins": 0.020620223134756088,
"rewards/rejected": -0.09157026559114456,
"sft_loss": 0.7095004320144653,
"step": 1120
},
{
"epoch": 1.8266316427561122,
"grad_norm": 3.5484671592712402,
"learning_rate": 1.6570586648305276e-06,
"logits/chosen": -3.1144962310791016,
"logits/rejected": -3.162351608276367,
"logps/chosen": -0.7635061144828796,
"logps/rejected": -0.9983874559402466,
"loss": 0.8279,
"odds_ratio_loss": 0.6435292959213257,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.0763506144285202,
"rewards/margins": 0.023488130420446396,
"rewards/rejected": -0.0998387485742569,
"sft_loss": 0.7635061144828796,
"step": 1130
},
{
"epoch": 1.842796524550414,
"grad_norm": 2.315484046936035,
"learning_rate": 1.6172977682806151e-06,
"logits/chosen": -3.1250674724578857,
"logits/rejected": -3.169023036956787,
"logps/chosen": -0.7526008486747742,
"logps/rejected": -0.9915586709976196,
"loss": 0.8118,
"odds_ratio_loss": 0.5918548703193665,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07526008039712906,
"rewards/margins": 0.023895783349871635,
"rewards/rejected": -0.09915586560964584,
"sft_loss": 0.7526008486747742,
"step": 1140
},
{
"epoch": 1.858961406344716,
"grad_norm": 1.8489255905151367,
"learning_rate": 1.5777903443007586e-06,
"logits/chosen": -3.0853919982910156,
"logits/rejected": -3.067697286605835,
"logps/chosen": -0.761858344078064,
"logps/rejected": -1.018090844154358,
"loss": 0.8238,
"odds_ratio_loss": 0.6194515824317932,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07618583738803864,
"rewards/margins": 0.02562323771417141,
"rewards/rejected": -0.1018090695142746,
"sft_loss": 0.761858344078064,
"step": 1150
},
{
"epoch": 1.8751262881390178,
"grad_norm": 3.776698589324951,
"learning_rate": 1.5385477376547226e-06,
"logits/chosen": -3.117295742034912,
"logits/rejected": -3.1292173862457275,
"logps/chosen": -0.7485045194625854,
"logps/rejected": -0.9388057589530945,
"loss": 0.8087,
"odds_ratio_loss": 0.6023274064064026,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07485045492649078,
"rewards/margins": 0.019030118361115456,
"rewards/rejected": -0.09388057142496109,
"sft_loss": 0.7485045194625854,
"step": 1160
},
{
"epoch": 1.89129116993332,
"grad_norm": 2.622969627380371,
"learning_rate": 1.4995812170625845e-06,
"logits/chosen": -3.100135564804077,
"logits/rejected": -3.120415210723877,
"logps/chosen": -0.7464675307273865,
"logps/rejected": -1.0535166263580322,
"loss": 0.8049,
"odds_ratio_loss": 0.5842532515525818,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07464675605297089,
"rewards/margins": 0.030704926699399948,
"rewards/rejected": -0.10535167157649994,
"sft_loss": 0.7464675307273865,
"step": 1170
},
{
"epoch": 1.9074560517276218,
"grad_norm": 2.950418710708618,
"learning_rate": 1.4609019719648666e-06,
"logits/chosen": -3.1377501487731934,
"logits/rejected": -3.1489570140838623,
"logps/chosen": -0.7555149793624878,
"logps/rejected": -0.9784708023071289,
"loss": 0.8149,
"odds_ratio_loss": 0.5935994386672974,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07555149495601654,
"rewards/margins": 0.022295573726296425,
"rewards/rejected": -0.09784707427024841,
"sft_loss": 0.7555149793624878,
"step": 1180
},
{
"epoch": 1.9236209335219236,
"grad_norm": 3.5587480068206787,
"learning_rate": 1.42252110930943e-06,
"logits/chosen": -3.077787399291992,
"logits/rejected": -3.097930908203125,
"logps/chosen": -0.6736657023429871,
"logps/rejected": -0.9208394289016724,
"loss": 0.7314,
"odds_ratio_loss": 0.5776128768920898,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0673665776848793,
"rewards/margins": 0.024717364460229874,
"rewards/rejected": -0.09208394587039948,
"sft_loss": 0.6736657023429871,
"step": 1190
},
{
"epoch": 1.9397858153162255,
"grad_norm": 4.154015064239502,
"learning_rate": 1.3844496503620493e-06,
"logits/chosen": -3.0993504524230957,
"logits/rejected": -3.152251720428467,
"logps/chosen": -0.781159520149231,
"logps/rejected": -0.947624683380127,
"loss": 0.8436,
"odds_ratio_loss": 0.6244069933891296,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.07811595499515533,
"rewards/margins": 0.01664651185274124,
"rewards/rejected": -0.09476246684789658,
"sft_loss": 0.781159520149231,
"step": 1200
},
{
"epoch": 1.9559506971105274,
"grad_norm": 3.2580513954162598,
"learning_rate": 1.3466985275416081e-06,
"logits/chosen": -3.074446439743042,
"logits/rejected": -3.0826351642608643,
"logps/chosen": -0.8331464529037476,
"logps/rejected": -0.9720133543014526,
"loss": 0.9006,
"odds_ratio_loss": 0.6748316287994385,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08331465721130371,
"rewards/margins": 0.013886680826544762,
"rewards/rejected": -0.09720133244991302,
"sft_loss": 0.8331464529037476,
"step": 1210
},
{
"epoch": 1.9721155789048292,
"grad_norm": 3.6283507347106934,
"learning_rate": 1.309278581280791e-06,
"logits/chosen": -3.1137681007385254,
"logits/rejected": -3.1031088829040527,
"logps/chosen": -0.6959558129310608,
"logps/rejected": -0.9509153366088867,
"loss": 0.7562,
"odds_ratio_loss": 0.602826714515686,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0695955827832222,
"rewards/margins": 0.025495965033769608,
"rewards/rejected": -0.09509153664112091,
"sft_loss": 0.6959558129310608,
"step": 1220
},
{
"epoch": 1.9882804606991311,
"grad_norm": 1.468146800994873,
"learning_rate": 1.272200556913199e-06,
"logits/chosen": -3.116267204284668,
"logits/rejected": -3.1380016803741455,
"logps/chosen": -0.7585796117782593,
"logps/rejected": -0.9659263491630554,
"loss": 0.827,
"odds_ratio_loss": 0.6845245361328125,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07585795968770981,
"rewards/margins": 0.020734664052724838,
"rewards/rejected": -0.09659262746572495,
"sft_loss": 0.7585796117782593,
"step": 1230
},
{
"epoch": 2.004445342493433,
"grad_norm": 2.9131782054901123,
"learning_rate": 1.2354751015877698e-06,
"logits/chosen": -3.0867080688476562,
"logits/rejected": -3.120702028274536,
"logps/chosen": -0.7115014791488647,
"logps/rejected": -0.9811463356018066,
"loss": 0.7717,
"odds_ratio_loss": 0.6017391681671143,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07115015387535095,
"rewards/margins": 0.02696448564529419,
"rewards/rejected": -0.09811463207006454,
"sft_loss": 0.7115014791488647,
"step": 1240
},
{
"epoch": 2.020610224287735,
"grad_norm": 5.614047527313232,
"learning_rate": 1.1991127612113945e-06,
"logits/chosen": -3.103969097137451,
"logits/rejected": -3.1521923542022705,
"logps/chosen": -0.748454749584198,
"logps/rejected": -0.9711889028549194,
"loss": 0.8062,
"odds_ratio_loss": 0.5778619050979614,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07484547793865204,
"rewards/margins": 0.022273404523730278,
"rewards/rejected": -0.09711887687444687,
"sft_loss": 0.748454749584198,
"step": 1250
},
{
"epoch": 2.036775106082037,
"grad_norm": 3.2708330154418945,
"learning_rate": 1.1631239774206035e-06,
"logits/chosen": -3.087298631668091,
"logits/rejected": -3.0905699729919434,
"logps/chosen": -0.7345898747444153,
"logps/rejected": -0.9632331132888794,
"loss": 0.7998,
"odds_ratio_loss": 0.6524921655654907,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07345898449420929,
"rewards/margins": 0.022864321246743202,
"rewards/rejected": -0.09632330387830734,
"sft_loss": 0.7345898747444153,
"step": 1260
},
{
"epoch": 2.052939987876339,
"grad_norm": 5.292441368103027,
"learning_rate": 1.1275190845831978e-06,
"logits/chosen": -3.094177007675171,
"logits/rejected": -3.105118989944458,
"logps/chosen": -0.6928293704986572,
"logps/rejected": -0.9615745544433594,
"loss": 0.7462,
"odds_ratio_loss": 0.5332867503166199,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.06928294897079468,
"rewards/margins": 0.026874512434005737,
"rewards/rejected": -0.09615744650363922,
"sft_loss": 0.6928293704986572,
"step": 1270
},
{
"epoch": 2.0691048696706407,
"grad_norm": 2.8853020668029785,
"learning_rate": 1.0923083068306778e-06,
"logits/chosen": -3.1118180751800537,
"logits/rejected": -3.0961227416992188,
"logps/chosen": -0.739953339099884,
"logps/rejected": -1.0457074642181396,
"loss": 0.7958,
"odds_ratio_loss": 0.5585684776306152,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07399533689022064,
"rewards/margins": 0.03057541325688362,
"rewards/rejected": -0.10457074642181396,
"sft_loss": 0.739953339099884,
"step": 1280
},
{
"epoch": 2.0852697514649425,
"grad_norm": 2.046693801879883,
"learning_rate": 1.0575017551223348e-06,
"logits/chosen": -3.0827836990356445,
"logits/rejected": -3.099609851837158,
"logps/chosen": -0.6882905960083008,
"logps/rejected": -0.9119550585746765,
"loss": 0.7477,
"odds_ratio_loss": 0.594234824180603,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.06882907450199127,
"rewards/margins": 0.02236645109951496,
"rewards/rejected": -0.09119551628828049,
"sft_loss": 0.6882905960083008,
"step": 1290
},
{
"epoch": 2.1014346332592444,
"grad_norm": 3.569880723953247,
"learning_rate": 1.023109424341833e-06,
"logits/chosen": -3.091522693634033,
"logits/rejected": -3.1377675533294678,
"logps/chosen": -0.7397576570510864,
"logps/rejected": -0.9756298065185547,
"loss": 0.8002,
"odds_ratio_loss": 0.6045941114425659,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07397577166557312,
"rewards/margins": 0.023587211966514587,
"rewards/rejected": -0.09756298363208771,
"sft_loss": 0.7397576570510864,
"step": 1300
},
{
"epoch": 2.1175995150535463,
"grad_norm": 2.92287278175354,
"learning_rate": 9.891411904271273e-07,
"logits/chosen": -3.091432571411133,
"logits/rejected": -3.097942352294922,
"logps/chosen": -0.7274054884910583,
"logps/rejected": -0.9433088302612305,
"loss": 0.7877,
"odds_ratio_loss": 0.602813720703125,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07274055480957031,
"rewards/margins": 0.021590325981378555,
"rewards/rejected": -0.09433087706565857,
"sft_loss": 0.7274054884910583,
"step": 1310
},
{
"epoch": 2.133764396847848,
"grad_norm": 1.6639999151229858,
"learning_rate": 9.556068075345363e-07,
"logits/chosen": -3.1349058151245117,
"logits/rejected": -3.1192307472229004,
"logps/chosen": -0.7446939945220947,
"logps/rejected": -0.9326783418655396,
"loss": 0.8047,
"odds_ratio_loss": 0.5995923280715942,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07446939498186111,
"rewards/margins": 0.018798431381583214,
"rewards/rejected": -0.09326782822608948,
"sft_loss": 0.7446939945220947,
"step": 1320
},
{
"epoch": 2.14992927864215,
"grad_norm": 1.6199049949645996,
"learning_rate": 9.225159052377838e-07,
"logits/chosen": -3.085836887359619,
"logits/rejected": -3.1293978691101074,
"logps/chosen": -0.7635077238082886,
"logps/rejected": -1.0566623210906982,
"loss": 0.8227,
"odds_ratio_loss": 0.591996431350708,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07635077089071274,
"rewards/margins": 0.029315466061234474,
"rewards/rejected": -0.10566625744104385,
"sft_loss": 0.7635077238082886,
"step": 1330
},
{
"epoch": 2.166094160436452,
"grad_norm": 14.56561279296875,
"learning_rate": 8.898779857628184e-07,
"logits/chosen": -3.0713887214660645,
"logits/rejected": -3.125135898590088,
"logps/chosen": -0.6671522259712219,
"logps/rejected": -0.8694232702255249,
"loss": 0.7265,
"odds_ratio_loss": 0.5937641859054565,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.06671522557735443,
"rewards/margins": 0.02022710070014,
"rewards/rejected": -0.08694233000278473,
"sft_loss": 0.6671522259712219,
"step": 1340
},
{
"epoch": 2.1822590422307537,
"grad_norm": 2.1917502880096436,
"learning_rate": 8.577024212591975e-07,
"logits/chosen": -3.152409791946411,
"logits/rejected": -3.142133951187134,
"logps/chosen": -0.7823044657707214,
"logps/rejected": -0.9282905459403992,
"loss": 0.8458,
"odds_ratio_loss": 0.6353241205215454,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07823045551776886,
"rewards/margins": 0.0145986033603549,
"rewards/rejected": -0.09282905608415604,
"sft_loss": 0.7823044657707214,
"step": 1350
},
{
"epoch": 2.1984239240250556,
"grad_norm": 2.377164363861084,
"learning_rate": 8.259984511088276e-07,
"logits/chosen": -3.0746350288391113,
"logits/rejected": -3.107931613922119,
"logps/chosen": -0.7670485973358154,
"logps/rejected": -0.9791353344917297,
"loss": 0.8321,
"odds_ratio_loss": 0.6501604914665222,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07670485973358154,
"rewards/margins": 0.02120867185294628,
"rewards/rejected": -0.09791352599859238,
"sft_loss": 0.7670485973358154,
"step": 1360
},
{
"epoch": 2.2145888058193575,
"grad_norm": 1.4493186473846436,
"learning_rate": 7.947751792728237e-07,
"logits/chosen": -3.1149420738220215,
"logits/rejected": -3.106590986251831,
"logps/chosen": -0.736428439617157,
"logps/rejected": -1.0205104351043701,
"loss": 0.7973,
"odds_ratio_loss": 0.6086055040359497,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07364284247159958,
"rewards/margins": 0.02840820886194706,
"rewards/rejected": -0.10205105692148209,
"sft_loss": 0.736428439617157,
"step": 1370
},
{
"epoch": 2.2307536876136593,
"grad_norm": 2.596027374267578,
"learning_rate": 7.640415716772626e-07,
"logits/chosen": -3.1113359928131104,
"logits/rejected": -3.1498453617095947,
"logps/chosen": -0.7384335398674011,
"logps/rejected": -1.0052040815353394,
"loss": 0.7991,
"odds_ratio_loss": 0.606296718120575,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.07384335994720459,
"rewards/margins": 0.026677042245864868,
"rewards/rejected": -0.10052040964365005,
"sft_loss": 0.7384335398674011,
"step": 1380
},
{
"epoch": 2.246918569407961,
"grad_norm": 1.8183890581130981,
"learning_rate": 7.338064536385722e-07,
"logits/chosen": -3.1014134883880615,
"logits/rejected": -3.1173253059387207,
"logps/chosen": -0.7216871380805969,
"logps/rejected": -1.0171915292739868,
"loss": 0.7805,
"odds_ratio_loss": 0.5884080529212952,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.07216870784759521,
"rewards/margins": 0.029550448060035706,
"rewards/rejected": -0.10171915590763092,
"sft_loss": 0.7216871380805969,
"step": 1390
},
{
"epoch": 2.263083451202263,
"grad_norm": 2.7193105220794678,
"learning_rate": 7.040785073292883e-07,
"logits/chosen": -3.052278995513916,
"logits/rejected": -3.0792136192321777,
"logps/chosen": -0.8029114603996277,
"logps/rejected": -1.0175930261611938,
"loss": 0.8702,
"odds_ratio_loss": 0.6730188727378845,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08029113709926605,
"rewards/margins": 0.02146816998720169,
"rewards/rejected": -0.10175931453704834,
"sft_loss": 0.8029114603996277,
"step": 1400
},
{
"epoch": 2.279248332996565,
"grad_norm": 1.6374540328979492,
"learning_rate": 6.748662692849297e-07,
"logits/chosen": -3.0864005088806152,
"logits/rejected": -3.1004090309143066,
"logps/chosen": -0.6923743486404419,
"logps/rejected": -1.0528560876846313,
"loss": 0.747,
"odds_ratio_loss": 0.5459417104721069,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.06923744082450867,
"rewards/margins": 0.03604819253087044,
"rewards/rejected": -0.10528562217950821,
"sft_loss": 0.6923743486404419,
"step": 1410
},
{
"epoch": 2.295413214790867,
"grad_norm": 5.552002429962158,
"learning_rate": 6.46178127952686e-07,
"logits/chosen": -3.1168243885040283,
"logits/rejected": -3.132902145385742,
"logps/chosen": -0.7120259404182434,
"logps/rejected": -0.9511427879333496,
"loss": 0.7669,
"odds_ratio_loss": 0.5487939119338989,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.0712025836110115,
"rewards/margins": 0.023911695927381516,
"rewards/rejected": -0.09511429071426392,
"sft_loss": 0.7120259404182434,
"step": 1420
},
{
"epoch": 2.3115780965851687,
"grad_norm": 6.720409870147705,
"learning_rate": 6.180223212826289e-07,
"logits/chosen": -3.1055123805999756,
"logits/rejected": -3.0998032093048096,
"logps/chosen": -0.7326000928878784,
"logps/rejected": -0.947357177734375,
"loss": 0.7933,
"odds_ratio_loss": 0.6070013046264648,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07326001673936844,
"rewards/margins": 0.021475713700056076,
"rewards/rejected": -0.09473572671413422,
"sft_loss": 0.7326000928878784,
"step": 1430
},
{
"epoch": 2.3277429783794705,
"grad_norm": 1.7139315605163574,
"learning_rate": 5.904069343621443e-07,
"logits/chosen": -3.117607593536377,
"logits/rejected": -3.1124439239501953,
"logps/chosen": -0.7263907194137573,
"logps/rejected": -0.9684870839118958,
"loss": 0.7844,
"odds_ratio_loss": 0.5803811550140381,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07263907045125961,
"rewards/margins": 0.024209631606936455,
"rewards/rejected": -0.09684871137142181,
"sft_loss": 0.7263907194137573,
"step": 1440
},
{
"epoch": 2.3439078601737724,
"grad_norm": 4.18460750579834,
"learning_rate": 5.633398970942544e-07,
"logits/chosen": -3.0699353218078613,
"logits/rejected": -3.1113662719726562,
"logps/chosen": -0.71888267993927,
"logps/rejected": -0.9213592410087585,
"loss": 0.7809,
"odds_ratio_loss": 0.6200512647628784,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.0718882754445076,
"rewards/margins": 0.020247649401426315,
"rewards/rejected": -0.09213592857122421,
"sft_loss": 0.71888267993927,
"step": 1450
},
{
"epoch": 2.3600727419680743,
"grad_norm": 2.875581741333008,
"learning_rate": 5.368289819205069e-07,
"logits/chosen": -3.1074984073638916,
"logits/rejected": -3.116929054260254,
"logps/chosen": -0.675830602645874,
"logps/rejected": -0.9225249290466309,
"loss": 0.7373,
"odds_ratio_loss": 0.6144708395004272,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.06758306175470352,
"rewards/margins": 0.02466944232583046,
"rewards/rejected": -0.09225250035524368,
"sft_loss": 0.675830602645874,
"step": 1460
},
{
"epoch": 2.376237623762376,
"grad_norm": 3.366608142852783,
"learning_rate": 5.108818015890785e-07,
"logits/chosen": -3.1295876502990723,
"logits/rejected": -3.1546216011047363,
"logps/chosen": -0.8030446171760559,
"logps/rejected": -0.9700073003768921,
"loss": 0.8661,
"odds_ratio_loss": 0.6306995153427124,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08030445873737335,
"rewards/margins": 0.016696274280548096,
"rewards/rejected": -0.09700073301792145,
"sft_loss": 0.8030446171760559,
"step": 1470
},
{
"epoch": 2.392402505556678,
"grad_norm": 2.7780675888061523,
"learning_rate": 4.855058069687291e-07,
"logits/chosen": -3.067026376724243,
"logits/rejected": -3.0960917472839355,
"logps/chosen": -0.7043627500534058,
"logps/rejected": -0.9909790754318237,
"loss": 0.7594,
"odds_ratio_loss": 0.5507315397262573,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07043628394603729,
"rewards/margins": 0.028661629185080528,
"rewards/rejected": -0.09909790754318237,
"sft_loss": 0.7043627500534058,
"step": 1480
},
{
"epoch": 2.40856738735098,
"grad_norm": 3.2751307487487793,
"learning_rate": 4.607082849092523e-07,
"logits/chosen": -3.099853277206421,
"logits/rejected": -3.1020543575286865,
"logps/chosen": -0.7886861562728882,
"logps/rejected": -0.976923942565918,
"loss": 0.8511,
"odds_ratio_loss": 0.6243327856063843,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07886861264705658,
"rewards/margins": 0.018823768943548203,
"rewards/rejected": -0.09769239276647568,
"sft_loss": 0.7886861562728882,
"step": 1490
},
{
"epoch": 2.4247322691452817,
"grad_norm": 3.08627986907959,
"learning_rate": 4.3649635614901405e-07,
"logits/chosen": -3.0676140785217285,
"logits/rejected": -3.144824504852295,
"logps/chosen": -0.704692006111145,
"logps/rejected": -0.8520969152450562,
"loss": 0.7691,
"odds_ratio_loss": 0.6437775492668152,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0704691931605339,
"rewards/margins": 0.014740494079887867,
"rewards/rejected": -0.0852096900343895,
"sft_loss": 0.704692006111145,
"step": 1500
},
{
"epoch": 2.4247322691452817,
"eval_logits/chosen": -3.083430051803589,
"eval_logits/rejected": -3.107269287109375,
"eval_logps/chosen": -0.777300238609314,
"eval_logps/rejected": -0.9876723289489746,
"eval_loss": 0.8434417247772217,
"eval_odds_ratio_loss": 0.6614136099815369,
"eval_rewards/accuracies": 0.5690909028053284,
"eval_rewards/chosen": -0.07773003727197647,
"eval_rewards/margins": 0.021037202328443527,
"eval_rewards/rejected": -0.0987672358751297,
"eval_runtime": 371.5479,
"eval_samples_per_second": 2.961,
"eval_sft_loss": 0.777300238609314,
"eval_steps_per_second": 1.48,
"step": 1500
},
{
"epoch": 2.4408971509395836,
"grad_norm": 2.1558029651641846,
"learning_rate": 4.128769732701973e-07,
"logits/chosen": -3.0588672161102295,
"logits/rejected": -3.053375720977783,
"logps/chosen": -0.7354205250740051,
"logps/rejected": -0.9381022453308105,
"loss": 0.7959,
"odds_ratio_loss": 0.6045327186584473,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0735420510172844,
"rewards/margins": 0.02026817761361599,
"rewards/rejected": -0.09381023049354553,
"sft_loss": 0.7354205250740051,
"step": 1510
},
{
"epoch": 2.4570620327338855,
"grad_norm": 3.434962749481201,
"learning_rate": 3.8985691870233046e-07,
"logits/chosen": -3.1096482276916504,
"logits/rejected": -3.1095433235168457,
"logps/chosen": -0.7352281212806702,
"logps/rejected": -0.9943790435791016,
"loss": 0.7949,
"odds_ratio_loss": 0.596770703792572,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.07352282106876373,
"rewards/margins": 0.025915095582604408,
"rewards/rejected": -0.0994379073381424,
"sft_loss": 0.7352281212806702,
"step": 1520
},
{
"epoch": 2.4732269145281873,
"grad_norm": 2.586085557937622,
"learning_rate": 3.6744280277467904e-07,
"logits/chosen": -3.1208150386810303,
"logits/rejected": -3.1456665992736816,
"logps/chosen": -0.7365155816078186,
"logps/rejected": -0.9590933918952942,
"loss": 0.7993,
"odds_ratio_loss": 0.6281741857528687,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07365155220031738,
"rewards/margins": 0.022257793694734573,
"rewards/rejected": -0.09590934216976166,
"sft_loss": 0.7365155816078186,
"step": 1530
},
{
"epoch": 2.489391796322489,
"grad_norm": 6.254690170288086,
"learning_rate": 3.456410618180503e-07,
"logits/chosen": -3.0379281044006348,
"logits/rejected": -3.0891363620758057,
"logps/chosen": -0.6726978421211243,
"logps/rejected": -1.0303423404693604,
"loss": 0.7295,
"odds_ratio_loss": 0.5683592557907104,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.06726977974176407,
"rewards/margins": 0.03576444461941719,
"rewards/rejected": -0.10303423553705215,
"sft_loss": 0.6726978421211243,
"step": 1540
},
{
"epoch": 2.5055566781167915,
"grad_norm": 2.056490898132324,
"learning_rate": 3.244579563165753e-07,
"logits/chosen": -3.0974183082580566,
"logits/rejected": -3.1228187084198,
"logps/chosen": -0.711591362953186,
"logps/rejected": -1.0496152639389038,
"loss": 0.7689,
"odds_ratio_loss": 0.5726686716079712,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07115913927555084,
"rewards/margins": 0.033802393823862076,
"rewards/rejected": -0.10496153682470322,
"sft_loss": 0.711591362953186,
"step": 1550
},
{
"epoch": 2.521721559911093,
"grad_norm": 1.438101053237915,
"learning_rate": 3.038995691099697e-07,
"logits/chosen": -3.08099365234375,
"logits/rejected": -3.1116271018981934,
"logps/chosen": -0.7405564188957214,
"logps/rejected": -1.075670838356018,
"loss": 0.7985,
"odds_ratio_loss": 0.5797412991523743,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07405563443899155,
"rewards/margins": 0.0335114449262619,
"rewards/rejected": -0.10756708681583405,
"sft_loss": 0.7405564188957214,
"step": 1560
},
{
"epoch": 2.5378864417053952,
"grad_norm": 9.284449577331543,
"learning_rate": 2.839718036468192e-07,
"logits/chosen": -3.127392530441284,
"logits/rejected": -3.1654505729675293,
"logps/chosen": -0.841861367225647,
"logps/rejected": -0.9995994567871094,
"loss": 0.9096,
"odds_ratio_loss": 0.6777136921882629,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.0841861441731453,
"rewards/margins": 0.015773817896842957,
"rewards/rejected": -0.09995995461940765,
"sft_loss": 0.841861367225647,
"step": 1570
},
{
"epoch": 2.5540513234996967,
"grad_norm": 2.9508066177368164,
"learning_rate": 2.646803822893723e-07,
"logits/chosen": -3.1416640281677246,
"logits/rejected": -3.1590213775634766,
"logps/chosen": -0.7539080381393433,
"logps/rejected": -0.9754577875137329,
"loss": 0.8158,
"odds_ratio_loss": 0.618767499923706,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07539081573486328,
"rewards/margins": 0.02215495891869068,
"rewards/rejected": -0.09754578024148941,
"sft_loss": 0.7539080381393433,
"step": 1580
},
{
"epoch": 2.570216205293999,
"grad_norm": 2.8406593799591064,
"learning_rate": 2.460308446703341e-07,
"logits/chosen": -3.1490259170532227,
"logits/rejected": -3.129563808441162,
"logps/chosen": -0.7597383260726929,
"logps/rejected": -0.9107489585876465,
"loss": 0.8268,
"odds_ratio_loss": 0.671094536781311,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07597382366657257,
"rewards/margins": 0.01510106772184372,
"rewards/rejected": -0.09107489883899689,
"sft_loss": 0.7597383260726929,
"step": 1590
},
{
"epoch": 2.5863810870883004,
"grad_norm": 3.932293653488159,
"learning_rate": 2.2802854610213143e-07,
"logits/chosen": -3.0843563079833984,
"logits/rejected": -3.080658197402954,
"logps/chosen": -0.6698503494262695,
"logps/rejected": -1.0470666885375977,
"loss": 0.7238,
"odds_ratio_loss": 0.5398669838905334,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.06698504090309143,
"rewards/margins": 0.03772163391113281,
"rewards/rejected": -0.10470668226480484,
"sft_loss": 0.6698503494262695,
"step": 1600
},
{
"epoch": 2.6025459688826027,
"grad_norm": 3.3689260482788086,
"learning_rate": 2.106786560391072e-07,
"logits/chosen": -3.08052134513855,
"logits/rejected": -3.1363070011138916,
"logps/chosen": -0.7746056914329529,
"logps/rejected": -0.9704595804214478,
"loss": 0.8356,
"odds_ratio_loss": 0.6099545359611511,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07746056467294693,
"rewards/margins": 0.019585389643907547,
"rewards/rejected": -0.09704595804214478,
"sft_loss": 0.7746056914329529,
"step": 1610
},
{
"epoch": 2.6187108506769046,
"grad_norm": 1.7709842920303345,
"learning_rate": 1.9398615659308255e-07,
"logits/chosen": -3.1098244190216064,
"logits/rejected": -3.1587026119232178,
"logps/chosen": -0.7326648831367493,
"logps/rejected": -0.9062278866767883,
"loss": 0.7951,
"odds_ratio_loss": 0.624650776386261,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07326649129390717,
"rewards/margins": 0.017356308177113533,
"rewards/rejected": -0.09062279760837555,
"sft_loss": 0.7326648831367493,
"step": 1620
},
{
"epoch": 2.6348757324712064,
"grad_norm": 2.0778872966766357,
"learning_rate": 1.7795584110272184e-07,
"logits/chosen": -3.1534359455108643,
"logits/rejected": -3.1423211097717285,
"logps/chosen": -0.7454973459243774,
"logps/rejected": -0.936767578125,
"loss": 0.8102,
"odds_ratio_loss": 0.6473931670188904,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07454973459243774,
"rewards/margins": 0.019127020612359047,
"rewards/rejected": -0.09367676079273224,
"sft_loss": 0.7454973459243774,
"step": 1630
},
{
"epoch": 2.6510406142655083,
"grad_norm": 3.814206838607788,
"learning_rate": 1.6259231275709636e-07,
"logits/chosen": -3.149001121520996,
"logits/rejected": -3.1615355014801025,
"logps/chosen": -0.716022789478302,
"logps/rejected": -0.8857117891311646,
"loss": 0.7825,
"odds_ratio_loss": 0.6644307971000671,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07160228490829468,
"rewards/margins": 0.016968905925750732,
"rewards/rejected": -0.08857118338346481,
"sft_loss": 0.716022789478302,
"step": 1640
},
{
"epoch": 2.66720549605981,
"grad_norm": 1.4002131223678589,
"learning_rate": 1.478999832738548e-07,
"logits/chosen": -3.1335608959198,
"logits/rejected": -3.1366758346557617,
"logps/chosen": -0.7220789194107056,
"logps/rejected": -1.0130765438079834,
"loss": 0.7818,
"odds_ratio_loss": 0.597649335861206,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07220789790153503,
"rewards/margins": 0.02909977361559868,
"rewards/rejected": -0.10130767524242401,
"sft_loss": 0.7220789194107056,
"step": 1650
},
{
"epoch": 2.683370377854112,
"grad_norm": 2.5022165775299072,
"learning_rate": 1.338830716323769e-07,
"logits/chosen": -3.0961499214172363,
"logits/rejected": -3.1152243614196777,
"logps/chosen": -0.7230523228645325,
"logps/rejected": -0.8930460810661316,
"loss": 0.7852,
"odds_ratio_loss": 0.6216701865196228,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07230523973703384,
"rewards/margins": 0.016999371349811554,
"rewards/rejected": -0.0893046110868454,
"sft_loss": 0.7230523228645325,
"step": 1660
},
{
"epoch": 2.699535259648414,
"grad_norm": 3.8939476013183594,
"learning_rate": 1.205456028622723e-07,
"logits/chosen": -3.1416923999786377,
"logits/rejected": -3.130157232284546,
"logps/chosen": -0.6890848875045776,
"logps/rejected": -0.9714914560317993,
"loss": 0.7441,
"odds_ratio_loss": 0.550174355506897,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.06890848278999329,
"rewards/margins": 0.028240656480193138,
"rewards/rejected": -0.09714914858341217,
"sft_loss": 0.6890848875045776,
"step": 1670
},
{
"epoch": 2.7157001414427158,
"grad_norm": 1.6698890924453735,
"learning_rate": 1.0789140688756805e-07,
"logits/chosen": -3.1566967964172363,
"logits/rejected": -3.149425745010376,
"logps/chosen": -0.7115526795387268,
"logps/rejected": -0.9621270298957825,
"loss": 0.7691,
"odds_ratio_loss": 0.5751715898513794,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07115526497364044,
"rewards/margins": 0.025057435035705566,
"rewards/rejected": -0.09621270000934601,
"sft_loss": 0.7115526795387268,
"step": 1680
},
{
"epoch": 2.7318650232370176,
"grad_norm": 5.264458179473877,
"learning_rate": 9.592411742693098e-08,
"logits/chosen": -3.078084945678711,
"logits/rejected": -3.0921549797058105,
"logps/chosen": -0.7386698722839355,
"logps/rejected": -0.9247487783432007,
"loss": 0.8064,
"odds_ratio_loss": 0.6777487397193909,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07386698573827744,
"rewards/margins": 0.018607888370752335,
"rewards/rejected": -0.09247487038373947,
"sft_loss": 0.7386698722839355,
"step": 1690
},
{
"epoch": 2.7480299050313195,
"grad_norm": 2.103006362915039,
"learning_rate": 8.464717095022168e-08,
"logits/chosen": -3.064146041870117,
"logits/rejected": -3.0713071823120117,
"logps/chosen": -0.7025493383407593,
"logps/rejected": -0.955985426902771,
"loss": 0.7608,
"odds_ratio_loss": 0.5827249884605408,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.07025493681430817,
"rewards/margins": 0.025343608111143112,
"rewards/rejected": -0.09559854120016098,
"sft_loss": 0.7025493383407593,
"step": 1700
},
{
"epoch": 2.7641947868256214,
"grad_norm": 2.800687789916992,
"learning_rate": 7.406380569169841e-08,
"logits/chosen": -3.0971622467041016,
"logits/rejected": -3.142702341079712,
"logps/chosen": -0.752859890460968,
"logps/rejected": -0.8985753059387207,
"loss": 0.8177,
"odds_ratio_loss": 0.6488373875617981,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.07528599351644516,
"rewards/margins": 0.014571529813110828,
"rewards/rejected": -0.08985751867294312,
"sft_loss": 0.752859890460968,
"step": 1710
},
{
"epoch": 2.7803596686199232,
"grad_norm": 11.731331825256348,
"learning_rate": 6.417706072013808e-08,
"logits/chosen": -3.1208655834198,
"logits/rejected": -3.1548678874969482,
"logps/chosen": -0.7304657101631165,
"logps/rejected": -0.9190757870674133,
"loss": 0.7931,
"odds_ratio_loss": 0.6268162727355957,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07304656505584717,
"rewards/margins": 0.01886100508272648,
"rewards/rejected": -0.0919075757265091,
"sft_loss": 0.7304657101631165,
"step": 1720
},
{
"epoch": 2.796524550414225,
"grad_norm": 4.022006034851074,
"learning_rate": 5.498977506615294e-08,
"logits/chosen": -3.1070895195007324,
"logits/rejected": -3.145142078399658,
"logps/chosen": -0.7585607767105103,
"logps/rejected": -0.914089560508728,
"loss": 0.8244,
"odds_ratio_loss": 0.6582176089286804,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.07585608214139938,
"rewards/margins": 0.015552881173789501,
"rewards/rejected": -0.09140896052122116,
"sft_loss": 0.7585607767105103,
"step": 1730
},
{
"epoch": 2.812689432208527,
"grad_norm": 2.081886053085327,
"learning_rate": 4.6504586906947756e-08,
"logits/chosen": -3.1400046348571777,
"logits/rejected": -3.159885883331299,
"logps/chosen": -0.7848642468452454,
"logps/rejected": -0.9642190933227539,
"loss": 0.8438,
"odds_ratio_loss": 0.5897284746170044,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07848642766475677,
"rewards/margins": 0.01793549209833145,
"rewards/rejected": -0.09642191231250763,
"sft_loss": 0.7848642468452454,
"step": 1740
},
{
"epoch": 2.828854314002829,
"grad_norm": 9.670836448669434,
"learning_rate": 3.8723932808754914e-08,
"logits/chosen": -3.1591954231262207,
"logits/rejected": -3.179676055908203,
"logps/chosen": -0.832345187664032,
"logps/rejected": -0.9517787098884583,
"loss": 0.8997,
"odds_ratio_loss": 0.6738894581794739,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0832345113158226,
"rewards/margins": 0.01194334588944912,
"rewards/rejected": -0.09517785906791687,
"sft_loss": 0.832345187664032,
"step": 1750
},
{
"epoch": 2.8450191957971307,
"grad_norm": 4.27302360534668,
"learning_rate": 3.1650047027158014e-08,
"logits/chosen": -3.132617473602295,
"logits/rejected": -3.1612956523895264,
"logps/chosen": -0.7341474294662476,
"logps/rejected": -0.9550244212150574,
"loss": 0.7929,
"odds_ratio_loss": 0.5877509117126465,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07341475039720535,
"rewards/margins": 0.0220877043902874,
"rewards/rejected": -0.09550245106220245,
"sft_loss": 0.7341474294662476,
"step": 1760
},
{
"epoch": 2.8611840775914326,
"grad_norm": 3.392895460128784,
"learning_rate": 2.5284960865517848e-08,
"logits/chosen": -3.093036413192749,
"logits/rejected": -3.1320202350616455,
"logps/chosen": -0.6883664727210999,
"logps/rejected": -0.9603809118270874,
"loss": 0.7445,
"odds_ratio_loss": 0.5616418123245239,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.06883664429187775,
"rewards/margins": 0.027201462537050247,
"rewards/rejected": -0.0960381031036377,
"sft_loss": 0.6883664727210999,
"step": 1770
},
{
"epoch": 2.8773489593857344,
"grad_norm": 2.937486410140991,
"learning_rate": 1.9630502091670388e-08,
"logits/chosen": -3.09773325920105,
"logits/rejected": -3.1201109886169434,
"logps/chosen": -0.7190070152282715,
"logps/rejected": -0.9666651487350464,
"loss": 0.7761,
"odds_ratio_loss": 0.5710408687591553,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07190070301294327,
"rewards/margins": 0.02476580999791622,
"rewards/rejected": -0.09666652232408524,
"sft_loss": 0.7190070152282715,
"step": 1780
},
{
"epoch": 2.8935138411800363,
"grad_norm": 3.6368203163146973,
"learning_rate": 1.4688294413074677e-08,
"logits/chosen": -3.076406717300415,
"logits/rejected": -3.1111807823181152,
"logps/chosen": -0.6571230888366699,
"logps/rejected": -0.9525207281112671,
"loss": 0.7146,
"odds_ratio_loss": 0.5746604204177856,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.06571231037378311,
"rewards/margins": 0.02953975833952427,
"rewards/rejected": -0.09525207430124283,
"sft_loss": 0.6571230888366699,
"step": 1790
},
{
"epoch": 2.909678722974338,
"grad_norm": 2.2510814666748047,
"learning_rate": 1.0459757010556626e-08,
"logits/chosen": -3.1042840480804443,
"logits/rejected": -3.1341700553894043,
"logps/chosen": -0.7357559204101562,
"logps/rejected": -0.8860123753547668,
"loss": 0.7989,
"odds_ratio_loss": 0.6314257383346558,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.07357560098171234,
"rewards/margins": 0.015025639906525612,
"rewards/rejected": -0.0886012390255928,
"sft_loss": 0.7357559204101562,
"step": 1800
},
{
"epoch": 2.92584360476864,
"grad_norm": 2.13761043548584,
"learning_rate": 6.94610413078306e-09,
"logits/chosen": -3.028646945953369,
"logits/rejected": -3.105076789855957,
"logps/chosen": -0.7331860661506653,
"logps/rejected": -1.0410078763961792,
"loss": 0.7943,
"odds_ratio_loss": 0.6107637882232666,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07331861555576324,
"rewards/margins": 0.03078218176960945,
"rewards/rejected": -0.104100801050663,
"sft_loss": 0.7331860661506653,
"step": 1810
},
{
"epoch": 2.942008486562942,
"grad_norm": 2.224161148071289,
"learning_rate": 4.14834473758563e-09,
"logits/chosen": -3.0697426795959473,
"logits/rejected": -3.098311185836792,
"logps/chosen": -0.6795364618301392,
"logps/rejected": -0.9198546409606934,
"loss": 0.7354,
"odds_ratio_loss": 0.5586915016174316,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.06795365363359451,
"rewards/margins": 0.02403181977570057,
"rewards/rejected": -0.09198546409606934,
"sft_loss": 0.6795364618301392,
"step": 1820
},
{
"epoch": 2.9581733683572438,
"grad_norm": 1.7413092851638794,
"learning_rate": 2.067282222230349e-09,
"logits/chosen": -3.101163864135742,
"logits/rejected": -3.1490769386291504,
"logps/chosen": -0.69708651304245,
"logps/rejected": -0.9820737838745117,
"loss": 0.753,
"odds_ratio_loss": 0.5595835447311401,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.06970865279436111,
"rewards/margins": 0.028498733416199684,
"rewards/rejected": -0.09820737689733505,
"sft_loss": 0.69708651304245,
"step": 1830
},
{
"epoch": 2.9743382501515456,
"grad_norm": 9.5276517868042,
"learning_rate": 7.035141727212979e-10,
"logits/chosen": -3.111255168914795,
"logits/rejected": -3.1476521492004395,
"logps/chosen": -0.6813724040985107,
"logps/rejected": -0.9502272605895996,
"loss": 0.7369,
"odds_ratio_loss": 0.555463969707489,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.06813724339008331,
"rewards/margins": 0.026885494589805603,
"rewards/rejected": -0.09502272307872772,
"sft_loss": 0.6813724040985107,
"step": 1840
},
{
"epoch": 2.9905031319458475,
"grad_norm": 10.846762657165527,
"learning_rate": 5.743220219761592e-11,
"logits/chosen": -3.105236530303955,
"logits/rejected": -3.1389710903167725,
"logps/chosen": -0.8284826278686523,
"logps/rejected": -1.0016568899154663,
"loss": 0.8952,
"odds_ratio_loss": 0.6669169068336487,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08284827321767807,
"rewards/margins": 0.017317429184913635,
"rewards/rejected": -0.1001657024025917,
"sft_loss": 0.8284826278686523,
"step": 1850
},
{
"epoch": 2.9969690846635686,
"step": 1854,
"total_flos": 2.0970902870084813e+18,
"train_loss": 0.8330582246554065,
"train_runtime": 34111.2463,
"train_samples_per_second": 0.871,
"train_steps_per_second": 0.054
}
],
"logging_steps": 10,
"max_steps": 1854,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 2.0970902870084813e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}