Mistral-7B-Instruct-v0.3-ORPO / trainer_state.json
chchen's picture
End of training
ddf3f54 verified
raw
history blame
No virus
107 kB
{
"best_metric": 0.8734214901924133,
"best_model_checkpoint": "saves/Mistral-7B-Instruct-v0.3/lora/orpo/checkpoint-1500",
"epoch": 2.997999555456768,
"eval_steps": 500,
"global_step": 1686,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017781729273171815,
"grad_norm": 2.492755651473999,
"learning_rate": 4.9995745934141085e-06,
"logits/chosen": -2.952331781387329,
"logits/rejected": -2.973951816558838,
"logps/chosen": -1.0092018842697144,
"logps/rejected": -1.3774441480636597,
"loss": 1.0773,
"odds_ratio_loss": 0.6805658936500549,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.1009201854467392,
"rewards/margins": 0.03682423382997513,
"rewards/rejected": -0.13774441182613373,
"sft_loss": 1.0092018842697144,
"step": 10
},
{
"epoch": 0.03556345854634363,
"grad_norm": 8.398221969604492,
"learning_rate": 4.9982812903243405e-06,
"logits/chosen": -2.924294948577881,
"logits/rejected": -2.994157314300537,
"logps/chosen": -1.0329482555389404,
"logps/rejected": -1.2759336233139038,
"loss": 1.1014,
"odds_ratio_loss": 0.6848658323287964,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.10329482704401016,
"rewards/margins": 0.024298548698425293,
"rewards/rejected": -0.12759338319301605,
"sft_loss": 1.0329482555389404,
"step": 20
},
{
"epoch": 0.05334518781951545,
"grad_norm": 2.1793289184570312,
"learning_rate": 4.996120496405222e-06,
"logits/chosen": -2.9549760818481445,
"logits/rejected": -2.9626007080078125,
"logps/chosen": -1.0005769729614258,
"logps/rejected": -1.488245964050293,
"loss": 1.0616,
"odds_ratio_loss": 0.6106585264205933,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1000577062368393,
"rewards/margins": 0.048766884952783585,
"rewards/rejected": -0.14882458746433258,
"sft_loss": 1.0005769729614258,
"step": 30
},
{
"epoch": 0.07112691709268726,
"grad_norm": 2.8343796730041504,
"learning_rate": 4.99309296196014e-06,
"logits/chosen": -2.924588203430176,
"logits/rejected": -2.9867076873779297,
"logps/chosen": -1.0675694942474365,
"logps/rejected": -1.2621403932571411,
"loss": 1.1331,
"odds_ratio_loss": 0.6557044982910156,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.10675694793462753,
"rewards/margins": 0.019457101821899414,
"rewards/rejected": -0.12621404230594635,
"sft_loss": 1.0675694942474365,
"step": 40
},
{
"epoch": 0.08890864636585907,
"grad_norm": 2.04829478263855,
"learning_rate": 4.989199738255166e-06,
"logits/chosen": -2.956892967224121,
"logits/rejected": -2.9954347610473633,
"logps/chosen": -0.9171065092086792,
"logps/rejected": -1.2301478385925293,
"loss": 0.9797,
"odds_ratio_loss": 0.6256455183029175,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0917106494307518,
"rewards/margins": 0.03130412846803665,
"rewards/rejected": -0.12301478534936905,
"sft_loss": 0.9171065092086792,
"step": 50
},
{
"epoch": 0.1066903756390309,
"grad_norm": 5.50786018371582,
"learning_rate": 4.984442177154031e-06,
"logits/chosen": -2.9277195930480957,
"logits/rejected": -2.9476375579833984,
"logps/chosen": -0.9940068125724792,
"logps/rejected": -1.2362287044525146,
"loss": 1.0632,
"odds_ratio_loss": 0.6921108365058899,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.09940069168806076,
"rewards/margins": 0.02422218956053257,
"rewards/rejected": -0.12362287193536758,
"sft_loss": 0.9940068125724792,
"step": 60
},
{
"epoch": 0.12447210491220272,
"grad_norm": 1.7581864595413208,
"learning_rate": 4.978821930648704e-06,
"logits/chosen": -2.9205572605133057,
"logits/rejected": -2.973936080932617,
"logps/chosen": -0.9317066073417664,
"logps/rejected": -1.0809520483016968,
"loss": 1.0027,
"odds_ratio_loss": 0.7100493907928467,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.093170665204525,
"rewards/margins": 0.014924542978405952,
"rewards/rejected": -0.1080952063202858,
"sft_loss": 0.9317066073417664,
"step": 70
},
{
"epoch": 0.14225383418537452,
"grad_norm": 3.336517572402954,
"learning_rate": 4.97234095028576e-06,
"logits/chosen": -2.9701972007751465,
"logits/rejected": -2.971057415008545,
"logps/chosen": -0.9333993792533875,
"logps/rejected": -1.1716864109039307,
"loss": 0.9964,
"odds_ratio_loss": 0.6299672722816467,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0933399349451065,
"rewards/margins": 0.023828700184822083,
"rewards/rejected": -0.11716864258050919,
"sft_loss": 0.9333993792533875,
"step": 80
},
{
"epoch": 0.16003556345854633,
"grad_norm": 1.333382248878479,
"learning_rate": 4.965001486488743e-06,
"logits/chosen": -2.9220926761627197,
"logits/rejected": -2.951408863067627,
"logps/chosen": -0.8873022198677063,
"logps/rejected": -1.1284812688827515,
"loss": 0.9472,
"odds_ratio_loss": 0.5987495183944702,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08873023092746735,
"rewards/margins": 0.02411791868507862,
"rewards/rejected": -0.11284814029932022,
"sft_loss": 0.8873022198677063,
"step": 90
},
{
"epoch": 0.17781729273171815,
"grad_norm": 1.715163230895996,
"learning_rate": 4.956806087776732e-06,
"logits/chosen": -3.0303444862365723,
"logits/rejected": -3.04186749458313,
"logps/chosen": -0.9242479205131531,
"logps/rejected": -1.3088445663452148,
"loss": 0.9857,
"odds_ratio_loss": 0.6147152185440063,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.09242479503154755,
"rewards/margins": 0.038459669798612595,
"rewards/rejected": -0.13088446855545044,
"sft_loss": 0.9242479205131531,
"step": 100
},
{
"epoch": 0.19559902200489,
"grad_norm": 2.949481248855591,
"learning_rate": 4.947757599879411e-06,
"logits/chosen": -3.0064456462860107,
"logits/rejected": -3.0399320125579834,
"logps/chosen": -0.9601238965988159,
"logps/rejected": -1.2331488132476807,
"loss": 1.0259,
"odds_ratio_loss": 0.6574784517288208,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.09601239114999771,
"rewards/margins": 0.027302492409944534,
"rewards/rejected": -0.12331489473581314,
"sft_loss": 0.9601238965988159,
"step": 110
},
{
"epoch": 0.2133807512780618,
"grad_norm": 1.2405259609222412,
"learning_rate": 4.937859164748931e-06,
"logits/chosen": -3.0256314277648926,
"logits/rejected": -3.044879913330078,
"logps/chosen": -0.8803631067276001,
"logps/rejected": -1.0130887031555176,
"loss": 0.9473,
"odds_ratio_loss": 0.668988823890686,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08803631365299225,
"rewards/margins": 0.013272559270262718,
"rewards/rejected": -0.10130886733531952,
"sft_loss": 0.8803631067276001,
"step": 120
},
{
"epoch": 0.23116248055123362,
"grad_norm": 2.040465831756592,
"learning_rate": 4.92711421946891e-06,
"logits/chosen": -3.0067856311798096,
"logits/rejected": -2.970612049102783,
"logps/chosen": -0.8932172060012817,
"logps/rejected": -1.1789153814315796,
"loss": 0.9558,
"odds_ratio_loss": 0.6254903674125671,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08932172507047653,
"rewards/margins": 0.028569817543029785,
"rewards/rejected": -0.11789155006408691,
"sft_loss": 0.8932172060012817,
"step": 130
},
{
"epoch": 0.24894420982440543,
"grad_norm": 1.586767554283142,
"learning_rate": 4.915526495060961e-06,
"logits/chosen": -3.0685572624206543,
"logits/rejected": -3.0535078048706055,
"logps/chosen": -0.8625435829162598,
"logps/rejected": -1.1399943828582764,
"loss": 0.9238,
"odds_ratio_loss": 0.612372636795044,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08625435084104538,
"rewards/margins": 0.027745097875595093,
"rewards/rejected": -0.11399944871664047,
"sft_loss": 0.8625435829162598,
"step": 140
},
{
"epoch": 0.26672593909757725,
"grad_norm": 1.953273057937622,
"learning_rate": 4.903100015189153e-06,
"logits/chosen": -3.0217204093933105,
"logits/rejected": -3.059971570968628,
"logps/chosen": -0.8424757719039917,
"logps/rejected": -1.0430591106414795,
"loss": 0.9062,
"odds_ratio_loss": 0.6374109983444214,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08424757421016693,
"rewards/margins": 0.020058346912264824,
"rewards/rejected": -0.1043059229850769,
"sft_loss": 0.8424757719039917,
"step": 150
},
{
"epoch": 0.28450766837074903,
"grad_norm": 4.5785298347473145,
"learning_rate": 4.889839094762848e-06,
"logits/chosen": -3.001889705657959,
"logits/rejected": -3.0023865699768066,
"logps/chosen": -0.887285053730011,
"logps/rejected": -1.1001445055007935,
"loss": 0.952,
"odds_ratio_loss": 0.6474493741989136,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08872850239276886,
"rewards/margins": 0.02128593623638153,
"rewards/rejected": -0.11001445353031158,
"sft_loss": 0.887285053730011,
"step": 160
},
{
"epoch": 0.3022893976439209,
"grad_norm": 1.2961128950119019,
"learning_rate": 4.875748338438416e-06,
"logits/chosen": -3.055670738220215,
"logits/rejected": -3.0634965896606445,
"logps/chosen": -0.8919625282287598,
"logps/rejected": -1.0326893329620361,
"loss": 0.9602,
"odds_ratio_loss": 0.6827921271324158,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08919624984264374,
"rewards/margins": 0.014072686433792114,
"rewards/rejected": -0.10326894372701645,
"sft_loss": 0.8919625282287598,
"step": 170
},
{
"epoch": 0.32007112691709266,
"grad_norm": 2.0726120471954346,
"learning_rate": 4.8608326390203386e-06,
"logits/chosen": -3.05631685256958,
"logits/rejected": -3.0371289253234863,
"logps/chosen": -0.8544119000434875,
"logps/rejected": -1.0456076860427856,
"loss": 0.9188,
"odds_ratio_loss": 0.6443353891372681,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08544120192527771,
"rewards/margins": 0.01911957561969757,
"rewards/rejected": -0.10456077009439468,
"sft_loss": 0.8544119000434875,
"step": 180
},
{
"epoch": 0.3378528561902645,
"grad_norm": 2.612196922302246,
"learning_rate": 4.845097175762251e-06,
"logits/chosen": -3.0822339057922363,
"logits/rejected": -3.1029491424560547,
"logps/chosen": -0.907193660736084,
"logps/rejected": -1.0687024593353271,
"loss": 0.9743,
"odds_ratio_loss": 0.6714697480201721,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.09071935713291168,
"rewards/margins": 0.01615087315440178,
"rewards/rejected": -0.10687023401260376,
"sft_loss": 0.907193660736084,
"step": 190
},
{
"epoch": 0.3556345854634363,
"grad_norm": 3.9038424491882324,
"learning_rate": 4.8285474125685286e-06,
"logits/chosen": -3.066904067993164,
"logits/rejected": -3.086334705352783,
"logps/chosen": -0.9056366086006165,
"logps/rejected": -1.0252189636230469,
"loss": 0.9757,
"odds_ratio_loss": 0.7004884481430054,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.09056366980075836,
"rewards/margins": 0.01195824146270752,
"rewards/rejected": -0.10252189636230469,
"sft_loss": 0.9056366086006165,
"step": 200
},
{
"epoch": 0.37341631473660813,
"grad_norm": 1.57925283908844,
"learning_rate": 4.811189096097025e-06,
"logits/chosen": -3.044316530227661,
"logits/rejected": -3.068372964859009,
"logps/chosen": -0.882292628288269,
"logps/rejected": -1.1092549562454224,
"loss": 0.9473,
"odds_ratio_loss": 0.6500683426856995,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08822925388813019,
"rewards/margins": 0.0226962361484766,
"rewards/rejected": -0.11092549562454224,
"sft_loss": 0.882292628288269,
"step": 210
},
{
"epoch": 0.39119804400978,
"grad_norm": 3.1554384231567383,
"learning_rate": 4.793028253763633e-06,
"logits/chosen": -3.1082234382629395,
"logits/rejected": -3.1198127269744873,
"logps/chosen": -0.878674328327179,
"logps/rejected": -1.0521525144577026,
"loss": 0.9507,
"odds_ratio_loss": 0.7204707860946655,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08786743879318237,
"rewards/margins": 0.01734781637787819,
"rewards/rejected": -0.10521525144577026,
"sft_loss": 0.878674328327179,
"step": 220
},
{
"epoch": 0.40897977328295176,
"grad_norm": 3.0515213012695312,
"learning_rate": 4.774071191649352e-06,
"logits/chosen": -3.0294933319091797,
"logits/rejected": -3.036970615386963,
"logps/chosen": -0.8506752252578735,
"logps/rejected": -1.1133465766906738,
"loss": 0.9107,
"odds_ratio_loss": 0.6004607677459717,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08506752550601959,
"rewards/margins": 0.026267144829034805,
"rewards/rejected": -0.1113346666097641,
"sft_loss": 0.8506752252578735,
"step": 230
},
{
"epoch": 0.4267615025561236,
"grad_norm": 4.107941627502441,
"learning_rate": 4.7543244923105975e-06,
"logits/chosen": -3.052797794342041,
"logits/rejected": -3.0861849784851074,
"logps/chosen": -0.9088889956474304,
"logps/rejected": -0.9762862324714661,
"loss": 0.9829,
"odds_ratio_loss": 0.7397087812423706,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.09088890254497528,
"rewards/margins": 0.006739714182913303,
"rewards/rejected": -0.09762861579656601,
"sft_loss": 0.9088889956474304,
"step": 240
},
{
"epoch": 0.4445432318292954,
"grad_norm": 1.7963005304336548,
"learning_rate": 4.733795012493506e-06,
"logits/chosen": -3.077770471572876,
"logits/rejected": -3.1305344104766846,
"logps/chosen": -0.9044251441955566,
"logps/rejected": -1.036949872970581,
"loss": 0.9739,
"odds_ratio_loss": 0.6946715712547302,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.09044251590967178,
"rewards/margins": 0.013252484612166882,
"rewards/rejected": -0.10369499772787094,
"sft_loss": 0.9044251441955566,
"step": 250
},
{
"epoch": 0.46232496110246724,
"grad_norm": 1.4697704315185547,
"learning_rate": 4.712489880753035e-06,
"logits/chosen": -3.078249931335449,
"logits/rejected": -3.072510004043579,
"logps/chosen": -0.81315678358078,
"logps/rejected": -0.9732586741447449,
"loss": 0.8776,
"odds_ratio_loss": 0.644811749458313,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08131568133831024,
"rewards/margins": 0.016010191291570663,
"rewards/rejected": -0.0973258763551712,
"sft_loss": 0.81315678358078,
"step": 260
},
{
"epoch": 0.480106690375639,
"grad_norm": 5.489832878112793,
"learning_rate": 4.690416494977673e-06,
"logits/chosen": -3.067095994949341,
"logits/rejected": -3.109727382659912,
"logps/chosen": -0.8310638666152954,
"logps/rejected": -1.1116752624511719,
"loss": 0.8925,
"odds_ratio_loss": 0.6144498586654663,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0831063836812973,
"rewards/margins": 0.028061147779226303,
"rewards/rejected": -0.1111675351858139,
"sft_loss": 0.8310638666152954,
"step": 270
},
{
"epoch": 0.49788841964881086,
"grad_norm": 1.4339563846588135,
"learning_rate": 4.667582519820639e-06,
"logits/chosen": -3.068760395050049,
"logits/rejected": -3.1055545806884766,
"logps/chosen": -0.9461262822151184,
"logps/rejected": -1.0382800102233887,
"loss": 1.018,
"odds_ratio_loss": 0.7186475992202759,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.09461262822151184,
"rewards/margins": 0.009215375408530235,
"rewards/rejected": -0.10382799804210663,
"sft_loss": 0.9461262822151184,
"step": 280
},
{
"epoch": 0.5156701489219827,
"grad_norm": 3.6930854320526123,
"learning_rate": 4.643995884038443e-06,
"logits/chosen": -3.0967042446136475,
"logits/rejected": -3.1315600872039795,
"logps/chosen": -0.8749726414680481,
"logps/rejected": -1.058611273765564,
"loss": 0.9404,
"odds_ratio_loss": 0.6541867256164551,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08749726414680481,
"rewards/margins": 0.018363865092396736,
"rewards/rejected": -0.1058611273765564,
"sft_loss": 0.8749726414680481,
"step": 290
},
{
"epoch": 0.5334518781951545,
"grad_norm": 1.7125145196914673,
"learning_rate": 4.6196647777377475e-06,
"logits/chosen": -3.0732457637786865,
"logits/rejected": -3.093071222305298,
"logps/chosen": -0.8488075137138367,
"logps/rejected": -0.9796191453933716,
"loss": 0.9176,
"odds_ratio_loss": 0.6876064538955688,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.0848807543516159,
"rewards/margins": 0.013081158511340618,
"rewards/rejected": -0.0979619026184082,
"sft_loss": 0.8488075137138367,
"step": 300
},
{
"epoch": 0.5512336074683263,
"grad_norm": 1.6855430603027344,
"learning_rate": 4.59459764953147e-06,
"logits/chosen": -3.115689754486084,
"logits/rejected": -3.099546194076538,
"logps/chosen": -0.8772395253181458,
"logps/rejected": -1.0352530479431152,
"loss": 0.9431,
"odds_ratio_loss": 0.6584862470626831,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.08772395551204681,
"rewards/margins": 0.015801362693309784,
"rewards/rejected": -0.1035253182053566,
"sft_loss": 0.8772395253181458,
"step": 310
},
{
"epoch": 0.5690153367414981,
"grad_norm": 3.039783239364624,
"learning_rate": 4.568803203605133e-06,
"logits/chosen": -3.1416523456573486,
"logits/rejected": -3.1220498085021973,
"logps/chosen": -0.8318166732788086,
"logps/rejected": -1.035842776298523,
"loss": 0.8969,
"odds_ratio_loss": 0.650640606880188,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08318166434764862,
"rewards/margins": 0.020402604714035988,
"rewards/rejected": -0.10358426719903946,
"sft_loss": 0.8318166732788086,
"step": 320
},
{
"epoch": 0.58679706601467,
"grad_norm": 1.5947670936584473,
"learning_rate": 4.542290396694462e-06,
"logits/chosen": -3.100538969039917,
"logits/rejected": -3.1203720569610596,
"logps/chosen": -0.855880081653595,
"logps/rejected": -1.0065386295318604,
"loss": 0.9255,
"odds_ratio_loss": 0.6964801549911499,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.0855880007147789,
"rewards/margins": 0.01506584882736206,
"rewards/rejected": -0.10065384954214096,
"sft_loss": 0.855880081653595,
"step": 330
},
{
"epoch": 0.6045787952878418,
"grad_norm": 3.104470729827881,
"learning_rate": 4.515068434975298e-06,
"logits/chosen": -3.0526375770568848,
"logits/rejected": -3.0920848846435547,
"logps/chosen": -0.8729322552680969,
"logps/rejected": -1.0834085941314697,
"loss": 0.9376,
"odds_ratio_loss": 0.6469117403030396,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08729322254657745,
"rewards/margins": 0.021047625690698624,
"rewards/rejected": -0.10834084451198578,
"sft_loss": 0.8729322552680969,
"step": 340
},
{
"epoch": 0.6223605245610135,
"grad_norm": 1.5185527801513672,
"learning_rate": 4.487146770866887e-06,
"logits/chosen": -3.1082205772399902,
"logits/rejected": -3.146754503250122,
"logps/chosen": -0.866405189037323,
"logps/rejected": -0.985508143901825,
"loss": 0.9343,
"odds_ratio_loss": 0.6789035797119141,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.08664052188396454,
"rewards/margins": 0.011910290457308292,
"rewards/rejected": -0.09855081886053085,
"sft_loss": 0.866405189037323,
"step": 350
},
{
"epoch": 0.6401422538341853,
"grad_norm": 2.0399420261383057,
"learning_rate": 4.458535099749666e-06,
"logits/chosen": -3.114278793334961,
"logits/rejected": -3.1290249824523926,
"logps/chosen": -0.9554277658462524,
"logps/rejected": -1.0395957231521606,
"loss": 1.0312,
"odds_ratio_loss": 0.7574664950370789,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.095542773604393,
"rewards/margins": 0.008416806347668171,
"rewards/rejected": -0.1039595827460289,
"sft_loss": 0.9554277658462524,
"step": 360
},
{
"epoch": 0.6579239831073572,
"grad_norm": 2.1999988555908203,
"learning_rate": 4.429243356598694e-06,
"logits/chosen": -3.0874438285827637,
"logits/rejected": -3.098285436630249,
"logps/chosen": -0.8949627876281738,
"logps/rejected": -1.1512229442596436,
"loss": 0.9596,
"odds_ratio_loss": 0.645936131477356,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08949627727270126,
"rewards/margins": 0.02562602423131466,
"rewards/rejected": -0.11512230336666107,
"sft_loss": 0.8949627876281738,
"step": 370
},
{
"epoch": 0.675705712380529,
"grad_norm": 5.087428092956543,
"learning_rate": 4.399281712533875e-06,
"logits/chosen": -3.118114709854126,
"logits/rejected": -3.1233677864074707,
"logps/chosen": -0.8167802095413208,
"logps/rejected": -0.9615100026130676,
"loss": 0.8862,
"odds_ratio_loss": 0.6943861246109009,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.08167801797389984,
"rewards/margins": 0.01447297353297472,
"rewards/rejected": -0.09615099430084229,
"sft_loss": 0.8167802095413208,
"step": 380
},
{
"epoch": 0.6934874416537008,
"grad_norm": 2.3240132331848145,
"learning_rate": 4.368660571288192e-06,
"logits/chosen": -3.1258320808410645,
"logits/rejected": -3.1687591075897217,
"logps/chosen": -0.8471567034721375,
"logps/rejected": -0.9503539204597473,
"loss": 0.917,
"odds_ratio_loss": 0.6983430981636047,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.08471567928791046,
"rewards/margins": 0.010319720953702927,
"rewards/rejected": -0.09503538906574249,
"sft_loss": 0.8471567034721375,
"step": 390
},
{
"epoch": 0.7112691709268726,
"grad_norm": 1.7417421340942383,
"learning_rate": 4.337390565595163e-06,
"logits/chosen": -3.0782721042633057,
"logits/rejected": -3.099292278289795,
"logps/chosen": -0.9293394088745117,
"logps/rejected": -0.9703164100646973,
"loss": 1.0041,
"odds_ratio_loss": 0.7478191256523132,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.09293393790721893,
"rewards/margins": 0.004097697325050831,
"rewards/rejected": -0.09703163802623749,
"sft_loss": 0.9293394088745117,
"step": 400
},
{
"epoch": 0.7290509002000445,
"grad_norm": 2.362359046936035,
"learning_rate": 4.305482553496786e-06,
"logits/chosen": -3.0271878242492676,
"logits/rejected": -3.0372941493988037,
"logps/chosen": -0.8028362989425659,
"logps/rejected": -0.994833767414093,
"loss": 0.868,
"odds_ratio_loss": 0.6515198945999146,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08028362691402435,
"rewards/margins": 0.01919974386692047,
"rewards/rejected": -0.09948337078094482,
"sft_loss": 0.8028362989425659,
"step": 410
},
{
"epoch": 0.7468326294732163,
"grad_norm": 2.797231674194336,
"learning_rate": 4.272947614573244e-06,
"logits/chosen": -3.0782933235168457,
"logits/rejected": -3.116833448410034,
"logps/chosen": -0.8883565068244934,
"logps/rejected": -1.025665283203125,
"loss": 0.9551,
"odds_ratio_loss": 0.6675280332565308,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08883564919233322,
"rewards/margins": 0.013730885460972786,
"rewards/rejected": -0.10256652534008026,
"sft_loss": 0.8883565068244934,
"step": 420
},
{
"epoch": 0.7646143587463881,
"grad_norm": 1.3282934427261353,
"learning_rate": 4.23979704609569e-06,
"logits/chosen": -3.10003399848938,
"logits/rejected": -3.1360583305358887,
"logps/chosen": -0.8379910588264465,
"logps/rejected": -0.9617422819137573,
"loss": 0.9024,
"odds_ratio_loss": 0.6443312168121338,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08379910886287689,
"rewards/margins": 0.01237512193620205,
"rewards/rejected": -0.0961742252111435,
"sft_loss": 0.8379910588264465,
"step": 430
},
{
"epoch": 0.78239608801956,
"grad_norm": 2.023909330368042,
"learning_rate": 4.206042359103435e-06,
"logits/chosen": -3.0400068759918213,
"logits/rejected": -3.081937313079834,
"logps/chosen": -0.8709232211112976,
"logps/rejected": -1.0840847492218018,
"loss": 0.9355,
"odds_ratio_loss": 0.6454750299453735,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0870923325419426,
"rewards/margins": 0.021316152065992355,
"rewards/rejected": -0.10840848833322525,
"sft_loss": 0.8709232211112976,
"step": 440
},
{
"epoch": 0.8001778172927317,
"grad_norm": 1.6410523653030396,
"learning_rate": 4.17169527440691e-06,
"logits/chosen": -3.09321928024292,
"logits/rejected": -3.0952792167663574,
"logps/chosen": -0.858233630657196,
"logps/rejected": -0.9470105171203613,
"loss": 0.9306,
"odds_ratio_loss": 0.7236040830612183,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.08582336455583572,
"rewards/margins": 0.008877689950168133,
"rewards/rejected": -0.09470105171203613,
"sft_loss": 0.858233630657196,
"step": 450
},
{
"epoch": 0.8179595465659035,
"grad_norm": 3.3464248180389404,
"learning_rate": 4.136767718517797e-06,
"logits/chosen": -3.0975563526153564,
"logits/rejected": -3.112638235092163,
"logps/chosen": -0.7778853178024292,
"logps/rejected": -0.9875160455703735,
"loss": 0.8402,
"odds_ratio_loss": 0.6234691143035889,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07778853178024292,
"rewards/margins": 0.020963061600923538,
"rewards/rejected": -0.09875159710645676,
"sft_loss": 0.7778853178024292,
"step": 460
},
{
"epoch": 0.8357412758390753,
"grad_norm": 6.556829929351807,
"learning_rate": 4.1012718195077196e-06,
"logits/chosen": -3.1534528732299805,
"logits/rejected": -3.208789348602295,
"logps/chosen": -0.8605148196220398,
"logps/rejected": -0.9714852571487427,
"loss": 0.9289,
"odds_ratio_loss": 0.6839339137077332,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.08605148643255234,
"rewards/margins": 0.011097034439444542,
"rewards/rejected": -0.09714852273464203,
"sft_loss": 0.8605148196220398,
"step": 470
},
{
"epoch": 0.8535230051122472,
"grad_norm": 1.461613655090332,
"learning_rate": 4.065219902796953e-06,
"logits/chosen": -3.090115785598755,
"logits/rejected": -3.088887929916382,
"logps/chosen": -0.8349069356918335,
"logps/rejected": -1.0508782863616943,
"loss": 0.8996,
"odds_ratio_loss": 0.6472839713096619,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08349069207906723,
"rewards/margins": 0.021597128361463547,
"rewards/rejected": -0.10508781671524048,
"sft_loss": 0.8349069356918335,
"step": 480
},
{
"epoch": 0.871304734385419,
"grad_norm": 1.3518534898757935,
"learning_rate": 4.028624486874608e-06,
"logits/chosen": -3.1022493839263916,
"logits/rejected": -3.1475415229797363,
"logps/chosen": -0.8089026212692261,
"logps/rejected": -1.0444796085357666,
"loss": 0.8747,
"odds_ratio_loss": 0.6580694317817688,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08089026063680649,
"rewards/margins": 0.023557689040899277,
"rewards/rejected": -0.10444796085357666,
"sft_loss": 0.8089026212692261,
"step": 490
},
{
"epoch": 0.8890864636585908,
"grad_norm": 1.6888097524642944,
"learning_rate": 3.99149827895177e-06,
"logits/chosen": -3.127162456512451,
"logits/rejected": -3.143782138824463,
"logps/chosen": -0.8767441511154175,
"logps/rejected": -0.9676705598831177,
"loss": 0.9464,
"odds_ratio_loss": 0.6963816285133362,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.08767442405223846,
"rewards/margins": 0.009092646650969982,
"rewards/rejected": -0.09676706790924072,
"sft_loss": 0.8767441511154175,
"step": 500
},
{
"epoch": 0.8890864636585908,
"eval_logits/chosen": -3.114872932434082,
"eval_logits/rejected": -3.143216371536255,
"eval_logps/chosen": -0.828136146068573,
"eval_logps/rejected": -1.0306241512298584,
"eval_loss": 0.8918758630752563,
"eval_odds_ratio_loss": 0.6373972296714783,
"eval_rewards/accuracies": 0.5690000057220459,
"eval_rewards/chosen": -0.08281362056732178,
"eval_rewards/margins": 0.020248806104063988,
"eval_rewards/rejected": -0.10306241363286972,
"eval_runtime": 348.9195,
"eval_samples_per_second": 2.866,
"eval_sft_loss": 0.828136146068573,
"eval_steps_per_second": 1.433,
"step": 500
},
{
"epoch": 0.9068681929317626,
"grad_norm": 1.511196494102478,
"learning_rate": 3.953854170549114e-06,
"logits/chosen": -3.118255138397217,
"logits/rejected": -3.1173043251037598,
"logps/chosen": -0.8566571474075317,
"logps/rejected": -0.9489420056343079,
"loss": 0.9245,
"odds_ratio_loss": 0.6780352592468262,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.08566570281982422,
"rewards/margins": 0.009228493086993694,
"rewards/rejected": -0.09489420056343079,
"sft_loss": 0.8566571474075317,
"step": 510
},
{
"epoch": 0.9246499222049345,
"grad_norm": 2.5393214225769043,
"learning_rate": 3.91570523302051e-06,
"logits/chosen": -3.1395115852355957,
"logits/rejected": -3.147805690765381,
"logps/chosen": -0.7916607856750488,
"logps/rejected": -0.9899943470954895,
"loss": 0.8583,
"odds_ratio_loss": 0.6660428047180176,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.07916607707738876,
"rewards/margins": 0.01983334682881832,
"rewards/rejected": -0.09899942576885223,
"sft_loss": 0.7916607856750488,
"step": 520
},
{
"epoch": 0.9424316514781063,
"grad_norm": 1.5944111347198486,
"learning_rate": 3.8770647130141996e-06,
"logits/chosen": -3.150245428085327,
"logits/rejected": -3.141481876373291,
"logps/chosen": -0.8228055834770203,
"logps/rejected": -0.9976710081100464,
"loss": 0.8888,
"odds_ratio_loss": 0.6599084734916687,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.08228056132793427,
"rewards/margins": 0.017486536875367165,
"rewards/rejected": -0.09976708889007568,
"sft_loss": 0.8228055834770203,
"step": 530
},
{
"epoch": 0.960213380751278,
"grad_norm": 2.3844027519226074,
"learning_rate": 3.837946027873086e-06,
"logits/chosen": -3.106717586517334,
"logits/rejected": -3.109330177307129,
"logps/chosen": -0.8973621129989624,
"logps/rejected": -1.0649579763412476,
"loss": 0.966,
"odds_ratio_loss": 0.6861368417739868,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.089736208319664,
"rewards/margins": 0.01675957441329956,
"rewards/rejected": -0.10649579763412476,
"sft_loss": 0.8973621129989624,
"step": 540
},
{
"epoch": 0.9779951100244498,
"grad_norm": 5.164077281951904,
"learning_rate": 3.7983627609757713e-06,
"logits/chosen": -3.167064666748047,
"logits/rejected": -3.16302490234375,
"logps/chosen": -0.8979376554489136,
"logps/rejected": -0.9942687153816223,
"loss": 0.9667,
"odds_ratio_loss": 0.6874598264694214,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.08979376405477524,
"rewards/margins": 0.009633105248212814,
"rewards/rejected": -0.09942687302827835,
"sft_loss": 0.8979376554489136,
"step": 550
},
{
"epoch": 0.9957768392976217,
"grad_norm": 1.5917680263519287,
"learning_rate": 3.758328657019924e-06,
"logits/chosen": -3.1346166133880615,
"logits/rejected": -3.1376471519470215,
"logps/chosen": -0.8218947649002075,
"logps/rejected": -1.0310758352279663,
"loss": 0.8891,
"odds_ratio_loss": 0.6719549298286438,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08218947798013687,
"rewards/margins": 0.020918114110827446,
"rewards/rejected": -0.10310759395360947,
"sft_loss": 0.8218947649002075,
"step": 560
},
{
"epoch": 1.0135585685707935,
"grad_norm": 6.842823505401611,
"learning_rate": 3.717857617249642e-06,
"logits/chosen": -3.1036324501037598,
"logits/rejected": -3.145653009414673,
"logps/chosen": -0.8951196670532227,
"logps/rejected": -1.0871955156326294,
"loss": 0.9655,
"odds_ratio_loss": 0.7041261792182922,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.08951196819543839,
"rewards/margins": 0.01920759119093418,
"rewards/rejected": -0.10871955007314682,
"sft_loss": 0.8951196670532227,
"step": 570
},
{
"epoch": 1.0313402978439654,
"grad_norm": 1.3233413696289062,
"learning_rate": 3.6769636946284543e-06,
"logits/chosen": -3.145310878753662,
"logits/rejected": -3.1411328315734863,
"logps/chosen": -0.8030536770820618,
"logps/rejected": -0.9519485235214233,
"loss": 0.8686,
"odds_ratio_loss": 0.6551788449287415,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08030536770820618,
"rewards/margins": 0.01488947682082653,
"rewards/rejected": -0.09519485384225845,
"sft_loss": 0.8030536770820618,
"step": 580
},
{
"epoch": 1.049122027117137,
"grad_norm": 1.561957597732544,
"learning_rate": 3.6356610889596355e-06,
"logits/chosen": -3.1137917041778564,
"logits/rejected": -3.15521502494812,
"logps/chosen": -0.8285630941390991,
"logps/rejected": -0.9533591270446777,
"loss": 0.895,
"odds_ratio_loss": 0.6645855903625488,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.08285631239414215,
"rewards/margins": 0.012479597702622414,
"rewards/rejected": -0.09533590078353882,
"sft_loss": 0.8285630941390991,
"step": 590
},
{
"epoch": 1.066903756390309,
"grad_norm": 2.0521960258483887,
"learning_rate": 3.593964141955541e-06,
"logits/chosen": -3.0969531536102295,
"logits/rejected": -3.0988070964813232,
"logps/chosen": -0.8090001344680786,
"logps/rejected": -0.9104982614517212,
"loss": 0.8782,
"odds_ratio_loss": 0.6919649839401245,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08090001344680786,
"rewards/margins": 0.01014980860054493,
"rewards/rejected": -0.09104982018470764,
"sft_loss": 0.8090001344680786,
"step": 600
},
{
"epoch": 1.0846854856634809,
"grad_norm": 1.7750905752182007,
"learning_rate": 3.5518873322576573e-06,
"logits/chosen": -3.044728994369507,
"logits/rejected": -3.0994975566864014,
"logps/chosen": -0.8208731412887573,
"logps/rejected": -0.9647499918937683,
"loss": 0.8855,
"odds_ratio_loss": 0.6465209126472473,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08208731561899185,
"rewards/margins": 0.014387677423655987,
"rewards/rejected": -0.09647499024868011,
"sft_loss": 0.8208731412887573,
"step": 610
},
{
"epoch": 1.1024672149366526,
"grad_norm": 1.3415883779525757,
"learning_rate": 3.5094452704091143e-06,
"logits/chosen": -3.0923125743865967,
"logits/rejected": -3.084038734436035,
"logps/chosen": -0.7946149110794067,
"logps/rejected": -0.9547470808029175,
"loss": 0.8602,
"odds_ratio_loss": 0.6559656858444214,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0794614925980568,
"rewards/margins": 0.016013216227293015,
"rewards/rejected": -0.09547470510005951,
"sft_loss": 0.7946149110794067,
"step": 620
},
{
"epoch": 1.1202489442098245,
"grad_norm": 3.5497653484344482,
"learning_rate": 3.46665269378139e-06,
"logits/chosen": -3.059072971343994,
"logits/rejected": -3.069256067276001,
"logps/chosen": -0.8376399874687195,
"logps/rejected": -0.9781678318977356,
"loss": 0.9066,
"odds_ratio_loss": 0.6894850134849548,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08376399427652359,
"rewards/margins": 0.01405278779566288,
"rewards/rejected": -0.09781678020954132,
"sft_loss": 0.8376399874687195,
"step": 630
},
{
"epoch": 1.1380306734829961,
"grad_norm": 3.2695467472076416,
"learning_rate": 3.4235244614569794e-06,
"logits/chosen": -3.0830600261688232,
"logits/rejected": -3.0843684673309326,
"logps/chosen": -0.8990565538406372,
"logps/rejected": -0.9821575880050659,
"loss": 0.9716,
"odds_ratio_loss": 0.7258378267288208,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08990565687417984,
"rewards/margins": 0.008310111239552498,
"rewards/rejected": -0.09821576625108719,
"sft_loss": 0.8990565538406372,
"step": 640
},
{
"epoch": 1.155812402756168,
"grad_norm": 1.196513056755066,
"learning_rate": 3.3800755490698008e-06,
"logits/chosen": -3.126264810562134,
"logits/rejected": -3.124204635620117,
"logps/chosen": -0.8168405294418335,
"logps/rejected": -1.0534611940383911,
"loss": 0.8775,
"odds_ratio_loss": 0.6068293452262878,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08168406784534454,
"rewards/margins": 0.023662051185965538,
"rewards/rejected": -0.10534612834453583,
"sft_loss": 0.8168405294418335,
"step": 650
},
{
"epoch": 1.17359413202934,
"grad_norm": 1.7081139087677002,
"learning_rate": 3.3363210436051287e-06,
"logits/chosen": -3.130343198776245,
"logits/rejected": -3.126983165740967,
"logps/chosen": -0.8528251647949219,
"logps/rejected": -1.019565224647522,
"loss": 0.9228,
"odds_ratio_loss": 0.6997644901275635,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.08528250455856323,
"rewards/margins": 0.01667400822043419,
"rewards/rejected": -0.10195653140544891,
"sft_loss": 0.8528251647949219,
"step": 660
},
{
"epoch": 1.1913758613025116,
"grad_norm": 1.8305083513259888,
"learning_rate": 3.292276138160867e-06,
"logits/chosen": -3.109675645828247,
"logits/rejected": -3.1157774925231934,
"logps/chosen": -0.7888280153274536,
"logps/rejected": -0.9577935338020325,
"loss": 0.8538,
"odds_ratio_loss": 0.6501932144165039,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.07888280600309372,
"rewards/margins": 0.016896549612283707,
"rewards/rejected": -0.09577935189008713,
"sft_loss": 0.7888280153274536,
"step": 670
},
{
"epoch": 1.2091575905756835,
"grad_norm": 2.812506675720215,
"learning_rate": 3.2479561266719694e-06,
"logits/chosen": -3.1019396781921387,
"logits/rejected": -3.107755184173584,
"logps/chosen": -0.8298002481460571,
"logps/rejected": -0.9901537895202637,
"loss": 0.8946,
"odds_ratio_loss": 0.6482952833175659,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08298002183437347,
"rewards/margins": 0.01603536494076252,
"rewards/rejected": -0.09901538491249084,
"sft_loss": 0.8298002481460571,
"step": 680
},
{
"epoch": 1.2269393198488552,
"grad_norm": 3.2340750694274902,
"learning_rate": 3.2033763985998533e-06,
"logits/chosen": -3.121992588043213,
"logits/rejected": -3.124979257583618,
"logps/chosen": -0.7747536301612854,
"logps/rejected": -1.1079862117767334,
"loss": 0.8347,
"odds_ratio_loss": 0.5990911722183228,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07747535407543182,
"rewards/margins": 0.033323269337415695,
"rewards/rejected": -0.11079863458871841,
"sft_loss": 0.7747536301612854,
"step": 690
},
{
"epoch": 1.244721049122027,
"grad_norm": 1.9602211713790894,
"learning_rate": 3.1585524335886335e-06,
"logits/chosen": -3.1363680362701416,
"logits/rejected": -3.1302547454833984,
"logps/chosen": -0.7745245695114136,
"logps/rejected": -0.9697211980819702,
"loss": 0.838,
"odds_ratio_loss": 0.6345950365066528,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07745245844125748,
"rewards/margins": 0.019519677385687828,
"rewards/rejected": -0.09697212278842926,
"sft_loss": 0.7745245695114136,
"step": 700
},
{
"epoch": 1.262502778395199,
"grad_norm": 3.0812952518463135,
"learning_rate": 3.1134997960900536e-06,
"logits/chosen": -3.0771961212158203,
"logits/rejected": -3.08510160446167,
"logps/chosen": -0.7646561861038208,
"logps/rejected": -1.0509836673736572,
"loss": 0.8246,
"odds_ratio_loss": 0.5993521809577942,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07646562159061432,
"rewards/margins": 0.028632745146751404,
"rewards/rejected": -0.10509836673736572,
"sft_loss": 0.7646561861038208,
"step": 710
},
{
"epoch": 1.2802845076683709,
"grad_norm": 1.5706931352615356,
"learning_rate": 3.0682341299589583e-06,
"logits/chosen": -3.096446990966797,
"logits/rejected": -3.110931873321533,
"logps/chosen": -0.8055674433708191,
"logps/rejected": -0.9553298950195312,
"loss": 0.8716,
"odds_ratio_loss": 0.6605285406112671,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.08055675029754639,
"rewards/margins": 0.014976252801716328,
"rewards/rejected": -0.09553299844264984,
"sft_loss": 0.8055674433708191,
"step": 720
},
{
"epoch": 1.2980662369415426,
"grad_norm": 1.670327067375183,
"learning_rate": 3.022771153021201e-06,
"logits/chosen": -3.127776622772217,
"logits/rejected": -3.1598572731018066,
"logps/chosen": -0.7699373960494995,
"logps/rejected": -0.9526535272598267,
"loss": 0.8354,
"odds_ratio_loss": 0.654297947883606,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.07699373364448547,
"rewards/margins": 0.018271619454026222,
"rewards/rejected": -0.09526535123586655,
"sft_loss": 0.7699373960494995,
"step": 730
},
{
"epoch": 1.3158479662147144,
"grad_norm": 1.666502833366394,
"learning_rate": 2.9771266516158625e-06,
"logits/chosen": -3.0938611030578613,
"logits/rejected": -3.111356735229492,
"logps/chosen": -0.795330822467804,
"logps/rejected": -0.9487611055374146,
"loss": 0.8641,
"odds_ratio_loss": 0.687196671962738,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.07953307777643204,
"rewards/margins": 0.015343038365244865,
"rewards/rejected": -0.09487612545490265,
"sft_loss": 0.795330822467804,
"step": 740
},
{
"epoch": 1.3336296954878861,
"grad_norm": 1.529642939567566,
"learning_rate": 2.9313164751136802e-06,
"logits/chosen": -3.082942485809326,
"logits/rejected": -3.1158337593078613,
"logps/chosen": -0.789255678653717,
"logps/rejected": -0.9912340044975281,
"loss": 0.8503,
"odds_ratio_loss": 0.6099725961685181,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07892556488513947,
"rewards/margins": 0.020197834819555283,
"rewards/rejected": -0.09912340342998505,
"sft_loss": 0.789255678653717,
"step": 750
},
{
"epoch": 1.351411424761058,
"grad_norm": 2.9339799880981445,
"learning_rate": 2.8853565304135956e-06,
"logits/chosen": -3.1478281021118164,
"logits/rejected": -3.144963264465332,
"logps/chosen": -0.8711767196655273,
"logps/rejected": -0.9750477075576782,
"loss": 0.9427,
"odds_ratio_loss": 0.7154635190963745,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.08711767941713333,
"rewards/margins": 0.010387107729911804,
"rewards/rejected": -0.09750477969646454,
"sft_loss": 0.8711767196655273,
"step": 760
},
{
"epoch": 1.36919315403423,
"grad_norm": 3.5656025409698486,
"learning_rate": 2.839262776419313e-06,
"logits/chosen": -3.1182093620300293,
"logits/rejected": -3.1154582500457764,
"logps/chosen": -0.7866981029510498,
"logps/rejected": -1.0985205173492432,
"loss": 0.8467,
"odds_ratio_loss": 0.5998324155807495,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07866980135440826,
"rewards/margins": 0.031182238832116127,
"rewards/rejected": -0.10985203832387924,
"sft_loss": 0.7866981029510498,
"step": 770
},
{
"epoch": 1.3869748833074016,
"grad_norm": 2.08962345123291,
"learning_rate": 2.793051218497817e-06,
"logits/chosen": -3.1209683418273926,
"logits/rejected": -3.1391050815582275,
"logps/chosen": -0.797200083732605,
"logps/rejected": -0.8991384506225586,
"loss": 0.8653,
"odds_ratio_loss": 0.6806570887565613,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.07972002029418945,
"rewards/margins": 0.010193833149969578,
"rewards/rejected": -0.08991385996341705,
"sft_loss": 0.797200083732605,
"step": 780
},
{
"epoch": 1.4047566125805735,
"grad_norm": 1.7654404640197754,
"learning_rate": 2.7467379029217437e-06,
"logits/chosen": -3.092345714569092,
"logits/rejected": -3.099000930786133,
"logps/chosen": -0.7963561415672302,
"logps/rejected": -0.9906966090202332,
"loss": 0.8603,
"odds_ratio_loss": 0.639264702796936,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.0796356275677681,
"rewards/margins": 0.019434038549661636,
"rewards/rejected": -0.09906966984272003,
"sft_loss": 0.7963561415672302,
"step": 790
},
{
"epoch": 1.4225383418537452,
"grad_norm": 1.4254413843154907,
"learning_rate": 2.7003389112975546e-06,
"logits/chosen": -3.1396844387054443,
"logits/rejected": -3.180053949356079,
"logps/chosen": -0.844267725944519,
"logps/rejected": -0.9890397191047668,
"loss": 0.9108,
"odds_ratio_loss": 0.6649594902992249,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.08442677557468414,
"rewards/margins": 0.01447719894349575,
"rewards/rejected": -0.09890398383140564,
"sft_loss": 0.844267725944519,
"step": 800
},
{
"epoch": 1.440320071126917,
"grad_norm": 3.8261585235595703,
"learning_rate": 2.653870354981437e-06,
"logits/chosen": -3.123039722442627,
"logits/rejected": -3.1270766258239746,
"logps/chosen": -0.7622265219688416,
"logps/rejected": -0.9670180082321167,
"loss": 0.8268,
"odds_ratio_loss": 0.6459091901779175,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.07622265070676804,
"rewards/margins": 0.020479146391153336,
"rewards/rejected": -0.09670180082321167,
"sft_loss": 0.7622265219688416,
"step": 810
},
{
"epoch": 1.458101800400089,
"grad_norm": 6.478664875030518,
"learning_rate": 2.6073483694848777e-06,
"logits/chosen": -3.0914266109466553,
"logits/rejected": -3.1468262672424316,
"logps/chosen": -0.7940482497215271,
"logps/rejected": -0.9618217349052429,
"loss": 0.8608,
"odds_ratio_loss": 0.6678633093833923,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07940482348203659,
"rewards/margins": 0.016777347773313522,
"rewards/rejected": -0.09618218243122101,
"sft_loss": 0.7940482497215271,
"step": 820
},
{
"epoch": 1.4758835296732609,
"grad_norm": 1.7955982685089111,
"learning_rate": 2.560789108871847e-06,
"logits/chosen": -3.087249755859375,
"logits/rejected": -3.099762439727783,
"logps/chosen": -0.8293372392654419,
"logps/rejected": -1.0816946029663086,
"loss": 0.8942,
"odds_ratio_loss": 0.6483136415481567,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.08293372392654419,
"rewards/margins": 0.025235742330551147,
"rewards/rejected": -0.10816947370767593,
"sft_loss": 0.8293372392654419,
"step": 830
},
{
"epoch": 1.4936652589464325,
"grad_norm": 4.553436279296875,
"learning_rate": 2.514208740149544e-06,
"logits/chosen": -3.123802900314331,
"logits/rejected": -3.1615843772888184,
"logps/chosen": -0.8601408004760742,
"logps/rejected": -1.0482033491134644,
"loss": 0.9282,
"odds_ratio_loss": 0.6806772947311401,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08601407706737518,
"rewards/margins": 0.018806258216500282,
"rewards/rejected": -0.10482033342123032,
"sft_loss": 0.8601408004760742,
"step": 840
},
{
"epoch": 1.5114469882196042,
"grad_norm": 3.1794512271881104,
"learning_rate": 2.46762343765464e-06,
"logits/chosen": -3.1444077491760254,
"logits/rejected": -3.1544933319091797,
"logps/chosen": -0.8352905511856079,
"logps/rejected": -1.0490363836288452,
"loss": 0.898,
"odds_ratio_loss": 0.6273452639579773,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0835290476679802,
"rewards/margins": 0.021374579519033432,
"rewards/rejected": -0.10490362346172333,
"sft_loss": 0.8352905511856079,
"step": 850
},
{
"epoch": 1.5292287174927761,
"grad_norm": 1.8062447309494019,
"learning_rate": 2.4210493774369903e-06,
"logits/chosen": -3.0938150882720947,
"logits/rejected": -3.102355718612671,
"logps/chosen": -0.8377913236618042,
"logps/rejected": -0.9871052503585815,
"loss": 0.9059,
"odds_ratio_loss": 0.6812715530395508,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08377913385629654,
"rewards/margins": 0.014931400306522846,
"rewards/rejected": -0.09871052205562592,
"sft_loss": 0.8377913236618042,
"step": 860
},
{
"epoch": 1.547010446765948,
"grad_norm": 1.5386985540390015,
"learning_rate": 2.374502731642732e-06,
"logits/chosen": -3.1051342487335205,
"logits/rejected": -3.1245017051696777,
"logps/chosen": -0.8524861335754395,
"logps/rejected": -1.017881155014038,
"loss": 0.9179,
"odds_ratio_loss": 0.6543157696723938,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08524861931800842,
"rewards/margins": 0.016539499163627625,
"rewards/rejected": -0.10178811848163605,
"sft_loss": 0.8524861335754395,
"step": 870
},
{
"epoch": 1.56479217603912,
"grad_norm": 2.0160138607025146,
"learning_rate": 2.3279996628987556e-06,
"logits/chosen": -3.093174457550049,
"logits/rejected": -3.1271913051605225,
"logps/chosen": -0.8324817419052124,
"logps/rejected": -0.9784995913505554,
"loss": 0.8998,
"odds_ratio_loss": 0.6732369661331177,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.08324816823005676,
"rewards/margins": 0.01460178941488266,
"rewards/rejected": -0.09784995764493942,
"sft_loss": 0.8324817419052124,
"step": 880
},
{
"epoch": 1.5825739053122916,
"grad_norm": 1.5362610816955566,
"learning_rate": 2.281556318700474e-06,
"logits/chosen": -3.1063926219940186,
"logits/rejected": -3.150496482849121,
"logps/chosen": -0.7895249128341675,
"logps/rejected": -0.9089393615722656,
"loss": 0.8592,
"odds_ratio_loss": 0.6968772411346436,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07895249128341675,
"rewards/margins": 0.011941445991396904,
"rewards/rejected": -0.0908939391374588,
"sft_loss": 0.7895249128341675,
"step": 890
},
{
"epoch": 1.6003556345854635,
"grad_norm": 3.5220394134521484,
"learning_rate": 2.2351888258048408e-06,
"logits/chosen": -3.0469326972961426,
"logits/rejected": -3.095856189727783,
"logps/chosen": -0.798681914806366,
"logps/rejected": -0.9773387908935547,
"loss": 0.8628,
"odds_ratio_loss": 0.641067385673523,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07986819744110107,
"rewards/margins": 0.017865682020783424,
"rewards/rejected": -0.09773387759923935,
"sft_loss": 0.798681914806366,
"step": 900
},
{
"epoch": 1.6181373638586352,
"grad_norm": 2.17846941947937,
"learning_rate": 2.188913284630584e-06,
"logits/chosen": -3.1179308891296387,
"logits/rejected": -3.14939284324646,
"logps/chosen": -0.8766034841537476,
"logps/rejected": -0.9665753245353699,
"loss": 0.9481,
"odds_ratio_loss": 0.7149003148078918,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.08766035735607147,
"rewards/margins": 0.008997179567813873,
"rewards/rejected": -0.09665753692388535,
"sft_loss": 0.8766034841537476,
"step": 910
},
{
"epoch": 1.635919093131807,
"grad_norm": 6.3163251876831055,
"learning_rate": 2.1427457636675652e-06,
"logits/chosen": -3.118419647216797,
"logits/rejected": -3.1435821056365967,
"logps/chosen": -0.8221105337142944,
"logps/rejected": -0.9653439521789551,
"loss": 0.8905,
"odds_ratio_loss": 0.6840168833732605,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08221106976270676,
"rewards/margins": 0.014323326759040356,
"rewards/rejected": -0.09653439372777939,
"sft_loss": 0.8221105337142944,
"step": 920
},
{
"epoch": 1.653700822404979,
"grad_norm": 2.3323636054992676,
"learning_rate": 2.096702293897247e-06,
"logits/chosen": -3.118809700012207,
"logits/rejected": -3.1240172386169434,
"logps/chosen": -0.7927727699279785,
"logps/rejected": -1.0566480159759521,
"loss": 0.856,
"odds_ratio_loss": 0.6325381994247437,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.07927727699279785,
"rewards/margins": 0.026387536898255348,
"rewards/rejected": -0.10566481202840805,
"sft_loss": 0.7927727699279785,
"step": 930
},
{
"epoch": 1.6714825516781509,
"grad_norm": 2.5871617794036865,
"learning_rate": 2.0507988632261672e-06,
"logits/chosen": -3.0783491134643555,
"logits/rejected": -3.142695188522339,
"logps/chosen": -0.788642406463623,
"logps/rejected": -0.9959260821342468,
"loss": 0.8502,
"odds_ratio_loss": 0.6157304048538208,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07886423915624619,
"rewards/margins": 0.020728373900055885,
"rewards/rejected": -0.09959261119365692,
"sft_loss": 0.788642406463623,
"step": 940
},
{
"epoch": 1.6892642809513225,
"grad_norm": 6.09738302230835,
"learning_rate": 2.005051410934382e-06,
"logits/chosen": -3.1027965545654297,
"logits/rejected": -3.1486849784851074,
"logps/chosen": -0.8924347162246704,
"logps/rejected": -1.025657057762146,
"loss": 0.9605,
"odds_ratio_loss": 0.6808988451957703,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.08924347162246704,
"rewards/margins": 0.013322234153747559,
"rewards/rejected": -0.1025657057762146,
"sft_loss": 0.8924347162246704,
"step": 950
},
{
"epoch": 1.7070460102244942,
"grad_norm": 2.1956799030303955,
"learning_rate": 1.9594758221407843e-06,
"logits/chosen": -3.1190600395202637,
"logits/rejected": -3.1192500591278076,
"logps/chosen": -0.7558837532997131,
"logps/rejected": -0.9841734766960144,
"loss": 0.8162,
"odds_ratio_loss": 0.6032260060310364,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07558837532997131,
"rewards/margins": 0.0228289682418108,
"rewards/rejected": -0.09841735661029816,
"sft_loss": 0.7558837532997131,
"step": 960
},
{
"epoch": 1.724827739497666,
"grad_norm": 3.2010116577148438,
"learning_rate": 1.9140879222872408e-06,
"logits/chosen": -3.119006872177124,
"logits/rejected": -3.144542694091797,
"logps/chosen": -0.7783070206642151,
"logps/rejected": -0.8883264660835266,
"loss": 0.8488,
"odds_ratio_loss": 0.70525062084198,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.07783070206642151,
"rewards/margins": 0.011001949198544025,
"rewards/rejected": -0.08883266150951385,
"sft_loss": 0.7783070206642151,
"step": 970
},
{
"epoch": 1.742609468770838,
"grad_norm": 1.4073106050491333,
"learning_rate": 1.8689034716434346e-06,
"logits/chosen": -3.143158197402954,
"logits/rejected": -3.162026882171631,
"logps/chosen": -0.849888026714325,
"logps/rejected": -0.9449575543403625,
"loss": 0.9213,
"odds_ratio_loss": 0.7145692706108093,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08498881012201309,
"rewards/margins": 0.009506945498287678,
"rewards/rejected": -0.0944957509636879,
"sft_loss": 0.849888026714325,
"step": 980
},
{
"epoch": 1.76039119804401,
"grad_norm": 1.643964171409607,
"learning_rate": 1.8239381598343576e-06,
"logits/chosen": -3.1246304512023926,
"logits/rejected": -3.1464321613311768,
"logps/chosen": -0.7999427914619446,
"logps/rejected": -0.9506388902664185,
"loss": 0.8683,
"odds_ratio_loss": 0.6839095950126648,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.07999428361654282,
"rewards/margins": 0.015069609507918358,
"rewards/rejected": -0.09506388753652573,
"sft_loss": 0.7999427914619446,
"step": 990
},
{
"epoch": 1.7781729273171816,
"grad_norm": 4.767948150634766,
"learning_rate": 1.779207600392312e-06,
"logits/chosen": -3.123835325241089,
"logits/rejected": -3.1298935413360596,
"logps/chosen": -0.8067057728767395,
"logps/rejected": -0.9445611238479614,
"loss": 0.8737,
"odds_ratio_loss": 0.6700451970100403,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.080670565366745,
"rewards/margins": 0.013785535469651222,
"rewards/rejected": -0.09445609152317047,
"sft_loss": 0.8067057728767395,
"step": 1000
},
{
"epoch": 1.7781729273171816,
"eval_logits/chosen": -3.1139109134674072,
"eval_logits/rejected": -3.1430606842041016,
"eval_logps/chosen": -0.8136406540870667,
"eval_logps/rejected": -1.018557071685791,
"eval_loss": 0.8773505687713623,
"eval_odds_ratio_loss": 0.6370999217033386,
"eval_rewards/accuracies": 0.5759999752044678,
"eval_rewards/chosen": -0.08136406540870667,
"eval_rewards/margins": 0.020491650328040123,
"eval_rewards/rejected": -0.10185571014881134,
"eval_runtime": 189.1267,
"eval_samples_per_second": 5.287,
"eval_sft_loss": 0.8136406540870667,
"eval_steps_per_second": 2.644,
"step": 1000
},
{
"epoch": 1.7959546565903532,
"grad_norm": 2.2980809211730957,
"learning_rate": 1.7347273253353552e-06,
"logits/chosen": -3.0896313190460205,
"logits/rejected": -3.117469310760498,
"logps/chosen": -0.8154736757278442,
"logps/rejected": -0.9821268320083618,
"loss": 0.8833,
"odds_ratio_loss": 0.6783260107040405,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.08154736459255219,
"rewards/margins": 0.016665320843458176,
"rewards/rejected": -0.09821268171072006,
"sft_loss": 0.8154736757278442,
"step": 1010
},
{
"epoch": 1.8137363858635251,
"grad_norm": 4.3619232177734375,
"learning_rate": 1.690512779774029e-06,
"logits/chosen": -3.108875036239624,
"logits/rejected": -3.119654655456543,
"logps/chosen": -0.8301160931587219,
"logps/rejected": -1.0722554922103882,
"loss": 0.8927,
"odds_ratio_loss": 0.6254863142967224,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.08301161974668503,
"rewards/margins": 0.024213943630456924,
"rewards/rejected": -0.10722555965185165,
"sft_loss": 0.8301160931587219,
"step": 1020
},
{
"epoch": 1.831518115136697,
"grad_norm": 2.628239870071411,
"learning_rate": 1.6465793165482838e-06,
"logits/chosen": -3.098904609680176,
"logits/rejected": -3.1030189990997314,
"logps/chosen": -0.7733818888664246,
"logps/rejected": -0.9600175619125366,
"loss": 0.8352,
"odds_ratio_loss": 0.6180769205093384,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07733818888664246,
"rewards/margins": 0.018663574010133743,
"rewards/rejected": -0.0960017591714859,
"sft_loss": 0.7733818888664246,
"step": 1030
},
{
"epoch": 1.849299844409869,
"grad_norm": 2.7811410427093506,
"learning_rate": 1.6029421908964305e-06,
"logits/chosen": -3.0989787578582764,
"logits/rejected": -3.1128220558166504,
"logps/chosen": -0.7662326693534851,
"logps/rejected": -1.2116987705230713,
"loss": 0.8252,
"odds_ratio_loss": 0.5896168351173401,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07662326097488403,
"rewards/margins": 0.044546615332365036,
"rewards/rejected": -0.12116988748311996,
"sft_loss": 0.7662326693534851,
"step": 1040
},
{
"epoch": 1.8670815736830408,
"grad_norm": 2.588897466659546,
"learning_rate": 1.559616555157985e-06,
"logits/chosen": -3.1540348529815674,
"logits/rejected": -3.1318535804748535,
"logps/chosen": -0.8036566972732544,
"logps/rejected": -0.9966574907302856,
"loss": 0.8694,
"odds_ratio_loss": 0.656964123249054,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08036566525697708,
"rewards/margins": 0.019300078973174095,
"rewards/rejected": -0.09966574609279633,
"sft_loss": 0.8036566972732544,
"step": 1050
},
{
"epoch": 1.8848633029562125,
"grad_norm": 3.195645332336426,
"learning_rate": 1.516617453512252e-06,
"logits/chosen": -3.133869171142578,
"logits/rejected": -3.1599550247192383,
"logps/chosen": -0.8567641377449036,
"logps/rejected": -0.9691047668457031,
"loss": 0.9289,
"odds_ratio_loss": 0.7213839888572693,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0856764167547226,
"rewards/margins": 0.011234072968363762,
"rewards/rejected": -0.0969104915857315,
"sft_loss": 0.8567641377449036,
"step": 1060
},
{
"epoch": 1.9026450322293842,
"grad_norm": 3.544257164001465,
"learning_rate": 1.473959816754449e-06,
"logits/chosen": -3.1071698665618896,
"logits/rejected": -3.119621992111206,
"logps/chosen": -0.8016077280044556,
"logps/rejected": -0.9158931970596313,
"loss": 0.8714,
"odds_ratio_loss": 0.6980700492858887,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08016077429056168,
"rewards/margins": 0.011428546160459518,
"rewards/rejected": -0.09158932417631149,
"sft_loss": 0.8016077280044556,
"step": 1070
},
{
"epoch": 1.920426761502556,
"grad_norm": 2.2053537368774414,
"learning_rate": 1.4316584571112213e-06,
"logits/chosen": -3.1642978191375732,
"logits/rejected": -3.1734910011291504,
"logps/chosen": -0.8405769467353821,
"logps/rejected": -0.9534690976142883,
"loss": 0.9088,
"odds_ratio_loss": 0.6824837327003479,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08405770361423492,
"rewards/margins": 0.011289209127426147,
"rewards/rejected": -0.09534691274166107,
"sft_loss": 0.8405769467353821,
"step": 1080
},
{
"epoch": 1.938208490775728,
"grad_norm": 3.7732386589050293,
"learning_rate": 1.389728063097306e-06,
"logits/chosen": -3.134726047515869,
"logits/rejected": -3.1553549766540527,
"logps/chosen": -0.8409829139709473,
"logps/rejected": -1.037058711051941,
"loss": 0.9054,
"odds_ratio_loss": 0.6446219682693481,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08409829437732697,
"rewards/margins": 0.019607581198215485,
"rewards/rejected": -0.10370586812496185,
"sft_loss": 0.8409829139709473,
"step": 1090
},
{
"epoch": 1.9559902200488999,
"grad_norm": 1.9941437244415283,
"learning_rate": 1.348183194415179e-06,
"logits/chosen": -3.12330961227417,
"logits/rejected": -3.0894432067871094,
"logps/chosen": -0.8183244466781616,
"logps/rejected": -1.0717554092407227,
"loss": 0.8794,
"odds_ratio_loss": 0.6103757619857788,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.08183245360851288,
"rewards/margins": 0.025343094021081924,
"rewards/rejected": -0.1071755513548851,
"sft_loss": 0.8183244466781616,
"step": 1100
},
{
"epoch": 1.9737719493220716,
"grad_norm": 2.1723690032958984,
"learning_rate": 1.3070382768994015e-06,
"logits/chosen": -3.1375839710235596,
"logits/rejected": -3.1476972103118896,
"logps/chosen": -0.8107814788818359,
"logps/rejected": -0.9439038038253784,
"loss": 0.8756,
"odds_ratio_loss": 0.6477454900741577,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.08107815682888031,
"rewards/margins": 0.01331222616136074,
"rewards/rejected": -0.0943903774023056,
"sft_loss": 0.8107814788818359,
"step": 1110
},
{
"epoch": 1.9915536785952432,
"grad_norm": 2.5445873737335205,
"learning_rate": 1.2663075975074746e-06,
"logits/chosen": -3.1265861988067627,
"logits/rejected": -3.1423563957214355,
"logps/chosen": -0.79461270570755,
"logps/rejected": -1.0579864978790283,
"loss": 0.8606,
"odds_ratio_loss": 0.6598424911499023,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07946126163005829,
"rewards/margins": 0.026337390765547752,
"rewards/rejected": -0.10579866170883179,
"sft_loss": 0.79461270570755,
"step": 1120
},
{
"epoch": 2.009335407868415,
"grad_norm": 5.374589920043945,
"learning_rate": 1.2260052993589034e-06,
"logits/chosen": -3.117713212966919,
"logits/rejected": -3.1392993927001953,
"logps/chosen": -0.9073926210403442,
"logps/rejected": -0.9984840154647827,
"loss": 0.9829,
"odds_ratio_loss": 0.7546229362487793,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.09073926508426666,
"rewards/margins": 0.009109143167734146,
"rewards/rejected": -0.09984840452671051,
"sft_loss": 0.9073926210403442,
"step": 1130
},
{
"epoch": 2.027117137141587,
"grad_norm": 1.554049015045166,
"learning_rate": 1.1861453768242099e-06,
"logits/chosen": -3.1674160957336426,
"logits/rejected": -3.16347599029541,
"logps/chosen": -0.730399489402771,
"logps/rejected": -0.9565631747245789,
"loss": 0.7897,
"odds_ratio_loss": 0.5933364629745483,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.0730399638414383,
"rewards/margins": 0.02261636219918728,
"rewards/rejected": -0.09565632045269012,
"sft_loss": 0.730399489402771,
"step": 1140
},
{
"epoch": 2.044898866414759,
"grad_norm": 10.319910049438477,
"learning_rate": 1.1467416706655982e-06,
"logits/chosen": -3.141704559326172,
"logits/rejected": -3.173985719680786,
"logps/chosen": -0.8747559785842896,
"logps/rejected": -1.045388102531433,
"loss": 0.9448,
"odds_ratio_loss": 0.7000676989555359,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.08747559785842896,
"rewards/margins": 0.017063220962882042,
"rewards/rejected": -0.10453881323337555,
"sft_loss": 0.8747559785842896,
"step": 1150
},
{
"epoch": 2.062680595687931,
"grad_norm": 2.64601993560791,
"learning_rate": 1.1078078632309559e-06,
"logits/chosen": -3.1251769065856934,
"logits/rejected": -3.154083251953125,
"logps/chosen": -0.7768465280532837,
"logps/rejected": -0.9674509167671204,
"loss": 0.8405,
"odds_ratio_loss": 0.6370204091072083,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0776846632361412,
"rewards/margins": 0.01906043104827404,
"rewards/rejected": -0.0967450961470604,
"sft_loss": 0.7768465280532837,
"step": 1160
},
{
"epoch": 2.0804623249611023,
"grad_norm": 8.88864517211914,
"learning_rate": 1.0693574737028627e-06,
"logits/chosen": -3.1327998638153076,
"logits/rejected": -3.1513829231262207,
"logps/chosen": -0.811104416847229,
"logps/rejected": -0.959033191204071,
"loss": 0.8792,
"odds_ratio_loss": 0.6805382966995239,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08111042529344559,
"rewards/margins": 0.01479288749396801,
"rewards/rejected": -0.09590331465005875,
"sft_loss": 0.811104416847229,
"step": 1170
},
{
"epoch": 2.098244054234274,
"grad_norm": 2.3200793266296387,
"learning_rate": 1.0314038534042586e-06,
"logits/chosen": -3.154930830001831,
"logits/rejected": -3.1501238346099854,
"logps/chosen": -0.7636488676071167,
"logps/rejected": -0.9957521557807922,
"loss": 0.8285,
"odds_ratio_loss": 0.6481651067733765,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.07636488974094391,
"rewards/margins": 0.023210326209664345,
"rewards/rejected": -0.0995752140879631,
"sft_loss": 0.7636488676071167,
"step": 1180
},
{
"epoch": 2.116025783507446,
"grad_norm": 3.470479965209961,
"learning_rate": 9.939601811623946e-07,
"logits/chosen": -3.1409804821014404,
"logits/rejected": -3.14042329788208,
"logps/chosen": -0.813196063041687,
"logps/rejected": -0.9785780906677246,
"loss": 0.8813,
"odds_ratio_loss": 0.6813095808029175,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08131961524486542,
"rewards/margins": 0.01653820462524891,
"rewards/rejected": -0.09785781800746918,
"sft_loss": 0.813196063041687,
"step": 1190
},
{
"epoch": 2.133807512780618,
"grad_norm": 2.298424005508423,
"learning_rate": 9.570394587326825e-07,
"logits/chosen": -3.1406095027923584,
"logits/rejected": -3.138267993927002,
"logps/chosen": -0.7988274693489075,
"logps/rejected": -1.0399543046951294,
"loss": 0.8608,
"odds_ratio_loss": 0.6200910806655884,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07988274842500687,
"rewards/margins": 0.024112680926918983,
"rewards/rejected": -0.1039954274892807,
"sft_loss": 0.7988274693489075,
"step": 1200
},
{
"epoch": 2.15158924205379,
"grad_norm": 1.9331620931625366,
"learning_rate": 9.206545062840302e-07,
"logits/chosen": -3.181776285171509,
"logits/rejected": -3.1430013179779053,
"logps/chosen": -0.7699235081672668,
"logps/rejected": -1.0029253959655762,
"loss": 0.831,
"odds_ratio_loss": 0.6103402376174927,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07699234783649445,
"rewards/margins": 0.02330019325017929,
"rewards/rejected": -0.10029254108667374,
"sft_loss": 0.7699235081672668,
"step": 1210
},
{
"epoch": 2.1693709713269618,
"grad_norm": 1.9117600917816162,
"learning_rate": 8.848179579472285e-07,
"logits/chosen": -3.16937518119812,
"logits/rejected": -3.171738862991333,
"logps/chosen": -0.7665938138961792,
"logps/rejected": -0.8684927821159363,
"loss": 0.8349,
"odds_ratio_loss": 0.6835185885429382,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.07665937393903732,
"rewards/margins": 0.010189898312091827,
"rewards/rejected": -0.08684928715229034,
"sft_loss": 0.7665938138961792,
"step": 1220
},
{
"epoch": 2.1871527006001332,
"grad_norm": 5.053982734680176,
"learning_rate": 8.495422574279403e-07,
"logits/chosen": -3.1011910438537598,
"logits/rejected": -3.092721462249756,
"logps/chosen": -0.7057160139083862,
"logps/rejected": -0.9676464796066284,
"loss": 0.7646,
"odds_ratio_loss": 0.5885148048400879,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07057160139083862,
"rewards/margins": 0.026193052530288696,
"rewards/rejected": -0.09676466137170792,
"sft_loss": 0.7057160139083862,
"step": 1230
},
{
"epoch": 2.204934429873305,
"grad_norm": 2.2379298210144043,
"learning_rate": 8.148396536858063e-07,
"logits/chosen": -3.1442742347717285,
"logits/rejected": -3.1458396911621094,
"logps/chosen": -0.8305691480636597,
"logps/rejected": -1.0573723316192627,
"loss": 0.8959,
"odds_ratio_loss": 0.6531893610954285,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08305691182613373,
"rewards/margins": 0.02268032357096672,
"rewards/rejected": -0.10573724657297134,
"sft_loss": 0.8305691480636597,
"step": 1240
},
{
"epoch": 2.222716159146477,
"grad_norm": 2.2036707401275635,
"learning_rate": 7.807221966811815e-07,
"logits/chosen": -3.1296424865722656,
"logits/rejected": -3.142879009246826,
"logps/chosen": -0.815384566783905,
"logps/rejected": -0.9788722991943359,
"loss": 0.8822,
"odds_ratio_loss": 0.6686090230941772,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.08153846114873886,
"rewards/margins": 0.01634877361357212,
"rewards/rejected": -0.09788723289966583,
"sft_loss": 0.815384566783905,
"step": 1250
},
{
"epoch": 2.240497888419649,
"grad_norm": 1.7958269119262695,
"learning_rate": 7.47201733190962e-07,
"logits/chosen": -3.1007513999938965,
"logits/rejected": -3.1123993396759033,
"logps/chosen": -0.7537363767623901,
"logps/rejected": -0.9363679885864258,
"loss": 0.8162,
"odds_ratio_loss": 0.6245176792144775,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.07537363469600677,
"rewards/margins": 0.018263157457113266,
"rewards/rejected": -0.09363678842782974,
"sft_loss": 0.7537363767623901,
"step": 1260
},
{
"epoch": 2.258279617692821,
"grad_norm": 2.2002153396606445,
"learning_rate": 7.142899026949721e-07,
"logits/chosen": -3.181532382965088,
"logits/rejected": -3.189258098602295,
"logps/chosen": -0.7867833971977234,
"logps/rejected": -0.9312666654586792,
"loss": 0.8504,
"odds_ratio_loss": 0.6360999941825867,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07867833971977234,
"rewards/margins": 0.01444832980632782,
"rewards/rejected": -0.09312666952610016,
"sft_loss": 0.7867833971977234,
"step": 1270
},
{
"epoch": 2.2760613469659923,
"grad_norm": 5.216893672943115,
"learning_rate": 6.819981333343273e-07,
"logits/chosen": -3.0660347938537598,
"logits/rejected": -3.095858097076416,
"logps/chosen": -0.7724840641021729,
"logps/rejected": -0.9914291501045227,
"loss": 0.8347,
"odds_ratio_loss": 0.6223303079605103,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07724839448928833,
"rewards/margins": 0.021894508972764015,
"rewards/rejected": -0.09914292395114899,
"sft_loss": 0.7724840641021729,
"step": 1280
},
{
"epoch": 2.293843076239164,
"grad_norm": 2.3061211109161377,
"learning_rate": 6.503376379431839e-07,
"logits/chosen": -3.1206648349761963,
"logits/rejected": -3.1211998462677,
"logps/chosen": -0.8609710931777954,
"logps/rejected": -0.918415904045105,
"loss": 0.9299,
"odds_ratio_loss": 0.6894447803497314,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0860971063375473,
"rewards/margins": 0.005744467489421368,
"rewards/rejected": -0.09184158593416214,
"sft_loss": 0.8609710931777954,
"step": 1290
},
{
"epoch": 2.311624805512336,
"grad_norm": 1.7814314365386963,
"learning_rate": 6.193194101552502e-07,
"logits/chosen": -3.126063346862793,
"logits/rejected": -3.1108012199401855,
"logps/chosen": -0.7555452585220337,
"logps/rejected": -1.0133960247039795,
"loss": 0.8139,
"odds_ratio_loss": 0.5832154154777527,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.07555452734231949,
"rewards/margins": 0.025785094127058983,
"rewards/rejected": -0.10133961588144302,
"sft_loss": 0.7555452585220337,
"step": 1300
},
{
"epoch": 2.329406534785508,
"grad_norm": 3.5369389057159424,
"learning_rate": 5.889542205864083e-07,
"logits/chosen": -3.1118927001953125,
"logits/rejected": -3.1322388648986816,
"logps/chosen": -0.809761643409729,
"logps/rejected": -1.0040373802185059,
"loss": 0.873,
"odds_ratio_loss": 0.6319769620895386,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.08097617328166962,
"rewards/margins": 0.019427578896284103,
"rewards/rejected": -0.10040374100208282,
"sft_loss": 0.809761643409729,
"step": 1310
},
{
"epoch": 2.34718826405868,
"grad_norm": 1.8906471729278564,
"learning_rate": 5.592526130947862e-07,
"logits/chosen": -3.094481945037842,
"logits/rejected": -3.1228842735290527,
"logps/chosen": -0.8294750452041626,
"logps/rejected": -1.0103859901428223,
"loss": 0.8988,
"odds_ratio_loss": 0.6931812763214111,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.0829475075006485,
"rewards/margins": 0.018091093748807907,
"rewards/rejected": -0.10103859752416611,
"sft_loss": 0.8294750452041626,
"step": 1320
},
{
"epoch": 2.3649699933318518,
"grad_norm": 2.300255298614502,
"learning_rate": 5.302249011195507e-07,
"logits/chosen": -3.091862916946411,
"logits/rejected": -3.1117231845855713,
"logps/chosen": -0.7831630110740662,
"logps/rejected": -0.9011325836181641,
"loss": 0.8506,
"odds_ratio_loss": 0.6743569374084473,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07831630855798721,
"rewards/margins": 0.01179695688188076,
"rewards/rejected": -0.09011325985193253,
"sft_loss": 0.7831630110740662,
"step": 1330
},
{
"epoch": 2.382751722605023,
"grad_norm": 2.0519402027130127,
"learning_rate": 5.018811640997307e-07,
"logits/chosen": -3.1082584857940674,
"logits/rejected": -3.143366813659668,
"logps/chosen": -0.8331505656242371,
"logps/rejected": -1.1331783533096313,
"loss": 0.8955,
"odds_ratio_loss": 0.6230587363243103,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.08331505209207535,
"rewards/margins": 0.030002791434526443,
"rewards/rejected": -0.1133178323507309,
"sft_loss": 0.8331505656242371,
"step": 1340
},
{
"epoch": 2.400533451878195,
"grad_norm": 2.004222869873047,
"learning_rate": 4.7423124397427105e-07,
"logits/chosen": -3.0787928104400635,
"logits/rejected": -3.1223533153533936,
"logps/chosen": -0.8188081979751587,
"logps/rejected": -0.9587352871894836,
"loss": 0.8855,
"odds_ratio_loss": 0.6670054197311401,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.08188082277774811,
"rewards/margins": 0.013992704451084137,
"rewards/rejected": -0.09587351977825165,
"sft_loss": 0.8188081979751587,
"step": 1350
},
{
"epoch": 2.418315181151367,
"grad_norm": 2.007474422454834,
"learning_rate": 4.472847417645787e-07,
"logits/chosen": -3.1503520011901855,
"logits/rejected": -3.1351065635681152,
"logps/chosen": -0.8101593255996704,
"logps/rejected": -1.086388349533081,
"loss": 0.8703,
"odds_ratio_loss": 0.6015632748603821,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.0810159370303154,
"rewards/margins": 0.027622899040579796,
"rewards/rejected": -0.10863884538412094,
"sft_loss": 0.8101593255996704,
"step": 1360
},
{
"epoch": 2.436096910424539,
"grad_norm": 1.4029066562652588,
"learning_rate": 4.210510142406993e-07,
"logits/chosen": -3.122816562652588,
"logits/rejected": -3.095937490463257,
"logps/chosen": -0.7846102714538574,
"logps/rejected": -1.0122342109680176,
"loss": 0.8472,
"odds_ratio_loss": 0.626270055770874,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07846103608608246,
"rewards/margins": 0.022762387990951538,
"rewards/rejected": -0.101223424077034,
"sft_loss": 0.7846102714538574,
"step": 1370
},
{
"epoch": 2.4538786396977104,
"grad_norm": 1.7324745655059814,
"learning_rate": 3.9553917067232966e-07,
"logits/chosen": -3.122037172317505,
"logits/rejected": -3.1394925117492676,
"logps/chosen": -0.8041805028915405,
"logps/rejected": -0.9916130900382996,
"loss": 0.87,
"odds_ratio_loss": 0.658187210559845,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08041805773973465,
"rewards/margins": 0.018743254244327545,
"rewards/rejected": -0.0991613045334816,
"sft_loss": 0.8041805028915405,
"step": 1380
},
{
"epoch": 2.4716603689708823,
"grad_norm": 2.2863593101501465,
"learning_rate": 3.707580696657509e-07,
"logits/chosen": -3.118274450302124,
"logits/rejected": -3.109182834625244,
"logps/chosen": -0.7898752689361572,
"logps/rejected": -0.945044219493866,
"loss": 0.8546,
"odds_ratio_loss": 0.6472212672233582,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.07898753136396408,
"rewards/margins": 0.015516892075538635,
"rewards/rejected": -0.09450441598892212,
"sft_loss": 0.7898752689361572,
"step": 1390
},
{
"epoch": 2.489442098244054,
"grad_norm": 2.1385016441345215,
"learning_rate": 3.4671631608781815e-07,
"logits/chosen": -3.125810384750366,
"logits/rejected": -3.1365230083465576,
"logps/chosen": -0.8170459866523743,
"logps/rejected": -1.0128613710403442,
"loss": 0.8858,
"odds_ratio_loss": 0.6880038380622864,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.08170458674430847,
"rewards/margins": 0.019581545144319534,
"rewards/rejected": -0.1012861356139183,
"sft_loss": 0.8170459866523743,
"step": 1400
},
{
"epoch": 2.507223827517226,
"grad_norm": 2.561035633087158,
"learning_rate": 3.234222580780405e-07,
"logits/chosen": -3.1027114391326904,
"logits/rejected": -3.124307632446289,
"logps/chosen": -0.7941089868545532,
"logps/rejected": -0.9503856897354126,
"loss": 0.8579,
"odds_ratio_loss": 0.6381289958953857,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07941089570522308,
"rewards/margins": 0.015627671033143997,
"rewards/rejected": -0.09503857046365738,
"sft_loss": 0.7941089868545532,
"step": 1410
},
{
"epoch": 2.525005556790398,
"grad_norm": 2.1497950553894043,
"learning_rate": 3.0088398414982375e-07,
"logits/chosen": -3.0808122158050537,
"logits/rejected": -3.118448257446289,
"logps/chosen": -0.8251555562019348,
"logps/rejected": -1.0561994314193726,
"loss": 0.8918,
"odds_ratio_loss": 0.6662226319313049,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.08251555263996124,
"rewards/margins": 0.023104388266801834,
"rewards/rejected": -0.10561992973089218,
"sft_loss": 0.8251555562019348,
"step": 1420
},
{
"epoch": 2.54278728606357,
"grad_norm": 2.1733312606811523,
"learning_rate": 2.7910932038184487e-07,
"logits/chosen": -3.0798656940460205,
"logits/rejected": -3.0586531162261963,
"logps/chosen": -0.8029570579528809,
"logps/rejected": -0.9888774156570435,
"loss": 0.8665,
"odds_ratio_loss": 0.6356409192085266,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.08029570430517197,
"rewards/margins": 0.018592040985822678,
"rewards/rejected": -0.09888775646686554,
"sft_loss": 0.8029570579528809,
"step": 1430
},
{
"epoch": 2.5605690153367417,
"grad_norm": 2.0504164695739746,
"learning_rate": 2.5810582770057325e-07,
"logits/chosen": -3.1239726543426514,
"logits/rejected": -3.1643452644348145,
"logps/chosen": -0.7773226499557495,
"logps/rejected": -0.9956240653991699,
"loss": 0.8421,
"odds_ratio_loss": 0.6477575302124023,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.07773226499557495,
"rewards/margins": 0.02183014526963234,
"rewards/rejected": -0.09956242144107819,
"sft_loss": 0.7773226499557495,
"step": 1440
},
{
"epoch": 2.578350744609913,
"grad_norm": 2.4383292198181152,
"learning_rate": 2.3788079925484402e-07,
"logits/chosen": -3.1351797580718994,
"logits/rejected": -3.1292059421539307,
"logps/chosen": -0.8360335230827332,
"logps/rejected": -0.9335094690322876,
"loss": 0.9052,
"odds_ratio_loss": 0.6920183300971985,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.08360335975885391,
"rewards/margins": 0.009747589938342571,
"rewards/rejected": -0.09335094690322876,
"sft_loss": 0.8360335230827332,
"step": 1450
},
{
"epoch": 2.596132473883085,
"grad_norm": 2.4566612243652344,
"learning_rate": 2.1844125788342661e-07,
"logits/chosen": -3.108156681060791,
"logits/rejected": -3.1151247024536133,
"logps/chosen": -0.7554203867912292,
"logps/rejected": -1.1023544073104858,
"loss": 0.8163,
"odds_ratio_loss": 0.6091145277023315,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07554203271865845,
"rewards/margins": 0.034693412482738495,
"rewards/rejected": -0.11023545265197754,
"sft_loss": 0.7554203867912292,
"step": 1460
},
{
"epoch": 2.613914203156257,
"grad_norm": 2.3035502433776855,
"learning_rate": 1.9979395367644428e-07,
"logits/chosen": -3.143681049346924,
"logits/rejected": -3.1587703227996826,
"logps/chosen": -0.7682673931121826,
"logps/rejected": -0.9972553253173828,
"loss": 0.8278,
"odds_ratio_loss": 0.5949780344963074,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.07682673633098602,
"rewards/margins": 0.02289879135787487,
"rewards/rejected": -0.09972552955150604,
"sft_loss": 0.7682673931121826,
"step": 1470
},
{
"epoch": 2.631695932429429,
"grad_norm": 3.8865010738372803,
"learning_rate": 1.81945361631512e-07,
"logits/chosen": -3.1585988998413086,
"logits/rejected": -3.178792715072632,
"logps/chosen": -0.8142994046211243,
"logps/rejected": -0.951032817363739,
"loss": 0.8828,
"odds_ratio_loss": 0.6848469972610474,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.08142994344234467,
"rewards/margins": 0.013673332519829273,
"rewards/rejected": -0.09510327875614166,
"sft_loss": 0.8142994046211243,
"step": 1480
},
{
"epoch": 2.6494776617026004,
"grad_norm": 3.1666336059570312,
"learning_rate": 1.6490167940538343e-07,
"logits/chosen": -3.154137372970581,
"logits/rejected": -3.1491308212280273,
"logps/chosen": -0.7683095932006836,
"logps/rejected": -1.0064373016357422,
"loss": 0.8283,
"odds_ratio_loss": 0.6001058220863342,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.07683096826076508,
"rewards/margins": 0.023812763392925262,
"rewards/rejected": -0.10064373165369034,
"sft_loss": 0.7683095932006836,
"step": 1490
},
{
"epoch": 2.6672593909757722,
"grad_norm": 9.307645797729492,
"learning_rate": 1.4866882516191339e-07,
"logits/chosen": -3.0799524784088135,
"logits/rejected": -3.1244568824768066,
"logps/chosen": -0.8257862329483032,
"logps/rejected": -1.011817216873169,
"loss": 0.8923,
"odds_ratio_loss": 0.6649435758590698,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.0825786143541336,
"rewards/margins": 0.018603112548589706,
"rewards/rejected": -0.10118173062801361,
"sft_loss": 0.8257862329483032,
"step": 1500
},
{
"epoch": 2.6672593909757722,
"eval_logits/chosen": -3.1171207427978516,
"eval_logits/rejected": -3.145516872406006,
"eval_logps/chosen": -0.8098240494728088,
"eval_logps/rejected": -1.0174543857574463,
"eval_loss": 0.8734214901924133,
"eval_odds_ratio_loss": 0.6359757781028748,
"eval_rewards/accuracies": 0.5720000267028809,
"eval_rewards/chosen": -0.08098240196704865,
"eval_rewards/margins": 0.02076304331421852,
"eval_rewards/rejected": -0.10174543410539627,
"eval_runtime": 237.6146,
"eval_samples_per_second": 4.208,
"eval_sft_loss": 0.8098240494728088,
"eval_steps_per_second": 2.104,
"step": 1500
},
{
"epoch": 2.685041120248944,
"grad_norm": 4.906961441040039,
"learning_rate": 1.3325243551706057e-07,
"logits/chosen": -3.0958564281463623,
"logits/rejected": -3.1364972591400146,
"logps/chosen": -0.7746607065200806,
"logps/rejected": -1.0890486240386963,
"loss": 0.834,
"odds_ratio_loss": 0.5929327607154846,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.07746607810258865,
"rewards/margins": 0.03143879026174545,
"rewards/rejected": -0.10890486091375351,
"sft_loss": 0.7746607065200806,
"step": 1510
},
{
"epoch": 2.702822849522116,
"grad_norm": 8.813859939575195,
"learning_rate": 1.1865786358165737e-07,
"logits/chosen": -3.1081910133361816,
"logits/rejected": -3.160123586654663,
"logps/chosen": -0.7778806686401367,
"logps/rejected": -0.9513812065124512,
"loss": 0.842,
"odds_ratio_loss": 0.6413436532020569,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.07778806984424591,
"rewards/margins": 0.017350060865283012,
"rewards/rejected": -0.09513812512159348,
"sft_loss": 0.7778806686401367,
"step": 1520
},
{
"epoch": 2.720604578795288,
"grad_norm": 5.624754428863525,
"learning_rate": 1.0489017710262311e-07,
"logits/chosen": -3.1469523906707764,
"logits/rejected": -3.178358554840088,
"logps/chosen": -0.8407548666000366,
"logps/rejected": -1.1098471879959106,
"loss": 0.9062,
"odds_ratio_loss": 0.6548250317573547,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.08407549560070038,
"rewards/margins": 0.026909226551651955,
"rewards/rejected": -0.11098472028970718,
"sft_loss": 0.8407548666000366,
"step": 1530
},
{
"epoch": 2.73838630806846,
"grad_norm": 3.942481756210327,
"learning_rate": 9.195415670326446e-08,
"logits/chosen": -3.13153076171875,
"logits/rejected": -3.1526730060577393,
"logps/chosen": -0.8119581341743469,
"logps/rejected": -1.0097941160202026,
"loss": 0.8766,
"odds_ratio_loss": 0.6465227007865906,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.08119582384824753,
"rewards/margins": 0.019783606752753258,
"rewards/rejected": -0.10097942501306534,
"sft_loss": 0.8119581341743469,
"step": 1540
},
{
"epoch": 2.7561680373416317,
"grad_norm": 3.0953104496002197,
"learning_rate": 7.985429422327384e-08,
"logits/chosen": -3.1453542709350586,
"logits/rejected": -3.165792226791382,
"logps/chosen": -0.8054162263870239,
"logps/rejected": -0.9290239214897156,
"loss": 0.875,
"odds_ratio_loss": 0.6959558129310608,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.08054163306951523,
"rewards/margins": 0.01236076932400465,
"rewards/rejected": -0.09290239959955215,
"sft_loss": 0.8054162263870239,
"step": 1550
},
{
"epoch": 2.773949766614803,
"grad_norm": 1.9901708364486694,
"learning_rate": 6.859479115900818e-08,
"logits/chosen": -3.118248224258423,
"logits/rejected": -3.158768892288208,
"logps/chosen": -0.7784756422042847,
"logps/rejected": -0.9923986196517944,
"loss": 0.8408,
"odds_ratio_loss": 0.6230874061584473,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.07784756273031235,
"rewards/margins": 0.021392302587628365,
"rewards/rejected": -0.09923987090587616,
"sft_loss": 0.7784756422042847,
"step": 1560
},
{
"epoch": 2.791731495887975,
"grad_norm": 10.119742393493652,
"learning_rate": 5.817955720457902e-08,
"logits/chosen": -3.107785701751709,
"logits/rejected": -3.1253585815429688,
"logps/chosen": -0.8034731149673462,
"logps/rejected": -0.9698166847229004,
"loss": 0.8686,
"odds_ratio_loss": 0.6507803201675415,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.08034731447696686,
"rewards/margins": 0.016634367406368256,
"rewards/rejected": -0.09698168933391571,
"sft_loss": 0.8034731149673462,
"step": 1570
},
{
"epoch": 2.809513225161147,
"grad_norm": 3.319011926651001,
"learning_rate": 4.861220889427199e-08,
"logits/chosen": -3.1124069690704346,
"logits/rejected": -3.131798505783081,
"logps/chosen": -0.8197135925292969,
"logps/rejected": -0.9885567426681519,
"loss": 0.887,
"odds_ratio_loss": 0.6726602911949158,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.08197136968374252,
"rewards/margins": 0.01688431203365326,
"rewards/rejected": -0.09885567426681519,
"sft_loss": 0.8197135925292969,
"step": 1580
},
{
"epoch": 2.827294954434319,
"grad_norm": 2.032493829727173,
"learning_rate": 3.9896068346758074e-08,
"logits/chosen": -3.134514331817627,
"logits/rejected": -3.1544039249420166,
"logps/chosen": -0.8260439038276672,
"logps/rejected": -0.9724828004837036,
"loss": 0.8923,
"odds_ratio_loss": 0.6625251770019531,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.08260440081357956,
"rewards/margins": 0.01464388333261013,
"rewards/rejected": -0.09724827855825424,
"sft_loss": 0.8260439038276672,
"step": 1590
},
{
"epoch": 2.8450766837074903,
"grad_norm": 4.936295986175537,
"learning_rate": 3.203416211153832e-08,
"logits/chosen": -3.1045830249786377,
"logits/rejected": -3.163196086883545,
"logps/chosen": -0.8155530691146851,
"logps/rejected": -0.9749042391777039,
"loss": 0.8845,
"odds_ratio_loss": 0.6895264983177185,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0815553218126297,
"rewards/margins": 0.015935102477669716,
"rewards/rejected": -0.09749042987823486,
"sft_loss": 0.8155530691146851,
"step": 1600
},
{
"epoch": 2.8628584129806622,
"grad_norm": 3.0522594451904297,
"learning_rate": 2.5029220118019393e-08,
"logits/chosen": -3.0816335678100586,
"logits/rejected": -3.120738983154297,
"logps/chosen": -0.8227775692939758,
"logps/rejected": -0.9608209729194641,
"loss": 0.8883,
"odds_ratio_loss": 0.6554335355758667,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.08227775990962982,
"rewards/margins": 0.013804334215819836,
"rewards/rejected": -0.09608209133148193,
"sft_loss": 0.8227775692939758,
"step": 1610
},
{
"epoch": 2.880640142253834,
"grad_norm": 6.9008331298828125,
"learning_rate": 1.8883674727586122e-08,
"logits/chosen": -3.120664119720459,
"logits/rejected": -3.1391146183013916,
"logps/chosen": -0.7796843647956848,
"logps/rejected": -1.048107385635376,
"loss": 0.8417,
"odds_ratio_loss": 0.620618999004364,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.07796843349933624,
"rewards/margins": 0.026842307299375534,
"rewards/rejected": -0.10481073707342148,
"sft_loss": 0.7796843647956848,
"step": 1620
},
{
"epoch": 2.898421871527006,
"grad_norm": 1.4706188440322876,
"learning_rate": 1.3599659889000639e-08,
"logits/chosen": -3.1607601642608643,
"logits/rejected": -3.1758437156677246,
"logps/chosen": -0.7644230723381042,
"logps/rejected": -0.8909217715263367,
"loss": 0.8322,
"odds_ratio_loss": 0.6777127981185913,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.07644230127334595,
"rewards/margins": 0.01264987699687481,
"rewards/rejected": -0.0890921801328659,
"sft_loss": 0.7644230723381042,
"step": 1630
},
{
"epoch": 2.916203600800178,
"grad_norm": 1.4733339548110962,
"learning_rate": 9.179010397421528e-09,
"logits/chosen": -3.1298046112060547,
"logits/rejected": -3.1558828353881836,
"logps/chosen": -0.7814117670059204,
"logps/rejected": -0.9674129486083984,
"loss": 0.8469,
"odds_ratio_loss": 0.6549249887466431,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.07814116775989532,
"rewards/margins": 0.01860012486577034,
"rewards/rejected": -0.09674130380153656,
"sft_loss": 0.7814117670059204,
"step": 1640
},
{
"epoch": 2.93398533007335,
"grad_norm": 1.6659821271896362,
"learning_rate": 5.623261257296509e-09,
"logits/chosen": -3.100876569747925,
"logits/rejected": -3.1546549797058105,
"logps/chosen": -0.7405164241790771,
"logps/rejected": -0.9196218252182007,
"loss": 0.8037,
"odds_ratio_loss": 0.6319458484649658,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.0740516409277916,
"rewards/margins": 0.017910538241267204,
"rewards/rejected": -0.09196218103170395,
"sft_loss": 0.7405164241790771,
"step": 1650
},
{
"epoch": 2.9517670593465217,
"grad_norm": 2.577908754348755,
"learning_rate": 2.933647149357122e-09,
"logits/chosen": -3.1165366172790527,
"logits/rejected": -3.137150764465332,
"logps/chosen": -0.7795825600624084,
"logps/rejected": -0.9781227111816406,
"loss": 0.8445,
"odds_ratio_loss": 0.648685097694397,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.07795824855566025,
"rewards/margins": 0.01985403150320053,
"rewards/rejected": -0.09781228005886078,
"sft_loss": 0.7795825600624084,
"step": 1660
},
{
"epoch": 2.969548788619693,
"grad_norm": 2.166626453399658,
"learning_rate": 1.1111020018930717e-09,
"logits/chosen": -3.156930446624756,
"logits/rejected": -3.1462855339050293,
"logps/chosen": -0.8264468908309937,
"logps/rejected": -0.9435287714004517,
"loss": 0.8929,
"odds_ratio_loss": 0.6643285751342773,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.08264468610286713,
"rewards/margins": 0.011708182282745838,
"rewards/rejected": -0.09435287117958069,
"sft_loss": 0.8264468908309937,
"step": 1670
},
{
"epoch": 2.987330517892865,
"grad_norm": 2.1216466426849365,
"learning_rate": 1.5625866646051813e-10,
"logits/chosen": -3.1541049480438232,
"logits/rejected": -3.1485564708709717,
"logps/chosen": -0.779746413230896,
"logps/rejected": -0.9904630780220032,
"loss": 0.8399,
"odds_ratio_loss": 0.6012987494468689,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07797463238239288,
"rewards/margins": 0.021071670576930046,
"rewards/rejected": -0.09904631227254868,
"sft_loss": 0.779746413230896,
"step": 1680
},
{
"epoch": 2.997999555456768,
"step": 1686,
"total_flos": 1.9814178520144282e+18,
"train_loss": 0.8985705958283811,
"train_runtime": 25618.6466,
"train_samples_per_second": 1.054,
"train_steps_per_second": 0.066
}
],
"logging_steps": 10,
"max_steps": 1686,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.9814178520144282e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}