Mistral-7B-Instruct-v0.2-MI-6e-7 / trainer_state.json
tengxiao1
TX
8550315
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9987943737441393,
"eval_steps": 400,
"global_step": 466,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010716677829872739,
"grad_norm": 38.81959429763923,
"learning_rate": 6.382978723404255e-08,
"logits/chosen": -2.397952079772949,
"logits/rejected": -2.391846179962158,
"logps/chosen": -0.5666699409484863,
"logps/rejected": -0.5553711652755737,
"loss": 1.5469,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5666699409484863,
"rewards/margins": -0.01129874400794506,
"rewards/rejected": -0.5553711652755737,
"step": 5
},
{
"epoch": 0.021433355659745478,
"grad_norm": 17.957819802244767,
"learning_rate": 1.276595744680851e-07,
"logits/chosen": -2.402738571166992,
"logits/rejected": -2.3730971813201904,
"logps/chosen": -0.5517541766166687,
"logps/rejected": -0.5785264372825623,
"loss": 1.5538,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.5517541766166687,
"rewards/margins": 0.026772266253829002,
"rewards/rejected": -0.5785264372825623,
"step": 10
},
{
"epoch": 0.032150033489618215,
"grad_norm": 17.06492283094742,
"learning_rate": 1.9148936170212767e-07,
"logits/chosen": -2.4437928199768066,
"logits/rejected": -2.449697732925415,
"logps/chosen": -0.5636163353919983,
"logps/rejected": -0.5669411420822144,
"loss": 1.5619,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.5636163353919983,
"rewards/margins": 0.00332476943731308,
"rewards/rejected": -0.5669411420822144,
"step": 15
},
{
"epoch": 0.042866711319490956,
"grad_norm": 17.478232600769196,
"learning_rate": 2.553191489361702e-07,
"logits/chosen": -2.383941650390625,
"logits/rejected": -2.3943183422088623,
"logps/chosen": -0.5459321737289429,
"logps/rejected": -0.5427771806716919,
"loss": 1.5322,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.5459321737289429,
"rewards/margins": -0.00315500283613801,
"rewards/rejected": -0.5427771806716919,
"step": 20
},
{
"epoch": 0.0535833891493637,
"grad_norm": 14.134950451452564,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -2.2786340713500977,
"logits/rejected": -2.2805464267730713,
"logps/chosen": -0.5260549783706665,
"logps/rejected": -0.5430394411087036,
"loss": 1.5298,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.5260549783706665,
"rewards/margins": 0.016984451562166214,
"rewards/rejected": -0.5430394411087036,
"step": 25
},
{
"epoch": 0.06430006697923643,
"grad_norm": 19.57863908597214,
"learning_rate": 3.8297872340425535e-07,
"logits/chosen": -2.3897128105163574,
"logits/rejected": -2.4030909538269043,
"logps/chosen": -0.5465933680534363,
"logps/rejected": -0.5372768640518188,
"loss": 1.5509,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.5465933680534363,
"rewards/margins": -0.009316539391875267,
"rewards/rejected": -0.5372768640518188,
"step": 30
},
{
"epoch": 0.07501674480910918,
"grad_norm": 24.218016837268095,
"learning_rate": 4.4680851063829783e-07,
"logits/chosen": -2.453273296356201,
"logits/rejected": -2.424668788909912,
"logps/chosen": -0.5341351628303528,
"logps/rejected": -0.5890725255012512,
"loss": 1.5479,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.5341351628303528,
"rewards/margins": 0.054937295615673065,
"rewards/rejected": -0.5890725255012512,
"step": 35
},
{
"epoch": 0.08573342263898191,
"grad_norm": 20.81509422651472,
"learning_rate": 5.106382978723404e-07,
"logits/chosen": -2.3677382469177246,
"logits/rejected": -2.3493103981018066,
"logps/chosen": -0.565592885017395,
"logps/rejected": -0.5375810861587524,
"loss": 1.5573,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.565592885017395,
"rewards/margins": -0.02801181748509407,
"rewards/rejected": -0.5375810861587524,
"step": 40
},
{
"epoch": 0.09645010046885466,
"grad_norm": 17.50647386551691,
"learning_rate": 5.74468085106383e-07,
"logits/chosen": -2.3343653678894043,
"logits/rejected": -2.32906436920166,
"logps/chosen": -0.5402032732963562,
"logps/rejected": -0.5591766238212585,
"loss": 1.5421,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.5402032732963562,
"rewards/margins": 0.018973344936966896,
"rewards/rejected": -0.5591766238212585,
"step": 45
},
{
"epoch": 0.1071667782987274,
"grad_norm": 16.375037037224466,
"learning_rate": 5.999241095449976e-07,
"logits/chosen": -2.3641974925994873,
"logits/rejected": -2.3596482276916504,
"logps/chosen": -0.5401940941810608,
"logps/rejected": -0.525315523147583,
"loss": 1.5284,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.5401940941810608,
"rewards/margins": -0.01487857848405838,
"rewards/rejected": -0.525315523147583,
"step": 50
},
{
"epoch": 0.11788345612860013,
"grad_norm": 26.218018133925373,
"learning_rate": 5.994604735812144e-07,
"logits/chosen": -2.4210665225982666,
"logits/rejected": -2.424318790435791,
"logps/chosen": -0.565641462802887,
"logps/rejected": -0.5864871740341187,
"loss": 1.5392,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.565641462802887,
"rewards/margins": 0.020845741033554077,
"rewards/rejected": -0.5864871740341187,
"step": 55
},
{
"epoch": 0.12860013395847286,
"grad_norm": 42.6456644243847,
"learning_rate": 5.985760137627685e-07,
"logits/chosen": -2.325913906097412,
"logits/rejected": -2.3350510597229004,
"logps/chosen": -0.4915548861026764,
"logps/rejected": -0.5130532383918762,
"loss": 1.5405,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.4915548861026764,
"rewards/margins": 0.02149834856390953,
"rewards/rejected": -0.5130532383918762,
"step": 60
},
{
"epoch": 0.13931681178834562,
"grad_norm": 17.49728999516173,
"learning_rate": 5.972719729975655e-07,
"logits/chosen": -2.3687386512756348,
"logits/rejected": -2.3732752799987793,
"logps/chosen": -0.5264291167259216,
"logps/rejected": -0.5606903433799744,
"loss": 1.5441,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5264291167259216,
"rewards/margins": 0.03426120802760124,
"rewards/rejected": -0.5606903433799744,
"step": 65
},
{
"epoch": 0.15003348961821836,
"grad_norm": 19.04975638080615,
"learning_rate": 5.955501838194784e-07,
"logits/chosen": -2.2692012786865234,
"logits/rejected": -2.2734649181365967,
"logps/chosen": -0.5329629778862,
"logps/rejected": -0.585782527923584,
"loss": 1.5335,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5329629778862,
"rewards/margins": 0.05281956121325493,
"rewards/rejected": -0.585782527923584,
"step": 70
},
{
"epoch": 0.1607501674480911,
"grad_norm": 16.46150164067359,
"learning_rate": 5.934130658131361e-07,
"logits/chosen": -2.3084473609924316,
"logits/rejected": -2.303145408630371,
"logps/chosen": -0.4908691346645355,
"logps/rejected": -0.5239783525466919,
"loss": 1.5327,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4908691346645355,
"rewards/margins": 0.03310922160744667,
"rewards/rejected": -0.5239783525466919,
"step": 75
},
{
"epoch": 0.17146684527796383,
"grad_norm": 21.613971984342516,
"learning_rate": 5.908636222137454e-07,
"logits/chosen": -2.291396141052246,
"logits/rejected": -2.3133578300476074,
"logps/chosen": -0.48883646726608276,
"logps/rejected": -0.5628662109375,
"loss": 1.5301,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.48883646726608276,
"rewards/margins": 0.07402969151735306,
"rewards/rejected": -0.5628662109375,
"step": 80
},
{
"epoch": 0.18218352310783656,
"grad_norm": 22.2008339670987,
"learning_rate": 5.879054356867243e-07,
"logits/chosen": -2.328059673309326,
"logits/rejected": -2.3216350078582764,
"logps/chosen": -0.5081610679626465,
"logps/rejected": -0.5643308162689209,
"loss": 1.5335,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5081610679626465,
"rewards/margins": 0.05616975575685501,
"rewards/rejected": -0.5643308162689209,
"step": 85
},
{
"epoch": 0.19290020093770932,
"grad_norm": 19.64419890416597,
"learning_rate": 5.84542663293077e-07,
"logits/chosen": -2.272433280944824,
"logits/rejected": -2.2766337394714355,
"logps/chosen": -0.5117042660713196,
"logps/rejected": -0.558184802532196,
"loss": 1.5348,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5117042660713196,
"rewards/margins": 0.046480584889650345,
"rewards/rejected": -0.558184802532196,
"step": 90
},
{
"epoch": 0.20361687876758205,
"grad_norm": 17.22227696113398,
"learning_rate": 5.807800306475876e-07,
"logits/chosen": -2.3275113105773926,
"logits/rejected": -2.3395214080810547,
"logps/chosen": -0.6069667935371399,
"logps/rejected": -0.6348728537559509,
"loss": 1.5217,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.6069667935371399,
"rewards/margins": 0.027906125411391258,
"rewards/rejected": -0.6348728537559509,
"step": 95
},
{
"epoch": 0.2143335565974548,
"grad_norm": 24.396288528469587,
"learning_rate": 5.766228252780373e-07,
"logits/chosen": -2.368147373199463,
"logits/rejected": -2.377194881439209,
"logps/chosen": -0.5941327214241028,
"logps/rejected": -0.6152836680412292,
"loss": 1.5435,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.5941327214241028,
"rewards/margins": 0.0211509857326746,
"rewards/rejected": -0.6152836680412292,
"step": 100
},
{
"epoch": 0.22505023442732752,
"grad_norm": 16.23048110894587,
"learning_rate": 5.720768891947834e-07,
"logits/chosen": -2.3831636905670166,
"logits/rejected": -2.383808135986328,
"logps/chosen": -0.5236924886703491,
"logps/rejected": -0.5740348100662231,
"loss": 1.517,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.5236924886703491,
"rewards/margins": 0.05034228041768074,
"rewards/rejected": -0.5740348100662231,
"step": 105
},
{
"epoch": 0.23576691225720026,
"grad_norm": 22.95053096013821,
"learning_rate": 5.671486106811365e-07,
"logits/chosen": -2.4293274879455566,
"logits/rejected": -2.4386584758758545,
"logps/chosen": -0.5232604146003723,
"logps/rejected": -0.5748019218444824,
"loss": 1.54,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.5232604146003723,
"rewards/margins": 0.051541589200496674,
"rewards/rejected": -0.5748019218444824,
"step": 110
},
{
"epoch": 0.24648359008707302,
"grad_norm": 30.43281833851228,
"learning_rate": 5.618449153160763e-07,
"logits/chosen": -2.521904945373535,
"logits/rejected": -2.5165414810180664,
"logps/chosen": -0.5098231434822083,
"logps/rejected": -0.5272140502929688,
"loss": 1.5652,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.5098231434822083,
"rewards/margins": 0.01739095151424408,
"rewards/rejected": -0.5272140502929688,
"step": 115
},
{
"epoch": 0.2572002679169457,
"grad_norm": 18.218447276406668,
"learning_rate": 5.56173256241918e-07,
"logits/chosen": -2.6067259311676025,
"logits/rejected": -2.594320774078369,
"logps/chosen": -0.5535318851470947,
"logps/rejected": -0.5515246987342834,
"loss": 1.553,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5535318851470947,
"rewards/margins": -0.002007170347496867,
"rewards/rejected": -0.5515246987342834,
"step": 120
},
{
"epoch": 0.2679169457468185,
"grad_norm": 19.047294066362046,
"learning_rate": 5.501416036906106e-07,
"logits/chosen": -2.599743366241455,
"logits/rejected": -2.602095603942871,
"logps/chosen": -0.5660097599029541,
"logps/rejected": -0.5823434591293335,
"loss": 1.5462,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.5660097599029541,
"rewards/margins": 0.01633365824818611,
"rewards/rejected": -0.5823434591293335,
"step": 125
},
{
"epoch": 0.27863362357669125,
"grad_norm": 25.91707976400477,
"learning_rate": 5.437584337833803e-07,
"logits/chosen": -2.6727051734924316,
"logits/rejected": -2.6547203063964844,
"logps/chosen": -0.5484704375267029,
"logps/rejected": -0.5791813731193542,
"loss": 1.5217,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.5484704375267029,
"rewards/margins": 0.030710989609360695,
"rewards/rejected": -0.5791813731193542,
"step": 130
},
{
"epoch": 0.289350301406564,
"grad_norm": 24.930436449789845,
"learning_rate": 5.370327166194635e-07,
"logits/chosen": -2.6759390830993652,
"logits/rejected": -2.688563346862793,
"logps/chosen": -0.5326634049415588,
"logps/rejected": -0.5721167325973511,
"loss": 1.5358,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.5326634049415588,
"rewards/margins": 0.03945332020521164,
"rewards/rejected": -0.5721167325973511,
"step": 135
},
{
"epoch": 0.3000669792364367,
"grad_norm": 25.343122236521907,
"learning_rate": 5.299739036706635e-07,
"logits/chosen": -2.6917319297790527,
"logits/rejected": -2.6787917613983154,
"logps/chosen": -0.5328460931777954,
"logps/rejected": -0.5668941736221313,
"loss": 1.5162,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.5328460931777954,
"rewards/margins": 0.034048013389110565,
"rewards/rejected": -0.5668941736221313,
"step": 140
},
{
"epoch": 0.31078365706630945,
"grad_norm": 20.469732566491704,
"learning_rate": 5.225919144994487e-07,
"logits/chosen": -2.7843101024627686,
"logits/rejected": -2.7595150470733643,
"logps/chosen": -0.5282408595085144,
"logps/rejected": -0.5839791893959045,
"loss": 1.5106,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5282408595085144,
"rewards/margins": 0.05573834106326103,
"rewards/rejected": -0.5839791893959045,
"step": 145
},
{
"epoch": 0.3215003348961822,
"grad_norm": 22.910568604755916,
"learning_rate": 5.148971228192543e-07,
"logits/chosen": -2.769007921218872,
"logits/rejected": -2.7604851722717285,
"logps/chosen": -0.5120500326156616,
"logps/rejected": -0.5518966913223267,
"loss": 1.5387,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5120500326156616,
"rewards/margins": 0.03984668105840683,
"rewards/rejected": -0.5518966913223267,
"step": 150
},
{
"epoch": 0.3322170127260549,
"grad_norm": 21.1001003359098,
"learning_rate": 5.069003419165781e-07,
"logits/chosen": -2.8498682975769043,
"logits/rejected": -2.8591020107269287,
"logps/chosen": -0.5589969754219055,
"logps/rejected": -0.5984258651733398,
"loss": 1.5176,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.5589969754219055,
"rewards/margins": 0.039428871124982834,
"rewards/rejected": -0.5984258651733398,
"step": 155
},
{
"epoch": 0.34293369055592765,
"grad_norm": 26.259271435648458,
"learning_rate": 4.986128094553569e-07,
"logits/chosen": -2.8450496196746826,
"logits/rejected": -2.8269691467285156,
"logps/chosen": -0.5768808126449585,
"logps/rejected": -0.6496576070785522,
"loss": 1.5296,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.5768808126449585,
"rewards/margins": 0.07277677953243256,
"rewards/rejected": -0.6496576070785522,
"step": 160
},
{
"epoch": 0.3536503683858004,
"grad_norm": 22.95727659422549,
"learning_rate": 4.900461716849745e-07,
"logits/chosen": -2.9189038276672363,
"logits/rejected": -2.8916220664978027,
"logps/chosen": -0.5302075147628784,
"logps/rejected": -0.5729304552078247,
"loss": 1.5205,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5302075147628784,
"rewards/margins": 0.042722832411527634,
"rewards/rejected": -0.5729304552078247,
"step": 165
},
{
"epoch": 0.3643670462156731,
"grad_norm": 17.76635841368691,
"learning_rate": 4.812124670740974e-07,
"logits/chosen": -2.9066505432128906,
"logits/rejected": -2.910203456878662,
"logps/chosen": -0.5274362564086914,
"logps/rejected": -0.6013033986091614,
"loss": 1.5096,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.5274362564086914,
"rewards/margins": 0.0738670751452446,
"rewards/rejected": -0.6013033986091614,
"step": 170
},
{
"epoch": 0.3750837240455459,
"grad_norm": 22.708920984772448,
"learning_rate": 4.7212410939333393e-07,
"logits/chosen": -2.9743309020996094,
"logits/rejected": -2.9548959732055664,
"logps/chosen": -0.5641797780990601,
"logps/rejected": -0.594096302986145,
"loss": 1.5387,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.5641797780990601,
"rewards/margins": 0.029916446655988693,
"rewards/rejected": -0.594096302986145,
"step": 175
},
{
"epoch": 0.38580040187541864,
"grad_norm": 22.445712735196388,
"learning_rate": 4.6279387027049207e-07,
"logits/chosen": -3.0900559425354004,
"logits/rejected": -3.091893434524536,
"logps/chosen": -0.5932881236076355,
"logps/rejected": -0.6334934234619141,
"loss": 1.5474,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.5932881236076355,
"rewards/margins": 0.040205273777246475,
"rewards/rejected": -0.6334934234619141,
"step": 180
},
{
"epoch": 0.3965170797052914,
"grad_norm": 30.045592149378802,
"learning_rate": 4.5323486124294974e-07,
"logits/chosen": -3.1286568641662598,
"logits/rejected": -3.1514105796813965,
"logps/chosen": -0.5893043279647827,
"logps/rejected": -0.6178286075592041,
"loss": 1.52,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5893043279647827,
"rewards/margins": 0.02852421998977661,
"rewards/rejected": -0.6178286075592041,
"step": 185
},
{
"epoch": 0.4072337575351641,
"grad_norm": 19.66079670307951,
"learning_rate": 4.434605153323596e-07,
"logits/chosen": -3.0138182640075684,
"logits/rejected": -3.027487277984619,
"logps/chosen": -0.5490652322769165,
"logps/rejected": -0.7248018383979797,
"loss": 1.5211,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.5490652322769165,
"rewards/margins": 0.17573660612106323,
"rewards/rejected": -0.7248018383979797,
"step": 190
},
{
"epoch": 0.41795043536503684,
"grad_norm": 23.76319559347926,
"learning_rate": 4.334845681675802e-07,
"logits/chosen": -3.263240098953247,
"logits/rejected": -3.234492063522339,
"logps/chosen": -0.523744523525238,
"logps/rejected": -0.5552490949630737,
"loss": 1.5291,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.523744523525238,
"rewards/margins": 0.03150450438261032,
"rewards/rejected": -0.5552490949630737,
"step": 195
},
{
"epoch": 0.4286671131949096,
"grad_norm": 22.67820361436796,
"learning_rate": 4.233210386823613e-07,
"logits/chosen": -3.1373372077941895,
"logits/rejected": -3.1572506427764893,
"logps/chosen": -0.49712926149368286,
"logps/rejected": -0.5306284427642822,
"loss": 1.5161,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.49712926149368286,
"rewards/margins": 0.033499158918857574,
"rewards/rejected": -0.5306284427642822,
"step": 200
},
{
"epoch": 0.4393837910247823,
"grad_norm": 26.07083589291384,
"learning_rate": 4.129842094149083e-07,
"logits/chosen": -3.277681827545166,
"logits/rejected": -3.2762560844421387,
"logps/chosen": -0.4885168969631195,
"logps/rejected": -0.5218795537948608,
"loss": 1.5331,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.4885168969631195,
"rewards/margins": 0.03336270898580551,
"rewards/rejected": -0.5218795537948608,
"step": 205
},
{
"epoch": 0.45010046885465504,
"grad_norm": 21.312251857603005,
"learning_rate": 4.024886064370107e-07,
"logits/chosen": -3.2972412109375,
"logits/rejected": -3.2872118949890137,
"logps/chosen": -0.509524941444397,
"logps/rejected": -0.5547453761100769,
"loss": 1.5062,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.509524941444397,
"rewards/margins": 0.04522045701742172,
"rewards/rejected": -0.5547453761100769,
"step": 210
},
{
"epoch": 0.4608171466845278,
"grad_norm": 22.199858140310962,
"learning_rate": 3.9184897894093836e-07,
"logits/chosen": -3.3184287548065186,
"logits/rejected": -3.3149967193603516,
"logps/chosen": -0.5457042455673218,
"logps/rejected": -0.5951209664344788,
"loss": 1.4947,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.5457042455673218,
"rewards/margins": 0.049416683614254,
"rewards/rejected": -0.5951209664344788,
"step": 215
},
{
"epoch": 0.4715338245144005,
"grad_norm": 26.637536554337164,
"learning_rate": 3.8108027851279425e-07,
"logits/chosen": -3.3705334663391113,
"logits/rejected": -3.348128080368042,
"logps/chosen": -0.5251081585884094,
"logps/rejected": -0.6058255434036255,
"loss": 1.533,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.5251081585884094,
"rewards/margins": 0.08071742951869965,
"rewards/rejected": -0.6058255434036255,
"step": 220
},
{
"epoch": 0.4822505023442733,
"grad_norm": 20.985181328903547,
"learning_rate": 3.701976381214462e-07,
"logits/chosen": -3.441849946975708,
"logits/rejected": -3.436166286468506,
"logps/chosen": -0.5670623183250427,
"logps/rejected": -0.6455426812171936,
"loss": 1.5056,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.5670623183250427,
"rewards/margins": 0.0784803032875061,
"rewards/rejected": -0.6455426812171936,
"step": 225
},
{
"epoch": 0.49296718017414604,
"grad_norm": 26.37006096745831,
"learning_rate": 3.5921635085256784e-07,
"logits/chosen": -3.3313636779785156,
"logits/rejected": -3.316943407058716,
"logps/chosen": -0.5642744898796082,
"logps/rejected": -0.6056590676307678,
"loss": 1.4952,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.5642744898796082,
"rewards/margins": 0.041384514421224594,
"rewards/rejected": -0.6056590676307678,
"step": 230
},
{
"epoch": 0.5036838580040187,
"grad_norm": 21.351561473671772,
"learning_rate": 3.4815184841767167e-07,
"logits/chosen": -3.3021767139434814,
"logits/rejected": -3.2890796661376953,
"logps/chosen": -0.5324856042861938,
"logps/rejected": -0.6350933313369751,
"loss": 1.5016,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5324856042861938,
"rewards/margins": 0.10260789096355438,
"rewards/rejected": -0.6350933313369751,
"step": 235
},
{
"epoch": 0.5144005358338914,
"grad_norm": 25.25074061630777,
"learning_rate": 3.3701967946833387e-07,
"logits/chosen": -3.2254951000213623,
"logits/rejected": -3.232588529586792,
"logps/chosen": -0.5533393621444702,
"logps/rejected": -0.6351491212844849,
"loss": 1.5163,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.5533393621444702,
"rewards/margins": 0.08180973678827286,
"rewards/rejected": -0.6351491212844849,
"step": 240
},
{
"epoch": 0.5251172136637642,
"grad_norm": 25.884047164441235,
"learning_rate": 3.258354877460875e-07,
"logits/chosen": -3.2459404468536377,
"logits/rejected": -3.2240214347839355,
"logps/chosen": -0.5927519202232361,
"logps/rejected": -0.6273369193077087,
"loss": 1.5018,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5927519202232361,
"rewards/margins": 0.03458496928215027,
"rewards/rejected": -0.6273369193077087,
"step": 245
},
{
"epoch": 0.535833891493637,
"grad_norm": 24.128535335441807,
"learning_rate": 3.1461499009868705e-07,
"logits/chosen": -3.229731321334839,
"logits/rejected": -3.2193870544433594,
"logps/chosen": -0.6217538118362427,
"logps/rejected": -0.6148039102554321,
"loss": 1.5303,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.6217538118362427,
"rewards/margins": -0.006949885282665491,
"rewards/rejected": -0.6148039102554321,
"step": 250
},
{
"epoch": 0.5465505693235098,
"grad_norm": 22.132762654520068,
"learning_rate": 3.033739543936404e-07,
"logits/chosen": -3.251239776611328,
"logits/rejected": -3.251615047454834,
"logps/chosen": -0.5319584608078003,
"logps/rejected": -0.6038156747817993,
"loss": 1.4956,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5319584608078003,
"rewards/margins": 0.0718572586774826,
"rewards/rejected": -0.6038156747817993,
"step": 255
},
{
"epoch": 0.5572672471533825,
"grad_norm": 20.467314286083344,
"learning_rate": 2.921281773600424e-07,
"logits/chosen": -3.172785520553589,
"logits/rejected": -3.191011905670166,
"logps/chosen": -0.5374451875686646,
"logps/rejected": -0.6392644047737122,
"loss": 1.5245,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5374451875686646,
"rewards/margins": 0.10181926190853119,
"rewards/rejected": -0.6392644047737122,
"step": 260
},
{
"epoch": 0.5679839249832552,
"grad_norm": 21.60647847703085,
"learning_rate": 2.808934623898511e-07,
"logits/chosen": -3.1863551139831543,
"logits/rejected": -3.1833932399749756,
"logps/chosen": -0.5692937970161438,
"logps/rejected": -0.6735215783119202,
"loss": 1.4938,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.5692937970161438,
"rewards/margins": 0.10422778129577637,
"rewards/rejected": -0.6735215783119202,
"step": 265
},
{
"epoch": 0.578700602813128,
"grad_norm": 25.07152854176611,
"learning_rate": 2.696855973298007e-07,
"logits/chosen": -3.1901869773864746,
"logits/rejected": -3.183385133743286,
"logps/chosen": -0.536370575428009,
"logps/rejected": -0.5907222628593445,
"loss": 1.5085,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.536370575428009,
"rewards/margins": 0.05435168743133545,
"rewards/rejected": -0.5907222628593445,
"step": 270
},
{
"epoch": 0.5894172806430007,
"grad_norm": 18.910155061928734,
"learning_rate": 2.585203322951589e-07,
"logits/chosen": -3.274017810821533,
"logits/rejected": -3.2836010456085205,
"logps/chosen": -0.4976142942905426,
"logps/rejected": -0.5548876523971558,
"loss": 1.5118,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.4976142942905426,
"rewards/margins": 0.05727345868945122,
"rewards/rejected": -0.5548876523971558,
"step": 275
},
{
"epoch": 0.6001339584728734,
"grad_norm": 21.74570057366724,
"learning_rate": 2.47413357536509e-07,
"logits/chosen": -3.2025809288024902,
"logits/rejected": -3.1752185821533203,
"logps/chosen": -0.569342851638794,
"logps/rejected": -0.623855471611023,
"loss": 1.5032,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.569342851638794,
"rewards/margins": 0.054512638598680496,
"rewards/rejected": -0.623855471611023,
"step": 280
},
{
"epoch": 0.6108506363027462,
"grad_norm": 25.233010095872693,
"learning_rate": 2.3638028139065624e-07,
"logits/chosen": -3.230616331100464,
"logits/rejected": -3.2365059852600098,
"logps/chosen": -0.5778087973594666,
"logps/rejected": -0.5905576944351196,
"loss": 1.521,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.5778087973594666,
"rewards/margins": 0.012748857028782368,
"rewards/rejected": -0.5905576944351196,
"step": 285
},
{
"epoch": 0.6215673141326189,
"grad_norm": 52.99096570461333,
"learning_rate": 2.2543660834664724e-07,
"logits/chosen": -3.3016669750213623,
"logits/rejected": -3.28556489944458,
"logps/chosen": -0.5023082494735718,
"logps/rejected": -0.5799704790115356,
"loss": 1.496,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5023082494735718,
"rewards/margins": 0.07766219973564148,
"rewards/rejected": -0.5799704790115356,
"step": 290
},
{
"epoch": 0.6322839919624916,
"grad_norm": 22.86414304018135,
"learning_rate": 2.1459771725772267e-07,
"logits/chosen": -3.2204766273498535,
"logits/rejected": -3.243717908859253,
"logps/chosen": -0.5589109063148499,
"logps/rejected": -0.6346697807312012,
"loss": 1.4948,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.5589109063148499,
"rewards/margins": 0.07575888931751251,
"rewards/rejected": -0.6346697807312012,
"step": 295
},
{
"epoch": 0.6430006697923644,
"grad_norm": 24.245750925813,
"learning_rate": 2.0387883972982259e-07,
"logits/chosen": -3.3650691509246826,
"logits/rejected": -3.367690324783325,
"logps/chosen": -0.5385848879814148,
"logps/rejected": -0.6212387681007385,
"loss": 1.4876,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.5385848879814148,
"rewards/margins": 0.08265385776758194,
"rewards/rejected": -0.6212387681007385,
"step": 300
},
{
"epoch": 0.6537173476222371,
"grad_norm": 23.847848751632885,
"learning_rate": 1.9329503871701592e-07,
"logits/chosen": -3.309741973876953,
"logits/rejected": -3.281573534011841,
"logps/chosen": -0.5459524989128113,
"logps/rejected": -0.5997665524482727,
"loss": 1.497,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5459524989128113,
"rewards/margins": 0.05381406098604202,
"rewards/rejected": -0.5997665524482727,
"step": 305
},
{
"epoch": 0.6644340254521098,
"grad_norm": 27.809403889861738,
"learning_rate": 1.8286118735393015e-07,
"logits/chosen": -3.3402085304260254,
"logits/rejected": -3.3407912254333496,
"logps/chosen": -0.5383692979812622,
"logps/rejected": -0.5971530079841614,
"loss": 1.4859,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.5383692979812622,
"rewards/margins": 0.058783747255802155,
"rewards/rejected": -0.5971530079841614,
"step": 310
},
{
"epoch": 0.6751507032819826,
"grad_norm": 20.70533413800174,
"learning_rate": 1.7259194805493042e-07,
"logits/chosen": -3.2731971740722656,
"logits/rejected": -3.2823867797851562,
"logps/chosen": -0.5446811318397522,
"logps/rejected": -0.606468915939331,
"loss": 1.4954,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.5446811318397522,
"rewards/margins": 0.06178779527544975,
"rewards/rejected": -0.606468915939331,
"step": 315
},
{
"epoch": 0.6858673811118553,
"grad_norm": 23.184720457564282,
"learning_rate": 1.6250175190941725e-07,
"logits/chosen": -3.3182265758514404,
"logits/rejected": -3.2908051013946533,
"logps/chosen": -0.5561486482620239,
"logps/rejected": -0.5695281028747559,
"loss": 1.5159,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.5561486482620239,
"rewards/margins": 0.0133795365691185,
"rewards/rejected": -0.5695281028747559,
"step": 320
},
{
"epoch": 0.696584058941728,
"grad_norm": 27.421134529600376,
"learning_rate": 1.5260477840220057e-07,
"logits/chosen": -3.309216260910034,
"logits/rejected": -3.318588972091675,
"logps/chosen": -0.5043013095855713,
"logps/rejected": -0.5974953770637512,
"loss": 1.4727,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.5043013095855713,
"rewards/margins": 0.09319403767585754,
"rewards/rejected": -0.5974953770637512,
"step": 325
},
{
"epoch": 0.7073007367716008,
"grad_norm": 24.907218358327214,
"learning_rate": 1.4291493548744542e-07,
"logits/chosen": -3.2981224060058594,
"logits/rejected": -3.269383668899536,
"logps/chosen": -0.5354940295219421,
"logps/rejected": -0.5932218432426453,
"loss": 1.492,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.5354940295219421,
"rewards/margins": 0.05772777646780014,
"rewards/rejected": -0.5932218432426453,
"step": 330
},
{
"epoch": 0.7180174146014735,
"grad_norm": 40.51550261895159,
"learning_rate": 1.334458400441933e-07,
"logits/chosen": -3.3821797370910645,
"logits/rejected": -3.373931884765625,
"logps/chosen": -0.5368712544441223,
"logps/rejected": -0.6348738670349121,
"loss": 1.5027,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5368712544441223,
"rewards/margins": 0.09800264984369278,
"rewards/rejected": -0.6348738670349121,
"step": 335
},
{
"epoch": 0.7287340924313462,
"grad_norm": 25.305322827743687,
"learning_rate": 1.2421079874092336e-07,
"logits/chosen": -3.311006546020508,
"logits/rejected": -3.2969226837158203,
"logps/chosen": -0.5727181434631348,
"logps/rejected": -0.6479278802871704,
"loss": 1.516,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5727181434631348,
"rewards/margins": 0.07520972192287445,
"rewards/rejected": -0.6479278802871704,
"step": 340
},
{
"epoch": 0.739450770261219,
"grad_norm": 22.209687488331966,
"learning_rate": 1.1522278933604484e-07,
"logits/chosen": -3.3381361961364746,
"logits/rejected": -3.3484432697296143,
"logps/chosen": -0.5813694596290588,
"logps/rejected": -0.6797500848770142,
"loss": 1.4932,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.5813694596290588,
"rewards/margins": 0.09838052839040756,
"rewards/rejected": -0.6797500848770142,
"step": 345
},
{
"epoch": 0.7501674480910918,
"grad_norm": 22.960003013518822,
"learning_rate": 1.0649444244059717e-07,
"logits/chosen": -3.32041597366333,
"logits/rejected": -3.3459019660949707,
"logps/chosen": -0.5482354164123535,
"logps/rejected": -0.6205809712409973,
"loss": 1.4932,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.5482354164123535,
"rewards/margins": 0.07234560698270798,
"rewards/rejected": -0.6205809712409973,
"step": 350
},
{
"epoch": 0.7608841259209645,
"grad_norm": 25.349782143414405,
"learning_rate": 9.803802376878795e-08,
"logits/chosen": -3.3137733936309814,
"logits/rejected": -3.3020172119140625,
"logps/chosen": -0.5893365740776062,
"logps/rejected": -0.6166855096817017,
"loss": 1.4957,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.5893365740776062,
"rewards/margins": 0.02734885737299919,
"rewards/rejected": -0.6166855096817017,
"step": 355
},
{
"epoch": 0.7716008037508373,
"grad_norm": 22.556111949917664,
"learning_rate": 8.98654169013098e-08,
"logits/chosen": -3.3032424449920654,
"logits/rejected": -3.288992404937744,
"logps/chosen": -0.521614670753479,
"logps/rejected": -0.5875999331474304,
"loss": 1.5004,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.521614670753479,
"rewards/margins": 0.06598522514104843,
"rewards/rejected": -0.5875999331474304,
"step": 360
},
{
"epoch": 0.78231748158071,
"grad_norm": 23.52734286974739,
"learning_rate": 8.198810658566058e-08,
"logits/chosen": -3.3537094593048096,
"logits/rejected": -3.348142147064209,
"logps/chosen": -0.5639868974685669,
"logps/rejected": -0.6276763677597046,
"loss": 1.5179,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5639868974685669,
"rewards/margins": 0.06368952244520187,
"rewards/rejected": -0.6276763677597046,
"step": 365
},
{
"epoch": 0.7930341594105828,
"grad_norm": 35.138524177158146,
"learning_rate": 7.441716259693182e-08,
"logits/chosen": -3.3430557250976562,
"logits/rejected": -3.365880250930786,
"logps/chosen": -0.5710283517837524,
"logps/rejected": -0.6515873670578003,
"loss": 1.5075,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.5710283517837524,
"rewards/margins": 0.08055897057056427,
"rewards/rejected": -0.6515873670578003,
"step": 370
},
{
"epoch": 0.8037508372404555,
"grad_norm": 23.46216038728244,
"learning_rate": 6.716322418174835e-08,
"logits/chosen": -3.309415102005005,
"logits/rejected": -3.2904000282287598,
"logps/chosen": -0.6181533336639404,
"logps/rejected": -0.7165501713752747,
"loss": 1.4916,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.6181533336639404,
"rewards/margins": 0.09839687496423721,
"rewards/rejected": -0.7165501713752747,
"step": 375
},
{
"epoch": 0.8144675150703282,
"grad_norm": 26.38220306063447,
"learning_rate": 6.023648510721696e-08,
"logits/chosen": -3.391897678375244,
"logits/rejected": -3.358309268951416,
"logps/chosen": -0.5505380630493164,
"logps/rejected": -0.6369754076004028,
"loss": 1.5034,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.5505380630493164,
"rewards/margins": 0.08643738180398941,
"rewards/rejected": -0.6369754076004028,
"step": 380
},
{
"epoch": 0.825184192900201,
"grad_norm": 24.084433663112158,
"learning_rate": 5.364667933589596e-08,
"logits/chosen": -3.2913315296173096,
"logits/rejected": -3.3047938346862793,
"logps/chosen": -0.5729898810386658,
"logps/rejected": -0.6550789475440979,
"loss": 1.4915,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.5729898810386658,
"rewards/margins": 0.08208902180194855,
"rewards/rejected": -0.6550789475440979,
"step": 385
},
{
"epoch": 0.8359008707300737,
"grad_norm": 25.406153364539314,
"learning_rate": 4.74030673469165e-08,
"logits/chosen": -3.3330624103546143,
"logits/rejected": -3.293489933013916,
"logps/chosen": -0.613795280456543,
"logps/rejected": -0.6435045003890991,
"loss": 1.5266,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.613795280456543,
"rewards/margins": 0.029709184542298317,
"rewards/rejected": -0.6435045003890991,
"step": 390
},
{
"epoch": 0.8466175485599464,
"grad_norm": 24.427651638617984,
"learning_rate": 4.1514423122476606e-08,
"logits/chosen": -3.3390536308288574,
"logits/rejected": -3.3218231201171875,
"logps/chosen": -0.5534718632698059,
"logps/rejected": -0.600857138633728,
"loss": 1.507,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.5534718632698059,
"rewards/margins": 0.04738527163863182,
"rewards/rejected": -0.600857138633728,
"step": 395
},
{
"epoch": 0.8573342263898192,
"grad_norm": 26.673472503173954,
"learning_rate": 3.598902181799717e-08,
"logits/chosen": -3.298213481903076,
"logits/rejected": -3.2662785053253174,
"logps/chosen": -0.5154682397842407,
"logps/rejected": -0.6383693814277649,
"loss": 1.4828,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5154682397842407,
"rewards/margins": 0.12290115654468536,
"rewards/rejected": -0.6383693814277649,
"step": 400
},
{
"epoch": 0.8573342263898192,
"eval_logits/chosen": -3.409546136856079,
"eval_logits/rejected": -3.403578758239746,
"eval_logps/chosen": -0.5692862868309021,
"eval_logps/rejected": -0.6257904171943665,
"eval_loss": 1.5201373100280762,
"eval_rewards/accuracies": 0.5691489577293396,
"eval_rewards/chosen": -0.5692862868309021,
"eval_rewards/margins": 0.05650414153933525,
"eval_rewards/rejected": -0.6257904171943665,
"eval_runtime": 432.4468,
"eval_samples_per_second": 6.923,
"eval_steps_per_second": 0.435,
"step": 400
},
{
"epoch": 0.8680509042196919,
"grad_norm": 24.460303851950272,
"learning_rate": 3.0834628133265293e-08,
"logits/chosen": -3.308946132659912,
"logits/rejected": -3.293513536453247,
"logps/chosen": -0.5684244632720947,
"logps/rejected": -0.6289744973182678,
"loss": 1.5056,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.5684244632720947,
"rewards/margins": 0.06054999679327011,
"rewards/rejected": -0.6289744973182678,
"step": 405
},
{
"epoch": 0.8787675820495646,
"grad_norm": 22.9693279472293,
"learning_rate": 2.6058485400908248e-08,
"logits/chosen": -3.358743190765381,
"logits/rejected": -3.3271114826202393,
"logps/chosen": -0.5511162877082825,
"logps/rejected": -0.5816215872764587,
"loss": 1.4873,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.5511162877082825,
"rewards/margins": 0.030505258589982986,
"rewards/rejected": -0.5816215872764587,
"step": 410
},
{
"epoch": 0.8894842598794374,
"grad_norm": 25.421672297355457,
"learning_rate": 2.1667305407530255e-08,
"logits/chosen": -3.2762393951416016,
"logits/rejected": -3.2448742389678955,
"logps/chosen": -0.548682689666748,
"logps/rejected": -0.6257365942001343,
"loss": 1.4819,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.548682689666748,
"rewards/margins": 0.07705400884151459,
"rewards/rejected": -0.6257365942001343,
"step": 415
},
{
"epoch": 0.9002009377093101,
"grad_norm": 27.04902626819421,
"learning_rate": 1.7667258961816723e-08,
"logits/chosen": -3.2720954418182373,
"logits/rejected": -3.2802345752716064,
"logps/chosen": -0.5331937074661255,
"logps/rejected": -0.573731541633606,
"loss": 1.5172,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.5331937074661255,
"rewards/margins": 0.04053787142038345,
"rewards/rejected": -0.573731541633606,
"step": 420
},
{
"epoch": 0.9109176155391828,
"grad_norm": 27.138507466839215,
"learning_rate": 1.4063967222860872e-08,
"logits/chosen": -3.280641555786133,
"logits/rejected": -3.268662214279175,
"logps/chosen": -0.5173559188842773,
"logps/rejected": -0.6271434426307678,
"loss": 1.4903,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.5173559188842773,
"rewards/margins": 0.10978756844997406,
"rewards/rejected": -0.6271434426307678,
"step": 425
},
{
"epoch": 0.9216342933690556,
"grad_norm": 36.01846846843038,
"learning_rate": 1.086249380089782e-08,
"logits/chosen": -3.33141827583313,
"logits/rejected": -3.3434956073760986,
"logps/chosen": -0.615702748298645,
"logps/rejected": -0.7053866386413574,
"loss": 1.4914,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.615702748298645,
"rewards/margins": 0.0896839126944542,
"rewards/rejected": -0.7053866386413574,
"step": 430
},
{
"epoch": 0.9323509711989283,
"grad_norm": 29.36505146378418,
"learning_rate": 8.067337641547777e-09,
"logits/chosen": -3.3964333534240723,
"logits/rejected": -3.4042282104492188,
"logps/chosen": -0.5159146189689636,
"logps/rejected": -0.6444130539894104,
"loss": 1.4866,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.5159146189689636,
"rewards/margins": 0.12849843502044678,
"rewards/rejected": -0.6444130539894104,
"step": 435
},
{
"epoch": 0.943067649028801,
"grad_norm": 23.999063503219116,
"learning_rate": 5.682426703567034e-09,
"logits/chosen": -3.2346031665802,
"logits/rejected": -3.2309417724609375,
"logps/chosen": -0.52639240026474,
"logps/rejected": -0.6208442449569702,
"loss": 1.4676,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.52639240026474,
"rewards/margins": 0.09445185959339142,
"rewards/rejected": -0.6208442449569702,
"step": 440
},
{
"epoch": 0.9537843268586738,
"grad_norm": 28.590598764799935,
"learning_rate": 3.7111124389918146e-09,
"logits/chosen": -3.2654852867126465,
"logits/rejected": -3.264702558517456,
"logps/chosen": -0.5419159531593323,
"logps/rejected": -0.6305166482925415,
"loss": 1.4876,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.5419159531593323,
"rewards/margins": 0.08860062062740326,
"rewards/rejected": -0.6305166482925415,
"step": 445
},
{
"epoch": 0.9645010046885466,
"grad_norm": 23.071429282207497,
"learning_rate": 2.156165083431627e-09,
"logits/chosen": -3.2962241172790527,
"logits/rejected": -3.283967971801758,
"logps/chosen": -0.5432751774787903,
"logps/rejected": -0.6289895176887512,
"loss": 1.4804,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.5432751774787903,
"rewards/margins": 0.08571438491344452,
"rewards/rejected": -0.6289895176887512,
"step": 450
},
{
"epoch": 0.9752176825184193,
"grad_norm": 25.76239166835448,
"learning_rate": 1.019769763130851e-09,
"logits/chosen": -3.2721996307373047,
"logits/rejected": -3.271106243133545,
"logps/chosen": -0.5389679670333862,
"logps/rejected": -0.6317923665046692,
"loss": 1.4855,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.5389679670333862,
"rewards/margins": 0.09282433986663818,
"rewards/rejected": -0.6317923665046692,
"step": 455
},
{
"epoch": 0.9859343603482921,
"grad_norm": 29.67085990506561,
"learning_rate": 3.0352342426868125e-10,
"logits/chosen": -3.287473678588867,
"logits/rejected": -3.29301381111145,
"logps/chosen": -0.5480049252510071,
"logps/rejected": -0.6534655690193176,
"loss": 1.4972,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.5480049252510071,
"rewards/margins": 0.10546054691076279,
"rewards/rejected": -0.6534655690193176,
"step": 460
},
{
"epoch": 0.9966510381781648,
"grad_norm": 29.944439651933386,
"learning_rate": 8.432588813089836e-12,
"logits/chosen": -3.3211536407470703,
"logits/rejected": -3.304069995880127,
"logps/chosen": -0.583086371421814,
"logps/rejected": -0.6369927525520325,
"loss": 1.5004,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.583086371421814,
"rewards/margins": 0.05390629172325134,
"rewards/rejected": -0.6369927525520325,
"step": 465
},
{
"epoch": 0.9987943737441393,
"step": 466,
"total_flos": 0.0,
"train_loss": 1.5151263257976253,
"train_runtime": 19305.9847,
"train_samples_per_second": 3.093,
"train_steps_per_second": 0.024
}
],
"logging_steps": 5,
"max_steps": 466,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}