{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 4164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007204610951008645, "grad_norm": 2.336021900177002, "learning_rate": 1.199040767386091e-10, "logits/chosen": -1.3860063552856445, "logits/rejected": -1.3949532508850098, "logps/chosen": -34.621925354003906, "logps/rejected": -37.30891418457031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.007204610951008645, "grad_norm": 2.7931599617004395, "learning_rate": 1.199040767386091e-09, "logits/chosen": -1.546767234802246, "logits/rejected": -1.5282517671585083, "logps/chosen": -42.52494812011719, "logps/rejected": -44.546756744384766, "loss": 0.6932, "rewards/accuracies": 0.3680555522441864, "rewards/chosen": -0.00010908626427408308, "rewards/margins": -0.00013866486551705748, "rewards/rejected": 2.95786012429744e-05, "step": 10 }, { "epoch": 0.01440922190201729, "grad_norm": 2.9333579540252686, "learning_rate": 2.398081534772182e-09, "logits/chosen": -1.5552335977554321, "logits/rejected": -1.5412750244140625, "logps/chosen": -44.075599670410156, "logps/rejected": -46.59809112548828, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 9.86563172773458e-05, "rewards/margins": 0.00012048264034092426, "rewards/rejected": -2.182633033953607e-05, "step": 20 }, { "epoch": 0.021613832853025938, "grad_norm": 3.4909462928771973, "learning_rate": 3.597122302158273e-09, "logits/chosen": -1.511649250984192, "logits/rejected": -1.5045579671859741, "logps/chosen": -47.84404754638672, "logps/rejected": -50.805335998535156, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 4.7573317715432495e-05, "rewards/margins": 0.00018799836107064039, "rewards/rejected": -0.0001404250506311655, "step": 30 }, { "epoch": 0.02881844380403458, "grad_norm": 2.575885772705078, "learning_rate": 4.796163069544364e-09, "logits/chosen": -1.5584311485290527, "logits/rejected": -1.5544531345367432, "logps/chosen": -43.06354522705078, "logps/rejected": -45.552642822265625, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0001178598977276124, "rewards/margins": -6.472436507465318e-05, "rewards/rejected": -5.313555084285326e-05, "step": 40 }, { "epoch": 0.03602305475504323, "grad_norm": 2.653576374053955, "learning_rate": 5.995203836930456e-09, "logits/chosen": -1.469327688217163, "logits/rejected": -1.4684107303619385, "logps/chosen": -42.99556350708008, "logps/rejected": -44.81259536743164, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.00011682316107908264, "rewards/margins": -2.396003037574701e-05, "rewards/rejected": 0.0001407831732649356, "step": 50 }, { "epoch": 0.043227665706051875, "grad_norm": 3.936546564102173, "learning_rate": 7.194244604316546e-09, "logits/chosen": -1.567378044128418, "logits/rejected": -1.5606577396392822, "logps/chosen": -50.69051742553711, "logps/rejected": -52.026954650878906, "loss": 0.6931, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -3.9090933569241315e-05, "rewards/margins": 1.4172494047670625e-05, "rewards/rejected": -5.326343307388015e-05, "step": 60 }, { "epoch": 0.05043227665706052, "grad_norm": 2.3031318187713623, "learning_rate": 8.393285371702639e-09, "logits/chosen": -1.5360815525054932, "logits/rejected": -1.5282552242279053, "logps/chosen": -50.07221221923828, "logps/rejected": -52.78315353393555, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.2442261474207044e-05, "rewards/margins": 4.4568816520040855e-05, "rewards/rejected": -8.701106708031148e-05, "step": 70 }, { "epoch": 0.05763688760806916, "grad_norm": 3.459690570831299, "learning_rate": 9.592326139088728e-09, "logits/chosen": -1.5700651407241821, "logits/rejected": -1.5622011423110962, "logps/chosen": -51.09720230102539, "logps/rejected": -52.700775146484375, "loss": 0.693, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00014371283759828657, "rewards/margins": 0.0003250584995839745, "rewards/rejected": -0.00018134564743377268, "step": 80 }, { "epoch": 0.06484149855907781, "grad_norm": 2.787858009338379, "learning_rate": 1.0791366906474819e-08, "logits/chosen": -1.5036344528198242, "logits/rejected": -1.5007749795913696, "logps/chosen": -49.02019500732422, "logps/rejected": -51.165855407714844, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -3.3910677302628756e-05, "rewards/margins": -8.141305443132296e-05, "rewards/rejected": 4.750239895656705e-05, "step": 90 }, { "epoch": 0.07204610951008646, "grad_norm": 2.978163957595825, "learning_rate": 1.1990407673860912e-08, "logits/chosen": -1.5843650102615356, "logits/rejected": -1.5736749172210693, "logps/chosen": -45.69003677368164, "logps/rejected": -48.75359344482422, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 5.446890099847224e-06, "rewards/margins": -4.437283132574521e-05, "rewards/rejected": 4.981973324902356e-05, "step": 100 }, { "epoch": 0.0792507204610951, "grad_norm": 2.3196158409118652, "learning_rate": 1.3189448441247003e-08, "logits/chosen": -1.4547462463378906, "logits/rejected": -1.4310463666915894, "logps/chosen": -48.958641052246094, "logps/rejected": -51.16803741455078, "loss": 0.6933, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.00026367080863565207, "rewards/margins": -0.0002573660749476403, "rewards/rejected": -6.304704584181309e-06, "step": 110 }, { "epoch": 0.08645533141210375, "grad_norm": 2.3112876415252686, "learning_rate": 1.4388489208633092e-08, "logits/chosen": -1.4834020137786865, "logits/rejected": -1.480193853378296, "logps/chosen": -44.25951385498047, "logps/rejected": -46.622459411621094, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -6.177094473969191e-05, "rewards/margins": 0.00016041506023611873, "rewards/rejected": -0.00022218600497581065, "step": 120 }, { "epoch": 0.0936599423631124, "grad_norm": 3.208045482635498, "learning_rate": 1.5587529976019183e-08, "logits/chosen": -1.5762228965759277, "logits/rejected": -1.570908784866333, "logps/chosen": -49.414695739746094, "logps/rejected": -51.2851448059082, "loss": 0.6933, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -8.6869033111725e-05, "rewards/margins": -0.0002423443365842104, "rewards/rejected": 0.000155475310748443, "step": 130 }, { "epoch": 0.10086455331412104, "grad_norm": 2.822239875793457, "learning_rate": 1.6786570743405277e-08, "logits/chosen": -1.4522497653961182, "logits/rejected": -1.4427827596664429, "logps/chosen": -45.83789825439453, "logps/rejected": -50.24224090576172, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 8.89547445694916e-05, "rewards/margins": 4.058218473801389e-05, "rewards/rejected": 4.837256346945651e-05, "step": 140 }, { "epoch": 0.10806916426512968, "grad_norm": 3.587710380554199, "learning_rate": 1.7985611510791365e-08, "logits/chosen": -1.4713795185089111, "logits/rejected": -1.4646179676055908, "logps/chosen": -48.30494689941406, "logps/rejected": -51.4416389465332, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -4.7554804041283205e-05, "rewards/margins": 4.275305400369689e-05, "rewards/rejected": -9.030787623487413e-05, "step": 150 }, { "epoch": 0.11527377521613832, "grad_norm": 2.4577479362487793, "learning_rate": 1.9184652278177456e-08, "logits/chosen": -1.5048809051513672, "logits/rejected": -1.4864928722381592, "logps/chosen": -41.26644515991211, "logps/rejected": -44.58869171142578, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00011363021621946245, "rewards/margins": -0.00019988985150121152, "rewards/rejected": 8.625965710962191e-05, "step": 160 }, { "epoch": 0.12247838616714697, "grad_norm": 3.1715707778930664, "learning_rate": 2.038369304556355e-08, "logits/chosen": -1.5167441368103027, "logits/rejected": -1.4980051517486572, "logps/chosen": -44.847110748291016, "logps/rejected": -46.87194061279297, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 6.15198805462569e-05, "rewards/margins": 4.4902008085045964e-05, "rewards/rejected": 1.661786882323213e-05, "step": 170 }, { "epoch": 0.12968299711815562, "grad_norm": 2.6304078102111816, "learning_rate": 2.1582733812949638e-08, "logits/chosen": -1.5800201892852783, "logits/rejected": -1.5682014226913452, "logps/chosen": -45.10023880004883, "logps/rejected": -46.83604049682617, "loss": 0.6932, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -8.165388862835243e-05, "rewards/margins": -0.00017628518980927765, "rewards/rejected": 9.46312939049676e-05, "step": 180 }, { "epoch": 0.13688760806916425, "grad_norm": 2.9077770709991455, "learning_rate": 2.278177458033573e-08, "logits/chosen": -1.5893758535385132, "logits/rejected": -1.5856701135635376, "logps/chosen": -42.25619125366211, "logps/rejected": -45.408897399902344, "loss": 0.6931, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 5.6146480346797034e-05, "rewards/margins": 9.998455789173022e-05, "rewards/rejected": -4.3838103010784835e-05, "step": 190 }, { "epoch": 0.1440922190201729, "grad_norm": 3.549778699874878, "learning_rate": 2.3980815347721823e-08, "logits/chosen": -1.5360238552093506, "logits/rejected": -1.5294934511184692, "logps/chosen": -43.458560943603516, "logps/rejected": -47.107208251953125, "loss": 0.6931, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00025255040964111686, "rewards/margins": 0.00011713804269675165, "rewards/rejected": 0.00013541239604819566, "step": 200 }, { "epoch": 0.15129682997118155, "grad_norm": 3.0489232540130615, "learning_rate": 2.517985611510791e-08, "logits/chosen": -1.5652011632919312, "logits/rejected": -1.55386221408844, "logps/chosen": -43.03472137451172, "logps/rejected": -43.407081604003906, "loss": 0.6931, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 2.3424910978064872e-05, "rewards/margins": 8.2639220636338e-05, "rewards/rejected": -5.9214326029177755e-05, "step": 210 }, { "epoch": 0.1585014409221902, "grad_norm": 2.841609001159668, "learning_rate": 2.6378896882494006e-08, "logits/chosen": -1.480421781539917, "logits/rejected": -1.4737306833267212, "logps/chosen": -47.355350494384766, "logps/rejected": -52.514892578125, "loss": 0.6932, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 1.830189285101369e-05, "rewards/margins": -0.00019653179333545268, "rewards/rejected": 0.00021483367891050875, "step": 220 }, { "epoch": 0.16570605187319884, "grad_norm": 2.58459734916687, "learning_rate": 2.7577937649880097e-08, "logits/chosen": -1.531930685043335, "logits/rejected": -1.530457615852356, "logps/chosen": -44.46145248413086, "logps/rejected": -48.232242584228516, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 8.398554928135127e-05, "rewards/margins": -8.254259591922164e-05, "rewards/rejected": 0.00016652815975248814, "step": 230 }, { "epoch": 0.1729106628242075, "grad_norm": 3.012089252471924, "learning_rate": 2.8776978417266184e-08, "logits/chosen": -1.579641342163086, "logits/rejected": -1.5689789056777954, "logps/chosen": -49.132789611816406, "logps/rejected": -51.18694305419922, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 9.1132904344704e-05, "rewards/margins": -7.982960960362107e-05, "rewards/rejected": 0.00017096252122428268, "step": 240 }, { "epoch": 0.18011527377521613, "grad_norm": 4.0211029052734375, "learning_rate": 2.997601918465228e-08, "logits/chosen": -1.4552438259124756, "logits/rejected": -1.4445832967758179, "logps/chosen": -49.68731689453125, "logps/rejected": -50.77144241333008, "loss": 0.693, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00020649081852752715, "rewards/margins": 0.0003089832025580108, "rewards/rejected": -0.00010249239858239889, "step": 250 }, { "epoch": 0.1873198847262248, "grad_norm": 3.18790864944458, "learning_rate": 3.1175059952038366e-08, "logits/chosen": -1.4960715770721436, "logits/rejected": -1.490350365638733, "logps/chosen": -50.00872802734375, "logps/rejected": -51.44108200073242, "loss": 0.6932, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 4.101751983398572e-06, "rewards/margins": -8.562284347135574e-05, "rewards/rejected": 8.972459909273311e-05, "step": 260 }, { "epoch": 0.19452449567723343, "grad_norm": 3.316842794418335, "learning_rate": 3.237410071942446e-08, "logits/chosen": -1.5841925144195557, "logits/rejected": -1.5689890384674072, "logps/chosen": -46.749019622802734, "logps/rejected": -49.31301498413086, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00019280468404758722, "rewards/margins": 8.136655378621072e-05, "rewards/rejected": 0.00011143812298541889, "step": 270 }, { "epoch": 0.2017291066282421, "grad_norm": 2.9199252128601074, "learning_rate": 3.3573141486810555e-08, "logits/chosen": -1.5495785474777222, "logits/rejected": -1.5337769985198975, "logps/chosen": -42.458438873291016, "logps/rejected": -43.72154235839844, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00012140088074374944, "rewards/margins": 0.00014567100151907653, "rewards/rejected": -2.4270115318358876e-05, "step": 280 }, { "epoch": 0.20893371757925072, "grad_norm": 2.7867319583892822, "learning_rate": 3.477218225419664e-08, "logits/chosen": -1.5798470973968506, "logits/rejected": -1.5693657398223877, "logps/chosen": -44.48418426513672, "logps/rejected": -45.79877471923828, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0002844139817170799, "rewards/margins": 0.0004200335533823818, "rewards/rejected": -0.0001356196153210476, "step": 290 }, { "epoch": 0.21613832853025935, "grad_norm": 3.150078058242798, "learning_rate": 3.597122302158273e-08, "logits/chosen": -1.556154489517212, "logits/rejected": -1.539389967918396, "logps/chosen": -48.21941375732422, "logps/rejected": -50.685508728027344, "loss": 0.6933, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0001608836610103026, "rewards/margins": -0.0002904186840169132, "rewards/rejected": 0.00012953505211044103, "step": 300 }, { "epoch": 0.22334293948126802, "grad_norm": 3.015779972076416, "learning_rate": 3.717026378896883e-08, "logits/chosen": -1.4409435987472534, "logits/rejected": -1.4314186573028564, "logps/chosen": -48.024295806884766, "logps/rejected": -50.0726432800293, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00016624320414848626, "rewards/margins": 0.000264329748461023, "rewards/rejected": -9.808655886445194e-05, "step": 310 }, { "epoch": 0.23054755043227665, "grad_norm": 2.547288417816162, "learning_rate": 3.836930455635491e-08, "logits/chosen": -1.553330659866333, "logits/rejected": -1.5384843349456787, "logps/chosen": -47.11131286621094, "logps/rejected": -51.53942108154297, "loss": 0.6933, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 3.6099205317441374e-05, "rewards/margins": -0.00022887022350914776, "rewards/rejected": 0.0002649694215506315, "step": 320 }, { "epoch": 0.2377521613832853, "grad_norm": 2.309831380844116, "learning_rate": 3.9568345323741003e-08, "logits/chosen": -1.5248470306396484, "logits/rejected": -1.5183141231536865, "logps/chosen": -50.730499267578125, "logps/rejected": -49.556068420410156, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.00026169692864641547, "rewards/margins": 0.0001796955766621977, "rewards/rejected": 8.200139563996345e-05, "step": 330 }, { "epoch": 0.24495677233429394, "grad_norm": 2.6688296794891357, "learning_rate": 4.07673860911271e-08, "logits/chosen": -1.5785892009735107, "logits/rejected": -1.5679844617843628, "logps/chosen": -51.07538604736328, "logps/rejected": -52.128135681152344, "loss": 0.6929, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.00021654777810908854, "rewards/margins": 0.0004967943532392383, "rewards/rejected": -0.00028024654602631927, "step": 340 }, { "epoch": 0.2521613832853026, "grad_norm": 3.666226387023926, "learning_rate": 4.1966426858513185e-08, "logits/chosen": -1.5095466375350952, "logits/rejected": -1.5047543048858643, "logps/chosen": -45.6449089050293, "logps/rejected": -48.59443664550781, "loss": 0.693, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00023484873236157, "rewards/margins": 0.0001965599658433348, "rewards/rejected": 3.82887119485531e-05, "step": 350 }, { "epoch": 0.25936599423631124, "grad_norm": 3.478895664215088, "learning_rate": 4.3165467625899276e-08, "logits/chosen": -1.5015857219696045, "logits/rejected": -1.4911534786224365, "logps/chosen": -53.9161376953125, "logps/rejected": -56.55745315551758, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0001681277499301359, "rewards/margins": 0.00018387913587503135, "rewards/rejected": -1.575138230691664e-05, "step": 360 }, { "epoch": 0.2665706051873199, "grad_norm": 3.750058650970459, "learning_rate": 4.4364508393285374e-08, "logits/chosen": -1.4821351766586304, "logits/rejected": -1.480381727218628, "logps/chosen": -48.33544158935547, "logps/rejected": -53.026039123535156, "loss": 0.6931, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 8.031638571992517e-05, "rewards/margins": 0.00012994577991776168, "rewards/rejected": -4.96293832839001e-05, "step": 370 }, { "epoch": 0.2737752161383285, "grad_norm": 2.4797353744506836, "learning_rate": 4.556354916067146e-08, "logits/chosen": -1.5723177194595337, "logits/rejected": -1.565124750137329, "logps/chosen": -47.00983428955078, "logps/rejected": -48.0451774597168, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -9.197367762681097e-05, "rewards/margins": -0.00010503224621061236, "rewards/rejected": 1.3058568583801389e-05, "step": 380 }, { "epoch": 0.28097982708933716, "grad_norm": 2.919807195663452, "learning_rate": 4.676258992805755e-08, "logits/chosen": -1.544861078262329, "logits/rejected": -1.5350227355957031, "logps/chosen": -48.17707824707031, "logps/rejected": -51.51404571533203, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.00019117488409392536, "rewards/margins": 0.00030245125526562333, "rewards/rejected": -0.0001112763857236132, "step": 390 }, { "epoch": 0.2881844380403458, "grad_norm": 3.4181175231933594, "learning_rate": 4.796163069544365e-08, "logits/chosen": -1.5584533214569092, "logits/rejected": -1.553442120552063, "logps/chosen": -44.867557525634766, "logps/rejected": -46.099464416503906, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.0003581286873668432, "rewards/margins": 5.371281804400496e-05, "rewards/rejected": 0.0003044158511329442, "step": 400 }, { "epoch": 0.2953890489913545, "grad_norm": 3.621582269668579, "learning_rate": 4.916067146282973e-08, "logits/chosen": -1.5090839862823486, "logits/rejected": -1.5060720443725586, "logps/chosen": -47.50264358520508, "logps/rejected": -49.63032531738281, "loss": 0.693, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00016462437633890659, "rewards/margins": 0.00023242369934450835, "rewards/rejected": -6.779931572964415e-05, "step": 410 }, { "epoch": 0.3025936599423631, "grad_norm": 2.4397823810577393, "learning_rate": 4.999992091672379e-08, "logits/chosen": -1.4693224430084229, "logits/rejected": -1.4796779155731201, "logps/chosen": -45.638816833496094, "logps/rejected": -48.969810485839844, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.000272547040367499, "rewards/margins": 0.0002472659107297659, "rewards/rejected": 2.5281125999754295e-05, "step": 420 }, { "epoch": 0.30979827089337175, "grad_norm": 2.2847747802734375, "learning_rate": 4.999851500573209e-08, "logits/chosen": -1.497286081314087, "logits/rejected": -1.4975759983062744, "logps/chosen": -46.069549560546875, "logps/rejected": -46.195926666259766, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00028530642157420516, "rewards/margins": 5.241289545665495e-05, "rewards/rejected": 0.00023289353703148663, "step": 430 }, { "epoch": 0.3170028818443804, "grad_norm": 2.459535598754883, "learning_rate": 4.999535180235972e-08, "logits/chosen": -1.4981845617294312, "logits/rejected": -1.4901264905929565, "logps/chosen": -46.00641632080078, "logps/rejected": -49.42933654785156, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00040736678056418896, "rewards/margins": 0.00017191791266668588, "rewards/rejected": 0.00023544885334558785, "step": 440 }, { "epoch": 0.3242074927953891, "grad_norm": 3.167510747909546, "learning_rate": 4.9990431528966836e-08, "logits/chosen": -1.5115673542022705, "logits/rejected": -1.4903723001480103, "logps/chosen": -53.19926071166992, "logps/rejected": -51.2760124206543, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00023766886442899704, "rewards/margins": 7.205204019555822e-05, "rewards/rejected": 0.00016561683150939643, "step": 450 }, { "epoch": 0.3314121037463977, "grad_norm": 3.773123264312744, "learning_rate": 4.9983754531428326e-08, "logits/chosen": -1.5162955522537231, "logits/rejected": -1.4986896514892578, "logps/chosen": -53.790428161621094, "logps/rejected": -55.66992950439453, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0004971676971763372, "rewards/margins": 0.0004848612588830292, "rewards/rejected": 1.2306480130064301e-05, "step": 460 }, { "epoch": 0.33861671469740634, "grad_norm": 3.7350404262542725, "learning_rate": 4.997532127910954e-08, "logits/chosen": -1.5784637928009033, "logits/rejected": -1.5491013526916504, "logps/chosen": -52.63544464111328, "logps/rejected": -53.17797088623047, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00048227087245322764, "rewards/margins": 0.00017049835878424346, "rewards/rejected": 0.00031177254277281463, "step": 470 }, { "epoch": 0.345821325648415, "grad_norm": 3.6733524799346924, "learning_rate": 4.996513236483331e-08, "logits/chosen": -1.6473217010498047, "logits/rejected": -1.633281946182251, "logps/chosen": -42.51477813720703, "logps/rejected": -45.43607711791992, "loss": 0.6929, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.00038208425394259393, "rewards/margins": 0.000435996160376817, "rewards/rejected": -5.39118773303926e-05, "step": 480 }, { "epoch": 0.3530259365994236, "grad_norm": 4.256612777709961, "learning_rate": 4.9953188504838225e-08, "logits/chosen": -1.5243208408355713, "logits/rejected": -1.5126068592071533, "logps/chosen": -46.43769836425781, "logps/rejected": -49.511756896972656, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0005211577517911792, "rewards/margins": 0.00022547971457242966, "rewards/rejected": 0.0002956780372187495, "step": 490 }, { "epoch": 0.36023054755043227, "grad_norm": 2.8351917266845703, "learning_rate": 4.993949053872834e-08, "logits/chosen": -1.5284124612808228, "logits/rejected": -1.5051486492156982, "logps/chosen": -42.646305084228516, "logps/rejected": -45.89906311035156, "loss": 0.6929, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0006723391707055271, "rewards/margins": 0.0005468688905239105, "rewards/rejected": 0.00012547028018161654, "step": 500 }, { "epoch": 0.36743515850144093, "grad_norm": 2.8914971351623535, "learning_rate": 4.9924039429414086e-08, "logits/chosen": -1.6392762660980225, "logits/rejected": -1.6216766834259033, "logps/chosen": -46.00267791748047, "logps/rejected": -47.987403869628906, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0007155792554840446, "rewards/margins": 0.0006390741327777505, "rewards/rejected": 7.650510087842122e-05, "step": 510 }, { "epoch": 0.3746397694524496, "grad_norm": 3.5407791137695312, "learning_rate": 4.990683626304467e-08, "logits/chosen": -1.534824013710022, "logits/rejected": -1.5296251773834229, "logps/chosen": -53.9051399230957, "logps/rejected": -56.04487991333008, "loss": 0.6929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.000572909542825073, "rewards/margins": 0.00040510791586712003, "rewards/rejected": 0.0001678016851656139, "step": 520 }, { "epoch": 0.3818443804034582, "grad_norm": 3.2243497371673584, "learning_rate": 4.9887882248931646e-08, "logits/chosen": -1.458703637123108, "logits/rejected": -1.4379961490631104, "logps/chosen": -46.455928802490234, "logps/rejected": -47.554115295410156, "loss": 0.6931, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0005672247498296201, "rewards/margins": 0.00012306116695981473, "rewards/rejected": 0.00044416356831789017, "step": 530 }, { "epoch": 0.38904899135446686, "grad_norm": 3.160867929458618, "learning_rate": 4.986717871946393e-08, "logits/chosen": -1.485264778137207, "logits/rejected": -1.4642503261566162, "logps/chosen": -45.888916015625, "logps/rejected": -47.80854797363281, "loss": 0.6929, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0008015796774998307, "rewards/margins": 0.0005912004271522164, "rewards/rejected": 0.0002103792503476143, "step": 540 }, { "epoch": 0.3962536023054755, "grad_norm": 3.078800916671753, "learning_rate": 4.984472713001416e-08, "logits/chosen": -1.4299088716506958, "logits/rejected": -1.4215964078903198, "logps/chosen": -48.37324905395508, "logps/rejected": -48.35413360595703, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0007138814544305205, "rewards/margins": 0.0003910651430487633, "rewards/rejected": 0.0003228162822779268, "step": 550 }, { "epoch": 0.4034582132564842, "grad_norm": 3.2446839809417725, "learning_rate": 4.982052905883637e-08, "logits/chosen": -1.5735007524490356, "logits/rejected": -1.563230276107788, "logps/chosen": -48.497169494628906, "logps/rejected": -49.916038513183594, "loss": 0.693, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0005788643029518425, "rewards/margins": 0.00021954579278826714, "rewards/rejected": 0.0003593183937482536, "step": 560 }, { "epoch": 0.4106628242074928, "grad_norm": 2.9157779216766357, "learning_rate": 4.979458620695505e-08, "logits/chosen": -1.5526814460754395, "logits/rejected": -1.523316740989685, "logps/chosen": -52.486785888671875, "logps/rejected": -54.491912841796875, "loss": 0.6928, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0009498519939370453, "rewards/margins": 0.000704078353010118, "rewards/rejected": 0.00024577361182309687, "step": 570 }, { "epoch": 0.41786743515850144, "grad_norm": 3.174314022064209, "learning_rate": 4.976690039804555e-08, "logits/chosen": -1.5769068002700806, "logits/rejected": -1.5633710622787476, "logps/chosen": -42.617549896240234, "logps/rejected": -44.070777893066406, "loss": 0.6929, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0007929207058623433, "rewards/margins": 0.0005030486499890685, "rewards/rejected": 0.0002898719976656139, "step": 580 }, { "epoch": 0.4250720461095101, "grad_norm": 2.7369155883789062, "learning_rate": 4.973747357830592e-08, "logits/chosen": -1.5272729396820068, "logits/rejected": -1.5258095264434814, "logps/chosen": -47.52259063720703, "logps/rejected": -53.1329345703125, "loss": 0.6928, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0009988851379603148, "rewards/margins": 0.0007000741316005588, "rewards/rejected": 0.00029881083173677325, "step": 590 }, { "epoch": 0.4322766570605187, "grad_norm": 2.792400360107422, "learning_rate": 4.970630781632009e-08, "logits/chosen": -1.6299186944961548, "logits/rejected": -1.6194143295288086, "logps/chosen": -45.39707565307617, "logps/rejected": -49.081199645996094, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.001002423232421279, "rewards/margins": 0.0009681530063971877, "rewards/rejected": 3.427025876590051e-05, "step": 600 }, { "epoch": 0.43948126801152737, "grad_norm": 3.9282000064849854, "learning_rate": 4.967340530291242e-08, "logits/chosen": -1.5342658758163452, "logits/rejected": -1.5172860622406006, "logps/chosen": -50.43634796142578, "logps/rejected": -51.06001281738281, "loss": 0.693, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0008850853191688657, "rewards/margins": 0.00033199749304912984, "rewards/rejected": 0.0005530878552235663, "step": 610 }, { "epoch": 0.44668587896253603, "grad_norm": 2.692594289779663, "learning_rate": 4.9638768350993755e-08, "logits/chosen": -1.5681965351104736, "logits/rejected": -1.5538604259490967, "logps/chosen": -42.41255569458008, "logps/rejected": -44.433387756347656, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0007358186412602663, "rewards/margins": 9.383581345900893e-05, "rewards/rejected": 0.0006419828278012574, "step": 620 }, { "epoch": 0.4538904899135447, "grad_norm": 2.3626010417938232, "learning_rate": 4.9602399395398786e-08, "logits/chosen": -1.571014642715454, "logits/rejected": -1.5637868642807007, "logps/chosen": -43.03638458251953, "logps/rejected": -46.532352447509766, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0012007402256131172, "rewards/margins": 0.000781078590080142, "rewards/rejected": 0.00041966172284446657, "step": 630 }, { "epoch": 0.4610951008645533, "grad_norm": 2.919173002243042, "learning_rate": 4.9564300992714914e-08, "logits/chosen": -1.4291613101959229, "logits/rejected": -1.4245188236236572, "logps/chosen": -45.36858367919922, "logps/rejected": -48.030757904052734, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.001236187876202166, "rewards/margins": 0.0011594726238399744, "rewards/rejected": 7.671533967368305e-05, "step": 640 }, { "epoch": 0.46829971181556196, "grad_norm": 3.464174747467041, "learning_rate": 4.952447582110253e-08, "logits/chosen": -1.6131465435028076, "logits/rejected": -1.584380865097046, "logps/chosen": -45.44336700439453, "logps/rejected": -45.41891860961914, "loss": 0.6927, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0014582508010789752, "rewards/margins": 0.0009963499614968896, "rewards/rejected": 0.0004619006940629333, "step": 650 }, { "epoch": 0.4755043227665706, "grad_norm": 3.4227089881896973, "learning_rate": 4.948292668010676e-08, "logits/chosen": -1.5425691604614258, "logits/rejected": -1.5405323505401611, "logps/chosen": -47.156578063964844, "logps/rejected": -50.03830337524414, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0010936754988506436, "rewards/margins": 0.0011404131073504686, "rewards/rejected": -4.6737539378227666e-05, "step": 660 }, { "epoch": 0.4827089337175792, "grad_norm": 3.4312925338745117, "learning_rate": 4.943965649046064e-08, "logits/chosen": -1.5018900632858276, "logits/rejected": -1.4744553565979004, "logps/chosen": -49.83372116088867, "logps/rejected": -51.13875198364258, "loss": 0.693, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0009614540031179786, "rewards/margins": 0.0003561762277968228, "rewards/rejected": 0.0006052777171134949, "step": 670 }, { "epoch": 0.4899135446685879, "grad_norm": 4.607568740844727, "learning_rate": 4.9394668293879835e-08, "logits/chosen": -1.4445058107376099, "logits/rejected": -1.431208848953247, "logps/chosen": -49.71855926513672, "logps/rejected": -49.596534729003906, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0016611143946647644, "rewards/margins": 0.001147769158706069, "rewards/rejected": 0.0005133452359586954, "step": 680 }, { "epoch": 0.49711815561959655, "grad_norm": 3.329970598220825, "learning_rate": 4.93479652528488e-08, "logits/chosen": -1.5312420129776, "logits/rejected": -1.5205342769622803, "logps/chosen": -47.82133102416992, "logps/rejected": -50.619808197021484, "loss": 0.6927, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0012991353869438171, "rewards/margins": 0.000877738930284977, "rewards/rejected": 0.00042139639845117927, "step": 690 }, { "epoch": 0.5043227665706052, "grad_norm": 2.7837576866149902, "learning_rate": 4.929955065039848e-08, "logits/chosen": -1.54481840133667, "logits/rejected": -1.5314353704452515, "logps/chosen": -46.46809005737305, "logps/rejected": -49.25910949707031, "loss": 0.6927, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0013074753805994987, "rewards/margins": 0.0009138353052549064, "rewards/rejected": 0.00039364007534459233, "step": 700 }, { "epoch": 0.5115273775216138, "grad_norm": 2.817760705947876, "learning_rate": 4.92494278898755e-08, "logits/chosen": -1.5252554416656494, "logits/rejected": -1.509037733078003, "logps/chosen": -41.35044860839844, "logps/rejected": -43.40355682373047, "loss": 0.6926, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0012825509766116738, "rewards/margins": 0.000999027630314231, "rewards/rejected": 0.00028352331719361246, "step": 710 }, { "epoch": 0.5187319884726225, "grad_norm": 3.3338074684143066, "learning_rate": 4.9197600494702955e-08, "logits/chosen": -1.4963575601577759, "logits/rejected": -1.4812816381454468, "logps/chosen": -49.3184700012207, "logps/rejected": -52.44392013549805, "loss": 0.6928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0011441984679549932, "rewards/margins": 0.000659407174680382, "rewards/rejected": 0.00048479135148227215, "step": 720 }, { "epoch": 0.5259365994236311, "grad_norm": 2.8599021434783936, "learning_rate": 4.9144072108132725e-08, "logits/chosen": -1.5103414058685303, "logits/rejected": -1.4909043312072754, "logps/chosen": -48.92387771606445, "logps/rejected": -51.066978454589844, "loss": 0.6927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0015517466235905886, "rewards/margins": 0.0009926140774041414, "rewards/rejected": 0.0005591326043941081, "step": 730 }, { "epoch": 0.5331412103746398, "grad_norm": 2.899857521057129, "learning_rate": 4.908884649298937e-08, "logits/chosen": -1.5037367343902588, "logits/rejected": -1.5001060962677002, "logps/chosen": -46.71796417236328, "logps/rejected": -46.27198791503906, "loss": 0.6929, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0012326440773904324, "rewards/margins": 0.0005613928660750389, "rewards/rejected": 0.0006712513277307153, "step": 740 }, { "epoch": 0.5403458213256485, "grad_norm": 2.8896918296813965, "learning_rate": 4.903192753140557e-08, "logits/chosen": -1.5269062519073486, "logits/rejected": -1.510358214378357, "logps/chosen": -48.91157150268555, "logps/rejected": -50.0924072265625, "loss": 0.6925, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.001482138060964644, "rewards/margins": 0.001297360984608531, "rewards/rejected": 0.0001847770472522825, "step": 750 }, { "epoch": 0.547550432276657, "grad_norm": 3.3254764080047607, "learning_rate": 4.897331922454931e-08, "logits/chosen": -1.4530668258666992, "logits/rejected": -1.451474666595459, "logps/chosen": -45.5156135559082, "logps/rejected": -48.615623474121094, "loss": 0.6926, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.001580315874889493, "rewards/margins": 0.001087161828763783, "rewards/rejected": 0.0004931538132950664, "step": 760 }, { "epoch": 0.5547550432276657, "grad_norm": 3.2673330307006836, "learning_rate": 4.891302569234256e-08, "logits/chosen": -1.47372305393219, "logits/rejected": -1.4675675630569458, "logps/chosen": -43.22402572631836, "logps/rejected": -45.916168212890625, "loss": 0.6923, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0018752291798591614, "rewards/margins": 0.0017130204942077398, "rewards/rejected": 0.00016220868565142155, "step": 770 }, { "epoch": 0.5619596541786743, "grad_norm": 2.8122148513793945, "learning_rate": 4.8851051173171656e-08, "logits/chosen": -1.49901282787323, "logits/rejected": -1.4895904064178467, "logps/chosen": -48.40452575683594, "logps/rejected": -50.17125701904297, "loss": 0.6926, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0016933809965848923, "rewards/margins": 0.0010203216224908829, "rewards/rejected": 0.0006730594905093312, "step": 780 }, { "epoch": 0.569164265129683, "grad_norm": 2.8876476287841797, "learning_rate": 4.87874000235894e-08, "logits/chosen": -1.5472230911254883, "logits/rejected": -1.537274956703186, "logps/chosen": -49.910091400146484, "logps/rejected": -53.432830810546875, "loss": 0.6926, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.001698293723165989, "rewards/margins": 0.0011685159988701344, "rewards/rejected": 0.00052977807354182, "step": 790 }, { "epoch": 0.5763688760806917, "grad_norm": 3.3339130878448486, "learning_rate": 4.872207671800876e-08, "logits/chosen": -1.5249817371368408, "logits/rejected": -1.5136332511901855, "logps/chosen": -46.88755416870117, "logps/rejected": -47.896827697753906, "loss": 0.6925, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0017408020794391632, "rewards/margins": 0.0012650018325075507, "rewards/rejected": 0.00047580021782778203, "step": 800 }, { "epoch": 0.5835734870317003, "grad_norm": 2.6908926963806152, "learning_rate": 4.865508584838841e-08, "logits/chosen": -1.5177758932113647, "logits/rejected": -1.5210834741592407, "logps/chosen": -44.743553161621094, "logps/rejected": -47.87196731567383, "loss": 0.6926, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.001408976735547185, "rewards/margins": 0.0010177220683544874, "rewards/rejected": 0.00039125472540035844, "step": 810 }, { "epoch": 0.590778097982709, "grad_norm": 2.710764169692993, "learning_rate": 4.858643212390985e-08, "logits/chosen": -1.5524133443832397, "logits/rejected": -1.5305286645889282, "logps/chosen": -46.923805236816406, "logps/rejected": -47.56487274169922, "loss": 0.6923, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0020391198340803385, "rewards/margins": 0.001677272142842412, "rewards/rejected": 0.00036184763303026557, "step": 820 }, { "epoch": 0.5979827089337176, "grad_norm": 2.6352500915527344, "learning_rate": 4.851612037064643e-08, "logits/chosen": -1.5101532936096191, "logits/rejected": -1.5033133029937744, "logps/chosen": -41.79671859741211, "logps/rejected": -44.642845153808594, "loss": 0.6924, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0022669571917504072, "rewards/margins": 0.0015860407147556543, "rewards/rejected": 0.0006809166516177356, "step": 830 }, { "epoch": 0.6051873198847262, "grad_norm": 2.259105682373047, "learning_rate": 4.8444155531224065e-08, "logits/chosen": -1.5194116830825806, "logits/rejected": -1.5120502710342407, "logps/chosen": -47.2054328918457, "logps/rejected": -47.48673629760742, "loss": 0.6923, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0022509084083139896, "rewards/margins": 0.0016614611959084868, "rewards/rejected": 0.0005894473288208246, "step": 840 }, { "epoch": 0.6123919308357348, "grad_norm": 3.8317296504974365, "learning_rate": 4.8370542664473805e-08, "logits/chosen": -1.5284236669540405, "logits/rejected": -1.517364263534546, "logps/chosen": -47.17852783203125, "logps/rejected": -50.44862747192383, "loss": 0.6922, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.002234196290373802, "rewards/margins": 0.001899405149742961, "rewards/rejected": 0.00033479099511168897, "step": 850 }, { "epoch": 0.6195965417867435, "grad_norm": 2.7945921421051025, "learning_rate": 4.829528694507624e-08, "logits/chosen": -1.5346529483795166, "logits/rejected": -1.5194079875946045, "logps/chosen": -56.83978271484375, "logps/rejected": -56.75861358642578, "loss": 0.6922, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0023796563036739826, "rewards/margins": 0.0018080968875437975, "rewards/rejected": 0.0005715594161301851, "step": 860 }, { "epoch": 0.6268011527377522, "grad_norm": 3.107956647872925, "learning_rate": 4.821839366319768e-08, "logits/chosen": -1.5742651224136353, "logits/rejected": -1.5632001161575317, "logps/chosen": -47.60409927368164, "logps/rejected": -50.62064743041992, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": 0.001870837644673884, "rewards/margins": 0.0018480487633496523, "rewards/rejected": 2.2788974092691205e-05, "step": 870 }, { "epoch": 0.6340057636887608, "grad_norm": 3.052370309829712, "learning_rate": 4.813986822411833e-08, "logits/chosen": -1.5950231552124023, "logits/rejected": -1.5870921611785889, "logps/chosen": -46.46802520751953, "logps/rejected": -47.61067581176758, "loss": 0.6924, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0018212845316156745, "rewards/margins": 0.0015648396220058203, "rewards/rejected": 0.0002564448514021933, "step": 880 }, { "epoch": 0.6412103746397695, "grad_norm": 2.974740505218506, "learning_rate": 4.805971614785231e-08, "logits/chosen": -1.5934207439422607, "logits/rejected": -1.5831716060638428, "logps/chosen": -44.20450973510742, "logps/rejected": -45.84693145751953, "loss": 0.6923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.002194278407841921, "rewards/margins": 0.0017778873443603516, "rewards/rejected": 0.000416390917962417, "step": 890 }, { "epoch": 0.6484149855907781, "grad_norm": 3.176281690597534, "learning_rate": 4.797794306875963e-08, "logits/chosen": -1.4426735639572144, "logits/rejected": -1.4458198547363281, "logps/chosen": -52.887428283691406, "logps/rejected": -56.02144241333008, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0018950074445456266, "rewards/margins": 0.0015232457080855966, "rewards/rejected": 0.0003717617364600301, "step": 900 }, { "epoch": 0.6556195965417867, "grad_norm": 3.130652904510498, "learning_rate": 4.7894554735150076e-08, "logits/chosen": -1.4940052032470703, "logits/rejected": -1.4864368438720703, "logps/chosen": -50.41922378540039, "logps/rejected": -51.92753982543945, "loss": 0.6925, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0017981244018301368, "rewards/margins": 0.0012389495968818665, "rewards/rejected": 0.0005591747467406094, "step": 910 }, { "epoch": 0.6628242074927954, "grad_norm": 2.4986073970794678, "learning_rate": 4.7809557008879185e-08, "logits/chosen": -1.5263350009918213, "logits/rejected": -1.5145517587661743, "logps/chosen": -42.057559967041016, "logps/rejected": -43.96914291381836, "loss": 0.6921, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0024824230931699276, "rewards/margins": 0.0021964963525533676, "rewards/rejected": 0.00028592668240889907, "step": 920 }, { "epoch": 0.670028818443804, "grad_norm": 3.264617443084717, "learning_rate": 4.772295586493613e-08, "logits/chosen": -1.5924278497695923, "logits/rejected": -1.5789930820465088, "logps/chosen": -46.364784240722656, "logps/rejected": -48.81365203857422, "loss": 0.6922, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0025308418553322554, "rewards/margins": 0.001982019515708089, "rewards/rejected": 0.0005488225724548101, "step": 930 }, { "epoch": 0.6772334293948127, "grad_norm": 2.300140619277954, "learning_rate": 4.763475739102374e-08, "logits/chosen": -1.473141074180603, "logits/rejected": -1.468505620956421, "logps/chosen": -54.870941162109375, "logps/rejected": -55.6484489440918, "loss": 0.692, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002576081547886133, "rewards/margins": 0.0023079104721546173, "rewards/rejected": 0.0002681712794583291, "step": 940 }, { "epoch": 0.6844380403458213, "grad_norm": 2.9154794216156006, "learning_rate": 4.754496778713054e-08, "logits/chosen": -1.4293240308761597, "logits/rejected": -1.4433194398880005, "logps/chosen": -46.24222183227539, "logps/rejected": -50.87912368774414, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0029074286576360464, "rewards/margins": 0.0015344502171501517, "rewards/rejected": 0.0013729783240705729, "step": 950 }, { "epoch": 0.69164265129683, "grad_norm": 2.978912830352783, "learning_rate": 4.7453593365094926e-08, "logits/chosen": -1.5649144649505615, "logits/rejected": -1.5566378831863403, "logps/chosen": -48.9169807434082, "logps/rejected": -51.32488250732422, "loss": 0.6923, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0020612571388483047, "rewards/margins": 0.0017103856662288308, "rewards/rejected": 0.00035087167634628713, "step": 960 }, { "epoch": 0.6988472622478387, "grad_norm": 3.66062068939209, "learning_rate": 4.736064054816145e-08, "logits/chosen": -1.579488754272461, "logits/rejected": -1.5711945295333862, "logps/chosen": -44.430999755859375, "logps/rejected": -47.700523376464844, "loss": 0.6919, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.0031583986710757017, "rewards/margins": 0.0025061373598873615, "rewards/rejected": 0.0006522616604343057, "step": 970 }, { "epoch": 0.7060518731988472, "grad_norm": 2.772946357727051, "learning_rate": 4.726611587052933e-08, "logits/chosen": -1.4304895401000977, "logits/rejected": -1.4331814050674438, "logps/chosen": -50.69477462768555, "logps/rejected": -55.800628662109375, "loss": 0.6924, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0024115366395562887, "rewards/margins": 0.0015366144943982363, "rewards/rejected": 0.0008749221451580524, "step": 980 }, { "epoch": 0.7132564841498559, "grad_norm": 3.8805480003356934, "learning_rate": 4.71700259768931e-08, "logits/chosen": -1.5389564037322998, "logits/rejected": -1.5328103303909302, "logps/chosen": -50.487335205078125, "logps/rejected": -51.9775390625, "loss": 0.6923, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.002420094795525074, "rewards/margins": 0.001630530459806323, "rewards/rejected": 0.0007895643939264119, "step": 990 }, { "epoch": 0.7204610951008645, "grad_norm": 2.787208080291748, "learning_rate": 4.707237762197549e-08, "logits/chosen": -1.5227080583572388, "logits/rejected": -1.511089563369751, "logps/chosen": -47.02164840698242, "logps/rejected": -49.01847839355469, "loss": 0.6922, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0033734552562236786, "rewards/margins": 0.0018900551367551088, "rewards/rejected": 0.0014834003522992134, "step": 1000 }, { "epoch": 0.7276657060518732, "grad_norm": 3.9273159503936768, "learning_rate": 4.697317767005265e-08, "logits/chosen": -1.530651330947876, "logits/rejected": -1.519158959388733, "logps/chosen": -43.01877212524414, "logps/rejected": -44.823909759521484, "loss": 0.692, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0028608546126633883, "rewards/margins": 0.002323360648006201, "rewards/rejected": 0.0005374938482418656, "step": 1010 }, { "epoch": 0.7348703170028819, "grad_norm": 2.8428092002868652, "learning_rate": 4.6872433094471577e-08, "logits/chosen": -1.5484195947647095, "logits/rejected": -1.5358097553253174, "logps/chosen": -46.577796936035156, "logps/rejected": -48.426883697509766, "loss": 0.6924, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0024873907677829266, "rewards/margins": 0.0014864472905173898, "rewards/rejected": 0.0010009428951889277, "step": 1020 }, { "epoch": 0.7420749279538905, "grad_norm": 2.6796295642852783, "learning_rate": 4.677015097715994e-08, "logits/chosen": -1.4803837537765503, "logits/rejected": -1.4727258682250977, "logps/chosen": -43.49967575073242, "logps/rejected": -46.801334381103516, "loss": 0.6917, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003408711403608322, "rewards/margins": 0.0028357082046568394, "rewards/rejected": 0.0005730032571591437, "step": 1030 }, { "epoch": 0.7492795389048992, "grad_norm": 2.420775890350342, "learning_rate": 4.666633850812825e-08, "logits/chosen": -1.523738145828247, "logits/rejected": -1.507525086402893, "logps/chosen": -46.27619552612305, "logps/rejected": -48.205955505371094, "loss": 0.6922, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.002326422370970249, "rewards/margins": 0.0019570752047002316, "rewards/rejected": 0.00036934722447767854, "step": 1040 }, { "epoch": 0.7564841498559077, "grad_norm": 2.361326217651367, "learning_rate": 4.656100298496439e-08, "logits/chosen": -1.4339568614959717, "logits/rejected": -1.4203391075134277, "logps/chosen": -41.24075698852539, "logps/rejected": -44.33374786376953, "loss": 0.6916, "rewards/accuracies": 0.625, "rewards/chosen": 0.0036104682367295027, "rewards/margins": 0.003143618581816554, "rewards/rejected": 0.0004668496549129486, "step": 1050 }, { "epoch": 0.7636887608069164, "grad_norm": 3.083638906478882, "learning_rate": 4.6454151812320715e-08, "logits/chosen": -1.5105210542678833, "logits/rejected": -1.48483407497406, "logps/chosen": -47.20132064819336, "logps/rejected": -48.682518005371094, "loss": 0.6916, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0033471949864178896, "rewards/margins": 0.003038776572793722, "rewards/rejected": 0.0003084186464548111, "step": 1060 }, { "epoch": 0.770893371757925, "grad_norm": 3.846761465072632, "learning_rate": 4.6345792501393434e-08, "logits/chosen": -1.4992892742156982, "logits/rejected": -1.4937262535095215, "logps/chosen": -53.687469482421875, "logps/rejected": -57.72792434692383, "loss": 0.6916, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.003973551094532013, "rewards/margins": 0.003104776842519641, "rewards/rejected": 0.0008687739027664065, "step": 1070 }, { "epoch": 0.7780979827089337, "grad_norm": 3.0516538619995117, "learning_rate": 4.6235932669394676e-08, "logits/chosen": -1.5073257684707642, "logits/rejected": -1.4990612268447876, "logps/chosen": -48.099449157714844, "logps/rejected": -51.08936309814453, "loss": 0.6916, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004019264131784439, "rewards/margins": 0.003163648769259453, "rewards/rejected": 0.0008556157117709517, "step": 1080 }, { "epoch": 0.7853025936599424, "grad_norm": 3.5501997470855713, "learning_rate": 4.612458003901698e-08, "logits/chosen": -1.5299046039581299, "logits/rejected": -1.5243635177612305, "logps/chosen": -52.48954391479492, "logps/rejected": -56.07282257080078, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": 0.0027964431792497635, "rewards/margins": 0.0034413684625178576, "rewards/rejected": -0.0006449251086451113, "step": 1090 }, { "epoch": 0.792507204610951, "grad_norm": 3.1133222579956055, "learning_rate": 4.6011742437890476e-08, "logits/chosen": -1.5386149883270264, "logits/rejected": -1.515916347503662, "logps/chosen": -47.325008392333984, "logps/rejected": -48.78601837158203, "loss": 0.692, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.002987902145832777, "rewards/margins": 0.0024031363427639008, "rewards/rejected": 0.0005847656866535544, "step": 1100 }, { "epoch": 0.7997118155619597, "grad_norm": 2.2972168922424316, "learning_rate": 4.589742779803259e-08, "logits/chosen": -1.547487735748291, "logits/rejected": -1.5349434614181519, "logps/chosen": -46.49794387817383, "logps/rejected": -48.665802001953125, "loss": 0.6919, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0025873552076518536, "rewards/margins": 0.0025315138045698404, "rewards/rejected": 5.5841472203610465e-05, "step": 1110 }, { "epoch": 0.8069164265129684, "grad_norm": 2.634446620941162, "learning_rate": 4.5781644155290486e-08, "logits/chosen": -1.4874818325042725, "logits/rejected": -1.4774929285049438, "logps/chosen": -45.514652252197266, "logps/rejected": -46.51914596557617, "loss": 0.6914, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004184176214039326, "rewards/margins": 0.0034972827415913343, "rewards/rejected": 0.000686893705278635, "step": 1120 }, { "epoch": 0.8141210374639769, "grad_norm": 2.841136932373047, "learning_rate": 4.566439964877613e-08, "logits/chosen": -1.522335410118103, "logits/rejected": -1.5161679983139038, "logps/chosen": -43.42735290527344, "logps/rejected": -45.222328186035156, "loss": 0.6921, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.002366938628256321, "rewards/margins": 0.0020665768533945084, "rewards/rejected": 0.00030036186217330396, "step": 1130 }, { "epoch": 0.8213256484149856, "grad_norm": 2.926987409591675, "learning_rate": 4.554570252029421e-08, "logits/chosen": -1.5708011388778687, "logits/rejected": -1.5619392395019531, "logps/chosen": -46.80385971069336, "logps/rejected": -49.117591857910156, "loss": 0.6911, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.004434407688677311, "rewards/margins": 0.004192848689854145, "rewards/rejected": 0.00024155872233677655, "step": 1140 }, { "epoch": 0.8285302593659942, "grad_norm": 2.6681439876556396, "learning_rate": 4.542556111376274e-08, "logits/chosen": -1.5655062198638916, "logits/rejected": -1.5541120767593384, "logps/chosen": -48.73239517211914, "logps/rejected": -50.90290069580078, "loss": 0.6917, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.003030436811968684, "rewards/margins": 0.0029130682814866304, "rewards/rejected": 0.00011736871965695173, "step": 1150 }, { "epoch": 0.8357348703170029, "grad_norm": 3.0899651050567627, "learning_rate": 4.5303983874626506e-08, "logits/chosen": -1.5412328243255615, "logits/rejected": -1.5294244289398193, "logps/chosen": -50.64223098754883, "logps/rejected": -51.3464469909668, "loss": 0.6919, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002868660492822528, "rewards/margins": 0.002512868959456682, "rewards/rejected": 0.00035579182440415025, "step": 1160 }, { "epoch": 0.8429394812680115, "grad_norm": 3.553450345993042, "learning_rate": 4.518097934926339e-08, "logits/chosen": -1.4599460363388062, "logits/rejected": -1.4345753192901611, "logps/chosen": -46.791717529296875, "logps/rejected": -46.883766174316406, "loss": 0.6916, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.003114109393209219, "rewards/margins": 0.0031202633399516344, "rewards/rejected": -6.154028142191237e-06, "step": 1170 }, { "epoch": 0.8501440922190202, "grad_norm": 3.8637311458587646, "learning_rate": 4.505655618438363e-08, "logits/chosen": -1.4245116710662842, "logits/rejected": -1.4106541872024536, "logps/chosen": -48.68230438232422, "logps/rejected": -49.66895294189453, "loss": 0.6918, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.002959758508950472, "rewards/margins": 0.0026471882592886686, "rewards/rejected": 0.0003125699586234987, "step": 1180 }, { "epoch": 0.8573487031700289, "grad_norm": 2.923635244369507, "learning_rate": 4.4930723126421945e-08, "logits/chosen": -1.5897529125213623, "logits/rejected": -1.5666835308074951, "logps/chosen": -49.047271728515625, "logps/rejected": -50.365779876708984, "loss": 0.6915, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003261597827076912, "rewards/margins": 0.0032364025246351957, "rewards/rejected": 2.519541885703802e-05, "step": 1190 }, { "epoch": 0.8645533141210374, "grad_norm": 3.259798049926758, "learning_rate": 4.48034890209227e-08, "logits/chosen": -1.4653351306915283, "logits/rejected": -1.4455980062484741, "logps/chosen": -51.658111572265625, "logps/rejected": -53.60918426513672, "loss": 0.6915, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002871064469218254, "rewards/margins": 0.0033232111018151045, "rewards/rejected": -0.0004521465743891895, "step": 1200 }, { "epoch": 0.8717579250720461, "grad_norm": 2.646080493927002, "learning_rate": 4.4674862811918155e-08, "logits/chosen": -1.4465787410736084, "logits/rejected": -1.4438260793685913, "logps/chosen": -43.387939453125, "logps/rejected": -46.49829864501953, "loss": 0.6917, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0034131291322410107, "rewards/margins": 0.00300223333761096, "rewards/rejected": 0.0004108964931219816, "step": 1210 }, { "epoch": 0.8789625360230547, "grad_norm": 3.445535659790039, "learning_rate": 4.454485354129966e-08, "logits/chosen": -1.4952054023742676, "logits/rejected": -1.489473819732666, "logps/chosen": -46.59187698364258, "logps/rejected": -50.117591857910156, "loss": 0.6912, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0038944887928664684, "rewards/margins": 0.0038705854676663876, "rewards/rejected": 2.390358167758677e-05, "step": 1220 }, { "epoch": 0.8861671469740634, "grad_norm": 2.979490041732788, "learning_rate": 4.4413470348182124e-08, "logits/chosen": -1.4463694095611572, "logits/rejected": -1.4229583740234375, "logps/chosen": -48.736202239990234, "logps/rejected": -50.797584533691406, "loss": 0.6912, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0038722821045666933, "rewards/margins": 0.0038511925376951694, "rewards/rejected": 2.1089776055305265e-05, "step": 1230 }, { "epoch": 0.8933717579250721, "grad_norm": 3.6062870025634766, "learning_rate": 4.42807224682615e-08, "logits/chosen": -1.503015160560608, "logits/rejected": -1.490422248840332, "logps/chosen": -42.87591552734375, "logps/rejected": -45.96302032470703, "loss": 0.6909, "rewards/accuracies": 0.625, "rewards/chosen": 0.0038226053584367037, "rewards/margins": 0.004490352235734463, "rewards/rejected": -0.0006677471101284027, "step": 1240 }, { "epoch": 0.9005763688760807, "grad_norm": 2.5306854248046875, "learning_rate": 4.4146619233165604e-08, "logits/chosen": -1.551155686378479, "logits/rejected": -1.5466840267181396, "logps/chosen": -50.65498733520508, "logps/rejected": -54.01371383666992, "loss": 0.6917, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.002975714858621359, "rewards/margins": 0.0028749865014106035, "rewards/rejected": 0.00010072816803585738, "step": 1250 }, { "epoch": 0.9077809798270894, "grad_norm": 3.129319906234741, "learning_rate": 4.4011170069798126e-08, "logits/chosen": -1.5055732727050781, "logits/rejected": -1.5217446088790894, "logps/chosen": -46.49773025512695, "logps/rejected": -53.42799758911133, "loss": 0.6917, "rewards/accuracies": 0.5, "rewards/chosen": 0.0027830186299979687, "rewards/margins": 0.00282860454171896, "rewards/rejected": -4.558537693810649e-05, "step": 1260 }, { "epoch": 0.9149855907780979, "grad_norm": 3.1904845237731934, "learning_rate": 4.387438449967594e-08, "logits/chosen": -1.454516053199768, "logits/rejected": -1.4410918951034546, "logps/chosen": -45.38161087036133, "logps/rejected": -47.975074768066406, "loss": 0.6907, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004763299599289894, "rewards/margins": 0.00504193315282464, "rewards/rejected": -0.00027863297145813704, "step": 1270 }, { "epoch": 0.9221902017291066, "grad_norm": 3.450941324234009, "learning_rate": 4.373627213825983e-08, "logits/chosen": -1.6091142892837524, "logits/rejected": -1.5993342399597168, "logps/chosen": -46.181339263916016, "logps/rejected": -49.70648193359375, "loss": 0.6904, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0049495515413582325, "rewards/margins": 0.005530401133000851, "rewards/rejected": -0.0005808495916426182, "step": 1280 }, { "epoch": 0.9293948126801153, "grad_norm": 2.4806911945343018, "learning_rate": 4.359684269427848e-08, "logits/chosen": -1.5663807392120361, "logits/rejected": -1.563518762588501, "logps/chosen": -45.587608337402344, "logps/rejected": -49.11951446533203, "loss": 0.6911, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.004525287542492151, "rewards/margins": 0.004190221894532442, "rewards/rejected": 0.0003350655024405569, "step": 1290 }, { "epoch": 0.9365994236311239, "grad_norm": 3.0177292823791504, "learning_rate": 4.34561059690461e-08, "logits/chosen": -1.609378457069397, "logits/rejected": -1.6081126928329468, "logps/chosen": -47.325008392333984, "logps/rejected": -48.884395599365234, "loss": 0.6925, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0018209271365776658, "rewards/margins": 0.0013891037087887526, "rewards/rejected": 0.0004318233986850828, "step": 1300 }, { "epoch": 0.9438040345821326, "grad_norm": 2.676215171813965, "learning_rate": 4.3314071855773314e-08, "logits/chosen": -1.5698845386505127, "logits/rejected": -1.5718752145767212, "logps/chosen": -41.882728576660156, "logps/rejected": -45.07906723022461, "loss": 0.6913, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0035904725082218647, "rewards/margins": 0.0036595568526536226, "rewards/rejected": -6.908467912580818e-05, "step": 1310 }, { "epoch": 0.9510086455331412, "grad_norm": 3.0929179191589355, "learning_rate": 4.3170750338871806e-08, "logits/chosen": -1.5062233209609985, "logits/rejected": -1.491188406944275, "logps/chosen": -46.517478942871094, "logps/rejected": -49.685264587402344, "loss": 0.6909, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004791845567524433, "rewards/margins": 0.004565626382827759, "rewards/rejected": 0.00022621969401370734, "step": 1320 }, { "epoch": 0.9582132564841499, "grad_norm": 3.0005862712860107, "learning_rate": 4.3026151493252414e-08, "logits/chosen": -1.5496270656585693, "logits/rejected": -1.5283520221710205, "logps/chosen": -51.4772834777832, "logps/rejected": -52.89916229248047, "loss": 0.6908, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004191336687654257, "rewards/margins": 0.0047586942091584206, "rewards/rejected": -0.0005673574050888419, "step": 1330 }, { "epoch": 0.9654178674351584, "grad_norm": 3.5196878910064697, "learning_rate": 4.2880285483616895e-08, "logits/chosen": -1.5336337089538574, "logits/rejected": -1.530709981918335, "logps/chosen": -45.714988708496094, "logps/rejected": -49.015140533447266, "loss": 0.6912, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.003844538237899542, "rewards/margins": 0.0039407783187925816, "rewards/rejected": -9.623957157600671e-05, "step": 1340 }, { "epoch": 0.9726224783861671, "grad_norm": 2.7437596321105957, "learning_rate": 4.273316256374342e-08, "logits/chosen": -1.4035046100616455, "logits/rejected": -1.398699164390564, "logps/chosen": -52.25505447387695, "logps/rejected": -53.14892578125, "loss": 0.6912, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004184540826827288, "rewards/margins": 0.0038869075942784548, "rewards/rejected": 0.0002976330870296806, "step": 1350 }, { "epoch": 0.9798270893371758, "grad_norm": 3.3687148094177246, "learning_rate": 4.258479307576576e-08, "logits/chosen": -1.5007444620132446, "logits/rejected": -1.4955756664276123, "logps/chosen": -43.784767150878906, "logps/rejected": -45.702003479003906, "loss": 0.6903, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.005375358276069164, "rewards/margins": 0.005684197880327702, "rewards/rejected": -0.00030883969157002866, "step": 1360 }, { "epoch": 0.9870317002881844, "grad_norm": 2.7493855953216553, "learning_rate": 4.243518744944626e-08, "logits/chosen": -1.5043120384216309, "logits/rejected": -1.5005708932876587, "logps/chosen": -43.27696228027344, "logps/rejected": -47.130455017089844, "loss": 0.6909, "rewards/accuracies": 0.625, "rewards/chosen": 0.003993698861449957, "rewards/margins": 0.004580583423376083, "rewards/rejected": -0.0005868846783414483, "step": 1370 }, { "epoch": 0.9942363112391931, "grad_norm": 3.398181915283203, "learning_rate": 4.22843562014427e-08, "logits/chosen": -1.449882984161377, "logits/rejected": -1.4400885105133057, "logps/chosen": -46.8693962097168, "logps/rejected": -49.02799987792969, "loss": 0.6918, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004130179528146982, "rewards/margins": 0.0028191660530865192, "rewards/rejected": 0.001311013475060463, "step": 1380 }, { "epoch": 1.0014409221902016, "grad_norm": 3.157179832458496, "learning_rate": 4.2132309934569e-08, "logits/chosen": -1.5672855377197266, "logits/rejected": -1.562207579612732, "logps/chosen": -43.753822326660156, "logps/rejected": -46.12738800048828, "loss": 0.6913, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0052392082288861275, "rewards/margins": 0.0038300647865980864, "rewards/rejected": 0.0014091433258727193, "step": 1390 }, { "epoch": 1.0086455331412103, "grad_norm": 2.479654312133789, "learning_rate": 4.197905933704989e-08, "logits/chosen": -1.4313812255859375, "logits/rejected": -1.4214115142822266, "logps/chosen": -47.267311096191406, "logps/rejected": -49.97428512573242, "loss": 0.6896, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.005089783109724522, "rewards/margins": 0.007234425283968449, "rewards/rejected": -0.002144642174243927, "step": 1400 }, { "epoch": 1.015850144092219, "grad_norm": 2.7182703018188477, "learning_rate": 4.1824615181769577e-08, "logits/chosen": -1.486061453819275, "logits/rejected": -1.492903232574463, "logps/chosen": -43.82571792602539, "logps/rejected": -47.75173568725586, "loss": 0.6902, "rewards/accuracies": 0.625, "rewards/chosen": 0.004842250142246485, "rewards/margins": 0.005944275297224522, "rewards/rejected": -0.0011020256206393242, "step": 1410 }, { "epoch": 1.0230547550432276, "grad_norm": 3.1151888370513916, "learning_rate": 4.1668988325514434e-08, "logits/chosen": -1.5238953828811646, "logits/rejected": -1.5135290622711182, "logps/chosen": -49.40985107421875, "logps/rejected": -52.1383171081543, "loss": 0.6898, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.005019115749746561, "rewards/margins": 0.006819140166044235, "rewards/rejected": -0.0018000241834670305, "step": 1420 }, { "epoch": 1.0302593659942363, "grad_norm": 3.011781930923462, "learning_rate": 4.1512189708209844e-08, "logits/chosen": -1.573925256729126, "logits/rejected": -1.5644539594650269, "logps/chosen": -38.25946807861328, "logps/rejected": -39.431949615478516, "loss": 0.6908, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004985497798770666, "rewards/margins": 0.0047707511112093925, "rewards/rejected": 0.0002147464983863756, "step": 1430 }, { "epoch": 1.037463976945245, "grad_norm": 3.615877866744995, "learning_rate": 4.1354230352151143e-08, "logits/chosen": -1.504152536392212, "logits/rejected": -1.4912039041519165, "logps/chosen": -56.4444694519043, "logps/rejected": -56.67643356323242, "loss": 0.6906, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004153282381594181, "rewards/margins": 0.005227471701800823, "rewards/rejected": -0.0010741890873759985, "step": 1440 }, { "epoch": 1.0446685878962536, "grad_norm": 2.5767173767089844, "learning_rate": 4.119512136122882e-08, "logits/chosen": -1.6086022853851318, "logits/rejected": -1.6209228038787842, "logps/chosen": -42.2785530090332, "logps/rejected": -48.44477081298828, "loss": 0.6897, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.004324558191001415, "rewards/margins": 0.006945625878870487, "rewards/rejected": -0.002621066989377141, "step": 1450 }, { "epoch": 1.0518731988472623, "grad_norm": 3.475733757019043, "learning_rate": 4.103487392014795e-08, "logits/chosen": -1.475531816482544, "logits/rejected": -1.4568547010421753, "logps/chosen": -46.373329162597656, "logps/rejected": -51.0313835144043, "loss": 0.6888, "rewards/accuracies": 0.6875, "rewards/chosen": 0.005704123992472887, "rewards/margins": 0.008804896846413612, "rewards/rejected": -0.0031007737852633, "step": 1460 }, { "epoch": 1.059077809798271, "grad_norm": 2.9147305488586426, "learning_rate": 4.087349929364192e-08, "logits/chosen": -1.5652912855148315, "logits/rejected": -1.544355869293213, "logps/chosen": -42.559722900390625, "logps/rejected": -45.870784759521484, "loss": 0.6897, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004617183469235897, "rewards/margins": 0.0069281430914998055, "rewards/rejected": -0.002310959156602621, "step": 1470 }, { "epoch": 1.0662824207492796, "grad_norm": 2.4540820121765137, "learning_rate": 4.0711008825680645e-08, "logits/chosen": -1.5043457746505737, "logits/rejected": -1.4851245880126953, "logps/chosen": -47.31243896484375, "logps/rejected": -50.1568489074707, "loss": 0.6905, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004572049714624882, "rewards/margins": 0.005410096608102322, "rewards/rejected": -0.0008380465442314744, "step": 1480 }, { "epoch": 1.0734870317002883, "grad_norm": 3.5471794605255127, "learning_rate": 4.054741393867306e-08, "logits/chosen": -1.4756460189819336, "logits/rejected": -1.4643748998641968, "logps/chosen": -54.061676025390625, "logps/rejected": -55.685035705566406, "loss": 0.6903, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004487854428589344, "rewards/margins": 0.005812914576381445, "rewards/rejected": -0.0013250606134533882, "step": 1490 }, { "epoch": 1.080691642651297, "grad_norm": 2.9531631469726562, "learning_rate": 4.038272613266419e-08, "logits/chosen": -1.5454641580581665, "logits/rejected": -1.5215752124786377, "logps/chosen": -44.894840240478516, "logps/rejected": -47.37709045410156, "loss": 0.6903, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004894919227808714, "rewards/margins": 0.005796266719698906, "rewards/rejected": -0.000901346851605922, "step": 1500 }, { "epoch": 1.0878962536023056, "grad_norm": 3.38706111907959, "learning_rate": 4.0216956984526784e-08, "logits/chosen": -1.5542356967926025, "logits/rejected": -1.550040364265442, "logps/chosen": -42.899436950683594, "logps/rejected": -45.57080078125, "loss": 0.6897, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005091445986181498, "rewards/margins": 0.0070404186844825745, "rewards/rejected": -0.0019489733967930079, "step": 1510 }, { "epoch": 1.0951008645533142, "grad_norm": 3.101982831954956, "learning_rate": 4.0050118147147446e-08, "logits/chosen": -1.5179659128189087, "logits/rejected": -1.509966254234314, "logps/chosen": -53.40665817260742, "logps/rejected": -52.11634063720703, "loss": 0.6921, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0031676869839429855, "rewards/margins": 0.0020833946764469147, "rewards/rejected": 0.0010842925403267145, "step": 1520 }, { "epoch": 1.1023054755043227, "grad_norm": 3.091132640838623, "learning_rate": 3.988222134860755e-08, "logits/chosen": -1.563659906387329, "logits/rejected": -1.5506489276885986, "logps/chosen": -47.28032684326172, "logps/rejected": -51.66279983520508, "loss": 0.6899, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.005120830610394478, "rewards/margins": 0.0066344826482236385, "rewards/rejected": -0.0015136522706598043, "step": 1530 }, { "epoch": 1.1095100864553313, "grad_norm": 3.1105594635009766, "learning_rate": 3.9713278391358724e-08, "logits/chosen": -1.5746492147445679, "logits/rejected": -1.5628042221069336, "logps/chosen": -45.93760681152344, "logps/rejected": -49.18937301635742, "loss": 0.6903, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.005176537670195103, "rewards/margins": 0.005757451057434082, "rewards/rejected": -0.0005809130962006748, "step": 1540 }, { "epoch": 1.11671469740634, "grad_norm": 2.4670963287353516, "learning_rate": 3.954330115139328e-08, "logits/chosen": -1.5431309938430786, "logits/rejected": -1.5327889919281006, "logps/chosen": -46.76984405517578, "logps/rejected": -48.848941802978516, "loss": 0.6899, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.004580738488584757, "rewards/margins": 0.006636142730712891, "rewards/rejected": -0.0020554042421281338, "step": 1550 }, { "epoch": 1.1239193083573487, "grad_norm": 4.051297187805176, "learning_rate": 3.937230157740931e-08, "logits/chosen": -1.5918155908584595, "logits/rejected": -1.5730479955673218, "logps/chosen": -47.90183639526367, "logps/rejected": -51.474891662597656, "loss": 0.6892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006330148316919804, "rewards/margins": 0.007905492559075356, "rewards/rejected": -0.0015753433108329773, "step": 1560 }, { "epoch": 1.1311239193083573, "grad_norm": 2.364239454269409, "learning_rate": 3.920029168997077e-08, "logits/chosen": -1.5559653043746948, "logits/rejected": -1.5427926778793335, "logps/chosen": -48.723899841308594, "logps/rejected": -51.545448303222656, "loss": 0.6903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004865005612373352, "rewards/margins": 0.005895170383155346, "rewards/rejected": -0.001030164072290063, "step": 1570 }, { "epoch": 1.138328530259366, "grad_norm": 3.9678614139556885, "learning_rate": 3.9027283580662476e-08, "logits/chosen": -1.5198521614074707, "logits/rejected": -1.5075480937957764, "logps/chosen": -49.6241340637207, "logps/rejected": -52.78132247924805, "loss": 0.6888, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005017532967031002, "rewards/margins": 0.008875850588083267, "rewards/rejected": -0.003858317853882909, "step": 1580 }, { "epoch": 1.1455331412103746, "grad_norm": 3.9845945835113525, "learning_rate": 3.885328941124014e-08, "logits/chosen": -1.5015017986297607, "logits/rejected": -1.488577127456665, "logps/chosen": -45.922607421875, "logps/rejected": -50.63169860839844, "loss": 0.6896, "rewards/accuracies": 0.59375, "rewards/chosen": 0.005352140404284, "rewards/margins": 0.007262455765157938, "rewards/rejected": -0.0019103146623820066, "step": 1590 }, { "epoch": 1.1527377521613833, "grad_norm": 3.0348169803619385, "learning_rate": 3.867832141277539e-08, "logits/chosen": -1.5485866069793701, "logits/rejected": -1.5293524265289307, "logps/chosen": -49.10395812988281, "logps/rejected": -51.272361755371094, "loss": 0.69, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.004336564801633358, "rewards/margins": 0.006302339024841785, "rewards/rejected": -0.0019657742232084274, "step": 1600 }, { "epoch": 1.159942363112392, "grad_norm": 3.4307823181152344, "learning_rate": 3.850239188479606e-08, "logits/chosen": -1.4621508121490479, "logits/rejected": -1.4584615230560303, "logps/chosen": -46.74929428100586, "logps/rejected": -49.1221809387207, "loss": 0.69, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004755138885229826, "rewards/margins": 0.006490757223218679, "rewards/rejected": -0.0017356185708194971, "step": 1610 }, { "epoch": 1.1671469740634006, "grad_norm": 3.5623254776000977, "learning_rate": 3.832551319442151e-08, "logits/chosen": -1.5857195854187012, "logits/rejected": -1.5824106931686401, "logps/chosen": -49.597537994384766, "logps/rejected": -53.8158073425293, "loss": 0.6899, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.00521459337323904, "rewards/margins": 0.006661839783191681, "rewards/rejected": -0.0014472461771219969, "step": 1620 }, { "epoch": 1.1743515850144093, "grad_norm": 4.247158527374268, "learning_rate": 3.81476977754933e-08, "logits/chosen": -1.3996719121932983, "logits/rejected": -1.3850539922714233, "logps/chosen": -51.30238723754883, "logps/rejected": -50.659263610839844, "loss": 0.6901, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.004295485559850931, "rewards/margins": 0.006129544228315353, "rewards/rejected": -0.0018340591341257095, "step": 1630 }, { "epoch": 1.181556195965418, "grad_norm": 2.404221773147583, "learning_rate": 3.796895812770114e-08, "logits/chosen": -1.5022691488265991, "logits/rejected": -1.4923193454742432, "logps/chosen": -45.77785110473633, "logps/rejected": -47.2498664855957, "loss": 0.6894, "rewards/accuracies": 0.5625, "rewards/chosen": 0.005898059345781803, "rewards/margins": 0.007563650608062744, "rewards/rejected": -0.00166559056378901, "step": 1640 }, { "epoch": 1.1887608069164266, "grad_norm": 3.0766351222991943, "learning_rate": 3.7789306815704216e-08, "logits/chosen": -1.5271815061569214, "logits/rejected": -1.5167516469955444, "logps/chosen": -40.93082046508789, "logps/rejected": -42.0340461730957, "loss": 0.6908, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0035404022783041, "rewards/margins": 0.004842911381274462, "rewards/rejected": -0.0013025096850469708, "step": 1650 }, { "epoch": 1.195965417867435, "grad_norm": 2.643650531768799, "learning_rate": 3.760875646824795e-08, "logits/chosen": -1.395875334739685, "logits/rejected": -1.3972784280776978, "logps/chosen": -46.130271911621094, "logps/rejected": -48.43110656738281, "loss": 0.6896, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.003639611881226301, "rewards/margins": 0.007139836438000202, "rewards/rejected": -0.00350022385828197, "step": 1660 }, { "epoch": 1.2031700288184437, "grad_norm": 3.6421399116516113, "learning_rate": 3.742731977727623e-08, "logits/chosen": -1.5392658710479736, "logits/rejected": -1.5320463180541992, "logps/chosen": -45.195159912109375, "logps/rejected": -49.122718811035156, "loss": 0.6894, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.006233454681932926, "rewards/margins": 0.007511253468692303, "rewards/rejected": -0.0012777980882674456, "step": 1670 }, { "epoch": 1.2103746397694524, "grad_norm": 3.6778624057769775, "learning_rate": 3.7245009497039244e-08, "logits/chosen": -1.4356926679611206, "logits/rejected": -1.4204113483428955, "logps/chosen": -45.431312561035156, "logps/rejected": -49.4974479675293, "loss": 0.6892, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0038647849578410387, "rewards/margins": 0.007974617183208466, "rewards/rejected": -0.0041098324581980705, "step": 1680 }, { "epoch": 1.217579250720461, "grad_norm": 2.6512179374694824, "learning_rate": 3.7061838443196886e-08, "logits/chosen": -1.5117601156234741, "logits/rejected": -1.5023605823516846, "logps/chosen": -50.03330612182617, "logps/rejected": -52.160560607910156, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": 0.006138836033642292, "rewards/margins": 0.009467920288443565, "rewards/rejected": -0.003329083789139986, "step": 1690 }, { "epoch": 1.2247838616714697, "grad_norm": 3.0043179988861084, "learning_rate": 3.68778194919179e-08, "logits/chosen": -1.4723100662231445, "logits/rejected": -1.4666262865066528, "logps/chosen": -50.07857894897461, "logps/rejected": -53.3220100402832, "loss": 0.6878, "rewards/accuracies": 0.6875, "rewards/chosen": 0.008099446073174477, "rewards/margins": 0.010907831601798534, "rewards/rejected": -0.0028083862271159887, "step": 1700 }, { "epoch": 1.2319884726224783, "grad_norm": 3.6292684078216553, "learning_rate": 3.66929655789747e-08, "logits/chosen": -1.571274995803833, "logits/rejected": -1.5532617568969727, "logps/chosen": -41.95909118652344, "logps/rejected": -46.51947784423828, "loss": 0.6889, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.005592281464487314, "rewards/margins": 0.008521712385118008, "rewards/rejected": -0.002929429989308119, "step": 1710 }, { "epoch": 1.239193083573487, "grad_norm": 2.337404727935791, "learning_rate": 3.6507289698834064e-08, "logits/chosen": -1.471616506576538, "logits/rejected": -1.4550542831420898, "logps/chosen": -43.53514862060547, "logps/rejected": -46.06405258178711, "loss": 0.689, "rewards/accuracies": 0.59375, "rewards/chosen": 0.005335145629942417, "rewards/margins": 0.008476309478282928, "rewards/rejected": -0.0031411624513566494, "step": 1720 }, { "epoch": 1.2463976945244957, "grad_norm": 4.029110431671143, "learning_rate": 3.6320804903743684e-08, "logits/chosen": -1.5162017345428467, "logits/rejected": -1.5111761093139648, "logps/chosen": -45.39423370361328, "logps/rejected": -49.17229080200195, "loss": 0.6887, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0036242641508579254, "rewards/margins": 0.009098870679736137, "rewards/rejected": -0.005474607460200787, "step": 1730 }, { "epoch": 1.2536023054755043, "grad_norm": 2.6486778259277344, "learning_rate": 3.61335243028146e-08, "logits/chosen": -1.495854377746582, "logits/rejected": -1.4898358583450317, "logps/chosen": -48.8822135925293, "logps/rejected": -51.64506912231445, "loss": 0.689, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004486840683966875, "rewards/margins": 0.00841403380036354, "rewards/rejected": -0.003927193582057953, "step": 1740 }, { "epoch": 1.260806916426513, "grad_norm": 3.2701268196105957, "learning_rate": 3.5945461061099736e-08, "logits/chosen": -1.4389398097991943, "logits/rejected": -1.4077644348144531, "logps/chosen": -50.76543426513672, "logps/rejected": -49.68498992919922, "loss": 0.6871, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.006598909851163626, "rewards/margins": 0.012235610745847225, "rewards/rejected": -0.0056367008946835995, "step": 1750 }, { "epoch": 1.2680115273775217, "grad_norm": 2.968621253967285, "learning_rate": 3.5756628398668446e-08, "logits/chosen": -1.5589392185211182, "logits/rejected": -1.5580534934997559, "logps/chosen": -51.238853454589844, "logps/rejected": -53.6939697265625, "loss": 0.6885, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.003833180759102106, "rewards/margins": 0.009422466158866882, "rewards/rejected": -0.005589285399764776, "step": 1760 }, { "epoch": 1.2752161383285303, "grad_norm": 2.652916193008423, "learning_rate": 3.556703958967716e-08, "logits/chosen": -1.557422399520874, "logits/rejected": -1.543720006942749, "logps/chosen": -44.34943771362305, "logps/rejected": -47.97236251831055, "loss": 0.6897, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0036465474404394627, "rewards/margins": 0.007039936725050211, "rewards/rejected": -0.003393388818949461, "step": 1770 }, { "epoch": 1.282420749279539, "grad_norm": 4.006654739379883, "learning_rate": 3.5376707961436297e-08, "logits/chosen": -1.5315957069396973, "logits/rejected": -1.5152517557144165, "logps/chosen": -53.33189010620117, "logps/rejected": -53.46404266357422, "loss": 0.6905, "rewards/accuracies": 0.53125, "rewards/chosen": 0.004476548638194799, "rewards/margins": 0.005307585000991821, "rewards/rejected": -0.0008310364792123437, "step": 1780 }, { "epoch": 1.2896253602305476, "grad_norm": 2.423352003097534, "learning_rate": 3.51856468934734e-08, "logits/chosen": -1.4921244382858276, "logits/rejected": -1.4953742027282715, "logps/chosen": -46.34716033935547, "logps/rejected": -48.64600372314453, "loss": 0.6912, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004633346572518349, "rewards/margins": 0.00402105925604701, "rewards/rejected": 0.0006122869672253728, "step": 1790 }, { "epoch": 1.2968299711815563, "grad_norm": 3.358710289001465, "learning_rate": 3.499386981659262e-08, "logits/chosen": -1.5787866115570068, "logits/rejected": -1.570294737815857, "logps/chosen": -45.49131774902344, "logps/rejected": -51.6963996887207, "loss": 0.6891, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.006135477218776941, "rewards/margins": 0.008345494046807289, "rewards/rejected": -0.002210016595199704, "step": 1800 }, { "epoch": 1.304034582132565, "grad_norm": 2.5438549518585205, "learning_rate": 3.480139021193057e-08, "logits/chosen": -1.4628260135650635, "logits/rejected": -1.4626216888427734, "logps/chosen": -46.477264404296875, "logps/rejected": -49.938323974609375, "loss": 0.6895, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004061608109623194, "rewards/margins": 0.007341681979596615, "rewards/rejected": -0.003280073869973421, "step": 1810 }, { "epoch": 1.3112391930835736, "grad_norm": 4.097665786743164, "learning_rate": 3.4608221610008666e-08, "logits/chosen": -1.554359793663025, "logits/rejected": -1.5441957712173462, "logps/chosen": -40.67582321166992, "logps/rejected": -45.34405517578125, "loss": 0.6876, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005710783414542675, "rewards/margins": 0.011185348965227604, "rewards/rejected": -0.005474564619362354, "step": 1820 }, { "epoch": 1.318443804034582, "grad_norm": 2.3061835765838623, "learning_rate": 3.4414377589782e-08, "logits/chosen": -1.489579439163208, "logits/rejected": -1.4888752698898315, "logps/chosen": -44.27809143066406, "logps/rejected": -46.678611755371094, "loss": 0.689, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.003133392659947276, "rewards/margins": 0.008380794897675514, "rewards/rejected": -0.0052474020048975945, "step": 1830 }, { "epoch": 1.3256484149855907, "grad_norm": 2.2423462867736816, "learning_rate": 3.4219871777684745e-08, "logits/chosen": -1.5047807693481445, "logits/rejected": -1.4803636074066162, "logps/chosen": -48.244483947753906, "logps/rejected": -49.67208480834961, "loss": 0.6888, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004430696833878756, "rewards/margins": 0.008876333944499493, "rewards/rejected": -0.004445637576282024, "step": 1840 }, { "epoch": 1.3328530259365994, "grad_norm": 3.133010149002075, "learning_rate": 3.4024717846672364e-08, "logits/chosen": -1.55430006980896, "logits/rejected": -1.5407884120941162, "logps/chosen": -43.84967803955078, "logps/rejected": -47.088844299316406, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": 0.003224353538826108, "rewards/margins": 0.009443853981792927, "rewards/rejected": -0.0062195006757974625, "step": 1850 }, { "epoch": 1.340057636887608, "grad_norm": 3.2332677841186523, "learning_rate": 3.382892951526036e-08, "logits/chosen": -1.5088350772857666, "logits/rejected": -1.4985536336898804, "logps/chosen": -48.5511589050293, "logps/rejected": -53.52519607543945, "loss": 0.688, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004791603423655033, "rewards/margins": 0.01051848940551281, "rewards/rejected": -0.005726885981857777, "step": 1860 }, { "epoch": 1.3472622478386167, "grad_norm": 3.0702571868896484, "learning_rate": 3.3632520546559974e-08, "logits/chosen": -1.477571964263916, "logits/rejected": -1.4503300189971924, "logps/chosen": -42.1833610534668, "logps/rejected": -46.2810173034668, "loss": 0.6883, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.005192961078137159, "rewards/margins": 0.0098468828946352, "rewards/rejected": -0.004653920419514179, "step": 1870 }, { "epoch": 1.3544668587896254, "grad_norm": 3.413985013961792, "learning_rate": 3.34355047473107e-08, "logits/chosen": -1.509441614151001, "logits/rejected": -1.4929606914520264, "logps/chosen": -49.14087677001953, "logps/rejected": -50.32634353637695, "loss": 0.6892, "rewards/accuracies": 0.59375, "rewards/chosen": 0.003523029386997223, "rewards/margins": 0.00808628834784031, "rewards/rejected": -0.004563258029520512, "step": 1880 }, { "epoch": 1.361671469740634, "grad_norm": 3.261613607406616, "learning_rate": 3.323789596690971e-08, "logits/chosen": -1.4438692331314087, "logits/rejected": -1.4406040906906128, "logps/chosen": -46.01984405517578, "logps/rejected": -50.35694122314453, "loss": 0.6885, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.004411002155393362, "rewards/margins": 0.009367231279611588, "rewards/rejected": -0.0049562300555408, "step": 1890 }, { "epoch": 1.3688760806916427, "grad_norm": 2.0379562377929688, "learning_rate": 3.303970809643828e-08, "logits/chosen": -1.5255157947540283, "logits/rejected": -1.5278053283691406, "logps/chosen": -45.327415466308594, "logps/rejected": -49.04973220825195, "loss": 0.689, "rewards/accuracies": 0.59375, "rewards/chosen": 0.005483666434884071, "rewards/margins": 0.008489950560033321, "rewards/rejected": -0.0030062845908105373, "step": 1900 }, { "epoch": 1.3760806916426513, "grad_norm": 2.9973742961883545, "learning_rate": 3.2840955067685356e-08, "logits/chosen": -1.563256859779358, "logits/rejected": -1.5630435943603516, "logps/chosen": -46.008609771728516, "logps/rejected": -50.46318054199219, "loss": 0.6875, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0052226148545742035, "rewards/margins": 0.011490847915410995, "rewards/rejected": -0.006268233992159367, "step": 1910 }, { "epoch": 1.38328530259366, "grad_norm": 2.7613437175750732, "learning_rate": 3.264165085216817e-08, "logits/chosen": -1.580041766166687, "logits/rejected": -1.5727280378341675, "logps/chosen": -38.53578186035156, "logps/rejected": -43.83596420288086, "loss": 0.6887, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0044464850798249245, "rewards/margins": 0.009077567607164383, "rewards/rejected": -0.004631082061678171, "step": 1920 }, { "epoch": 1.3904899135446687, "grad_norm": 4.180810451507568, "learning_rate": 3.244180946015008e-08, "logits/chosen": -1.4442589282989502, "logits/rejected": -1.435718297958374, "logps/chosen": -52.15314483642578, "logps/rejected": -53.91064453125, "loss": 0.6896, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.004610007628798485, "rewards/margins": 0.007277486380189657, "rewards/rejected": -0.0026674787513911724, "step": 1930 }, { "epoch": 1.397694524495677, "grad_norm": 2.520510196685791, "learning_rate": 3.224144493965578e-08, "logits/chosen": -1.5796912908554077, "logits/rejected": -1.5778270959854126, "logps/chosen": -43.635459899902344, "logps/rejected": -45.74983215332031, "loss": 0.6891, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.003754237201064825, "rewards/margins": 0.008241266012191772, "rewards/rejected": -0.00448702834546566, "step": 1940 }, { "epoch": 1.4048991354466858, "grad_norm": 2.8245351314544678, "learning_rate": 3.204057137548371e-08, "logits/chosen": -1.5312103033065796, "logits/rejected": -1.5232504606246948, "logps/chosen": -43.69184112548828, "logps/rejected": -47.32173156738281, "loss": 0.6876, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004152680281549692, "rewards/margins": 0.011306528933346272, "rewards/rejected": -0.007153847720474005, "step": 1950 }, { "epoch": 1.4121037463976944, "grad_norm": 3.71762752532959, "learning_rate": 3.183920288821597e-08, "logits/chosen": -1.4898974895477295, "logits/rejected": -1.4814989566802979, "logps/chosen": -45.30360412597656, "logps/rejected": -49.977867126464844, "loss": 0.6872, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004725744016468525, "rewards/margins": 0.012062130495905876, "rewards/rejected": -0.0073363869450986385, "step": 1960 }, { "epoch": 1.419308357348703, "grad_norm": 3.813885450363159, "learning_rate": 3.1637353633225735e-08, "logits/chosen": -1.5400512218475342, "logits/rejected": -1.5293104648590088, "logps/chosen": -41.25631332397461, "logps/rejected": -45.72794723510742, "loss": 0.6869, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.003288590582087636, "rewards/margins": 0.012727511115372181, "rewards/rejected": -0.009438920766115189, "step": 1970 }, { "epoch": 1.4265129682997117, "grad_norm": 3.278331995010376, "learning_rate": 3.143503779968213e-08, "logits/chosen": -1.50669264793396, "logits/rejected": -1.506869912147522, "logps/chosen": -45.4344367980957, "logps/rejected": -49.771766662597656, "loss": 0.689, "rewards/accuracies": 0.5625, "rewards/chosen": 0.001246375497430563, "rewards/margins": 0.00855537410825491, "rewards/rejected": -0.0073089986108243465, "step": 1980 }, { "epoch": 1.4337175792507204, "grad_norm": 3.326122760772705, "learning_rate": 3.1232269609552875e-08, "logits/chosen": -1.5182191133499146, "logits/rejected": -1.5079318284988403, "logps/chosen": -43.68280792236328, "logps/rejected": -46.17193603515625, "loss": 0.6887, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004092085175216198, "rewards/margins": 0.009019319899380207, "rewards/rejected": -0.004927235189825296, "step": 1990 }, { "epoch": 1.440922190201729, "grad_norm": 2.173727035522461, "learning_rate": 3.102906331660444e-08, "logits/chosen": -1.5566580295562744, "logits/rejected": -1.542860507965088, "logps/chosen": -41.94898223876953, "logps/rejected": -48.24005126953125, "loss": 0.6868, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.005175874102860689, "rewards/margins": 0.012963724322617054, "rewards/rejected": -0.007787850685417652, "step": 2000 }, { "epoch": 1.4481268011527377, "grad_norm": 3.172349214553833, "learning_rate": 3.082543320540015e-08, "logits/chosen": -1.4700515270233154, "logits/rejected": -1.4550468921661377, "logps/chosen": -43.87895965576172, "logps/rejected": -47.5235710144043, "loss": 0.6883, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0023733568377792835, "rewards/margins": 0.009878008626401424, "rewards/rejected": -0.007504651788622141, "step": 2010 }, { "epoch": 1.4553314121037464, "grad_norm": 4.150721549987793, "learning_rate": 3.062139359029599e-08, "logits/chosen": -1.557370901107788, "logits/rejected": -1.553252100944519, "logps/chosen": -46.466400146484375, "logps/rejected": -48.92326736450195, "loss": 0.6881, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.003435027552768588, "rewards/margins": 0.010258909314870834, "rewards/rejected": -0.006823881529271603, "step": 2020 }, { "epoch": 1.462536023054755, "grad_norm": 3.4097900390625, "learning_rate": 3.041695881443437e-08, "logits/chosen": -1.5758157968521118, "logits/rejected": -1.5673719644546509, "logps/chosen": -46.34037399291992, "logps/rejected": -50.32112121582031, "loss": 0.69, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0035826407838612795, "rewards/margins": 0.0064225392416119576, "rewards/rejected": -0.0028398979920893908, "step": 2030 }, { "epoch": 1.4697406340057637, "grad_norm": 4.047444820404053, "learning_rate": 3.0212143248735886e-08, "logits/chosen": -1.531328797340393, "logits/rejected": -1.5289762020111084, "logps/chosen": -49.777374267578125, "logps/rejected": -54.45976638793945, "loss": 0.6876, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0041795228607952595, "rewards/margins": 0.01155170239508152, "rewards/rejected": -0.007372179068624973, "step": 2040 }, { "epoch": 1.4769452449567724, "grad_norm": 3.1559255123138428, "learning_rate": 3.0006961290889077e-08, "logits/chosen": -1.5212230682373047, "logits/rejected": -1.4961992502212524, "logps/chosen": -50.666038513183594, "logps/rejected": -53.179840087890625, "loss": 0.6875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.005002585239708424, "rewards/margins": 0.011495384387671947, "rewards/rejected": -0.006492799613624811, "step": 2050 }, { "epoch": 1.484149855907781, "grad_norm": 2.672394037246704, "learning_rate": 2.980142736433833e-08, "logits/chosen": -1.5468004941940308, "logits/rejected": -1.522716760635376, "logps/chosen": -44.2849235534668, "logps/rejected": -44.554542541503906, "loss": 0.6883, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0016432057600468397, "rewards/margins": 0.009843757376074791, "rewards/rejected": -0.008200552314519882, "step": 2060 }, { "epoch": 1.4913544668587897, "grad_norm": 4.110817909240723, "learning_rate": 2.9595555917269997e-08, "logits/chosen": -1.556030035018921, "logits/rejected": -1.5290006399154663, "logps/chosen": -51.58045196533203, "logps/rejected": -53.32768630981445, "loss": 0.6883, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0018507679924368858, "rewards/margins": 0.00991340633481741, "rewards/rejected": -0.008062639273703098, "step": 2070 }, { "epoch": 1.4985590778097984, "grad_norm": 3.1407599449157715, "learning_rate": 2.9389361421596725e-08, "logits/chosen": -1.4347405433654785, "logits/rejected": -1.4316017627716064, "logps/chosen": -49.19403839111328, "logps/rejected": -53.545204162597656, "loss": 0.6871, "rewards/accuracies": 0.625, "rewards/chosen": 0.0042905667796730995, "rewards/margins": 0.012316063046455383, "rewards/rejected": -0.00802549533545971, "step": 2080 }, { "epoch": 1.505763688760807, "grad_norm": 2.570767879486084, "learning_rate": 2.9182858371940126e-08, "logits/chosen": -1.5318742990493774, "logits/rejected": -1.5175909996032715, "logps/chosen": -42.606483459472656, "logps/rejected": -46.17628860473633, "loss": 0.6866, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0029234064277261496, "rewards/margins": 0.013527562841773033, "rewards/rejected": -0.01060415618121624, "step": 2090 }, { "epoch": 1.5129682997118157, "grad_norm": 3.642561674118042, "learning_rate": 2.8976061284611908e-08, "logits/chosen": -1.4698221683502197, "logits/rejected": -1.4809458255767822, "logps/chosen": -41.60303497314453, "logps/rejected": -45.31582260131836, "loss": 0.6876, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0049819787964224815, "rewards/margins": 0.011425621807575226, "rewards/rejected": -0.006443643011152744, "step": 2100 }, { "epoch": 1.5201729106628243, "grad_norm": 3.2771670818328857, "learning_rate": 2.8768984696593384e-08, "logits/chosen": -1.4798085689544678, "logits/rejected": -1.4632951021194458, "logps/chosen": -44.77037811279297, "logps/rejected": -47.931278228759766, "loss": 0.6868, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.005007242318242788, "rewards/margins": 0.013035112991929054, "rewards/rejected": -0.008027870208024979, "step": 2110 }, { "epoch": 1.527377521613833, "grad_norm": 3.2423009872436523, "learning_rate": 2.8561643164513637e-08, "logits/chosen": -1.334378957748413, "logits/rejected": -1.318719744682312, "logps/chosen": -51.95469284057617, "logps/rejected": -54.27800369262695, "loss": 0.6888, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.003995453007519245, "rewards/margins": 0.008951379917562008, "rewards/rejected": -0.00495592737570405, "step": 2120 }, { "epoch": 1.5345821325648417, "grad_norm": 3.3206255435943604, "learning_rate": 2.8354051263626227e-08, "logits/chosen": -1.46604323387146, "logits/rejected": -1.466449499130249, "logps/chosen": -50.30503463745117, "logps/rejected": -52.849761962890625, "loss": 0.6885, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0029683273751288652, "rewards/margins": 0.00942517165094614, "rewards/rejected": -0.006456844508647919, "step": 2130 }, { "epoch": 1.54178674351585, "grad_norm": 4.676600456237793, "learning_rate": 2.8146223586784573e-08, "logits/chosen": -1.4574966430664062, "logits/rejected": -1.4443973302841187, "logps/chosen": -52.07725143432617, "logps/rejected": -55.014495849609375, "loss": 0.687, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.003784618806093931, "rewards/margins": 0.012685844674706459, "rewards/rejected": -0.00890122540295124, "step": 2140 }, { "epoch": 1.5489913544668588, "grad_norm": 3.3971612453460693, "learning_rate": 2.7938174743416205e-08, "logits/chosen": -1.3624401092529297, "logits/rejected": -1.3555347919464111, "logps/chosen": -51.51061248779297, "logps/rejected": -55.449851989746094, "loss": 0.6877, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0027002261485904455, "rewards/margins": 0.0111739132553339, "rewards/rejected": -0.008473685942590237, "step": 2150 }, { "epoch": 1.5561959654178674, "grad_norm": 3.0555315017700195, "learning_rate": 2.7729919358495728e-08, "logits/chosen": -1.5037503242492676, "logits/rejected": -1.4941872358322144, "logps/chosen": -52.352867126464844, "logps/rejected": -53.544029235839844, "loss": 0.687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0034458369482308626, "rewards/margins": 0.012528976425528526, "rewards/rejected": -0.009083138778805733, "step": 2160 }, { "epoch": 1.563400576368876, "grad_norm": 3.8352720737457275, "learning_rate": 2.7521472071516772e-08, "logits/chosen": -1.4726879596710205, "logits/rejected": -1.4662566184997559, "logps/chosen": -43.68360900878906, "logps/rejected": -47.45799255371094, "loss": 0.6883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005658530630171299, "rewards/margins": 0.0100140031427145, "rewards/rejected": -0.004355472978204489, "step": 2170 }, { "epoch": 1.5706051873198847, "grad_norm": 3.8918683528900146, "learning_rate": 2.731284753546289e-08, "logits/chosen": -1.480924367904663, "logits/rejected": -1.4743003845214844, "logps/chosen": -53.0228157043457, "logps/rejected": -56.795387268066406, "loss": 0.6894, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0002322282234672457, "rewards/margins": 0.007790303323417902, "rewards/rejected": -0.008022531867027283, "step": 2180 }, { "epoch": 1.5778097982708934, "grad_norm": 4.045823097229004, "learning_rate": 2.710406041577751e-08, "logits/chosen": -1.5512325763702393, "logits/rejected": -1.5479477643966675, "logps/chosen": -47.99006271362305, "logps/rejected": -53.79365921020508, "loss": 0.6881, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.00434616394340992, "rewards/margins": 0.010301290079951286, "rewards/rejected": -0.005955126136541367, "step": 2190 }, { "epoch": 1.585014409221902, "grad_norm": 3.306387424468994, "learning_rate": 2.6895125389333017e-08, "logits/chosen": -1.5372812747955322, "logits/rejected": -1.5224703550338745, "logps/chosen": -48.444435119628906, "logps/rejected": -52.62398147583008, "loss": 0.6851, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.007179601583629847, "rewards/margins": 0.016431204974651337, "rewards/rejected": -0.009251603856682777, "step": 2200 }, { "epoch": 1.5922190201729105, "grad_norm": 3.1701338291168213, "learning_rate": 2.6686057143399028e-08, "logits/chosen": -1.5057289600372314, "logits/rejected": -1.4977985620498657, "logps/chosen": -48.5191764831543, "logps/rejected": -50.016353607177734, "loss": 0.6882, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004845013376325369, "rewards/margins": 0.010277556255459785, "rewards/rejected": -0.005432543810456991, "step": 2210 }, { "epoch": 1.5994236311239192, "grad_norm": 3.6945786476135254, "learning_rate": 2.647687037460996e-08, "logits/chosen": -1.4846514463424683, "logits/rejected": -1.4771755933761597, "logps/chosen": -52.859474182128906, "logps/rejected": -58.45153045654297, "loss": 0.6862, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.006427395157516003, "rewards/margins": 0.01427508145570755, "rewards/rejected": -0.007847686298191547, "step": 2220 }, { "epoch": 1.6066282420749278, "grad_norm": 3.2228472232818604, "learning_rate": 2.626757978793187e-08, "logits/chosen": -1.50605046749115, "logits/rejected": -1.4987943172454834, "logps/chosen": -48.89452362060547, "logps/rejected": -52.482139587402344, "loss": 0.6889, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0004428673128131777, "rewards/margins": 0.008724686689674854, "rewards/rejected": -0.009167554788291454, "step": 2230 }, { "epoch": 1.6138328530259365, "grad_norm": 2.9732460975646973, "learning_rate": 2.6058200095628797e-08, "logits/chosen": -1.5061275959014893, "logits/rejected": -1.5063308477401733, "logps/chosen": -40.85835266113281, "logps/rejected": -46.76417541503906, "loss": 0.6849, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00457608001306653, "rewards/margins": 0.016967391595244408, "rewards/rejected": -0.012391313910484314, "step": 2240 }, { "epoch": 1.6210374639769451, "grad_norm": 3.2725095748901367, "learning_rate": 2.584874601622854e-08, "logits/chosen": -1.5664986371994019, "logits/rejected": -1.5493196249008179, "logps/chosen": -49.408843994140625, "logps/rejected": -53.2461051940918, "loss": 0.689, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.002165078418329358, "rewards/margins": 0.00868967454880476, "rewards/rejected": -0.00652459729462862, "step": 2250 }, { "epoch": 1.6282420749279538, "grad_norm": 3.0531673431396484, "learning_rate": 2.5639232273487993e-08, "logits/chosen": -1.4604244232177734, "logits/rejected": -1.4406094551086426, "logps/chosen": -44.316959381103516, "logps/rejected": -47.68523025512695, "loss": 0.6875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004358319565653801, "rewards/margins": 0.01151906605809927, "rewards/rejected": -0.007160746958106756, "step": 2260 }, { "epoch": 1.6354466858789625, "grad_norm": 3.623836040496826, "learning_rate": 2.5429673595358142e-08, "logits/chosen": -1.5242135524749756, "logits/rejected": -1.5093494653701782, "logps/chosen": -45.795074462890625, "logps/rejected": -48.57317352294922, "loss": 0.6877, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.001714804908260703, "rewards/margins": 0.0111106988042593, "rewards/rejected": -0.009395892731845379, "step": 2270 }, { "epoch": 1.6426512968299711, "grad_norm": 3.297855854034424, "learning_rate": 2.5220084712948764e-08, "logits/chosen": -1.4578922986984253, "logits/rejected": -1.445966362953186, "logps/chosen": -52.103614807128906, "logps/rejected": -55.21147918701172, "loss": 0.6898, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0028083932120352983, "rewards/margins": 0.006879979278892279, "rewards/rejected": -0.004071586299687624, "step": 2280 }, { "epoch": 1.6498559077809798, "grad_norm": 3.762510061264038, "learning_rate": 2.5010480359492838e-08, "logits/chosen": -1.463905692100525, "logits/rejected": -1.4516466856002808, "logps/chosen": -49.459896087646484, "logps/rejected": -49.4564094543457, "loss": 0.686, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0037110510747879744, "rewards/margins": 0.014671266078948975, "rewards/rejected": -0.010960215702652931, "step": 2290 }, { "epoch": 1.6570605187319885, "grad_norm": 2.9843027591705322, "learning_rate": 2.480087526931091e-08, "logits/chosen": -1.5036156177520752, "logits/rejected": -1.4841887950897217, "logps/chosen": -43.37828826904297, "logps/rejected": -45.4171142578125, "loss": 0.6862, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003712053643539548, "rewards/margins": 0.01433002669364214, "rewards/rejected": -0.010617973282933235, "step": 2300 }, { "epoch": 1.6642651296829971, "grad_norm": 3.441652297973633, "learning_rate": 2.4591284176775326e-08, "logits/chosen": -1.4469178915023804, "logits/rejected": -1.4350392818450928, "logps/chosen": -55.11457061767578, "logps/rejected": -56.50977325439453, "loss": 0.6891, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003367173718288541, "rewards/margins": 0.008350704796612263, "rewards/rejected": -0.004983530845493078, "step": 2310 }, { "epoch": 1.6714697406340058, "grad_norm": 2.8864128589630127, "learning_rate": 2.4381721815274443e-08, "logits/chosen": -1.5199902057647705, "logits/rejected": -1.5132654905319214, "logps/chosen": -43.241661071777344, "logps/rejected": -46.4239616394043, "loss": 0.687, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0014256946742534637, "rewards/margins": 0.012737279757857323, "rewards/rejected": -0.011311585083603859, "step": 2320 }, { "epoch": 1.6786743515850144, "grad_norm": 3.1540913581848145, "learning_rate": 2.4172202916176936e-08, "logits/chosen": -1.5634369850158691, "logits/rejected": -1.5546420812606812, "logps/chosen": -43.037837982177734, "logps/rejected": -47.856117248535156, "loss": 0.686, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0006109640235081315, "rewards/margins": 0.014766094274818897, "rewards/rejected": -0.0141551299020648, "step": 2330 }, { "epoch": 1.685878962536023, "grad_norm": 3.647859573364258, "learning_rate": 2.3962742207796268e-08, "logits/chosen": -1.4479600191116333, "logits/rejected": -1.4379609823226929, "logps/chosen": -41.66529083251953, "logps/rejected": -45.63011932373047, "loss": 0.685, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004694860894232988, "rewards/margins": 0.016697410494089127, "rewards/rejected": -0.012002546340227127, "step": 2340 }, { "epoch": 1.6930835734870318, "grad_norm": 3.6262919902801514, "learning_rate": 2.3753354414355334e-08, "logits/chosen": -1.4229185581207275, "logits/rejected": -1.400246024131775, "logps/chosen": -53.5576171875, "logps/rejected": -55.23441696166992, "loss": 0.6872, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0002725389786064625, "rewards/margins": 0.012355529703199863, "rewards/rejected": -0.012082991190254688, "step": 2350 }, { "epoch": 1.7002881844380404, "grad_norm": 3.3609025478363037, "learning_rate": 2.3544054254951408e-08, "logits/chosen": -1.4647839069366455, "logits/rejected": -1.4447661638259888, "logps/chosen": -42.85536193847656, "logps/rejected": -48.42116165161133, "loss": 0.6842, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.00428318977355957, "rewards/margins": 0.018426140770316124, "rewards/rejected": -0.014142952859401703, "step": 2360 }, { "epoch": 1.707492795389049, "grad_norm": 3.3678860664367676, "learning_rate": 2.3334856442521435e-08, "logits/chosen": -1.5585176944732666, "logits/rejected": -1.5401690006256104, "logps/chosen": -51.256141662597656, "logps/rejected": -51.39955520629883, "loss": 0.688, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0024927393533289433, "rewards/margins": 0.010601336136460304, "rewards/rejected": -0.008108596317470074, "step": 2370 }, { "epoch": 1.7146974063400577, "grad_norm": 3.430454730987549, "learning_rate": 2.3125775682807826e-08, "logits/chosen": -1.5538517236709595, "logits/rejected": -1.552392601966858, "logps/chosen": -49.91531753540039, "logps/rejected": -53.7569465637207, "loss": 0.686, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0030004787258803844, "rewards/margins": 0.01481213141232729, "rewards/rejected": -0.011811653152108192, "step": 2380 }, { "epoch": 1.7219020172910664, "grad_norm": 2.7333407402038574, "learning_rate": 2.291682667332464e-08, "logits/chosen": -1.612623929977417, "logits/rejected": -1.5989640951156616, "logps/chosen": -46.485713958740234, "logps/rejected": -49.609825134277344, "loss": 0.6888, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.000499680289067328, "rewards/margins": 0.008998895063996315, "rewards/rejected": -0.008499214425683022, "step": 2390 }, { "epoch": 1.729106628242075, "grad_norm": 2.9348888397216797, "learning_rate": 2.2708024102324454e-08, "logits/chosen": -1.5304839611053467, "logits/rejected": -1.5251142978668213, "logps/chosen": -46.70942306518555, "logps/rejected": -51.74241256713867, "loss": 0.6851, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.00332896551117301, "rewards/margins": 0.016565581783652306, "rewards/rejected": -0.013236616738140583, "step": 2400 }, { "epoch": 1.7363112391930837, "grad_norm": 3.6855247020721436, "learning_rate": 2.2499382647765797e-08, "logits/chosen": -1.4964635372161865, "logits/rejected": -1.4977600574493408, "logps/chosen": -48.41642379760742, "logps/rejected": -52.0125617980957, "loss": 0.6874, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 3.586681486922316e-05, "rewards/margins": 0.011704354546964169, "rewards/rejected": -0.011668487451970577, "step": 2410 }, { "epoch": 1.7435158501440924, "grad_norm": 2.8829994201660156, "learning_rate": 2.2290916976281427e-08, "logits/chosen": -1.4776675701141357, "logits/rejected": -1.4631723165512085, "logps/chosen": -43.69971466064453, "logps/rejected": -46.09686279296875, "loss": 0.6861, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00013883263454772532, "rewards/margins": 0.014544370584189892, "rewards/rejected": -0.014683201909065247, "step": 2420 }, { "epoch": 1.7507204610951008, "grad_norm": 3.6001625061035156, "learning_rate": 2.2082641742147238e-08, "logits/chosen": -1.4721145629882812, "logits/rejected": -1.4632716178894043, "logps/chosen": -45.69694519042969, "logps/rejected": -51.60881805419922, "loss": 0.6866, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.000785698473919183, "rewards/margins": 0.013660689815878868, "rewards/rejected": -0.012874990701675415, "step": 2430 }, { "epoch": 1.7579250720461095, "grad_norm": 3.089428424835205, "learning_rate": 2.1874571586252177e-08, "logits/chosen": -1.5453965663909912, "logits/rejected": -1.5336331129074097, "logps/chosen": -45.585872650146484, "logps/rejected": -48.34822082519531, "loss": 0.6877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0007279099081642926, "rewards/margins": 0.01115675363689661, "rewards/rejected": -0.010428843088448048, "step": 2440 }, { "epoch": 1.7651296829971181, "grad_norm": 2.425853967666626, "learning_rate": 2.1666721135069037e-08, "logits/chosen": -1.5158154964447021, "logits/rejected": -1.502139687538147, "logps/chosen": -49.875404357910156, "logps/rejected": -51.33538055419922, "loss": 0.6875, "rewards/accuracies": 0.5625, "rewards/chosen": 0.001414857804775238, "rewards/margins": 0.011725572869181633, "rewards/rejected": -0.010310716927051544, "step": 2450 }, { "epoch": 1.7723342939481268, "grad_norm": 2.6500022411346436, "learning_rate": 2.145910499962628e-08, "logits/chosen": -1.5762279033660889, "logits/rejected": -1.5551462173461914, "logps/chosen": -44.01633834838867, "logps/rejected": -46.197105407714844, "loss": 0.6846, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0036026693414896727, "rewards/margins": 0.017717977985739708, "rewards/rejected": -0.014115308411419392, "step": 2460 }, { "epoch": 1.7795389048991355, "grad_norm": 3.8977699279785156, "learning_rate": 2.1251737774480915e-08, "logits/chosen": -1.5487562417984009, "logits/rejected": -1.5394015312194824, "logps/chosen": -53.25288772583008, "logps/rejected": -55.375213623046875, "loss": 0.6872, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0014735452132299542, "rewards/margins": 0.01222173310816288, "rewards/rejected": -0.010748187080025673, "step": 2470 }, { "epoch": 1.7867435158501441, "grad_norm": 2.550679922103882, "learning_rate": 2.104463403669264e-08, "logits/chosen": -1.47690749168396, "logits/rejected": -1.4564216136932373, "logps/chosen": -49.03275680541992, "logps/rejected": -51.274871826171875, "loss": 0.6863, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0006415749667212367, "rewards/margins": 0.01415687520056963, "rewards/rejected": -0.013515301048755646, "step": 2480 }, { "epoch": 1.7939481268011528, "grad_norm": 2.6638097763061523, "learning_rate": 2.0837808344799028e-08, "logits/chosen": -1.4521634578704834, "logits/rejected": -1.4361908435821533, "logps/chosen": -43.8492431640625, "logps/rejected": -47.552284240722656, "loss": 0.6837, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.006942611187696457, "rewards/margins": 0.019251275807619095, "rewards/rejected": -0.012308661825954914, "step": 2490 }, { "epoch": 1.8011527377521612, "grad_norm": 3.212559700012207, "learning_rate": 2.063127523779219e-08, "logits/chosen": -1.4297634363174438, "logits/rejected": -1.4289541244506836, "logps/chosen": -44.858970642089844, "logps/rejected": -51.319175720214844, "loss": 0.6835, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.002955908887088299, "rewards/margins": 0.019690891727805138, "rewards/rejected": -0.016734981909394264, "step": 2500 }, { "epoch": 1.8083573487031699, "grad_norm": 3.859661102294922, "learning_rate": 2.0425049234096737e-08, "logits/chosen": -1.485381841659546, "logits/rejected": -1.4706742763519287, "logps/chosen": -49.1226921081543, "logps/rejected": -51.82140350341797, "loss": 0.6861, "rewards/accuracies": 0.53125, "rewards/chosen": 9.506577771389857e-05, "rewards/margins": 0.014713233336806297, "rewards/rejected": -0.014618167653679848, "step": 2510 }, { "epoch": 1.8155619596541785, "grad_norm": 2.6423633098602295, "learning_rate": 2.0219144830549163e-08, "logits/chosen": -1.4601681232452393, "logits/rejected": -1.4504835605621338, "logps/chosen": -49.000816345214844, "logps/rejected": -52.69274139404297, "loss": 0.6847, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0008226001518778503, "rewards/margins": 0.017380500212311745, "rewards/rejected": -0.016557898372411728, "step": 2520 }, { "epoch": 1.8227665706051872, "grad_norm": 2.8962371349334717, "learning_rate": 2.0013576501378823e-08, "logits/chosen": -1.4357713460922241, "logits/rejected": -1.4260128736495972, "logps/chosen": -44.66715621948242, "logps/rejected": -48.878177642822266, "loss": 0.6816, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00762571394443512, "rewards/margins": 0.02372068539261818, "rewards/rejected": -0.01609497331082821, "step": 2530 }, { "epoch": 1.8299711815561959, "grad_norm": 3.5678348541259766, "learning_rate": 1.9808358697190426e-08, "logits/chosen": -1.4616868495941162, "logits/rejected": -1.4613463878631592, "logps/chosen": -40.0428466796875, "logps/rejected": -45.382728576660156, "loss": 0.6845, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0007027705432847142, "rewards/margins": 0.017785217612981796, "rewards/rejected": -0.01848798617720604, "step": 2540 }, { "epoch": 1.8371757925072045, "grad_norm": 3.052267551422119, "learning_rate": 1.9603505843948214e-08, "logits/chosen": -1.4895626306533813, "logits/rejected": -1.4689642190933228, "logps/chosen": -41.04018020629883, "logps/rejected": -46.34733200073242, "loss": 0.6863, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0010787765495479107, "rewards/margins": 0.014137683436274529, "rewards/rejected": -0.01305890642106533, "step": 2550 }, { "epoch": 1.8443804034582132, "grad_norm": 3.034510374069214, "learning_rate": 1.9399032341961886e-08, "logits/chosen": -1.4604467153549194, "logits/rejected": -1.4406030178070068, "logps/chosen": -44.063720703125, "logps/rejected": -45.94016647338867, "loss": 0.6869, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0032738572917878628, "rewards/margins": 0.013115392997860909, "rewards/rejected": -0.009841536171734333, "step": 2560 }, { "epoch": 1.8515850144092219, "grad_norm": 3.6909120082855225, "learning_rate": 1.9194952564874323e-08, "logits/chosen": -1.4899951219558716, "logits/rejected": -1.4772499799728394, "logps/chosen": -49.38003158569336, "logps/rejected": -52.764427185058594, "loss": 0.6858, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0014510613400489092, "rewards/margins": 0.015302592888474464, "rewards/rejected": -0.013851528987288475, "step": 2570 }, { "epoch": 1.8587896253602305, "grad_norm": 2.995894432067871, "learning_rate": 1.8991280858651157e-08, "logits/chosen": -1.465537190437317, "logits/rejected": -1.4445680379867554, "logps/chosen": -48.079158782958984, "logps/rejected": -49.74862289428711, "loss": 0.6861, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0006449748761951923, "rewards/margins": 0.014429867267608643, "rewards/rejected": -0.013784890063107014, "step": 2580 }, { "epoch": 1.8659942363112392, "grad_norm": 3.794341802597046, "learning_rate": 1.8788031540572327e-08, "logits/chosen": -1.4323540925979614, "logits/rejected": -1.4187084436416626, "logps/chosen": -43.358890533447266, "logps/rejected": -47.263160705566406, "loss": 0.6846, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0011780399363487959, "rewards/margins": 0.017769720405340195, "rewards/rejected": -0.01659167930483818, "step": 2590 }, { "epoch": 1.8731988472622478, "grad_norm": 3.5173821449279785, "learning_rate": 1.858521889822565e-08, "logits/chosen": -1.4809000492095947, "logits/rejected": -1.4711607694625854, "logps/chosen": -44.77101135253906, "logps/rejected": -47.36913299560547, "loss": 0.6873, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00217301188968122, "rewards/margins": 0.012151877395808697, "rewards/rejected": -0.009978866204619408, "step": 2600 }, { "epoch": 1.8804034582132565, "grad_norm": 3.0861990451812744, "learning_rate": 1.8382857188502422e-08, "logits/chosen": -1.479575514793396, "logits/rejected": -1.4646644592285156, "logps/chosen": -43.409400939941406, "logps/rejected": -46.272499084472656, "loss": 0.6855, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0008953431388363242, "rewards/margins": 0.015901099890470505, "rewards/rejected": -0.015005757100880146, "step": 2610 }, { "epoch": 1.8876080691642652, "grad_norm": 3.0519216060638428, "learning_rate": 1.8180960636595234e-08, "logits/chosen": -1.434323787689209, "logits/rejected": -1.4235074520111084, "logps/chosen": -45.47016525268555, "logps/rejected": -48.84101104736328, "loss": 0.6842, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0007277448894456029, "rewards/margins": 0.018448855727910995, "rewards/rejected": -0.017721110954880714, "step": 2620 }, { "epoch": 1.8948126801152738, "grad_norm": 2.6402575969696045, "learning_rate": 1.7979543434998015e-08, "logits/chosen": -1.5166761875152588, "logits/rejected": -1.5122129917144775, "logps/chosen": -54.10516357421875, "logps/rejected": -55.752830505371094, "loss": 0.6886, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.002384190447628498, "rewards/margins": 0.009313153102993965, "rewards/rejected": -0.011697344481945038, "step": 2630 }, { "epoch": 1.9020172910662825, "grad_norm": 3.219054698944092, "learning_rate": 1.7778619742508345e-08, "logits/chosen": -1.4986519813537598, "logits/rejected": -1.4785531759262085, "logps/chosen": -48.8413200378418, "logps/rejected": -50.63405990600586, "loss": 0.6861, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.001953819999471307, "rewards/margins": 0.014699439518153667, "rewards/rejected": -0.016653258353471756, "step": 2640 }, { "epoch": 1.9092219020172911, "grad_norm": 5.369049549102783, "learning_rate": 1.757820368323213e-08, "logits/chosen": -1.4477155208587646, "logits/rejected": -1.431592345237732, "logps/chosen": -55.639549255371094, "logps/rejected": -60.893531799316406, "loss": 0.6852, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.00035453608143143356, "rewards/margins": 0.01644885540008545, "rewards/rejected": -0.01680339314043522, "step": 2650 }, { "epoch": 1.9164265129682998, "grad_norm": 2.708446741104126, "learning_rate": 1.7378309345590803e-08, "logits/chosen": -1.518417477607727, "logits/rejected": -1.5215131044387817, "logps/chosen": -48.15247344970703, "logps/rejected": -51.795204162597656, "loss": 0.6858, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0003538710006978363, "rewards/margins": 0.01514039933681488, "rewards/rejected": -0.01478652935475111, "step": 2660 }, { "epoch": 1.9236311239193085, "grad_norm": 3.0533721446990967, "learning_rate": 1.717895078133088e-08, "logits/chosen": -1.5372436046600342, "logits/rejected": -1.5273748636245728, "logps/chosen": -45.74188995361328, "logps/rejected": -50.93372344970703, "loss": 0.6844, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.001131549128331244, "rewards/margins": 0.018048815429210663, "rewards/rejected": -0.016917267814278603, "step": 2670 }, { "epoch": 1.9308357348703171, "grad_norm": 2.965850830078125, "learning_rate": 1.698014200453624e-08, "logits/chosen": -1.5121673345565796, "logits/rejected": -1.5155454874038696, "logps/chosen": -48.55657196044922, "logps/rejected": -53.24010467529297, "loss": 0.6893, "rewards/accuracies": 0.59375, "rewards/chosen": -5.956319910183083e-06, "rewards/margins": 0.00807130616158247, "rewards/rejected": -0.008077261969447136, "step": 2680 }, { "epoch": 1.9380403458213258, "grad_norm": 3.131131410598755, "learning_rate": 1.6781896990642964e-08, "logits/chosen": -1.4162065982818604, "logits/rejected": -1.4067705869674683, "logps/chosen": -53.68306350708008, "logps/rejected": -55.64463424682617, "loss": 0.6873, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0006439717253670096, "rewards/margins": 0.012254132889211178, "rewards/rejected": -0.011610162444412708, "step": 2690 }, { "epoch": 1.9452449567723344, "grad_norm": 3.7046432495117188, "learning_rate": 1.658422967545693e-08, "logits/chosen": -1.5447876453399658, "logits/rejected": -1.5232436656951904, "logps/chosen": -46.61565017700195, "logps/rejected": -48.878089904785156, "loss": 0.6864, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.002516025211662054, "rewards/margins": 0.013911202549934387, "rewards/rejected": -0.016427230089902878, "step": 2700 }, { "epoch": 1.952449567723343, "grad_norm": 3.3615024089813232, "learning_rate": 1.638715395417418e-08, "logits/chosen": -1.515842318534851, "logits/rejected": -1.4996858835220337, "logps/chosen": -47.76423645019531, "logps/rejected": -50.23698806762695, "loss": 0.6872, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0017537068342790008, "rewards/margins": 0.012197253294289112, "rewards/rejected": -0.013950960710644722, "step": 2710 }, { "epoch": 1.9596541786743515, "grad_norm": 3.4271042346954346, "learning_rate": 1.619068368040416e-08, "logits/chosen": -1.5035583972930908, "logits/rejected": -1.4936153888702393, "logps/chosen": -42.36717987060547, "logps/rejected": -48.00276565551758, "loss": 0.6848, "rewards/accuracies": 0.625, "rewards/chosen": 0.0006282638642005622, "rewards/margins": 0.017301367595791817, "rewards/rejected": -0.016673101112246513, "step": 2720 }, { "epoch": 1.9668587896253602, "grad_norm": 3.3180654048919678, "learning_rate": 1.5994832665195853e-08, "logits/chosen": -1.4341150522232056, "logits/rejected": -1.4278676509857178, "logps/chosen": -46.42858123779297, "logps/rejected": -48.82233428955078, "loss": 0.6873, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.00017052926705218852, "rewards/margins": 0.01208487804979086, "rewards/rejected": -0.011914348229765892, "step": 2730 }, { "epoch": 1.9740634005763689, "grad_norm": 3.3026318550109863, "learning_rate": 1.5799614676066906e-08, "logits/chosen": -1.5610630512237549, "logits/rejected": -1.5564082860946655, "logps/chosen": -42.62709045410156, "logps/rejected": -47.11577606201172, "loss": 0.6852, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.001482530147768557, "rewards/margins": 0.01645783707499504, "rewards/rejected": -0.017940368503332138, "step": 2740 }, { "epoch": 1.9812680115273775, "grad_norm": 2.802278518676758, "learning_rate": 1.560504343603587e-08, "logits/chosen": -1.4570392370224, "logits/rejected": -1.4598525762557983, "logps/chosen": -47.614715576171875, "logps/rejected": -53.1636962890625, "loss": 0.6862, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0013699980918318033, "rewards/margins": 0.014168500900268555, "rewards/rejected": -0.012798503041267395, "step": 2750 }, { "epoch": 1.9884726224783862, "grad_norm": 2.706557512283325, "learning_rate": 1.541113262265748e-08, "logits/chosen": -1.5580412149429321, "logits/rejected": -1.5532734394073486, "logps/chosen": -47.86179733276367, "logps/rejected": -52.08073043823242, "loss": 0.6855, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0002923453284893185, "rewards/margins": 0.015893833711743355, "rewards/rejected": -0.015601487830281258, "step": 2760 }, { "epoch": 1.9956772334293948, "grad_norm": 2.82700777053833, "learning_rate": 1.5217895867061227e-08, "logits/chosen": -1.4797217845916748, "logits/rejected": -1.46820068359375, "logps/chosen": -49.087162017822266, "logps/rejected": -51.762550354003906, "loss": 0.6855, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0006480285665020347, "rewards/margins": 0.015934782102704048, "rewards/rejected": -0.016582807525992393, "step": 2770 }, { "epoch": 2.0028818443804033, "grad_norm": 3.225586414337158, "learning_rate": 1.5025346752993098e-08, "logits/chosen": -1.473181128501892, "logits/rejected": -1.4776010513305664, "logps/chosen": -47.254737854003906, "logps/rejected": -51.428993225097656, "loss": 0.6887, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0032721899915486574, "rewards/margins": 0.009254529140889645, "rewards/rejected": -0.012526720762252808, "step": 2780 }, { "epoch": 2.010086455331412, "grad_norm": 3.1906368732452393, "learning_rate": 1.4833498815860756e-08, "logits/chosen": -1.6033878326416016, "logits/rejected": -1.594612717628479, "logps/chosen": -44.750770568847656, "logps/rejected": -49.424808502197266, "loss": 0.6835, "rewards/accuracies": 0.59375, "rewards/chosen": 0.000528767064679414, "rewards/margins": 0.019820012152194977, "rewards/rejected": -0.019291242584586143, "step": 2790 }, { "epoch": 2.0172910662824206, "grad_norm": 3.4794318675994873, "learning_rate": 1.4642365541781993e-08, "logits/chosen": -1.4186664819717407, "logits/rejected": -1.4020098447799683, "logps/chosen": -46.425323486328125, "logps/rejected": -51.33763885498047, "loss": 0.685, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.002099757781252265, "rewards/margins": 0.01701737754046917, "rewards/rejected": -0.019117135554552078, "step": 2800 }, { "epoch": 2.0244956772334293, "grad_norm": 3.546409845352173, "learning_rate": 1.4451960366636745e-08, "logits/chosen": -1.5050753355026245, "logits/rejected": -1.509311318397522, "logps/chosen": -50.25437545776367, "logps/rejected": -54.8748779296875, "loss": 0.686, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.9070675736875273e-05, "rewards/margins": 0.014723686501383781, "rewards/rejected": -0.014752757735550404, "step": 2810 }, { "epoch": 2.031700288184438, "grad_norm": 2.9926185607910156, "learning_rate": 1.4262296675122592e-08, "logits/chosen": -1.505336880683899, "logits/rejected": -1.4901078939437866, "logps/chosen": -43.937278747558594, "logps/rejected": -48.619163513183594, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": -0.0006590264965780079, "rewards/margins": 0.016862262040376663, "rewards/rejected": -0.017521290108561516, "step": 2820 }, { "epoch": 2.0389048991354466, "grad_norm": 3.474058151245117, "learning_rate": 1.407338779981389e-08, "logits/chosen": -1.4738878011703491, "logits/rejected": -1.4622704982757568, "logps/chosen": -41.43745040893555, "logps/rejected": -46.45849609375, "loss": 0.683, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.001070915604941547, "rewards/margins": 0.020852236077189445, "rewards/rejected": -0.02192315086722374, "step": 2830 }, { "epoch": 2.0461095100864553, "grad_norm": 3.1651461124420166, "learning_rate": 1.3885247020224534e-08, "logits/chosen": -1.4730961322784424, "logits/rejected": -1.4625325202941895, "logps/chosen": -40.98516082763672, "logps/rejected": -44.32816696166992, "loss": 0.6834, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0006273157196119428, "rewards/margins": 0.020260414108633995, "rewards/rejected": -0.0196330975741148, "step": 2840 }, { "epoch": 2.053314121037464, "grad_norm": 2.788933277130127, "learning_rate": 1.369788756187445e-08, "logits/chosen": -1.5226420164108276, "logits/rejected": -1.5108869075775146, "logps/chosen": -46.892662048339844, "logps/rejected": -48.15214920043945, "loss": 0.6879, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0032084626145660877, "rewards/margins": 0.011149941943585873, "rewards/rejected": -0.014358404092490673, "step": 2850 }, { "epoch": 2.0605187319884726, "grad_norm": 3.175995111465454, "learning_rate": 1.3511322595359925e-08, "logits/chosen": -1.5318832397460938, "logits/rejected": -1.5205862522125244, "logps/chosen": -43.323387145996094, "logps/rejected": -49.04631042480469, "loss": 0.6839, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0012531958054751158, "rewards/margins": 0.019172416999936104, "rewards/rejected": -0.02042561024427414, "step": 2860 }, { "epoch": 2.0677233429394812, "grad_norm": 3.237109422683716, "learning_rate": 1.3325565235427716e-08, "logits/chosen": -1.5520436763763428, "logits/rejected": -1.5436103343963623, "logps/chosen": -45.29417037963867, "logps/rejected": -49.32756423950195, "loss": 0.6843, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0010891546262428164, "rewards/margins": 0.01812928169965744, "rewards/rejected": -0.019218437373638153, "step": 2870 }, { "epoch": 2.07492795389049, "grad_norm": 3.6626226902008057, "learning_rate": 1.3140628540053218e-08, "logits/chosen": -1.4575971364974976, "logits/rejected": -1.4558467864990234, "logps/chosen": -45.92155838012695, "logps/rejected": -49.4353141784668, "loss": 0.6855, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0020993829239159822, "rewards/margins": 0.01558808796107769, "rewards/rejected": -0.0134887071326375, "step": 2880 }, { "epoch": 2.0821325648414986, "grad_norm": 3.993279218673706, "learning_rate": 1.2956525509522451e-08, "logits/chosen": -1.434325933456421, "logits/rejected": -1.43949556350708, "logps/chosen": -47.83916473388672, "logps/rejected": -51.43572998046875, "loss": 0.6872, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.001653014332987368, "rewards/margins": 0.01248462125658989, "rewards/rejected": -0.01083160750567913, "step": 2890 }, { "epoch": 2.089337175792507, "grad_norm": 3.8572018146514893, "learning_rate": 1.2773269085518267e-08, "logits/chosen": -1.5160229206085205, "logits/rejected": -1.5104598999023438, "logps/chosen": -52.54634475708008, "logps/rejected": -56.1825065612793, "loss": 0.6865, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0702252641058294e-06, "rewards/margins": 0.013869213871657848, "rewards/rejected": -0.013871285133063793, "step": 2900 }, { "epoch": 2.096541786743516, "grad_norm": 2.667027235031128, "learning_rate": 1.2590872150210574e-08, "logits/chosen": -1.5915756225585938, "logits/rejected": -1.575210452079773, "logps/chosen": -45.68871307373047, "logps/rejected": -47.84299087524414, "loss": 0.6837, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.004803013987839222, "rewards/margins": 0.019728917628526688, "rewards/rejected": -0.024531930685043335, "step": 2910 }, { "epoch": 2.1037463976945245, "grad_norm": 2.844864845275879, "learning_rate": 1.2409347525350775e-08, "logits/chosen": -1.4988892078399658, "logits/rejected": -1.480571985244751, "logps/chosen": -47.45149230957031, "logps/rejected": -51.376197814941406, "loss": 0.6831, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0007760589360259473, "rewards/margins": 0.020668352022767067, "rewards/rejected": -0.019892293959856033, "step": 2920 }, { "epoch": 2.110951008645533, "grad_norm": 3.4284279346466064, "learning_rate": 1.2228707971370421e-08, "logits/chosen": -1.4963008165359497, "logits/rejected": -1.477461576461792, "logps/chosen": -42.0821647644043, "logps/rejected": -44.58640670776367, "loss": 0.6839, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0020004522521048784, "rewards/margins": 0.01911274902522564, "rewards/rejected": -0.01711229607462883, "step": 2930 }, { "epoch": 2.118155619596542, "grad_norm": 4.411471843719482, "learning_rate": 1.2048966186484282e-08, "logits/chosen": -1.525394320487976, "logits/rejected": -1.49559485912323, "logps/chosen": -52.1498908996582, "logps/rejected": -55.0552864074707, "loss": 0.6861, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0017288762610405684, "rewards/margins": 0.014565639197826385, "rewards/rejected": -0.016294512897729874, "step": 2940 }, { "epoch": 2.1253602305475505, "grad_norm": 3.3227975368499756, "learning_rate": 1.187013480579762e-08, "logits/chosen": -1.489044189453125, "logits/rejected": -1.4827308654785156, "logps/chosen": -45.42039489746094, "logps/rejected": -49.41254425048828, "loss": 0.6841, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0047860899940133095, "rewards/margins": 0.019023966044187546, "rewards/rejected": -0.02381005696952343, "step": 2950 }, { "epoch": 2.132564841498559, "grad_norm": 4.395474433898926, "learning_rate": 1.1692226400418073e-08, "logits/chosen": -1.4131033420562744, "logits/rejected": -1.4049489498138428, "logps/chosen": -49.05443572998047, "logps/rejected": -52.0718994140625, "loss": 0.6855, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.004469198640435934, "rewards/margins": 0.016008742153644562, "rewards/rejected": -0.020477941259741783, "step": 2960 }, { "epoch": 2.139769452449568, "grad_norm": 2.495917558670044, "learning_rate": 1.1515253476571923e-08, "logits/chosen": -1.4480403661727905, "logits/rejected": -1.4422080516815186, "logps/chosen": -44.43804931640625, "logps/rejected": -51.04991149902344, "loss": 0.684, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.003528184024617076, "rewards/margins": 0.018853966146707535, "rewards/rejected": -0.02238215133547783, "step": 2970 }, { "epoch": 2.1469740634005765, "grad_norm": 3.3481624126434326, "learning_rate": 1.133922847472496e-08, "logits/chosen": -1.4908777475357056, "logits/rejected": -1.4869946241378784, "logps/chosen": -52.52874755859375, "logps/rejected": -55.038414001464844, "loss": 0.685, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0005473472410812974, "rewards/margins": 0.017001759260892868, "rewards/rejected": -0.01754910498857498, "step": 2980 }, { "epoch": 2.154178674351585, "grad_norm": 3.3193447589874268, "learning_rate": 1.1164163768707952e-08, "logits/chosen": -1.4650285243988037, "logits/rejected": -1.4538644552230835, "logps/chosen": -47.43016052246094, "logps/rejected": -51.636924743652344, "loss": 0.6824, "rewards/accuracies": 0.625, "rewards/chosen": -0.0007126646814867854, "rewards/margins": 0.02234521321952343, "rewards/rejected": -0.023057879880070686, "step": 2990 }, { "epoch": 2.161383285302594, "grad_norm": 3.359654426574707, "learning_rate": 1.0990071664846861e-08, "logits/chosen": -1.4393339157104492, "logits/rejected": -1.429203748703003, "logps/chosen": -48.76907730102539, "logps/rejected": -53.98026657104492, "loss": 0.6819, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.00010694740194594488, "rewards/margins": 0.023278547450900078, "rewards/rejected": -0.023171598091721535, "step": 3000 }, { "epoch": 2.1685878962536025, "grad_norm": 3.054330348968506, "learning_rate": 1.0816964401097739e-08, "logits/chosen": -1.4826459884643555, "logits/rejected": -1.472663164138794, "logps/chosen": -43.025970458984375, "logps/rejected": -45.86864471435547, "loss": 0.6854, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0005824376130476594, "rewards/margins": 0.016256345435976982, "rewards/rejected": -0.016838783398270607, "step": 3010 }, { "epoch": 2.175792507204611, "grad_norm": 3.929755926132202, "learning_rate": 1.0644854146186406e-08, "logits/chosen": -1.5146058797836304, "logits/rejected": -1.49686598777771, "logps/chosen": -48.23189926147461, "logps/rejected": -52.9786491394043, "loss": 0.6827, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0024410493206232786, "rewards/margins": 0.021744880825281143, "rewards/rejected": -0.02418592758476734, "step": 3020 }, { "epoch": 2.18299711815562, "grad_norm": 3.215834856033325, "learning_rate": 1.0473752998753114e-08, "logits/chosen": -1.4946410655975342, "logits/rejected": -1.4729467630386353, "logps/chosen": -48.56220626831055, "logps/rejected": -51.715721130371094, "loss": 0.6823, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0010552393505349755, "rewards/margins": 0.022361181676387787, "rewards/rejected": -0.02130594104528427, "step": 3030 }, { "epoch": 2.1902017291066285, "grad_norm": 2.973313093185425, "learning_rate": 1.030367298650201e-08, "logits/chosen": -1.4930821657180786, "logits/rejected": -1.492546796798706, "logps/chosen": -48.8010368347168, "logps/rejected": -53.67090606689453, "loss": 0.6873, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.004332934506237507, "rewards/margins": 0.012475891038775444, "rewards/rejected": -0.016808826476335526, "step": 3040 }, { "epoch": 2.1974063400576367, "grad_norm": 3.869215726852417, "learning_rate": 1.0134626065355675e-08, "logits/chosen": -1.5941343307495117, "logits/rejected": -1.5830258131027222, "logps/chosen": -49.36521530151367, "logps/rejected": -52.964378356933594, "loss": 0.6825, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0014757805038243532, "rewards/margins": 0.02204863727092743, "rewards/rejected": -0.020572859793901443, "step": 3050 }, { "epoch": 2.2046109510086453, "grad_norm": 3.5000784397125244, "learning_rate": 9.966624118614611e-09, "logits/chosen": -1.491275668144226, "logits/rejected": -1.471986174583435, "logps/chosen": -52.319740295410156, "logps/rejected": -55.521949768066406, "loss": 0.6837, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.002091693691909313, "rewards/margins": 0.019565019756555557, "rewards/rejected": -0.01747332513332367, "step": 3060 }, { "epoch": 2.211815561959654, "grad_norm": 2.422412395477295, "learning_rate": 9.799678956121976e-09, "logits/chosen": -1.436295747756958, "logits/rejected": -1.4197183847427368, "logps/chosen": -45.91994094848633, "logps/rejected": -48.53052520751953, "loss": 0.6871, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.002334743272513151, "rewards/margins": 0.012487743981182575, "rewards/rejected": -0.014822488650679588, "step": 3070 }, { "epoch": 2.2190201729106627, "grad_norm": 3.5557055473327637, "learning_rate": 9.633802313433314e-09, "logits/chosen": -1.4133331775665283, "logits/rejected": -1.4094794988632202, "logps/chosen": -48.400875091552734, "logps/rejected": -50.9952278137207, "loss": 0.685, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0015882644802331924, "rewards/margins": 0.016813434660434723, "rewards/rejected": -0.018401699140667915, "step": 3080 }, { "epoch": 2.2262247838616713, "grad_norm": 2.77536940574646, "learning_rate": 9.469005850991705e-09, "logits/chosen": -1.4840987920761108, "logits/rejected": -1.4713189601898193, "logps/chosen": -47.20307159423828, "logps/rejected": -48.701622009277344, "loss": 0.6842, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0018709308933466673, "rewards/margins": 0.01850222796201706, "rewards/rejected": -0.020373158156871796, "step": 3090 }, { "epoch": 2.23342939481268, "grad_norm": 3.2006888389587402, "learning_rate": 9.305301153307949e-09, "logits/chosen": -1.4952600002288818, "logits/rejected": -1.499306321144104, "logps/chosen": -40.018882751464844, "logps/rejected": -44.07697296142578, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": -0.005154062993824482, "rewards/margins": 0.020358018577098846, "rewards/rejected": -0.025512080639600754, "step": 3100 }, { "epoch": 2.2406340057636887, "grad_norm": 2.7660207748413086, "learning_rate": 9.142699728146336e-09, "logits/chosen": -1.4331061840057373, "logits/rejected": -1.4245890378952026, "logps/chosen": -46.127166748046875, "logps/rejected": -51.11559295654297, "loss": 0.6843, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0028220864478498697, "rewards/margins": 0.018410906195640564, "rewards/rejected": -0.02123299241065979, "step": 3110 }, { "epoch": 2.2478386167146973, "grad_norm": 2.9993467330932617, "learning_rate": 8.981213005715627e-09, "logits/chosen": -1.501518726348877, "logits/rejected": -1.5010533332824707, "logps/chosen": -44.233001708984375, "logps/rejected": -49.084808349609375, "loss": 0.6841, "rewards/accuracies": 0.625, "rewards/chosen": -0.0025021065957844257, "rewards/margins": 0.01857968047261238, "rewards/rejected": -0.021081790328025818, "step": 3120 }, { "epoch": 2.255043227665706, "grad_norm": 3.6337778568267822, "learning_rate": 8.820852337865611e-09, "logits/chosen": -1.5516728162765503, "logits/rejected": -1.536139965057373, "logps/chosen": -45.085365295410156, "logps/rejected": -48.6732063293457, "loss": 0.6844, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0014631979865953326, "rewards/margins": 0.01818043366074562, "rewards/rejected": -0.0196436308324337, "step": 3130 }, { "epoch": 2.2622478386167146, "grad_norm": 2.8269283771514893, "learning_rate": 8.661628997289044e-09, "logits/chosen": -1.4337918758392334, "logits/rejected": -1.420588731765747, "logps/chosen": -45.37371826171875, "logps/rejected": -49.878807067871094, "loss": 0.6839, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.001038391375914216, "rewards/margins": 0.019226137548685074, "rewards/rejected": -0.020264528691768646, "step": 3140 }, { "epoch": 2.2694524495677233, "grad_norm": 2.8211851119995117, "learning_rate": 8.503554176729341e-09, "logits/chosen": -1.4207173585891724, "logits/rejected": -1.4155914783477783, "logps/chosen": -45.44511032104492, "logps/rejected": -49.06669998168945, "loss": 0.6832, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0009460471337661147, "rewards/margins": 0.0205953698605299, "rewards/rejected": -0.01964932307600975, "step": 3150 }, { "epoch": 2.276657060518732, "grad_norm": 3.823362350463867, "learning_rate": 8.346638988193636e-09, "logits/chosen": -1.4726402759552002, "logits/rejected": -1.4704288244247437, "logps/chosen": -40.57404708862305, "logps/rejected": -46.3819580078125, "loss": 0.6835, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0009540968458168209, "rewards/margins": 0.02004236914217472, "rewards/rejected": -0.020996464416384697, "step": 3160 }, { "epoch": 2.2838616714697406, "grad_norm": 4.203202247619629, "learning_rate": 8.19089446217176e-09, "logits/chosen": -1.4289653301239014, "logits/rejected": -1.4073545932769775, "logps/chosen": -45.77039337158203, "logps/rejected": -51.22684860229492, "loss": 0.6797, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.002206823555752635, "rewards/margins": 0.02788766846060753, "rewards/rejected": -0.025680843740701675, "step": 3170 }, { "epoch": 2.2910662824207493, "grad_norm": 3.061116933822632, "learning_rate": 8.036331546860777e-09, "logits/chosen": -1.4556572437286377, "logits/rejected": -1.453061819076538, "logps/chosen": -45.42163848876953, "logps/rejected": -48.2816276550293, "loss": 0.6875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0038195946253836155, "rewards/margins": 0.012059660628437996, "rewards/rejected": -0.015879254788160324, "step": 3180 }, { "epoch": 2.298270893371758, "grad_norm": 3.667120933532715, "learning_rate": 7.882961107395416e-09, "logits/chosen": -1.4970252513885498, "logits/rejected": -1.4873993396759033, "logps/chosen": -52.32844924926758, "logps/rejected": -52.70637893676758, "loss": 0.6869, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006024113390594721, "rewards/margins": 0.013099935837090015, "rewards/rejected": -0.019124049693346024, "step": 3190 }, { "epoch": 2.3054755043227666, "grad_norm": 4.6425042152404785, "learning_rate": 7.73079392508428e-09, "logits/chosen": -1.421281099319458, "logits/rejected": -1.4252192974090576, "logps/chosen": -49.780357360839844, "logps/rejected": -56.728782653808594, "loss": 0.6824, "rewards/accuracies": 0.59375, "rewards/chosen": -0.004081489983946085, "rewards/margins": 0.02246311493217945, "rewards/rejected": -0.026544606313109398, "step": 3200 }, { "epoch": 2.3126801152737753, "grad_norm": 3.641294479370117, "learning_rate": 7.579840696651938e-09, "logits/chosen": -1.5132240056991577, "logits/rejected": -1.5067884922027588, "logps/chosen": -42.3286247253418, "logps/rejected": -45.735172271728516, "loss": 0.6841, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.004373098723590374, "rewards/margins": 0.01880384422838688, "rewards/rejected": -0.02317694202065468, "step": 3210 }, { "epoch": 2.319884726224784, "grad_norm": 4.251941680908203, "learning_rate": 7.43011203348704e-09, "logits/chosen": -1.3565527200698853, "logits/rejected": -1.3517991304397583, "logps/chosen": -53.13508224487305, "logps/rejected": -53.89665603637695, "loss": 0.6857, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.005780863109976053, "rewards/margins": 0.015468914993107319, "rewards/rejected": -0.02124977670609951, "step": 3220 }, { "epoch": 2.3270893371757926, "grad_norm": 3.2468185424804688, "learning_rate": 7.281618460896344e-09, "logits/chosen": -1.4836031198501587, "logits/rejected": -1.4732134342193604, "logps/chosen": -46.293495178222656, "logps/rejected": -50.772377014160156, "loss": 0.6844, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0019498964538797736, "rewards/margins": 0.018127668648958206, "rewards/rejected": -0.02007756568491459, "step": 3230 }, { "epoch": 2.3342939481268012, "grad_norm": 2.987872362136841, "learning_rate": 7.134370417364849e-09, "logits/chosen": -1.431056022644043, "logits/rejected": -1.4228883981704712, "logps/chosen": -45.278480529785156, "logps/rejected": -48.09427261352539, "loss": 0.6868, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007163041736930609, "rewards/margins": 0.013337318785488605, "rewards/rejected": -0.020500360056757927, "step": 3240 }, { "epoch": 2.34149855907781, "grad_norm": 4.016878604888916, "learning_rate": 6.988378253821981e-09, "logits/chosen": -1.4581267833709717, "logits/rejected": -1.4503519535064697, "logps/chosen": -51.406394958496094, "logps/rejected": -54.95398712158203, "loss": 0.6867, "rewards/accuracies": 0.59375, "rewards/chosen": 8.269222598755732e-05, "rewards/margins": 0.013455493375658989, "rewards/rejected": -0.013372799381613731, "step": 3250 }, { "epoch": 2.3487031700288186, "grad_norm": 3.061265468597412, "learning_rate": 6.8436522329140186e-09, "logits/chosen": -1.4396600723266602, "logits/rejected": -1.445936918258667, "logps/chosen": -46.99631881713867, "logps/rejected": -50.84511184692383, "loss": 0.6853, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0030071113724261522, "rewards/margins": 0.016658034175634384, "rewards/rejected": -0.019665146246552467, "step": 3260 }, { "epoch": 2.3559077809798272, "grad_norm": 3.53556752204895, "learning_rate": 6.700202528282603e-09, "logits/chosen": -1.4168142080307007, "logits/rejected": -1.3971381187438965, "logps/chosen": -48.588462829589844, "logps/rejected": -51.615745544433594, "loss": 0.6833, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.00601479597389698, "rewards/margins": 0.020779959857463837, "rewards/rejected": -0.026794757694005966, "step": 3270 }, { "epoch": 2.363112391930836, "grad_norm": 3.7148492336273193, "learning_rate": 6.558039223849668e-09, "logits/chosen": -1.5100008249282837, "logits/rejected": -1.4907362461090088, "logps/chosen": -46.292266845703125, "logps/rejected": -52.69337844848633, "loss": 0.6816, "rewards/accuracies": 0.625, "rewards/chosen": -0.0020173946395516396, "rewards/margins": 0.024240782484412193, "rewards/rejected": -0.026258179917931557, "step": 3280 }, { "epoch": 2.3703170028818445, "grad_norm": 2.8915648460388184, "learning_rate": 6.417172313108471e-09, "logits/chosen": -1.4239482879638672, "logits/rejected": -1.4123411178588867, "logps/chosen": -44.13205337524414, "logps/rejected": -47.372459411621094, "loss": 0.6852, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.008008310571312904, "rewards/margins": 0.016741162165999413, "rewards/rejected": -0.024749474599957466, "step": 3290 }, { "epoch": 2.377521613832853, "grad_norm": 2.96709942817688, "learning_rate": 6.277611698421179e-09, "logits/chosen": -1.5538231134414673, "logits/rejected": -1.5337212085723877, "logps/chosen": -39.08244323730469, "logps/rejected": -45.03202438354492, "loss": 0.6812, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.003737329039722681, "rewards/margins": 0.02486787550151348, "rewards/rejected": -0.028605204075574875, "step": 3300 }, { "epoch": 2.3847262247838614, "grad_norm": 4.765800476074219, "learning_rate": 6.139367190322714e-09, "logits/chosen": -1.4922102689743042, "logits/rejected": -1.4921929836273193, "logps/chosen": -52.74082565307617, "logps/rejected": -58.16496658325195, "loss": 0.6855, "rewards/accuracies": 0.625, "rewards/chosen": -0.0037182599771767855, "rewards/margins": 0.01609138958156109, "rewards/rejected": -0.019809648394584656, "step": 3310 }, { "epoch": 2.39193083573487, "grad_norm": 2.5080461502075195, "learning_rate": 6.002448506831171e-09, "logits/chosen": -1.4787399768829346, "logits/rejected": -1.474265217781067, "logps/chosen": -44.079654693603516, "logps/rejected": -49.27398681640625, "loss": 0.6843, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.003509215312078595, "rewards/margins": 0.0185092780739069, "rewards/rejected": -0.022018492221832275, "step": 3320 }, { "epoch": 2.3991354466858787, "grad_norm": 3.059837818145752, "learning_rate": 5.866865272764607e-09, "logits/chosen": -1.4946706295013428, "logits/rejected": -1.4880635738372803, "logps/chosen": -46.43208694458008, "logps/rejected": -50.517845153808594, "loss": 0.6854, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0064870258793234825, "rewards/margins": 0.016077209264039993, "rewards/rejected": -0.0225642379373312, "step": 3330 }, { "epoch": 2.4063400576368874, "grad_norm": 4.633550643920898, "learning_rate": 5.7326270190645595e-09, "logits/chosen": -1.327772855758667, "logits/rejected": -1.3225994110107422, "logps/chosen": -49.93898391723633, "logps/rejected": -52.00776290893555, "loss": 0.6846, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0053224144503474236, "rewards/margins": 0.017836768180131912, "rewards/rejected": -0.02315918542444706, "step": 3340 }, { "epoch": 2.413544668587896, "grad_norm": 3.6820759773254395, "learning_rate": 5.599743182125938e-09, "logits/chosen": -1.537332534790039, "logits/rejected": -1.536694049835205, "logps/chosen": -48.791236877441406, "logps/rejected": -54.156349182128906, "loss": 0.6848, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0013721368741244078, "rewards/margins": 0.01742737926542759, "rewards/rejected": -0.018799515441060066, "step": 3350 }, { "epoch": 2.4207492795389047, "grad_norm": 3.4776570796966553, "learning_rate": 5.46822310313379e-09, "logits/chosen": -1.5602588653564453, "logits/rejected": -1.5649378299713135, "logps/chosen": -49.44075012207031, "logps/rejected": -52.79970169067383, "loss": 0.6874, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0048035006038844585, "rewards/margins": 0.012090938165783882, "rewards/rejected": -0.01689443737268448, "step": 3360 }, { "epoch": 2.4279538904899134, "grad_norm": 3.6628270149230957, "learning_rate": 5.33807602740658e-09, "logits/chosen": -1.5563075542449951, "logits/rejected": -1.5405874252319336, "logps/chosen": -41.89576721191406, "logps/rejected": -47.50288009643555, "loss": 0.6799, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0014247519429773092, "rewards/margins": 0.027519360184669495, "rewards/rejected": -0.0289441104978323, "step": 3370 }, { "epoch": 2.435158501440922, "grad_norm": 3.8053536415100098, "learning_rate": 5.209311103746334e-09, "logits/chosen": -1.475118637084961, "logits/rejected": -1.4708943367004395, "logps/chosen": -47.132999420166016, "logps/rejected": -52.47126007080078, "loss": 0.6836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.003361554117873311, "rewards/margins": 0.019802602007985115, "rewards/rejected": -0.023164156824350357, "step": 3380 }, { "epoch": 2.4423631123919307, "grad_norm": 4.187764644622803, "learning_rate": 5.081937383795484e-09, "logits/chosen": -1.4638832807540894, "logits/rejected": -1.45353102684021, "logps/chosen": -44.22564697265625, "logps/rejected": -48.906394958496094, "loss": 0.6821, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0011400593211874366, "rewards/margins": 0.022827504202723503, "rewards/rejected": -0.02396756038069725, "step": 3390 }, { "epoch": 2.4495677233429394, "grad_norm": 3.6951913833618164, "learning_rate": 4.955963821400599e-09, "logits/chosen": -1.524279236793518, "logits/rejected": -1.506208062171936, "logps/chosen": -46.91497802734375, "logps/rejected": -49.75968933105469, "loss": 0.6827, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.002611330011859536, "rewards/margins": 0.0218039583414793, "rewards/rejected": -0.024415289983153343, "step": 3400 }, { "epoch": 2.456772334293948, "grad_norm": 2.797675371170044, "learning_rate": 4.831399271982928e-09, "logits/chosen": -1.3963180780410767, "logits/rejected": -1.380124568939209, "logps/chosen": -49.73040008544922, "logps/rejected": -52.8338737487793, "loss": 0.6831, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0018096374114975333, "rewards/margins": 0.02105477824807167, "rewards/rejected": -0.022864414379000664, "step": 3410 }, { "epoch": 2.4639769452449567, "grad_norm": 3.9951834678649902, "learning_rate": 4.708252491915951e-09, "logits/chosen": -1.4993171691894531, "logits/rejected": -1.4892971515655518, "logps/chosen": -47.080963134765625, "logps/rejected": -51.67246627807617, "loss": 0.683, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.003982001915574074, "rewards/margins": 0.021508801728487015, "rewards/rejected": -0.02549080178141594, "step": 3420 }, { "epoch": 2.4711815561959654, "grad_norm": 2.9166736602783203, "learning_rate": 4.58653213790981e-09, "logits/chosen": -1.4970018863677979, "logits/rejected": -1.4785791635513306, "logps/chosen": -47.441192626953125, "logps/rejected": -52.0790901184082, "loss": 0.6837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.002383728977292776, "rewards/margins": 0.019747789949178696, "rewards/rejected": -0.02213152125477791, "step": 3430 }, { "epoch": 2.478386167146974, "grad_norm": 3.378357172012329, "learning_rate": 4.466246766402773e-09, "logits/chosen": -1.4705661535263062, "logits/rejected": -1.4513781070709229, "logps/chosen": -48.759010314941406, "logps/rejected": -52.48942947387695, "loss": 0.6819, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0021346856374293566, "rewards/margins": 0.023418936878442764, "rewards/rejected": -0.025553623214364052, "step": 3440 }, { "epoch": 2.4855907780979827, "grad_norm": 3.730109453201294, "learning_rate": 4.347404832959775e-09, "logits/chosen": -1.5252504348754883, "logits/rejected": -1.5139144659042358, "logps/chosen": -44.665199279785156, "logps/rejected": -48.89989471435547, "loss": 0.6839, "rewards/accuracies": 0.65625, "rewards/chosen": -0.004804068244993687, "rewards/margins": 0.019236544147133827, "rewards/rejected": -0.02404061332345009, "step": 3450 }, { "epoch": 2.4927953890489913, "grad_norm": 3.4961752891540527, "learning_rate": 4.230014691678016e-09, "logits/chosen": -1.4771640300750732, "logits/rejected": -1.4784762859344482, "logps/chosen": -49.46748352050781, "logps/rejected": -51.16899871826172, "loss": 0.6866, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.006382669322192669, "rewards/margins": 0.013793786987662315, "rewards/rejected": -0.020176459103822708, "step": 3460 }, { "epoch": 2.5, "grad_norm": 3.152644395828247, "learning_rate": 4.114084594599707e-09, "logits/chosen": -1.4625837802886963, "logits/rejected": -1.4395456314086914, "logps/chosen": -45.60460662841797, "logps/rejected": -51.585113525390625, "loss": 0.682, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.0029743132181465626, "rewards/margins": 0.02330465242266655, "rewards/rejected": -0.026278968900442123, "step": 3470 }, { "epoch": 2.5072046109510087, "grad_norm": 3.1570560932159424, "learning_rate": 3.9996226911319546e-09, "logits/chosen": -1.4798153638839722, "logits/rejected": -1.457363486289978, "logps/chosen": -45.63786315917969, "logps/rejected": -48.74847412109375, "loss": 0.6838, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0032687117345631123, "rewards/margins": 0.01941683515906334, "rewards/rejected": -0.022685546427965164, "step": 3480 }, { "epoch": 2.5144092219020173, "grad_norm": 3.3458943367004395, "learning_rate": 3.886637027473949e-09, "logits/chosen": -1.512085199356079, "logits/rejected": -1.5080697536468506, "logps/chosen": -47.53780746459961, "logps/rejected": -51.640533447265625, "loss": 0.6836, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0049394266679883, "rewards/margins": 0.01969726011157036, "rewards/rejected": -0.024636687710881233, "step": 3490 }, { "epoch": 2.521613832853026, "grad_norm": 3.2042598724365234, "learning_rate": 3.775135546051295e-09, "logits/chosen": -1.4048144817352295, "logits/rejected": -1.4050971269607544, "logps/chosen": -46.04724884033203, "logps/rejected": -50.54692077636719, "loss": 0.6819, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.004532016348093748, "rewards/margins": 0.023406367748975754, "rewards/rejected": -0.02793838456273079, "step": 3500 }, { "epoch": 2.5288184438040346, "grad_norm": 3.420597553253174, "learning_rate": 3.665126084957723e-09, "logits/chosen": -1.469005823135376, "logits/rejected": -1.4607031345367432, "logps/chosen": -50.867950439453125, "logps/rejected": -51.225799560546875, "loss": 0.6844, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.005590965040028095, "rewards/margins": 0.01840902492403984, "rewards/rejected": -0.023999987170100212, "step": 3510 }, { "epoch": 2.5360230547550433, "grad_norm": 3.1464619636535645, "learning_rate": 3.556616377404101e-09, "logits/chosen": -1.5020486116409302, "logits/rejected": -1.4901654720306396, "logps/chosen": -51.85783004760742, "logps/rejected": -55.8673210144043, "loss": 0.6817, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0066805132664740086, "rewards/margins": 0.02360719069838524, "rewards/rejected": -0.03028770722448826, "step": 3520 }, { "epoch": 2.543227665706052, "grad_norm": 3.3954811096191406, "learning_rate": 3.4496140511748125e-09, "logits/chosen": -1.485480546951294, "logits/rejected": -1.4669150114059448, "logps/chosen": -48.15165710449219, "logps/rejected": -51.049583435058594, "loss": 0.6833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007361284457147121, "rewards/margins": 0.02040119096636772, "rewards/rejected": -0.027762476354837418, "step": 3530 }, { "epoch": 2.5504322766570606, "grad_norm": 3.955996513366699, "learning_rate": 3.3441266280915427e-09, "logits/chosen": -1.4491957426071167, "logits/rejected": -1.446547269821167, "logps/chosen": -53.73142623901367, "logps/rejected": -57.25432205200195, "loss": 0.6852, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0023315693251788616, "rewards/margins": 0.01659976877272129, "rewards/rejected": -0.01893133856356144, "step": 3540 }, { "epoch": 2.5576368876080693, "grad_norm": 3.521127462387085, "learning_rate": 3.2401615234845693e-09, "logits/chosen": -1.492701530456543, "logits/rejected": -1.475007176399231, "logps/chosen": -54.01853561401367, "logps/rejected": -57.351951599121094, "loss": 0.6819, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0070812939666211605, "rewards/margins": 0.023319412022829056, "rewards/rejected": -0.030400704592466354, "step": 3550 }, { "epoch": 2.564841498559078, "grad_norm": 3.1193737983703613, "learning_rate": 3.1377260456714375e-09, "logits/chosen": -1.3230210542678833, "logits/rejected": -1.3108537197113037, "logps/chosen": -49.07139587402344, "logps/rejected": -54.1447868347168, "loss": 0.6835, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.007112964056432247, "rewards/margins": 0.020381804555654526, "rewards/rejected": -0.027494769543409348, "step": 3560 }, { "epoch": 2.5720461095100866, "grad_norm": 3.66926908493042, "learning_rate": 3.0368273954432698e-09, "logits/chosen": -1.5296647548675537, "logits/rejected": -1.5019731521606445, "logps/chosen": -51.096221923828125, "logps/rejected": -53.320556640625, "loss": 0.6845, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.006328393705189228, "rewards/margins": 0.018091212958097458, "rewards/rejected": -0.02441960759460926, "step": 3570 }, { "epoch": 2.5792507204610953, "grad_norm": 3.0272419452667236, "learning_rate": 2.937472665558541e-09, "logits/chosen": -1.5538597106933594, "logits/rejected": -1.5464608669281006, "logps/chosen": -45.452919006347656, "logps/rejected": -47.75053024291992, "loss": 0.6817, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.008346345275640488, "rewards/margins": 0.02381196618080139, "rewards/rejected": -0.03215831145644188, "step": 3580 }, { "epoch": 2.586455331412104, "grad_norm": 4.039034843444824, "learning_rate": 2.8396688402445053e-09, "logits/chosen": -1.574406385421753, "logits/rejected": -1.557455062866211, "logps/chosen": -45.44925308227539, "logps/rejected": -51.695777893066406, "loss": 0.6816, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.010429826565086842, "rewards/margins": 0.024167021736502647, "rewards/rejected": -0.03459685295820236, "step": 3590 }, { "epoch": 2.5936599423631126, "grad_norm": 4.070069313049316, "learning_rate": 2.7434227947062324e-09, "logits/chosen": -1.526296854019165, "logits/rejected": -1.5150690078735352, "logps/chosen": -53.84295654296875, "logps/rejected": -57.382408142089844, "loss": 0.6856, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.005254354793578386, "rewards/margins": 0.015645707026124, "rewards/rejected": -0.02090005949139595, "step": 3600 }, { "epoch": 2.6008645533141213, "grad_norm": 3.0419559478759766, "learning_rate": 2.6487412946432976e-09, "logits/chosen": -1.4455441236495972, "logits/rejected": -1.4326177835464478, "logps/chosen": -49.550575256347656, "logps/rejected": -52.355995178222656, "loss": 0.6822, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.013003657571971416, "rewards/margins": 0.02292976900935173, "rewards/rejected": -0.03593342751264572, "step": 3610 }, { "epoch": 2.60806916426513, "grad_norm": 3.4363460540771484, "learning_rate": 2.5556309957742024e-09, "logits/chosen": -1.4442135095596313, "logits/rejected": -1.4347007274627686, "logps/chosen": -44.96186447143555, "logps/rejected": -52.17518997192383, "loss": 0.6803, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0013826603535562754, "rewards/margins": 0.02670016512274742, "rewards/rejected": -0.02531750500202179, "step": 3620 }, { "epoch": 2.6152737752161386, "grad_norm": 3.359717607498169, "learning_rate": 2.4640984433684758e-09, "logits/chosen": -1.5575648546218872, "logits/rejected": -1.543250560760498, "logps/chosen": -50.973533630371094, "logps/rejected": -53.1517448425293, "loss": 0.6838, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.004719638731330633, "rewards/margins": 0.019542286172509193, "rewards/rejected": -0.024261925369501114, "step": 3630 }, { "epoch": 2.6224783861671472, "grad_norm": 3.6610288619995117, "learning_rate": 2.3741500717865987e-09, "logits/chosen": -1.4447298049926758, "logits/rejected": -1.456498384475708, "logps/chosen": -47.38273620605469, "logps/rejected": -52.238502502441406, "loss": 0.684, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0018823295831680298, "rewards/margins": 0.01906423084437847, "rewards/rejected": -0.0209465604275465, "step": 3640 }, { "epoch": 2.629682997118156, "grad_norm": 3.1195054054260254, "learning_rate": 2.285792204027678e-09, "logits/chosen": -1.4207738637924194, "logits/rejected": -1.4102163314819336, "logps/chosen": -47.51162338256836, "logps/rejected": -54.73249053955078, "loss": 0.6823, "rewards/accuracies": 0.625, "rewards/chosen": -0.004845681134611368, "rewards/margins": 0.02257709763944149, "rewards/rejected": -0.02742278017103672, "step": 3650 }, { "epoch": 2.636887608069164, "grad_norm": 3.726067066192627, "learning_rate": 2.199031051284972e-09, "logits/chosen": -1.4994523525238037, "logits/rejected": -1.4987690448760986, "logps/chosen": -48.3768196105957, "logps/rejected": -52.434532165527344, "loss": 0.6837, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.004970946349203587, "rewards/margins": 0.020046500489115715, "rewards/rejected": -0.025017445906996727, "step": 3660 }, { "epoch": 2.6440922190201728, "grad_norm": 3.8218469619750977, "learning_rate": 2.113872712509254e-09, "logits/chosen": -1.4067411422729492, "logits/rejected": -1.3974635601043701, "logps/chosen": -56.22896194458008, "logps/rejected": -59.406829833984375, "loss": 0.6831, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.009745483286678791, "rewards/margins": 0.020852208137512207, "rewards/rejected": -0.030597690492868423, "step": 3670 }, { "epoch": 2.6512968299711814, "grad_norm": 3.4928946495056152, "learning_rate": 2.0303231739801143e-09, "logits/chosen": -1.410017490386963, "logits/rejected": -1.3968251943588257, "logps/chosen": -50.764427185058594, "logps/rejected": -55.013816833496094, "loss": 0.6845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009240074083209038, "rewards/margins": 0.01817367412149906, "rewards/rejected": -0.02741374634206295, "step": 3680 }, { "epoch": 2.65850144092219, "grad_norm": 3.9155476093292236, "learning_rate": 1.948388308885102e-09, "logits/chosen": -1.573972225189209, "logits/rejected": -1.558861255645752, "logps/chosen": -50.125404357910156, "logps/rejected": -53.09186553955078, "loss": 0.6849, "rewards/accuracies": 0.59375, "rewards/chosen": -0.003737266408279538, "rewards/margins": 0.01738080568611622, "rewards/rejected": -0.02111807093024254, "step": 3690 }, { "epoch": 2.6657060518731988, "grad_norm": 3.190427780151367, "learning_rate": 1.86807387690692e-09, "logits/chosen": -1.5523760318756104, "logits/rejected": -1.5453031063079834, "logps/chosen": -50.241458892822266, "logps/rejected": -57.71672821044922, "loss": 0.6787, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0018135461723431945, "rewards/margins": 0.03013269044458866, "rewards/rejected": -0.03194623440504074, "step": 3700 }, { "epoch": 2.6729106628242074, "grad_norm": 3.522392511367798, "learning_rate": 1.789385523818493e-09, "logits/chosen": -1.4759515523910522, "logits/rejected": -1.4781670570373535, "logps/chosen": -45.24992370605469, "logps/rejected": -51.2096061706543, "loss": 0.6822, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0035810978151857853, "rewards/margins": 0.022562062367796898, "rewards/rejected": -0.026143159717321396, "step": 3710 }, { "epoch": 2.680115273775216, "grad_norm": 3.612794876098633, "learning_rate": 1.712328781086131e-09, "logits/chosen": -1.5478874444961548, "logits/rejected": -1.531994104385376, "logps/chosen": -51.03803634643555, "logps/rejected": -53.24555206298828, "loss": 0.6864, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.007410322315990925, "rewards/margins": 0.014138393104076385, "rewards/rejected": -0.021548714488744736, "step": 3720 }, { "epoch": 2.6873198847262247, "grad_norm": 3.423640727996826, "learning_rate": 1.6369090654806543e-09, "logits/chosen": -1.5726535320281982, "logits/rejected": -1.5603439807891846, "logps/chosen": -46.89988708496094, "logps/rejected": -51.72241973876953, "loss": 0.6846, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.008618971332907677, "rewards/margins": 0.017804089933633804, "rewards/rejected": -0.026423057541251183, "step": 3730 }, { "epoch": 2.6945244956772334, "grad_norm": 3.223950147628784, "learning_rate": 1.5631316786966498e-09, "logits/chosen": -1.4826900959014893, "logits/rejected": -1.4667797088623047, "logps/chosen": -45.12305450439453, "logps/rejected": -48.53042984008789, "loss": 0.6849, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.006777583155781031, "rewards/margins": 0.01729729399085045, "rewards/rejected": -0.024074876680970192, "step": 3740 }, { "epoch": 2.701729106628242, "grad_norm": 4.128338813781738, "learning_rate": 1.491001806979772e-09, "logits/chosen": -1.5129797458648682, "logits/rejected": -1.498471736907959, "logps/chosen": -50.177757263183594, "logps/rejected": -54.36769485473633, "loss": 0.6838, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0021677894983440638, "rewards/margins": 0.019533688202500343, "rewards/rejected": -0.021701481193304062, "step": 3750 }, { "epoch": 2.7089337175792507, "grad_norm": 3.762078285217285, "learning_rate": 1.4205245207621508e-09, "logits/chosen": -1.4353492259979248, "logits/rejected": -1.4193631410598755, "logps/chosen": -52.8912239074707, "logps/rejected": -55.724334716796875, "loss": 0.682, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0020478595979511738, "rewards/margins": 0.023334335535764694, "rewards/rejected": -0.02538219466805458, "step": 3760 }, { "epoch": 2.7161383285302594, "grad_norm": 3.871379852294922, "learning_rate": 1.3517047743059978e-09, "logits/chosen": -1.5186526775360107, "logits/rejected": -1.5207148790359497, "logps/chosen": -49.53696060180664, "logps/rejected": -55.4676513671875, "loss": 0.6838, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0054653664119541645, "rewards/margins": 0.01927504874765873, "rewards/rejected": -0.024740414693951607, "step": 3770 }, { "epoch": 2.723342939481268, "grad_norm": 3.295902729034424, "learning_rate": 1.2845474053553156e-09, "logits/chosen": -1.5167324542999268, "logits/rejected": -1.5084768533706665, "logps/chosen": -43.50970458984375, "logps/rejected": -47.10810089111328, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": -0.007235602475702763, "rewards/margins": 0.01710323989391327, "rewards/rejected": -0.024338845163583755, "step": 3780 }, { "epoch": 2.7305475504322767, "grad_norm": 2.8286335468292236, "learning_rate": 1.2190571347958422e-09, "logits/chosen": -1.5425716638565063, "logits/rejected": -1.5455642938613892, "logps/chosen": -43.28416061401367, "logps/rejected": -50.13677978515625, "loss": 0.6843, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0002638758742250502, "rewards/margins": 0.018405336886644363, "rewards/rejected": -0.018669212237000465, "step": 3790 }, { "epoch": 2.7377521613832854, "grad_norm": 2.9479782581329346, "learning_rate": 1.1552385663231634e-09, "logits/chosen": -1.4791892766952515, "logits/rejected": -1.457729458808899, "logps/chosen": -48.17145538330078, "logps/rejected": -50.11743927001953, "loss": 0.6851, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.005142406094819307, "rewards/margins": 0.016895312815904617, "rewards/rejected": -0.022037718445062637, "step": 3800 }, { "epoch": 2.744956772334294, "grad_norm": 3.1124942302703857, "learning_rate": 1.0930961861191302e-09, "logits/chosen": -1.4413386583328247, "logits/rejected": -1.4406673908233643, "logps/chosen": -46.42212677001953, "logps/rejected": -49.962947845458984, "loss": 0.6863, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007161378860473633, "rewards/margins": 0.014661896042525768, "rewards/rejected": -0.021823275834321976, "step": 3810 }, { "epoch": 2.7521613832853027, "grad_norm": 3.0346012115478516, "learning_rate": 1.0326343625364608e-09, "logits/chosen": -1.4345784187316895, "logits/rejected": -1.4189555644989014, "logps/chosen": -47.113319396972656, "logps/rejected": -52.565155029296875, "loss": 0.6807, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.005450129974633455, "rewards/margins": 0.02591022290289402, "rewards/rejected": -0.03136035054922104, "step": 3820 }, { "epoch": 2.7593659942363113, "grad_norm": 2.6539506912231445, "learning_rate": 9.738573457917066e-10, "logits/chosen": -1.5480695962905884, "logits/rejected": -1.5419654846191406, "logps/chosen": -41.202308654785156, "logps/rejected": -47.30192184448242, "loss": 0.6825, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.005542878992855549, "rewards/margins": 0.022122148424386978, "rewards/rejected": -0.0276650283485651, "step": 3830 }, { "epoch": 2.76657060518732, "grad_norm": 3.010223865509033, "learning_rate": 9.16769267666434e-10, "logits/chosen": -1.466994047164917, "logits/rejected": -1.4610474109649658, "logps/chosen": -46.274566650390625, "logps/rejected": -48.29325866699219, "loss": 0.6887, "rewards/accuracies": 0.53125, "rewards/chosen": -0.006465951446443796, "rewards/margins": 0.009437872096896172, "rewards/rejected": -0.01590382307767868, "step": 3840 }, { "epoch": 2.7737752161383287, "grad_norm": 3.2893083095550537, "learning_rate": 8.613741412168113e-10, "logits/chosen": -1.4859510660171509, "logits/rejected": -1.4807151556015015, "logps/chosen": -54.31746292114258, "logps/rejected": -58.5187873840332, "loss": 0.6831, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.003899764269590378, "rewards/margins": 0.02083163894712925, "rewards/rejected": -0.024731403216719627, "step": 3850 }, { "epoch": 2.7809798270893373, "grad_norm": 3.3477933406829834, "learning_rate": 8.076758604914802e-10, "logits/chosen": -1.4456332921981812, "logits/rejected": -1.4328854084014893, "logps/chosen": -43.15898895263672, "logps/rejected": -46.71881866455078, "loss": 0.6843, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0022692340426146984, "rewards/margins": 0.018418530002236366, "rewards/rejected": -0.020687762647867203, "step": 3860 }, { "epoch": 2.7881844380403455, "grad_norm": 4.6873016357421875, "learning_rate": 7.55678200257856e-10, "logits/chosen": -1.442856788635254, "logits/rejected": -1.4301444292068481, "logps/chosen": -50.084381103515625, "logps/rejected": -55.53865432739258, "loss": 0.6827, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.006987369619309902, "rewards/margins": 0.02188916876912117, "rewards/rejected": -0.028876539319753647, "step": 3870 }, { "epoch": 2.795389048991354, "grad_norm": 3.2509396076202393, "learning_rate": 7.053848157367315e-10, "logits/chosen": -1.4659183025360107, "logits/rejected": -1.4521931409835815, "logps/chosen": -48.176673889160156, "logps/rejected": -53.21508026123047, "loss": 0.6827, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.001605423167347908, "rewards/margins": 0.021672086790204048, "rewards/rejected": -0.023277509957551956, "step": 3880 }, { "epoch": 2.802593659942363, "grad_norm": 2.586207389831543, "learning_rate": 6.567992423453794e-10, "logits/chosen": -1.4940398931503296, "logits/rejected": -1.4875811338424683, "logps/chosen": -43.39032745361328, "logps/rejected": -46.69108581542969, "loss": 0.6838, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004483362659811974, "rewards/margins": 0.019497577100992203, "rewards/rejected": -0.023980939760804176, "step": 3890 }, { "epoch": 2.8097982708933715, "grad_norm": 3.191453695297241, "learning_rate": 6.099248954489794e-10, "logits/chosen": -1.4086828231811523, "logits/rejected": -1.4071089029312134, "logps/chosen": -47.94374465942383, "logps/rejected": -53.171348571777344, "loss": 0.6833, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.007214091718196869, "rewards/margins": 0.020340237766504288, "rewards/rejected": -0.027554329484701157, "step": 3900 }, { "epoch": 2.81700288184438, "grad_norm": 3.6769521236419678, "learning_rate": 5.647650701205653e-10, "logits/chosen": -1.5014244318008423, "logits/rejected": -1.4815281629562378, "logps/chosen": -54.42211151123047, "logps/rejected": -58.34537887573242, "loss": 0.6805, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00018626952078193426, "rewards/margins": 0.026389459148049355, "rewards/rejected": -0.026575729250907898, "step": 3910 }, { "epoch": 2.824207492795389, "grad_norm": 3.143864631652832, "learning_rate": 5.213229409093856e-10, "logits/chosen": -1.5347833633422852, "logits/rejected": -1.5241453647613525, "logps/chosen": -52.7486572265625, "logps/rejected": -57.841835021972656, "loss": 0.6815, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.004956572316586971, "rewards/margins": 0.02456669509410858, "rewards/rejected": -0.029523268342018127, "step": 3920 }, { "epoch": 2.8314121037463975, "grad_norm": 4.357008457183838, "learning_rate": 4.796015616177401e-10, "logits/chosen": -1.4576940536499023, "logits/rejected": -1.4457906484603882, "logps/chosen": -51.851417541503906, "logps/rejected": -55.67144012451172, "loss": 0.6854, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.007598734460771084, "rewards/margins": 0.016234243288636208, "rewards/rejected": -0.023832976818084717, "step": 3930 }, { "epoch": 2.838616714697406, "grad_norm": 3.335747480392456, "learning_rate": 4.3960386508631595e-10, "logits/chosen": -1.3839704990386963, "logits/rejected": -1.383461356163025, "logps/chosen": -42.68264389038086, "logps/rejected": -46.677146911621094, "loss": 0.6852, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.007006730884313583, "rewards/margins": 0.016936389729380608, "rewards/rejected": -0.02394312247633934, "step": 3940 }, { "epoch": 2.845821325648415, "grad_norm": 4.871637344360352, "learning_rate": 4.013326629880243e-10, "logits/chosen": -1.4295424222946167, "logits/rejected": -1.4122328758239746, "logps/chosen": -50.189979553222656, "logps/rejected": -54.05989456176758, "loss": 0.6825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007883711718022823, "rewards/margins": 0.022242117673158646, "rewards/rejected": -0.030125826597213745, "step": 3950 }, { "epoch": 2.8530259365994235, "grad_norm": 3.459960460662842, "learning_rate": 3.64790645630339e-10, "logits/chosen": -1.3913426399230957, "logits/rejected": -1.3863009214401245, "logps/chosen": -53.34258270263672, "logps/rejected": -55.69043731689453, "loss": 0.6872, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0019184326520189643, "rewards/margins": 0.012400278821587563, "rewards/rejected": -0.014318712055683136, "step": 3960 }, { "epoch": 2.860230547550432, "grad_norm": 4.9613542556762695, "learning_rate": 3.2998038176619e-10, "logits/chosen": -1.4524660110473633, "logits/rejected": -1.4362642765045166, "logps/chosen": -51.490867614746094, "logps/rejected": -54.9944953918457, "loss": 0.6849, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.007123004645109177, "rewards/margins": 0.01731189154088497, "rewards/rejected": -0.024434898048639297, "step": 3970 }, { "epoch": 2.867435158501441, "grad_norm": 3.5082051753997803, "learning_rate": 2.969043184133907e-10, "logits/chosen": -1.5575920343399048, "logits/rejected": -1.5562455654144287, "logps/chosen": -45.0047721862793, "logps/rejected": -53.420326232910156, "loss": 0.6815, "rewards/accuracies": 0.65625, "rewards/chosen": -0.000279056781437248, "rewards/margins": 0.02413121983408928, "rewards/rejected": -0.024410273879766464, "step": 3980 }, { "epoch": 2.8746397694524495, "grad_norm": 3.8809545040130615, "learning_rate": 2.6556478068261447e-10, "logits/chosen": -1.4493497610092163, "logits/rejected": -1.4356361627578735, "logps/chosen": -44.4965934753418, "logps/rejected": -48.013553619384766, "loss": 0.6796, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0006090818787924945, "rewards/margins": 0.028188396245241165, "rewards/rejected": -0.02757931686937809, "step": 3990 }, { "epoch": 2.881844380403458, "grad_norm": 3.4924869537353516, "learning_rate": 2.3596397161395607e-10, "logits/chosen": -1.558559775352478, "logits/rejected": -1.5367920398712158, "logps/chosen": -49.601959228515625, "logps/rejected": -54.7277946472168, "loss": 0.6809, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.0002869130694307387, "rewards/margins": 0.025376971811056137, "rewards/rejected": -0.025090059265494347, "step": 4000 }, { "epoch": 2.889048991354467, "grad_norm": 4.832521915435791, "learning_rate": 2.0810397202206399e-10, "logits/chosen": -1.4147446155548096, "logits/rejected": -1.4100733995437622, "logps/chosen": -49.89708709716797, "logps/rejected": -53.32837677001953, "loss": 0.6846, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0006277470965869725, "rewards/margins": 0.01780160702764988, "rewards/rejected": -0.01717386022210121, "step": 4010 }, { "epoch": 2.8962536023054755, "grad_norm": 3.2794227600097656, "learning_rate": 1.819867403498737e-10, "logits/chosen": -1.5649325847625732, "logits/rejected": -1.5557774305343628, "logps/chosen": -47.87010955810547, "logps/rejected": -51.79607391357422, "loss": 0.6835, "rewards/accuracies": 0.65625, "rewards/chosen": -0.00826673861593008, "rewards/margins": 0.020225917920470238, "rewards/rejected": -0.028492655605077744, "step": 4020 }, { "epoch": 2.903458213256484, "grad_norm": 3.472938299179077, "learning_rate": 1.5761411253092382e-10, "logits/chosen": -1.430612325668335, "logits/rejected": -1.4091012477874756, "logps/chosen": -46.053260803222656, "logps/rejected": -48.203346252441406, "loss": 0.6843, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.006684750318527222, "rewards/margins": 0.01832910254597664, "rewards/rejected": -0.02501385286450386, "step": 4030 }, { "epoch": 2.910662824207493, "grad_norm": 3.624494791030884, "learning_rate": 1.3498780186031455e-10, "logits/chosen": -1.4939110279083252, "logits/rejected": -1.4849644899368286, "logps/chosen": -53.622901916503906, "logps/rejected": -57.3227653503418, "loss": 0.6845, "rewards/accuracies": 0.59375, "rewards/chosen": -0.006433118134737015, "rewards/margins": 0.018010510131716728, "rewards/rejected": -0.024443628266453743, "step": 4040 }, { "epoch": 2.9178674351585014, "grad_norm": 3.310802936553955, "learning_rate": 1.1410939887425141e-10, "logits/chosen": -1.49782133102417, "logits/rejected": -1.4898258447647095, "logps/chosen": -47.12517166137695, "logps/rejected": -49.63444137573242, "loss": 0.6862, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.010405481792986393, "rewards/margins": 0.014516057446599007, "rewards/rejected": -0.024921538308262825, "step": 4050 }, { "epoch": 2.92507204610951, "grad_norm": 2.949910879135132, "learning_rate": 9.498037123825686e-11, "logits/chosen": -1.5098581314086914, "logits/rejected": -1.498915433883667, "logps/chosen": -45.20121383666992, "logps/rejected": -49.48499298095703, "loss": 0.6833, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.004668011330068111, "rewards/margins": 0.0204728152602911, "rewards/rejected": -0.025140831246972084, "step": 4060 }, { "epoch": 2.9322766570605188, "grad_norm": 3.276839256286621, "learning_rate": 7.760206364398614e-11, "logits/chosen": -1.5862383842468262, "logits/rejected": -1.5651108026504517, "logps/chosen": -49.96502685546875, "logps/rejected": -53.223594665527344, "loss": 0.6837, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.009547281078994274, "rewards/margins": 0.019979404285550117, "rewards/rejected": -0.029526684433221817, "step": 4070 }, { "epoch": 2.9394812680115274, "grad_norm": 3.859468698501587, "learning_rate": 6.19756977147029e-11, "logits/chosen": -1.4423165321350098, "logits/rejected": -1.4334721565246582, "logps/chosen": -47.239994049072266, "logps/rejected": -54.47491455078125, "loss": 0.6829, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.009559379890561104, "rewards/margins": 0.021419430151581764, "rewards/rejected": -0.03097880817949772, "step": 4080 }, { "epoch": 2.946685878962536, "grad_norm": 2.8501765727996826, "learning_rate": 4.810237191940625e-11, "logits/chosen": -1.4424220323562622, "logits/rejected": -1.4332430362701416, "logps/chosen": -46.96299362182617, "logps/rejected": -49.96550369262695, "loss": 0.6856, "rewards/accuracies": 0.5625, "rewards/chosen": -0.008839382790029049, "rewards/margins": 0.01596887595951557, "rewards/rejected": -0.024808257818222046, "step": 4090 }, { "epoch": 2.9538904899135447, "grad_norm": 3.3934149742126465, "learning_rate": 3.5983061495617476e-11, "logits/chosen": -1.5262759923934937, "logits/rejected": -1.5253779888153076, "logps/chosen": -51.821449279785156, "logps/rejected": -57.49546432495117, "loss": 0.684, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.005071837455034256, "rewards/margins": 0.019206812605261803, "rewards/rejected": -0.02427865006029606, "step": 4100 }, { "epoch": 2.9610951008645534, "grad_norm": 3.1180672645568848, "learning_rate": 2.5618618380812694e-11, "logits/chosen": -1.520560622215271, "logits/rejected": -1.5057575702667236, "logps/chosen": -42.085235595703125, "logps/rejected": -47.49565887451172, "loss": 0.6811, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0040741474367678165, "rewards/margins": 0.02492835372686386, "rewards/rejected": -0.029002506285905838, "step": 4110 }, { "epoch": 2.968299711815562, "grad_norm": 3.4017624855041504, "learning_rate": 1.700977115254576e-11, "logits/chosen": -1.4638285636901855, "logits/rejected": -1.454246163368225, "logps/chosen": -46.245887756347656, "logps/rejected": -51.503753662109375, "loss": 0.6831, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.007428421638906002, "rewards/margins": 0.02083088457584381, "rewards/rejected": -0.028259307146072388, "step": 4120 }, { "epoch": 2.9755043227665707, "grad_norm": 2.984016180038452, "learning_rate": 1.0157124977230868e-11, "logits/chosen": -1.4343056678771973, "logits/rejected": -1.4250215291976929, "logps/chosen": -43.65608596801758, "logps/rejected": -47.83146667480469, "loss": 0.6845, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0026060601230710745, "rewards/margins": 0.01812109351158142, "rewards/rejected": -0.02072715200483799, "step": 4130 }, { "epoch": 2.9827089337175794, "grad_norm": 3.555567741394043, "learning_rate": 5.061161567596061e-12, "logits/chosen": -1.4683890342712402, "logits/rejected": -1.4556920528411865, "logps/chosen": -47.792022705078125, "logps/rejected": -50.41429901123047, "loss": 0.6844, "rewards/accuracies": 0.625, "rewards/chosen": -0.0011950184125453234, "rewards/margins": 0.018146729096770287, "rewards/rejected": -0.01934174820780754, "step": 4140 }, { "epoch": 2.989913544668588, "grad_norm": 3.3906075954437256, "learning_rate": 1.7222391488297406e-12, "logits/chosen": -1.5164161920547485, "logits/rejected": -1.5046594142913818, "logps/chosen": -53.657630920410156, "logps/rejected": -58.71654510498047, "loss": 0.6793, "rewards/accuracies": 0.65625, "rewards/chosen": -0.003981665708124638, "rewards/margins": 0.029033973813056946, "rewards/rejected": -0.03301564231514931, "step": 4150 }, { "epoch": 2.9971181556195967, "grad_norm": 3.9542465209960938, "learning_rate": 1.4059243338693238e-13, "logits/chosen": -1.4423478841781616, "logits/rejected": -1.4315330982208252, "logps/chosen": -48.669158935546875, "logps/rejected": -53.47356033325195, "loss": 0.6825, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0017929436871781945, "rewards/margins": 0.02205517329275608, "rewards/rejected": -0.023848116397857666, "step": 4160 }, { "epoch": 3.0, "step": 4164, "total_flos": 0.0, "train_loss": 0.6880504754617968, "train_runtime": 7518.2292, "train_samples_per_second": 8.859, "train_steps_per_second": 0.554 } ], "logging_steps": 10, "max_steps": 4164, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }