zephyr-7b-sft-full / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2167,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00046146746654360867,
"grad_norm": 9.51411938145824,
"learning_rate": 9.216589861751152e-08,
"loss": 1.1509,
"step": 1
},
{
"epoch": 0.0023073373327180432,
"grad_norm": 9.594129890510615,
"learning_rate": 4.608294930875577e-07,
"loss": 1.1369,
"step": 5
},
{
"epoch": 0.0046146746654360865,
"grad_norm": 5.144220521250804,
"learning_rate": 9.216589861751154e-07,
"loss": 1.0949,
"step": 10
},
{
"epoch": 0.00692201199815413,
"grad_norm": 3.40382562328973,
"learning_rate": 1.382488479262673e-06,
"loss": 1.0191,
"step": 15
},
{
"epoch": 0.009229349330872173,
"grad_norm": 3.215915212337555,
"learning_rate": 1.8433179723502307e-06,
"loss": 1.0296,
"step": 20
},
{
"epoch": 0.011536686663590217,
"grad_norm": 2.650742433924389,
"learning_rate": 2.3041474654377884e-06,
"loss": 0.976,
"step": 25
},
{
"epoch": 0.01384402399630826,
"grad_norm": 2.7902393320247687,
"learning_rate": 2.764976958525346e-06,
"loss": 1.0089,
"step": 30
},
{
"epoch": 0.016151361329026302,
"grad_norm": 2.4654209985126148,
"learning_rate": 3.225806451612903e-06,
"loss": 0.9951,
"step": 35
},
{
"epoch": 0.018458698661744346,
"grad_norm": 2.527650306992279,
"learning_rate": 3.6866359447004615e-06,
"loss": 0.9988,
"step": 40
},
{
"epoch": 0.02076603599446239,
"grad_norm": 2.655474988385568,
"learning_rate": 4.147465437788019e-06,
"loss": 1.004,
"step": 45
},
{
"epoch": 0.023073373327180433,
"grad_norm": 2.537803872777302,
"learning_rate": 4.608294930875577e-06,
"loss": 0.9863,
"step": 50
},
{
"epoch": 0.025380710659898477,
"grad_norm": 2.6464740488754366,
"learning_rate": 5.0691244239631346e-06,
"loss": 0.9694,
"step": 55
},
{
"epoch": 0.02768804799261652,
"grad_norm": 2.7507142522196566,
"learning_rate": 5.529953917050692e-06,
"loss": 0.9688,
"step": 60
},
{
"epoch": 0.029995385325334564,
"grad_norm": 2.836749382191462,
"learning_rate": 5.9907834101382485e-06,
"loss": 0.9968,
"step": 65
},
{
"epoch": 0.032302722658052604,
"grad_norm": 3.048439725741993,
"learning_rate": 6.451612903225806e-06,
"loss": 1.0023,
"step": 70
},
{
"epoch": 0.03461005999077065,
"grad_norm": 3.072103382083384,
"learning_rate": 6.912442396313365e-06,
"loss": 0.9909,
"step": 75
},
{
"epoch": 0.03691739732348869,
"grad_norm": 2.4206649546182386,
"learning_rate": 7.373271889400923e-06,
"loss": 1.0277,
"step": 80
},
{
"epoch": 0.03922473465620674,
"grad_norm": 2.6546178755277254,
"learning_rate": 7.83410138248848e-06,
"loss": 1.0123,
"step": 85
},
{
"epoch": 0.04153207198892478,
"grad_norm": 2.3888921486796058,
"learning_rate": 8.294930875576038e-06,
"loss": 0.9688,
"step": 90
},
{
"epoch": 0.043839409321642826,
"grad_norm": 2.32499977135365,
"learning_rate": 8.755760368663595e-06,
"loss": 0.9903,
"step": 95
},
{
"epoch": 0.046146746654360866,
"grad_norm": 2.392443248966377,
"learning_rate": 9.216589861751153e-06,
"loss": 0.9893,
"step": 100
},
{
"epoch": 0.048454083987078914,
"grad_norm": 2.680320857358668,
"learning_rate": 9.67741935483871e-06,
"loss": 0.9846,
"step": 105
},
{
"epoch": 0.050761421319796954,
"grad_norm": 2.532469504703905,
"learning_rate": 1.0138248847926269e-05,
"loss": 1.0089,
"step": 110
},
{
"epoch": 0.053068758652515,
"grad_norm": 3.1046898569172945,
"learning_rate": 1.0599078341013826e-05,
"loss": 1.0266,
"step": 115
},
{
"epoch": 0.05537609598523304,
"grad_norm": 2.5574603903328743,
"learning_rate": 1.1059907834101385e-05,
"loss": 1.002,
"step": 120
},
{
"epoch": 0.05768343331795108,
"grad_norm": 2.452071743693235,
"learning_rate": 1.152073732718894e-05,
"loss": 1.03,
"step": 125
},
{
"epoch": 0.05999077065066913,
"grad_norm": 2.418236004711402,
"learning_rate": 1.1981566820276497e-05,
"loss": 1.006,
"step": 130
},
{
"epoch": 0.06229810798338717,
"grad_norm": 2.378674843033103,
"learning_rate": 1.2442396313364056e-05,
"loss": 0.9717,
"step": 135
},
{
"epoch": 0.06460544531610521,
"grad_norm": 2.288433336347559,
"learning_rate": 1.2903225806451613e-05,
"loss": 1.0247,
"step": 140
},
{
"epoch": 0.06691278264882326,
"grad_norm": 2.7611308296401282,
"learning_rate": 1.3364055299539171e-05,
"loss": 1.003,
"step": 145
},
{
"epoch": 0.0692201199815413,
"grad_norm": 2.421569626628109,
"learning_rate": 1.382488479262673e-05,
"loss": 1.0197,
"step": 150
},
{
"epoch": 0.07152745731425934,
"grad_norm": 2.3978303307399247,
"learning_rate": 1.4285714285714287e-05,
"loss": 0.9898,
"step": 155
},
{
"epoch": 0.07383479464697738,
"grad_norm": 2.980112762291027,
"learning_rate": 1.4746543778801846e-05,
"loss": 1.0275,
"step": 160
},
{
"epoch": 0.07614213197969544,
"grad_norm": 2.6176775912790404,
"learning_rate": 1.5207373271889403e-05,
"loss": 1.0382,
"step": 165
},
{
"epoch": 0.07844946931241348,
"grad_norm": 2.44164739077761,
"learning_rate": 1.566820276497696e-05,
"loss": 1.0236,
"step": 170
},
{
"epoch": 0.08075680664513152,
"grad_norm": 2.404929271261824,
"learning_rate": 1.6129032258064517e-05,
"loss": 1.0304,
"step": 175
},
{
"epoch": 0.08306414397784956,
"grad_norm": 2.3566906067196105,
"learning_rate": 1.6589861751152075e-05,
"loss": 1.0355,
"step": 180
},
{
"epoch": 0.0853714813105676,
"grad_norm": 2.6391147388298246,
"learning_rate": 1.705069124423963e-05,
"loss": 1.0417,
"step": 185
},
{
"epoch": 0.08767881864328565,
"grad_norm": 2.684091771591401,
"learning_rate": 1.751152073732719e-05,
"loss": 1.0434,
"step": 190
},
{
"epoch": 0.08998615597600369,
"grad_norm": 2.482023036660176,
"learning_rate": 1.7972350230414748e-05,
"loss": 1.0638,
"step": 195
},
{
"epoch": 0.09229349330872173,
"grad_norm": 2.4167958259613944,
"learning_rate": 1.8433179723502307e-05,
"loss": 1.0422,
"step": 200
},
{
"epoch": 0.09460083064143977,
"grad_norm": 2.5336845010658586,
"learning_rate": 1.8894009216589862e-05,
"loss": 1.0711,
"step": 205
},
{
"epoch": 0.09690816797415783,
"grad_norm": 2.51621301326488,
"learning_rate": 1.935483870967742e-05,
"loss": 1.0881,
"step": 210
},
{
"epoch": 0.09921550530687587,
"grad_norm": 2.4344384988965175,
"learning_rate": 1.981566820276498e-05,
"loss": 1.0735,
"step": 215
},
{
"epoch": 0.10152284263959391,
"grad_norm": 2.5052346642024728,
"learning_rate": 1.9999883200175286e-05,
"loss": 1.0593,
"step": 220
},
{
"epoch": 0.10383017997231195,
"grad_norm": 2.3275673568454986,
"learning_rate": 1.9999169433349454e-05,
"loss": 1.0766,
"step": 225
},
{
"epoch": 0.10613751730503,
"grad_norm": 2.444969349213072,
"learning_rate": 1.9997806834748455e-05,
"loss": 1.0805,
"step": 230
},
{
"epoch": 0.10844485463774804,
"grad_norm": 2.5019688689621455,
"learning_rate": 1.9995795492789368e-05,
"loss": 1.0795,
"step": 235
},
{
"epoch": 0.11075219197046608,
"grad_norm": 2.41754553967893,
"learning_rate": 1.9993135537985285e-05,
"loss": 1.0419,
"step": 240
},
{
"epoch": 0.11305952930318412,
"grad_norm": 2.4632725320939297,
"learning_rate": 1.9989827142936864e-05,
"loss": 1.1022,
"step": 245
},
{
"epoch": 0.11536686663590216,
"grad_norm": 2.1283515103999004,
"learning_rate": 1.9985870522321118e-05,
"loss": 1.0727,
"step": 250
},
{
"epoch": 0.11767420396862022,
"grad_norm": 2.5373992715116316,
"learning_rate": 1.9981265932877486e-05,
"loss": 1.0595,
"step": 255
},
{
"epoch": 0.11998154130133826,
"grad_norm": 2.382000156883209,
"learning_rate": 1.9976013673391185e-05,
"loss": 1.0585,
"step": 260
},
{
"epoch": 0.1222888786340563,
"grad_norm": 2.4022206882693857,
"learning_rate": 1.9970114084673796e-05,
"loss": 1.089,
"step": 265
},
{
"epoch": 0.12459621596677434,
"grad_norm": 2.4577010071803707,
"learning_rate": 1.996356754954119e-05,
"loss": 1.0971,
"step": 270
},
{
"epoch": 0.12690355329949238,
"grad_norm": 2.973750019978815,
"learning_rate": 1.995637449278864e-05,
"loss": 1.083,
"step": 275
},
{
"epoch": 0.12921089063221042,
"grad_norm": 2.771443155668778,
"learning_rate": 1.994853538116329e-05,
"loss": 1.0948,
"step": 280
},
{
"epoch": 0.13151822796492849,
"grad_norm": 2.2945686154224902,
"learning_rate": 1.9940050723333867e-05,
"loss": 1.0684,
"step": 285
},
{
"epoch": 0.13382556529764653,
"grad_norm": 2.2402037197255864,
"learning_rate": 1.9930921069857653e-05,
"loss": 1.0605,
"step": 290
},
{
"epoch": 0.13613290263036457,
"grad_norm": 3.0321973969800955,
"learning_rate": 1.9921147013144782e-05,
"loss": 1.0629,
"step": 295
},
{
"epoch": 0.1384402399630826,
"grad_norm": 2.4295781427476215,
"learning_rate": 1.991072918741978e-05,
"loss": 1.0353,
"step": 300
},
{
"epoch": 0.14074757729580065,
"grad_norm": 2.5121504192318698,
"learning_rate": 1.9899668268680438e-05,
"loss": 1.1156,
"step": 305
},
{
"epoch": 0.1430549146285187,
"grad_norm": 2.454863605026453,
"learning_rate": 1.988796497465392e-05,
"loss": 1.0921,
"step": 310
},
{
"epoch": 0.14536225196123673,
"grad_norm": 2.2532325296884532,
"learning_rate": 1.98756200647502e-05,
"loss": 1.0683,
"step": 315
},
{
"epoch": 0.14766958929395477,
"grad_norm": 2.3557373973476334,
"learning_rate": 1.9862634340012796e-05,
"loss": 1.0559,
"step": 320
},
{
"epoch": 0.1499769266266728,
"grad_norm": 2.493215513816939,
"learning_rate": 1.9849008643066774e-05,
"loss": 1.0725,
"step": 325
},
{
"epoch": 0.15228426395939088,
"grad_norm": 3.2539520651857594,
"learning_rate": 1.983474385806408e-05,
"loss": 1.0674,
"step": 330
},
{
"epoch": 0.15459160129210892,
"grad_norm": 2.2486130234020165,
"learning_rate": 1.9819840910626174e-05,
"loss": 1.0705,
"step": 335
},
{
"epoch": 0.15689893862482696,
"grad_norm": 2.237162636902425,
"learning_rate": 1.9804300767783958e-05,
"loss": 1.0772,
"step": 340
},
{
"epoch": 0.159206275957545,
"grad_norm": 6.51182752032121,
"learning_rate": 1.9788124437915034e-05,
"loss": 1.0837,
"step": 345
},
{
"epoch": 0.16151361329026304,
"grad_norm": 2.322519163352806,
"learning_rate": 1.9771312970678258e-05,
"loss": 1.0405,
"step": 350
},
{
"epoch": 0.16382095062298108,
"grad_norm": 2.2704370082078773,
"learning_rate": 1.9753867456945653e-05,
"loss": 1.0632,
"step": 355
},
{
"epoch": 0.16612828795569912,
"grad_norm": 2.1581678351113602,
"learning_rate": 1.9735789028731603e-05,
"loss": 1.0818,
"step": 360
},
{
"epoch": 0.16843562528841716,
"grad_norm": 2.0342475926569583,
"learning_rate": 1.971707885911941e-05,
"loss": 1.0679,
"step": 365
},
{
"epoch": 0.1707429626211352,
"grad_norm": 2.2080262528668957,
"learning_rate": 1.9697738162185163e-05,
"loss": 1.0813,
"step": 370
},
{
"epoch": 0.17305029995385326,
"grad_norm": 2.627310254608405,
"learning_rate": 1.9677768192918973e-05,
"loss": 1.0682,
"step": 375
},
{
"epoch": 0.1753576372865713,
"grad_norm": 31.427071093107436,
"learning_rate": 1.9657170247143526e-05,
"loss": 1.06,
"step": 380
},
{
"epoch": 0.17766497461928935,
"grad_norm": 2.4322378895965016,
"learning_rate": 1.9635945661430006e-05,
"loss": 1.0648,
"step": 385
},
{
"epoch": 0.17997231195200739,
"grad_norm": 2.446435285466414,
"learning_rate": 1.9614095813011366e-05,
"loss": 1.0795,
"step": 390
},
{
"epoch": 0.18227964928472543,
"grad_norm": 2.6478307008342745,
"learning_rate": 1.9591622119692953e-05,
"loss": 1.0655,
"step": 395
},
{
"epoch": 0.18458698661744347,
"grad_norm": 3.5462973637007433,
"learning_rate": 1.956852603976052e-05,
"loss": 1.074,
"step": 400
},
{
"epoch": 0.1868943239501615,
"grad_norm": 2.1952483345123834,
"learning_rate": 1.9544809071885603e-05,
"loss": 1.066,
"step": 405
},
{
"epoch": 0.18920166128287955,
"grad_norm": 2.2118543537817397,
"learning_rate": 1.9520472755028256e-05,
"loss": 1.0567,
"step": 410
},
{
"epoch": 0.1915089986155976,
"grad_norm": 2.1220225112366085,
"learning_rate": 1.9495518668337204e-05,
"loss": 1.0485,
"step": 415
},
{
"epoch": 0.19381633594831565,
"grad_norm": 2.0821093984759433,
"learning_rate": 1.946994843104737e-05,
"loss": 1.0374,
"step": 420
},
{
"epoch": 0.1961236732810337,
"grad_norm": 2.0190113966629433,
"learning_rate": 1.944376370237481e-05,
"loss": 1.064,
"step": 425
},
{
"epoch": 0.19843101061375173,
"grad_norm": 2.056684419876963,
"learning_rate": 1.9416966181409047e-05,
"loss": 1.0524,
"step": 430
},
{
"epoch": 0.20073834794646978,
"grad_norm": 2.366243541981115,
"learning_rate": 1.9389557607002808e-05,
"loss": 1.0587,
"step": 435
},
{
"epoch": 0.20304568527918782,
"grad_norm": 2.3366923572205356,
"learning_rate": 1.9361539757659212e-05,
"loss": 1.104,
"step": 440
},
{
"epoch": 0.20535302261190586,
"grad_norm": 2.135135120153321,
"learning_rate": 1.933291445141635e-05,
"loss": 1.0838,
"step": 445
},
{
"epoch": 0.2076603599446239,
"grad_norm": 2.1686749928093367,
"learning_rate": 1.930368354572932e-05,
"loss": 1.0889,
"step": 450
},
{
"epoch": 0.20996769727734194,
"grad_norm": 2.18295591084296,
"learning_rate": 1.9273848937349712e-05,
"loss": 1.0517,
"step": 455
},
{
"epoch": 0.21227503461006,
"grad_norm": 2.015301818923552,
"learning_rate": 1.92434125622025e-05,
"loss": 1.0766,
"step": 460
},
{
"epoch": 0.21458237194277804,
"grad_norm": 2.0741949328544984,
"learning_rate": 1.9212376395260447e-05,
"loss": 1.0896,
"step": 465
},
{
"epoch": 0.21688970927549608,
"grad_norm": 2.15451982178122,
"learning_rate": 1.9180742450415962e-05,
"loss": 1.0763,
"step": 470
},
{
"epoch": 0.21919704660821412,
"grad_norm": 2.0906260465476967,
"learning_rate": 1.9148512780350384e-05,
"loss": 1.086,
"step": 475
},
{
"epoch": 0.22150438394093216,
"grad_norm": 2.031559261836197,
"learning_rate": 1.9115689476400817e-05,
"loss": 1.059,
"step": 480
},
{
"epoch": 0.2238117212736502,
"grad_norm": 2.07650174311531,
"learning_rate": 1.9082274668424423e-05,
"loss": 1.0679,
"step": 485
},
{
"epoch": 0.22611905860636825,
"grad_norm": 2.126208239890011,
"learning_rate": 1.9048270524660197e-05,
"loss": 1.0809,
"step": 490
},
{
"epoch": 0.22842639593908629,
"grad_norm": 1.9929349716624978,
"learning_rate": 1.9013679251588304e-05,
"loss": 1.085,
"step": 495
},
{
"epoch": 0.23073373327180433,
"grad_norm": 3.001431077273745,
"learning_rate": 1.8978503093786882e-05,
"loss": 1.0558,
"step": 500
},
{
"epoch": 0.2330410706045224,
"grad_norm": 1.9284000626000521,
"learning_rate": 1.89427443337864e-05,
"loss": 1.0685,
"step": 505
},
{
"epoch": 0.23534840793724043,
"grad_norm": 2.156814659249471,
"learning_rate": 1.890640529192155e-05,
"loss": 1.0857,
"step": 510
},
{
"epoch": 0.23765574526995847,
"grad_norm": 2.2063349330204174,
"learning_rate": 1.8869488326180682e-05,
"loss": 1.092,
"step": 515
},
{
"epoch": 0.23996308260267651,
"grad_norm": 1.8963715836357997,
"learning_rate": 1.8831995832052802e-05,
"loss": 1.0694,
"step": 520
},
{
"epoch": 0.24227041993539455,
"grad_norm": 2.0285632136378613,
"learning_rate": 1.8793930242372117e-05,
"loss": 1.0795,
"step": 525
},
{
"epoch": 0.2445777572681126,
"grad_norm": 2.099474069037447,
"learning_rate": 1.8755294027160203e-05,
"loss": 1.0893,
"step": 530
},
{
"epoch": 0.24688509460083063,
"grad_norm": 2.0358502445768165,
"learning_rate": 1.8716089693465696e-05,
"loss": 1.086,
"step": 535
},
{
"epoch": 0.24919243193354867,
"grad_norm": 2.1218454361521633,
"learning_rate": 1.8676319785201617e-05,
"loss": 1.0842,
"step": 540
},
{
"epoch": 0.2514997692662667,
"grad_norm": 2.0341225955626583,
"learning_rate": 1.8635986882980325e-05,
"loss": 1.0625,
"step": 545
},
{
"epoch": 0.25380710659898476,
"grad_norm": 2.3910625184538747,
"learning_rate": 1.8595093603946053e-05,
"loss": 1.0727,
"step": 550
},
{
"epoch": 0.2561144439317028,
"grad_norm": 1.98644765469211,
"learning_rate": 1.855364260160507e-05,
"loss": 1.0595,
"step": 555
},
{
"epoch": 0.25842178126442084,
"grad_norm": 2.208738179396901,
"learning_rate": 1.851163656565351e-05,
"loss": 1.0936,
"step": 560
},
{
"epoch": 0.2607291185971389,
"grad_norm": 2.0209364206754645,
"learning_rate": 1.846907822180286e-05,
"loss": 1.0684,
"step": 565
},
{
"epoch": 0.26303645592985697,
"grad_norm": 1.925369099665116,
"learning_rate": 1.842597033160306e-05,
"loss": 1.0669,
"step": 570
},
{
"epoch": 0.265343793262575,
"grad_norm": 2.169504176441067,
"learning_rate": 1.8382315692263324e-05,
"loss": 1.0914,
"step": 575
},
{
"epoch": 0.26765113059529305,
"grad_norm": 2.0095996014073503,
"learning_rate": 1.8338117136470648e-05,
"loss": 1.0679,
"step": 580
},
{
"epoch": 0.2699584679280111,
"grad_norm": 2.0780448467433468,
"learning_rate": 1.829337753220597e-05,
"loss": 1.0823,
"step": 585
},
{
"epoch": 0.27226580526072913,
"grad_norm": 1.9092130149771946,
"learning_rate": 1.8248099782558103e-05,
"loss": 1.0485,
"step": 590
},
{
"epoch": 0.2745731425934472,
"grad_norm": 2.2904699258286914,
"learning_rate": 1.820228682553533e-05,
"loss": 1.0676,
"step": 595
},
{
"epoch": 0.2768804799261652,
"grad_norm": 2.1592942059891884,
"learning_rate": 1.8155941633874787e-05,
"loss": 1.0862,
"step": 600
},
{
"epoch": 0.27918781725888325,
"grad_norm": 1.9056960337173154,
"learning_rate": 1.810906721484954e-05,
"loss": 1.027,
"step": 605
},
{
"epoch": 0.2814951545916013,
"grad_norm": 2.029717811241469,
"learning_rate": 1.8061666610073465e-05,
"loss": 1.0638,
"step": 610
},
{
"epoch": 0.28380249192431933,
"grad_norm": 2.0411421295106873,
"learning_rate": 1.8013742895303883e-05,
"loss": 1.0667,
"step": 615
},
{
"epoch": 0.2861098292570374,
"grad_norm": 1.924799713813513,
"learning_rate": 1.7965299180241963e-05,
"loss": 1.0685,
"step": 620
},
{
"epoch": 0.2884171665897554,
"grad_norm": 2.171875799314523,
"learning_rate": 1.791633860833096e-05,
"loss": 1.0463,
"step": 625
},
{
"epoch": 0.29072450392247345,
"grad_norm": 2.068372996208825,
"learning_rate": 1.7866864356552215e-05,
"loss": 1.0715,
"step": 630
},
{
"epoch": 0.2930318412551915,
"grad_norm": 1.8601211490681129,
"learning_rate": 1.7816879635219028e-05,
"loss": 1.0576,
"step": 635
},
{
"epoch": 0.29533917858790953,
"grad_norm": 1.9725316785259686,
"learning_rate": 1.7766387687768338e-05,
"loss": 1.0648,
"step": 640
},
{
"epoch": 0.2976465159206276,
"grad_norm": 2.1844471268704515,
"learning_rate": 1.7715391790550255e-05,
"loss": 1.0637,
"step": 645
},
{
"epoch": 0.2999538532533456,
"grad_norm": 1.933021204525043,
"learning_rate": 1.766389525261547e-05,
"loss": 1.0803,
"step": 650
},
{
"epoch": 0.30226119058606365,
"grad_norm": 2.1351960039602695,
"learning_rate": 1.7611901415500536e-05,
"loss": 1.0979,
"step": 655
},
{
"epoch": 0.30456852791878175,
"grad_norm": 1.9611319269471612,
"learning_rate": 1.7559413653011027e-05,
"loss": 1.0652,
"step": 660
},
{
"epoch": 0.3068758652514998,
"grad_norm": 2.033679368734863,
"learning_rate": 1.7506435371002635e-05,
"loss": 1.0749,
"step": 665
},
{
"epoch": 0.30918320258421783,
"grad_norm": 2.0279720872015354,
"learning_rate": 1.745297000716016e-05,
"loss": 1.078,
"step": 670
},
{
"epoch": 0.31149053991693587,
"grad_norm": 1.989733876253561,
"learning_rate": 1.7399021030774443e-05,
"loss": 1.0639,
"step": 675
},
{
"epoch": 0.3137978772496539,
"grad_norm": 1.9037569950190747,
"learning_rate": 1.734459194251725e-05,
"loss": 1.0721,
"step": 680
},
{
"epoch": 0.31610521458237195,
"grad_norm": 2.183774346551292,
"learning_rate": 1.7289686274214116e-05,
"loss": 1.0755,
"step": 685
},
{
"epoch": 0.31841255191509,
"grad_norm": 1.992668508208317,
"learning_rate": 1.7234307588615177e-05,
"loss": 1.0761,
"step": 690
},
{
"epoch": 0.32071988924780803,
"grad_norm": 1.9985850630928745,
"learning_rate": 1.717845947916398e-05,
"loss": 1.0575,
"step": 695
},
{
"epoch": 0.3230272265805261,
"grad_norm": 2.1480400724448883,
"learning_rate": 1.712214556976431e-05,
"loss": 1.0404,
"step": 700
},
{
"epoch": 0.3253345639132441,
"grad_norm": 1.9503339704430334,
"learning_rate": 1.7065369514545054e-05,
"loss": 1.0579,
"step": 705
},
{
"epoch": 0.32764190124596215,
"grad_norm": 1.971699140050545,
"learning_rate": 1.7008134997623066e-05,
"loss": 1.0629,
"step": 710
},
{
"epoch": 0.3299492385786802,
"grad_norm": 1.97358622805482,
"learning_rate": 1.695044573286413e-05,
"loss": 1.039,
"step": 715
},
{
"epoch": 0.33225657591139823,
"grad_norm": 1.8903289514072814,
"learning_rate": 1.6892305463641967e-05,
"loss": 1.0996,
"step": 720
},
{
"epoch": 0.3345639132441163,
"grad_norm": 1.9171530806208752,
"learning_rate": 1.6833717962595327e-05,
"loss": 1.0587,
"step": 725
},
{
"epoch": 0.3368712505768343,
"grad_norm": 2.0292768253738855,
"learning_rate": 1.677468703138319e-05,
"loss": 1.0534,
"step": 730
},
{
"epoch": 0.33917858790955235,
"grad_norm": 1.9857495035997068,
"learning_rate": 1.6715216500438093e-05,
"loss": 1.0805,
"step": 735
},
{
"epoch": 0.3414859252422704,
"grad_norm": 5.464414796561983,
"learning_rate": 1.6655310228717565e-05,
"loss": 1.0802,
"step": 740
},
{
"epoch": 0.3437932625749885,
"grad_norm": 1.909425909480839,
"learning_rate": 1.6594972103453727e-05,
"loss": 1.0813,
"step": 745
},
{
"epoch": 0.34610059990770653,
"grad_norm": 1.9164783421961078,
"learning_rate": 1.6534206039901057e-05,
"loss": 1.0466,
"step": 750
},
{
"epoch": 0.34840793724042457,
"grad_norm": 4.300395520109931,
"learning_rate": 1.647301598108234e-05,
"loss": 1.0326,
"step": 755
},
{
"epoch": 0.3507152745731426,
"grad_norm": 2.2226539635666827,
"learning_rate": 1.64114058975328e-05,
"loss": 1.0824,
"step": 760
},
{
"epoch": 0.35302261190586065,
"grad_norm": 2.08738140836867,
"learning_rate": 1.6349379787042478e-05,
"loss": 1.0445,
"step": 765
},
{
"epoch": 0.3553299492385787,
"grad_norm": 1.7956395961308758,
"learning_rate": 1.6286941674396788e-05,
"loss": 1.0283,
"step": 770
},
{
"epoch": 0.35763728657129673,
"grad_norm": 1.9001406773036147,
"learning_rate": 1.6224095611115385e-05,
"loss": 1.0558,
"step": 775
},
{
"epoch": 0.35994462390401477,
"grad_norm": 1.8584506601925908,
"learning_rate": 1.6160845675189254e-05,
"loss": 1.0315,
"step": 780
},
{
"epoch": 0.3622519612367328,
"grad_norm": 1.8994200106765273,
"learning_rate": 1.6097195970816094e-05,
"loss": 1.0736,
"step": 785
},
{
"epoch": 0.36455929856945085,
"grad_norm": 2.396252821544053,
"learning_rate": 1.603315062813401e-05,
"loss": 1.0605,
"step": 790
},
{
"epoch": 0.3668666359021689,
"grad_norm": 1.798952489279231,
"learning_rate": 1.596871380295351e-05,
"loss": 1.0439,
"step": 795
},
{
"epoch": 0.36917397323488693,
"grad_norm": 1.8907451459454219,
"learning_rate": 1.5903889676487832e-05,
"loss": 1.047,
"step": 800
},
{
"epoch": 0.37148131056760497,
"grad_norm": 1.9272449578154556,
"learning_rate": 1.5838682455081657e-05,
"loss": 1.0557,
"step": 805
},
{
"epoch": 0.373788647900323,
"grad_norm": 1.8500068465129675,
"learning_rate": 1.5773096369938125e-05,
"loss": 1.0448,
"step": 810
},
{
"epoch": 0.37609598523304105,
"grad_norm": 2.024989082401722,
"learning_rate": 1.570713567684432e-05,
"loss": 1.0444,
"step": 815
},
{
"epoch": 0.3784033225657591,
"grad_norm": 1.864545609223796,
"learning_rate": 1.5640804655895086e-05,
"loss": 1.0316,
"step": 820
},
{
"epoch": 0.38071065989847713,
"grad_norm": 1.9810163039010853,
"learning_rate": 1.557410761121532e-05,
"loss": 1.0476,
"step": 825
},
{
"epoch": 0.3830179972311952,
"grad_norm": 1.9715732068507474,
"learning_rate": 1.5507048870680668e-05,
"loss": 1.0092,
"step": 830
},
{
"epoch": 0.38532533456391327,
"grad_norm": 1.9346233566232378,
"learning_rate": 1.5439632785636707e-05,
"loss": 1.0834,
"step": 835
},
{
"epoch": 0.3876326718966313,
"grad_norm": 2.2247085368619164,
"learning_rate": 1.5371863730616586e-05,
"loss": 1.0608,
"step": 840
},
{
"epoch": 0.38994000922934935,
"grad_norm": 1.855445999462738,
"learning_rate": 1.5303746103057163e-05,
"loss": 1.0311,
"step": 845
},
{
"epoch": 0.3922473465620674,
"grad_norm": 1.8433500481185805,
"learning_rate": 1.5235284323013674e-05,
"loss": 1.0513,
"step": 850
},
{
"epoch": 0.39455468389478543,
"grad_norm": 1.9238020550812749,
"learning_rate": 1.5166482832872923e-05,
"loss": 1.0611,
"step": 855
},
{
"epoch": 0.39686202122750347,
"grad_norm": 2.034539491931288,
"learning_rate": 1.5097346097065008e-05,
"loss": 1.0369,
"step": 860
},
{
"epoch": 0.3991693585602215,
"grad_norm": 1.8719773240320596,
"learning_rate": 1.5027878601773633e-05,
"loss": 1.031,
"step": 865
},
{
"epoch": 0.40147669589293955,
"grad_norm": 1.8218549093076317,
"learning_rate": 1.4958084854645018e-05,
"loss": 1.027,
"step": 870
},
{
"epoch": 0.4037840332256576,
"grad_norm": 1.908377236915306,
"learning_rate": 1.4887969384495403e-05,
"loss": 1.0505,
"step": 875
},
{
"epoch": 0.40609137055837563,
"grad_norm": 1.882874108335332,
"learning_rate": 1.4817536741017153e-05,
"loss": 1.0421,
"step": 880
},
{
"epoch": 0.40839870789109367,
"grad_norm": 2.0217065440618622,
"learning_rate": 1.4746791494483584e-05,
"loss": 1.0533,
"step": 885
},
{
"epoch": 0.4107060452238117,
"grad_norm": 1.8717323475177303,
"learning_rate": 1.4675738235452352e-05,
"loss": 1.0279,
"step": 890
},
{
"epoch": 0.41301338255652975,
"grad_norm": 1.9788825364185045,
"learning_rate": 1.4604381574467616e-05,
"loss": 1.042,
"step": 895
},
{
"epoch": 0.4153207198892478,
"grad_norm": 1.9327030504589935,
"learning_rate": 1.4532726141760849e-05,
"loss": 1.06,
"step": 900
},
{
"epoch": 0.41762805722196583,
"grad_norm": 1.8050202525178007,
"learning_rate": 1.4460776586950393e-05,
"loss": 1.0176,
"step": 905
},
{
"epoch": 0.41993539455468387,
"grad_norm": 1.7140772518888605,
"learning_rate": 1.438853757873975e-05,
"loss": 1.0336,
"step": 910
},
{
"epoch": 0.4222427318874019,
"grad_norm": 1.9381284110778458,
"learning_rate": 1.4316013804614644e-05,
"loss": 1.0283,
"step": 915
},
{
"epoch": 0.42455006922012,
"grad_norm": 1.8569863683755345,
"learning_rate": 1.4243209970538846e-05,
"loss": 1.0295,
"step": 920
},
{
"epoch": 0.42685740655283805,
"grad_norm": 1.7584125894267681,
"learning_rate": 1.4170130800648814e-05,
"loss": 1.0451,
"step": 925
},
{
"epoch": 0.4291647438855561,
"grad_norm": 1.8468563006595364,
"learning_rate": 1.4096781036947159e-05,
"loss": 1.0329,
"step": 930
},
{
"epoch": 0.43147208121827413,
"grad_norm": 1.8005410726866136,
"learning_rate": 1.4023165438994933e-05,
"loss": 1.0523,
"step": 935
},
{
"epoch": 0.43377941855099217,
"grad_norm": 1.7881203203680747,
"learning_rate": 1.394928878360279e-05,
"loss": 1.052,
"step": 940
},
{
"epoch": 0.4360867558837102,
"grad_norm": 1.9402582404112974,
"learning_rate": 1.3875155864521031e-05,
"loss": 1.0418,
"step": 945
},
{
"epoch": 0.43839409321642825,
"grad_norm": 1.926002050119894,
"learning_rate": 1.3800771492128537e-05,
"loss": 1.0491,
"step": 950
},
{
"epoch": 0.4407014305491463,
"grad_norm": 1.8807563751664647,
"learning_rate": 1.3726140493120639e-05,
"loss": 1.032,
"step": 955
},
{
"epoch": 0.44300876788186433,
"grad_norm": 1.9189485256851713,
"learning_rate": 1.3651267710195909e-05,
"loss": 1.0355,
"step": 960
},
{
"epoch": 0.44531610521458237,
"grad_norm": 1.9803461150155048,
"learning_rate": 1.3576158001741932e-05,
"loss": 1.0569,
"step": 965
},
{
"epoch": 0.4476234425473004,
"grad_norm": 2.018305042251882,
"learning_rate": 1.3500816241520059e-05,
"loss": 1.04,
"step": 970
},
{
"epoch": 0.44993077988001845,
"grad_norm": 1.7580864552202506,
"learning_rate": 1.3425247318349137e-05,
"loss": 1.0075,
"step": 975
},
{
"epoch": 0.4522381172127365,
"grad_norm": 1.8967387580024155,
"learning_rate": 1.3349456135788298e-05,
"loss": 1.0429,
"step": 980
},
{
"epoch": 0.45454545454545453,
"grad_norm": 2.2301409193225985,
"learning_rate": 1.3273447611818768e-05,
"loss": 1.0244,
"step": 985
},
{
"epoch": 0.45685279187817257,
"grad_norm": 1.8218900361937265,
"learning_rate": 1.3197226678524739e-05,
"loss": 1.0006,
"step": 990
},
{
"epoch": 0.4591601292108906,
"grad_norm": 1.8705871989801575,
"learning_rate": 1.3120798281773346e-05,
"loss": 1.0382,
"step": 995
},
{
"epoch": 0.46146746654360865,
"grad_norm": 1.8993921065361903,
"learning_rate": 1.3044167380893726e-05,
"loss": 1.0543,
"step": 1000
},
{
"epoch": 0.46377480387632675,
"grad_norm": 1.762141098208751,
"learning_rate": 1.2967338948355217e-05,
"loss": 1.031,
"step": 1005
},
{
"epoch": 0.4660821412090448,
"grad_norm": 1.8349620217027005,
"learning_rate": 1.2890317969444716e-05,
"loss": 1.0104,
"step": 1010
},
{
"epoch": 0.4683894785417628,
"grad_norm": 1.9527169882770812,
"learning_rate": 1.2813109441943166e-05,
"loss": 1.0325,
"step": 1015
},
{
"epoch": 0.47069681587448087,
"grad_norm": 1.7594153130967782,
"learning_rate": 1.273571837580127e-05,
"loss": 1.0476,
"step": 1020
},
{
"epoch": 0.4730041532071989,
"grad_norm": 1.7823802580407797,
"learning_rate": 1.2658149792814405e-05,
"loss": 1.0397,
"step": 1025
},
{
"epoch": 0.47531149053991695,
"grad_norm": 1.7288773807653248,
"learning_rate": 1.258040872629676e-05,
"loss": 1.0419,
"step": 1030
},
{
"epoch": 0.477618827872635,
"grad_norm": 1.8969879276116197,
"learning_rate": 1.2502500220754736e-05,
"loss": 1.0538,
"step": 1035
},
{
"epoch": 0.47992616520535303,
"grad_norm": 1.9748280209096565,
"learning_rate": 1.242442933155961e-05,
"loss": 1.0088,
"step": 1040
},
{
"epoch": 0.48223350253807107,
"grad_norm": 1.8630834023430007,
"learning_rate": 1.2346201124619502e-05,
"loss": 1.0041,
"step": 1045
},
{
"epoch": 0.4845408398707891,
"grad_norm": 1.93282001404706,
"learning_rate": 1.2267820676050657e-05,
"loss": 1.0117,
"step": 1050
},
{
"epoch": 0.48684817720350715,
"grad_norm": 1.7732915883551568,
"learning_rate": 1.2189293071848051e-05,
"loss": 1.0395,
"step": 1055
},
{
"epoch": 0.4891555145362252,
"grad_norm": 1.7668917225682153,
"learning_rate": 1.2110623407555398e-05,
"loss": 1.0055,
"step": 1060
},
{
"epoch": 0.49146285186894323,
"grad_norm": 1.770548333794891,
"learning_rate": 1.2031816787934465e-05,
"loss": 1.0198,
"step": 1065
},
{
"epoch": 0.49377018920166127,
"grad_norm": 1.804504292487286,
"learning_rate": 1.1952878326633872e-05,
"loss": 0.9925,
"step": 1070
},
{
"epoch": 0.4960775265343793,
"grad_norm": 1.8384240011850799,
"learning_rate": 1.187381314585725e-05,
"loss": 1.0167,
"step": 1075
},
{
"epoch": 0.49838486386709735,
"grad_norm": 1.8738489698458378,
"learning_rate": 1.1794626376030866e-05,
"loss": 1.0266,
"step": 1080
},
{
"epoch": 0.5006922011998154,
"grad_norm": 1.7963716989600227,
"learning_rate": 1.1715323155470745e-05,
"loss": 1.0203,
"step": 1085
},
{
"epoch": 0.5029995385325334,
"grad_norm": 1.7950924324700734,
"learning_rate": 1.163590863004922e-05,
"loss": 1.0014,
"step": 1090
},
{
"epoch": 0.5053068758652515,
"grad_norm": 1.7996992785566162,
"learning_rate": 1.1556387952861036e-05,
"loss": 1.0147,
"step": 1095
},
{
"epoch": 0.5076142131979695,
"grad_norm": 1.9262189643769105,
"learning_rate": 1.1476766283888986e-05,
"loss": 1.0176,
"step": 1100
},
{
"epoch": 0.5099215505306876,
"grad_norm": 1.738673259571015,
"learning_rate": 1.1397048789669061e-05,
"loss": 1.0221,
"step": 1105
},
{
"epoch": 0.5122288878634056,
"grad_norm": 1.7993896869653003,
"learning_rate": 1.1317240642955226e-05,
"loss": 1.0232,
"step": 1110
},
{
"epoch": 0.5145362251961236,
"grad_norm": 1.8492729168966688,
"learning_rate": 1.1237347022383747e-05,
"loss": 1.0138,
"step": 1115
},
{
"epoch": 0.5168435625288417,
"grad_norm": 1.792127722956897,
"learning_rate": 1.1157373112137171e-05,
"loss": 1.011,
"step": 1120
},
{
"epoch": 0.5191508998615597,
"grad_norm": 1.767761412954839,
"learning_rate": 1.107732410160793e-05,
"loss": 0.9917,
"step": 1125
},
{
"epoch": 0.5214582371942778,
"grad_norm": 1.78861016845621,
"learning_rate": 1.0997205185061599e-05,
"loss": 1.024,
"step": 1130
},
{
"epoch": 0.5237655745269958,
"grad_norm": 1.796111964851059,
"learning_rate": 1.0917021561299864e-05,
"loss": 1.0094,
"step": 1135
},
{
"epoch": 0.5260729118597139,
"grad_norm": 1.6839699201837544,
"learning_rate": 1.083677843332316e-05,
"loss": 1.0019,
"step": 1140
},
{
"epoch": 0.528380249192432,
"grad_norm": 1.735381208836221,
"learning_rate": 1.0756481007993063e-05,
"loss": 0.9979,
"step": 1145
},
{
"epoch": 0.53068758652515,
"grad_norm": 1.7283238773850635,
"learning_rate": 1.0676134495694439e-05,
"loss": 1.0127,
"step": 1150
},
{
"epoch": 0.5329949238578681,
"grad_norm": 1.8700927823490678,
"learning_rate": 1.0595744109997326e-05,
"loss": 0.9897,
"step": 1155
},
{
"epoch": 0.5353022611905861,
"grad_norm": 1.7850898978429104,
"learning_rate": 1.0515315067318652e-05,
"loss": 1.0155,
"step": 1160
},
{
"epoch": 0.5376095985233041,
"grad_norm": 1.8867979217718087,
"learning_rate": 1.0434852586583737e-05,
"loss": 0.9996,
"step": 1165
},
{
"epoch": 0.5399169358560222,
"grad_norm": 1.8409069763382047,
"learning_rate": 1.0354361888887642e-05,
"loss": 1.0038,
"step": 1170
},
{
"epoch": 0.5422242731887402,
"grad_norm": 1.9318197730416369,
"learning_rate": 1.0273848197156401e-05,
"loss": 0.9893,
"step": 1175
},
{
"epoch": 0.5445316105214583,
"grad_norm": 1.7534602196952722,
"learning_rate": 1.0193316735808085e-05,
"loss": 0.993,
"step": 1180
},
{
"epoch": 0.5468389478541763,
"grad_norm": 1.7436059512387687,
"learning_rate": 1.0112772730413816e-05,
"loss": 1.0079,
"step": 1185
},
{
"epoch": 0.5491462851868943,
"grad_norm": 1.8076118426423142,
"learning_rate": 1.0032221407358683e-05,
"loss": 1.0336,
"step": 1190
},
{
"epoch": 0.5514536225196124,
"grad_norm": 1.9044420451434694,
"learning_rate": 9.951667993502599e-06,
"loss": 1.0152,
"step": 1195
},
{
"epoch": 0.5537609598523304,
"grad_norm": 1.842329102136153,
"learning_rate": 9.871117715841151e-06,
"loss": 0.9783,
"step": 1200
},
{
"epoch": 0.5560682971850485,
"grad_norm": 1.8583622986365993,
"learning_rate": 9.790575801166432e-06,
"loss": 1.0054,
"step": 1205
},
{
"epoch": 0.5583756345177665,
"grad_norm": 1.7004919095912332,
"learning_rate": 9.710047475727854e-06,
"loss": 1.0011,
"step": 1210
},
{
"epoch": 0.5606829718504845,
"grad_norm": 1.7280460978374188,
"learning_rate": 9.629537964893063e-06,
"loss": 1.0299,
"step": 1215
},
{
"epoch": 0.5629903091832026,
"grad_norm": 1.7174011953937558,
"learning_rate": 9.549052492808834e-06,
"loss": 0.9946,
"step": 1220
},
{
"epoch": 0.5652976465159206,
"grad_norm": 1.8215498597720168,
"learning_rate": 9.468596282062114e-06,
"loss": 1.0113,
"step": 1225
},
{
"epoch": 0.5676049838486387,
"grad_norm": 1.6911500192296895,
"learning_rate": 9.38817455334112e-06,
"loss": 0.9855,
"step": 1230
},
{
"epoch": 0.5699123211813567,
"grad_norm": 1.8405046704539174,
"learning_rate": 9.307792525096582e-06,
"loss": 1.0113,
"step": 1235
},
{
"epoch": 0.5722196585140747,
"grad_norm": 1.838204327540361,
"learning_rate": 9.227455413203115e-06,
"loss": 0.9947,
"step": 1240
},
{
"epoch": 0.5745269958467928,
"grad_norm": 1.656688699609939,
"learning_rate": 9.147168430620788e-06,
"loss": 0.9892,
"step": 1245
},
{
"epoch": 0.5768343331795108,
"grad_norm": 1.7231036061816765,
"learning_rate": 9.066936787056843e-06,
"loss": 0.9944,
"step": 1250
},
{
"epoch": 0.5791416705122289,
"grad_norm": 1.645605940940624,
"learning_rate": 8.986765688627652e-06,
"loss": 0.9936,
"step": 1255
},
{
"epoch": 0.5814490078449469,
"grad_norm": 1.8141527360329759,
"learning_rate": 8.906660337520903e-06,
"loss": 1.0096,
"step": 1260
},
{
"epoch": 0.583756345177665,
"grad_norm": 1.856808726362016,
"learning_rate": 8.82662593165804e-06,
"loss": 1.0032,
"step": 1265
},
{
"epoch": 0.586063682510383,
"grad_norm": 1.8593818092553211,
"learning_rate": 8.746667664356957e-06,
"loss": 1.0177,
"step": 1270
},
{
"epoch": 0.588371019843101,
"grad_norm": 1.7318701186944272,
"learning_rate": 8.666790723995043e-06,
"loss": 0.9933,
"step": 1275
},
{
"epoch": 0.5906783571758191,
"grad_norm": 1.8632249625406112,
"learning_rate": 8.587000293672482e-06,
"loss": 1.0278,
"step": 1280
},
{
"epoch": 0.5929856945085371,
"grad_norm": 1.8482080793994375,
"learning_rate": 8.50730155087596e-06,
"loss": 0.9753,
"step": 1285
},
{
"epoch": 0.5952930318412551,
"grad_norm": 1.6654816438940703,
"learning_rate": 8.427699667142681e-06,
"loss": 0.9923,
"step": 1290
},
{
"epoch": 0.5976003691739732,
"grad_norm": 1.816789112483473,
"learning_rate": 8.348199807724806e-06,
"loss": 0.9951,
"step": 1295
},
{
"epoch": 0.5999077065066912,
"grad_norm": 1.8671938825009406,
"learning_rate": 8.268807131254288e-06,
"loss": 1.0063,
"step": 1300
},
{
"epoch": 0.6022150438394093,
"grad_norm": 1.736173419625791,
"learning_rate": 8.189526789408123e-06,
"loss": 0.9942,
"step": 1305
},
{
"epoch": 0.6045223811721273,
"grad_norm": 1.7397594354717327,
"learning_rate": 8.110363926574088e-06,
"loss": 0.9899,
"step": 1310
},
{
"epoch": 0.6068297185048455,
"grad_norm": 1.7112354026341845,
"learning_rate": 8.0313236795169e-06,
"loss": 0.9981,
"step": 1315
},
{
"epoch": 0.6091370558375635,
"grad_norm": 1.7633777819452738,
"learning_rate": 7.952411177044923e-06,
"loss": 0.9667,
"step": 1320
},
{
"epoch": 0.6114443931702815,
"grad_norm": 1.7477692209080626,
"learning_rate": 7.873631539677364e-06,
"loss": 0.9979,
"step": 1325
},
{
"epoch": 0.6137517305029996,
"grad_norm": 1.7532055508610305,
"learning_rate": 7.794989879311991e-06,
"loss": 0.9869,
"step": 1330
},
{
"epoch": 0.6160590678357176,
"grad_norm": 1.8525858143415055,
"learning_rate": 7.716491298893443e-06,
"loss": 0.9834,
"step": 1335
},
{
"epoch": 0.6183664051684357,
"grad_norm": 1.749585519245075,
"learning_rate": 7.638140892082118e-06,
"loss": 1.0092,
"step": 1340
},
{
"epoch": 0.6206737425011537,
"grad_norm": 1.8420135288729067,
"learning_rate": 7.559943742923626e-06,
"loss": 0.9797,
"step": 1345
},
{
"epoch": 0.6229810798338717,
"grad_norm": 1.7361527256574634,
"learning_rate": 7.4819049255189215e-06,
"loss": 1.0084,
"step": 1350
},
{
"epoch": 0.6252884171665898,
"grad_norm": 1.8341519418326866,
"learning_rate": 7.404029503695028e-06,
"loss": 0.978,
"step": 1355
},
{
"epoch": 0.6275957544993078,
"grad_norm": 1.8293945335237427,
"learning_rate": 7.326322530676471e-06,
"loss": 0.9949,
"step": 1360
},
{
"epoch": 0.6299030918320259,
"grad_norm": 1.8042095660293147,
"learning_rate": 7.248789048757368e-06,
"loss": 0.9708,
"step": 1365
},
{
"epoch": 0.6322104291647439,
"grad_norm": 1.845467719423503,
"learning_rate": 7.171434088974252e-06,
"loss": 0.9965,
"step": 1370
},
{
"epoch": 0.6345177664974619,
"grad_norm": 1.6543843384272663,
"learning_rate": 7.094262670779611e-06,
"loss": 0.9745,
"step": 1375
},
{
"epoch": 0.63682510383018,
"grad_norm": 1.78812671106571,
"learning_rate": 7.017279801716177e-06,
"loss": 0.9913,
"step": 1380
},
{
"epoch": 0.639132441162898,
"grad_norm": 1.6947334759904245,
"learning_rate": 6.940490477092004e-06,
"loss": 0.9852,
"step": 1385
},
{
"epoch": 0.6414397784956161,
"grad_norm": 1.8535301270043634,
"learning_rate": 6.8638996796563275e-06,
"loss": 1.007,
"step": 1390
},
{
"epoch": 0.6437471158283341,
"grad_norm": 1.7676355127694694,
"learning_rate": 6.78751237927623e-06,
"loss": 0.9514,
"step": 1395
},
{
"epoch": 0.6460544531610521,
"grad_norm": 1.6769380120076558,
"learning_rate": 6.711333532614168e-06,
"loss": 0.9698,
"step": 1400
},
{
"epoch": 0.6483617904937702,
"grad_norm": 1.7272039849376555,
"learning_rate": 6.6353680828063306e-06,
"loss": 0.948,
"step": 1405
},
{
"epoch": 0.6506691278264882,
"grad_norm": 1.7909691104530978,
"learning_rate": 6.559620959141897e-06,
"loss": 0.9741,
"step": 1410
},
{
"epoch": 0.6529764651592063,
"grad_norm": 1.7584119603336634,
"learning_rate": 6.48409707674317e-06,
"loss": 0.9825,
"step": 1415
},
{
"epoch": 0.6552838024919243,
"grad_norm": 1.704146715339984,
"learning_rate": 6.408801336246645e-06,
"loss": 0.9473,
"step": 1420
},
{
"epoch": 0.6575911398246423,
"grad_norm": 1.675928516675119,
"learning_rate": 6.3337386234850255e-06,
"loss": 0.9726,
"step": 1425
},
{
"epoch": 0.6598984771573604,
"grad_norm": 1.8640939079623915,
"learning_rate": 6.258913809170169e-06,
"loss": 0.9899,
"step": 1430
},
{
"epoch": 0.6622058144900784,
"grad_norm": 1.7077639495220778,
"learning_rate": 6.18433174857705e-06,
"loss": 0.9856,
"step": 1435
},
{
"epoch": 0.6645131518227965,
"grad_norm": 1.7756594990657744,
"learning_rate": 6.1099972812287e-06,
"loss": 0.9766,
"step": 1440
},
{
"epoch": 0.6668204891555145,
"grad_norm": 1.9246917842171538,
"learning_rate": 6.035915230582176e-06,
"loss": 0.9802,
"step": 1445
},
{
"epoch": 0.6691278264882325,
"grad_norm": 1.7061051671690723,
"learning_rate": 5.962090403715592e-06,
"loss": 0.9589,
"step": 1450
},
{
"epoch": 0.6714351638209506,
"grad_norm": 1.9209301423646885,
"learning_rate": 5.8885275910161574e-06,
"loss": 0.9661,
"step": 1455
},
{
"epoch": 0.6737425011536686,
"grad_norm": 1.7522165825796936,
"learning_rate": 5.815231565869377e-06,
"loss": 0.9683,
"step": 1460
},
{
"epoch": 0.6760498384863867,
"grad_norm": 1.7093168808099815,
"learning_rate": 5.742207084349274e-06,
"loss": 0.9787,
"step": 1465
},
{
"epoch": 0.6783571758191047,
"grad_norm": 1.7331687059615726,
"learning_rate": 5.669458884909815e-06,
"loss": 0.962,
"step": 1470
},
{
"epoch": 0.6806645131518227,
"grad_norm": 1.7125984142423774,
"learning_rate": 5.596991688077409e-06,
"loss": 0.9749,
"step": 1475
},
{
"epoch": 0.6829718504845408,
"grad_norm": 1.7116106483633,
"learning_rate": 5.5248101961446065e-06,
"loss": 0.9646,
"step": 1480
},
{
"epoch": 0.6852791878172588,
"grad_norm": 1.7871177917200074,
"learning_rate": 5.452919092864976e-06,
"loss": 0.9869,
"step": 1485
},
{
"epoch": 0.687586525149977,
"grad_norm": 1.7422263712914812,
"learning_rate": 5.381323043149192e-06,
"loss": 0.9598,
"step": 1490
},
{
"epoch": 0.689893862482695,
"grad_norm": 1.827833481936086,
"learning_rate": 5.310026692762316e-06,
"loss": 0.9674,
"step": 1495
},
{
"epoch": 0.6922011998154131,
"grad_norm": 1.8013595644003924,
"learning_rate": 5.239034668022353e-06,
"loss": 0.9573,
"step": 1500
},
{
"epoch": 0.6945085371481311,
"grad_norm": 1.8007495209856474,
"learning_rate": 5.168351575500049e-06,
"loss": 0.9719,
"step": 1505
},
{
"epoch": 0.6968158744808491,
"grad_norm": 1.7404494760342795,
"learning_rate": 5.097982001719994e-06,
"loss": 0.9724,
"step": 1510
},
{
"epoch": 0.6991232118135672,
"grad_norm": 1.8207400554800481,
"learning_rate": 5.027930512862976e-06,
"loss": 0.9643,
"step": 1515
},
{
"epoch": 0.7014305491462852,
"grad_norm": 1.6503774911907483,
"learning_rate": 4.958201654469731e-06,
"loss": 0.9718,
"step": 1520
},
{
"epoch": 0.7037378864790033,
"grad_norm": 1.7042499652030019,
"learning_rate": 4.888799951145948e-06,
"loss": 0.9786,
"step": 1525
},
{
"epoch": 0.7060452238117213,
"grad_norm": 1.691429781718496,
"learning_rate": 4.8197299062687e-06,
"loss": 0.9584,
"step": 1530
},
{
"epoch": 0.7083525611444393,
"grad_norm": 1.789471718965235,
"learning_rate": 4.750996001694215e-06,
"loss": 0.978,
"step": 1535
},
{
"epoch": 0.7106598984771574,
"grad_norm": 1.762516310353455,
"learning_rate": 4.6826026974670665e-06,
"loss": 0.9536,
"step": 1540
},
{
"epoch": 0.7129672358098754,
"grad_norm": 1.6808685507807348,
"learning_rate": 4.614554431530754e-06,
"loss": 0.9453,
"step": 1545
},
{
"epoch": 0.7152745731425935,
"grad_norm": 1.695661878440997,
"learning_rate": 4.546855619439734e-06,
"loss": 0.9674,
"step": 1550
},
{
"epoch": 0.7175819104753115,
"grad_norm": 1.780885617378124,
"learning_rate": 4.479510654072909e-06,
"loss": 0.9724,
"step": 1555
},
{
"epoch": 0.7198892478080295,
"grad_norm": 1.7441807694903777,
"learning_rate": 4.412523905348568e-06,
"loss": 0.9422,
"step": 1560
},
{
"epoch": 0.7221965851407476,
"grad_norm": 1.7666139001524914,
"learning_rate": 4.345899719940844e-06,
"loss": 0.9496,
"step": 1565
},
{
"epoch": 0.7245039224734656,
"grad_norm": 1.6463010573052135,
"learning_rate": 4.279642420997655e-06,
"loss": 0.9635,
"step": 1570
},
{
"epoch": 0.7268112598061837,
"grad_norm": 1.7790169033851828,
"learning_rate": 4.213756307860175e-06,
"loss": 0.9795,
"step": 1575
},
{
"epoch": 0.7291185971389017,
"grad_norm": 1.709252193673288,
"learning_rate": 4.148245655783869e-06,
"loss": 0.9542,
"step": 1580
},
{
"epoch": 0.7314259344716197,
"grad_norm": 1.7064646780964507,
"learning_rate": 4.083114715661069e-06,
"loss": 0.9494,
"step": 1585
},
{
"epoch": 0.7337332718043378,
"grad_norm": 1.7203471522785316,
"learning_rate": 4.018367713745137e-06,
"loss": 0.9513,
"step": 1590
},
{
"epoch": 0.7360406091370558,
"grad_norm": 1.7329896835019194,
"learning_rate": 3.954008851376252e-06,
"loss": 0.9464,
"step": 1595
},
{
"epoch": 0.7383479464697739,
"grad_norm": 1.6668720129339225,
"learning_rate": 3.890042304708758e-06,
"loss": 0.9349,
"step": 1600
},
{
"epoch": 0.7406552838024919,
"grad_norm": 1.6612958616670062,
"learning_rate": 3.826472224440202e-06,
"loss": 0.9753,
"step": 1605
},
{
"epoch": 0.7429626211352099,
"grad_norm": 1.689937062434287,
"learning_rate": 3.763302735541987e-06,
"loss": 0.9755,
"step": 1610
},
{
"epoch": 0.745269958467928,
"grad_norm": 1.8524303075498816,
"learning_rate": 3.700537936991733e-06,
"loss": 0.9919,
"step": 1615
},
{
"epoch": 0.747577295800646,
"grad_norm": 1.7330330880413027,
"learning_rate": 3.6381819015072652e-06,
"loss": 0.9968,
"step": 1620
},
{
"epoch": 0.7498846331333641,
"grad_norm": 1.732375990079818,
"learning_rate": 3.5762386752823643e-06,
"loss": 0.9598,
"step": 1625
},
{
"epoch": 0.7521919704660821,
"grad_norm": 1.6723382538398348,
"learning_rate": 3.5147122777242203e-06,
"loss": 0.9826,
"step": 1630
},
{
"epoch": 0.7544993077988001,
"grad_norm": 1.698445076932435,
"learning_rate": 3.4536067011925945e-06,
"loss": 0.975,
"step": 1635
},
{
"epoch": 0.7568066451315182,
"grad_norm": 1.6756544799204833,
"learning_rate": 3.3929259107407785e-06,
"loss": 0.9596,
"step": 1640
},
{
"epoch": 0.7591139824642362,
"grad_norm": 1.7323687941815844,
"learning_rate": 3.3326738438583116e-06,
"loss": 0.9471,
"step": 1645
},
{
"epoch": 0.7614213197969543,
"grad_norm": 1.6841658818773522,
"learning_rate": 3.272854410215467e-06,
"loss": 0.9478,
"step": 1650
},
{
"epoch": 0.7637286571296723,
"grad_norm": 1.7258401397718819,
"learning_rate": 3.213471491409568e-06,
"loss": 0.9545,
"step": 1655
},
{
"epoch": 0.7660359944623903,
"grad_norm": 1.754250495342998,
"learning_rate": 3.1545289407131128e-06,
"loss": 0.9557,
"step": 1660
},
{
"epoch": 0.7683433317951085,
"grad_norm": 1.7109892895946872,
"learning_rate": 3.0960305828237568e-06,
"loss": 0.9649,
"step": 1665
},
{
"epoch": 0.7706506691278265,
"grad_norm": 1.8321981237158624,
"learning_rate": 3.0379802136161073e-06,
"loss": 0.9612,
"step": 1670
},
{
"epoch": 0.7729580064605446,
"grad_norm": 1.6838996787097582,
"learning_rate": 2.9803815998954334e-06,
"loss": 0.9701,
"step": 1675
},
{
"epoch": 0.7752653437932626,
"grad_norm": 1.7069776476837635,
"learning_rate": 2.9232384791532377e-06,
"loss": 0.9724,
"step": 1680
},
{
"epoch": 0.7775726811259807,
"grad_norm": 1.6302606016182208,
"learning_rate": 2.866554559324731e-06,
"loss": 0.9441,
"step": 1685
},
{
"epoch": 0.7798800184586987,
"grad_norm": 1.6451304542174006,
"learning_rate": 2.810333518548246e-06,
"loss": 0.9337,
"step": 1690
},
{
"epoch": 0.7821873557914167,
"grad_norm": 1.763422741436097,
"learning_rate": 2.7545790049265506e-06,
"loss": 0.9542,
"step": 1695
},
{
"epoch": 0.7844946931241348,
"grad_norm": 1.6780739225283752,
"learning_rate": 2.699294636290134e-06,
"loss": 0.9468,
"step": 1700
},
{
"epoch": 0.7868020304568528,
"grad_norm": 1.7170452157015115,
"learning_rate": 2.6444839999624496e-06,
"loss": 0.9333,
"step": 1705
},
{
"epoch": 0.7891093677895709,
"grad_norm": 1.6339639009427172,
"learning_rate": 2.5901506525271424e-06,
"loss": 0.9656,
"step": 1710
},
{
"epoch": 0.7914167051222889,
"grad_norm": 1.7076055984466658,
"learning_rate": 2.5362981195972627e-06,
"loss": 0.9292,
"step": 1715
},
{
"epoch": 0.7937240424550069,
"grad_norm": 1.694775381636099,
"learning_rate": 2.4829298955865022e-06,
"loss": 0.9621,
"step": 1720
},
{
"epoch": 0.796031379787725,
"grad_norm": 1.6448376757312444,
"learning_rate": 2.4300494434824373e-06,
"loss": 0.9323,
"step": 1725
},
{
"epoch": 0.798338717120443,
"grad_norm": 1.7190721711044321,
"learning_rate": 2.3776601946218225e-06,
"loss": 0.9536,
"step": 1730
},
{
"epoch": 0.8006460544531611,
"grad_norm": 1.7369056682520372,
"learning_rate": 2.3257655484679376e-06,
"loss": 0.9474,
"step": 1735
},
{
"epoch": 0.8029533917858791,
"grad_norm": 1.9277048905233987,
"learning_rate": 2.274368872390009e-06,
"loss": 0.953,
"step": 1740
},
{
"epoch": 0.8052607291185971,
"grad_norm": 1.6988058479966548,
"learning_rate": 2.2234735014446905e-06,
"loss": 0.9546,
"step": 1745
},
{
"epoch": 0.8075680664513152,
"grad_norm": 1.708926311661711,
"learning_rate": 2.1730827381596643e-06,
"loss": 0.9442,
"step": 1750
},
{
"epoch": 0.8098754037840332,
"grad_norm": 1.7384137551353784,
"learning_rate": 2.123199852319352e-06,
"loss": 0.9415,
"step": 1755
},
{
"epoch": 0.8121827411167513,
"grad_norm": 1.706865871432203,
"learning_rate": 2.073828080752728e-06,
"loss": 0.9514,
"step": 1760
},
{
"epoch": 0.8144900784494693,
"grad_norm": 1.631591345911517,
"learning_rate": 2.024970627123295e-06,
"loss": 0.9593,
"step": 1765
},
{
"epoch": 0.8167974157821873,
"grad_norm": 1.7250014472303201,
"learning_rate": 1.976630661721207e-06,
"loss": 0.9312,
"step": 1770
},
{
"epoch": 0.8191047531149054,
"grad_norm": 1.7176929983275837,
"learning_rate": 1.9288113212575454e-06,
"loss": 0.9392,
"step": 1775
},
{
"epoch": 0.8214120904476234,
"grad_norm": 1.7515517183747666,
"learning_rate": 1.8815157086607826e-06,
"loss": 0.9461,
"step": 1780
},
{
"epoch": 0.8237194277803415,
"grad_norm": 1.7256573401014044,
"learning_rate": 1.8347468928754408e-06,
"loss": 0.9625,
"step": 1785
},
{
"epoch": 0.8260267651130595,
"grad_norm": 1.6925351828448565,
"learning_rate": 1.7885079086629598e-06,
"loss": 0.9618,
"step": 1790
},
{
"epoch": 0.8283341024457775,
"grad_norm": 1.6848423059711715,
"learning_rate": 1.7428017564047594e-06,
"loss": 0.957,
"step": 1795
},
{
"epoch": 0.8306414397784956,
"grad_norm": 1.685378392680059,
"learning_rate": 1.697631401907559e-06,
"loss": 0.9405,
"step": 1800
},
{
"epoch": 0.8329487771112136,
"grad_norm": 1.779147836504438,
"learning_rate": 1.6529997762109319e-06,
"loss": 0.9475,
"step": 1805
},
{
"epoch": 0.8352561144439317,
"grad_norm": 1.6862245640499274,
"learning_rate": 1.6089097753971061e-06,
"loss": 0.9433,
"step": 1810
},
{
"epoch": 0.8375634517766497,
"grad_norm": 1.6387699919494911,
"learning_rate": 1.565364260403055e-06,
"loss": 0.9393,
"step": 1815
},
{
"epoch": 0.8398707891093677,
"grad_norm": 1.7088328181524817,
"learning_rate": 1.522366056834844e-06,
"loss": 0.9322,
"step": 1820
},
{
"epoch": 0.8421781264420858,
"grad_norm": 1.717214554275275,
"learning_rate": 1.4799179547842823e-06,
"loss": 0.9393,
"step": 1825
},
{
"epoch": 0.8444854637748038,
"grad_norm": 1.717143943169584,
"learning_rate": 1.4380227086478816e-06,
"loss": 0.96,
"step": 1830
},
{
"epoch": 0.846792801107522,
"grad_norm": 1.7390583641873172,
"learning_rate": 1.3966830369481231e-06,
"loss": 0.9487,
"step": 1835
},
{
"epoch": 0.84910013844024,
"grad_norm": 1.6483510037137357,
"learning_rate": 1.3559016221570663e-06,
"loss": 0.9315,
"step": 1840
},
{
"epoch": 0.8514074757729581,
"grad_norm": 1.7716145374153562,
"learning_rate": 1.3156811105222723e-06,
"loss": 0.9375,
"step": 1845
},
{
"epoch": 0.8537148131056761,
"grad_norm": 1.7136369284376767,
"learning_rate": 1.276024111895101e-06,
"loss": 0.9592,
"step": 1850
},
{
"epoch": 0.8560221504383941,
"grad_norm": 1.659445261277345,
"learning_rate": 1.2369331995613664e-06,
"loss": 0.9466,
"step": 1855
},
{
"epoch": 0.8583294877711122,
"grad_norm": 1.7022849150801465,
"learning_rate": 1.1984109100743445e-06,
"loss": 0.934,
"step": 1860
},
{
"epoch": 0.8606368251038302,
"grad_norm": 1.7296616243070897,
"learning_rate": 1.1604597430902032e-06,
"loss": 0.9413,
"step": 1865
},
{
"epoch": 0.8629441624365483,
"grad_norm": 1.70400411645417,
"learning_rate": 1.123082161205775e-06,
"loss": 0.9192,
"step": 1870
},
{
"epoch": 0.8652514997692663,
"grad_norm": 1.7716704719549248,
"learning_rate": 1.0862805897987894e-06,
"loss": 0.9313,
"step": 1875
},
{
"epoch": 0.8675588371019843,
"grad_norm": 1.6028791905428008,
"learning_rate": 1.0500574168704746e-06,
"loss": 0.9222,
"step": 1880
},
{
"epoch": 0.8698661744347024,
"grad_norm": 1.6811903757943443,
"learning_rate": 1.014414992890611e-06,
"loss": 0.9613,
"step": 1885
},
{
"epoch": 0.8721735117674204,
"grad_norm": 1.6582769629121996,
"learning_rate": 9.793556306450125e-07,
"loss": 0.9397,
"step": 1890
},
{
"epoch": 0.8744808491001385,
"grad_norm": 1.7329477389715338,
"learning_rate": 9.448816050854559e-07,
"loss": 0.9456,
"step": 1895
},
{
"epoch": 0.8767881864328565,
"grad_norm": 1.6579278218600528,
"learning_rate": 9.10995153182056e-07,
"loss": 0.9622,
"step": 1900
},
{
"epoch": 0.8790955237655745,
"grad_norm": 1.7059466024322731,
"learning_rate": 8.776984737781135e-07,
"loss": 0.9247,
"step": 1905
},
{
"epoch": 0.8814028610982926,
"grad_norm": 1.6587934531932846,
"learning_rate": 8.449937274474396e-07,
"loss": 0.9287,
"step": 1910
},
{
"epoch": 0.8837101984310106,
"grad_norm": 1.7000421816452764,
"learning_rate": 8.128830363541574e-07,
"loss": 0.9579,
"step": 1915
},
{
"epoch": 0.8860175357637287,
"grad_norm": 1.5982289727068608,
"learning_rate": 7.81368484114996e-07,
"loss": 0.9252,
"step": 1920
},
{
"epoch": 0.8883248730964467,
"grad_norm": 1.6595039047537794,
"learning_rate": 7.504521156640854e-07,
"loss": 0.9535,
"step": 1925
},
{
"epoch": 0.8906322104291647,
"grad_norm": 1.67142320517366,
"learning_rate": 7.201359371202698e-07,
"loss": 0.9342,
"step": 1930
},
{
"epoch": 0.8929395477618828,
"grad_norm": 1.724210373217245,
"learning_rate": 6.904219156569325e-07,
"loss": 0.9537,
"step": 1935
},
{
"epoch": 0.8952468850946008,
"grad_norm": 1.775137829218875,
"learning_rate": 6.613119793743428e-07,
"loss": 0.9407,
"step": 1940
},
{
"epoch": 0.8975542224273189,
"grad_norm": 1.6829966494434596,
"learning_rate": 6.32808017174551e-07,
"loss": 0.9271,
"step": 1945
},
{
"epoch": 0.8998615597600369,
"grad_norm": 1.7666732440040138,
"learning_rate": 6.049118786388153e-07,
"loss": 0.9299,
"step": 1950
},
{
"epoch": 0.9021688970927549,
"grad_norm": 1.7328446243608238,
"learning_rate": 5.776253739075887e-07,
"loss": 0.9368,
"step": 1955
},
{
"epoch": 0.904476234425473,
"grad_norm": 1.794453478228994,
"learning_rate": 5.509502735630601e-07,
"loss": 0.9584,
"step": 1960
},
{
"epoch": 0.906783571758191,
"grad_norm": 1.8595416734645607,
"learning_rate": 5.248883085142653e-07,
"loss": 0.9278,
"step": 1965
},
{
"epoch": 0.9090909090909091,
"grad_norm": 1.8064599818138745,
"learning_rate": 4.994411698847668e-07,
"loss": 0.9521,
"step": 1970
},
{
"epoch": 0.9113982464236271,
"grad_norm": 11.97761580494739,
"learning_rate": 4.746105089029229e-07,
"loss": 0.9353,
"step": 1975
},
{
"epoch": 0.9137055837563451,
"grad_norm": 1.91129591540662,
"learning_rate": 4.50397936794742e-07,
"loss": 0.9518,
"step": 1980
},
{
"epoch": 0.9160129210890632,
"grad_norm": 1.68892958160335,
"learning_rate": 4.268050246793276e-07,
"loss": 0.9417,
"step": 1985
},
{
"epoch": 0.9183202584217812,
"grad_norm": 1.7682216031642473,
"learning_rate": 4.038333034669406e-07,
"loss": 0.9575,
"step": 1990
},
{
"epoch": 0.9206275957544993,
"grad_norm": 1.6202959145852942,
"learning_rate": 3.814842637596483e-07,
"loss": 0.9202,
"step": 1995
},
{
"epoch": 0.9229349330872173,
"grad_norm": 1.7355964322786217,
"learning_rate": 3.5975935575461083e-07,
"loss": 0.9408,
"step": 2000
},
{
"epoch": 0.9252422704199353,
"grad_norm": 1.784833888232325,
"learning_rate": 3.3865998914997645e-07,
"loss": 0.9451,
"step": 2005
},
{
"epoch": 0.9275496077526535,
"grad_norm": 1.7347540917405002,
"learning_rate": 3.1818753305340566e-07,
"loss": 0.9503,
"step": 2010
},
{
"epoch": 0.9298569450853715,
"grad_norm": 1.7098932522181238,
"learning_rate": 2.9834331589323697e-07,
"loss": 0.9648,
"step": 2015
},
{
"epoch": 0.9321642824180896,
"grad_norm": 1.7375733049730988,
"learning_rate": 2.791286253322856e-07,
"loss": 0.9325,
"step": 2020
},
{
"epoch": 0.9344716197508076,
"grad_norm": 1.7376811511996035,
"learning_rate": 2.605447081842838e-07,
"loss": 0.9236,
"step": 2025
},
{
"epoch": 0.9367789570835257,
"grad_norm": 1.6285382014408727,
"learning_rate": 2.425927703329856e-07,
"loss": 0.9374,
"step": 2030
},
{
"epoch": 0.9390862944162437,
"grad_norm": 1.7187090366001978,
"learning_rate": 2.2527397665391026e-07,
"loss": 0.9408,
"step": 2035
},
{
"epoch": 0.9413936317489617,
"grad_norm": 1.6018424242699771,
"learning_rate": 2.0858945093876315e-07,
"loss": 0.9255,
"step": 2040
},
{
"epoch": 0.9437009690816798,
"grad_norm": 1.7403672056926338,
"learning_rate": 1.9254027582250588e-07,
"loss": 0.9386,
"step": 2045
},
{
"epoch": 0.9460083064143978,
"grad_norm": 1.6413885110477684,
"learning_rate": 1.7712749271311392e-07,
"loss": 0.9463,
"step": 2050
},
{
"epoch": 0.9483156437471159,
"grad_norm": 1.6584652712298515,
"learning_rate": 1.6235210172399373e-07,
"loss": 0.9197,
"step": 2055
},
{
"epoch": 0.9506229810798339,
"grad_norm": 1.7146073811212112,
"learning_rate": 1.4821506160909492e-07,
"loss": 0.9325,
"step": 2060
},
{
"epoch": 0.9529303184125519,
"grad_norm": 1.7862356926674563,
"learning_rate": 1.3471728970068986e-07,
"loss": 0.9415,
"step": 2065
},
{
"epoch": 0.95523765574527,
"grad_norm": 1.7900835474706864,
"learning_rate": 1.2185966184985687e-07,
"loss": 0.9516,
"step": 2070
},
{
"epoch": 0.957544993077988,
"grad_norm": 1.61965861822062,
"learning_rate": 1.0964301236963904e-07,
"loss": 0.9272,
"step": 2075
},
{
"epoch": 0.9598523304107061,
"grad_norm": 1.752477339334271,
"learning_rate": 9.806813398091419e-08,
"loss": 0.9231,
"step": 2080
},
{
"epoch": 0.9621596677434241,
"grad_norm": 1.6566621651012274,
"learning_rate": 8.713577776095494e-08,
"loss": 0.9293,
"step": 2085
},
{
"epoch": 0.9644670050761421,
"grad_norm": 1.637994937785989,
"learning_rate": 7.684665309468875e-08,
"loss": 0.9539,
"step": 2090
},
{
"epoch": 0.9667743424088602,
"grad_norm": 1.6112435656292767,
"learning_rate": 6.720142762867032e-08,
"loss": 0.9558,
"step": 2095
},
{
"epoch": 0.9690816797415782,
"grad_norm": 1.7629733611663445,
"learning_rate": 5.820072722775849e-08,
"loss": 0.9441,
"step": 2100
},
{
"epoch": 0.9713890170742963,
"grad_norm": 1.6502417380976349,
"learning_rate": 4.984513593450424e-08,
"loss": 0.9527,
"step": 2105
},
{
"epoch": 0.9736963544070143,
"grad_norm": 1.697689686578805,
"learning_rate": 4.2135195931249925e-08,
"loss": 0.9468,
"step": 2110
},
{
"epoch": 0.9760036917397323,
"grad_norm": 1.7391293446601994,
"learning_rate": 3.50714075049563e-08,
"loss": 0.932,
"step": 2115
},
{
"epoch": 0.9783110290724504,
"grad_norm": 1.7092970910189305,
"learning_rate": 2.8654229014730694e-08,
"loss": 0.9377,
"step": 2120
},
{
"epoch": 0.9806183664051684,
"grad_norm": 1.6908979053958657,
"learning_rate": 2.2884076862089712e-08,
"loss": 0.9238,
"step": 2125
},
{
"epoch": 0.9829257037378865,
"grad_norm": 1.634296694613385,
"learning_rate": 1.7761325463937495e-08,
"loss": 0.9473,
"step": 2130
},
{
"epoch": 0.9852330410706045,
"grad_norm": 1.700195091519778,
"learning_rate": 1.3286307228269623e-08,
"loss": 0.9491,
"step": 2135
},
{
"epoch": 0.9875403784033225,
"grad_norm": 1.6899977441806657,
"learning_rate": 9.459312532608122e-09,
"loss": 0.9393,
"step": 2140
},
{
"epoch": 0.9898477157360406,
"grad_norm": 1.6459431767651571,
"learning_rate": 6.280589705153217e-09,
"loss": 0.9316,
"step": 2145
},
{
"epoch": 0.9921550530687586,
"grad_norm": 1.6824622254831971,
"learning_rate": 3.750345008675105e-09,
"loss": 0.9455,
"step": 2150
},
{
"epoch": 0.9944623904014767,
"grad_norm": 1.6711402676574085,
"learning_rate": 1.8687426271246646e-09,
"loss": 0.9299,
"step": 2155
},
{
"epoch": 0.9967697277341947,
"grad_norm": 1.7394983815714609,
"learning_rate": 6.359046549864189e-10,
"loss": 0.9262,
"step": 2160
},
{
"epoch": 0.9990770650669127,
"grad_norm": 1.9029748936658466,
"learning_rate": 5.1911089347100876e-11,
"loss": 0.973,
"step": 2165
},
{
"epoch": 1.0,
"eval_loss": 0.9539673924446106,
"eval_runtime": 317.6397,
"eval_samples_per_second": 48.325,
"eval_steps_per_second": 0.756,
"step": 2167
},
{
"epoch": 1.0,
"step": 2167,
"total_flos": 453725713858560.0,
"train_loss": 1.0076359760722753,
"train_runtime": 13620.7718,
"train_samples_per_second": 10.182,
"train_steps_per_second": 0.159
}
],
"logging_steps": 5,
"max_steps": 2167,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 453725713858560.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
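Below is a minimal sketch (not part of the original trainer_state.json) of how the log above could be inspected programmatically; the file path is an assumption, the field names mirror the JSON shown here, and only the Python standard library is used.

import json

# Load the trainer state written out by the Hugging Face Trainer (path is an assumption).
with open("trainer_state.json") as f:
    state = json.load(f)

# Each periodic logging entry in "log_history" carries epoch, grad_norm,
# learning_rate, loss, and step; the final entries hold eval and summary stats.
train_entries = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in train_entries]
losses = [e["loss"] for e in train_entries]

print(f"logged training points: {len(steps)}")
print(f"first loss: {losses[0]:.4f} at step {steps[0]}")
print(f"last loss:  {losses[-1]:.4f} at step {steps[-1]}")

eval_losses = [e["eval_loss"] for e in state["log_history"] if "eval_loss" in e]
if eval_losses:
    print(f"final eval_loss: {eval_losses[-1]:.4f}")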