{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 2166,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004616805170821791,
"grad_norm": 15.672144611335952,
"learning_rate": 9.216589861751152e-08,
"loss": 1.3168,
"step": 1
},
{
"epoch": 0.0023084025854108957,
"grad_norm": 14.680930201512727,
"learning_rate": 4.608294930875577e-07,
"loss": 1.2513,
"step": 5
},
{
"epoch": 0.0046168051708217915,
"grad_norm": 8.024031806354767,
"learning_rate": 9.216589861751154e-07,
"loss": 1.1965,
"step": 10
},
{
"epoch": 0.006925207756232687,
"grad_norm": 5.48716759630272,
"learning_rate": 1.382488479262673e-06,
"loss": 1.0985,
"step": 15
},
{
"epoch": 0.009233610341643583,
"grad_norm": 5.068857847099135,
"learning_rate": 1.8433179723502307e-06,
"loss": 1.0416,
"step": 20
},
{
"epoch": 0.011542012927054479,
"grad_norm": 5.784416390233076,
"learning_rate": 2.3041474654377884e-06,
"loss": 1.0414,
"step": 25
},
{
"epoch": 0.013850415512465374,
"grad_norm": 5.088059220204017,
"learning_rate": 2.764976958525346e-06,
"loss": 1.0977,
"step": 30
},
{
"epoch": 0.016158818097876268,
"grad_norm": 4.831013532516624,
"learning_rate": 3.225806451612903e-06,
"loss": 1.0864,
"step": 35
},
{
"epoch": 0.018467220683287166,
"grad_norm": 5.01522341202905,
"learning_rate": 3.6866359447004615e-06,
"loss": 1.0875,
"step": 40
},
{
"epoch": 0.02077562326869806,
"grad_norm": 5.2743430965797495,
"learning_rate": 4.147465437788019e-06,
"loss": 1.0582,
"step": 45
},
{
"epoch": 0.023084025854108958,
"grad_norm": 4.868797420697276,
"learning_rate": 4.608294930875577e-06,
"loss": 1.0706,
"step": 50
},
{
"epoch": 0.025392428439519853,
"grad_norm": 4.866881384682284,
"learning_rate": 5.0691244239631346e-06,
"loss": 1.0874,
"step": 55
},
{
"epoch": 0.027700831024930747,
"grad_norm": 4.942215298185941,
"learning_rate": 5.529953917050692e-06,
"loss": 1.0765,
"step": 60
},
{
"epoch": 0.030009233610341645,
"grad_norm": 4.510624758005134,
"learning_rate": 5.9907834101382485e-06,
"loss": 1.0651,
"step": 65
},
{
"epoch": 0.032317636195752536,
"grad_norm": 4.961160516804359,
"learning_rate": 6.451612903225806e-06,
"loss": 1.0826,
"step": 70
},
{
"epoch": 0.03462603878116344,
"grad_norm": 4.999318871040395,
"learning_rate": 6.912442396313365e-06,
"loss": 1.0966,
"step": 75
},
{
"epoch": 0.03693444136657433,
"grad_norm": 4.639315170945839,
"learning_rate": 7.373271889400923e-06,
"loss": 1.0934,
"step": 80
},
{
"epoch": 0.039242843951985226,
"grad_norm": 4.79620290699333,
"learning_rate": 7.83410138248848e-06,
"loss": 1.0932,
"step": 85
},
{
"epoch": 0.04155124653739612,
"grad_norm": 4.957993386602933,
"learning_rate": 8.294930875576038e-06,
"loss": 1.1032,
"step": 90
},
{
"epoch": 0.043859649122807015,
"grad_norm": 4.669607524515842,
"learning_rate": 8.755760368663595e-06,
"loss": 1.0875,
"step": 95
},
{
"epoch": 0.046168051708217916,
"grad_norm": 4.602861109332021,
"learning_rate": 9.216589861751153e-06,
"loss": 1.0809,
"step": 100
},
{
"epoch": 0.04847645429362881,
"grad_norm": 4.548726172146522,
"learning_rate": 9.67741935483871e-06,
"loss": 1.1218,
"step": 105
},
{
"epoch": 0.050784856879039705,
"grad_norm": 4.724149571099335,
"learning_rate": 1.0138248847926269e-05,
"loss": 1.1007,
"step": 110
},
{
"epoch": 0.0530932594644506,
"grad_norm": 5.309010432204349,
"learning_rate": 1.0599078341013826e-05,
"loss": 1.1368,
"step": 115
},
{
"epoch": 0.055401662049861494,
"grad_norm": 4.839305137997795,
"learning_rate": 1.1059907834101385e-05,
"loss": 1.1055,
"step": 120
},
{
"epoch": 0.05771006463527239,
"grad_norm": 4.5796294161615565,
"learning_rate": 1.152073732718894e-05,
"loss": 1.1155,
"step": 125
},
{
"epoch": 0.06001846722068329,
"grad_norm": 4.706959812240538,
"learning_rate": 1.1981566820276497e-05,
"loss": 1.1387,
"step": 130
},
{
"epoch": 0.062326869806094184,
"grad_norm": 4.492165983938348,
"learning_rate": 1.2442396313364056e-05,
"loss": 1.1733,
"step": 135
},
{
"epoch": 0.06463527239150507,
"grad_norm": 4.746032213098828,
"learning_rate": 1.2903225806451613e-05,
"loss": 1.1375,
"step": 140
},
{
"epoch": 0.06694367497691597,
"grad_norm": 4.713817907356248,
"learning_rate": 1.3364055299539171e-05,
"loss": 1.158,
"step": 145
},
{
"epoch": 0.06925207756232687,
"grad_norm": 4.342905646572964,
"learning_rate": 1.382488479262673e-05,
"loss": 1.1607,
"step": 150
},
{
"epoch": 0.07156048014773776,
"grad_norm": 4.502102336400582,
"learning_rate": 1.4285714285714287e-05,
"loss": 1.1382,
"step": 155
},
{
"epoch": 0.07386888273314866,
"grad_norm": 4.300393542300411,
"learning_rate": 1.4746543778801846e-05,
"loss": 1.1518,
"step": 160
},
{
"epoch": 0.07617728531855955,
"grad_norm": 4.400546990325483,
"learning_rate": 1.5207373271889403e-05,
"loss": 1.1436,
"step": 165
},
{
"epoch": 0.07848568790397045,
"grad_norm": 4.77590791643038,
"learning_rate": 1.566820276497696e-05,
"loss": 1.2173,
"step": 170
},
{
"epoch": 0.08079409048938135,
"grad_norm": 4.32969974114785,
"learning_rate": 1.6129032258064517e-05,
"loss": 1.1654,
"step": 175
},
{
"epoch": 0.08310249307479224,
"grad_norm": 5.285074448558262,
"learning_rate": 1.6589861751152075e-05,
"loss": 1.2185,
"step": 180
},
{
"epoch": 0.08541089566020314,
"grad_norm": 6.312179413881035,
"learning_rate": 1.705069124423963e-05,
"loss": 1.2063,
"step": 185
},
{
"epoch": 0.08771929824561403,
"grad_norm": 4.351482667809684,
"learning_rate": 1.751152073732719e-05,
"loss": 1.1814,
"step": 190
},
{
"epoch": 0.09002770083102493,
"grad_norm": 4.468079454686115,
"learning_rate": 1.7972350230414748e-05,
"loss": 1.2058,
"step": 195
},
{
"epoch": 0.09233610341643583,
"grad_norm": 5.51425273025908,
"learning_rate": 1.8433179723502307e-05,
"loss": 1.1646,
"step": 200
},
{
"epoch": 0.09464450600184672,
"grad_norm": 4.661323669253999,
"learning_rate": 1.8894009216589862e-05,
"loss": 1.1689,
"step": 205
},
{
"epoch": 0.09695290858725762,
"grad_norm": 726.888849011745,
"learning_rate": 1.935483870967742e-05,
"loss": 1.8785,
"step": 210
},
{
"epoch": 0.09926131117266851,
"grad_norm": 5.867844131835615,
"learning_rate": 1.981566820276498e-05,
"loss": 1.2661,
"step": 215
},
{
"epoch": 0.10156971375807941,
"grad_norm": 6.2684199277472015,
"learning_rate": 1.9999883080288618e-05,
"loss": 1.2545,
"step": 220
},
{
"epoch": 0.1038781163434903,
"grad_norm": 5.426004523317811,
"learning_rate": 1.999916858084231e-05,
"loss": 1.2259,
"step": 225
},
{
"epoch": 0.1061865189289012,
"grad_norm": 4.617593739028291,
"learning_rate": 1.999780458369908e-05,
"loss": 1.177,
"step": 230
},
{
"epoch": 0.1084949215143121,
"grad_norm": 4.412649452769939,
"learning_rate": 1.9995791177457598e-05,
"loss": 1.2127,
"step": 235
},
{
"epoch": 0.11080332409972299,
"grad_norm": 4.3422685444059965,
"learning_rate": 1.9993128492899012e-05,
"loss": 1.2398,
"step": 240
},
{
"epoch": 0.11311172668513389,
"grad_norm": 4.837187367612426,
"learning_rate": 1.9989816702978447e-05,
"loss": 1.2189,
"step": 245
},
{
"epoch": 0.11542012927054478,
"grad_norm": 4.203624655101154,
"learning_rate": 1.998585602281378e-05,
"loss": 1.1641,
"step": 250
},
{
"epoch": 0.11772853185595568,
"grad_norm": 4.172078683953242,
"learning_rate": 1.9981246709671668e-05,
"loss": 1.217,
"step": 255
},
{
"epoch": 0.12003693444136658,
"grad_norm": 4.445815868978626,
"learning_rate": 1.9975989062950828e-05,
"loss": 1.2198,
"step": 260
},
{
"epoch": 0.12234533702677747,
"grad_norm": 4.5591861583880045,
"learning_rate": 1.9970083424162598e-05,
"loss": 1.2971,
"step": 265
},
{
"epoch": 0.12465373961218837,
"grad_norm": 8.794456155689286,
"learning_rate": 1.9963530176908752e-05,
"loss": 1.2543,
"step": 270
},
{
"epoch": 0.12696214219759927,
"grad_norm": 4.296337355363852,
"learning_rate": 1.9956329746856583e-05,
"loss": 1.1902,
"step": 275
},
{
"epoch": 0.12927054478301014,
"grad_norm": 4.210037276606183,
"learning_rate": 1.9948482601711245e-05,
"loss": 1.2119,
"step": 280
},
{
"epoch": 0.13157894736842105,
"grad_norm": 4.525819047133829,
"learning_rate": 1.9939989251185386e-05,
"loss": 1.2267,
"step": 285
},
{
"epoch": 0.13388734995383195,
"grad_norm": 4.617114491344834,
"learning_rate": 1.993085024696604e-05,
"loss": 1.253,
"step": 290
},
{
"epoch": 0.13619575253924285,
"grad_norm": 4.306884827311129,
"learning_rate": 1.992106618267878e-05,
"loss": 1.2968,
"step": 295
},
{
"epoch": 0.13850415512465375,
"grad_norm": 4.06156778374873,
"learning_rate": 1.9910637693849166e-05,
"loss": 1.2523,
"step": 300
},
{
"epoch": 0.14081255771006462,
"grad_norm": 4.1725194301873225,
"learning_rate": 1.9899565457861463e-05,
"loss": 1.2465,
"step": 305
},
{
"epoch": 0.14312096029547552,
"grad_norm": 6.1582094432239005,
"learning_rate": 1.988785019391465e-05,
"loss": 1.2893,
"step": 310
},
{
"epoch": 0.14542936288088643,
"grad_norm": 4.468419480755536,
"learning_rate": 1.987549266297568e-05,
"loss": 1.2684,
"step": 315
},
{
"epoch": 0.14773776546629733,
"grad_norm": 4.663314719431853,
"learning_rate": 1.986249366773009e-05,
"loss": 1.2472,
"step": 320
},
{
"epoch": 0.15004616805170823,
"grad_norm": 4.557295583444763,
"learning_rate": 1.9848854052529822e-05,
"loss": 1.2856,
"step": 325
},
{
"epoch": 0.1523545706371191,
"grad_norm": 4.128322557091226,
"learning_rate": 1.9834574703338406e-05,
"loss": 1.2717,
"step": 330
},
{
"epoch": 0.15466297322253,
"grad_norm": 4.265562249971871,
"learning_rate": 1.9819656547673393e-05,
"loss": 1.2614,
"step": 335
},
{
"epoch": 0.1569713758079409,
"grad_norm": 4.189648461283852,
"learning_rate": 1.9804100554546127e-05,
"loss": 1.2221,
"step": 340
},
{
"epoch": 0.1592797783933518,
"grad_norm": 4.753781028146877,
"learning_rate": 1.9787907734398785e-05,
"loss": 1.2641,
"step": 345
},
{
"epoch": 0.1615881809787627,
"grad_norm": 4.624571477954883,
"learning_rate": 1.9771079139038765e-05,
"loss": 1.3082,
"step": 350
},
{
"epoch": 0.16389658356417358,
"grad_norm": 4.51192227446534,
"learning_rate": 1.9753615861570338e-05,
"loss": 1.3116,
"step": 355
},
{
"epoch": 0.16620498614958448,
"grad_norm": 4.392313959090253,
"learning_rate": 1.9735519036323656e-05,
"loss": 1.2304,
"step": 360
},
{
"epoch": 0.16851338873499538,
"grad_norm": 4.979240339208881,
"learning_rate": 1.9716789838781095e-05,
"loss": 1.2682,
"step": 365
},
{
"epoch": 0.17082179132040629,
"grad_norm": 4.96937836441046,
"learning_rate": 1.9697429485500862e-05,
"loss": 1.3054,
"step": 370
},
{
"epoch": 0.1731301939058172,
"grad_norm": 3.935739346153204,
"learning_rate": 1.9677439234038004e-05,
"loss": 1.2704,
"step": 375
},
{
"epoch": 0.17543859649122806,
"grad_norm": 4.366123456450803,
"learning_rate": 1.96568203828627e-05,
"loss": 1.236,
"step": 380
},
{
"epoch": 0.17774699907663896,
"grad_norm": 4.003638705307624,
"learning_rate": 1.963557427127594e-05,
"loss": 1.2134,
"step": 385
},
{
"epoch": 0.18005540166204986,
"grad_norm": 4.711836278485082,
"learning_rate": 1.9613702279322518e-05,
"loss": 1.2424,
"step": 390
},
{
"epoch": 0.18236380424746076,
"grad_norm": 4.7756346414851345,
"learning_rate": 1.95912058277014e-05,
"loss": 1.2513,
"step": 395
},
{
"epoch": 0.18467220683287167,
"grad_norm": 4.055556447653374,
"learning_rate": 1.9568086377673422e-05,
"loss": 1.2305,
"step": 400
},
{
"epoch": 0.18698060941828254,
"grad_norm": 3.9870929086001605,
"learning_rate": 1.9544345430966398e-05,
"loss": 1.2766,
"step": 405
},
{
"epoch": 0.18928901200369344,
"grad_norm": 4.3683569271591525,
"learning_rate": 1.951998452967756e-05,
"loss": 1.2701,
"step": 410
},
{
"epoch": 0.19159741458910434,
"grad_norm": 4.282177503327308,
"learning_rate": 1.9495005256173398e-05,
"loss": 1.2173,
"step": 415
},
{
"epoch": 0.19390581717451524,
"grad_norm": 4.122228465513596,
"learning_rate": 1.9469409232986876e-05,
"loss": 1.293,
"step": 420
},
{
"epoch": 0.19621421975992612,
"grad_norm": 4.391730062186428,
"learning_rate": 1.9443198122712036e-05,
"loss": 1.3013,
"step": 425
},
{
"epoch": 0.19852262234533702,
"grad_norm": 4.2533205751093,
"learning_rate": 1.9416373627896002e-05,
"loss": 1.2478,
"step": 430
},
{
"epoch": 0.20083102493074792,
"grad_norm": 4.982151398275928,
"learning_rate": 1.9388937490928402e-05,
"loss": 1.289,
"step": 435
},
{
"epoch": 0.20313942751615882,
"grad_norm": 4.254393940238592,
"learning_rate": 1.9360891493928186e-05,
"loss": 1.2773,
"step": 440
},
{
"epoch": 0.20544783010156972,
"grad_norm": 4.812233488623846,
"learning_rate": 1.933223745862786e-05,
"loss": 1.2571,
"step": 445
},
{
"epoch": 0.2077562326869806,
"grad_norm": 4.193819364681046,
"learning_rate": 1.930297724625516e-05,
"loss": 1.3167,
"step": 450
},
{
"epoch": 0.2100646352723915,
"grad_norm": 4.318967687699199,
"learning_rate": 1.9273112757412165e-05,
"loss": 1.2578,
"step": 455
},
{
"epoch": 0.2123730378578024,
"grad_norm": 4.021438837096732,
"learning_rate": 1.9242645931951833e-05,
"loss": 1.2703,
"step": 460
},
{
"epoch": 0.2146814404432133,
"grad_norm": 3.9988355301981344,
"learning_rate": 1.921157874885199e-05,
"loss": 1.2702,
"step": 465
},
{
"epoch": 0.2169898430286242,
"grad_norm": 3.866018897785007,
"learning_rate": 1.91799132260868e-05,
"loss": 1.2651,
"step": 470
},
{
"epoch": 0.21929824561403508,
"grad_norm": 4.228145732575894,
"learning_rate": 1.9147651420495696e-05,
"loss": 1.2429,
"step": 475
},
{
"epoch": 0.22160664819944598,
"grad_norm": 4.16044625111994,
"learning_rate": 1.9114795427649735e-05,
"loss": 1.2263,
"step": 480
},
{
"epoch": 0.22391505078485688,
"grad_norm": 3.7071606709047678,
"learning_rate": 1.9081347381715535e-05,
"loss": 1.2592,
"step": 485
},
{
"epoch": 0.22622345337026778,
"grad_norm": 4.093983584879632,
"learning_rate": 1.904730945531661e-05,
"loss": 1.2819,
"step": 490
},
{
"epoch": 0.22853185595567868,
"grad_norm": 4.247421291613911,
"learning_rate": 1.901268385939226e-05,
"loss": 1.3118,
"step": 495
},
{
"epoch": 0.23084025854108955,
"grad_norm": 4.088704419142061,
"learning_rate": 1.8977472843053962e-05,
"loss": 1.2529,
"step": 500
},
{
"epoch": 0.23314866112650046,
"grad_norm": 3.9526614218286698,
"learning_rate": 1.8941678693439272e-05,
"loss": 1.2254,
"step": 505
},
{
"epoch": 0.23545706371191136,
"grad_norm": 3.767319095108075,
"learning_rate": 1.8905303735563274e-05,
"loss": 1.2705,
"step": 510
},
{
"epoch": 0.23776546629732226,
"grad_norm": 4.1464464034097,
"learning_rate": 1.886835033216755e-05,
"loss": 1.2841,
"step": 515
},
{
"epoch": 0.24007386888273316,
"grad_norm": 4.154511161776497,
"learning_rate": 1.88308208835667e-05,
"loss": 1.2715,
"step": 520
},
{
"epoch": 0.24238227146814403,
"grad_norm": 4.815166096996458,
"learning_rate": 1.8792717827492446e-05,
"loss": 1.3034,
"step": 525
},
{
"epoch": 0.24469067405355494,
"grad_norm": 22.245546847367528,
"learning_rate": 1.8754043638935283e-05,
"loss": 1.2532,
"step": 530
},
{
"epoch": 0.24699907663896584,
"grad_norm": 4.177323522295811,
"learning_rate": 1.871480082998371e-05,
"loss": 1.2501,
"step": 535
},
{
"epoch": 0.24930747922437674,
"grad_norm": 3.9426463777773346,
"learning_rate": 1.867499194966106e-05,
"loss": 1.2683,
"step": 540
},
{
"epoch": 0.2516158818097876,
"grad_norm": 3.912690873331932,
"learning_rate": 1.8634619583759933e-05,
"loss": 1.2874,
"step": 545
},
{
"epoch": 0.25392428439519854,
"grad_norm": 3.972529239438344,
"learning_rate": 1.8593686354674223e-05,
"loss": 1.2698,
"step": 550
},
{
"epoch": 0.2562326869806094,
"grad_norm": 3.958572886167977,
"learning_rate": 1.8552194921228793e-05,
"loss": 1.2293,
"step": 555
},
{
"epoch": 0.2585410895660203,
"grad_norm": 3.7553829117034767,
"learning_rate": 1.851014797850676e-05,
"loss": 1.2818,
"step": 560
},
{
"epoch": 0.2608494921514312,
"grad_norm": 4.352268879736511,
"learning_rate": 1.8467548257674453e-05,
"loss": 1.2552,
"step": 565
},
{
"epoch": 0.2631578947368421,
"grad_norm": 5.014139215739045,
"learning_rate": 1.8424398525803983e-05,
"loss": 1.2228,
"step": 570
},
{
"epoch": 0.265466297322253,
"grad_norm": 4.192590762422093,
"learning_rate": 1.8380701585693526e-05,
"loss": 1.2526,
"step": 575
},
{
"epoch": 0.2677746999076639,
"grad_norm": 4.209340122955672,
"learning_rate": 1.8336460275685267e-05,
"loss": 1.2681,
"step": 580
},
{
"epoch": 0.27008310249307477,
"grad_norm": 3.801129619164067,
"learning_rate": 1.8291677469481025e-05,
"loss": 1.2623,
"step": 585
},
{
"epoch": 0.2723915050784857,
"grad_norm": 5.60448449703679,
"learning_rate": 1.8246356075955594e-05,
"loss": 1.2778,
"step": 590
},
{
"epoch": 0.27469990766389657,
"grad_norm": 3.8415685450636143,
"learning_rate": 1.820049903896782e-05,
"loss": 1.2546,
"step": 595
},
{
"epoch": 0.2770083102493075,
"grad_norm": 3.766423848242755,
"learning_rate": 1.8154109337169326e-05,
"loss": 1.2994,
"step": 600
},
{
"epoch": 0.2793167128347184,
"grad_norm": 3.8445299977202363,
"learning_rate": 1.8107189983811094e-05,
"loss": 1.2779,
"step": 605
},
{
"epoch": 0.28162511542012925,
"grad_norm": 4.20182793655244,
"learning_rate": 1.8059744026547713e-05,
"loss": 1.2794,
"step": 610
},
{
"epoch": 0.2839335180055402,
"grad_norm": 3.6927184982852554,
"learning_rate": 1.8011774547239403e-05,
"loss": 1.2217,
"step": 615
},
{
"epoch": 0.28624192059095105,
"grad_norm": 3.906241578603264,
"learning_rate": 1.796328466175186e-05,
"loss": 1.3162,
"step": 620
},
{
"epoch": 0.288550323176362,
"grad_norm": 3.7221850266429675,
"learning_rate": 1.791427751975385e-05,
"loss": 1.2591,
"step": 625
},
{
"epoch": 0.29085872576177285,
"grad_norm": 4.11815775927983,
"learning_rate": 1.786475630451262e-05,
"loss": 1.2572,
"step": 630
},
{
"epoch": 0.2931671283471837,
"grad_norm": 3.8995508626898454,
"learning_rate": 1.781472423268713e-05,
"loss": 1.2604,
"step": 635
},
{
"epoch": 0.29547553093259465,
"grad_norm": 4.5219499712986035,
"learning_rate": 1.776418455411913e-05,
"loss": 1.298,
"step": 640
},
{
"epoch": 0.29778393351800553,
"grad_norm": 4.5899598168207785,
"learning_rate": 1.7713140551622032e-05,
"loss": 1.2664,
"step": 645
},
{
"epoch": 0.30009233610341646,
"grad_norm": 4.641570078800192,
"learning_rate": 1.7661595540767714e-05,
"loss": 1.2689,
"step": 650
},
{
"epoch": 0.30240073868882733,
"grad_norm": 4.383087991217795,
"learning_rate": 1.7609552869671126e-05,
"loss": 1.2551,
"step": 655
},
{
"epoch": 0.3047091412742382,
"grad_norm": 3.9687899547292576,
"learning_rate": 1.7557015918772822e-05,
"loss": 1.2379,
"step": 660
},
{
"epoch": 0.30701754385964913,
"grad_norm": 4.133840300932013,
"learning_rate": 1.750398810061939e-05,
"loss": 1.2779,
"step": 665
},
{
"epoch": 0.30932594644506,
"grad_norm": 3.84778329275165,
"learning_rate": 1.745047285964179e-05,
"loss": 1.2306,
"step": 670
},
{
"epoch": 0.31163434903047094,
"grad_norm": 4.054603771464119,
"learning_rate": 1.7396473671931597e-05,
"loss": 1.2089,
"step": 675
},
{
"epoch": 0.3139427516158818,
"grad_norm": 4.013882196193361,
"learning_rate": 1.7341994045015245e-05,
"loss": 1.2225,
"step": 680
},
{
"epoch": 0.3162511542012927,
"grad_norm": 4.076399340438248,
"learning_rate": 1.7287037517626174e-05,
"loss": 1.3166,
"step": 685
},
{
"epoch": 0.3185595567867036,
"grad_norm": 3.991144267549364,
"learning_rate": 1.7231607659474972e-05,
"loss": 1.2706,
"step": 690
},
{
"epoch": 0.3208679593721145,
"grad_norm": 3.592102167186549,
"learning_rate": 1.7175708071017503e-05,
"loss": 1.2066,
"step": 695
},
{
"epoch": 0.3231763619575254,
"grad_norm": 4.2490266329322655,
"learning_rate": 1.7119342383221055e-05,
"loss": 1.3011,
"step": 700
},
{
"epoch": 0.3254847645429363,
"grad_norm": 3.7487591296204266,
"learning_rate": 1.7062514257328474e-05,
"loss": 1.2587,
"step": 705
},
{
"epoch": 0.32779316712834716,
"grad_norm": 3.6111287365523466,
"learning_rate": 1.7005227384620336e-05,
"loss": 1.2626,
"step": 710
},
{
"epoch": 0.3301015697137581,
"grad_norm": 3.8624035554609892,
"learning_rate": 1.6947485486175223e-05,
"loss": 1.266,
"step": 715
},
{
"epoch": 0.33240997229916897,
"grad_norm": 4.191574332500623,
"learning_rate": 1.688929231262797e-05,
"loss": 1.2275,
"step": 720
},
{
"epoch": 0.3347183748845799,
"grad_norm": 3.931766819485826,
"learning_rate": 1.683065164392606e-05,
"loss": 1.2525,
"step": 725
},
{
"epoch": 0.33702677746999077,
"grad_norm": 3.8224846577065685,
"learning_rate": 1.6771567289084122e-05,
"loss": 1.228,
"step": 730
},
{
"epoch": 0.33933518005540164,
"grad_norm": 3.7975499971303024,
"learning_rate": 1.6712043085936473e-05,
"loss": 1.2121,
"step": 735
},
{
"epoch": 0.34164358264081257,
"grad_norm": 3.7233983105114326,
"learning_rate": 1.6652082900887858e-05,
"loss": 1.2439,
"step": 740
},
{
"epoch": 0.34395198522622344,
"grad_norm": 4.0496534376278674,
"learning_rate": 1.6591690628662305e-05,
"loss": 1.3064,
"step": 745
},
{
"epoch": 0.3462603878116344,
"grad_norm": 4.397682055950332,
"learning_rate": 1.6530870192050134e-05,
"loss": 1.2433,
"step": 750
},
{
"epoch": 0.34856879039704525,
"grad_norm": 3.999160650641557,
"learning_rate": 1.6469625541653152e-05,
"loss": 1.2117,
"step": 755
},
{
"epoch": 0.3508771929824561,
"grad_norm": 4.475385002364299,
"learning_rate": 1.6407960655628055e-05,
"loss": 1.203,
"step": 760
},
{
"epoch": 0.35318559556786705,
"grad_norm": 3.5042875341184416,
"learning_rate": 1.6345879539428e-05,
"loss": 1.2567,
"step": 765
},
{
"epoch": 0.3554939981532779,
"grad_norm": 3.678612416780679,
"learning_rate": 1.6283386225542467e-05,
"loss": 1.2276,
"step": 770
},
{
"epoch": 0.35780240073868885,
"grad_norm": 5.063348081613382,
"learning_rate": 1.622048477323529e-05,
"loss": 1.2297,
"step": 775
},
{
"epoch": 0.3601108033240997,
"grad_norm": 4.04397764374825,
"learning_rate": 1.6157179268281007e-05,
"loss": 1.2498,
"step": 780
},
{
"epoch": 0.3624192059095106,
"grad_norm": 3.7786600086660553,
"learning_rate": 1.6093473822699467e-05,
"loss": 1.2156,
"step": 785
},
{
"epoch": 0.36472760849492153,
"grad_norm": 3.726670143436363,
"learning_rate": 1.6029372574488732e-05,
"loss": 1.248,
"step": 790
},
{
"epoch": 0.3670360110803324,
"grad_norm": 3.6023664901819115,
"learning_rate": 1.5964879687356286e-05,
"loss": 1.2762,
"step": 795
},
{
"epoch": 0.36934441366574333,
"grad_norm": 3.684618843127009,
"learning_rate": 1.589999935044859e-05,
"loss": 1.2269,
"step": 800
},
{
"epoch": 0.3716528162511542,
"grad_norm": 3.6119834291134465,
"learning_rate": 1.5834735778078968e-05,
"loss": 1.2078,
"step": 805
},
{
"epoch": 0.3739612188365651,
"grad_norm": 3.66332363718426,
"learning_rate": 1.5769093209453876e-05,
"loss": 1.2713,
"step": 810
},
{
"epoch": 0.376269621421976,
"grad_norm": 4.137676249046753,
"learning_rate": 1.5703075908397523e-05,
"loss": 1.2816,
"step": 815
},
{
"epoch": 0.3785780240073869,
"grad_norm": 3.8481468093108475,
"learning_rate": 1.563668816307494e-05,
"loss": 1.2203,
"step": 820
},
{
"epoch": 0.3808864265927978,
"grad_norm": 3.7158307301305156,
"learning_rate": 1.556993428571342e-05,
"loss": 1.2163,
"step": 825
},
{
"epoch": 0.3831948291782087,
"grad_norm": 3.851222452502614,
"learning_rate": 1.550281861232243e-05,
"loss": 1.243,
"step": 830
},
{
"epoch": 0.38550323176361956,
"grad_norm": 3.6817891692377978,
"learning_rate": 1.5435345502411956e-05,
"loss": 1.2821,
"step": 835
},
{
"epoch": 0.3878116343490305,
"grad_norm": 3.9683025462284998,
"learning_rate": 1.536751933870934e-05,
"loss": 1.2019,
"step": 840
},
{
"epoch": 0.39012003693444136,
"grad_norm": 3.94265762295689,
"learning_rate": 1.5299344526874576e-05,
"loss": 1.2774,
"step": 845
},
{
"epoch": 0.39242843951985223,
"grad_norm": 4.123641725136207,
"learning_rate": 1.5230825495214184e-05,
"loss": 1.2352,
"step": 850
},
{
"epoch": 0.39473684210526316,
"grad_norm": 3.9570109790957653,
"learning_rate": 1.5161966694393516e-05,
"loss": 1.215,
"step": 855
},
{
"epoch": 0.39704524469067404,
"grad_norm": 3.6427091867450714,
"learning_rate": 1.5092772597147707e-05,
"loss": 1.2202,
"step": 860
},
{
"epoch": 0.39935364727608497,
"grad_norm": 3.8425754107191796,
"learning_rate": 1.5023247697991114e-05,
"loss": 1.2432,
"step": 865
},
{
"epoch": 0.40166204986149584,
"grad_norm": 3.759319372367797,
"learning_rate": 1.4953396512925398e-05,
"loss": 1.1838,
"step": 870
},
{
"epoch": 0.4039704524469067,
"grad_norm": 3.872324982369786,
"learning_rate": 1.4883223579146167e-05,
"loss": 1.2331,
"step": 875
},
{
"epoch": 0.40627885503231764,
"grad_norm": 3.8616658245003435,
"learning_rate": 1.4812733454748283e-05,
"loss": 1.2277,
"step": 880
},
{
"epoch": 0.4085872576177285,
"grad_norm": 3.5624714154298163,
"learning_rate": 1.4741930718429772e-05,
"loss": 1.2051,
"step": 885
},
{
"epoch": 0.41089566020313945,
"grad_norm": 3.6961173549363924,
"learning_rate": 1.4670819969194416e-05,
"loss": 1.2309,
"step": 890
},
{
"epoch": 0.4132040627885503,
"grad_norm": 3.5654510220296847,
"learning_rate": 1.4599405826053039e-05,
"loss": 1.1884,
"step": 895
},
{
"epoch": 0.4155124653739612,
"grad_norm": 4.205884899208378,
"learning_rate": 1.4527692927723465e-05,
"loss": 1.2223,
"step": 900
},
{
"epoch": 0.4178208679593721,
"grad_norm": 3.9431786244545997,
"learning_rate": 1.4455685932329204e-05,
"loss": 1.2389,
"step": 905
},
{
"epoch": 0.420129270544783,
"grad_norm": 3.579703652121505,
"learning_rate": 1.4383389517096899e-05,
"loss": 1.2429,
"step": 910
},
{
"epoch": 0.4224376731301939,
"grad_norm": 3.7807582830713105,
"learning_rate": 1.4310808378052506e-05,
"loss": 1.1874,
"step": 915
},
{
"epoch": 0.4247460757156048,
"grad_norm": 3.9020463886513914,
"learning_rate": 1.4237947229716262e-05,
"loss": 1.2587,
"step": 920
},
{
"epoch": 0.42705447830101567,
"grad_norm": 3.7663448915088633,
"learning_rate": 1.4164810804796464e-05,
"loss": 1.184,
"step": 925
},
{
"epoch": 0.4293628808864266,
"grad_norm": 3.7907471270783937,
"learning_rate": 1.409140385388203e-05,
"loss": 1.2445,
"step": 930
},
{
"epoch": 0.4316712834718375,
"grad_norm": 3.791543245723202,
"learning_rate": 1.4017731145133955e-05,
"loss": 1.2527,
"step": 935
},
{
"epoch": 0.4339796860572484,
"grad_norm": 3.8566751713668666,
"learning_rate": 1.3943797463975575e-05,
"loss": 1.2048,
"step": 940
},
{
"epoch": 0.4362880886426593,
"grad_norm": 3.943257567360323,
"learning_rate": 1.3869607612781733e-05,
"loss": 1.2773,
"step": 945
},
{
"epoch": 0.43859649122807015,
"grad_norm": 3.53206021655625,
"learning_rate": 1.3795166410566834e-05,
"loss": 1.2066,
"step": 950
},
{
"epoch": 0.4409048938134811,
"grad_norm": 3.8322607840339504,
"learning_rate": 1.372047869267184e-05,
"loss": 1.2104,
"step": 955
},
{
"epoch": 0.44321329639889195,
"grad_norm": 4.982802180271467,
"learning_rate": 1.364554931045018e-05,
"loss": 1.2782,
"step": 960
},
{
"epoch": 0.4455216989843029,
"grad_norm": 4.121927772157904,
"learning_rate": 1.3570383130952627e-05,
"loss": 1.2221,
"step": 965
},
{
"epoch": 0.44783010156971376,
"grad_norm": 3.5401426054616674,
"learning_rate": 1.349498503661116e-05,
"loss": 1.249,
"step": 970
},
{
"epoch": 0.45013850415512463,
"grad_norm": 3.8347876039826647,
"learning_rate": 1.3419359924921833e-05,
"loss": 1.2736,
"step": 975
},
{
"epoch": 0.45244690674053556,
"grad_norm": 4.86416192250325,
"learning_rate": 1.3343512708126642e-05,
"loss": 1.2032,
"step": 980
},
{
"epoch": 0.45475530932594643,
"grad_norm": 3.8508803970513004,
"learning_rate": 1.326744831289447e-05,
"loss": 1.2465,
"step": 985
},
{
"epoch": 0.45706371191135736,
"grad_norm": 3.276661833625774,
"learning_rate": 1.3191171680001048e-05,
"loss": 1.1905,
"step": 990
},
{
"epoch": 0.45937211449676824,
"grad_norm": 3.6488550777243933,
"learning_rate": 1.3114687764008048e-05,
"loss": 1.1991,
"step": 995
},
{
"epoch": 0.4616805170821791,
"grad_norm": 3.9637997706000223,
"learning_rate": 1.3038001532941249e-05,
"loss": 1.1994,
"step": 1000
},
{
"epoch": 0.46398891966759004,
"grad_norm": 3.7798295608326447,
"learning_rate": 1.2961117967967844e-05,
"loss": 1.2327,
"step": 1005
},
{
"epoch": 0.4662973222530009,
"grad_norm": 3.742363753899004,
"learning_rate": 1.2884042063072881e-05,
"loss": 1.2415,
"step": 1010
},
{
"epoch": 0.46860572483841184,
"grad_norm": 4.00995610689072,
"learning_rate": 1.280677882473488e-05,
"loss": 1.2449,
"step": 1015
},
{
"epoch": 0.4709141274238227,
"grad_norm": 3.7802768150285284,
"learning_rate": 1.272933327160063e-05,
"loss": 1.2055,
"step": 1020
},
{
"epoch": 0.4732225300092336,
"grad_norm": 3.979719082398227,
"learning_rate": 1.2651710434159223e-05,
"loss": 1.1452,
"step": 1025
},
{
"epoch": 0.4755309325946445,
"grad_norm": 3.7987734509998012,
"learning_rate": 1.2573915354415274e-05,
"loss": 1.2266,
"step": 1030
},
{
"epoch": 0.4778393351800554,
"grad_norm": 3.4449265105850344,
"learning_rate": 1.2495953085561426e-05,
"loss": 1.1678,
"step": 1035
},
{
"epoch": 0.4801477377654663,
"grad_norm": 4.703831538180476,
"learning_rate": 1.241782869165012e-05,
"loss": 1.1893,
"step": 1040
},
{
"epoch": 0.4824561403508772,
"grad_norm": 3.56138065098868,
"learning_rate": 1.2339547247264658e-05,
"loss": 1.2285,
"step": 1045
},
{
"epoch": 0.48476454293628807,
"grad_norm": 3.8664090630676147,
"learning_rate": 1.2261113837189587e-05,
"loss": 1.1995,
"step": 1050
},
{
"epoch": 0.487072945521699,
"grad_norm": 3.6587622685467553,
"learning_rate": 1.2182533556080402e-05,
"loss": 1.2456,
"step": 1055
},
{
"epoch": 0.48938134810710987,
"grad_norm": 3.4219623018934615,
"learning_rate": 1.2103811508132642e-05,
"loss": 1.1904,
"step": 1060
},
{
"epoch": 0.4916897506925208,
"grad_norm": 3.91141223990254,
"learning_rate": 1.2024952806750321e-05,
"loss": 1.1811,
"step": 1065
},
{
"epoch": 0.4939981532779317,
"grad_norm": 3.707066130468398,
"learning_rate": 1.1945962574213814e-05,
"loss": 1.212,
"step": 1070
},
{
"epoch": 0.49630655586334255,
"grad_norm": 3.5782501836947653,
"learning_rate": 1.1866845941347118e-05,
"loss": 1.2255,
"step": 1075
},
{
"epoch": 0.4986149584487535,
"grad_norm": 4.303350644777213,
"learning_rate": 1.1787608047184583e-05,
"loss": 1.1376,
"step": 1080
},
{
"epoch": 0.5009233610341643,
"grad_norm": 3.419543860379626,
"learning_rate": 1.1708254038637115e-05,
"loss": 1.1872,
"step": 1085
},
{
"epoch": 0.5032317636195752,
"grad_norm": 3.586294780528409,
"learning_rate": 1.1628789070157836e-05,
"loss": 1.2114,
"step": 1090
},
{
"epoch": 0.5055401662049861,
"grad_norm": 3.6647616517214496,
"learning_rate": 1.1549218303407305e-05,
"loss": 1.2088,
"step": 1095
},
{
"epoch": 0.5078485687903971,
"grad_norm": 3.6209405687157794,
"learning_rate": 1.1469546906918219e-05,
"loss": 1.1535,
"step": 1100
},
{
"epoch": 0.510156971375808,
"grad_norm": 3.4760951984933777,
"learning_rate": 1.1389780055759689e-05,
"loss": 1.1692,
"step": 1105
},
{
"epoch": 0.5124653739612188,
"grad_norm": 3.523587148397925,
"learning_rate": 1.1309922931201114e-05,
"loss": 1.1795,
"step": 1110
},
{
"epoch": 0.5147737765466297,
"grad_norm": 3.399747435026194,
"learning_rate": 1.1229980720375609e-05,
"loss": 1.1913,
"step": 1115
},
{
"epoch": 0.5170821791320406,
"grad_norm": 3.802970464768176,
"learning_rate": 1.114995861594308e-05,
"loss": 1.1692,
"step": 1120
},
{
"epoch": 0.5193905817174516,
"grad_norm": 3.571347595436078,
"learning_rate": 1.1069861815752944e-05,
"loss": 1.1575,
"step": 1125
},
{
"epoch": 0.5216989843028624,
"grad_norm": 3.702241350827994,
"learning_rate": 1.0989695522506486e-05,
"loss": 1.1776,
"step": 1130
},
{
"epoch": 0.5240073868882733,
"grad_norm": 4.396145181294285,
"learning_rate": 1.0909464943418926e-05,
"loss": 1.2055,
"step": 1135
},
{
"epoch": 0.5263157894736842,
"grad_norm": 3.402649511273165,
"learning_rate": 1.0829175289881188e-05,
"loss": 1.2024,
"step": 1140
},
{
"epoch": 0.528624192059095,
"grad_norm": 3.321901777095843,
"learning_rate": 1.074883177712138e-05,
"loss": 1.1317,
"step": 1145
},
{
"epoch": 0.530932594644506,
"grad_norm": 4.575011114858196,
"learning_rate": 1.0668439623866043e-05,
"loss": 1.1516,
"step": 1150
},
{
"epoch": 0.5332409972299169,
"grad_norm": 3.428811319179132,
"learning_rate": 1.0588004052001177e-05,
"loss": 1.1326,
"step": 1155
},
{
"epoch": 0.5355493998153278,
"grad_norm": 3.758823500740248,
"learning_rate": 1.0507530286233042e-05,
"loss": 1.1523,
"step": 1160
},
{
"epoch": 0.5378578024007387,
"grad_norm": 3.828420445656179,
"learning_rate": 1.0427023553748792e-05,
"loss": 1.215,
"step": 1165
},
{
"epoch": 0.5401662049861495,
"grad_norm": 3.872474623427253,
"learning_rate": 1.0346489083876928e-05,
"loss": 1.1798,
"step": 1170
},
{
"epoch": 0.5424746075715605,
"grad_norm": 4.343223419966708,
"learning_rate": 1.0265932107747656e-05,
"loss": 1.1964,
"step": 1175
},
{
"epoch": 0.5447830101569714,
"grad_norm": 3.4458152638291533,
"learning_rate": 1.0185357857953064e-05,
"loss": 1.188,
"step": 1180
},
{
"epoch": 0.5470914127423823,
"grad_norm": 3.3343026801443765,
"learning_rate": 1.0104771568207266e-05,
"loss": 1.1524,
"step": 1185
},
{
"epoch": 0.5493998153277931,
"grad_norm": 3.8325280372919774,
"learning_rate": 1.0024178473006418e-05,
"loss": 1.1445,
"step": 1190
},
{
"epoch": 0.551708217913204,
"grad_norm": 3.913934401934443,
"learning_rate": 9.943583807288746e-06,
"loss": 1.1497,
"step": 1195
},
{
"epoch": 0.554016620498615,
"grad_norm": 3.8771337742661585,
"learning_rate": 9.862992806094473e-06,
"loss": 1.1584,
"step": 1200
},
{
"epoch": 0.5563250230840259,
"grad_norm": 3.385706053842486,
"learning_rate": 9.782410704225793e-06,
"loss": 1.133,
"step": 1205
},
{
"epoch": 0.5586334256694367,
"grad_norm": 3.228558572718497,
"learning_rate": 9.701842735906855e-06,
"loss": 1.1714,
"step": 1210
},
{
"epoch": 0.5609418282548476,
"grad_norm": 3.376489834368575,
"learning_rate": 9.621294134443747e-06,
"loss": 1.1782,
"step": 1215
},
{
"epoch": 0.5632502308402585,
"grad_norm": 4.101023970778267,
"learning_rate": 9.54077013188459e-06,
"loss": 1.1679,
"step": 1220
},
{
"epoch": 0.5655586334256695,
"grad_norm": 3.459693677788322,
"learning_rate": 9.460275958679674e-06,
"loss": 1.2272,
"step": 1225
},
{
"epoch": 0.5678670360110804,
"grad_norm": 3.5741244509053556,
"learning_rate": 9.379816843341715e-06,
"loss": 1.1679,
"step": 1230
},
{
"epoch": 0.5701754385964912,
"grad_norm": 14.959841662019736,
"learning_rate": 9.299398012106246e-06,
"loss": 1.1557,
"step": 1235
},
{
"epoch": 0.5724838411819021,
"grad_norm": 3.479142794568544,
"learning_rate": 9.219024688592136e-06,
"loss": 1.191,
"step": 1240
},
{
"epoch": 0.574792243767313,
"grad_norm": 3.4791994405128195,
"learning_rate": 9.138702093462286e-06,
"loss": 1.1632,
"step": 1245
},
{
"epoch": 0.577100646352724,
"grad_norm": 3.378297795269278,
"learning_rate": 9.058435444084543e-06,
"loss": 1.2058,
"step": 1250
},
{
"epoch": 0.5794090489381348,
"grad_norm": 3.3312286796444948,
"learning_rate": 8.978229954192775e-06,
"loss": 1.2072,
"step": 1255
},
{
"epoch": 0.5817174515235457,
"grad_norm": 3.2936946867277497,
"learning_rate": 8.898090833548226e-06,
"loss": 1.1479,
"step": 1260
},
{
"epoch": 0.5840258541089566,
"grad_norm": 3.5657195698986306,
"learning_rate": 8.818023287601117e-06,
"loss": 1.1579,
"step": 1265
},
{
"epoch": 0.5863342566943675,
"grad_norm": 3.85534125869907,
"learning_rate": 8.738032517152523e-06,
"loss": 1.1748,
"step": 1270
},
{
"epoch": 0.5886426592797784,
"grad_norm": 3.3807308381583585,
"learning_rate": 8.658123718016548e-06,
"loss": 1.1365,
"step": 1275
},
{
"epoch": 0.5909510618651893,
"grad_norm": 3.75547737356039,
"learning_rate": 8.578302080682844e-06,
"loss": 1.1657,
"step": 1280
},
{
"epoch": 0.5932594644506002,
"grad_norm": 3.334058557259955,
"learning_rate": 8.498572789979446e-06,
"loss": 1.1653,
"step": 1285
},
{
"epoch": 0.5955678670360111,
"grad_norm": 3.596795067568704,
"learning_rate": 8.418941024735997e-06,
"loss": 1.1909,
"step": 1290
},
{
"epoch": 0.5978762696214219,
"grad_norm": 3.754106205642103,
"learning_rate": 8.33941195744737e-06,
"loss": 1.1595,
"step": 1295
},
{
"epoch": 0.6001846722068329,
"grad_norm": 3.3575559431036988,
"learning_rate": 8.259990753937662e-06,
"loss": 1.1378,
"step": 1300
},
{
"epoch": 0.6024930747922438,
"grad_norm": 4.011372021010383,
"learning_rate": 8.18068257302466e-06,
"loss": 1.1832,
"step": 1305
},
{
"epoch": 0.6048014773776547,
"grad_norm": 3.404379906541828,
"learning_rate": 8.101492566184757e-06,
"loss": 1.1592,
"step": 1310
},
{
"epoch": 0.6071098799630655,
"grad_norm": 3.498132121516362,
"learning_rate": 8.022425877218321e-06,
"loss": 1.1591,
"step": 1315
},
{
"epoch": 0.6094182825484764,
"grad_norm": 3.523586580045349,
"learning_rate": 7.943487641915595e-06,
"loss": 1.1525,
"step": 1320
},
{
"epoch": 0.6117266851338874,
"grad_norm": 3.6229726894839858,
"learning_rate": 7.864682987723082e-06,
"loss": 1.1618,
"step": 1325
},
{
"epoch": 0.6140350877192983,
"grad_norm": 3.696989469787097,
"learning_rate": 7.78601703341051e-06,
"loss": 1.1824,
"step": 1330
},
{
"epoch": 0.6163434903047091,
"grad_norm": 3.567967173775001,
"learning_rate": 7.70749488873833e-06,
"loss": 1.1792,
"step": 1335
},
{
"epoch": 0.61865189289012,
"grad_norm": 3.399928397766497,
"learning_rate": 7.629121654125808e-06,
"loss": 1.1438,
"step": 1340
},
{
"epoch": 0.6209602954755309,
"grad_norm": 3.6344006441397414,
"learning_rate": 7.550902420319742e-06,
"loss": 1.1591,
"step": 1345
},
{
"epoch": 0.6232686980609419,
"grad_norm": 3.538106316840523,
"learning_rate": 7.472842268063776e-06,
"loss": 1.1311,
"step": 1350
},
{
"epoch": 0.6255771006463527,
"grad_norm": 3.661558906665894,
"learning_rate": 7.394946267768381e-06,
"loss": 1.1621,
"step": 1355
},
{
"epoch": 0.6278855032317636,
"grad_norm": 3.6197107279149954,
"learning_rate": 7.317219479181517e-06,
"loss": 1.1028,
"step": 1360
},
{
"epoch": 0.6301939058171745,
"grad_norm": 3.4094252840241355,
"learning_rate": 7.23966695105996e-06,
"loss": 1.119,
"step": 1365
},
{
"epoch": 0.6325023084025854,
"grad_norm": 3.4085855538144467,
"learning_rate": 7.162293720841378e-06,
"loss": 1.1438,
"step": 1370
},
{
"epoch": 0.6348107109879964,
"grad_norm": 4.073406312500022,
"learning_rate": 7.085104814317101e-06,
"loss": 1.1729,
"step": 1375
},
{
"epoch": 0.6371191135734072,
"grad_norm": 3.572178264074241,
"learning_rate": 7.008105245305699e-06,
"loss": 1.1661,
"step": 1380
},
{
"epoch": 0.6394275161588181,
"grad_norm": 3.81528951625221,
"learning_rate": 6.931300015327274e-06,
"loss": 1.1571,
"step": 1385
},
{
"epoch": 0.641735918744229,
"grad_norm": 3.2846636335941763,
"learning_rate": 6.854694113278614e-06,
"loss": 1.154,
"step": 1390
},
{
"epoch": 0.6440443213296398,
"grad_norm": 3.2544013227776007,
"learning_rate": 6.7782925151091224e-06,
"loss": 1.0823,
"step": 1395
},
{
"epoch": 0.6463527239150508,
"grad_norm": 3.482450898904014,
"learning_rate": 6.702100183497613e-06,
"loss": 1.1803,
"step": 1400
},
{
"epoch": 0.6486611265004617,
"grad_norm": 3.412256349030684,
"learning_rate": 6.62612206752995e-06,
"loss": 1.1643,
"step": 1405
},
{
"epoch": 0.6509695290858726,
"grad_norm": 3.75196899322532,
"learning_rate": 6.550363102377588e-06,
"loss": 1.1117,
"step": 1410
},
{
"epoch": 0.6532779316712835,
"grad_norm": 3.3485189294016258,
"learning_rate": 6.474828208976998e-06,
"loss": 1.1466,
"step": 1415
},
{
"epoch": 0.6555863342566943,
"grad_norm": 3.4421443761863104,
"learning_rate": 6.3995222937100455e-06,
"loss": 1.1468,
"step": 1420
},
{
"epoch": 0.6578947368421053,
"grad_norm": 3.4653107797221683,
"learning_rate": 6.324450248085265e-06,
"loss": 1.1418,
"step": 1425
},
{
"epoch": 0.6602031394275162,
"grad_norm": 3.450235228111911,
"learning_rate": 6.249616948420161e-06,
"loss": 1.1393,
"step": 1430
},
{
"epoch": 0.6625115420129271,
"grad_norm": 3.648594332616919,
"learning_rate": 6.175027255524446e-06,
"loss": 1.1263,
"step": 1435
},
{
"epoch": 0.6648199445983379,
"grad_norm": 3.50804118935427,
"learning_rate": 6.100686014384315e-06,
"loss": 1.1497,
"step": 1440
},
{
"epoch": 0.6671283471837488,
"grad_norm": 3.407303145023877,
"learning_rate": 6.026598053847743e-06,
"loss": 1.1217,
"step": 1445
},
{
"epoch": 0.6694367497691598,
"grad_norm": 3.6049741156075426,
"learning_rate": 5.952768186310813e-06,
"loss": 1.2134,
"step": 1450
},
{
"epoch": 0.6717451523545707,
"grad_norm": 3.347553717603198,
"learning_rate": 5.879201207405136e-06,
"loss": 1.1189,
"step": 1455
},
{
"epoch": 0.6740535549399815,
"grad_norm": 3.7624263901785087,
"learning_rate": 5.805901895686344e-06,
"loss": 1.1217,
"step": 1460
},
{
"epoch": 0.6763619575253924,
"grad_norm": 3.6359056480115193,
"learning_rate": 5.732875012323712e-06,
"loss": 1.1275,
"step": 1465
},
{
"epoch": 0.6786703601108033,
"grad_norm": 3.5660085050284946,
"learning_rate": 5.660125300790873e-06,
"loss": 1.153,
"step": 1470
},
{
"epoch": 0.6809787626962143,
"grad_norm": 3.4188915438262946,
"learning_rate": 5.58765748655772e-06,
"loss": 1.126,
"step": 1475
},
{
"epoch": 0.6832871652816251,
"grad_norm": 3.7409360766713995,
"learning_rate": 5.5154762767834605e-06,
"loss": 1.1312,
"step": 1480
},
{
"epoch": 0.685595567867036,
"grad_norm": 3.5176838276710787,
"learning_rate": 5.443586360010859e-06,
"loss": 1.118,
"step": 1485
},
{
"epoch": 0.6879039704524469,
"grad_norm": 3.940112737940071,
"learning_rate": 5.3719924058616975e-06,
"loss": 1.1084,
"step": 1490
},
{
"epoch": 0.6902123730378578,
"grad_norm": 3.5725656073039516,
"learning_rate": 5.30069906473345e-06,
"loss": 1.1462,
"step": 1495
},
{
"epoch": 0.6925207756232687,
"grad_norm": 3.585328985764251,
"learning_rate": 5.2297109674972166e-06,
"loss": 1.1275,
"step": 1500
},
{
"epoch": 0.6948291782086796,
"grad_norm": 3.7150630899084276,
"learning_rate": 5.159032725196946e-06,
"loss": 1.1573,
"step": 1505
},
{
"epoch": 0.6971375807940905,
"grad_norm": 3.4991531847637893,
"learning_rate": 5.088668928749891e-06,
"loss": 1.1339,
"step": 1510
},
{
"epoch": 0.6994459833795014,
"grad_norm": 3.337315702796277,
"learning_rate": 5.0186241486484245e-06,
"loss": 1.1121,
"step": 1515
},
{
"epoch": 0.7017543859649122,
"grad_norm": 3.166495977462906,
"learning_rate": 4.948902934663158e-06,
"loss": 1.1207,
"step": 1520
},
{
"epoch": 0.7040627885503232,
"grad_norm": 3.269883473204096,
"learning_rate": 4.879509815547413e-06,
"loss": 1.1067,
"step": 1525
},
{
"epoch": 0.7063711911357341,
"grad_norm": 3.2549683491943138,
"learning_rate": 4.810449298743051e-06,
"loss": 1.0858,
"step": 1530
},
{
"epoch": 0.708679593721145,
"grad_norm": 3.673192940396545,
"learning_rate": 4.741725870087693e-06,
"loss": 1.1674,
"step": 1535
},
{
"epoch": 0.7109879963065558,
"grad_norm": 3.295243146355197,
"learning_rate": 4.673343993523347e-06,
"loss": 1.1087,
"step": 1540
},
{
"epoch": 0.7132963988919667,
"grad_norm": 3.4162872942710867,
"learning_rate": 4.605308110806436e-06,
"loss": 1.1224,
"step": 1545
},
{
"epoch": 0.7156048014773777,
"grad_norm": 3.3989883160652865,
"learning_rate": 4.537622641219309e-06,
"loss": 1.1307,
"step": 1550
},
{
"epoch": 0.7179132040627886,
"grad_norm": 3.2956559559454663,
"learning_rate": 4.47029198128316e-06,
"loss": 1.0944,
"step": 1555
},
{
"epoch": 0.7202216066481995,
"grad_norm": 3.3797718456778765,
"learning_rate": 4.403320504472463e-06,
"loss": 1.1426,
"step": 1560
},
{
"epoch": 0.7225300092336103,
"grad_norm": 3.1832339015639826,
"learning_rate": 4.336712560930891e-06,
"loss": 1.1223,
"step": 1565
},
{
"epoch": 0.7248384118190212,
"grad_norm": 3.40273969921815,
"learning_rate": 4.270472477188755e-06,
"loss": 1.1151,
"step": 1570
},
{
"epoch": 0.7271468144044322,
"grad_norm": 3.32953363908172,
"learning_rate": 4.204604555881967e-06,
"loss": 1.1055,
"step": 1575
},
{
"epoch": 0.7294552169898431,
"grad_norm": 3.363759228103856,
"learning_rate": 4.139113075472565e-06,
"loss": 1.15,
"step": 1580
},
{
"epoch": 0.7317636195752539,
"grad_norm": 3.5692625205390214,
"learning_rate": 4.074002289970801e-06,
"loss": 1.1249,
"step": 1585
},
{
"epoch": 0.7340720221606648,
"grad_norm": 3.6298117912857895,
"learning_rate": 4.009276428658836e-06,
"loss": 1.0911,
"step": 1590
},
{
"epoch": 0.7363804247460757,
"grad_norm": 3.501911680130801,
"learning_rate": 3.944939695816005e-06,
"loss": 1.0591,
"step": 1595
},
{
"epoch": 0.7386888273314867,
"grad_norm": 3.314254645913856,
"learning_rate": 3.8809962704457375e-06,
"loss": 1.122,
"step": 1600
},
{
"epoch": 0.7409972299168975,
"grad_norm": 3.56145944415269,
"learning_rate": 3.81745030600411e-06,
"loss": 1.1036,
"step": 1605
},
{
"epoch": 0.7433056325023084,
"grad_norm": 3.4910849192084235,
"learning_rate": 3.75430593013006e-06,
"loss": 1.1353,
"step": 1610
},
{
"epoch": 0.7456140350877193,
"grad_norm": 3.325715619787326,
"learning_rate": 3.6915672443772644e-06,
"loss": 1.1538,
"step": 1615
},
{
"epoch": 0.7479224376731302,
"grad_norm": 3.5950013679874724,
"learning_rate": 3.62923832394774e-06,
"loss": 1.0909,
"step": 1620
},
{
"epoch": 0.7502308402585411,
"grad_norm": 3.1524005532212334,
"learning_rate": 3.56732321742712e-06,
"loss": 1.1125,
"step": 1625
},
{
"epoch": 0.752539242843952,
"grad_norm": 3.6760451234626124,
"learning_rate": 3.5058259465216828e-06,
"loss": 1.1039,
"step": 1630
},
{
"epoch": 0.7548476454293629,
"grad_norm": 3.341546948891595,
"learning_rate": 3.444750505797123e-06,
"loss": 1.0531,
"step": 1635
},
{
"epoch": 0.7571560480147738,
"grad_norm": 3.35627649123262,
"learning_rate": 3.384100862419096e-06,
"loss": 1.0931,
"step": 1640
},
{
"epoch": 0.7594644506001846,
"grad_norm": 3.6221419833131527,
"learning_rate": 3.3238809558955054e-06,
"loss": 1.0797,
"step": 1645
},
{
"epoch": 0.7617728531855956,
"grad_norm": 3.3487671296828267,
"learning_rate": 3.2640946978206266e-06,
"loss": 1.0812,
"step": 1650
},
{
"epoch": 0.7640812557710065,
"grad_norm": 3.441031645390376,
"learning_rate": 3.2047459716210306e-06,
"loss": 1.1155,
"step": 1655
},
{
"epoch": 0.7663896583564174,
"grad_norm": 3.4825057106301096,
"learning_rate": 3.145838632303325e-06,
"loss": 1.096,
"step": 1660
},
{
"epoch": 0.7686980609418282,
"grad_norm": 3.4525699686491875,
"learning_rate": 3.087376506203763e-06,
"loss": 1.145,
"step": 1665
},
{
"epoch": 0.7710064635272391,
"grad_norm": 3.2639030957505715,
"learning_rate": 3.0293633907396903e-06,
"loss": 1.0711,
"step": 1670
},
{
"epoch": 0.7733148661126501,
"grad_norm": 3.247147491878351,
"learning_rate": 2.971803054162903e-06,
"loss": 1.0367,
"step": 1675
},
{
"epoch": 0.775623268698061,
"grad_norm": 3.3628039668359824,
"learning_rate": 2.914699235314855e-06,
"loss": 1.1311,
"step": 1680
},
{
"epoch": 0.7779316712834718,
"grad_norm": 3.294560766749018,
"learning_rate": 2.858055643383818e-06,
"loss": 1.1303,
"step": 1685
},
{
"epoch": 0.7802400738688827,
"grad_norm": 3.252460881051861,
"learning_rate": 2.8018759576639478e-06,
"loss": 1.0894,
"step": 1690
},
{
"epoch": 0.7825484764542936,
"grad_norm": 3.6541818791755083,
"learning_rate": 2.7461638273162895e-06,
"loss": 1.1416,
"step": 1695
},
{
"epoch": 0.7848568790397045,
"grad_norm": 3.3018114290440286,
"learning_rate": 2.6909228711317526e-06,
"loss": 1.0898,
"step": 1700
},
{
"epoch": 0.7871652816251155,
"grad_norm": 3.5110479717681704,
"learning_rate": 2.6361566772960466e-06,
"loss": 1.0887,
"step": 1705
},
{
"epoch": 0.7894736842105263,
"grad_norm": 3.469571849173682,
"learning_rate": 2.5818688031566132e-06,
"loss": 1.0182,
"step": 1710
},
{
"epoch": 0.7917820867959372,
"grad_norm": 3.761287355693432,
"learning_rate": 2.5280627749915544e-06,
"loss": 1.1246,
"step": 1715
},
{
"epoch": 0.7940904893813481,
"grad_norm": 3.7171990367681866,
"learning_rate": 2.4747420877805905e-06,
"loss": 1.1008,
"step": 1720
},
{
"epoch": 0.796398891966759,
"grad_norm": 3.583342537171837,
"learning_rate": 2.421910204978033e-06,
"loss": 1.092,
"step": 1725
},
{
"epoch": 0.7987072945521699,
"grad_norm": 3.3105866570237343,
"learning_rate": 2.369570558287819e-06,
"loss": 1.0495,
"step": 1730
},
{
"epoch": 0.8010156971375808,
"grad_norm": 3.453250654565143,
"learning_rate": 2.3177265474406084e-06,
"loss": 1.0952,
"step": 1735
},
{
"epoch": 0.8033240997229917,
"grad_norm": 3.2111312681793294,
"learning_rate": 2.2663815399729495e-06,
"loss": 1.0756,
"step": 1740
},
{
"epoch": 0.8056325023084026,
"grad_norm": 3.398739502823191,
"learning_rate": 2.215538871008538e-06,
"loss": 1.0855,
"step": 1745
},
{
"epoch": 0.8079409048938134,
"grad_norm": 3.4089573083048883,
"learning_rate": 2.1652018430415923e-06,
"loss": 1.0707,
"step": 1750
},
{
"epoch": 0.8102493074792244,
"grad_norm": 3.7996382043873744,
"learning_rate": 2.115373725722326e-06,
"loss": 1.1419,
"step": 1755
},
{
"epoch": 0.8125577100646353,
"grad_norm": 3.4303103622199203,
"learning_rate": 2.066057755644587e-06,
"loss": 1.1101,
"step": 1760
},
{
"epoch": 0.8148661126500462,
"grad_norm": 3.3758394994097363,
"learning_rate": 2.0172571361356007e-06,
"loss": 1.0975,
"step": 1765
},
{
"epoch": 0.817174515235457,
"grad_norm": 3.2901551940425673,
"learning_rate": 1.9689750370479134e-06,
"loss": 1.0797,
"step": 1770
},
{
"epoch": 0.8194829178208679,
"grad_norm": 3.661068632899665,
"learning_rate": 1.921214594553488e-06,
"loss": 1.1287,
"step": 1775
},
{
"epoch": 0.8217913204062789,
"grad_norm": 3.5442080312978415,
"learning_rate": 1.8739789109399954e-06,
"loss": 1.1514,
"step": 1780
},
{
"epoch": 0.8240997229916898,
"grad_norm": 3.3534741257777325,
"learning_rate": 1.8272710544093019e-06,
"loss": 1.0824,
"step": 1785
},
{
"epoch": 0.8264081255771006,
"grad_norm": 3.570055818522298,
"learning_rate": 1.7810940588781811e-06,
"loss": 1.1313,
"step": 1790
},
{
"epoch": 0.8287165281625115,
"grad_norm": 3.3907592881825352,
"learning_rate": 1.7354509237812334e-06,
"loss": 1.0458,
"step": 1795
},
{
"epoch": 0.8310249307479224,
"grad_norm": 3.7660635086416794,
"learning_rate": 1.690344613876066e-06,
"loss": 1.109,
"step": 1800
},
{
"epoch": 0.8333333333333334,
"grad_norm": 20.624336407323348,
"learning_rate": 1.64577805905072e-06,
"loss": 1.0872,
"step": 1805
},
{
"epoch": 0.8356417359187442,
"grad_norm": 3.38434013035599,
"learning_rate": 1.601754154133347e-06,
"loss": 1.0943,
"step": 1810
},
{
"epoch": 0.8379501385041551,
"grad_norm": 3.3282071197431318,
"learning_rate": 1.558275758704183e-06,
"loss": 1.0983,
"step": 1815
},
{
"epoch": 0.840258541089566,
"grad_norm": 3.4156960745203286,
"learning_rate": 1.5153456969098013e-06,
"loss": 1.0381,
"step": 1820
},
{
"epoch": 0.8425669436749769,
"grad_norm": 3.3418973703656274,
"learning_rate": 1.4729667572796735e-06,
"loss": 1.1452,
"step": 1825
},
{
"epoch": 0.8448753462603878,
"grad_norm": 3.333897377962453,
"learning_rate": 1.431141692545036e-06,
"loss": 1.1076,
"step": 1830
},
{
"epoch": 0.8471837488457987,
"grad_norm": 3.402941306050666,
"learning_rate": 1.389873219460085e-06,
"loss": 1.0869,
"step": 1835
},
{
"epoch": 0.8494921514312096,
"grad_norm": 3.3313186519496423,
"learning_rate": 1.349164018625513e-06,
"loss": 1.0765,
"step": 1840
},
{
"epoch": 0.8518005540166205,
"grad_norm": 3.6011720414080566,
"learning_rate": 1.3090167343143911e-06,
"loss": 1.0846,
"step": 1845
},
{
"epoch": 0.8541089566020313,
"grad_norm": 3.629326020817196,
"learning_rate": 1.2694339743004037e-06,
"loss": 1.1088,
"step": 1850
},
{
"epoch": 0.8564173591874423,
"grad_norm": 3.6305906598709767,
"learning_rate": 1.2304183096884626e-06,
"loss": 1.0875,
"step": 1855
},
{
"epoch": 0.8587257617728532,
"grad_norm": 3.35865168543221,
"learning_rate": 1.1919722747477024e-06,
"loss": 1.1143,
"step": 1860
},
{
"epoch": 0.8610341643582641,
"grad_norm": 3.3889339992199177,
"learning_rate": 1.1540983667468686e-06,
"loss": 1.0916,
"step": 1865
},
{
"epoch": 0.863342566943675,
"grad_norm": 3.3133014347890324,
"learning_rate": 1.1167990457920985e-06,
"loss": 1.0877,
"step": 1870
},
{
"epoch": 0.8656509695290858,
"grad_norm": 3.415023896862017,
"learning_rate": 1.0800767346671347e-06,
"loss": 1.0284,
"step": 1875
},
{
"epoch": 0.8679593721144968,
"grad_norm": 3.322962975732958,
"learning_rate": 1.043933818675944e-06,
"loss": 1.0782,
"step": 1880
},
{
"epoch": 0.8702677746999077,
"grad_norm": 3.583896655771928,
"learning_rate": 1.008372645487785e-06,
"loss": 1.08,
"step": 1885
},
{
"epoch": 0.8725761772853186,
"grad_norm": 3.3057678718948726,
"learning_rate": 9.733955249847183e-07,
"loss": 1.1034,
"step": 1890
},
{
"epoch": 0.8748845798707294,
"grad_norm": 3.4387092657320997,
"learning_rate": 9.390047291115567e-07,
"loss": 1.0915,
"step": 1895
},
{
"epoch": 0.8771929824561403,
"grad_norm": 3.8029482282950324,
"learning_rate": 9.052024917282987e-07,
"loss": 1.057,
"step": 1900
},
{
"epoch": 0.8795013850415513,
"grad_norm": 3.3990790831971465,
"learning_rate": 8.719910084650262e-07,
"loss": 1.0725,
"step": 1905
},
{
"epoch": 0.8818097876269622,
"grad_norm": 3.262416726762208,
"learning_rate": 8.393724365792866e-07,
"loss": 1.1028,
"step": 1910
},
{
"epoch": 0.884118190212373,
"grad_norm": 3.551691283783414,
"learning_rate": 8.073488948159691e-07,
"loss": 1.0546,
"step": 1915
},
{
"epoch": 0.8864265927977839,
"grad_norm": 3.5211563130144197,
"learning_rate": 7.759224632696793e-07,
"loss": 1.1024,
"step": 1920
},
{
"epoch": 0.8887349953831948,
"grad_norm": 3.5958803804208976,
"learning_rate": 7.450951832496233e-07,
"loss": 1.0698,
"step": 1925
},
{
"epoch": 0.8910433979686058,
"grad_norm": 4.107811963680795,
"learning_rate": 7.148690571470251e-07,
"loss": 1.0613,
"step": 1930
},
{
"epoch": 0.8933518005540166,
"grad_norm": 3.6280688174940416,
"learning_rate": 6.852460483050494e-07,
"loss": 1.0987,
"step": 1935
},
{
"epoch": 0.8956602031394275,
"grad_norm": 3.4197153407779055,
"learning_rate": 6.562280808912768e-07,
"loss": 1.081,
"step": 1940
},
{
"epoch": 0.8979686057248384,
"grad_norm": 3.3975321682078494,
"learning_rate": 6.278170397727179e-07,
"loss": 1.0881,
"step": 1945
},
{
"epoch": 0.9002770083102493,
"grad_norm": 3.385824657440924,
"learning_rate": 6.000147703933845e-07,
"loss": 1.0725,
"step": 1950
},
{
"epoch": 0.9025854108956602,
"grad_norm": 3.733106691108023,
"learning_rate": 5.728230786544153e-07,
"loss": 1.0886,
"step": 1955
},
{
"epoch": 0.9048938134810711,
"grad_norm": 3.3831515124529288,
"learning_rate": 5.46243730796776e-07,
"loss": 1.0854,
"step": 1960
},
{
"epoch": 0.907202216066482,
"grad_norm": 3.4106013139907065,
"learning_rate": 5.202784532865302e-07,
"loss": 1.114,
"step": 1965
},
{
"epoch": 0.9095106186518929,
"grad_norm": 3.130011381325973,
"learning_rate": 4.949289327026952e-07,
"loss": 1.0873,
"step": 1970
},
{
"epoch": 0.9118190212373037,
"grad_norm": 3.3600219468750394,
"learning_rate": 4.7019681562769816e-07,
"loss": 1.0689,
"step": 1975
},
{
"epoch": 0.9141274238227147,
"grad_norm": 3.379655670825615,
"learning_rate": 4.460837085404113e-07,
"loss": 1.0874,
"step": 1980
},
{
"epoch": 0.9164358264081256,
"grad_norm": 3.324809563310868,
"learning_rate": 4.225911777118097e-07,
"loss": 1.0894,
"step": 1985
},
{
"epoch": 0.9187442289935365,
"grad_norm": 3.4668181744618196,
"learning_rate": 3.9972074910323066e-07,
"loss": 1.0896,
"step": 1990
},
{
"epoch": 0.9210526315789473,
"grad_norm": 3.4175120046363276,
"learning_rate": 3.7747390826725736e-07,
"loss": 1.0608,
"step": 1995
},
{
"epoch": 0.9233610341643582,
"grad_norm": 3.365932789028912,
"learning_rate": 3.5585210025122166e-07,
"loss": 1.0465,
"step": 2000
},
{
"epoch": 0.9256694367497692,
"grad_norm": 3.3721429412301442,
"learning_rate": 3.3485672950334447e-07,
"loss": 1.0782,
"step": 2005
},
{
"epoch": 0.9279778393351801,
"grad_norm": 3.402893452692765,
"learning_rate": 3.1448915978150365e-07,
"loss": 1.0575,
"step": 2010
},
{
"epoch": 0.930286241920591,
"grad_norm": 3.3246351042614606,
"learning_rate": 2.947507140646588e-07,
"loss": 1.093,
"step": 2015
},
{
"epoch": 0.9325946445060018,
"grad_norm": 3.42392243323848,
"learning_rate": 2.756426744669105e-07,
"loss": 1.0709,
"step": 2020
},
{
"epoch": 0.9349030470914127,
"grad_norm": 3.3870385964627565,
"learning_rate": 2.57166282154222e-07,
"loss": 1.0944,
"step": 2025
},
{
"epoch": 0.9372114496768237,
"grad_norm": 3.4345800654530128,
"learning_rate": 2.393227372638018e-07,
"loss": 1.0829,
"step": 2030
},
{
"epoch": 0.9395198522622346,
"grad_norm": 3.2304527099741094,
"learning_rate": 2.221131988261438e-07,
"loss": 1.0663,
"step": 2035
},
{
"epoch": 0.9418282548476454,
"grad_norm": 3.4212248154000324,
"learning_rate": 2.055387846897472e-07,
"loss": 1.0608,
"step": 2040
},
{
"epoch": 0.9441366574330563,
"grad_norm": 3.3424495231710356,
"learning_rate": 1.8960057144850163e-07,
"loss": 1.0513,
"step": 2045
},
{
"epoch": 0.9464450600184672,
"grad_norm": 8.42913586604929,
"learning_rate": 1.742995943717607e-07,
"loss": 1.0698,
"step": 2050
},
{
"epoch": 0.9487534626038782,
"grad_norm": 4.03605470816158,
"learning_rate": 1.5963684733709462e-07,
"loss": 1.0787,
"step": 2055
},
{
"epoch": 0.951061865189289,
"grad_norm": 3.572766321551919,
"learning_rate": 1.4561328276573415e-07,
"loss": 1.0625,
"step": 2060
},
{
"epoch": 0.9533702677746999,
"grad_norm": 3.213406555168112,
"learning_rate": 1.3222981156070126e-07,
"loss": 1.0861,
"step": 2065
},
{
"epoch": 0.9556786703601108,
"grad_norm": 3.216022210724082,
"learning_rate": 1.1948730304764622e-07,
"loss": 1.0572,
"step": 2070
},
{
"epoch": 0.9579870729455217,
"grad_norm": 3.8142801195990237,
"learning_rate": 1.073865849183786e-07,
"loss": 1.1151,
"step": 2075
},
{
"epoch": 0.9602954755309326,
"grad_norm": 3.2011503381896413,
"learning_rate": 9.592844317710238e-08,
"loss": 1.0585,
"step": 2080
},
{
"epoch": 0.9626038781163435,
"grad_norm": 3.3780038857652226,
"learning_rate": 8.511362208936447e-08,
"loss": 1.0591,
"step": 2085
},
{
"epoch": 0.9649122807017544,
"grad_norm": 3.3212612452494295,
"learning_rate": 7.494282413371135e-08,
"loss": 1.0787,
"step": 2090
},
{
"epoch": 0.9672206832871653,
"grad_norm": 3.797857330316498,
"learning_rate": 6.541670995605321e-08,
"loss": 1.0859,
"step": 2095
},
{
"epoch": 0.9695290858725761,
"grad_norm": 3.153773745189338,
"learning_rate": 5.653589832675943e-08,
"loss": 1.0983,
"step": 2100
},
{
"epoch": 0.9718374884579871,
"grad_norm": 3.4652822167549906,
"learning_rate": 4.830096610045854e-08,
"loss": 1.0713,
"step": 2105
},
{
"epoch": 0.974145891043398,
"grad_norm": 3.6601967632905796,
"learning_rate": 4.071244817857589e-08,
"loss": 1.1118,
"step": 2110
},
{
"epoch": 0.9764542936288089,
"grad_norm": 3.135385406063897,
"learning_rate": 3.3770837474584874e-08,
"loss": 1.072,
"step": 2115
},
{
"epoch": 0.9787626962142197,
"grad_norm": 3.4884714571677784,
"learning_rate": 2.747658488199023e-08,
"loss": 1.0738,
"step": 2120
},
{
"epoch": 0.9810710987996306,
"grad_norm": 3.7803263448925706,
"learning_rate": 2.1830099245040427e-08,
"loss": 1.0549,
"step": 2125
},
{
"epoch": 0.9833795013850416,
"grad_norm": 3.3045019552603585,
"learning_rate": 1.683174733216997e-08,
"loss": 1.1129,
"step": 2130
},
{
"epoch": 0.9856879039704525,
"grad_norm": 3.2212668180182784,
"learning_rate": 1.248185381217848e-08,
"loss": 1.0777,
"step": 2135
},
{
"epoch": 0.9879963065558633,
"grad_norm": 3.324768260260177,
"learning_rate": 8.780701233139789e-09,
"loss": 1.0503,
"step": 2140
},
{
"epoch": 0.9903047091412742,
"grad_norm": 3.214869100486745,
"learning_rate": 5.728530004051047e-09,
"loss": 1.0367,
"step": 2145
},
{
"epoch": 0.9926131117266851,
"grad_norm": 3.3583215428853666,
"learning_rate": 3.325538379211901e-09,
"loss": 1.0554,
"step": 2150
},
{
"epoch": 0.9949215143120961,
"grad_norm": 4.075445923312751,
"learning_rate": 1.5718824453525572e-09,
"loss": 1.1222,
"step": 2155
},
{
"epoch": 0.997229916897507,
"grad_norm": 3.3599147688364903,
"learning_rate": 4.676761114941197e-10,
"loss": 1.0646,
"step": 2160
},
{
"epoch": 0.9995383194829178,
"grad_norm": 3.4496692841727543,
"learning_rate": 1.2991101545622998e-11,
"loss": 1.1038,
"step": 2165
},
{
"epoch": 1.0,
"eval_loss": 1.1177629232406616,
"eval_runtime": 1154.8442,
"eval_samples_per_second": 26.579,
"eval_steps_per_second": 0.831,
"step": 2166
},
{
"epoch": 1.0,
"step": 2166,
"total_flos": 113379083550720.0,
"train_loss": 1.171416565762112,
"train_runtime": 11018.7418,
"train_samples_per_second": 6.29,
"train_steps_per_second": 0.197
}
],
"logging_steps": 5,
"max_steps": 2166,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 113379083550720.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}