mgpt-lora-multi-shared-1024 / trainer_state.json
MHGanainy's picture
MHGanainy/mgpt-lora-multi-shared-1024
094a4ec verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 62447,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016013579515429084,
"grad_norm": 19.96757698059082,
"learning_rate": 4.003442960946414e-08,
"loss": 6.6824,
"step": 100
},
{
"epoch": 0.0032027159030858167,
"grad_norm": 20.086483001708984,
"learning_rate": 8.006885921892828e-08,
"loss": 6.6684,
"step": 200
},
{
"epoch": 0.004804073854628725,
"grad_norm": 15.201075553894043,
"learning_rate": 1.2010328882839244e-07,
"loss": 6.5818,
"step": 300
},
{
"epoch": 0.006405431806171633,
"grad_norm": 12.548062324523926,
"learning_rate": 1.6013771843785657e-07,
"loss": 6.409,
"step": 400
},
{
"epoch": 0.008006789757714542,
"grad_norm": 10.694886207580566,
"learning_rate": 2.0017214804732072e-07,
"loss": 6.1352,
"step": 500
},
{
"epoch": 0.00960814770925745,
"grad_norm": 8.358333587646484,
"learning_rate": 2.402065776567849e-07,
"loss": 5.7786,
"step": 600
},
{
"epoch": 0.011209505660800359,
"grad_norm": 6.130667686462402,
"learning_rate": 2.80241007266249e-07,
"loss": 5.3755,
"step": 700
},
{
"epoch": 0.012810863612343267,
"grad_norm": 3.611398935317993,
"learning_rate": 3.2027543687571313e-07,
"loss": 5.1389,
"step": 800
},
{
"epoch": 0.014412221563886175,
"grad_norm": 2.8347551822662354,
"learning_rate": 3.603098664851773e-07,
"loss": 4.9266,
"step": 900
},
{
"epoch": 0.016013579515429085,
"grad_norm": 21.159616470336914,
"learning_rate": 4.0034429609464144e-07,
"loss": 4.8398,
"step": 1000
},
{
"epoch": 0.017614937466971993,
"grad_norm": 6.242217540740967,
"learning_rate": 4.4037872570410557e-07,
"loss": 4.7241,
"step": 1100
},
{
"epoch": 0.0192162954185149,
"grad_norm": 3.52362322807312,
"learning_rate": 4.804131553135698e-07,
"loss": 4.6735,
"step": 1200
},
{
"epoch": 0.02081765337005781,
"grad_norm": 2.831575393676758,
"learning_rate": 5.204475849230339e-07,
"loss": 4.5398,
"step": 1300
},
{
"epoch": 0.022419011321600717,
"grad_norm": 11.5364351272583,
"learning_rate": 5.60482014532498e-07,
"loss": 4.5072,
"step": 1400
},
{
"epoch": 0.024020369273143626,
"grad_norm": 3.1574575901031494,
"learning_rate": 6.005164441419621e-07,
"loss": 4.4694,
"step": 1500
},
{
"epoch": 0.025621727224686534,
"grad_norm": 2.988778829574585,
"learning_rate": 6.405508737514263e-07,
"loss": 4.451,
"step": 1600
},
{
"epoch": 0.027223085176229442,
"grad_norm": 2.388536214828491,
"learning_rate": 6.805853033608904e-07,
"loss": 4.3784,
"step": 1700
},
{
"epoch": 0.02882444312777235,
"grad_norm": 8.523998260498047,
"learning_rate": 7.206197329703546e-07,
"loss": 4.3341,
"step": 1800
},
{
"epoch": 0.030425801079315258,
"grad_norm": 5.110918045043945,
"learning_rate": 7.606541625798188e-07,
"loss": 4.2974,
"step": 1900
},
{
"epoch": 0.03202715903085817,
"grad_norm": 2.1784770488739014,
"learning_rate": 8.006885921892829e-07,
"loss": 4.2403,
"step": 2000
},
{
"epoch": 0.033628516982401074,
"grad_norm": 2.2727229595184326,
"learning_rate": 8.407230217987469e-07,
"loss": 4.1771,
"step": 2100
},
{
"epoch": 0.035229874933943986,
"grad_norm": 2.410856008529663,
"learning_rate": 8.807574514082111e-07,
"loss": 4.1723,
"step": 2200
},
{
"epoch": 0.03683123288548689,
"grad_norm": 7.233943462371826,
"learning_rate": 9.207918810176753e-07,
"loss": 4.1031,
"step": 2300
},
{
"epoch": 0.0384325908370298,
"grad_norm": 9.451576232910156,
"learning_rate": 9.608263106271395e-07,
"loss": 4.0296,
"step": 2400
},
{
"epoch": 0.04003394878857271,
"grad_norm": 5.198200225830078,
"learning_rate": 1.0008607402366035e-06,
"loss": 3.9371,
"step": 2500
},
{
"epoch": 0.04163530674011562,
"grad_norm": 11.912164688110352,
"learning_rate": 1.0408951698460678e-06,
"loss": 3.8349,
"step": 2600
},
{
"epoch": 0.04323666469165852,
"grad_norm": 6.008382320404053,
"learning_rate": 1.0809295994555318e-06,
"loss": 3.7505,
"step": 2700
},
{
"epoch": 0.044838022643201435,
"grad_norm": 3.3153979778289795,
"learning_rate": 1.120964029064996e-06,
"loss": 3.6149,
"step": 2800
},
{
"epoch": 0.046439380594744346,
"grad_norm": 8.011855125427246,
"learning_rate": 1.16099845867446e-06,
"loss": 3.5414,
"step": 2900
},
{
"epoch": 0.04804073854628725,
"grad_norm": 3.550476312637329,
"learning_rate": 1.2010328882839243e-06,
"loss": 3.4248,
"step": 3000
},
{
"epoch": 0.04964209649783016,
"grad_norm": 3.9144866466522217,
"learning_rate": 1.2410673178933883e-06,
"loss": 3.3224,
"step": 3100
},
{
"epoch": 0.05124345444937307,
"grad_norm": 3.6054248809814453,
"learning_rate": 1.2811017475028525e-06,
"loss": 3.2983,
"step": 3200
},
{
"epoch": 0.05284481240091598,
"grad_norm": 4.165266990661621,
"learning_rate": 1.3211361771123166e-06,
"loss": 3.1677,
"step": 3300
},
{
"epoch": 0.054446170352458884,
"grad_norm": 4.654821872711182,
"learning_rate": 1.3611706067217808e-06,
"loss": 3.14,
"step": 3400
},
{
"epoch": 0.056047528304001795,
"grad_norm": 3.641819715499878,
"learning_rate": 1.4012050363312448e-06,
"loss": 3.0439,
"step": 3500
},
{
"epoch": 0.0576488862555447,
"grad_norm": 3.61091947555542,
"learning_rate": 1.4412394659407093e-06,
"loss": 2.9522,
"step": 3600
},
{
"epoch": 0.05925024420708761,
"grad_norm": 22.04112434387207,
"learning_rate": 1.4812738955501733e-06,
"loss": 2.9255,
"step": 3700
},
{
"epoch": 0.060851602158630516,
"grad_norm": 5.0808892250061035,
"learning_rate": 1.5213083251596375e-06,
"loss": 2.8402,
"step": 3800
},
{
"epoch": 0.06245296011017343,
"grad_norm": 9.055444717407227,
"learning_rate": 1.5613427547691015e-06,
"loss": 2.8354,
"step": 3900
},
{
"epoch": 0.06405431806171634,
"grad_norm": 3.44482684135437,
"learning_rate": 1.6013771843785658e-06,
"loss": 2.7592,
"step": 4000
},
{
"epoch": 0.06565567601325924,
"grad_norm": 2.7728819847106934,
"learning_rate": 1.6414116139880298e-06,
"loss": 2.7746,
"step": 4100
},
{
"epoch": 0.06725703396480215,
"grad_norm": 1.9306970834732056,
"learning_rate": 1.6814460435974938e-06,
"loss": 2.7233,
"step": 4200
},
{
"epoch": 0.06885839191634506,
"grad_norm": 1.8614246845245361,
"learning_rate": 1.7214804732069583e-06,
"loss": 2.7021,
"step": 4300
},
{
"epoch": 0.07045974986788797,
"grad_norm": 3.224013566970825,
"learning_rate": 1.7615149028164223e-06,
"loss": 2.6586,
"step": 4400
},
{
"epoch": 0.07206110781943088,
"grad_norm": 4.159784317016602,
"learning_rate": 1.8015493324258865e-06,
"loss": 2.6666,
"step": 4500
},
{
"epoch": 0.07366246577097378,
"grad_norm": 2.2219038009643555,
"learning_rate": 1.8415837620353505e-06,
"loss": 2.6465,
"step": 4600
},
{
"epoch": 0.07526382372251669,
"grad_norm": 14.757235527038574,
"learning_rate": 1.8816181916448148e-06,
"loss": 2.6125,
"step": 4700
},
{
"epoch": 0.0768651816740596,
"grad_norm": 1.881609559059143,
"learning_rate": 1.921652621254279e-06,
"loss": 2.5652,
"step": 4800
},
{
"epoch": 0.07846653962560252,
"grad_norm": 1.9000244140625,
"learning_rate": 1.9616870508637432e-06,
"loss": 2.5676,
"step": 4900
},
{
"epoch": 0.08006789757714541,
"grad_norm": 3.4342846870422363,
"learning_rate": 2.001721480473207e-06,
"loss": 2.5934,
"step": 5000
},
{
"epoch": 0.08166925552868833,
"grad_norm": 3.2394461631774902,
"learning_rate": 2.0417559100826713e-06,
"loss": 2.5371,
"step": 5100
},
{
"epoch": 0.08327061348023124,
"grad_norm": 2.726757287979126,
"learning_rate": 2.0817903396921355e-06,
"loss": 2.5211,
"step": 5200
},
{
"epoch": 0.08487197143177415,
"grad_norm": 1.8385337591171265,
"learning_rate": 2.1218247693015993e-06,
"loss": 2.5449,
"step": 5300
},
{
"epoch": 0.08647332938331705,
"grad_norm": 1.7317003011703491,
"learning_rate": 2.1618591989110636e-06,
"loss": 2.5368,
"step": 5400
},
{
"epoch": 0.08807468733485996,
"grad_norm": 1.8202093839645386,
"learning_rate": 2.201893628520528e-06,
"loss": 2.4703,
"step": 5500
},
{
"epoch": 0.08967604528640287,
"grad_norm": 1.627389669418335,
"learning_rate": 2.241928058129992e-06,
"loss": 2.4741,
"step": 5600
},
{
"epoch": 0.09127740323794578,
"grad_norm": 3.039496660232544,
"learning_rate": 2.2819624877394563e-06,
"loss": 2.4966,
"step": 5700
},
{
"epoch": 0.09287876118948869,
"grad_norm": 5.223389148712158,
"learning_rate": 2.32199691734892e-06,
"loss": 2.4383,
"step": 5800
},
{
"epoch": 0.09448011914103159,
"grad_norm": 1.7681688070297241,
"learning_rate": 2.3620313469583843e-06,
"loss": 2.4656,
"step": 5900
},
{
"epoch": 0.0960814770925745,
"grad_norm": 4.00803804397583,
"learning_rate": 2.4020657765678486e-06,
"loss": 2.481,
"step": 6000
},
{
"epoch": 0.09768283504411741,
"grad_norm": 14.015419960021973,
"learning_rate": 2.4421002061773128e-06,
"loss": 2.4758,
"step": 6100
},
{
"epoch": 0.09928419299566033,
"grad_norm": 3.860048294067383,
"learning_rate": 2.4821346357867766e-06,
"loss": 2.4548,
"step": 6200
},
{
"epoch": 0.10088555094720322,
"grad_norm": 1.4068512916564941,
"learning_rate": 2.5221690653962413e-06,
"loss": 2.4428,
"step": 6300
},
{
"epoch": 0.10248690889874613,
"grad_norm": 3.721557855606079,
"learning_rate": 2.562203495005705e-06,
"loss": 2.3956,
"step": 6400
},
{
"epoch": 0.10408826685028905,
"grad_norm": 2.806149482727051,
"learning_rate": 2.6022379246151693e-06,
"loss": 2.3903,
"step": 6500
},
{
"epoch": 0.10568962480183196,
"grad_norm": 2.8240647315979004,
"learning_rate": 2.642272354224633e-06,
"loss": 2.395,
"step": 6600
},
{
"epoch": 0.10729098275337486,
"grad_norm": 1.7092350721359253,
"learning_rate": 2.6823067838340978e-06,
"loss": 2.4076,
"step": 6700
},
{
"epoch": 0.10889234070491777,
"grad_norm": 1.814175009727478,
"learning_rate": 2.7223412134435616e-06,
"loss": 2.4055,
"step": 6800
},
{
"epoch": 0.11049369865646068,
"grad_norm": 2.08941650390625,
"learning_rate": 2.762375643053026e-06,
"loss": 2.4097,
"step": 6900
},
{
"epoch": 0.11209505660800359,
"grad_norm": 2.0335028171539307,
"learning_rate": 2.8024100726624896e-06,
"loss": 2.3769,
"step": 7000
},
{
"epoch": 0.1136964145595465,
"grad_norm": 14.262283325195312,
"learning_rate": 2.8424445022719543e-06,
"loss": 2.3706,
"step": 7100
},
{
"epoch": 0.1152977725110894,
"grad_norm": 2.324890375137329,
"learning_rate": 2.8824789318814185e-06,
"loss": 2.3688,
"step": 7200
},
{
"epoch": 0.11689913046263231,
"grad_norm": 2.6902220249176025,
"learning_rate": 2.9225133614908823e-06,
"loss": 2.3829,
"step": 7300
},
{
"epoch": 0.11850048841417522,
"grad_norm": 3.410318613052368,
"learning_rate": 2.9625477911003466e-06,
"loss": 2.3687,
"step": 7400
},
{
"epoch": 0.12010184636571813,
"grad_norm": 1.4391207695007324,
"learning_rate": 3.0025822207098104e-06,
"loss": 2.3909,
"step": 7500
},
{
"epoch": 0.12170320431726103,
"grad_norm": 11.690342903137207,
"learning_rate": 3.042616650319275e-06,
"loss": 2.3387,
"step": 7600
},
{
"epoch": 0.12330456226880394,
"grad_norm": 1.5653709173202515,
"learning_rate": 3.082651079928739e-06,
"loss": 2.3451,
"step": 7700
},
{
"epoch": 0.12490592022034686,
"grad_norm": 3.124866247177124,
"learning_rate": 3.122685509538203e-06,
"loss": 2.322,
"step": 7800
},
{
"epoch": 0.12650727817188975,
"grad_norm": 12.413910865783691,
"learning_rate": 3.162719939147667e-06,
"loss": 2.3182,
"step": 7900
},
{
"epoch": 0.12810863612343268,
"grad_norm": 1.7550314664840698,
"learning_rate": 3.2027543687571315e-06,
"loss": 2.3099,
"step": 8000
},
{
"epoch": 0.12970999407497558,
"grad_norm": 1.9001699686050415,
"learning_rate": 3.2427887983665958e-06,
"loss": 2.3299,
"step": 8100
},
{
"epoch": 0.13131135202651847,
"grad_norm": 1.4118369817733765,
"learning_rate": 3.2828232279760596e-06,
"loss": 2.3003,
"step": 8200
},
{
"epoch": 0.1329127099780614,
"grad_norm": 3.046459913253784,
"learning_rate": 3.322857657585524e-06,
"loss": 2.3053,
"step": 8300
},
{
"epoch": 0.1345140679296043,
"grad_norm": 6.424179553985596,
"learning_rate": 3.3628920871949876e-06,
"loss": 2.2845,
"step": 8400
},
{
"epoch": 0.13611542588114722,
"grad_norm": 3.9462482929229736,
"learning_rate": 3.4029265168044523e-06,
"loss": 2.2821,
"step": 8500
},
{
"epoch": 0.13771678383269012,
"grad_norm": 2.464116096496582,
"learning_rate": 3.4429609464139165e-06,
"loss": 2.3144,
"step": 8600
},
{
"epoch": 0.13931814178423302,
"grad_norm": 17.63976287841797,
"learning_rate": 3.4829953760233803e-06,
"loss": 2.2811,
"step": 8700
},
{
"epoch": 0.14091949973577594,
"grad_norm": 3.135732650756836,
"learning_rate": 3.5230298056328446e-06,
"loss": 2.2953,
"step": 8800
},
{
"epoch": 0.14252085768731884,
"grad_norm": 4.162137031555176,
"learning_rate": 3.563064235242309e-06,
"loss": 2.2692,
"step": 8900
},
{
"epoch": 0.14412221563886177,
"grad_norm": 6.429003715515137,
"learning_rate": 3.603098664851773e-06,
"loss": 2.2819,
"step": 9000
},
{
"epoch": 0.14572357359040466,
"grad_norm": 6.803035736083984,
"learning_rate": 3.643133094461237e-06,
"loss": 2.2672,
"step": 9100
},
{
"epoch": 0.14732493154194756,
"grad_norm": 15.847606658935547,
"learning_rate": 3.683167524070701e-06,
"loss": 2.26,
"step": 9200
},
{
"epoch": 0.1489262894934905,
"grad_norm": 3.1911871433258057,
"learning_rate": 3.723201953680165e-06,
"loss": 2.2355,
"step": 9300
},
{
"epoch": 0.15052764744503339,
"grad_norm": 1.6060032844543457,
"learning_rate": 3.7632363832896296e-06,
"loss": 2.2608,
"step": 9400
},
{
"epoch": 0.15212900539657628,
"grad_norm": 1.5236974954605103,
"learning_rate": 3.8032708128990938e-06,
"loss": 2.2507,
"step": 9500
},
{
"epoch": 0.1537303633481192,
"grad_norm": 8.704015731811523,
"learning_rate": 3.843305242508558e-06,
"loss": 2.2457,
"step": 9600
},
{
"epoch": 0.1553317212996621,
"grad_norm": 4.1284918785095215,
"learning_rate": 3.883339672118022e-06,
"loss": 2.2321,
"step": 9700
},
{
"epoch": 0.15693307925120503,
"grad_norm": 8.519213676452637,
"learning_rate": 3.9233741017274865e-06,
"loss": 2.2356,
"step": 9800
},
{
"epoch": 0.15853443720274793,
"grad_norm": 6.228696823120117,
"learning_rate": 3.96340853133695e-06,
"loss": 2.2243,
"step": 9900
},
{
"epoch": 0.16013579515429083,
"grad_norm": 2.693775177001953,
"learning_rate": 4.003442960946414e-06,
"loss": 2.2288,
"step": 10000
},
{
"epoch": 0.16173715310583375,
"grad_norm": 8.416048049926758,
"learning_rate": 4.043477390555878e-06,
"loss": 2.2311,
"step": 10100
},
{
"epoch": 0.16333851105737665,
"grad_norm": 1.5264601707458496,
"learning_rate": 4.083511820165343e-06,
"loss": 2.2186,
"step": 10200
},
{
"epoch": 0.16493986900891958,
"grad_norm": 1.7846661806106567,
"learning_rate": 4.123546249774807e-06,
"loss": 2.2132,
"step": 10300
},
{
"epoch": 0.16654122696046247,
"grad_norm": 3.9117202758789062,
"learning_rate": 4.163580679384271e-06,
"loss": 2.228,
"step": 10400
},
{
"epoch": 0.16814258491200537,
"grad_norm": 4.531779766082764,
"learning_rate": 4.203615108993735e-06,
"loss": 2.2066,
"step": 10500
},
{
"epoch": 0.1697439428635483,
"grad_norm": 2.1657228469848633,
"learning_rate": 4.243649538603199e-06,
"loss": 2.1929,
"step": 10600
},
{
"epoch": 0.1713453008150912,
"grad_norm": 2.9067344665527344,
"learning_rate": 4.283683968212663e-06,
"loss": 2.2093,
"step": 10700
},
{
"epoch": 0.1729466587666341,
"grad_norm": 3.7661423683166504,
"learning_rate": 4.323718397822127e-06,
"loss": 2.1919,
"step": 10800
},
{
"epoch": 0.17454801671817702,
"grad_norm": 2.9169373512268066,
"learning_rate": 4.363752827431592e-06,
"loss": 2.2099,
"step": 10900
},
{
"epoch": 0.17614937466971992,
"grad_norm": 2.1810638904571533,
"learning_rate": 4.403787257041056e-06,
"loss": 2.1923,
"step": 11000
},
{
"epoch": 0.17775073262126284,
"grad_norm": 8.174213409423828,
"learning_rate": 4.443821686650519e-06,
"loss": 2.1886,
"step": 11100
},
{
"epoch": 0.17935209057280574,
"grad_norm": 2.431321382522583,
"learning_rate": 4.483856116259984e-06,
"loss": 2.1991,
"step": 11200
},
{
"epoch": 0.18095344852434864,
"grad_norm": 3.7426862716674805,
"learning_rate": 4.523890545869448e-06,
"loss": 2.1763,
"step": 11300
},
{
"epoch": 0.18255480647589156,
"grad_norm": 2.5155022144317627,
"learning_rate": 4.5639249754789125e-06,
"loss": 2.1906,
"step": 11400
},
{
"epoch": 0.18415616442743446,
"grad_norm": 1.7059454917907715,
"learning_rate": 4.603959405088376e-06,
"loss": 2.1872,
"step": 11500
},
{
"epoch": 0.18575752237897739,
"grad_norm": 5.253864765167236,
"learning_rate": 4.64399383469784e-06,
"loss": 2.1889,
"step": 11600
},
{
"epoch": 0.18735888033052028,
"grad_norm": 1.5918197631835938,
"learning_rate": 4.684028264307305e-06,
"loss": 2.1746,
"step": 11700
},
{
"epoch": 0.18896023828206318,
"grad_norm": 10.147111892700195,
"learning_rate": 4.724062693916769e-06,
"loss": 2.1712,
"step": 11800
},
{
"epoch": 0.1905615962336061,
"grad_norm": 4.3356781005859375,
"learning_rate": 4.764097123526233e-06,
"loss": 2.1815,
"step": 11900
},
{
"epoch": 0.192162954185149,
"grad_norm": 10.20026683807373,
"learning_rate": 4.804131553135697e-06,
"loss": 2.176,
"step": 12000
},
{
"epoch": 0.1937643121366919,
"grad_norm": 1.9123090505599976,
"learning_rate": 4.844165982745162e-06,
"loss": 2.1807,
"step": 12100
},
{
"epoch": 0.19536567008823483,
"grad_norm": 1.6245704889297485,
"learning_rate": 4.8842004123546256e-06,
"loss": 2.1637,
"step": 12200
},
{
"epoch": 0.19696702803977773,
"grad_norm": 5.880768299102783,
"learning_rate": 4.924234841964089e-06,
"loss": 2.1735,
"step": 12300
},
{
"epoch": 0.19856838599132065,
"grad_norm": 5.809731960296631,
"learning_rate": 4.964269271573553e-06,
"loss": 2.1523,
"step": 12400
},
{
"epoch": 0.20016974394286355,
"grad_norm": 1.827416181564331,
"learning_rate": 5.004303701183018e-06,
"loss": 2.1485,
"step": 12500
},
{
"epoch": 0.20177110189440645,
"grad_norm": 2.386488437652588,
"learning_rate": 5.0443381307924825e-06,
"loss": 2.1641,
"step": 12600
},
{
"epoch": 0.20337245984594937,
"grad_norm": 5.080982208251953,
"learning_rate": 5.0843725604019455e-06,
"loss": 2.1706,
"step": 12700
},
{
"epoch": 0.20497381779749227,
"grad_norm": 6.828605651855469,
"learning_rate": 5.12440699001141e-06,
"loss": 2.1736,
"step": 12800
},
{
"epoch": 0.2065751757490352,
"grad_norm": 2.243302822113037,
"learning_rate": 5.164441419620875e-06,
"loss": 2.1238,
"step": 12900
},
{
"epoch": 0.2081765337005781,
"grad_norm": 3.8954567909240723,
"learning_rate": 5.204475849230339e-06,
"loss": 2.1461,
"step": 13000
},
{
"epoch": 0.209777891652121,
"grad_norm": 3.563438653945923,
"learning_rate": 5.244510278839802e-06,
"loss": 2.1492,
"step": 13100
},
{
"epoch": 0.21137924960366392,
"grad_norm": 2.1851043701171875,
"learning_rate": 5.284544708449266e-06,
"loss": 2.1407,
"step": 13200
},
{
"epoch": 0.2129806075552068,
"grad_norm": 4.8792524337768555,
"learning_rate": 5.324579138058731e-06,
"loss": 2.1403,
"step": 13300
},
{
"epoch": 0.2145819655067497,
"grad_norm": 4.021134376525879,
"learning_rate": 5.3646135676681955e-06,
"loss": 2.1628,
"step": 13400
},
{
"epoch": 0.21618332345829264,
"grad_norm": 3.8988146781921387,
"learning_rate": 5.4046479972776585e-06,
"loss": 2.1425,
"step": 13500
},
{
"epoch": 0.21778468140983553,
"grad_norm": 6.337070941925049,
"learning_rate": 5.444682426887123e-06,
"loss": 2.1493,
"step": 13600
},
{
"epoch": 0.21938603936137846,
"grad_norm": 2.077366828918457,
"learning_rate": 5.484716856496588e-06,
"loss": 2.1264,
"step": 13700
},
{
"epoch": 0.22098739731292136,
"grad_norm": 1.3507400751113892,
"learning_rate": 5.524751286106052e-06,
"loss": 2.1306,
"step": 13800
},
{
"epoch": 0.22258875526446426,
"grad_norm": 1.5656003952026367,
"learning_rate": 5.564785715715516e-06,
"loss": 2.135,
"step": 13900
},
{
"epoch": 0.22419011321600718,
"grad_norm": 3.315119981765747,
"learning_rate": 5.604820145324979e-06,
"loss": 2.1449,
"step": 14000
},
{
"epoch": 0.22579147116755008,
"grad_norm": 1.677067518234253,
"learning_rate": 5.644854574934444e-06,
"loss": 2.1126,
"step": 14100
},
{
"epoch": 0.227392829119093,
"grad_norm": 1.3107109069824219,
"learning_rate": 5.6848890045439086e-06,
"loss": 2.1415,
"step": 14200
},
{
"epoch": 0.2289941870706359,
"grad_norm": 1.887251853942871,
"learning_rate": 5.724923434153372e-06,
"loss": 2.1312,
"step": 14300
},
{
"epoch": 0.2305955450221788,
"grad_norm": 4.706649303436279,
"learning_rate": 5.764957863762837e-06,
"loss": 2.1417,
"step": 14400
},
{
"epoch": 0.23219690297372172,
"grad_norm": 4.202969074249268,
"learning_rate": 5.8049922933723e-06,
"loss": 2.1403,
"step": 14500
},
{
"epoch": 0.23379826092526462,
"grad_norm": 2.2349281311035156,
"learning_rate": 5.845026722981765e-06,
"loss": 2.1164,
"step": 14600
},
{
"epoch": 0.23539961887680752,
"grad_norm": 1.7390815019607544,
"learning_rate": 5.885061152591229e-06,
"loss": 2.1313,
"step": 14700
},
{
"epoch": 0.23700097682835045,
"grad_norm": 1.9534856081008911,
"learning_rate": 5.925095582200693e-06,
"loss": 2.1252,
"step": 14800
},
{
"epoch": 0.23860233477989334,
"grad_norm": 1.7701072692871094,
"learning_rate": 5.965130011810158e-06,
"loss": 2.1207,
"step": 14900
},
{
"epoch": 0.24020369273143627,
"grad_norm": 6.166327953338623,
"learning_rate": 6.005164441419621e-06,
"loss": 2.1079,
"step": 15000
},
{
"epoch": 0.24180505068297917,
"grad_norm": 2.4361186027526855,
"learning_rate": 6.045198871029085e-06,
"loss": 2.114,
"step": 15100
},
{
"epoch": 0.24340640863452206,
"grad_norm": 2.536973714828491,
"learning_rate": 6.08523330063855e-06,
"loss": 2.109,
"step": 15200
},
{
"epoch": 0.245007766586065,
"grad_norm": 3.394212484359741,
"learning_rate": 6.125267730248014e-06,
"loss": 2.1193,
"step": 15300
},
{
"epoch": 0.2466091245376079,
"grad_norm": 1.725258708000183,
"learning_rate": 6.165302159857478e-06,
"loss": 2.1238,
"step": 15400
},
{
"epoch": 0.2482104824891508,
"grad_norm": 2.9132273197174072,
"learning_rate": 6.205336589466942e-06,
"loss": 2.115,
"step": 15500
},
{
"epoch": 0.2498118404406937,
"grad_norm": 1.6105629205703735,
"learning_rate": 6.245371019076406e-06,
"loss": 2.1103,
"step": 15600
},
{
"epoch": 0.25141319839223664,
"grad_norm": 1.4759615659713745,
"learning_rate": 6.285405448685871e-06,
"loss": 2.1018,
"step": 15700
},
{
"epoch": 0.2530145563437795,
"grad_norm": 6.175992488861084,
"learning_rate": 6.325439878295334e-06,
"loss": 2.1052,
"step": 15800
},
{
"epoch": 0.25461591429532243,
"grad_norm": 1.496497631072998,
"learning_rate": 6.3654743079047984e-06,
"loss": 2.1098,
"step": 15900
},
{
"epoch": 0.25621727224686536,
"grad_norm": 2.9353444576263428,
"learning_rate": 6.405508737514263e-06,
"loss": 2.1142,
"step": 16000
},
{
"epoch": 0.2578186301984082,
"grad_norm": 3.003761053085327,
"learning_rate": 6.445543167123727e-06,
"loss": 2.1096,
"step": 16100
},
{
"epoch": 0.25941998814995115,
"grad_norm": 1.8897191286087036,
"learning_rate": 6.4855775967331916e-06,
"loss": 2.0977,
"step": 16200
},
{
"epoch": 0.2610213461014941,
"grad_norm": 1.1225190162658691,
"learning_rate": 6.5256120263426545e-06,
"loss": 2.1022,
"step": 16300
},
{
"epoch": 0.26262270405303695,
"grad_norm": 5.252044200897217,
"learning_rate": 6.565646455952119e-06,
"loss": 2.1068,
"step": 16400
},
{
"epoch": 0.2642240620045799,
"grad_norm": 1.9852492809295654,
"learning_rate": 6.605680885561584e-06,
"loss": 2.0882,
"step": 16500
},
{
"epoch": 0.2658254199561228,
"grad_norm": 1.1616008281707764,
"learning_rate": 6.645715315171048e-06,
"loss": 2.0944,
"step": 16600
},
{
"epoch": 0.2674267779076657,
"grad_norm": 2.1226704120635986,
"learning_rate": 6.685749744780512e-06,
"loss": 2.0927,
"step": 16700
},
{
"epoch": 0.2690281358592086,
"grad_norm": 1.4191474914550781,
"learning_rate": 6.725784174389975e-06,
"loss": 2.0998,
"step": 16800
},
{
"epoch": 0.2706294938107515,
"grad_norm": 2.283435106277466,
"learning_rate": 6.76581860399944e-06,
"loss": 2.1157,
"step": 16900
},
{
"epoch": 0.27223085176229445,
"grad_norm": 1.6899996995925903,
"learning_rate": 6.805853033608905e-06,
"loss": 2.0937,
"step": 17000
},
{
"epoch": 0.2738322097138373,
"grad_norm": 1.3105698823928833,
"learning_rate": 6.845887463218368e-06,
"loss": 2.0545,
"step": 17100
},
{
"epoch": 0.27543356766538024,
"grad_norm": 1.1776176691055298,
"learning_rate": 6.885921892827833e-06,
"loss": 2.0984,
"step": 17200
},
{
"epoch": 0.27703492561692317,
"grad_norm": 1.651307225227356,
"learning_rate": 6.925956322437296e-06,
"loss": 2.0959,
"step": 17300
},
{
"epoch": 0.27863628356846604,
"grad_norm": 1.7482041120529175,
"learning_rate": 6.965990752046761e-06,
"loss": 2.0636,
"step": 17400
},
{
"epoch": 0.28023764152000896,
"grad_norm": 3.625835418701172,
"learning_rate": 7.006025181656225e-06,
"loss": 2.085,
"step": 17500
},
{
"epoch": 0.2818389994715519,
"grad_norm": 1.6532440185546875,
"learning_rate": 7.046059611265689e-06,
"loss": 2.0883,
"step": 17600
},
{
"epoch": 0.28344035742309476,
"grad_norm": 1.331597924232483,
"learning_rate": 7.086094040875153e-06,
"loss": 2.1034,
"step": 17700
},
{
"epoch": 0.2850417153746377,
"grad_norm": 3.6023612022399902,
"learning_rate": 7.126128470484618e-06,
"loss": 2.0991,
"step": 17800
},
{
"epoch": 0.2866430733261806,
"grad_norm": 1.4167087078094482,
"learning_rate": 7.166162900094081e-06,
"loss": 2.1057,
"step": 17900
},
{
"epoch": 0.28824443127772353,
"grad_norm": 6.183845520019531,
"learning_rate": 7.206197329703546e-06,
"loss": 2.0951,
"step": 18000
},
{
"epoch": 0.2898457892292664,
"grad_norm": 1.5191693305969238,
"learning_rate": 7.246231759313009e-06,
"loss": 2.062,
"step": 18100
},
{
"epoch": 0.29144714718080933,
"grad_norm": 1.5019919872283936,
"learning_rate": 7.286266188922474e-06,
"loss": 2.073,
"step": 18200
},
{
"epoch": 0.29304850513235225,
"grad_norm": 2.338139533996582,
"learning_rate": 7.326300618531938e-06,
"loss": 2.0935,
"step": 18300
},
{
"epoch": 0.2946498630838951,
"grad_norm": 3.6389622688293457,
"learning_rate": 7.366335048141402e-06,
"loss": 2.0907,
"step": 18400
},
{
"epoch": 0.29625122103543805,
"grad_norm": 1.3060230016708374,
"learning_rate": 7.406369477750867e-06,
"loss": 2.0691,
"step": 18500
},
{
"epoch": 0.297852578986981,
"grad_norm": 2.181640863418579,
"learning_rate": 7.44640390736033e-06,
"loss": 2.0668,
"step": 18600
},
{
"epoch": 0.29945393693852385,
"grad_norm": 2.1645591259002686,
"learning_rate": 7.4864383369697944e-06,
"loss": 2.0589,
"step": 18700
},
{
"epoch": 0.30105529489006677,
"grad_norm": 2.522383451461792,
"learning_rate": 7.526472766579259e-06,
"loss": 2.0624,
"step": 18800
},
{
"epoch": 0.3026566528416097,
"grad_norm": 2.0438318252563477,
"learning_rate": 7.566507196188723e-06,
"loss": 2.0756,
"step": 18900
},
{
"epoch": 0.30425801079315257,
"grad_norm": 1.5602883100509644,
"learning_rate": 7.6065416257981876e-06,
"loss": 2.0539,
"step": 19000
},
{
"epoch": 0.3058593687446955,
"grad_norm": 1.2384752035140991,
"learning_rate": 7.64657605540765e-06,
"loss": 2.0698,
"step": 19100
},
{
"epoch": 0.3074607266962384,
"grad_norm": 3.290865659713745,
"learning_rate": 7.686610485017116e-06,
"loss": 2.0538,
"step": 19200
},
{
"epoch": 0.30906208464778134,
"grad_norm": 1.9636443853378296,
"learning_rate": 7.72664491462658e-06,
"loss": 2.0679,
"step": 19300
},
{
"epoch": 0.3106634425993242,
"grad_norm": 2.1679654121398926,
"learning_rate": 7.766679344236044e-06,
"loss": 2.0734,
"step": 19400
},
{
"epoch": 0.31226480055086714,
"grad_norm": 2.441173553466797,
"learning_rate": 7.806713773845507e-06,
"loss": 2.0475,
"step": 19500
},
{
"epoch": 0.31386615850241006,
"grad_norm": 1.2764122486114502,
"learning_rate": 7.846748203454973e-06,
"loss": 2.0773,
"step": 19600
},
{
"epoch": 0.31546751645395293,
"grad_norm": 1.106123685836792,
"learning_rate": 7.886782633064435e-06,
"loss": 2.0645,
"step": 19700
},
{
"epoch": 0.31706887440549586,
"grad_norm": 1.025707721710205,
"learning_rate": 7.9268170626739e-06,
"loss": 2.0643,
"step": 19800
},
{
"epoch": 0.3186702323570388,
"grad_norm": 1.2565511465072632,
"learning_rate": 7.966851492283364e-06,
"loss": 2.0666,
"step": 19900
},
{
"epoch": 0.32027159030858166,
"grad_norm": 1.2378392219543457,
"learning_rate": 8.006885921892828e-06,
"loss": 2.0601,
"step": 20000
},
{
"epoch": 0.3218729482601246,
"grad_norm": 1.9206656217575073,
"learning_rate": 8.046920351502294e-06,
"loss": 2.0576,
"step": 20100
},
{
"epoch": 0.3234743062116675,
"grad_norm": 1.6953002214431763,
"learning_rate": 8.086954781111756e-06,
"loss": 2.0502,
"step": 20200
},
{
"epoch": 0.3250756641632104,
"grad_norm": 1.6600971221923828,
"learning_rate": 8.126989210721221e-06,
"loss": 2.0589,
"step": 20300
},
{
"epoch": 0.3266770221147533,
"grad_norm": 2.360778331756592,
"learning_rate": 8.167023640330685e-06,
"loss": 2.0591,
"step": 20400
},
{
"epoch": 0.3282783800662962,
"grad_norm": 1.5475653409957886,
"learning_rate": 8.207058069940149e-06,
"loss": 2.0703,
"step": 20500
},
{
"epoch": 0.32987973801783915,
"grad_norm": 1.2999683618545532,
"learning_rate": 8.247092499549614e-06,
"loss": 2.0651,
"step": 20600
},
{
"epoch": 0.331481095969382,
"grad_norm": 3.301884889602661,
"learning_rate": 8.287126929159077e-06,
"loss": 2.0485,
"step": 20700
},
{
"epoch": 0.33308245392092495,
"grad_norm": 3.200942277908325,
"learning_rate": 8.327161358768542e-06,
"loss": 2.0444,
"step": 20800
},
{
"epoch": 0.3346838118724679,
"grad_norm": 1.2649630308151245,
"learning_rate": 8.367195788378006e-06,
"loss": 2.0526,
"step": 20900
},
{
"epoch": 0.33628516982401074,
"grad_norm": 1.187700867652893,
"learning_rate": 8.40723021798747e-06,
"loss": 2.0651,
"step": 21000
},
{
"epoch": 0.33788652777555367,
"grad_norm": 1.5766338109970093,
"learning_rate": 8.447264647596935e-06,
"loss": 2.0575,
"step": 21100
},
{
"epoch": 0.3394878857270966,
"grad_norm": 1.1678153276443481,
"learning_rate": 8.487299077206397e-06,
"loss": 2.0394,
"step": 21200
},
{
"epoch": 0.34108924367863946,
"grad_norm": 1.978745698928833,
"learning_rate": 8.527333506815863e-06,
"loss": 2.0434,
"step": 21300
},
{
"epoch": 0.3426906016301824,
"grad_norm": 1.311265230178833,
"learning_rate": 8.567367936425327e-06,
"loss": 2.0423,
"step": 21400
},
{
"epoch": 0.3442919595817253,
"grad_norm": 1.4099359512329102,
"learning_rate": 8.60740236603479e-06,
"loss": 2.0375,
"step": 21500
},
{
"epoch": 0.3458933175332682,
"grad_norm": 1.2521507740020752,
"learning_rate": 8.647436795644254e-06,
"loss": 2.0355,
"step": 21600
},
{
"epoch": 0.3474946754848111,
"grad_norm": 2.544433832168579,
"learning_rate": 8.687471225253718e-06,
"loss": 2.0351,
"step": 21700
},
{
"epoch": 0.34909603343635404,
"grad_norm": 1.6786710023880005,
"learning_rate": 8.727505654863184e-06,
"loss": 2.0544,
"step": 21800
},
{
"epoch": 0.35069739138789696,
"grad_norm": 1.224026083946228,
"learning_rate": 8.767540084472647e-06,
"loss": 2.0406,
"step": 21900
},
{
"epoch": 0.35229874933943983,
"grad_norm": 7.5012431144714355,
"learning_rate": 8.807574514082111e-06,
"loss": 2.0355,
"step": 22000
},
{
"epoch": 0.35390010729098276,
"grad_norm": 1.4292916059494019,
"learning_rate": 8.847608943691575e-06,
"loss": 2.0445,
"step": 22100
},
{
"epoch": 0.3555014652425257,
"grad_norm": 1.1762036085128784,
"learning_rate": 8.887643373301039e-06,
"loss": 2.0358,
"step": 22200
},
{
"epoch": 0.35710282319406855,
"grad_norm": 1.1497453451156616,
"learning_rate": 8.927677802910504e-06,
"loss": 2.0411,
"step": 22300
},
{
"epoch": 0.3587041811456115,
"grad_norm": 1.7819931507110596,
"learning_rate": 8.967712232519968e-06,
"loss": 2.0414,
"step": 22400
},
{
"epoch": 0.3603055390971544,
"grad_norm": 4.624775409698486,
"learning_rate": 9.007746662129432e-06,
"loss": 2.0309,
"step": 22500
},
{
"epoch": 0.3619068970486973,
"grad_norm": 1.5174845457077026,
"learning_rate": 9.047781091738896e-06,
"loss": 2.0494,
"step": 22600
},
{
"epoch": 0.3635082550002402,
"grad_norm": 2.5349197387695312,
"learning_rate": 9.08781552134836e-06,
"loss": 2.0199,
"step": 22700
},
{
"epoch": 0.3651096129517831,
"grad_norm": 1.4281384944915771,
"learning_rate": 9.127849950957825e-06,
"loss": 2.0461,
"step": 22800
},
{
"epoch": 0.366710970903326,
"grad_norm": 1.4501956701278687,
"learning_rate": 9.167884380567289e-06,
"loss": 2.0275,
"step": 22900
},
{
"epoch": 0.3683123288548689,
"grad_norm": 1.7848312854766846,
"learning_rate": 9.207918810176753e-06,
"loss": 2.0459,
"step": 23000
},
{
"epoch": 0.36991368680641185,
"grad_norm": 1.2266578674316406,
"learning_rate": 9.247953239786217e-06,
"loss": 2.0382,
"step": 23100
},
{
"epoch": 0.37151504475795477,
"grad_norm": 2.917593002319336,
"learning_rate": 9.28798766939568e-06,
"loss": 2.0338,
"step": 23200
},
{
"epoch": 0.37311640270949764,
"grad_norm": 1.7669585943222046,
"learning_rate": 9.328022099005146e-06,
"loss": 2.0098,
"step": 23300
},
{
"epoch": 0.37471776066104057,
"grad_norm": 1.3076069355010986,
"learning_rate": 9.36805652861461e-06,
"loss": 2.0259,
"step": 23400
},
{
"epoch": 0.3763191186125835,
"grad_norm": 1.26585054397583,
"learning_rate": 9.408090958224073e-06,
"loss": 2.0096,
"step": 23500
},
{
"epoch": 0.37792047656412636,
"grad_norm": 1.330881953239441,
"learning_rate": 9.448125387833537e-06,
"loss": 2.0141,
"step": 23600
},
{
"epoch": 0.3795218345156693,
"grad_norm": 1.3129397630691528,
"learning_rate": 9.488159817443003e-06,
"loss": 2.0351,
"step": 23700
},
{
"epoch": 0.3811231924672122,
"grad_norm": 2.2104837894439697,
"learning_rate": 9.528194247052467e-06,
"loss": 2.0458,
"step": 23800
},
{
"epoch": 0.3827245504187551,
"grad_norm": 4.37896728515625,
"learning_rate": 9.56822867666193e-06,
"loss": 2.0432,
"step": 23900
},
{
"epoch": 0.384325908370298,
"grad_norm": 1.4323294162750244,
"learning_rate": 9.608263106271394e-06,
"loss": 2.028,
"step": 24000
},
{
"epoch": 0.38592726632184093,
"grad_norm": 2.277630567550659,
"learning_rate": 9.648297535880858e-06,
"loss": 2.012,
"step": 24100
},
{
"epoch": 0.3875286242733838,
"grad_norm": 1.0068135261535645,
"learning_rate": 9.688331965490324e-06,
"loss": 2.0153,
"step": 24200
},
{
"epoch": 0.38912998222492673,
"grad_norm": 1.464872121810913,
"learning_rate": 9.728366395099786e-06,
"loss": 2.0255,
"step": 24300
},
{
"epoch": 0.39073134017646965,
"grad_norm": 1.6919342279434204,
"learning_rate": 9.768400824709251e-06,
"loss": 2.0146,
"step": 24400
},
{
"epoch": 0.3923326981280126,
"grad_norm": 1.4236170053482056,
"learning_rate": 9.808435254318715e-06,
"loss": 2.0235,
"step": 24500
},
{
"epoch": 0.39393405607955545,
"grad_norm": 1.2634207010269165,
"learning_rate": 9.848469683928179e-06,
"loss": 2.0067,
"step": 24600
},
{
"epoch": 0.3955354140310984,
"grad_norm": 1.185770034790039,
"learning_rate": 9.888504113537644e-06,
"loss": 2.0249,
"step": 24700
},
{
"epoch": 0.3971367719826413,
"grad_norm": 1.6554452180862427,
"learning_rate": 9.928538543147106e-06,
"loss": 2.0224,
"step": 24800
},
{
"epoch": 0.39873812993418417,
"grad_norm": 1.7017241716384888,
"learning_rate": 9.968572972756572e-06,
"loss": 2.0138,
"step": 24900
},
{
"epoch": 0.4003394878857271,
"grad_norm": 1.0250684022903442,
"learning_rate": 1.0008607402366036e-05,
"loss": 2.0082,
"step": 25000
},
{
"epoch": 0.40194084583727,
"grad_norm": 1.3391590118408203,
"learning_rate": 1.0048641831975501e-05,
"loss": 2.008,
"step": 25100
},
{
"epoch": 0.4035422037888129,
"grad_norm": 1.0555273294448853,
"learning_rate": 1.0088676261584965e-05,
"loss": 2.0199,
"step": 25200
},
{
"epoch": 0.4051435617403558,
"grad_norm": 2.1245908737182617,
"learning_rate": 1.0128710691194427e-05,
"loss": 2.0141,
"step": 25300
},
{
"epoch": 0.40674491969189874,
"grad_norm": 1.1639268398284912,
"learning_rate": 1.0168745120803891e-05,
"loss": 2.0274,
"step": 25400
},
{
"epoch": 0.4083462776434416,
"grad_norm": 1.75816011428833,
"learning_rate": 1.0208779550413356e-05,
"loss": 2.0065,
"step": 25500
},
{
"epoch": 0.40994763559498454,
"grad_norm": 3.2224700450897217,
"learning_rate": 1.024881398002282e-05,
"loss": 2.0036,
"step": 25600
},
{
"epoch": 0.41154899354652746,
"grad_norm": 1.0586453676223755,
"learning_rate": 1.0288848409632284e-05,
"loss": 2.0216,
"step": 25700
},
{
"epoch": 0.4131503514980704,
"grad_norm": 1.5636674165725708,
"learning_rate": 1.032888283924175e-05,
"loss": 2.0035,
"step": 25800
},
{
"epoch": 0.41475170944961326,
"grad_norm": 1.287876009941101,
"learning_rate": 1.0368917268851213e-05,
"loss": 2.0033,
"step": 25900
},
{
"epoch": 0.4163530674011562,
"grad_norm": 1.1676390171051025,
"learning_rate": 1.0408951698460677e-05,
"loss": 1.9948,
"step": 26000
},
{
"epoch": 0.4179544253526991,
"grad_norm": 2.230921506881714,
"learning_rate": 1.0448986128070143e-05,
"loss": 1.9747,
"step": 26100
},
{
"epoch": 0.419555783304242,
"grad_norm": 1.1102570295333862,
"learning_rate": 1.0489020557679605e-05,
"loss": 2.002,
"step": 26200
},
{
"epoch": 0.4211571412557849,
"grad_norm": 12.577959060668945,
"learning_rate": 1.0529054987289069e-05,
"loss": 1.9873,
"step": 26300
},
{
"epoch": 0.42275849920732783,
"grad_norm": 1.0285041332244873,
"learning_rate": 1.0569089416898532e-05,
"loss": 2.0182,
"step": 26400
},
{
"epoch": 0.4243598571588707,
"grad_norm": 2.1250357627868652,
"learning_rate": 1.0609123846507998e-05,
"loss": 1.9949,
"step": 26500
},
{
"epoch": 0.4259612151104136,
"grad_norm": 0.90369713306427,
"learning_rate": 1.0649158276117462e-05,
"loss": 2.0081,
"step": 26600
},
{
"epoch": 0.42756257306195655,
"grad_norm": 1.0429993867874146,
"learning_rate": 1.0689192705726926e-05,
"loss": 2.0188,
"step": 26700
},
{
"epoch": 0.4291639310134994,
"grad_norm": 1.2060284614562988,
"learning_rate": 1.0729227135336391e-05,
"loss": 1.9747,
"step": 26800
},
{
"epoch": 0.43076528896504235,
"grad_norm": 1.7947618961334229,
"learning_rate": 1.0769261564945855e-05,
"loss": 1.9963,
"step": 26900
},
{
"epoch": 0.4323666469165853,
"grad_norm": 0.970507025718689,
"learning_rate": 1.0809295994555317e-05,
"loss": 2.0089,
"step": 27000
},
{
"epoch": 0.4339680048681282,
"grad_norm": 1.038913607597351,
"learning_rate": 1.0849330424164784e-05,
"loss": 1.9827,
"step": 27100
},
{
"epoch": 0.43556936281967107,
"grad_norm": 2.165769100189209,
"learning_rate": 1.0889364853774246e-05,
"loss": 1.9961,
"step": 27200
},
{
"epoch": 0.437170720771214,
"grad_norm": 1.196454644203186,
"learning_rate": 1.092939928338371e-05,
"loss": 2.0035,
"step": 27300
},
{
"epoch": 0.4387720787227569,
"grad_norm": 0.956650972366333,
"learning_rate": 1.0969433712993176e-05,
"loss": 1.9841,
"step": 27400
},
{
"epoch": 0.4403734366742998,
"grad_norm": 1.084486961364746,
"learning_rate": 1.100946814260264e-05,
"loss": 1.9789,
"step": 27500
},
{
"epoch": 0.4419747946258427,
"grad_norm": 0.9682411551475525,
"learning_rate": 1.1049502572212103e-05,
"loss": 1.9999,
"step": 27600
},
{
"epoch": 0.44357615257738564,
"grad_norm": 2.1347734928131104,
"learning_rate": 1.1089537001821567e-05,
"loss": 2.0091,
"step": 27700
},
{
"epoch": 0.4451775105289285,
"grad_norm": 4.513906478881836,
"learning_rate": 1.1129571431431033e-05,
"loss": 1.9896,
"step": 27800
},
{
"epoch": 0.44677886848047144,
"grad_norm": 1.6367132663726807,
"learning_rate": 1.1169605861040496e-05,
"loss": 1.9781,
"step": 27900
},
{
"epoch": 0.44838022643201436,
"grad_norm": 1.0168904066085815,
"learning_rate": 1.1209640290649958e-05,
"loss": 1.9924,
"step": 28000
},
{
"epoch": 0.44998158438355723,
"grad_norm": 1.7051305770874023,
"learning_rate": 1.1249674720259424e-05,
"loss": 1.9872,
"step": 28100
},
{
"epoch": 0.45158294233510016,
"grad_norm": 0.9768884778022766,
"learning_rate": 1.1289709149868888e-05,
"loss": 1.9809,
"step": 28200
},
{
"epoch": 0.4531843002866431,
"grad_norm": 1.0439552068710327,
"learning_rate": 1.1329743579478352e-05,
"loss": 1.999,
"step": 28300
},
{
"epoch": 0.454785658238186,
"grad_norm": 0.9658423066139221,
"learning_rate": 1.1369778009087817e-05,
"loss": 2.0104,
"step": 28400
},
{
"epoch": 0.4563870161897289,
"grad_norm": 0.9558666944503784,
"learning_rate": 1.1409812438697281e-05,
"loss": 2.0104,
"step": 28500
},
{
"epoch": 0.4579883741412718,
"grad_norm": 1.591242790222168,
"learning_rate": 1.1449846868306745e-05,
"loss": 1.9888,
"step": 28600
},
{
"epoch": 0.45958973209281473,
"grad_norm": 1.8828788995742798,
"learning_rate": 1.148988129791621e-05,
"loss": 1.9951,
"step": 28700
},
{
"epoch": 0.4611910900443576,
"grad_norm": 1.1350332498550415,
"learning_rate": 1.1529915727525674e-05,
"loss": 1.9842,
"step": 28800
},
{
"epoch": 0.4627924479959005,
"grad_norm": 1.6506210565567017,
"learning_rate": 1.1569950157135136e-05,
"loss": 1.9927,
"step": 28900
},
{
"epoch": 0.46439380594744345,
"grad_norm": 1.0234204530715942,
"learning_rate": 1.16099845867446e-05,
"loss": 1.9981,
"step": 29000
},
{
"epoch": 0.4659951638989863,
"grad_norm": 0.9220559597015381,
"learning_rate": 1.1650019016354065e-05,
"loss": 1.9772,
"step": 29100
},
{
"epoch": 0.46759652185052925,
"grad_norm": 1.008548617362976,
"learning_rate": 1.169005344596353e-05,
"loss": 1.9885,
"step": 29200
},
{
"epoch": 0.46919787980207217,
"grad_norm": 1.0374430418014526,
"learning_rate": 1.1730087875572993e-05,
"loss": 1.9901,
"step": 29300
},
{
"epoch": 0.47079923775361504,
"grad_norm": 1.4683129787445068,
"learning_rate": 1.1770122305182459e-05,
"loss": 1.9905,
"step": 29400
},
{
"epoch": 0.47240059570515797,
"grad_norm": 2.1045260429382324,
"learning_rate": 1.1810156734791922e-05,
"loss": 1.9764,
"step": 29500
},
{
"epoch": 0.4740019536567009,
"grad_norm": 0.9143902063369751,
"learning_rate": 1.1850191164401386e-05,
"loss": 1.9914,
"step": 29600
},
{
"epoch": 0.4756033116082438,
"grad_norm": 1.0126798152923584,
"learning_rate": 1.1890225594010852e-05,
"loss": 1.9559,
"step": 29700
},
{
"epoch": 0.4772046695597867,
"grad_norm": 1.282818078994751,
"learning_rate": 1.1930260023620316e-05,
"loss": 1.9927,
"step": 29800
},
{
"epoch": 0.4788060275113296,
"grad_norm": 1.2307484149932861,
"learning_rate": 1.1970294453229778e-05,
"loss": 1.9825,
"step": 29900
},
{
"epoch": 0.48040738546287254,
"grad_norm": 1.429739236831665,
"learning_rate": 1.2010328882839241e-05,
"loss": 1.9616,
"step": 30000
},
{
"epoch": 0.4820087434144154,
"grad_norm": 1.5777498483657837,
"learning_rate": 1.2050363312448707e-05,
"loss": 1.9821,
"step": 30100
},
{
"epoch": 0.48361010136595833,
"grad_norm": 1.1172056198120117,
"learning_rate": 1.209039774205817e-05,
"loss": 1.9669,
"step": 30200
},
{
"epoch": 0.48521145931750126,
"grad_norm": 1.8118427991867065,
"learning_rate": 1.2130432171667635e-05,
"loss": 1.9555,
"step": 30300
},
{
"epoch": 0.48681281726904413,
"grad_norm": 5.031758785247803,
"learning_rate": 1.21704666012771e-05,
"loss": 1.958,
"step": 30400
},
{
"epoch": 0.48841417522058705,
"grad_norm": 1.171064853668213,
"learning_rate": 1.2210501030886564e-05,
"loss": 1.9697,
"step": 30500
},
{
"epoch": 0.49001553317213,
"grad_norm": 1.6317328214645386,
"learning_rate": 1.2250535460496028e-05,
"loss": 1.9722,
"step": 30600
},
{
"epoch": 0.49161689112367285,
"grad_norm": 0.9671623110771179,
"learning_rate": 1.2290569890105493e-05,
"loss": 1.9659,
"step": 30700
},
{
"epoch": 0.4932182490752158,
"grad_norm": 1.0588128566741943,
"learning_rate": 1.2330604319714955e-05,
"loss": 1.9534,
"step": 30800
},
{
"epoch": 0.4948196070267587,
"grad_norm": 1.1236603260040283,
"learning_rate": 1.237063874932442e-05,
"loss": 1.9505,
"step": 30900
},
{
"epoch": 0.4964209649783016,
"grad_norm": 1.175752878189087,
"learning_rate": 1.2410673178933885e-05,
"loss": 1.9712,
"step": 31000
},
{
"epoch": 0.4980223229298445,
"grad_norm": 1.0395989418029785,
"learning_rate": 1.2450707608543348e-05,
"loss": 1.9493,
"step": 31100
},
{
"epoch": 0.4996236808813874,
"grad_norm": 0.9693764448165894,
"learning_rate": 1.2490742038152812e-05,
"loss": 1.9581,
"step": 31200
},
{
"epoch": 0.5012250388329303,
"grad_norm": 1.100197434425354,
"learning_rate": 1.2530776467762276e-05,
"loss": 1.955,
"step": 31300
},
{
"epoch": 0.5028263967844733,
"grad_norm": 1.3823459148406982,
"learning_rate": 1.2570810897371742e-05,
"loss": 1.9734,
"step": 31400
},
{
"epoch": 0.5044277547360161,
"grad_norm": 0.9062979221343994,
"learning_rate": 1.2610845326981205e-05,
"loss": 1.9612,
"step": 31500
},
{
"epoch": 0.506029112687559,
"grad_norm": 1.0007665157318115,
"learning_rate": 1.2650879756590668e-05,
"loss": 1.9664,
"step": 31600
},
{
"epoch": 0.5076304706391019,
"grad_norm": 0.9745628833770752,
"learning_rate": 1.2690914186200135e-05,
"loss": 1.9648,
"step": 31700
},
{
"epoch": 0.5092318285906449,
"grad_norm": 1.407834768295288,
"learning_rate": 1.2730948615809597e-05,
"loss": 1.9562,
"step": 31800
},
{
"epoch": 0.5108331865421878,
"grad_norm": 1.207322597503662,
"learning_rate": 1.277098304541906e-05,
"loss": 1.9696,
"step": 31900
},
{
"epoch": 0.5124345444937307,
"grad_norm": 1.4670792818069458,
"learning_rate": 1.2811017475028526e-05,
"loss": 1.9524,
"step": 32000
},
{
"epoch": 0.5140359024452736,
"grad_norm": 1.023777961730957,
"learning_rate": 1.285105190463799e-05,
"loss": 1.97,
"step": 32100
},
{
"epoch": 0.5156372603968165,
"grad_norm": 0.9778289198875427,
"learning_rate": 1.2891086334247454e-05,
"loss": 1.9494,
"step": 32200
},
{
"epoch": 0.5172386183483594,
"grad_norm": 0.8971097469329834,
"learning_rate": 1.2931120763856918e-05,
"loss": 1.9628,
"step": 32300
},
{
"epoch": 0.5188399762999023,
"grad_norm": 1.8562573194503784,
"learning_rate": 1.2971155193466383e-05,
"loss": 1.9543,
"step": 32400
},
{
"epoch": 0.5204413342514452,
"grad_norm": 1.7294055223464966,
"learning_rate": 1.3011189623075847e-05,
"loss": 1.9519,
"step": 32500
},
{
"epoch": 0.5220426922029882,
"grad_norm": 1.2172763347625732,
"learning_rate": 1.3051224052685309e-05,
"loss": 1.9758,
"step": 32600
},
{
"epoch": 0.5236440501545311,
"grad_norm": 1.144281268119812,
"learning_rate": 1.3091258482294775e-05,
"loss": 1.9589,
"step": 32700
},
{
"epoch": 0.5252454081060739,
"grad_norm": 1.057813048362732,
"learning_rate": 1.3131292911904238e-05,
"loss": 1.9443,
"step": 32800
},
{
"epoch": 0.5268467660576168,
"grad_norm": 1.297404170036316,
"learning_rate": 1.3171327341513702e-05,
"loss": 1.9614,
"step": 32900
},
{
"epoch": 0.5284481240091597,
"grad_norm": 1.0840290784835815,
"learning_rate": 1.3211361771123168e-05,
"loss": 1.9633,
"step": 33000
},
{
"epoch": 0.5300494819607027,
"grad_norm": 1.0041546821594238,
"learning_rate": 1.3251396200732631e-05,
"loss": 1.9484,
"step": 33100
},
{
"epoch": 0.5316508399122456,
"grad_norm": 1.780435562133789,
"learning_rate": 1.3291430630342095e-05,
"loss": 1.9438,
"step": 33200
},
{
"epoch": 0.5332521978637885,
"grad_norm": 0.9901188015937805,
"learning_rate": 1.333146505995156e-05,
"loss": 1.9384,
"step": 33300
},
{
"epoch": 0.5348535558153314,
"grad_norm": 0.9118313789367676,
"learning_rate": 1.3371499489561025e-05,
"loss": 1.9507,
"step": 33400
},
{
"epoch": 0.5364549137668743,
"grad_norm": 1.0270628929138184,
"learning_rate": 1.3411533919170487e-05,
"loss": 1.9685,
"step": 33500
},
{
"epoch": 0.5380562717184172,
"grad_norm": 2.4503536224365234,
"learning_rate": 1.345156834877995e-05,
"loss": 1.9481,
"step": 33600
},
{
"epoch": 0.5396576296699601,
"grad_norm": 1.1191452741622925,
"learning_rate": 1.3491602778389416e-05,
"loss": 1.9529,
"step": 33700
},
{
"epoch": 0.541258987621503,
"grad_norm": 0.8804434537887573,
"learning_rate": 1.353163720799888e-05,
"loss": 1.9591,
"step": 33800
},
{
"epoch": 0.542860345573046,
"grad_norm": 1.1734013557434082,
"learning_rate": 1.3571671637608344e-05,
"loss": 1.9643,
"step": 33900
},
{
"epoch": 0.5444617035245889,
"grad_norm": 0.9487005472183228,
"learning_rate": 1.361170606721781e-05,
"loss": 1.9408,
"step": 34000
},
{
"epoch": 0.5460630614761317,
"grad_norm": 1.025894045829773,
"learning_rate": 1.3651740496827273e-05,
"loss": 1.9682,
"step": 34100
},
{
"epoch": 0.5476644194276746,
"grad_norm": 1.3745815753936768,
"learning_rate": 1.3691774926436737e-05,
"loss": 1.9441,
"step": 34200
},
{
"epoch": 0.5492657773792176,
"grad_norm": 0.9772420525550842,
"learning_rate": 1.3731809356046202e-05,
"loss": 1.9593,
"step": 34300
},
{
"epoch": 0.5508671353307605,
"grad_norm": 0.8825002908706665,
"learning_rate": 1.3771843785655666e-05,
"loss": 1.9413,
"step": 34400
},
{
"epoch": 0.5524684932823034,
"grad_norm": 2.0654349327087402,
"learning_rate": 1.3811878215265128e-05,
"loss": 1.9478,
"step": 34500
},
{
"epoch": 0.5540698512338463,
"grad_norm": 0.9932202696800232,
"learning_rate": 1.3851912644874592e-05,
"loss": 1.9529,
"step": 34600
},
{
"epoch": 0.5556712091853893,
"grad_norm": 0.923985481262207,
"learning_rate": 1.3891947074484058e-05,
"loss": 1.9542,
"step": 34700
},
{
"epoch": 0.5572725671369321,
"grad_norm": 1.2756383419036865,
"learning_rate": 1.3931981504093521e-05,
"loss": 1.9437,
"step": 34800
},
{
"epoch": 0.558873925088475,
"grad_norm": 1.025530457496643,
"learning_rate": 1.3972015933702985e-05,
"loss": 1.9479,
"step": 34900
},
{
"epoch": 0.5604752830400179,
"grad_norm": 0.9658239483833313,
"learning_rate": 1.401205036331245e-05,
"loss": 1.9392,
"step": 35000
},
{
"epoch": 0.5620766409915608,
"grad_norm": 1.0094221830368042,
"learning_rate": 1.4052084792921914e-05,
"loss": 1.9311,
"step": 35100
},
{
"epoch": 0.5636779989431038,
"grad_norm": 0.933716893196106,
"learning_rate": 1.4092119222531378e-05,
"loss": 1.9605,
"step": 35200
},
{
"epoch": 0.5652793568946467,
"grad_norm": 1.0568841695785522,
"learning_rate": 1.4132153652140844e-05,
"loss": 1.9453,
"step": 35300
},
{
"epoch": 0.5668807148461895,
"grad_norm": 0.9029392004013062,
"learning_rate": 1.4172188081750306e-05,
"loss": 1.9327,
"step": 35400
},
{
"epoch": 0.5684820727977324,
"grad_norm": 0.9875580668449402,
"learning_rate": 1.421222251135977e-05,
"loss": 1.9405,
"step": 35500
},
{
"epoch": 0.5700834307492754,
"grad_norm": 0.9351832270622253,
"learning_rate": 1.4252256940969235e-05,
"loss": 1.9527,
"step": 35600
},
{
"epoch": 0.5716847887008183,
"grad_norm": 1.1400425434112549,
"learning_rate": 1.4292291370578699e-05,
"loss": 1.9451,
"step": 35700
},
{
"epoch": 0.5732861466523612,
"grad_norm": 0.971022367477417,
"learning_rate": 1.4332325800188163e-05,
"loss": 1.9336,
"step": 35800
},
{
"epoch": 0.5748875046039041,
"grad_norm": 0.8905283808708191,
"learning_rate": 1.4372360229797627e-05,
"loss": 1.9518,
"step": 35900
},
{
"epoch": 0.5764888625554471,
"grad_norm": 1.2511688470840454,
"learning_rate": 1.4412394659407092e-05,
"loss": 1.9276,
"step": 36000
},
{
"epoch": 0.5780902205069899,
"grad_norm": 1.2555015087127686,
"learning_rate": 1.4452429089016556e-05,
"loss": 1.9306,
"step": 36100
},
{
"epoch": 0.5796915784585328,
"grad_norm": 2.5456793308258057,
"learning_rate": 1.4492463518626018e-05,
"loss": 1.9212,
"step": 36200
},
{
"epoch": 0.5812929364100757,
"grad_norm": 5.189430236816406,
"learning_rate": 1.4532497948235485e-05,
"loss": 1.9298,
"step": 36300
},
{
"epoch": 0.5828942943616187,
"grad_norm": 0.8082601428031921,
"learning_rate": 1.4572532377844947e-05,
"loss": 1.9289,
"step": 36400
},
{
"epoch": 0.5844956523131616,
"grad_norm": 1.2962714433670044,
"learning_rate": 1.4612566807454411e-05,
"loss": 1.9303,
"step": 36500
},
{
"epoch": 0.5860970102647045,
"grad_norm": 1.9360517263412476,
"learning_rate": 1.4652601237063877e-05,
"loss": 1.9155,
"step": 36600
},
{
"epoch": 0.5876983682162473,
"grad_norm": 1.16732919216156,
"learning_rate": 1.469263566667334e-05,
"loss": 1.9132,
"step": 36700
},
{
"epoch": 0.5892997261677902,
"grad_norm": 0.8907911777496338,
"learning_rate": 1.4732670096282804e-05,
"loss": 1.9312,
"step": 36800
},
{
"epoch": 0.5909010841193332,
"grad_norm": 0.9275608062744141,
"learning_rate": 1.477270452589227e-05,
"loss": 1.9638,
"step": 36900
},
{
"epoch": 0.5925024420708761,
"grad_norm": 1.2977879047393799,
"learning_rate": 1.4812738955501734e-05,
"loss": 1.9372,
"step": 37000
},
{
"epoch": 0.594103800022419,
"grad_norm": 1.1967015266418457,
"learning_rate": 1.4852773385111196e-05,
"loss": 1.9319,
"step": 37100
},
{
"epoch": 0.595705157973962,
"grad_norm": 1.0788534879684448,
"learning_rate": 1.489280781472066e-05,
"loss": 1.9326,
"step": 37200
},
{
"epoch": 0.5973065159255049,
"grad_norm": 0.8467668890953064,
"learning_rate": 1.4932842244330125e-05,
"loss": 1.9238,
"step": 37300
},
{
"epoch": 0.5989078738770477,
"grad_norm": 0.8952154517173767,
"learning_rate": 1.4972876673939589e-05,
"loss": 1.926,
"step": 37400
},
{
"epoch": 0.6005092318285906,
"grad_norm": 0.8892629742622375,
"learning_rate": 1.5012911103549053e-05,
"loss": 1.943,
"step": 37500
},
{
"epoch": 0.6021105897801335,
"grad_norm": 0.8832671642303467,
"learning_rate": 1.5052945533158518e-05,
"loss": 1.9035,
"step": 37600
},
{
"epoch": 0.6037119477316765,
"grad_norm": 1.0101639032363892,
"learning_rate": 1.5092979962767982e-05,
"loss": 1.9282,
"step": 37700
},
{
"epoch": 0.6053133056832194,
"grad_norm": 0.9980772733688354,
"learning_rate": 1.5133014392377446e-05,
"loss": 1.9334,
"step": 37800
},
{
"epoch": 0.6069146636347623,
"grad_norm": 0.9352878332138062,
"learning_rate": 1.5173048821986911e-05,
"loss": 1.9286,
"step": 37900
},
{
"epoch": 0.6085160215863051,
"grad_norm": 0.9329906105995178,
"learning_rate": 1.5213083251596375e-05,
"loss": 1.9133,
"step": 38000
},
{
"epoch": 0.6101173795378481,
"grad_norm": 1.0744600296020508,
"learning_rate": 1.5253117681205837e-05,
"loss": 1.9431,
"step": 38100
},
{
"epoch": 0.611718737489391,
"grad_norm": 1.1284574270248413,
"learning_rate": 1.52931521108153e-05,
"loss": 1.9236,
"step": 38200
},
{
"epoch": 0.6133200954409339,
"grad_norm": 0.7931867241859436,
"learning_rate": 1.5333186540424767e-05,
"loss": 1.9239,
"step": 38300
},
{
"epoch": 0.6149214533924768,
"grad_norm": 0.9535111784934998,
"learning_rate": 1.5373220970034232e-05,
"loss": 1.933,
"step": 38400
},
{
"epoch": 0.6165228113440198,
"grad_norm": 1.1604766845703125,
"learning_rate": 1.5413255399643694e-05,
"loss": 1.9118,
"step": 38500
},
{
"epoch": 0.6181241692955627,
"grad_norm": 0.9939236640930176,
"learning_rate": 1.545328982925316e-05,
"loss": 1.9004,
"step": 38600
},
{
"epoch": 0.6197255272471055,
"grad_norm": 0.901757538318634,
"learning_rate": 1.5493324258862622e-05,
"loss": 1.9198,
"step": 38700
},
{
"epoch": 0.6213268851986484,
"grad_norm": 1.034832239151001,
"learning_rate": 1.5533358688472087e-05,
"loss": 1.9175,
"step": 38800
},
{
"epoch": 0.6229282431501914,
"grad_norm": 0.8186530470848083,
"learning_rate": 1.5573393118081553e-05,
"loss": 1.9098,
"step": 38900
},
{
"epoch": 0.6245296011017343,
"grad_norm": 1.0724900960922241,
"learning_rate": 1.5613427547691015e-05,
"loss": 1.9143,
"step": 39000
},
{
"epoch": 0.6261309590532772,
"grad_norm": 0.9440537691116333,
"learning_rate": 1.565346197730048e-05,
"loss": 1.9327,
"step": 39100
},
{
"epoch": 0.6277323170048201,
"grad_norm": 0.9175347089767456,
"learning_rate": 1.5693496406909946e-05,
"loss": 1.9097,
"step": 39200
},
{
"epoch": 0.6293336749563629,
"grad_norm": 1.075506567955017,
"learning_rate": 1.5733530836519408e-05,
"loss": 1.9148,
"step": 39300
},
{
"epoch": 0.6309350329079059,
"grad_norm": 1.156162142753601,
"learning_rate": 1.577356526612887e-05,
"loss": 1.928,
"step": 39400
},
{
"epoch": 0.6325363908594488,
"grad_norm": 1.2199561595916748,
"learning_rate": 1.5813599695738336e-05,
"loss": 1.9212,
"step": 39500
},
{
"epoch": 0.6341377488109917,
"grad_norm": 1.088230848312378,
"learning_rate": 1.58536341253478e-05,
"loss": 1.9147,
"step": 39600
},
{
"epoch": 0.6357391067625346,
"grad_norm": 0.911685049533844,
"learning_rate": 1.5893668554957263e-05,
"loss": 1.914,
"step": 39700
},
{
"epoch": 0.6373404647140776,
"grad_norm": 0.8977714776992798,
"learning_rate": 1.593370298456673e-05,
"loss": 1.9212,
"step": 39800
},
{
"epoch": 0.6389418226656205,
"grad_norm": 0.9816354513168335,
"learning_rate": 1.5973737414176194e-05,
"loss": 1.9046,
"step": 39900
},
{
"epoch": 0.6405431806171633,
"grad_norm": 0.88201904296875,
"learning_rate": 1.6013771843785656e-05,
"loss": 1.9359,
"step": 40000
},
{
"epoch": 0.6421445385687062,
"grad_norm": 0.9104109406471252,
"learning_rate": 1.6053806273395122e-05,
"loss": 1.9302,
"step": 40100
},
{
"epoch": 0.6437458965202492,
"grad_norm": 1.5256859064102173,
"learning_rate": 1.6093840703004587e-05,
"loss": 1.9226,
"step": 40200
},
{
"epoch": 0.6453472544717921,
"grad_norm": 0.8858827948570251,
"learning_rate": 1.613387513261405e-05,
"loss": 1.9143,
"step": 40300
},
{
"epoch": 0.646948612423335,
"grad_norm": 1.480420470237732,
"learning_rate": 1.617390956222351e-05,
"loss": 1.9171,
"step": 40400
},
{
"epoch": 0.6485499703748779,
"grad_norm": 0.9443252682685852,
"learning_rate": 1.6213943991832977e-05,
"loss": 1.9104,
"step": 40500
},
{
"epoch": 0.6501513283264208,
"grad_norm": 1.4180731773376465,
"learning_rate": 1.6253978421442443e-05,
"loss": 1.9015,
"step": 40600
},
{
"epoch": 0.6517526862779637,
"grad_norm": 1.0369699001312256,
"learning_rate": 1.6294012851051905e-05,
"loss": 1.9085,
"step": 40700
},
{
"epoch": 0.6533540442295066,
"grad_norm": 1.0155749320983887,
"learning_rate": 1.633404728066137e-05,
"loss": 1.8968,
"step": 40800
},
{
"epoch": 0.6549554021810495,
"grad_norm": 1.0214248895645142,
"learning_rate": 1.6374081710270836e-05,
"loss": 1.9109,
"step": 40900
},
{
"epoch": 0.6565567601325925,
"grad_norm": 1.2233892679214478,
"learning_rate": 1.6414116139880298e-05,
"loss": 1.8968,
"step": 41000
},
{
"epoch": 0.6581581180841354,
"grad_norm": 0.8677876591682434,
"learning_rate": 1.6454150569489763e-05,
"loss": 1.9121,
"step": 41100
},
{
"epoch": 0.6597594760356783,
"grad_norm": 0.8257797956466675,
"learning_rate": 1.649418499909923e-05,
"loss": 1.9329,
"step": 41200
},
{
"epoch": 0.6613608339872211,
"grad_norm": 0.904925525188446,
"learning_rate": 1.653421942870869e-05,
"loss": 1.8934,
"step": 41300
},
{
"epoch": 0.662962191938764,
"grad_norm": 0.8754270672798157,
"learning_rate": 1.6574253858318153e-05,
"loss": 1.8885,
"step": 41400
},
{
"epoch": 0.664563549890307,
"grad_norm": 0.9102962613105774,
"learning_rate": 1.661428828792762e-05,
"loss": 1.9046,
"step": 41500
},
{
"epoch": 0.6661649078418499,
"grad_norm": 0.9199568033218384,
"learning_rate": 1.6654322717537084e-05,
"loss": 1.9122,
"step": 41600
},
{
"epoch": 0.6677662657933928,
"grad_norm": 0.9582586288452148,
"learning_rate": 1.6694357147146546e-05,
"loss": 1.8959,
"step": 41700
},
{
"epoch": 0.6693676237449357,
"grad_norm": 0.8151847124099731,
"learning_rate": 1.6734391576756012e-05,
"loss": 1.887,
"step": 41800
},
{
"epoch": 0.6709689816964786,
"grad_norm": 0.9953237771987915,
"learning_rate": 1.6774426006365477e-05,
"loss": 1.9236,
"step": 41900
},
{
"epoch": 0.6725703396480215,
"grad_norm": 1.465527057647705,
"learning_rate": 1.681446043597494e-05,
"loss": 1.9136,
"step": 42000
},
{
"epoch": 0.6741716975995644,
"grad_norm": 0.9603108763694763,
"learning_rate": 1.68544948655844e-05,
"loss": 1.8941,
"step": 42100
},
{
"epoch": 0.6757730555511073,
"grad_norm": 0.8624867796897888,
"learning_rate": 1.689452929519387e-05,
"loss": 1.905,
"step": 42200
},
{
"epoch": 0.6773744135026503,
"grad_norm": 0.9774655699729919,
"learning_rate": 1.6934563724803333e-05,
"loss": 1.9156,
"step": 42300
},
{
"epoch": 0.6789757714541932,
"grad_norm": 2.9199743270874023,
"learning_rate": 1.6974598154412795e-05,
"loss": 1.9126,
"step": 42400
},
{
"epoch": 0.6805771294057361,
"grad_norm": 1.2201206684112549,
"learning_rate": 1.701463258402226e-05,
"loss": 1.8976,
"step": 42500
},
{
"epoch": 0.6821784873572789,
"grad_norm": 1.0182702541351318,
"learning_rate": 1.7054667013631726e-05,
"loss": 1.8968,
"step": 42600
},
{
"epoch": 0.6837798453088219,
"grad_norm": 1.134906530380249,
"learning_rate": 1.7094701443241188e-05,
"loss": 1.9361,
"step": 42700
},
{
"epoch": 0.6853812032603648,
"grad_norm": 1.635399341583252,
"learning_rate": 1.7134735872850653e-05,
"loss": 1.919,
"step": 42800
},
{
"epoch": 0.6869825612119077,
"grad_norm": 0.8835542798042297,
"learning_rate": 1.717477030246012e-05,
"loss": 1.8776,
"step": 42900
},
{
"epoch": 0.6885839191634506,
"grad_norm": 0.9510149955749512,
"learning_rate": 1.721480473206958e-05,
"loss": 1.9036,
"step": 43000
},
{
"epoch": 0.6901852771149936,
"grad_norm": 0.8410897850990295,
"learning_rate": 1.7254839161679043e-05,
"loss": 1.9091,
"step": 43100
},
{
"epoch": 0.6917866350665364,
"grad_norm": 1.4297950267791748,
"learning_rate": 1.729487359128851e-05,
"loss": 1.8934,
"step": 43200
},
{
"epoch": 0.6933879930180793,
"grad_norm": 0.9010776877403259,
"learning_rate": 1.7334908020897974e-05,
"loss": 1.8997,
"step": 43300
},
{
"epoch": 0.6949893509696222,
"grad_norm": 0.8833039999008179,
"learning_rate": 1.7374942450507436e-05,
"loss": 1.892,
"step": 43400
},
{
"epoch": 0.6965907089211651,
"grad_norm": 0.9560312032699585,
"learning_rate": 1.74149768801169e-05,
"loss": 1.8977,
"step": 43500
},
{
"epoch": 0.6981920668727081,
"grad_norm": 0.8603575825691223,
"learning_rate": 1.7455011309726367e-05,
"loss": 1.8881,
"step": 43600
},
{
"epoch": 0.699793424824251,
"grad_norm": 0.8545820116996765,
"learning_rate": 1.749504573933583e-05,
"loss": 1.8992,
"step": 43700
},
{
"epoch": 0.7013947827757939,
"grad_norm": 1.6138850450515747,
"learning_rate": 1.7535080168945295e-05,
"loss": 1.8823,
"step": 43800
},
{
"epoch": 0.7029961407273367,
"grad_norm": 1.339882731437683,
"learning_rate": 1.757511459855476e-05,
"loss": 1.8942,
"step": 43900
},
{
"epoch": 0.7045974986788797,
"grad_norm": 0.8209664225578308,
"learning_rate": 1.7615149028164222e-05,
"loss": 1.885,
"step": 44000
},
{
"epoch": 0.7061988566304226,
"grad_norm": 0.8096824884414673,
"learning_rate": 1.7655183457773685e-05,
"loss": 1.8738,
"step": 44100
},
{
"epoch": 0.7078002145819655,
"grad_norm": 1.0560259819030762,
"learning_rate": 1.769521788738315e-05,
"loss": 1.8941,
"step": 44200
},
{
"epoch": 0.7094015725335084,
"grad_norm": 1.1268258094787598,
"learning_rate": 1.7735252316992616e-05,
"loss": 1.8974,
"step": 44300
},
{
"epoch": 0.7110029304850514,
"grad_norm": 0.9307839274406433,
"learning_rate": 1.7775286746602078e-05,
"loss": 1.9122,
"step": 44400
},
{
"epoch": 0.7126042884365942,
"grad_norm": 1.0069445371627808,
"learning_rate": 1.7815321176211543e-05,
"loss": 1.8597,
"step": 44500
},
{
"epoch": 0.7142056463881371,
"grad_norm": 1.2771753072738647,
"learning_rate": 1.785535560582101e-05,
"loss": 1.8819,
"step": 44600
},
{
"epoch": 0.71580700433968,
"grad_norm": 0.7819973230361938,
"learning_rate": 1.789539003543047e-05,
"loss": 1.8722,
"step": 44700
},
{
"epoch": 0.717408362291223,
"grad_norm": 0.8193828463554382,
"learning_rate": 1.7935424465039936e-05,
"loss": 1.8745,
"step": 44800
},
{
"epoch": 0.7190097202427659,
"grad_norm": 0.7969743609428406,
"learning_rate": 1.7975458894649402e-05,
"loss": 1.911,
"step": 44900
},
{
"epoch": 0.7206110781943088,
"grad_norm": 1.4411369562149048,
"learning_rate": 1.8015493324258864e-05,
"loss": 1.8763,
"step": 45000
},
{
"epoch": 0.7222124361458517,
"grad_norm": 1.0016000270843506,
"learning_rate": 1.805552775386833e-05,
"loss": 1.8875,
"step": 45100
},
{
"epoch": 0.7238137940973945,
"grad_norm": 0.8997382521629333,
"learning_rate": 1.809556218347779e-05,
"loss": 1.8766,
"step": 45200
},
{
"epoch": 0.7254151520489375,
"grad_norm": 0.8878375291824341,
"learning_rate": 1.8135596613087257e-05,
"loss": 1.8803,
"step": 45300
},
{
"epoch": 0.7270165100004804,
"grad_norm": 0.8563076853752136,
"learning_rate": 1.817563104269672e-05,
"loss": 1.8855,
"step": 45400
},
{
"epoch": 0.7286178679520233,
"grad_norm": 1.020241618156433,
"learning_rate": 1.8215665472306185e-05,
"loss": 1.8583,
"step": 45500
},
{
"epoch": 0.7302192259035662,
"grad_norm": 0.8296322822570801,
"learning_rate": 1.825569990191565e-05,
"loss": 1.8692,
"step": 45600
},
{
"epoch": 0.7318205838551092,
"grad_norm": 1.9434692859649658,
"learning_rate": 1.8295734331525112e-05,
"loss": 1.8831,
"step": 45700
},
{
"epoch": 0.733421941806652,
"grad_norm": 0.9088252782821655,
"learning_rate": 1.8335768761134578e-05,
"loss": 1.8868,
"step": 45800
},
{
"epoch": 0.7350232997581949,
"grad_norm": 1.7532590627670288,
"learning_rate": 1.837580319074404e-05,
"loss": 1.8734,
"step": 45900
},
{
"epoch": 0.7366246577097378,
"grad_norm": 0.9662244319915771,
"learning_rate": 1.8415837620353505e-05,
"loss": 1.888,
"step": 46000
},
{
"epoch": 0.7382260156612808,
"grad_norm": 3.4851512908935547,
"learning_rate": 1.845587204996297e-05,
"loss": 1.8829,
"step": 46100
},
{
"epoch": 0.7398273736128237,
"grad_norm": 0.9157941341400146,
"learning_rate": 1.8495906479572433e-05,
"loss": 1.9091,
"step": 46200
},
{
"epoch": 0.7414287315643666,
"grad_norm": 0.8992369771003723,
"learning_rate": 1.85359409091819e-05,
"loss": 1.8932,
"step": 46300
},
{
"epoch": 0.7430300895159095,
"grad_norm": 0.8611487150192261,
"learning_rate": 1.857597533879136e-05,
"loss": 1.8672,
"step": 46400
},
{
"epoch": 0.7446314474674524,
"grad_norm": 1.0629839897155762,
"learning_rate": 1.8616009768400826e-05,
"loss": 1.8819,
"step": 46500
},
{
"epoch": 0.7462328054189953,
"grad_norm": 0.8317407369613647,
"learning_rate": 1.865604419801029e-05,
"loss": 1.8684,
"step": 46600
},
{
"epoch": 0.7478341633705382,
"grad_norm": 0.8102233409881592,
"learning_rate": 1.8696078627619754e-05,
"loss": 1.8796,
"step": 46700
},
{
"epoch": 0.7494355213220811,
"grad_norm": 0.8077260255813599,
"learning_rate": 1.873611305722922e-05,
"loss": 1.8872,
"step": 46800
},
{
"epoch": 0.7510368792736241,
"grad_norm": 0.9285743236541748,
"learning_rate": 1.877614748683868e-05,
"loss": 1.8828,
"step": 46900
},
{
"epoch": 0.752638237225167,
"grad_norm": 0.835612416267395,
"learning_rate": 1.8816181916448147e-05,
"loss": 1.8685,
"step": 47000
},
{
"epoch": 0.7542395951767098,
"grad_norm": 1.5960347652435303,
"learning_rate": 1.8856216346057612e-05,
"loss": 1.8705,
"step": 47100
},
{
"epoch": 0.7558409531282527,
"grad_norm": 0.7755472660064697,
"learning_rate": 1.8896250775667075e-05,
"loss": 1.8851,
"step": 47200
},
{
"epoch": 0.7574423110797956,
"grad_norm": 1.0042415857315063,
"learning_rate": 1.893628520527654e-05,
"loss": 1.855,
"step": 47300
},
{
"epoch": 0.7590436690313386,
"grad_norm": 0.8991414904594421,
"learning_rate": 1.8976319634886006e-05,
"loss": 1.8569,
"step": 47400
},
{
"epoch": 0.7606450269828815,
"grad_norm": 0.813565194606781,
"learning_rate": 1.9016354064495468e-05,
"loss": 1.8578,
"step": 47500
},
{
"epoch": 0.7622463849344244,
"grad_norm": 0.7883344292640686,
"learning_rate": 1.9056388494104933e-05,
"loss": 1.8968,
"step": 47600
},
{
"epoch": 0.7638477428859674,
"grad_norm": 1.0632473230361938,
"learning_rate": 1.9096422923714395e-05,
"loss": 1.879,
"step": 47700
},
{
"epoch": 0.7654491008375102,
"grad_norm": 0.8479236364364624,
"learning_rate": 1.913645735332386e-05,
"loss": 1.8683,
"step": 47800
},
{
"epoch": 0.7670504587890531,
"grad_norm": 0.871159553527832,
"learning_rate": 1.9176491782933323e-05,
"loss": 1.8653,
"step": 47900
},
{
"epoch": 0.768651816740596,
"grad_norm": 0.8534667491912842,
"learning_rate": 1.921652621254279e-05,
"loss": 1.8507,
"step": 48000
},
{
"epoch": 0.7702531746921389,
"grad_norm": 0.8931534290313721,
"learning_rate": 1.9256560642152254e-05,
"loss": 1.8625,
"step": 48100
},
{
"epoch": 0.7718545326436819,
"grad_norm": 1.1518031358718872,
"learning_rate": 1.9296595071761716e-05,
"loss": 1.8786,
"step": 48200
},
{
"epoch": 0.7734558905952248,
"grad_norm": 0.9310818910598755,
"learning_rate": 1.933662950137118e-05,
"loss": 1.8517,
"step": 48300
},
{
"epoch": 0.7750572485467676,
"grad_norm": 1.314759612083435,
"learning_rate": 1.9376663930980647e-05,
"loss": 1.853,
"step": 48400
},
{
"epoch": 0.7766586064983105,
"grad_norm": 0.8431141972541809,
"learning_rate": 1.941669836059011e-05,
"loss": 1.863,
"step": 48500
},
{
"epoch": 0.7782599644498535,
"grad_norm": 0.90580815076828,
"learning_rate": 1.945673279019957e-05,
"loss": 1.8496,
"step": 48600
},
{
"epoch": 0.7798613224013964,
"grad_norm": 1.0436537265777588,
"learning_rate": 1.9496767219809037e-05,
"loss": 1.8629,
"step": 48700
},
{
"epoch": 0.7814626803529393,
"grad_norm": 0.8080843091011047,
"learning_rate": 1.9536801649418502e-05,
"loss": 1.857,
"step": 48800
},
{
"epoch": 0.7830640383044822,
"grad_norm": 0.8750945925712585,
"learning_rate": 1.9576836079027964e-05,
"loss": 1.8725,
"step": 48900
},
{
"epoch": 0.7846653962560252,
"grad_norm": 1.2619659900665283,
"learning_rate": 1.961687050863743e-05,
"loss": 1.8422,
"step": 49000
},
{
"epoch": 0.786266754207568,
"grad_norm": 0.84897780418396,
"learning_rate": 1.9656904938246895e-05,
"loss": 1.8725,
"step": 49100
},
{
"epoch": 0.7878681121591109,
"grad_norm": 0.7454677820205688,
"learning_rate": 1.9696939367856358e-05,
"loss": 1.8735,
"step": 49200
},
{
"epoch": 0.7894694701106538,
"grad_norm": 0.8530156016349792,
"learning_rate": 1.9736973797465823e-05,
"loss": 1.8597,
"step": 49300
},
{
"epoch": 0.7910708280621968,
"grad_norm": 0.9725930690765381,
"learning_rate": 1.977700822707529e-05,
"loss": 1.8515,
"step": 49400
},
{
"epoch": 0.7926721860137397,
"grad_norm": 0.8235682249069214,
"learning_rate": 1.981704265668475e-05,
"loss": 1.8791,
"step": 49500
},
{
"epoch": 0.7942735439652826,
"grad_norm": 0.9344043135643005,
"learning_rate": 1.9857077086294213e-05,
"loss": 1.8663,
"step": 49600
},
{
"epoch": 0.7958749019168254,
"grad_norm": 0.9629167318344116,
"learning_rate": 1.9897111515903678e-05,
"loss": 1.8647,
"step": 49700
},
{
"epoch": 0.7974762598683683,
"grad_norm": 0.7384589910507202,
"learning_rate": 1.9937145945513144e-05,
"loss": 1.8586,
"step": 49800
},
{
"epoch": 0.7990776178199113,
"grad_norm": 1.069229245185852,
"learning_rate": 1.9977180375122606e-05,
"loss": 1.8622,
"step": 49900
},
{
"epoch": 0.8006789757714542,
"grad_norm": 0.8724033236503601,
"learning_rate": 1.9999415105482566e-05,
"loss": 1.8602,
"step": 50000
},
{
"epoch": 0.8022803337229971,
"grad_norm": 0.8449952602386475,
"learning_rate": 1.9993531998299776e-05,
"loss": 1.8321,
"step": 50100
},
{
"epoch": 0.80388169167454,
"grad_norm": 0.8022226095199585,
"learning_rate": 1.9981326651105962e-05,
"loss": 1.8735,
"step": 50200
},
{
"epoch": 0.805483049626083,
"grad_norm": 0.8840874433517456,
"learning_rate": 1.9962806785408838e-05,
"loss": 1.829,
"step": 50300
},
{
"epoch": 0.8070844075776258,
"grad_norm": 1.0448670387268066,
"learning_rate": 1.993798411749008e-05,
"loss": 1.8595,
"step": 50400
},
{
"epoch": 0.8086857655291687,
"grad_norm": 0.904961109161377,
"learning_rate": 1.9906874350993245e-05,
"loss": 1.8586,
"step": 50500
},
{
"epoch": 0.8102871234807116,
"grad_norm": 2.3403968811035156,
"learning_rate": 1.98694971669891e-05,
"loss": 1.8492,
"step": 50600
},
{
"epoch": 0.8118884814322546,
"grad_norm": 1.0114359855651855,
"learning_rate": 1.9825876211524724e-05,
"loss": 1.8609,
"step": 50700
},
{
"epoch": 0.8134898393837975,
"grad_norm": 0.8057318329811096,
"learning_rate": 1.977603908066426e-05,
"loss": 1.829,
"step": 50800
},
{
"epoch": 0.8150911973353404,
"grad_norm": 0.8821057677268982,
"learning_rate": 1.9720017303030703e-05,
"loss": 1.862,
"step": 50900
},
{
"epoch": 0.8166925552868832,
"grad_norm": 2.767875909805298,
"learning_rate": 1.9657846319859854e-05,
"loss": 1.8678,
"step": 51000
},
{
"epoch": 0.8182939132384262,
"grad_norm": 0.7666317820549011,
"learning_rate": 1.9589565462579015e-05,
"loss": 1.8621,
"step": 51100
},
{
"epoch": 0.8198952711899691,
"grad_norm": 1.021729588508606,
"learning_rate": 1.9515217927924633e-05,
"loss": 1.8352,
"step": 51200
},
{
"epoch": 0.821496629141512,
"grad_norm": 0.8282054662704468,
"learning_rate": 1.943485075061461e-05,
"loss": 1.8583,
"step": 51300
},
{
"epoch": 0.8230979870930549,
"grad_norm": 3.0896732807159424,
"learning_rate": 1.934851477359256e-05,
"loss": 1.8394,
"step": 51400
},
{
"epoch": 0.8246993450445979,
"grad_norm": 0.9474732279777527,
"learning_rate": 1.9256264615862893e-05,
"loss": 1.8682,
"step": 51500
},
{
"epoch": 0.8263007029961408,
"grad_norm": 0.7810207009315491,
"learning_rate": 1.9158158637937027e-05,
"loss": 1.8337,
"step": 51600
},
{
"epoch": 0.8279020609476836,
"grad_norm": 0.8208989500999451,
"learning_rate": 1.9054258904912575e-05,
"loss": 1.8367,
"step": 51700
},
{
"epoch": 0.8295034188992265,
"grad_norm": 0.8764814138412476,
"learning_rate": 1.89446311472089e-05,
"loss": 1.8403,
"step": 51800
},
{
"epoch": 0.8311047768507694,
"grad_norm": 1.1485708951950073,
"learning_rate": 1.8829344718983903e-05,
"loss": 1.8576,
"step": 51900
},
{
"epoch": 0.8327061348023124,
"grad_norm": 1.06003737449646,
"learning_rate": 1.8708472554258237e-05,
"loss": 1.872,
"step": 52000
},
{
"epoch": 0.8343074927538553,
"grad_norm": 0.8322979807853699,
"learning_rate": 1.8582091120774855e-05,
"loss": 1.859,
"step": 52100
},
{
"epoch": 0.8359088507053982,
"grad_norm": 0.7536402940750122,
"learning_rate": 1.845028037162298e-05,
"loss": 1.8401,
"step": 52200
},
{
"epoch": 0.837510208656941,
"grad_norm": 1.4201630353927612,
"learning_rate": 1.83131236946571e-05,
"loss": 1.8723,
"step": 52300
},
{
"epoch": 0.839111566608484,
"grad_norm": 0.7676379680633545,
"learning_rate": 1.8170707859743067e-05,
"loss": 1.8572,
"step": 52400
},
{
"epoch": 0.8407129245600269,
"grad_norm": 0.8063752055168152,
"learning_rate": 1.8023122963864602e-05,
"loss": 1.8469,
"step": 52500
},
{
"epoch": 0.8423142825115698,
"grad_norm": 0.8385179042816162,
"learning_rate": 1.787046237412493e-05,
"loss": 1.8564,
"step": 52600
},
{
"epoch": 0.8439156404631127,
"grad_norm": 0.8969714641571045,
"learning_rate": 1.7712822668679682e-05,
"loss": 1.8556,
"step": 52700
},
{
"epoch": 0.8455169984146557,
"grad_norm": 1.184692621231079,
"learning_rate": 1.7550303575638318e-05,
"loss": 1.8423,
"step": 52800
},
{
"epoch": 0.8471183563661986,
"grad_norm": 0.8388579487800598,
"learning_rate": 1.7383007909972844e-05,
"loss": 1.8157,
"step": 52900
},
{
"epoch": 0.8487197143177414,
"grad_norm": 0.7864462733268738,
"learning_rate": 1.721104150847362e-05,
"loss": 1.8526,
"step": 53000
},
{
"epoch": 0.8503210722692843,
"grad_norm": 0.86407071352005,
"learning_rate": 1.703451316279353e-05,
"loss": 1.8428,
"step": 53100
},
{
"epoch": 0.8519224302208273,
"grad_norm": 0.8313634395599365,
"learning_rate": 1.6853534550622722e-05,
"loss": 1.8479,
"step": 53200
},
{
"epoch": 0.8535237881723702,
"grad_norm": 1.4253445863723755,
"learning_rate": 1.666822016503765e-05,
"loss": 1.8275,
"step": 53300
},
{
"epoch": 0.8551251461239131,
"grad_norm": 5.398781776428223,
"learning_rate": 1.6478687242068904e-05,
"loss": 1.854,
"step": 53400
},
{
"epoch": 0.856726504075456,
"grad_norm": 1.7977509498596191,
"learning_rate": 1.628505568653385e-05,
"loss": 1.8339,
"step": 53500
},
{
"epoch": 0.8583278620269988,
"grad_norm": 0.8206777572631836,
"learning_rate": 1.6087447996180826e-05,
"loss": 1.8511,
"step": 53600
},
{
"epoch": 0.8599292199785418,
"grad_norm": 0.8535060286521912,
"learning_rate": 1.5885989184193027e-05,
"loss": 1.8586,
"step": 53700
},
{
"epoch": 0.8615305779300847,
"grad_norm": 1.6550579071044922,
"learning_rate": 1.5680806700101e-05,
"loss": 1.8482,
"step": 53800
},
{
"epoch": 0.8631319358816276,
"grad_norm": 0.8122648000717163,
"learning_rate": 1.5472030349153854e-05,
"loss": 1.8335,
"step": 53900
},
{
"epoch": 0.8647332938331705,
"grad_norm": 0.7805556058883667,
"learning_rate": 1.525979221020014e-05,
"loss": 1.8252,
"step": 54000
},
{
"epoch": 0.8663346517847135,
"grad_norm": 0.8546029329299927,
"learning_rate": 1.5044226552130399e-05,
"loss": 1.8353,
"step": 54100
},
{
"epoch": 0.8679360097362564,
"grad_norm": 0.7961782217025757,
"learning_rate": 1.4825469748934192e-05,
"loss": 1.8348,
"step": 54200
},
{
"epoch": 0.8695373676877992,
"grad_norm": 0.9392079710960388,
"learning_rate": 1.4603660193425402e-05,
"loss": 1.8205,
"step": 54300
},
{
"epoch": 0.8711387256393421,
"grad_norm": 0.7852017283439636,
"learning_rate": 1.4378938209690334e-05,
"loss": 1.8327,
"step": 54400
},
{
"epoch": 0.8727400835908851,
"grad_norm": 0.8385934829711914,
"learning_rate": 1.4151445964314057e-05,
"loss": 1.8383,
"step": 54500
},
{
"epoch": 0.874341441542428,
"grad_norm": 0.7498407363891602,
"learning_rate": 1.3921327376441087e-05,
"loss": 1.8121,
"step": 54600
},
{
"epoch": 0.8759427994939709,
"grad_norm": 0.8227770924568176,
"learning_rate": 1.3688728026727369e-05,
"loss": 1.8395,
"step": 54700
},
{
"epoch": 0.8775441574455138,
"grad_norm": 0.911970317363739,
"learning_rate": 1.3453795065241128e-05,
"loss": 1.8262,
"step": 54800
},
{
"epoch": 0.8791455153970567,
"grad_norm": 0.8143411874771118,
"learning_rate": 1.3216677118370834e-05,
"loss": 1.8571,
"step": 54900
},
{
"epoch": 0.8807468733485996,
"grad_norm": 0.8301388621330261,
"learning_rate": 1.2977524194799229e-05,
"loss": 1.8435,
"step": 55000
},
{
"epoch": 0.8823482313001425,
"grad_norm": 1.3477791547775269,
"learning_rate": 1.2736487590602864e-05,
"loss": 1.8372,
"step": 55100
},
{
"epoch": 0.8839495892516854,
"grad_norm": 0.8804235458374023,
"learning_rate": 1.2493719793537157e-05,
"loss": 1.841,
"step": 55200
},
{
"epoch": 0.8855509472032284,
"grad_norm": 0.7941620349884033,
"learning_rate": 1.2249374386567598e-05,
"loss": 1.8271,
"step": 55300
},
{
"epoch": 0.8871523051547713,
"grad_norm": 0.8681734800338745,
"learning_rate": 1.2003605950708059e-05,
"loss": 1.8459,
"step": 55400
},
{
"epoch": 0.8887536631063142,
"grad_norm": 0.7299553155899048,
"learning_rate": 1.1756569967227716e-05,
"loss": 1.8684,
"step": 55500
},
{
"epoch": 0.890355021057857,
"grad_norm": 0.7805650234222412,
"learning_rate": 1.1508422719288434e-05,
"loss": 1.8113,
"step": 55600
},
{
"epoch": 0.8919563790094,
"grad_norm": 0.7692527770996094,
"learning_rate": 1.125932119307486e-05,
"loss": 1.8252,
"step": 55700
},
{
"epoch": 0.8935577369609429,
"grad_norm": 0.8291378021240234,
"learning_rate": 1.1009422978479742e-05,
"loss": 1.7992,
"step": 55800
},
{
"epoch": 0.8951590949124858,
"grad_norm": 0.8779826164245605,
"learning_rate": 1.0758886169407351e-05,
"loss": 1.8336,
"step": 55900
},
{
"epoch": 0.8967604528640287,
"grad_norm": 0.7980159521102905,
"learning_rate": 1.050786926375801e-05,
"loss": 1.8212,
"step": 56000
},
{
"epoch": 0.8983618108155716,
"grad_norm": 3.2298014163970947,
"learning_rate": 1.025653106315707e-05,
"loss": 1.8188,
"step": 56100
},
{
"epoch": 0.8999631687671145,
"grad_norm": 0.8914725184440613,
"learning_rate": 1.0005030572491733e-05,
"loss": 1.8387,
"step": 56200
},
{
"epoch": 0.9015645267186574,
"grad_norm": 0.8599027395248413,
"learning_rate": 9.753526899319275e-06,
"loss": 1.8327,
"step": 56300
},
{
"epoch": 0.9031658846702003,
"grad_norm": 0.9533581733703613,
"learning_rate": 9.50217915321035e-06,
"loss": 1.822,
"step": 56400
},
{
"epoch": 0.9047672426217432,
"grad_norm": 0.8099405169487,
"learning_rate": 9.251146345090958e-06,
"loss": 1.8462,
"step": 56500
},
{
"epoch": 0.9063686005732862,
"grad_norm": 0.8883758783340454,
"learning_rate": 9.000587286646886e-06,
"loss": 1.8184,
"step": 56600
},
{
"epoch": 0.9079699585248291,
"grad_norm": 1.6830765008926392,
"learning_rate": 8.750660489854142e-06,
"loss": 1.82,
"step": 56700
},
{
"epoch": 0.909571316476372,
"grad_norm": 1.2402883768081665,
"learning_rate": 8.501524066699047e-06,
"loss": 1.816,
"step": 56800
},
{
"epoch": 0.9111726744279148,
"grad_norm": 0.8525800108909607,
"learning_rate": 8.253335629151306e-06,
"loss": 1.8248,
"step": 56900
},
{
"epoch": 0.9127740323794578,
"grad_norm": 0.8562950491905212,
"learning_rate": 8.006252189453485e-06,
"loss": 1.8284,
"step": 57000
},
{
"epoch": 0.9143753903310007,
"grad_norm": 0.7687914371490479,
"learning_rate": 7.760430060789828e-06,
"loss": 1.8198,
"step": 57100
},
{
"epoch": 0.9159767482825436,
"grad_norm": 0.9463182091712952,
"learning_rate": 7.51602475839736e-06,
"loss": 1.8266,
"step": 57200
},
{
"epoch": 0.9175781062340865,
"grad_norm": 1.0767518281936646,
"learning_rate": 7.273190901181783e-06,
"loss": 1.8054,
"step": 57300
},
{
"epoch": 0.9191794641856295,
"grad_norm": 0.8242263197898865,
"learning_rate": 7.032082113900434e-06,
"loss": 1.8337,
"step": 57400
},
{
"epoch": 0.9207808221371723,
"grad_norm": 0.7926039695739746,
"learning_rate": 6.792850929974142e-06,
"loss": 1.8144,
"step": 57500
},
{
"epoch": 0.9223821800887152,
"grad_norm": 0.7732511162757874,
"learning_rate": 6.55564869498956e-06,
"loss": 1.804,
"step": 57600
},
{
"epoch": 0.9239835380402581,
"grad_norm": 0.7959622144699097,
"learning_rate": 6.32062547095288e-06,
"loss": 1.8222,
"step": 57700
},
{
"epoch": 0.925584895991801,
"grad_norm": 0.8663679957389832,
"learning_rate": 6.087929941355671e-06,
"loss": 1.8496,
"step": 57800
},
{
"epoch": 0.927186253943344,
"grad_norm": 0.7793252468109131,
"learning_rate": 5.857709317112736e-06,
"loss": 1.8177,
"step": 57900
},
{
"epoch": 0.9287876118948869,
"grad_norm": 0.9085448980331421,
"learning_rate": 5.630109243431608e-06,
"loss": 1.8193,
"step": 58000
},
{
"epoch": 0.9303889698464298,
"grad_norm": 0.7569569945335388,
"learning_rate": 5.4052737076725824e-06,
"loss": 1.8196,
"step": 58100
},
{
"epoch": 0.9319903277979726,
"grad_norm": 0.8424269556999207,
"learning_rate": 5.1833449482574895e-06,
"loss": 1.835,
"step": 58200
},
{
"epoch": 0.9335916857495156,
"grad_norm": 0.8512621521949768,
"learning_rate": 4.964463364685001e-06,
"loss": 1.8145,
"step": 58300
},
{
"epoch": 0.9351930437010585,
"grad_norm": 1.0519986152648926,
"learning_rate": 4.748767428709187e-06,
"loss": 1.8213,
"step": 58400
},
{
"epoch": 0.9367944016526014,
"grad_norm": 0.7896735072135925,
"learning_rate": 4.536393596737752e-06,
"loss": 1.8243,
"step": 58500
},
{
"epoch": 0.9383957596041443,
"grad_norm": 1.0739407539367676,
"learning_rate": 4.327476223505136e-06,
"loss": 1.832,
"step": 58600
},
{
"epoch": 0.9399971175556873,
"grad_norm": 0.8374795913696289,
"learning_rate": 4.12214747707527e-06,
"loss": 1.8338,
"step": 58700
},
{
"epoch": 0.9415984755072301,
"grad_norm": 1.0221420526504517,
"learning_rate": 3.920537255227669e-06,
"loss": 1.8101,
"step": 58800
},
{
"epoch": 0.943199833458773,
"grad_norm": 0.8421764969825745,
"learning_rate": 3.7227731032797853e-06,
"loss": 1.8329,
"step": 58900
},
{
"epoch": 0.9448011914103159,
"grad_norm": 0.7701355814933777,
"learning_rate": 3.5289801333976102e-06,
"loss": 1.8216,
"step": 59000
},
{
"epoch": 0.9464025493618589,
"grad_norm": 0.7741368412971497,
"learning_rate": 3.339280945445559e-06,
"loss": 1.8272,
"step": 59100
},
{
"epoch": 0.9480039073134018,
"grad_norm": 1.7360873222351074,
"learning_rate": 3.1537955494257345e-06,
"loss": 1.8372,
"step": 59200
},
{
"epoch": 0.9496052652649447,
"grad_norm": 0.7760699987411499,
"learning_rate": 2.972641289555616e-06,
"loss": 1.8182,
"step": 59300
},
{
"epoch": 0.9512066232164876,
"grad_norm": 0.7646809220314026,
"learning_rate": 2.7959327700322036e-06,
"loss": 1.8084,
"step": 59400
},
{
"epoch": 0.9528079811680304,
"grad_norm": 0.9442381858825684,
"learning_rate": 2.623781782529625e-06,
"loss": 1.8239,
"step": 59500
},
{
"epoch": 0.9544093391195734,
"grad_norm": 0.8009527325630188,
"learning_rate": 2.4562972354759698e-06,
"loss": 1.8272,
"step": 59600
},
{
"epoch": 0.9560106970711163,
"grad_norm": 0.7591850757598877,
"learning_rate": 2.293585085154252e-06,
"loss": 1.8314,
"step": 59700
},
{
"epoch": 0.9576120550226592,
"grad_norm": 0.7954255938529968,
"learning_rate": 2.135748268670902e-06,
"loss": 1.8341,
"step": 59800
},
{
"epoch": 0.9592134129742022,
"grad_norm": 1.0002678632736206,
"learning_rate": 1.9828866388343814e-06,
"loss": 1.8075,
"step": 59900
},
{
"epoch": 0.9608147709257451,
"grad_norm": 0.7856830954551697,
"learning_rate": 1.8350969009849483e-06,
"loss": 1.8005,
"step": 60000
},
{
"epoch": 0.9624161288772879,
"grad_norm": 0.9126999378204346,
"learning_rate": 1.6924725518156637e-06,
"loss": 1.8277,
"step": 60100
},
{
"epoch": 0.9640174868288308,
"grad_norm": 0.8106286525726318,
"learning_rate": 1.5551038202232805e-06,
"loss": 1.8108,
"step": 60200
},
{
"epoch": 0.9656188447803737,
"grad_norm": 1.359531044960022,
"learning_rate": 1.4230776102264454e-06,
"loss": 1.8475,
"step": 60300
},
{
"epoch": 0.9672202027319167,
"grad_norm": 0.7704586386680603,
"learning_rate": 1.2964774459873364e-06,
"loss": 1.8482,
"step": 60400
},
{
"epoch": 0.9688215606834596,
"grad_norm": 0.7488996982574463,
"learning_rate": 1.1753834189715019e-06,
"loss": 1.8115,
"step": 60500
},
{
"epoch": 0.9704229186350025,
"grad_norm": 1.662976861000061,
"learning_rate": 1.059872137279342e-06,
"loss": 1.8391,
"step": 60600
},
{
"epoch": 0.9720242765865454,
"grad_norm": 1.0111815929412842,
"learning_rate": 9.500166771812902e-07,
"loss": 1.8161,
"step": 60700
},
{
"epoch": 0.9736256345380883,
"grad_norm": 0.7973281145095825,
"learning_rate": 8.458865368873204e-07,
"loss": 1.8219,
"step": 60800
},
{
"epoch": 0.9752269924896312,
"grad_norm": 0.8591629266738892,
"learning_rate": 7.475475925800968e-07,
"loss": 1.8399,
"step": 60900
},
{
"epoch": 0.9768283504411741,
"grad_norm": 0.9209094047546387,
"learning_rate": 6.550620567394883e-07,
"loss": 1.8319,
"step": 61000
},
{
"epoch": 0.978429708392717,
"grad_norm": 0.916976273059845,
"learning_rate": 5.684884387849176e-07,
"loss": 1.8189,
"step": 61100
},
{
"epoch": 0.98003106634426,
"grad_norm": 0.950470507144928,
"learning_rate": 4.878815080603372e-07,
"loss": 1.8052,
"step": 61200
},
{
"epoch": 0.9816324242958029,
"grad_norm": 0.7501734495162964,
"learning_rate": 4.1329225918533277e-07,
"loss": 1.8419,
"step": 61300
},
{
"epoch": 0.9832337822473457,
"grad_norm": 0.8855769038200378,
"learning_rate": 3.447678797942389e-07,
"loss": 1.8168,
"step": 61400
},
{
"epoch": 0.9848351401988886,
"grad_norm": 0.9513728618621826,
"learning_rate": 2.823517206836701e-07,
"loss": 1.8219,
"step": 61500
},
{
"epoch": 0.9864364981504316,
"grad_norm": 0.9888412952423096,
"learning_rate": 2.2608326838736817e-07,
"loss": 1.8183,
"step": 61600
},
{
"epoch": 0.9880378561019745,
"grad_norm": 0.8009938597679138,
"learning_rate": 1.7599812019571395e-07,
"loss": 1.8027,
"step": 61700
},
{
"epoch": 0.9896392140535174,
"grad_norm": 0.9275427460670471,
"learning_rate": 1.321279616356963e-07,
"loss": 1.8145,
"step": 61800
},
{
"epoch": 0.9912405720050603,
"grad_norm": 0.7663293480873108,
"learning_rate": 9.450054642560102e-08,
"loss": 1.8332,
"step": 61900
},
{
"epoch": 0.9928419299566033,
"grad_norm": 0.7306997776031494,
"learning_rate": 6.313967891707906e-08,
"loss": 1.8059,
"step": 62000
},
{
"epoch": 0.9944432879081461,
"grad_norm": 0.8004014492034912,
"learning_rate": 3.806519903573502e-08,
"loss": 1.8347,
"step": 62100
},
{
"epoch": 0.996044645859689,
"grad_norm": 0.7328791618347168,
"learning_rate": 1.9292969729719502e-08,
"loss": 1.8156,
"step": 62200
},
{
"epoch": 0.9976460038112319,
"grad_norm": 0.8255366086959839,
"learning_rate": 6.834866934314344e-09,
"loss": 1.8029,
"step": 62300
},
{
"epoch": 0.9992473617627748,
"grad_norm": 0.8802406787872314,
"learning_rate": 6.987720588080837e-10,
"loss": 1.8173,
"step": 62400
},
{
"epoch": 1.0,
"step": 62447,
"total_flos": 7.631778497299481e+18,
"train_loss": 2.1485319636500972,
"train_runtime": 14119.5859,
"train_samples_per_second": 35.382,
"train_steps_per_second": 4.423
}
],
"logging_steps": 100,
"max_steps": 62447,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.631778497299481e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}