{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 95491,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001047219109654313,
"grad_norm": 6.445349216461182,
"learning_rate": 5.238344683080147e-08,
"loss": 1.2293,
"step": 100
},
{
"epoch": 0.002094438219308626,
"grad_norm": 7.579877853393555,
"learning_rate": 1.0476689366160294e-07,
"loss": 1.2053,
"step": 200
},
{
"epoch": 0.003141657328962939,
"grad_norm": 5.277140140533447,
"learning_rate": 1.5715034049240438e-07,
"loss": 1.1086,
"step": 300
},
{
"epoch": 0.004188876438617252,
"grad_norm": 3.0632076263427734,
"learning_rate": 2.0953378732320588e-07,
"loss": 1.0615,
"step": 400
},
{
"epoch": 0.005236095548271565,
"grad_norm": 8.091245651245117,
"learning_rate": 2.6191723415400735e-07,
"loss": 0.9659,
"step": 500
},
{
"epoch": 0.006283314657925878,
"grad_norm": 2.5814743041992188,
"learning_rate": 3.1430068098480877e-07,
"loss": 0.9656,
"step": 600
},
{
"epoch": 0.007330533767580191,
"grad_norm": 8.025248527526855,
"learning_rate": 3.6668412781561024e-07,
"loss": 0.9068,
"step": 700
},
{
"epoch": 0.008377752877234504,
"grad_norm": 2.8023085594177246,
"learning_rate": 4.1906757464641176e-07,
"loss": 0.8278,
"step": 800
},
{
"epoch": 0.009424971986888816,
"grad_norm": 2.9815306663513184,
"learning_rate": 4.714510214772132e-07,
"loss": 0.8097,
"step": 900
},
{
"epoch": 0.01047219109654313,
"grad_norm": 4.450624465942383,
"learning_rate": 5.238344683080147e-07,
"loss": 0.8611,
"step": 1000
},
{
"epoch": 0.011519410206197442,
"grad_norm": 2.9705615043640137,
"learning_rate": 5.762179151388162e-07,
"loss": 0.8217,
"step": 1100
},
{
"epoch": 0.012566629315851756,
"grad_norm": 5.060612678527832,
"learning_rate": 6.286013619696175e-07,
"loss": 0.8326,
"step": 1200
},
{
"epoch": 0.01361384842550607,
"grad_norm": 4.002683639526367,
"learning_rate": 6.809848088004191e-07,
"loss": 0.7742,
"step": 1300
},
{
"epoch": 0.014661067535160381,
"grad_norm": 3.3899588584899902,
"learning_rate": 7.333682556312205e-07,
"loss": 0.7594,
"step": 1400
},
{
"epoch": 0.015708286644814693,
"grad_norm": 4.091441631317139,
"learning_rate": 7.857517024620219e-07,
"loss": 0.7871,
"step": 1500
},
{
"epoch": 0.01675550575446901,
"grad_norm": 3.302689790725708,
"learning_rate": 8.381351492928235e-07,
"loss": 0.721,
"step": 1600
},
{
"epoch": 0.01780272486412332,
"grad_norm": 3.8457956314086914,
"learning_rate": 8.905185961236249e-07,
"loss": 0.6789,
"step": 1700
},
{
"epoch": 0.018849943973777632,
"grad_norm": 3.763422727584839,
"learning_rate": 9.429020429544264e-07,
"loss": 0.7035,
"step": 1800
},
{
"epoch": 0.019897163083431948,
"grad_norm": 2.3855648040771484,
"learning_rate": 9.95285489785228e-07,
"loss": 0.7331,
"step": 1900
},
{
"epoch": 0.02094438219308626,
"grad_norm": 3.0932857990264893,
"learning_rate": 9.999976668774249e-07,
"loss": 0.7123,
"step": 2000
},
{
"epoch": 0.02199160130274057,
"grad_norm": 2.939152956008911,
"learning_rate": 9.999897217221058e-07,
"loss": 0.6106,
"step": 2100
},
{
"epoch": 0.023038820412394884,
"grad_norm": 2.148160934448242,
"learning_rate": 9.999761418022958e-07,
"loss": 0.6828,
"step": 2200
},
{
"epoch": 0.0240860395220492,
"grad_norm": 2.302873134613037,
"learning_rate": 9.999569272710377e-07,
"loss": 0.6691,
"step": 2300
},
{
"epoch": 0.02513325863170351,
"grad_norm": 4.346377372741699,
"learning_rate": 9.999320783448744e-07,
"loss": 0.6698,
"step": 2400
},
{
"epoch": 0.026180477741357823,
"grad_norm": 2.157055616378784,
"learning_rate": 9.999015953038474e-07,
"loss": 0.6019,
"step": 2500
},
{
"epoch": 0.02722769685101214,
"grad_norm": 2.7303714752197266,
"learning_rate": 9.998654784914935e-07,
"loss": 0.5972,
"step": 2600
},
{
"epoch": 0.02827491596066645,
"grad_norm": 4.359681606292725,
"learning_rate": 9.9982372831484e-07,
"loss": 0.6381,
"step": 2700
},
{
"epoch": 0.029322135070320762,
"grad_norm": 3.2993288040161133,
"learning_rate": 9.997763452444018e-07,
"loss": 0.6093,
"step": 2800
},
{
"epoch": 0.030369354179975078,
"grad_norm": 3.061521053314209,
"learning_rate": 9.99723329814175e-07,
"loss": 0.6875,
"step": 2900
},
{
"epoch": 0.031416573289629386,
"grad_norm": 2.3765642642974854,
"learning_rate": 9.996646826216302e-07,
"loss": 0.6031,
"step": 3000
},
{
"epoch": 0.0324637923992837,
"grad_norm": 2.144615411758423,
"learning_rate": 9.996004043277078e-07,
"loss": 0.637,
"step": 3100
},
{
"epoch": 0.03351101150893802,
"grad_norm": 3.2836430072784424,
"learning_rate": 9.995304956568083e-07,
"loss": 0.6425,
"step": 3200
},
{
"epoch": 0.034558230618592325,
"grad_norm": 2.8710663318634033,
"learning_rate": 9.99454957396786e-07,
"loss": 0.6199,
"step": 3300
},
{
"epoch": 0.03560544972824664,
"grad_norm": 2.5998404026031494,
"learning_rate": 9.993737903989387e-07,
"loss": 0.5903,
"step": 3400
},
{
"epoch": 0.036652668837900956,
"grad_norm": 2.677945613861084,
"learning_rate": 9.992869955779995e-07,
"loss": 0.6473,
"step": 3500
},
{
"epoch": 0.037699887947555265,
"grad_norm": 3.9936769008636475,
"learning_rate": 9.991945739121251e-07,
"loss": 0.5847,
"step": 3600
},
{
"epoch": 0.03874710705720958,
"grad_norm": 2.839268207550049,
"learning_rate": 9.990965264428851e-07,
"loss": 0.5893,
"step": 3700
},
{
"epoch": 0.039794326166863896,
"grad_norm": 2.4763646125793457,
"learning_rate": 9.989928542752516e-07,
"loss": 0.5865,
"step": 3800
},
{
"epoch": 0.040841545276518204,
"grad_norm": 4.822995662689209,
"learning_rate": 9.98883558577585e-07,
"loss": 0.579,
"step": 3900
},
{
"epoch": 0.04188876438617252,
"grad_norm": 2.6188089847564697,
"learning_rate": 9.987686405816216e-07,
"loss": 0.6065,
"step": 4000
},
{
"epoch": 0.042935983495826835,
"grad_norm": 2.550874710083008,
"learning_rate": 9.986481015824592e-07,
"loss": 0.5911,
"step": 4100
},
{
"epoch": 0.04398320260548114,
"grad_norm": 2.973268985748291,
"learning_rate": 9.985219429385443e-07,
"loss": 0.6216,
"step": 4200
},
{
"epoch": 0.04503042171513546,
"grad_norm": 6.536316394805908,
"learning_rate": 9.98390166071654e-07,
"loss": 0.5904,
"step": 4300
},
{
"epoch": 0.04607764082478977,
"grad_norm": 2.6079025268554688,
"learning_rate": 9.982527724668825e-07,
"loss": 0.5942,
"step": 4400
},
{
"epoch": 0.04712485993444408,
"grad_norm": 2.2787749767303467,
"learning_rate": 9.981097636726227e-07,
"loss": 0.6174,
"step": 4500
},
{
"epoch": 0.0481720790440984,
"grad_norm": 1.995902419090271,
"learning_rate": 9.979611413005493e-07,
"loss": 0.5698,
"step": 4600
},
{
"epoch": 0.04921929815375271,
"grad_norm": 3.4670004844665527,
"learning_rate": 9.97806907025601e-07,
"loss": 0.5871,
"step": 4700
},
{
"epoch": 0.05026651726340702,
"grad_norm": 2.329735279083252,
"learning_rate": 9.97647062585961e-07,
"loss": 0.6061,
"step": 4800
},
{
"epoch": 0.05131373637306134,
"grad_norm": 2.4299092292785645,
"learning_rate": 9.97481609783038e-07,
"loss": 0.5944,
"step": 4900
},
{
"epoch": 0.052360955482715646,
"grad_norm": 4.186954498291016,
"learning_rate": 9.973105504814458e-07,
"loss": 0.6131,
"step": 5000
},
{
"epoch": 0.05340817459236996,
"grad_norm": 2.038557767868042,
"learning_rate": 9.971338866089812e-07,
"loss": 0.5668,
"step": 5100
},
{
"epoch": 0.05445539370202428,
"grad_norm": 2.6505930423736572,
"learning_rate": 9.96951620156604e-07,
"loss": 0.5697,
"step": 5200
},
{
"epoch": 0.055502612811678585,
"grad_norm": 3.494474411010742,
"learning_rate": 9.967637531784138e-07,
"loss": 0.6061,
"step": 5300
},
{
"epoch": 0.0565498319213329,
"grad_norm": 1.573089599609375,
"learning_rate": 9.965702877916262e-07,
"loss": 0.5714,
"step": 5400
},
{
"epoch": 0.057597051030987216,
"grad_norm": 3.103743553161621,
"learning_rate": 9.963712261765495e-07,
"loss": 0.6045,
"step": 5500
},
{
"epoch": 0.058644270140641525,
"grad_norm": 2.182767152786255,
"learning_rate": 9.96166570576561e-07,
"loss": 0.6209,
"step": 5600
},
{
"epoch": 0.05969148925029584,
"grad_norm": 2.818512439727783,
"learning_rate": 9.959563232980801e-07,
"loss": 0.5825,
"step": 5700
},
{
"epoch": 0.060738708359950155,
"grad_norm": 6.24643611907959,
"learning_rate": 9.957404867105435e-07,
"loss": 0.5645,
"step": 5800
},
{
"epoch": 0.061785927469604464,
"grad_norm": 2.866800308227539,
"learning_rate": 9.955190632463774e-07,
"loss": 0.5826,
"step": 5900
},
{
"epoch": 0.06283314657925877,
"grad_norm": 1.9323431253433228,
"learning_rate": 9.952920554009715e-07,
"loss": 0.5706,
"step": 6000
},
{
"epoch": 0.06388036568891309,
"grad_norm": 2.389801263809204,
"learning_rate": 9.9505946573265e-07,
"loss": 0.5888,
"step": 6100
},
{
"epoch": 0.0649275847985674,
"grad_norm": 2.6937005519866943,
"learning_rate": 9.948212968626429e-07,
"loss": 0.5848,
"step": 6200
},
{
"epoch": 0.06597480390822172,
"grad_norm": 3.2649362087249756,
"learning_rate": 9.945775514750558e-07,
"loss": 0.5746,
"step": 6300
},
{
"epoch": 0.06702202301787603,
"grad_norm": 3.9703376293182373,
"learning_rate": 9.943282323168416e-07,
"loss": 0.5219,
"step": 6400
},
{
"epoch": 0.06806924212753035,
"grad_norm": 3.0078823566436768,
"learning_rate": 9.94073342197767e-07,
"loss": 0.5867,
"step": 6500
},
{
"epoch": 0.06911646123718465,
"grad_norm": 2.0793182849884033,
"learning_rate": 9.938128839903829e-07,
"loss": 0.5757,
"step": 6600
},
{
"epoch": 0.07016368034683897,
"grad_norm": 1.7143627405166626,
"learning_rate": 9.935468606299908e-07,
"loss": 0.5753,
"step": 6700
},
{
"epoch": 0.07121089945649328,
"grad_norm": 1.6375339031219482,
"learning_rate": 9.932752751146102e-07,
"loss": 0.5875,
"step": 6800
},
{
"epoch": 0.0722581185661476,
"grad_norm": 3.0804569721221924,
"learning_rate": 9.929981305049452e-07,
"loss": 0.5399,
"step": 6900
},
{
"epoch": 0.07330533767580191,
"grad_norm": 1.8709744215011597,
"learning_rate": 9.92715429924349e-07,
"loss": 0.5555,
"step": 7000
},
{
"epoch": 0.07435255678545621,
"grad_norm": 2.213629722595215,
"learning_rate": 9.924271765587897e-07,
"loss": 0.5536,
"step": 7100
},
{
"epoch": 0.07539977589511053,
"grad_norm": 1.5812900066375732,
"learning_rate": 9.921333736568133e-07,
"loss": 0.5973,
"step": 7200
},
{
"epoch": 0.07644699500476484,
"grad_norm": 1.2580069303512573,
"learning_rate": 9.918340245295086e-07,
"loss": 0.549,
"step": 7300
},
{
"epoch": 0.07749421411441916,
"grad_norm": 3.4917242527008057,
"learning_rate": 9.915291325504685e-07,
"loss": 0.5493,
"step": 7400
},
{
"epoch": 0.07854143322407348,
"grad_norm": 3.6106157302856445,
"learning_rate": 9.912187011557523e-07,
"loss": 0.5367,
"step": 7500
},
{
"epoch": 0.07958865233372779,
"grad_norm": 2.585413694381714,
"learning_rate": 9.90902733843848e-07,
"loss": 0.5242,
"step": 7600
},
{
"epoch": 0.08063587144338209,
"grad_norm": 2.1417288780212402,
"learning_rate": 9.905812341756314e-07,
"loss": 0.5657,
"step": 7700
},
{
"epoch": 0.08168309055303641,
"grad_norm": 2.6701626777648926,
"learning_rate": 9.902542057743267e-07,
"loss": 0.533,
"step": 7800
},
{
"epoch": 0.08273030966269072,
"grad_norm": 2.7961204051971436,
"learning_rate": 9.899216523254657e-07,
"loss": 0.5833,
"step": 7900
},
{
"epoch": 0.08377752877234504,
"grad_norm": 3.9673585891723633,
"learning_rate": 9.895835775768464e-07,
"loss": 0.5548,
"step": 8000
},
{
"epoch": 0.08482474788199935,
"grad_norm": 2.384716272354126,
"learning_rate": 9.892399853384903e-07,
"loss": 0.5802,
"step": 8100
},
{
"epoch": 0.08587196699165367,
"grad_norm": 2.7740979194641113,
"learning_rate": 9.888908794825994e-07,
"loss": 0.5565,
"step": 8200
},
{
"epoch": 0.08691918610130797,
"grad_norm": 2.4571990966796875,
"learning_rate": 9.885362639435133e-07,
"loss": 0.5538,
"step": 8300
},
{
"epoch": 0.08796640521096229,
"grad_norm": 2.063465118408203,
"learning_rate": 9.88176142717664e-07,
"loss": 0.603,
"step": 8400
},
{
"epoch": 0.0890136243206166,
"grad_norm": 1.9801498651504517,
"learning_rate": 9.878105198635321e-07,
"loss": 0.5479,
"step": 8500
},
{
"epoch": 0.09006084343027092,
"grad_norm": 2.044619083404541,
"learning_rate": 9.87439399501599e-07,
"loss": 0.5446,
"step": 8600
},
{
"epoch": 0.09110806253992523,
"grad_norm": 2.573242664337158,
"learning_rate": 9.87062785814303e-07,
"loss": 0.5347,
"step": 8700
},
{
"epoch": 0.09215528164957953,
"grad_norm": 2.520949125289917,
"learning_rate": 9.866806830459898e-07,
"loss": 0.5467,
"step": 8800
},
{
"epoch": 0.09320250075923385,
"grad_norm": 2.924830913543701,
"learning_rate": 9.86293095502866e-07,
"loss": 0.5187,
"step": 8900
},
{
"epoch": 0.09424971986888817,
"grad_norm": 2.2049362659454346,
"learning_rate": 9.859000275529507e-07,
"loss": 0.5549,
"step": 9000
},
{
"epoch": 0.09529693897854248,
"grad_norm": 2.932223320007324,
"learning_rate": 9.855014836260256e-07,
"loss": 0.5723,
"step": 9100
},
{
"epoch": 0.0963441580881968,
"grad_norm": 2.659306526184082,
"learning_rate": 9.850974682135855e-07,
"loss": 0.5471,
"step": 9200
},
{
"epoch": 0.09739137719785111,
"grad_norm": 3.1078333854675293,
"learning_rate": 9.84687985868787e-07,
"loss": 0.5498,
"step": 9300
},
{
"epoch": 0.09843859630750541,
"grad_norm": 2.73991322517395,
"learning_rate": 9.842730412063984e-07,
"loss": 0.5509,
"step": 9400
},
{
"epoch": 0.09948581541715973,
"grad_norm": 2.288360595703125,
"learning_rate": 9.83852638902747e-07,
"loss": 0.5311,
"step": 9500
},
{
"epoch": 0.10053303452681404,
"grad_norm": 2.391042947769165,
"learning_rate": 9.834267836956652e-07,
"loss": 0.569,
"step": 9600
},
{
"epoch": 0.10158025363646836,
"grad_norm": 2.225496292114258,
"learning_rate": 9.829954803844404e-07,
"loss": 0.5432,
"step": 9700
},
{
"epoch": 0.10262747274612267,
"grad_norm": 1.877164363861084,
"learning_rate": 9.82558733829757e-07,
"loss": 0.5795,
"step": 9800
},
{
"epoch": 0.10367469185577699,
"grad_norm": 2.455549478530884,
"learning_rate": 9.82116548953644e-07,
"loss": 0.577,
"step": 9900
},
{
"epoch": 0.10472191096543129,
"grad_norm": 3.1859889030456543,
"learning_rate": 9.816689307394198e-07,
"loss": 0.5742,
"step": 10000
},
{
"epoch": 0.10576913007508561,
"grad_norm": 2.9405317306518555,
"learning_rate": 9.812158842316341e-07,
"loss": 0.5674,
"step": 10100
},
{
"epoch": 0.10681634918473992,
"grad_norm": 2.1740851402282715,
"learning_rate": 9.807574145360125e-07,
"loss": 0.5219,
"step": 10200
},
{
"epoch": 0.10786356829439424,
"grad_norm": 2.1551525592803955,
"learning_rate": 9.80293526819399e-07,
"loss": 0.5378,
"step": 10300
},
{
"epoch": 0.10891078740404855,
"grad_norm": 1.479442834854126,
"learning_rate": 9.798242263096968e-07,
"loss": 0.5137,
"step": 10400
},
{
"epoch": 0.10995800651370287,
"grad_norm": 2.2272469997406006,
"learning_rate": 9.793495182958107e-07,
"loss": 0.5469,
"step": 10500
},
{
"epoch": 0.11100522562335717,
"grad_norm": 1.9610800743103027,
"learning_rate": 9.78869408127586e-07,
"loss": 0.5685,
"step": 10600
},
{
"epoch": 0.11205244473301149,
"grad_norm": 2.2086081504821777,
"learning_rate": 9.7838390121575e-07,
"loss": 0.5505,
"step": 10700
},
{
"epoch": 0.1130996638426658,
"grad_norm": 3.1201093196868896,
"learning_rate": 9.778930030318488e-07,
"loss": 0.5829,
"step": 10800
},
{
"epoch": 0.11414688295232012,
"grad_norm": 2.6629204750061035,
"learning_rate": 9.773967191081875e-07,
"loss": 0.5925,
"step": 10900
},
{
"epoch": 0.11519410206197443,
"grad_norm": 2.593073844909668,
"learning_rate": 9.768950550377674e-07,
"loss": 0.572,
"step": 11000
},
{
"epoch": 0.11624132117162873,
"grad_norm": 4.5134687423706055,
"learning_rate": 9.763880164742224e-07,
"loss": 0.5106,
"step": 11100
},
{
"epoch": 0.11728854028128305,
"grad_norm": 3.3710708618164062,
"learning_rate": 9.758756091317557e-07,
"loss": 0.567,
"step": 11200
},
{
"epoch": 0.11833575939093736,
"grad_norm": 3.414686679840088,
"learning_rate": 9.753578387850754e-07,
"loss": 0.578,
"step": 11300
},
{
"epoch": 0.11938297850059168,
"grad_norm": 2.6787045001983643,
"learning_rate": 9.748347112693294e-07,
"loss": 0.5587,
"step": 11400
},
{
"epoch": 0.120430197610246,
"grad_norm": 2.505725860595703,
"learning_rate": 9.743062324800395e-07,
"loss": 0.5513,
"step": 11500
},
{
"epoch": 0.12147741671990031,
"grad_norm": 2.5358970165252686,
"learning_rate": 9.737724083730354e-07,
"loss": 0.5378,
"step": 11600
},
{
"epoch": 0.12252463582955461,
"grad_norm": 1.6748542785644531,
"learning_rate": 9.732332449643868e-07,
"loss": 0.5062,
"step": 11700
},
{
"epoch": 0.12357185493920893,
"grad_norm": 2.4574966430664062,
"learning_rate": 9.726887483303364e-07,
"loss": 0.5721,
"step": 11800
},
{
"epoch": 0.12461907404886324,
"grad_norm": 2.737337589263916,
"learning_rate": 9.721389246072307e-07,
"loss": 0.5963,
"step": 11900
},
{
"epoch": 0.12566629315851754,
"grad_norm": 2.453996181488037,
"learning_rate": 9.715837799914517e-07,
"loss": 0.5917,
"step": 12000
},
{
"epoch": 0.12671351226817187,
"grad_norm": 2.9003748893737793,
"learning_rate": 9.710233207393463e-07,
"loss": 0.5603,
"step": 12100
},
{
"epoch": 0.12776073137782618,
"grad_norm": 2.409175395965576,
"learning_rate": 9.704575531671562e-07,
"loss": 0.568,
"step": 12200
},
{
"epoch": 0.1288079504874805,
"grad_norm": 3.183899402618408,
"learning_rate": 9.698864836509463e-07,
"loss": 0.5702,
"step": 12300
},
{
"epoch": 0.1298551695971348,
"grad_norm": 2.7574760913848877,
"learning_rate": 9.693101186265336e-07,
"loss": 0.5394,
"step": 12400
},
{
"epoch": 0.1309023887067891,
"grad_norm": 2.9319100379943848,
"learning_rate": 9.687284645894139e-07,
"loss": 0.5504,
"step": 12500
},
{
"epoch": 0.13194960781644344,
"grad_norm": 2.8977279663085938,
"learning_rate": 9.681415280946887e-07,
"loss": 0.611,
"step": 12600
},
{
"epoch": 0.13299682692609774,
"grad_norm": 1.9469819068908691,
"learning_rate": 9.675493157569922e-07,
"loss": 0.5621,
"step": 12700
},
{
"epoch": 0.13404404603575207,
"grad_norm": 2.0829553604125977,
"learning_rate": 9.669518342504155e-07,
"loss": 0.5305,
"step": 12800
},
{
"epoch": 0.13509126514540637,
"grad_norm": 3.0171096324920654,
"learning_rate": 9.663490903084324e-07,
"loss": 0.5666,
"step": 12900
},
{
"epoch": 0.1361384842550607,
"grad_norm": 3.0453896522521973,
"learning_rate": 9.657410907238224e-07,
"loss": 0.5332,
"step": 13000
},
{
"epoch": 0.137185703364715,
"grad_norm": 2.2059998512268066,
"learning_rate": 9.651278423485958e-07,
"loss": 0.5859,
"step": 13100
},
{
"epoch": 0.1382329224743693,
"grad_norm": 2.076673746109009,
"learning_rate": 9.645093520939146e-07,
"loss": 0.5048,
"step": 13200
},
{
"epoch": 0.13928014158402363,
"grad_norm": 1.7987829446792603,
"learning_rate": 9.638856269300163e-07,
"loss": 0.5501,
"step": 13300
},
{
"epoch": 0.14032736069367793,
"grad_norm": 3.1706273555755615,
"learning_rate": 9.63256673886134e-07,
"loss": 0.5389,
"step": 13400
},
{
"epoch": 0.14137457980333226,
"grad_norm": 2.9992752075195312,
"learning_rate": 9.626225000504177e-07,
"loss": 0.5517,
"step": 13500
},
{
"epoch": 0.14242179891298656,
"grad_norm": 1.2536182403564453,
"learning_rate": 9.619831125698552e-07,
"loss": 0.5304,
"step": 13600
},
{
"epoch": 0.14346901802264087,
"grad_norm": 2.491206645965576,
"learning_rate": 9.6133851865019e-07,
"loss": 0.5001,
"step": 13700
},
{
"epoch": 0.1445162371322952,
"grad_norm": 2.180227518081665,
"learning_rate": 9.606887255558417e-07,
"loss": 0.5149,
"step": 13800
},
{
"epoch": 0.1455634562419495,
"grad_norm": 1.546883463859558,
"learning_rate": 9.60033740609823e-07,
"loss": 0.5566,
"step": 13900
},
{
"epoch": 0.14661067535160383,
"grad_norm": 2.402559757232666,
"learning_rate": 9.593735711936567e-07,
"loss": 0.5343,
"step": 14000
},
{
"epoch": 0.14765789446125813,
"grad_norm": 4.94249153137207,
"learning_rate": 9.587082247472948e-07,
"loss": 0.516,
"step": 14100
},
{
"epoch": 0.14870511357091243,
"grad_norm": 1.760003924369812,
"learning_rate": 9.580377087690324e-07,
"loss": 0.5395,
"step": 14200
},
{
"epoch": 0.14975233268056676,
"grad_norm": 2.1215927600860596,
"learning_rate": 9.573620308154238e-07,
"loss": 0.55,
"step": 14300
},
{
"epoch": 0.15079955179022106,
"grad_norm": 2.929760217666626,
"learning_rate": 9.566811985011981e-07,
"loss": 0.5571,
"step": 14400
},
{
"epoch": 0.1518467708998754,
"grad_norm": 2.7724721431732178,
"learning_rate": 9.559952194991726e-07,
"loss": 0.5712,
"step": 14500
},
{
"epoch": 0.1528939900095297,
"grad_norm": 2.270812749862671,
"learning_rate": 9.55304101540166e-07,
"loss": 0.5355,
"step": 14600
},
{
"epoch": 0.15394120911918402,
"grad_norm": 2.3572235107421875,
"learning_rate": 9.546078524129127e-07,
"loss": 0.5595,
"step": 14700
},
{
"epoch": 0.15498842822883832,
"grad_norm": 1.5402534008026123,
"learning_rate": 9.539064799639735e-07,
"loss": 0.5561,
"step": 14800
},
{
"epoch": 0.15603564733849262,
"grad_norm": 3.2286136150360107,
"learning_rate": 9.531999920976481e-07,
"loss": 0.4951,
"step": 14900
},
{
"epoch": 0.15708286644814695,
"grad_norm": 1.4825396537780762,
"learning_rate": 9.524883967758858e-07,
"loss": 0.5099,
"step": 15000
},
{
"epoch": 0.15813008555780125,
"grad_norm": 1.649629831314087,
"learning_rate": 9.517717020181953e-07,
"loss": 0.5694,
"step": 15100
},
{
"epoch": 0.15917730466745558,
"grad_norm": 1.8996721506118774,
"learning_rate": 9.510499159015553e-07,
"loss": 0.5364,
"step": 15200
},
{
"epoch": 0.16022452377710988,
"grad_norm": 3.648730993270874,
"learning_rate": 9.50323046560322e-07,
"loss": 0.5276,
"step": 15300
},
{
"epoch": 0.16127174288676419,
"grad_norm": 2.633986473083496,
"learning_rate": 9.495911021861396e-07,
"loss": 0.5399,
"step": 15400
},
{
"epoch": 0.16231896199641851,
"grad_norm": 1.8254631757736206,
"learning_rate": 9.488540910278455e-07,
"loss": 0.5484,
"step": 15500
},
{
"epoch": 0.16336618110607282,
"grad_norm": 2.676395893096924,
"learning_rate": 9.481120213913794e-07,
"loss": 0.5741,
"step": 15600
},
{
"epoch": 0.16441340021572715,
"grad_norm": 3.6794283390045166,
"learning_rate": 9.47364901639688e-07,
"loss": 0.5481,
"step": 15700
},
{
"epoch": 0.16546061932538145,
"grad_norm": 1.8362795114517212,
"learning_rate": 9.466127401926326e-07,
"loss": 0.5704,
"step": 15800
},
{
"epoch": 0.16650783843503575,
"grad_norm": 2.256762742996216,
"learning_rate": 9.458555455268924e-07,
"loss": 0.5159,
"step": 15900
},
{
"epoch": 0.16755505754469008,
"grad_norm": 2.6386005878448486,
"learning_rate": 9.450933261758702e-07,
"loss": 0.4916,
"step": 16000
},
{
"epoch": 0.16860227665434438,
"grad_norm": 2.635512113571167,
"learning_rate": 9.443260907295955e-07,
"loss": 0.508,
"step": 16100
},
{
"epoch": 0.1696494957639987,
"grad_norm": 1.6727428436279297,
"learning_rate": 9.435538478346282e-07,
"loss": 0.5282,
"step": 16200
},
{
"epoch": 0.170696714873653,
"grad_norm": 2.1256072521209717,
"learning_rate": 9.42776606193961e-07,
"loss": 0.5878,
"step": 16300
},
{
"epoch": 0.17174393398330734,
"grad_norm": 2.557060956954956,
"learning_rate": 9.419943745669209e-07,
"loss": 0.5392,
"step": 16400
},
{
"epoch": 0.17279115309296164,
"grad_norm": 2.912794828414917,
"learning_rate": 9.412071617690713e-07,
"loss": 0.5631,
"step": 16500
},
{
"epoch": 0.17383837220261594,
"grad_norm": 2.380751132965088,
"learning_rate": 9.40414976672112e-07,
"loss": 0.5518,
"step": 16600
},
{
"epoch": 0.17488559131227027,
"grad_norm": 2.5645503997802734,
"learning_rate": 9.396178282037795e-07,
"loss": 0.5377,
"step": 16700
},
{
"epoch": 0.17593281042192457,
"grad_norm": 2.270052433013916,
"learning_rate": 9.388157253477459e-07,
"loss": 0.524,
"step": 16800
},
{
"epoch": 0.1769800295315789,
"grad_norm": 2.3046374320983887,
"learning_rate": 9.380086771435187e-07,
"loss": 0.5224,
"step": 16900
},
{
"epoch": 0.1780272486412332,
"grad_norm": 1.9633408784866333,
"learning_rate": 9.371966926863381e-07,
"loss": 0.5241,
"step": 17000
},
{
"epoch": 0.1790744677508875,
"grad_norm": 2.206256628036499,
"learning_rate": 9.363797811270743e-07,
"loss": 0.5599,
"step": 17100
},
{
"epoch": 0.18012168686054184,
"grad_norm": 2.883242607116699,
"learning_rate": 9.355579516721251e-07,
"loss": 0.5472,
"step": 17200
},
{
"epoch": 0.18116890597019614,
"grad_norm": 3.9055755138397217,
"learning_rate": 9.34731213583312e-07,
"loss": 0.5463,
"step": 17300
},
{
"epoch": 0.18221612507985047,
"grad_norm": 2.9254720211029053,
"learning_rate": 9.338995761777751e-07,
"loss": 0.5385,
"step": 17400
},
{
"epoch": 0.18326334418950477,
"grad_norm": 2.070220947265625,
"learning_rate": 9.33063048827869e-07,
"loss": 0.597,
"step": 17500
},
{
"epoch": 0.18431056329915907,
"grad_norm": 2.241502285003662,
"learning_rate": 9.322216409610566e-07,
"loss": 0.4954,
"step": 17600
},
{
"epoch": 0.1853577824088134,
"grad_norm": 2.7689974308013916,
"learning_rate": 9.313753620598035e-07,
"loss": 0.5536,
"step": 17700
},
{
"epoch": 0.1864050015184677,
"grad_norm": 2.5464389324188232,
"learning_rate": 9.3052422166147e-07,
"loss": 0.5342,
"step": 17800
},
{
"epoch": 0.18745222062812203,
"grad_norm": 1.727013111114502,
"learning_rate": 9.296682293582049e-07,
"loss": 0.5383,
"step": 17900
},
{
"epoch": 0.18849943973777633,
"grad_norm": 4.623219966888428,
"learning_rate": 9.288073947968364e-07,
"loss": 0.5305,
"step": 18000
},
{
"epoch": 0.18954665884743066,
"grad_norm": 1.5261229276657104,
"learning_rate": 9.27941727678764e-07,
"loss": 0.5235,
"step": 18100
},
{
"epoch": 0.19059387795708496,
"grad_norm": 1.9866268634796143,
"learning_rate": 9.270712377598491e-07,
"loss": 0.5217,
"step": 18200
},
{
"epoch": 0.19164109706673926,
"grad_norm": 3.0393967628479004,
"learning_rate": 9.261959348503046e-07,
"loss": 0.5241,
"step": 18300
},
{
"epoch": 0.1926883161763936,
"grad_norm": 2.8217124938964844,
"learning_rate": 9.253158288145848e-07,
"loss": 0.5713,
"step": 18400
},
{
"epoch": 0.1937355352860479,
"grad_norm": 2.327930450439453,
"learning_rate": 9.24430929571274e-07,
"loss": 0.5191,
"step": 18500
},
{
"epoch": 0.19478275439570222,
"grad_norm": 2.090432643890381,
"learning_rate": 9.235412470929748e-07,
"loss": 0.5285,
"step": 18600
},
{
"epoch": 0.19582997350535652,
"grad_norm": 2.427619457244873,
"learning_rate": 9.226467914061962e-07,
"loss": 0.5157,
"step": 18700
},
{
"epoch": 0.19687719261501083,
"grad_norm": 3.4102041721343994,
"learning_rate": 9.217475725912391e-07,
"loss": 0.52,
"step": 18800
},
{
"epoch": 0.19792441172466516,
"grad_norm": 1.7967109680175781,
"learning_rate": 9.208436007820848e-07,
"loss": 0.514,
"step": 18900
},
{
"epoch": 0.19897163083431946,
"grad_norm": 2.5887088775634766,
"learning_rate": 9.19934886166279e-07,
"loss": 0.4798,
"step": 19000
},
{
"epoch": 0.2000188499439738,
"grad_norm": 2.08363676071167,
"learning_rate": 9.190214389848181e-07,
"loss": 0.5348,
"step": 19100
},
{
"epoch": 0.2010660690536281,
"grad_norm": 2.4554569721221924,
"learning_rate": 9.18103269532033e-07,
"loss": 0.4976,
"step": 19200
},
{
"epoch": 0.20211328816328242,
"grad_norm": 2.604750633239746,
"learning_rate": 9.171803881554736e-07,
"loss": 0.5048,
"step": 19300
},
{
"epoch": 0.20316050727293672,
"grad_norm": 1.9831663370132446,
"learning_rate": 9.162528052557925e-07,
"loss": 0.5618,
"step": 19400
},
{
"epoch": 0.20420772638259102,
"grad_norm": 2.6448137760162354,
"learning_rate": 9.153205312866265e-07,
"loss": 0.5382,
"step": 19500
},
{
"epoch": 0.20525494549224535,
"grad_norm": 2.27817964553833,
"learning_rate": 9.143835767544805e-07,
"loss": 0.5189,
"step": 19600
},
{
"epoch": 0.20630216460189965,
"grad_norm": 1.8295369148254395,
"learning_rate": 9.134419522186075e-07,
"loss": 0.5083,
"step": 19700
},
{
"epoch": 0.20734938371155398,
"grad_norm": 3.7082695960998535,
"learning_rate": 9.124956682908908e-07,
"loss": 0.4839,
"step": 19800
},
{
"epoch": 0.20839660282120828,
"grad_norm": 2.17672061920166,
"learning_rate": 9.115447356357238e-07,
"loss": 0.5203,
"step": 19900
},
{
"epoch": 0.20944382193086258,
"grad_norm": 2.759127378463745,
"learning_rate": 9.105891649698898e-07,
"loss": 0.5339,
"step": 20000
},
{
"epoch": 0.2104910410405169,
"grad_norm": 2.4461498260498047,
"learning_rate": 9.096289670624416e-07,
"loss": 0.5536,
"step": 20100
},
{
"epoch": 0.21153826015017121,
"grad_norm": 2.8688385486602783,
"learning_rate": 9.086641527345796e-07,
"loss": 0.5266,
"step": 20200
},
{
"epoch": 0.21258547925982554,
"grad_norm": 2.589167356491089,
"learning_rate": 9.076947328595306e-07,
"loss": 0.5031,
"step": 20300
},
{
"epoch": 0.21363269836947985,
"grad_norm": 3.033956289291382,
"learning_rate": 9.067207183624243e-07,
"loss": 0.5288,
"step": 20400
},
{
"epoch": 0.21467991747913415,
"grad_norm": 2.5122592449188232,
"learning_rate": 9.057421202201714e-07,
"loss": 0.5002,
"step": 20500
},
{
"epoch": 0.21572713658878848,
"grad_norm": 2.099766731262207,
"learning_rate": 9.047589494613381e-07,
"loss": 0.5389,
"step": 20600
},
{
"epoch": 0.21677435569844278,
"grad_norm": 2.65134596824646,
"learning_rate": 9.037712171660241e-07,
"loss": 0.5537,
"step": 20700
},
{
"epoch": 0.2178215748080971,
"grad_norm": 2.301417589187622,
"learning_rate": 9.027789344657357e-07,
"loss": 0.5554,
"step": 20800
},
{
"epoch": 0.2188687939177514,
"grad_norm": 2.6696295738220215,
"learning_rate": 9.017821125432612e-07,
"loss": 0.5191,
"step": 20900
},
{
"epoch": 0.21991601302740574,
"grad_norm": 2.455559015274048,
"learning_rate": 9.007807626325455e-07,
"loss": 0.5053,
"step": 21000
},
{
"epoch": 0.22096323213706004,
"grad_norm": 2.676161289215088,
"learning_rate": 8.997748960185622e-07,
"loss": 0.518,
"step": 21100
},
{
"epoch": 0.22201045124671434,
"grad_norm": 2.6200263500213623,
"learning_rate": 8.987645240371873e-07,
"loss": 0.4884,
"step": 21200
},
{
"epoch": 0.22305767035636867,
"grad_norm": 3.8255863189697266,
"learning_rate": 8.977496580750712e-07,
"loss": 0.5348,
"step": 21300
},
{
"epoch": 0.22410488946602297,
"grad_norm": 2.0892577171325684,
"learning_rate": 8.967303095695105e-07,
"loss": 0.5178,
"step": 21400
},
{
"epoch": 0.2251521085756773,
"grad_norm": 2.40419864654541,
"learning_rate": 8.957064900083187e-07,
"loss": 0.584,
"step": 21500
},
{
"epoch": 0.2261993276853316,
"grad_norm": 3.042703628540039,
"learning_rate": 8.946782109296973e-07,
"loss": 0.5267,
"step": 21600
},
{
"epoch": 0.2272465467949859,
"grad_norm": 1.6234790086746216,
"learning_rate": 8.936454839221054e-07,
"loss": 0.5217,
"step": 21700
},
{
"epoch": 0.22829376590464023,
"grad_norm": 1.706650972366333,
"learning_rate": 8.926083206241291e-07,
"loss": 0.5242,
"step": 21800
},
{
"epoch": 0.22934098501429453,
"grad_norm": 4.158198833465576,
"learning_rate": 8.915667327243506e-07,
"loss": 0.524,
"step": 21900
},
{
"epoch": 0.23038820412394886,
"grad_norm": 2.2484548091888428,
"learning_rate": 8.905207319612163e-07,
"loss": 0.5347,
"step": 22000
},
{
"epoch": 0.23143542323360317,
"grad_norm": 2.990169048309326,
"learning_rate": 8.894703301229043e-07,
"loss": 0.5408,
"step": 22100
},
{
"epoch": 0.23248264234325747,
"grad_norm": 3.9766592979431152,
"learning_rate": 8.884155390471919e-07,
"loss": 0.5046,
"step": 22200
},
{
"epoch": 0.2335298614529118,
"grad_norm": 2.5463485717773438,
"learning_rate": 8.873563706213221e-07,
"loss": 0.4881,
"step": 22300
},
{
"epoch": 0.2345770805625661,
"grad_norm": 2.7277047634124756,
"learning_rate": 8.862928367818696e-07,
"loss": 0.5228,
"step": 22400
},
{
"epoch": 0.23562429967222043,
"grad_norm": 1.9528217315673828,
"learning_rate": 8.852249495146063e-07,
"loss": 0.5056,
"step": 22500
},
{
"epoch": 0.23667151878187473,
"grad_norm": 2.527414083480835,
"learning_rate": 8.841527208543658e-07,
"loss": 0.5186,
"step": 22600
},
{
"epoch": 0.23771873789152906,
"grad_norm": 1.9525986909866333,
"learning_rate": 8.830761628849087e-07,
"loss": 0.5195,
"step": 22700
},
{
"epoch": 0.23876595700118336,
"grad_norm": 1.6230095624923706,
"learning_rate": 8.819952877387855e-07,
"loss": 0.4834,
"step": 22800
},
{
"epoch": 0.23981317611083766,
"grad_norm": 2.2290198802948,
"learning_rate": 8.809101075972005e-07,
"loss": 0.5207,
"step": 22900
},
{
"epoch": 0.240860395220492,
"grad_norm": 3.419203996658325,
"learning_rate": 8.798206346898743e-07,
"loss": 0.5064,
"step": 23000
},
{
"epoch": 0.2419076143301463,
"grad_norm": 2.360508441925049,
"learning_rate": 8.787268812949054e-07,
"loss": 0.5011,
"step": 23100
},
{
"epoch": 0.24295483343980062,
"grad_norm": 1.8023535013198853,
"learning_rate": 8.77628859738633e-07,
"loss": 0.5099,
"step": 23200
},
{
"epoch": 0.24400205254945492,
"grad_norm": 1.9575679302215576,
"learning_rate": 8.765265823954972e-07,
"loss": 0.5361,
"step": 23300
},
{
"epoch": 0.24504927165910922,
"grad_norm": 1.5841313600540161,
"learning_rate": 8.754200616879001e-07,
"loss": 0.541,
"step": 23400
},
{
"epoch": 0.24609649076876355,
"grad_norm": 2.8605728149414062,
"learning_rate": 8.743093100860648e-07,
"loss": 0.5541,
"step": 23500
},
{
"epoch": 0.24714370987841786,
"grad_norm": 1.696733832359314,
"learning_rate": 8.731943401078961e-07,
"loss": 0.511,
"step": 23600
},
{
"epoch": 0.24819092898807218,
"grad_norm": 2.1618356704711914,
"learning_rate": 8.720751643188389e-07,
"loss": 0.5066,
"step": 23700
},
{
"epoch": 0.2492381480977265,
"grad_norm": 2.721067428588867,
"learning_rate": 8.709517953317365e-07,
"loss": 0.5398,
"step": 23800
},
{
"epoch": 0.2502853672073808,
"grad_norm": 1.8457568883895874,
"learning_rate": 8.698242458066882e-07,
"loss": 0.4879,
"step": 23900
},
{
"epoch": 0.2513325863170351,
"grad_norm": 2.435941696166992,
"learning_rate": 8.686925284509077e-07,
"loss": 0.531,
"step": 24000
},
{
"epoch": 0.2523798054266894,
"grad_norm": 2.617920160293579,
"learning_rate": 8.675566560185786e-07,
"loss": 0.5189,
"step": 24100
},
{
"epoch": 0.25342702453634375,
"grad_norm": 2.538632869720459,
"learning_rate": 8.664166413107109e-07,
"loss": 0.5433,
"step": 24200
},
{
"epoch": 0.2544742436459981,
"grad_norm": 2.3944451808929443,
"learning_rate": 8.65272497174998e-07,
"loss": 0.5401,
"step": 24300
},
{
"epoch": 0.25552146275565235,
"grad_norm": 3.6203765869140625,
"learning_rate": 8.641242365056705e-07,
"loss": 0.544,
"step": 24400
},
{
"epoch": 0.2565686818653067,
"grad_norm": 2.866250991821289,
"learning_rate": 8.629718722433507e-07,
"loss": 0.5357,
"step": 24500
},
{
"epoch": 0.257615900974961,
"grad_norm": 3.3872838020324707,
"learning_rate": 8.618154173749088e-07,
"loss": 0.5261,
"step": 24600
},
{
"epoch": 0.2586631200846153,
"grad_norm": 2.269967794418335,
"learning_rate": 8.606548849333138e-07,
"loss": 0.5128,
"step": 24700
},
{
"epoch": 0.2597103391942696,
"grad_norm": 2.1335697174072266,
"learning_rate": 8.594902879974888e-07,
"loss": 0.5645,
"step": 24800
},
{
"epoch": 0.26075755830392394,
"grad_norm": 2.443239212036133,
"learning_rate": 8.583216396921624e-07,
"loss": 0.4806,
"step": 24900
},
{
"epoch": 0.2618047774135782,
"grad_norm": 2.713833808898926,
"learning_rate": 8.571489531877214e-07,
"loss": 0.5271,
"step": 25000
},
{
"epoch": 0.26285199652323255,
"grad_norm": 3.485182046890259,
"learning_rate": 8.559722417000619e-07,
"loss": 0.4962,
"step": 25100
},
{
"epoch": 0.2638992156328869,
"grad_norm": 2.306403160095215,
"learning_rate": 8.547915184904409e-07,
"loss": 0.5122,
"step": 25200
},
{
"epoch": 0.2649464347425412,
"grad_norm": 2.6151928901672363,
"learning_rate": 8.536067968653261e-07,
"loss": 0.5316,
"step": 25300
},
{
"epoch": 0.2659936538521955,
"grad_norm": 2.3466389179229736,
"learning_rate": 8.524180901762469e-07,
"loss": 0.4991,
"step": 25400
},
{
"epoch": 0.2670408729618498,
"grad_norm": 2.0926601886749268,
"learning_rate": 8.512254118196429e-07,
"loss": 0.5254,
"step": 25500
},
{
"epoch": 0.26808809207150414,
"grad_norm": 1.9708478450775146,
"learning_rate": 8.500287752367142e-07,
"loss": 0.507,
"step": 25600
},
{
"epoch": 0.2691353111811584,
"grad_norm": 2.028843879699707,
"learning_rate": 8.48828193913268e-07,
"loss": 0.5066,
"step": 25700
},
{
"epoch": 0.27018253029081274,
"grad_norm": 2.9337289333343506,
"learning_rate": 8.47623681379569e-07,
"loss": 0.5023,
"step": 25800
},
{
"epoch": 0.27122974940046707,
"grad_norm": 2.8608200550079346,
"learning_rate": 8.464152512101848e-07,
"loss": 0.5417,
"step": 25900
},
{
"epoch": 0.2722769685101214,
"grad_norm": 3.0925405025482178,
"learning_rate": 8.452029170238344e-07,
"loss": 0.5415,
"step": 26000
},
{
"epoch": 0.27332418761977567,
"grad_norm": 1.9558321237564087,
"learning_rate": 8.439866924832338e-07,
"loss": 0.519,
"step": 26100
},
{
"epoch": 0.27437140672943,
"grad_norm": 1.5545213222503662,
"learning_rate": 8.427665912949425e-07,
"loss": 0.5441,
"step": 26200
},
{
"epoch": 0.27541862583908433,
"grad_norm": 3.6202712059020996,
"learning_rate": 8.415426272092089e-07,
"loss": 0.5559,
"step": 26300
},
{
"epoch": 0.2764658449487386,
"grad_norm": 1.8004056215286255,
"learning_rate": 8.403148140198151e-07,
"loss": 0.5034,
"step": 26400
},
{
"epoch": 0.27751306405839293,
"grad_norm": 2.5597338676452637,
"learning_rate": 8.390831655639223e-07,
"loss": 0.5294,
"step": 26500
},
{
"epoch": 0.27856028316804726,
"grad_norm": 2.014400005340576,
"learning_rate": 8.378476957219134e-07,
"loss": 0.5663,
"step": 26600
},
{
"epoch": 0.27960750227770154,
"grad_norm": 2.069840669631958,
"learning_rate": 8.366084184172377e-07,
"loss": 0.5007,
"step": 26700
},
{
"epoch": 0.28065472138735587,
"grad_norm": 5.621069431304932,
"learning_rate": 8.353653476162543e-07,
"loss": 0.5263,
"step": 26800
},
{
"epoch": 0.2817019404970102,
"grad_norm": 3.1065540313720703,
"learning_rate": 8.341184973280732e-07,
"loss": 0.5048,
"step": 26900
},
{
"epoch": 0.2827491596066645,
"grad_norm": 2.579742431640625,
"learning_rate": 8.328678816043988e-07,
"loss": 0.5272,
"step": 27000
},
{
"epoch": 0.2837963787163188,
"grad_norm": 2.476778030395508,
"learning_rate": 8.31613514539371e-07,
"loss": 0.4944,
"step": 27100
},
{
"epoch": 0.2848435978259731,
"grad_norm": 2.7026314735412598,
"learning_rate": 8.303554102694065e-07,
"loss": 0.5257,
"step": 27200
},
{
"epoch": 0.28589081693562746,
"grad_norm": 2.1597368717193604,
"learning_rate": 8.290935829730391e-07,
"loss": 0.5282,
"step": 27300
},
{
"epoch": 0.28693803604528173,
"grad_norm": 2.447305202484131,
"learning_rate": 8.278280468707606e-07,
"loss": 0.5295,
"step": 27400
},
{
"epoch": 0.28798525515493606,
"grad_norm": 2.806995391845703,
"learning_rate": 8.265588162248597e-07,
"loss": 0.4933,
"step": 27500
},
{
"epoch": 0.2890324742645904,
"grad_norm": 2.1765849590301514,
"learning_rate": 8.252859053392622e-07,
"loss": 0.5486,
"step": 27600
},
{
"epoch": 0.2900796933742447,
"grad_norm": 2.122382640838623,
"learning_rate": 8.240093285593692e-07,
"loss": 0.5255,
"step": 27700
},
{
"epoch": 0.291126912483899,
"grad_norm": 2.136657476425171,
"learning_rate": 8.22729100271895e-07,
"loss": 0.5214,
"step": 27800
},
{
"epoch": 0.2921741315935533,
"grad_norm": 2.033987522125244,
"learning_rate": 8.214452349047065e-07,
"loss": 0.5065,
"step": 27900
},
{
"epoch": 0.29322135070320765,
"grad_norm": 3.346703290939331,
"learning_rate": 8.20157746926659e-07,
"loss": 0.5349,
"step": 28000
},
{
"epoch": 0.2942685698128619,
"grad_norm": 2.63242244720459,
"learning_rate": 8.188666508474335e-07,
"loss": 0.5264,
"step": 28100
},
{
"epoch": 0.29531578892251625,
"grad_norm": 2.475911855697632,
"learning_rate": 8.175719612173741e-07,
"loss": 0.5186,
"step": 28200
},
{
"epoch": 0.2963630080321706,
"grad_norm": 1.5967457294464111,
"learning_rate": 8.162736926273231e-07,
"loss": 0.5321,
"step": 28300
},
{
"epoch": 0.29741022714182486,
"grad_norm": 1.6950793266296387,
"learning_rate": 8.149718597084565e-07,
"loss": 0.5028,
"step": 28400
},
{
"epoch": 0.2984574462514792,
"grad_norm": 1.8821123838424683,
"learning_rate": 8.136664771321198e-07,
"loss": 0.5147,
"step": 28500
},
{
"epoch": 0.2995046653611335,
"grad_norm": 3.8432750701904297,
"learning_rate": 8.123575596096624e-07,
"loss": 0.5055,
"step": 28600
},
{
"epoch": 0.30055188447078784,
"grad_norm": 2.2065136432647705,
"learning_rate": 8.110451218922711e-07,
"loss": 0.4804,
"step": 28700
},
{
"epoch": 0.3015991035804421,
"grad_norm": 3.215104103088379,
"learning_rate": 8.097291787708052e-07,
"loss": 0.508,
"step": 28800
},
{
"epoch": 0.30264632269009645,
"grad_norm": 2.6659111976623535,
"learning_rate": 8.084097450756286e-07,
"loss": 0.5058,
"step": 28900
},
{
"epoch": 0.3036935417997508,
"grad_norm": 3.1594624519348145,
"learning_rate": 8.070868356764431e-07,
"loss": 0.4819,
"step": 29000
},
{
"epoch": 0.30474076090940505,
"grad_norm": 3.2502479553222656,
"learning_rate": 8.05760465482121e-07,
"loss": 0.5132,
"step": 29100
},
{
"epoch": 0.3057879800190594,
"grad_norm": 2.3569111824035645,
"learning_rate": 8.044306494405372e-07,
"loss": 0.4989,
"step": 29200
},
{
"epoch": 0.3068351991287137,
"grad_norm": 2.7516555786132812,
"learning_rate": 8.030974025384e-07,
"loss": 0.4982,
"step": 29300
},
{
"epoch": 0.30788241823836804,
"grad_norm": 2.388401508331299,
"learning_rate": 8.017607398010829e-07,
"loss": 0.492,
"step": 29400
},
{
"epoch": 0.3089296373480223,
"grad_norm": 2.49920392036438,
"learning_rate": 8.004206762924548e-07,
"loss": 0.4729,
"step": 29500
},
{
"epoch": 0.30997685645767664,
"grad_norm": 2.528714179992676,
"learning_rate": 7.99077227114711e-07,
"loss": 0.5229,
"step": 29600
},
{
"epoch": 0.31102407556733097,
"grad_norm": 2.0866329669952393,
"learning_rate": 7.977304074082021e-07,
"loss": 0.483,
"step": 29700
},
{
"epoch": 0.31207129467698524,
"grad_norm": 3.1670796871185303,
"learning_rate": 7.963802323512638e-07,
"loss": 0.4816,
"step": 29800
},
{
"epoch": 0.3131185137866396,
"grad_norm": 1.9715406894683838,
"learning_rate": 7.950267171600458e-07,
"loss": 0.4666,
"step": 29900
},
{
"epoch": 0.3141657328962939,
"grad_norm": 1.6176679134368896,
"learning_rate": 7.936698770883404e-07,
"loss": 0.4886,
"step": 30000
},
{
"epoch": 0.3152129520059482,
"grad_norm": 2.4239096641540527,
"learning_rate": 7.923097274274103e-07,
"loss": 0.5085,
"step": 30100
},
{
"epoch": 0.3162601711156025,
"grad_norm": 1.8292428255081177,
"learning_rate": 7.909462835058169e-07,
"loss": 0.538,
"step": 30200
},
{
"epoch": 0.31730739022525684,
"grad_norm": 2.2372076511383057,
"learning_rate": 7.895795606892466e-07,
"loss": 0.5099,
"step": 30300
},
{
"epoch": 0.31835460933491116,
"grad_norm": 1.9392811059951782,
"learning_rate": 7.882095743803386e-07,
"loss": 0.4947,
"step": 30400
},
{
"epoch": 0.31940182844456544,
"grad_norm": 2.645183801651001,
"learning_rate": 7.868363400185106e-07,
"loss": 0.5012,
"step": 30500
},
{
"epoch": 0.32044904755421977,
"grad_norm": 3.2452821731567383,
"learning_rate": 7.85459873079785e-07,
"loss": 0.4696,
"step": 30600
},
{
"epoch": 0.3214962666638741,
"grad_norm": 1.310027003288269,
"learning_rate": 7.84080189076615e-07,
"loss": 0.5183,
"step": 30700
},
{
"epoch": 0.32254348577352837,
"grad_norm": 2.6369211673736572,
"learning_rate": 7.826973035577091e-07,
"loss": 0.5135,
"step": 30800
},
{
"epoch": 0.3235907048831827,
"grad_norm": 2.9246723651885986,
"learning_rate": 7.813112321078559e-07,
"loss": 0.527,
"step": 30900
},
{
"epoch": 0.32463792399283703,
"grad_norm": 3.309020519256592,
"learning_rate": 7.799219903477489e-07,
"loss": 0.5322,
"step": 31000
},
{
"epoch": 0.32568514310249136,
"grad_norm": 2.4480512142181396,
"learning_rate": 7.785295939338105e-07,
"loss": 0.5234,
"step": 31100
},
{
"epoch": 0.32673236221214563,
"grad_norm": 1.7909550666809082,
"learning_rate": 7.771340585580149e-07,
"loss": 0.4938,
"step": 31200
},
{
"epoch": 0.32777958132179996,
"grad_norm": 2.6975667476654053,
"learning_rate": 7.757353999477114e-07,
"loss": 0.491,
"step": 31300
},
{
"epoch": 0.3288268004314543,
"grad_norm": 2.4480390548706055,
"learning_rate": 7.743336338654483e-07,
"loss": 0.538,
"step": 31400
},
{
"epoch": 0.32987401954110857,
"grad_norm": 1.8292025327682495,
"learning_rate": 7.729287761087935e-07,
"loss": 0.4906,
"step": 31500
},
{
"epoch": 0.3309212386507629,
"grad_norm": 1.5502568483352661,
"learning_rate": 7.715208425101576e-07,
"loss": 0.459,
"step": 31600
},
{
"epoch": 0.3319684577604172,
"grad_norm": 2.6698973178863525,
"learning_rate": 7.701098489366156e-07,
"loss": 0.5086,
"step": 31700
},
{
"epoch": 0.3330156768700715,
"grad_norm": 2.4431324005126953,
"learning_rate": 7.686958112897271e-07,
"loss": 0.4843,
"step": 31800
},
{
"epoch": 0.3340628959797258,
"grad_norm": 2.875575065612793,
"learning_rate": 7.67278745505358e-07,
"loss": 0.5171,
"step": 31900
},
{
"epoch": 0.33511011508938016,
"grad_norm": 2.196960210800171,
"learning_rate": 7.658586675535005e-07,
"loss": 0.5026,
"step": 32000
},
{
"epoch": 0.3361573341990345,
"grad_norm": 2.801039457321167,
"learning_rate": 7.644355934380933e-07,
"loss": 0.5175,
"step": 32100
},
{
"epoch": 0.33720455330868876,
"grad_norm": 2.4252429008483887,
"learning_rate": 7.630095391968407e-07,
"loss": 0.492,
"step": 32200
},
{
"epoch": 0.3382517724183431,
"grad_norm": 1.9080466032028198,
"learning_rate": 7.615805209010334e-07,
"loss": 0.5203,
"step": 32300
},
{
"epoch": 0.3392989915279974,
"grad_norm": 1.8371050357818604,
"learning_rate": 7.601485546553647e-07,
"loss": 0.5028,
"step": 32400
},
{
"epoch": 0.3403462106376517,
"grad_norm": 3.5394959449768066,
"learning_rate": 7.587136565977522e-07,
"loss": 0.5203,
"step": 32500
},
{
"epoch": 0.341393429747306,
"grad_norm": 2.381826400756836,
"learning_rate": 7.572758428991532e-07,
"loss": 0.5254,
"step": 32600
},
{
"epoch": 0.34244064885696035,
"grad_norm": 1.7615987062454224,
"learning_rate": 7.55835129763384e-07,
"loss": 0.5091,
"step": 32700
},
{
"epoch": 0.3434878679666147,
"grad_norm": 2.329334020614624,
"learning_rate": 7.543915334269365e-07,
"loss": 0.5004,
"step": 32800
},
{
"epoch": 0.34453508707626895,
"grad_norm": 2.9679040908813477,
"learning_rate": 7.529450701587963e-07,
"loss": 0.5114,
"step": 32900
},
{
"epoch": 0.3455823061859233,
"grad_norm": 3.3162288665771484,
"learning_rate": 7.514957562602582e-07,
"loss": 0.5055,
"step": 33000
},
{
"epoch": 0.3466295252955776,
"grad_norm": 2.0709986686706543,
"learning_rate": 7.500436080647428e-07,
"loss": 0.5574,
"step": 33100
},
{
"epoch": 0.3476767444052319,
"grad_norm": 2.1400296688079834,
"learning_rate": 7.485886419376126e-07,
"loss": 0.5777,
"step": 33200
},
{
"epoch": 0.3487239635148862,
"grad_norm": 2.4479362964630127,
"learning_rate": 7.471308742759879e-07,
"loss": 0.5378,
"step": 33300
},
{
"epoch": 0.34977118262454054,
"grad_norm": 2.2012875080108643,
"learning_rate": 7.456703215085609e-07,
"loss": 0.4941,
"step": 33400
},
{
"epoch": 0.3508184017341948,
"grad_norm": 2.5233943462371826,
"learning_rate": 7.44207000095412e-07,
"loss": 0.547,
"step": 33500
},
{
"epoch": 0.35186562084384915,
"grad_norm": 2.050294876098633,
"learning_rate": 7.427409265278235e-07,
"loss": 0.5326,
"step": 33600
},
{
"epoch": 0.3529128399535035,
"grad_norm": 1.9416810274124146,
"learning_rate": 7.412721173280931e-07,
"loss": 0.5373,
"step": 33700
},
{
"epoch": 0.3539600590631578,
"grad_norm": 2.4550209045410156,
"learning_rate": 7.398005890493493e-07,
"loss": 0.5025,
"step": 33800
},
{
"epoch": 0.3550072781728121,
"grad_norm": 2.1860315799713135,
"learning_rate": 7.383263582753633e-07,
"loss": 0.4961,
"step": 33900
},
{
"epoch": 0.3560544972824664,
"grad_norm": 3.3393681049346924,
"learning_rate": 7.368494416203632e-07,
"loss": 0.5014,
"step": 34000
},
{
"epoch": 0.35710171639212074,
"grad_norm": 2.2855758666992188,
"learning_rate": 7.353698557288462e-07,
"loss": 0.5179,
"step": 34100
},
{
"epoch": 0.358148935501775,
"grad_norm": 2.719910144805908,
"learning_rate": 7.338876172753913e-07,
"loss": 0.5151,
"step": 34200
},
{
"epoch": 0.35919615461142934,
"grad_norm": 2.3122212886810303,
"learning_rate": 7.324027429644709e-07,
"loss": 0.5075,
"step": 34300
},
{
"epoch": 0.36024337372108367,
"grad_norm": 2.5901198387145996,
"learning_rate": 7.309152495302631e-07,
"loss": 0.5185,
"step": 34400
},
{
"epoch": 0.361290592830738,
"grad_norm": 2.749903440475464,
"learning_rate": 7.294251537364629e-07,
"loss": 0.4728,
"step": 34500
},
{
"epoch": 0.3623378119403923,
"grad_norm": 2.453977108001709,
"learning_rate": 7.279324723760932e-07,
"loss": 0.5197,
"step": 34600
},
{
"epoch": 0.3633850310500466,
"grad_norm": 3.2406835556030273,
"learning_rate": 7.264372222713157e-07,
"loss": 0.4856,
"step": 34700
},
{
"epoch": 0.36443225015970093,
"grad_norm": 2.1802427768707275,
"learning_rate": 7.249394202732414e-07,
"loss": 0.4996,
"step": 34800
},
{
"epoch": 0.3654794692693552,
"grad_norm": 1.560670256614685,
"learning_rate": 7.234390832617399e-07,
"loss": 0.5032,
"step": 34900
},
{
"epoch": 0.36652668837900954,
"grad_norm": 2.8153815269470215,
"learning_rate": 7.219362281452504e-07,
"loss": 0.4882,
"step": 35000
},
{
"epoch": 0.36757390748866386,
"grad_norm": 3.205367088317871,
"learning_rate": 7.204308718605906e-07,
"loss": 0.5232,
"step": 35100
},
{
"epoch": 0.36862112659831814,
"grad_norm": 1.6098523139953613,
"learning_rate": 7.189230313727651e-07,
"loss": 0.488,
"step": 35200
},
{
"epoch": 0.36966834570797247,
"grad_norm": 2.2674808502197266,
"learning_rate": 7.174127236747756e-07,
"loss": 0.5026,
"step": 35300
},
{
"epoch": 0.3707155648176268,
"grad_norm": 2.0923283100128174,
"learning_rate": 7.158999657874283e-07,
"loss": 0.5292,
"step": 35400
},
{
"epoch": 0.3717627839272811,
"grad_norm": 2.078521251678467,
"learning_rate": 7.143847747591423e-07,
"loss": 0.5002,
"step": 35500
},
{
"epoch": 0.3728100030369354,
"grad_norm": 2.299473285675049,
"learning_rate": 7.128671676657579e-07,
"loss": 0.5132,
"step": 35600
},
{
"epoch": 0.37385722214658973,
"grad_norm": 1.3978760242462158,
"learning_rate": 7.113471616103441e-07,
"loss": 0.5182,
"step": 35700
},
{
"epoch": 0.37490444125624406,
"grad_norm": 2.559293746948242,
"learning_rate": 7.098247737230052e-07,
"loss": 0.5202,
"step": 35800
},
{
"epoch": 0.37595166036589833,
"grad_norm": 2.457498788833618,
"learning_rate": 7.083000211606881e-07,
"loss": 0.4946,
"step": 35900
},
{
"epoch": 0.37699887947555266,
"grad_norm": 1.9849262237548828,
"learning_rate": 7.067729211069892e-07,
"loss": 0.4932,
"step": 36000
},
{
"epoch": 0.378046098585207,
"grad_norm": 2.242328405380249,
"learning_rate": 7.05243490771961e-07,
"loss": 0.4853,
"step": 36100
},
{
"epoch": 0.3790933176948613,
"grad_norm": 4.18756103515625,
"learning_rate": 7.037117473919169e-07,
"loss": 0.5271,
"step": 36200
},
{
"epoch": 0.3801405368045156,
"grad_norm": 2.454249382019043,
"learning_rate": 7.021777082292384e-07,
"loss": 0.5208,
"step": 36300
},
{
"epoch": 0.3811877559141699,
"grad_norm": 1.5989599227905273,
"learning_rate": 7.006413905721796e-07,
"loss": 0.5252,
"step": 36400
},
{
"epoch": 0.38223497502382425,
"grad_norm": 3.1384224891662598,
"learning_rate": 6.991028117346727e-07,
"loss": 0.5231,
"step": 36500
},
{
"epoch": 0.3832821941334785,
"grad_norm": 3.674887180328369,
"learning_rate": 6.975619890561331e-07,
"loss": 0.5338,
"step": 36600
},
{
"epoch": 0.38432941324313286,
"grad_norm": 2.8714184761047363,
"learning_rate": 6.960189399012635e-07,
"loss": 0.4667,
"step": 36700
},
{
"epoch": 0.3853766323527872,
"grad_norm": 2.0271899700164795,
"learning_rate": 6.944736816598585e-07,
"loss": 0.5439,
"step": 36800
},
{
"epoch": 0.38642385146244146,
"grad_norm": 2.3302154541015625,
"learning_rate": 6.929262317466087e-07,
"loss": 0.5085,
"step": 36900
},
{
"epoch": 0.3874710705720958,
"grad_norm": 1.89630126953125,
"learning_rate": 6.913766076009042e-07,
"loss": 0.489,
"step": 37000
},
{
"epoch": 0.3885182896817501,
"grad_norm": 3.864342212677002,
"learning_rate": 6.898248266866383e-07,
"loss": 0.4782,
"step": 37100
},
{
"epoch": 0.38956550879140445,
"grad_norm": 3.6760518550872803,
"learning_rate": 6.882709064920104e-07,
"loss": 0.5387,
"step": 37200
},
{
"epoch": 0.3906127279010587,
"grad_norm": 2.225639581680298,
"learning_rate": 6.867148645293292e-07,
"loss": 0.5417,
"step": 37300
},
{
"epoch": 0.39165994701071305,
"grad_norm": 1.6425765752792358,
"learning_rate": 6.85156718334815e-07,
"loss": 0.501,
"step": 37400
},
{
"epoch": 0.3927071661203674,
"grad_norm": 2.095388650894165,
"learning_rate": 6.835964854684027e-07,
"loss": 0.5244,
"step": 37500
},
{
"epoch": 0.39375438523002165,
"grad_norm": 1.9956177473068237,
"learning_rate": 6.820341835135434e-07,
"loss": 0.4862,
"step": 37600
},
{
"epoch": 0.394801604339676,
"grad_norm": 2.3689606189727783,
"learning_rate": 6.804698300770058e-07,
"loss": 0.5174,
"step": 37700
},
{
"epoch": 0.3958488234493303,
"grad_norm": 2.4154350757598877,
"learning_rate": 6.789034427886788e-07,
"loss": 0.5232,
"step": 37800
},
{
"epoch": 0.39689604255898464,
"grad_norm": 2.841860055923462,
"learning_rate": 6.773350393013725e-07,
"loss": 0.4952,
"step": 37900
},
{
"epoch": 0.3979432616686389,
"grad_norm": 1.6685402393341064,
"learning_rate": 6.757646372906183e-07,
"loss": 0.5136,
"step": 38000
},
{
"epoch": 0.39899048077829324,
"grad_norm": 2.3947384357452393,
"learning_rate": 6.741922544544716e-07,
"loss": 0.4728,
"step": 38100
},
{
"epoch": 0.4000376998879476,
"grad_norm": 1.9924613237380981,
"learning_rate": 6.726179085133102e-07,
"loss": 0.5101,
"step": 38200
},
{
"epoch": 0.40108491899760185,
"grad_norm": 2.3830676078796387,
"learning_rate": 6.710416172096361e-07,
"loss": 0.489,
"step": 38300
},
{
"epoch": 0.4021321381072562,
"grad_norm": 2.6001055240631104,
"learning_rate": 6.69463398307875e-07,
"loss": 0.5337,
"step": 38400
},
{
"epoch": 0.4031793572169105,
"grad_norm": 2.329277753829956,
"learning_rate": 6.678832695941763e-07,
"loss": 0.469,
"step": 38500
},
{
"epoch": 0.40422657632656483,
"grad_norm": 2.2831122875213623,
"learning_rate": 6.663012488762123e-07,
"loss": 0.5279,
"step": 38600
},
{
"epoch": 0.4052737954362191,
"grad_norm": 2.813821315765381,
"learning_rate": 6.647173539829778e-07,
"loss": 0.4873,
"step": 38700
},
{
"epoch": 0.40632101454587344,
"grad_norm": 2.3835694789886475,
"learning_rate": 6.631316027645892e-07,
"loss": 0.4991,
"step": 38800
},
{
"epoch": 0.40736823365552777,
"grad_norm": 2.7960257530212402,
"learning_rate": 6.615440130920833e-07,
"loss": 0.5366,
"step": 38900
},
{
"epoch": 0.40841545276518204,
"grad_norm": 1.9220885038375854,
"learning_rate": 6.599546028572153e-07,
"loss": 0.5111,
"step": 39000
},
{
"epoch": 0.40946267187483637,
"grad_norm": 2.636683464050293,
"learning_rate": 6.583633899722587e-07,
"loss": 0.5058,
"step": 39100
},
{
"epoch": 0.4105098909844907,
"grad_norm": 2.0583505630493164,
"learning_rate": 6.567703923698013e-07,
"loss": 0.4796,
"step": 39200
},
{
"epoch": 0.411557110094145,
"grad_norm": 3.092818021774292,
"learning_rate": 6.551756280025453e-07,
"loss": 0.5181,
"step": 39300
},
{
"epoch": 0.4126043292037993,
"grad_norm": 2.689857006072998,
"learning_rate": 6.535791148431031e-07,
"loss": 0.5424,
"step": 39400
},
{
"epoch": 0.41365154831345363,
"grad_norm": 1.4727122783660889,
"learning_rate": 6.519808708837958e-07,
"loss": 0.5257,
"step": 39500
},
{
"epoch": 0.41469876742310796,
"grad_norm": 2.4704394340515137,
"learning_rate": 6.503809141364506e-07,
"loss": 0.5043,
"step": 39600
},
{
"epoch": 0.41574598653276224,
"grad_norm": 2.2205686569213867,
"learning_rate": 6.487792626321969e-07,
"loss": 0.4732,
"step": 39700
},
{
"epoch": 0.41679320564241656,
"grad_norm": 4.539642333984375,
"learning_rate": 6.471759344212637e-07,
"loss": 0.5028,
"step": 39800
},
{
"epoch": 0.4178404247520709,
"grad_norm": 3.22900652885437,
"learning_rate": 6.455709475727764e-07,
"loss": 0.4802,
"step": 39900
},
{
"epoch": 0.41888764386172517,
"grad_norm": 1.7866666316986084,
"learning_rate": 6.439643201745524e-07,
"loss": 0.4677,
"step": 40000
},
{
"epoch": 0.4199348629713795,
"grad_norm": 1.5298930406570435,
"learning_rate": 6.423560703328981e-07,
"loss": 0.4663,
"step": 40100
},
{
"epoch": 0.4209820820810338,
"grad_norm": 2.7381436824798584,
"learning_rate": 6.407462161724042e-07,
"loss": 0.5032,
"step": 40200
},
{
"epoch": 0.42202930119068816,
"grad_norm": 1.915801763534546,
"learning_rate": 6.391347758357418e-07,
"loss": 0.4876,
"step": 40300
},
{
"epoch": 0.42307652030034243,
"grad_norm": 2.128645658493042,
"learning_rate": 6.375217674834578e-07,
"loss": 0.4947,
"step": 40400
},
{
"epoch": 0.42412373940999676,
"grad_norm": 2.3809661865234375,
"learning_rate": 6.359072092937702e-07,
"loss": 0.5207,
"step": 40500
},
{
"epoch": 0.4251709585196511,
"grad_norm": 2.089869976043701,
"learning_rate": 6.342911194623636e-07,
"loss": 0.5179,
"step": 40600
},
{
"epoch": 0.42621817762930536,
"grad_norm": 2.531280040740967,
"learning_rate": 6.326735162021832e-07,
"loss": 0.5003,
"step": 40700
},
{
"epoch": 0.4272653967389597,
"grad_norm": 1.5095371007919312,
"learning_rate": 6.310544177432308e-07,
"loss": 0.475,
"step": 40800
},
{
"epoch": 0.428312615848614,
"grad_norm": 3.487618923187256,
"learning_rate": 6.294338423323584e-07,
"loss": 0.5382,
"step": 40900
},
{
"epoch": 0.4293598349582683,
"grad_norm": 3.1474342346191406,
"learning_rate": 6.27811808233063e-07,
"loss": 0.5147,
"step": 41000
},
{
"epoch": 0.4304070540679226,
"grad_norm": 3.5564653873443604,
"learning_rate": 6.261883337252808e-07,
"loss": 0.5062,
"step": 41100
},
{
"epoch": 0.43145427317757695,
"grad_norm": 2.47421932220459,
"learning_rate": 6.245634371051808e-07,
"loss": 0.5364,
"step": 41200
},
{
"epoch": 0.4325014922872313,
"grad_norm": 1.5858722925186157,
"learning_rate": 6.22937136684959e-07,
"loss": 0.5319,
"step": 41300
},
{
"epoch": 0.43354871139688556,
"grad_norm": 2.9193403720855713,
"learning_rate": 6.21309450792632e-07,
"loss": 0.486,
"step": 41400
},
{
"epoch": 0.4345959305065399,
"grad_norm": 1.9017012119293213,
"learning_rate": 6.1968039777183e-07,
"loss": 0.5445,
"step": 41500
},
{
"epoch": 0.4356431496161942,
"grad_norm": 2.5207788944244385,
"learning_rate": 6.180499959815908e-07,
"loss": 0.5274,
"step": 41600
},
{
"epoch": 0.4366903687258485,
"grad_norm": 2.239696979522705,
"learning_rate": 6.164182637961521e-07,
"loss": 0.5056,
"step": 41700
},
{
"epoch": 0.4377375878355028,
"grad_norm": 2.565997838973999,
"learning_rate": 6.147852196047455e-07,
"loss": 0.508,
"step": 41800
},
{
"epoch": 0.43878480694515715,
"grad_norm": 1.4207922220230103,
"learning_rate": 6.131508818113878e-07,
"loss": 0.4964,
"step": 41900
},
{
"epoch": 0.4398320260548115,
"grad_norm": 2.6042516231536865,
"learning_rate": 6.11515268834675e-07,
"loss": 0.5008,
"step": 42000
},
{
"epoch": 0.44087924516446575,
"grad_norm": 2.077496290206909,
"learning_rate": 6.098783991075736e-07,
"loss": 0.4964,
"step": 42100
},
{
"epoch": 0.4419264642741201,
"grad_norm": 2.444882392883301,
"learning_rate": 6.082402910772137e-07,
"loss": 0.493,
"step": 42200
},
{
"epoch": 0.4429736833837744,
"grad_norm": 3.973526954650879,
"learning_rate": 6.066009632046809e-07,
"loss": 0.5078,
"step": 42300
},
{
"epoch": 0.4440209024934287,
"grad_norm": 2.283217430114746,
"learning_rate": 6.049604339648078e-07,
"loss": 0.4756,
"step": 42400
},
{
"epoch": 0.445068121603083,
"grad_norm": 1.3749598264694214,
"learning_rate": 6.033187218459665e-07,
"loss": 0.494,
"step": 42500
},
{
"epoch": 0.44611534071273734,
"grad_norm": 3.739201068878174,
"learning_rate": 6.016758453498592e-07,
"loss": 0.4977,
"step": 42600
},
{
"epoch": 0.4471625598223916,
"grad_norm": 2.5676069259643555,
"learning_rate": 6.00031822991311e-07,
"loss": 0.4691,
"step": 42700
},
{
"epoch": 0.44820977893204594,
"grad_norm": 2.269869089126587,
"learning_rate": 5.983866732980607e-07,
"loss": 0.5088,
"step": 42800
},
{
"epoch": 0.4492569980417003,
"grad_norm": 1.8404080867767334,
"learning_rate": 5.96740414810551e-07,
"loss": 0.4666,
"step": 42900
},
{
"epoch": 0.4503042171513546,
"grad_norm": 2.3597822189331055,
"learning_rate": 5.950930660817214e-07,
"loss": 0.4976,
"step": 43000
},
{
"epoch": 0.4513514362610089,
"grad_norm": 1.5849223136901855,
"learning_rate": 5.934446456767977e-07,
"loss": 0.5176,
"step": 43100
},
{
"epoch": 0.4523986553706632,
"grad_norm": 1.3389567136764526,
"learning_rate": 5.917951721730834e-07,
"loss": 0.5244,
"step": 43200
},
{
"epoch": 0.45344587448031753,
"grad_norm": 2.6399717330932617,
"learning_rate": 5.901446641597498e-07,
"loss": 0.5227,
"step": 43300
},
{
"epoch": 0.4544930935899718,
"grad_norm": 2.2782344818115234,
"learning_rate": 5.884931402376274e-07,
"loss": 0.5351,
"step": 43400
},
{
"epoch": 0.45554031269962614,
"grad_norm": 4.411149024963379,
"learning_rate": 5.868406190189955e-07,
"loss": 0.4855,
"step": 43500
},
{
"epoch": 0.45658753180928047,
"grad_norm": 2.243643045425415,
"learning_rate": 5.851871191273726e-07,
"loss": 0.5299,
"step": 43600
},
{
"epoch": 0.4576347509189348,
"grad_norm": 2.678518533706665,
"learning_rate": 5.835326591973068e-07,
"loss": 0.5615,
"step": 43700
},
{
"epoch": 0.45868197002858907,
"grad_norm": 2.2850341796875,
"learning_rate": 5.818772578741654e-07,
"loss": 0.5314,
"step": 43800
},
{
"epoch": 0.4597291891382434,
"grad_norm": 2.199620246887207,
"learning_rate": 5.802209338139253e-07,
"loss": 0.4905,
"step": 43900
},
{
"epoch": 0.46077640824789773,
"grad_norm": 2.532054901123047,
"learning_rate": 5.785637056829619e-07,
"loss": 0.5143,
"step": 44000
},
{
"epoch": 0.461823627357552,
"grad_norm": 1.9873905181884766,
"learning_rate": 5.769055921578399e-07,
"loss": 0.5128,
"step": 44100
},
{
"epoch": 0.46287084646720633,
"grad_norm": 2.033123254776001,
"learning_rate": 5.752466119251018e-07,
"loss": 0.5027,
"step": 44200
},
{
"epoch": 0.46391806557686066,
"grad_norm": 1.890243649482727,
"learning_rate": 5.735867836810575e-07,
"loss": 0.4893,
"step": 44300
},
{
"epoch": 0.46496528468651493,
"grad_norm": 2.7789084911346436,
"learning_rate": 5.719261261315742e-07,
"loss": 0.4804,
"step": 44400
},
{
"epoch": 0.46601250379616926,
"grad_norm": 2.320241928100586,
"learning_rate": 5.702646579918651e-07,
"loss": 0.4727,
"step": 44500
},
{
"epoch": 0.4670597229058236,
"grad_norm": 2.557783603668213,
"learning_rate": 5.686023979862784e-07,
"loss": 0.4802,
"step": 44600
},
{
"epoch": 0.4681069420154779,
"grad_norm": 2.0354034900665283,
"learning_rate": 5.669393648480861e-07,
"loss": 0.4409,
"step": 44700
},
{
"epoch": 0.4691541611251322,
"grad_norm": 2.6490516662597656,
"learning_rate": 5.652755773192742e-07,
"loss": 0.5116,
"step": 44800
},
{
"epoch": 0.4702013802347865,
"grad_norm": 1.9367735385894775,
"learning_rate": 5.636110541503299e-07,
"loss": 0.51,
"step": 44900
},
{
"epoch": 0.47124859934444085,
"grad_norm": 2.3540682792663574,
"learning_rate": 5.619458141000305e-07,
"loss": 0.5053,
"step": 45000
},
{
"epoch": 0.47229581845409513,
"grad_norm": 2.308772325515747,
"learning_rate": 5.602798759352328e-07,
"loss": 0.4857,
"step": 45100
},
{
"epoch": 0.47334303756374946,
"grad_norm": 2.775662899017334,
"learning_rate": 5.586132584306617e-07,
"loss": 0.5039,
"step": 45200
},
{
"epoch": 0.4743902566734038,
"grad_norm": 2.4968132972717285,
"learning_rate": 5.569459803686971e-07,
"loss": 0.5047,
"step": 45300
},
{
"epoch": 0.4754374757830581,
"grad_norm": 2.3723912239074707,
"learning_rate": 5.552780605391637e-07,
"loss": 0.5022,
"step": 45400
},
{
"epoch": 0.4764846948927124,
"grad_norm": 2.080238103866577,
"learning_rate": 5.53609517739119e-07,
"loss": 0.5139,
"step": 45500
},
{
"epoch": 0.4775319140023667,
"grad_norm": 2.763566732406616,
"learning_rate": 5.519403707726409e-07,
"loss": 0.5269,
"step": 45600
},
{
"epoch": 0.47857913311202105,
"grad_norm": 2.2503960132598877,
"learning_rate": 5.502706384506162e-07,
"loss": 0.5049,
"step": 45700
},
{
"epoch": 0.4796263522216753,
"grad_norm": 2.2146077156066895,
"learning_rate": 5.486003395905284e-07,
"loss": 0.5164,
"step": 45800
},
{
"epoch": 0.48067357133132965,
"grad_norm": 2.077916145324707,
"learning_rate": 5.46929493016246e-07,
"loss": 0.5436,
"step": 45900
},
{
"epoch": 0.481720790440984,
"grad_norm": 2.990812301635742,
"learning_rate": 5.452581175578099e-07,
"loss": 0.4996,
"step": 46000
},
{
"epoch": 0.48276800955063826,
"grad_norm": 2.3420207500457764,
"learning_rate": 5.435862320512216e-07,
"loss": 0.4886,
"step": 46100
},
{
"epoch": 0.4838152286602926,
"grad_norm": 2.182870864868164,
"learning_rate": 5.419138553382303e-07,
"loss": 0.5081,
"step": 46200
},
{
"epoch": 0.4848624477699469,
"grad_norm": 2.5916247367858887,
"learning_rate": 5.402410062661217e-07,
"loss": 0.4863,
"step": 46300
},
{
"epoch": 0.48590966687960124,
"grad_norm": 2.3160765171051025,
"learning_rate": 5.38567703687504e-07,
"loss": 0.55,
"step": 46400
},
{
"epoch": 0.4869568859892555,
"grad_norm": 3.3683152198791504,
"learning_rate": 5.368939664600971e-07,
"loss": 0.4838,
"step": 46500
},
{
"epoch": 0.48800410509890985,
"grad_norm": 1.8857132196426392,
"learning_rate": 5.352198134465188e-07,
"loss": 0.5053,
"step": 46600
},
{
"epoch": 0.4890513242085642,
"grad_norm": 2.4393274784088135,
"learning_rate": 5.335452635140728e-07,
"loss": 0.53,
"step": 46700
},
{
"epoch": 0.49009854331821845,
"grad_norm": 2.8095269203186035,
"learning_rate": 5.318703355345361e-07,
"loss": 0.4955,
"step": 46800
},
{
"epoch": 0.4911457624278728,
"grad_norm": 3.766524076461792,
"learning_rate": 5.301950483839461e-07,
"loss": 0.5033,
"step": 46900
},
{
"epoch": 0.4921929815375271,
"grad_norm": 3.614816665649414,
"learning_rate": 5.285194209423881e-07,
"loss": 0.516,
"step": 47000
},
{
"epoch": 0.49324020064718144,
"grad_norm": 2.2229409217834473,
"learning_rate": 5.268434720937823e-07,
"loss": 0.5158,
"step": 47100
},
{
"epoch": 0.4942874197568357,
"grad_norm": 2.4111645221710205,
"learning_rate": 5.251672207256708e-07,
"loss": 0.5265,
"step": 47200
},
{
"epoch": 0.49533463886649004,
"grad_norm": 1.9818792343139648,
"learning_rate": 5.234906857290057e-07,
"loss": 0.5059,
"step": 47300
},
{
"epoch": 0.49638185797614437,
"grad_norm": 1.8921643495559692,
"learning_rate": 5.218138859979349e-07,
"loss": 0.5281,
"step": 47400
},
{
"epoch": 0.49742907708579864,
"grad_norm": 2.3685996532440186,
"learning_rate": 5.201368404295899e-07,
"loss": 0.5257,
"step": 47500
},
{
"epoch": 0.498476296195453,
"grad_norm": 3.2099828720092773,
"learning_rate": 5.184595679238732e-07,
"loss": 0.4806,
"step": 47600
},
{
"epoch": 0.4995235153051073,
"grad_norm": 2.328226089477539,
"learning_rate": 5.167820873832445e-07,
"loss": 0.5496,
"step": 47700
},
{
"epoch": 0.5005707344147616,
"grad_norm": 2.010138988494873,
"learning_rate": 5.151044177125077e-07,
"loss": 0.5025,
"step": 47800
},
{
"epoch": 0.501617953524416,
"grad_norm": 2.0107200145721436,
"learning_rate": 5.134265778185984e-07,
"loss": 0.4695,
"step": 47900
},
{
"epoch": 0.5026651726340702,
"grad_norm": 3.73002552986145,
"learning_rate": 5.117485866103707e-07,
"loss": 0.5489,
"step": 48000
},
{
"epoch": 0.5037123917437245,
"grad_norm": 1.203131914138794,
"learning_rate": 5.100704629983842e-07,
"loss": 0.4918,
"step": 48100
},
{
"epoch": 0.5047596108533788,
"grad_norm": 2.464951276779175,
"learning_rate": 5.083922258946899e-07,
"loss": 0.526,
"step": 48200
},
{
"epoch": 0.5058068299630332,
"grad_norm": 2.5923502445220947,
"learning_rate": 5.067138942126185e-07,
"loss": 0.5094,
"step": 48300
},
{
"epoch": 0.5068540490726875,
"grad_norm": 2.553731918334961,
"learning_rate": 5.050354868665663e-07,
"loss": 0.5116,
"step": 48400
},
{
"epoch": 0.5079012681823418,
"grad_norm": 2.171161413192749,
"learning_rate": 5.033570227717823e-07,
"loss": 0.5021,
"step": 48500
},
{
"epoch": 0.5089484872919962,
"grad_norm": 1.9675207138061523,
"learning_rate": 5.016785208441553e-07,
"loss": 0.4759,
"step": 48600
},
{
"epoch": 0.5099957064016504,
"grad_norm": 2.772975206375122,
"learning_rate": 5e-07,
"loss": 0.504,
"step": 48700
},
{
"epoch": 0.5110429255113047,
"grad_norm": 1.8081309795379639,
"learning_rate": 4.983214791558449e-07,
"loss": 0.4884,
"step": 48800
},
{
"epoch": 0.512090144620959,
"grad_norm": 2.1011574268341064,
"learning_rate": 4.966429772282177e-07,
"loss": 0.5411,
"step": 48900
},
{
"epoch": 0.5131373637306134,
"grad_norm": 1.7532665729522705,
"learning_rate": 4.949645131334338e-07,
"loss": 0.5217,
"step": 49000
},
{
"epoch": 0.5141845828402677,
"grad_norm": 1.9248243570327759,
"learning_rate": 4.932861057873817e-07,
"loss": 0.5161,
"step": 49100
},
{
"epoch": 0.515231801949922,
"grad_norm": 2.180882692337036,
"learning_rate": 4.916077741053101e-07,
"loss": 0.4977,
"step": 49200
},
{
"epoch": 0.5162790210595763,
"grad_norm": 2.663121223449707,
"learning_rate": 4.899295370016159e-07,
"loss": 0.4918,
"step": 49300
},
{
"epoch": 0.5173262401692306,
"grad_norm": 1.928085446357727,
"learning_rate": 4.882514133896293e-07,
"loss": 0.4863,
"step": 49400
},
{
"epoch": 0.5183734592788849,
"grad_norm": 2.9963412284851074,
"learning_rate": 4.865734221814016e-07,
"loss": 0.5015,
"step": 49500
},
{
"epoch": 0.5194206783885392,
"grad_norm": 2.45681095123291,
"learning_rate": 4.848955822874924e-07,
"loss": 0.5285,
"step": 49600
},
{
"epoch": 0.5204678974981936,
"grad_norm": 1.8462231159210205,
"learning_rate": 4.832179126167556e-07,
"loss": 0.467,
"step": 49700
},
{
"epoch": 0.5215151166078479,
"grad_norm": 2.27242374420166,
"learning_rate": 4.815404320761267e-07,
"loss": 0.4681,
"step": 49800
},
{
"epoch": 0.5225623357175022,
"grad_norm": 2.18723201751709,
"learning_rate": 4.7986315957041e-07,
"loss": 0.5005,
"step": 49900
},
{
"epoch": 0.5236095548271564,
"grad_norm": 3.0114426612854004,
"learning_rate": 4.781861140020652e-07,
"loss": 0.4861,
"step": 50000
},
{
"epoch": 0.5246567739368108,
"grad_norm": 2.07069730758667,
"learning_rate": 4.765093142709943e-07,
"loss": 0.4648,
"step": 50100
},
{
"epoch": 0.5257039930464651,
"grad_norm": 2.2993671894073486,
"learning_rate": 4.7483277927432924e-07,
"loss": 0.4835,
"step": 50200
},
{
"epoch": 0.5267512121561194,
"grad_norm": 2.224874258041382,
"learning_rate": 4.731565279062179e-07,
"loss": 0.4642,
"step": 50300
},
{
"epoch": 0.5277984312657737,
"grad_norm": 1.7376128435134888,
"learning_rate": 4.7148057905761187e-07,
"loss": 0.4883,
"step": 50400
},
{
"epoch": 0.5288456503754281,
"grad_norm": 3.3602840900421143,
"learning_rate": 4.698049516160539e-07,
"loss": 0.4762,
"step": 50500
},
{
"epoch": 0.5298928694850824,
"grad_norm": 1.7802869081497192,
"learning_rate": 4.681296644654639e-07,
"loss": 0.5264,
"step": 50600
},
{
"epoch": 0.5309400885947366,
"grad_norm": 1.8603919744491577,
"learning_rate": 4.6645473648592716e-07,
"loss": 0.4902,
"step": 50700
},
{
"epoch": 0.531987307704391,
"grad_norm": 2.204157590866089,
"learning_rate": 4.647801865534813e-07,
"loss": 0.4835,
"step": 50800
},
{
"epoch": 0.5330345268140453,
"grad_norm": 1.2694624662399292,
"learning_rate": 4.63106033539903e-07,
"loss": 0.5238,
"step": 50900
},
{
"epoch": 0.5340817459236996,
"grad_norm": 2.0624773502349854,
"learning_rate": 4.6143229631249596e-07,
"loss": 0.5033,
"step": 51000
},
{
"epoch": 0.5351289650333539,
"grad_norm": 1.9012243747711182,
"learning_rate": 4.597589937338784e-07,
"loss": 0.5076,
"step": 51100
},
{
"epoch": 0.5361761841430083,
"grad_norm": 2.1069536209106445,
"learning_rate": 4.580861446617698e-07,
"loss": 0.5171,
"step": 51200
},
{
"epoch": 0.5372234032526626,
"grad_norm": 1.5368138551712036,
"learning_rate": 4.564137679487785e-07,
"loss": 0.4803,
"step": 51300
},
{
"epoch": 0.5382706223623168,
"grad_norm": 1.5406559705734253,
"learning_rate": 4.5474188244219006e-07,
"loss": 0.4839,
"step": 51400
},
{
"epoch": 0.5393178414719711,
"grad_norm": 1.4071673154830933,
"learning_rate": 4.530705069837542e-07,
"loss": 0.4764,
"step": 51500
},
{
"epoch": 0.5403650605816255,
"grad_norm": 2.699596643447876,
"learning_rate": 4.513996604094716e-07,
"loss": 0.5177,
"step": 51600
},
{
"epoch": 0.5414122796912798,
"grad_norm": 1.542262315750122,
"learning_rate": 4.497293615493838e-07,
"loss": 0.508,
"step": 51700
},
{
"epoch": 0.5424594988009341,
"grad_norm": 3.0482521057128906,
"learning_rate": 4.480596292273592e-07,
"loss": 0.5303,
"step": 51800
},
{
"epoch": 0.5435067179105885,
"grad_norm": 2.214055061340332,
"learning_rate": 4.463904822608809e-07,
"loss": 0.4843,
"step": 51900
},
{
"epoch": 0.5445539370202428,
"grad_norm": 2.4003210067749023,
"learning_rate": 4.4472193946083634e-07,
"loss": 0.5024,
"step": 52000
},
{
"epoch": 0.545601156129897,
"grad_norm": 2.2942888736724854,
"learning_rate": 4.430540196313031e-07,
"loss": 0.5073,
"step": 52100
},
{
"epoch": 0.5466483752395513,
"grad_norm": 2.4813528060913086,
"learning_rate": 4.413867415693383e-07,
"loss": 0.5114,
"step": 52200
},
{
"epoch": 0.5476955943492057,
"grad_norm": 1.8171602487564087,
"learning_rate": 4.3972012406476715e-07,
"loss": 0.4714,
"step": 52300
},
{
"epoch": 0.54874281345886,
"grad_norm": 2.677717924118042,
"learning_rate": 4.3805418589996967e-07,
"loss": 0.5277,
"step": 52400
},
{
"epoch": 0.5497900325685143,
"grad_norm": 2.815244674682617,
"learning_rate": 4.363889458496701e-07,
"loss": 0.4969,
"step": 52500
},
{
"epoch": 0.5508372516781687,
"grad_norm": 2.719905376434326,
"learning_rate": 4.347244226807257e-07,
"loss": 0.494,
"step": 52600
},
{
"epoch": 0.551884470787823,
"grad_norm": 2.277196168899536,
"learning_rate": 4.3306063515191384e-07,
"loss": 0.4989,
"step": 52700
},
{
"epoch": 0.5529316898974772,
"grad_norm": 2.747807741165161,
"learning_rate": 4.3139760201372166e-07,
"loss": 0.475,
"step": 52800
},
{
"epoch": 0.5539789090071315,
"grad_norm": 2.1879899501800537,
"learning_rate": 4.29735342008135e-07,
"loss": 0.4727,
"step": 52900
},
{
"epoch": 0.5550261281167859,
"grad_norm": 1.5891708135604858,
"learning_rate": 4.280738738684259e-07,
"loss": 0.5209,
"step": 53000
},
{
"epoch": 0.5560733472264402,
"grad_norm": 2.6258082389831543,
"learning_rate": 4.2641321631894256e-07,
"loss": 0.5146,
"step": 53100
},
{
"epoch": 0.5571205663360945,
"grad_norm": 2.106497287750244,
"learning_rate": 4.2475338807489825e-07,
"loss": 0.5072,
"step": 53200
},
{
"epoch": 0.5581677854457489,
"grad_norm": 1.3520596027374268,
"learning_rate": 4.2309440784216014e-07,
"loss": 0.5007,
"step": 53300
},
{
"epoch": 0.5592150045554031,
"grad_norm": 2.2585766315460205,
"learning_rate": 4.21436294317038e-07,
"loss": 0.5661,
"step": 53400
},
{
"epoch": 0.5602622236650574,
"grad_norm": 2.4655063152313232,
"learning_rate": 4.1977906618607473e-07,
"loss": 0.5057,
"step": 53500
},
{
"epoch": 0.5613094427747117,
"grad_norm": 1.7120404243469238,
"learning_rate": 4.181227421258344e-07,
"loss": 0.4762,
"step": 53600
},
{
"epoch": 0.5623566618843661,
"grad_norm": 2.365668535232544,
"learning_rate": 4.164673408026932e-07,
"loss": 0.5015,
"step": 53700
},
{
"epoch": 0.5634038809940204,
"grad_norm": 2.5297205448150635,
"learning_rate": 4.148128808726274e-07,
"loss": 0.4789,
"step": 53800
},
{
"epoch": 0.5644511001036747,
"grad_norm": 2.997265577316284,
"learning_rate": 4.131593809810044e-07,
"loss": 0.4841,
"step": 53900
},
{
"epoch": 0.565498319213329,
"grad_norm": 2.2408447265625,
"learning_rate": 4.1150685976237253e-07,
"loss": 0.5194,
"step": 54000
},
{
"epoch": 0.5665455383229833,
"grad_norm": 1.8267594575881958,
"learning_rate": 4.098553358402503e-07,
"loss": 0.4978,
"step": 54100
},
{
"epoch": 0.5675927574326376,
"grad_norm": 3.2854866981506348,
"learning_rate": 4.0820482782691666e-07,
"loss": 0.499,
"step": 54200
},
{
"epoch": 0.5686399765422919,
"grad_norm": 2.401383638381958,
"learning_rate": 4.0655535432320225e-07,
"loss": 0.539,
"step": 54300
},
{
"epoch": 0.5696871956519463,
"grad_norm": 2.3308005332946777,
"learning_rate": 4.0490693391827867e-07,
"loss": 0.527,
"step": 54400
},
{
"epoch": 0.5707344147616006,
"grad_norm": 2.6808366775512695,
"learning_rate": 4.0325958518944893e-07,
"loss": 0.4965,
"step": 54500
},
{
"epoch": 0.5717816338712549,
"grad_norm": 2.82200026512146,
"learning_rate": 4.016133267019394e-07,
"loss": 0.5051,
"step": 54600
},
{
"epoch": 0.5728288529809092,
"grad_norm": 3.023541212081909,
"learning_rate": 3.99968177008689e-07,
"loss": 0.4623,
"step": 54700
},
{
"epoch": 0.5738760720905635,
"grad_norm": 2.405120372772217,
"learning_rate": 3.983241546501408e-07,
"loss": 0.5096,
"step": 54800
},
{
"epoch": 0.5749232912002178,
"grad_norm": 1.9728878736495972,
"learning_rate": 3.9668127815403353e-07,
"loss": 0.5405,
"step": 54900
},
{
"epoch": 0.5759705103098721,
"grad_norm": 3.312455415725708,
"learning_rate": 3.950395660351922e-07,
"loss": 0.5245,
"step": 55000
},
{
"epoch": 0.5770177294195264,
"grad_norm": 1.9875174760818481,
"learning_rate": 3.93399036795319e-07,
"loss": 0.4863,
"step": 55100
},
{
"epoch": 0.5780649485291808,
"grad_norm": 2.295588731765747,
"learning_rate": 3.917597089227863e-07,
"loss": 0.4868,
"step": 55200
},
{
"epoch": 0.5791121676388351,
"grad_norm": 2.505709409713745,
"learning_rate": 3.901216008924265e-07,
"loss": 0.4955,
"step": 55300
},
{
"epoch": 0.5801593867484894,
"grad_norm": 2.177341938018799,
"learning_rate": 3.88484731165325e-07,
"loss": 0.5103,
"step": 55400
},
{
"epoch": 0.5812066058581437,
"grad_norm": 1.426915168762207,
"learning_rate": 3.868491181886122e-07,
"loss": 0.5235,
"step": 55500
},
{
"epoch": 0.582253824967798,
"grad_norm": 2.258373498916626,
"learning_rate": 3.852147803952545e-07,
"loss": 0.4983,
"step": 55600
},
{
"epoch": 0.5833010440774523,
"grad_norm": 2.660693645477295,
"learning_rate": 3.835817362038477e-07,
"loss": 0.5127,
"step": 55700
},
{
"epoch": 0.5843482631871066,
"grad_norm": 2.2097291946411133,
"learning_rate": 3.8195000401840927e-07,
"loss": 0.5034,
"step": 55800
},
{
"epoch": 0.585395482296761,
"grad_norm": 2.2298669815063477,
"learning_rate": 3.803196022281701e-07,
"loss": 0.4971,
"step": 55900
},
{
"epoch": 0.5864427014064153,
"grad_norm": 2.1946804523468018,
"learning_rate": 3.78690549207368e-07,
"loss": 0.4942,
"step": 56000
},
{
"epoch": 0.5874899205160696,
"grad_norm": 3.2329068183898926,
"learning_rate": 3.77062863315041e-07,
"loss": 0.513,
"step": 56100
},
{
"epoch": 0.5885371396257238,
"grad_norm": 1.839722752571106,
"learning_rate": 3.7543656289481927e-07,
"loss": 0.5546,
"step": 56200
},
{
"epoch": 0.5895843587353782,
"grad_norm": 2.5834665298461914,
"learning_rate": 3.7381166627471914e-07,
"loss": 0.4821,
"step": 56300
},
{
"epoch": 0.5906315778450325,
"grad_norm": 2.00166916847229,
"learning_rate": 3.7218819176693693e-07,
"loss": 0.5187,
"step": 56400
},
{
"epoch": 0.5916787969546868,
"grad_norm": 3.0043110847473145,
"learning_rate": 3.7056615766764174e-07,
"loss": 0.5227,
"step": 56500
},
{
"epoch": 0.5927260160643412,
"grad_norm": 1.637872576713562,
"learning_rate": 3.6894558225676924e-07,
"loss": 0.4611,
"step": 56600
},
{
"epoch": 0.5937732351739955,
"grad_norm": 2.64483904838562,
"learning_rate": 3.6732648379781683e-07,
"loss": 0.4792,
"step": 56700
},
{
"epoch": 0.5948204542836497,
"grad_norm": 1.7451013326644897,
"learning_rate": 3.657088805376366e-07,
"loss": 0.5322,
"step": 56800
},
{
"epoch": 0.595867673393304,
"grad_norm": 2.465116500854492,
"learning_rate": 3.640927907062297e-07,
"loss": 0.4657,
"step": 56900
},
{
"epoch": 0.5969148925029584,
"grad_norm": 3.788491725921631,
"learning_rate": 3.624782325165421e-07,
"loss": 0.4855,
"step": 57000
},
{
"epoch": 0.5979621116126127,
"grad_norm": 2.519657850265503,
"learning_rate": 3.6086522416425823e-07,
"loss": 0.5125,
"step": 57100
},
{
"epoch": 0.599009330722267,
"grad_norm": 1.8677030801773071,
"learning_rate": 3.5925378382759577e-07,
"loss": 0.498,
"step": 57200
},
{
"epoch": 0.6000565498319214,
"grad_norm": 1.9577298164367676,
"learning_rate": 3.57643929667102e-07,
"loss": 0.4792,
"step": 57300
},
{
"epoch": 0.6011037689415757,
"grad_norm": 2.364872932434082,
"learning_rate": 3.560356798254477e-07,
"loss": 0.4882,
"step": 57400
},
{
"epoch": 0.6021509880512299,
"grad_norm": 2.4925103187561035,
"learning_rate": 3.5442905242722365e-07,
"loss": 0.4825,
"step": 57500
},
{
"epoch": 0.6031982071608842,
"grad_norm": 2.7740890979766846,
"learning_rate": 3.5282406557873635e-07,
"loss": 0.5345,
"step": 57600
},
{
"epoch": 0.6042454262705386,
"grad_norm": 1.0781739950180054,
"learning_rate": 3.512207373678032e-07,
"loss": 0.4665,
"step": 57700
},
{
"epoch": 0.6052926453801929,
"grad_norm": 2.9016547203063965,
"learning_rate": 3.496190858635494e-07,
"loss": 0.4655,
"step": 57800
},
{
"epoch": 0.6063398644898472,
"grad_norm": 0.917265772819519,
"learning_rate": 3.480191291162041e-07,
"loss": 0.4707,
"step": 57900
},
{
"epoch": 0.6073870835995016,
"grad_norm": 1.5372905731201172,
"learning_rate": 3.4642088515689695e-07,
"loss": 0.4867,
"step": 58000
},
{
"epoch": 0.6084343027091559,
"grad_norm": 1.8536443710327148,
"learning_rate": 3.4482437199745463e-07,
"loss": 0.4746,
"step": 58100
},
{
"epoch": 0.6094815218188101,
"grad_norm": 2.8087878227233887,
"learning_rate": 3.432296076301986e-07,
"loss": 0.5529,
"step": 58200
},
{
"epoch": 0.6105287409284644,
"grad_norm": 1.8362385034561157,
"learning_rate": 3.416366100277414e-07,
"loss": 0.4911,
"step": 58300
},
{
"epoch": 0.6115759600381188,
"grad_norm": 1.9666386842727661,
"learning_rate": 3.4004539714278457e-07,
"loss": 0.4902,
"step": 58400
},
{
"epoch": 0.6126231791477731,
"grad_norm": 1.745953917503357,
"learning_rate": 3.3845598690791675e-07,
"loss": 0.5204,
"step": 58500
},
{
"epoch": 0.6136703982574274,
"grad_norm": 1.9354580640792847,
"learning_rate": 3.368683972354108e-07,
"loss": 0.4763,
"step": 58600
},
{
"epoch": 0.6147176173670817,
"grad_norm": 2.232057809829712,
"learning_rate": 3.3528264601702217e-07,
"loss": 0.5116,
"step": 58700
},
{
"epoch": 0.6157648364767361,
"grad_norm": 2.1513118743896484,
"learning_rate": 3.336987511237877e-07,
"loss": 0.539,
"step": 58800
},
{
"epoch": 0.6168120555863903,
"grad_norm": 1.7164148092269897,
"learning_rate": 3.321167304058238e-07,
"loss": 0.4912,
"step": 58900
},
{
"epoch": 0.6178592746960446,
"grad_norm": 2.390707015991211,
"learning_rate": 3.305366016921249e-07,
"loss": 0.5207,
"step": 59000
},
{
"epoch": 0.618906493805699,
"grad_norm": 1.944360613822937,
"learning_rate": 3.289583827903639e-07,
"loss": 0.4786,
"step": 59100
},
{
"epoch": 0.6199537129153533,
"grad_norm": 3.611234426498413,
"learning_rate": 3.2738209148668996e-07,
"loss": 0.5597,
"step": 59200
},
{
"epoch": 0.6210009320250076,
"grad_norm": 2.125988245010376,
"learning_rate": 3.2580774554552834e-07,
"loss": 0.5064,
"step": 59300
},
{
"epoch": 0.6220481511346619,
"grad_norm": 2.2751822471618652,
"learning_rate": 3.242353627093817e-07,
"loss": 0.4839,
"step": 59400
},
{
"epoch": 0.6230953702443163,
"grad_norm": 2.4632444381713867,
"learning_rate": 3.226649606986277e-07,
"loss": 0.5085,
"step": 59500
},
{
"epoch": 0.6241425893539705,
"grad_norm": 2.596140146255493,
"learning_rate": 3.210965572113211e-07,
"loss": 0.4834,
"step": 59600
},
{
"epoch": 0.6251898084636248,
"grad_norm": 3.1402766704559326,
"learning_rate": 3.195301699229943e-07,
"loss": 0.4894,
"step": 59700
},
{
"epoch": 0.6262370275732791,
"grad_norm": 1.3100465536117554,
"learning_rate": 3.179658164864567e-07,
"loss": 0.5371,
"step": 59800
},
{
"epoch": 0.6272842466829335,
"grad_norm": 2.2746660709381104,
"learning_rate": 3.164035145315971e-07,
"loss": 0.4865,
"step": 59900
},
{
"epoch": 0.6283314657925878,
"grad_norm": 2.2843546867370605,
"learning_rate": 3.14843281665185e-07,
"loss": 0.4958,
"step": 60000
},
{
"epoch": 0.6293786849022421,
"grad_norm": 2.045327663421631,
"learning_rate": 3.132851354706709e-07,
"loss": 0.4747,
"step": 60100
},
{
"epoch": 0.6304259040118964,
"grad_norm": 2.59464430809021,
"learning_rate": 3.117290935079895e-07,
"loss": 0.4927,
"step": 60200
},
{
"epoch": 0.6314731231215507,
"grad_norm": 1.8439029455184937,
"learning_rate": 3.1017517331336175e-07,
"loss": 0.4829,
"step": 60300
},
{
"epoch": 0.632520342231205,
"grad_norm": 2.155336618423462,
"learning_rate": 3.0862339239909587e-07,
"loss": 0.4764,
"step": 60400
},
{
"epoch": 0.6335675613408593,
"grad_norm": 2.2298882007598877,
"learning_rate": 3.070737682533913e-07,
"loss": 0.5267,
"step": 60500
},
{
"epoch": 0.6346147804505137,
"grad_norm": 1.9075183868408203,
"learning_rate": 3.0552631834014153e-07,
"loss": 0.5101,
"step": 60600
},
{
"epoch": 0.635661999560168,
"grad_norm": 2.1493678092956543,
"learning_rate": 3.039810600987367e-07,
"loss": 0.455,
"step": 60700
},
{
"epoch": 0.6367092186698223,
"grad_norm": 1.9552183151245117,
"learning_rate": 3.024380109438669e-07,
"loss": 0.511,
"step": 60800
},
{
"epoch": 0.6377564377794765,
"grad_norm": 2.0828135013580322,
"learning_rate": 3.0089718826532727e-07,
"loss": 0.4816,
"step": 60900
},
{
"epoch": 0.6388036568891309,
"grad_norm": 1.6887547969818115,
"learning_rate": 2.9935860942782055e-07,
"loss": 0.4874,
"step": 61000
},
{
"epoch": 0.6398508759987852,
"grad_norm": 1.987060785293579,
"learning_rate": 2.978222917707616e-07,
"loss": 0.5237,
"step": 61100
},
{
"epoch": 0.6408980951084395,
"grad_norm": 1.8471943140029907,
"learning_rate": 2.9628825260808313e-07,
"loss": 0.4864,
"step": 61200
},
{
"epoch": 0.6419453142180939,
"grad_norm": 2.424875497817993,
"learning_rate": 2.9475650922803907e-07,
"loss": 0.4865,
"step": 61300
},
{
"epoch": 0.6429925333277482,
"grad_norm": 1.9071121215820312,
"learning_rate": 2.9322707889301066e-07,
"loss": 0.5097,
"step": 61400
},
{
"epoch": 0.6440397524374025,
"grad_norm": 1.9200624227523804,
"learning_rate": 2.9169997883931205e-07,
"loss": 0.4865,
"step": 61500
},
{
"epoch": 0.6450869715470567,
"grad_norm": 1.8281010389328003,
"learning_rate": 2.90175226276995e-07,
"loss": 0.4923,
"step": 61600
},
{
"epoch": 0.6461341906567111,
"grad_norm": 2.7019853591918945,
"learning_rate": 2.886528383896559e-07,
"loss": 0.4702,
"step": 61700
},
{
"epoch": 0.6471814097663654,
"grad_norm": 1.542846918106079,
"learning_rate": 2.87132832334242e-07,
"loss": 0.5025,
"step": 61800
},
{
"epoch": 0.6482286288760197,
"grad_norm": 3.2872512340545654,
"learning_rate": 2.856152252408578e-07,
"loss": 0.4896,
"step": 61900
},
{
"epoch": 0.6492758479856741,
"grad_norm": 3.8048501014709473,
"learning_rate": 2.841000342125719e-07,
"loss": 0.4723,
"step": 62000
},
{
"epoch": 0.6503230670953284,
"grad_norm": 2.0907108783721924,
"learning_rate": 2.825872763252245e-07,
"loss": 0.5326,
"step": 62100
},
{
"epoch": 0.6513702862049827,
"grad_norm": 2.4722342491149902,
"learning_rate": 2.81076968627235e-07,
"loss": 0.4774,
"step": 62200
},
{
"epoch": 0.6524175053146369,
"grad_norm": 2.449239492416382,
"learning_rate": 2.7956912813940947e-07,
"loss": 0.47,
"step": 62300
},
{
"epoch": 0.6534647244242913,
"grad_norm": 2.0104002952575684,
"learning_rate": 2.7806377185474953e-07,
"loss": 0.5017,
"step": 62400
},
{
"epoch": 0.6545119435339456,
"grad_norm": 2.3968191146850586,
"learning_rate": 2.765609167382602e-07,
"loss": 0.489,
"step": 62500
},
{
"epoch": 0.6555591626435999,
"grad_norm": 2.0325634479522705,
"learning_rate": 2.750605797267587e-07,
"loss": 0.5153,
"step": 62600
},
{
"epoch": 0.6566063817532543,
"grad_norm": 2.9563980102539062,
"learning_rate": 2.7356277772868427e-07,
"loss": 0.5121,
"step": 62700
},
{
"epoch": 0.6576536008629086,
"grad_norm": 1.5260460376739502,
"learning_rate": 2.7206752762390684e-07,
"loss": 0.5009,
"step": 62800
},
{
"epoch": 0.6587008199725629,
"grad_norm": 2.651346206665039,
"learning_rate": 2.7057484626353717e-07,
"loss": 0.4819,
"step": 62900
},
{
"epoch": 0.6597480390822171,
"grad_norm": 2.392993927001953,
"learning_rate": 2.69084750469737e-07,
"loss": 0.4924,
"step": 63000
},
{
"epoch": 0.6607952581918715,
"grad_norm": 2.065648078918457,
"learning_rate": 2.6759725703552916e-07,
"loss": 0.4576,
"step": 63100
},
{
"epoch": 0.6618424773015258,
"grad_norm": 1.6166179180145264,
"learning_rate": 2.661123827246088e-07,
"loss": 0.5187,
"step": 63200
},
{
"epoch": 0.6628896964111801,
"grad_norm": 2.0667145252227783,
"learning_rate": 2.646301442711538e-07,
"loss": 0.4963,
"step": 63300
},
{
"epoch": 0.6639369155208344,
"grad_norm": 3.5013437271118164,
"learning_rate": 2.6315055837963687e-07,
"loss": 0.5027,
"step": 63400
},
{
"epoch": 0.6649841346304888,
"grad_norm": 0.9413002133369446,
"learning_rate": 2.616736417246368e-07,
"loss": 0.4712,
"step": 63500
},
{
"epoch": 0.666031353740143,
"grad_norm": 1.4072952270507812,
"learning_rate": 2.601994109506508e-07,
"loss": 0.4731,
"step": 63600
},
{
"epoch": 0.6670785728497973,
"grad_norm": 2.4212138652801514,
"learning_rate": 2.587278826719069e-07,
"loss": 0.4828,
"step": 63700
},
{
"epoch": 0.6681257919594517,
"grad_norm": 1.7635606527328491,
"learning_rate": 2.5725907347217655e-07,
"loss": 0.4863,
"step": 63800
},
{
"epoch": 0.669173011069106,
"grad_norm": 2.0671000480651855,
"learning_rate": 2.5579299990458785e-07,
"loss": 0.4636,
"step": 63900
},
{
"epoch": 0.6702202301787603,
"grad_norm": 2.378913402557373,
"learning_rate": 2.5432967849143906e-07,
"loss": 0.4766,
"step": 64000
},
{
"epoch": 0.6712674492884146,
"grad_norm": 3.7450199127197266,
"learning_rate": 2.528691257240122e-07,
"loss": 0.5137,
"step": 64100
},
{
"epoch": 0.672314668398069,
"grad_norm": 2.676037073135376,
"learning_rate": 2.514113580623873e-07,
"loss": 0.4933,
"step": 64200
},
{
"epoch": 0.6733618875077232,
"grad_norm": 1.6275851726531982,
"learning_rate": 2.499563919352572e-07,
"loss": 0.5038,
"step": 64300
},
{
"epoch": 0.6744091066173775,
"grad_norm": 2.475569009780884,
"learning_rate": 2.485042437397418e-07,
"loss": 0.4518,
"step": 64400
},
{
"epoch": 0.6754563257270318,
"grad_norm": 3.2226366996765137,
"learning_rate": 2.470549298412036e-07,
"loss": 0.4634,
"step": 64500
},
{
"epoch": 0.6765035448366862,
"grad_norm": 2.9092655181884766,
"learning_rate": 2.456084665730634e-07,
"loss": 0.4851,
"step": 64600
},
{
"epoch": 0.6775507639463405,
"grad_norm": 1.9740290641784668,
"learning_rate": 2.441648702366161e-07,
"loss": 0.489,
"step": 64700
},
{
"epoch": 0.6785979830559948,
"grad_norm": 2.2705118656158447,
"learning_rate": 2.42724157100847e-07,
"loss": 0.4918,
"step": 64800
},
{
"epoch": 0.6796452021656492,
"grad_norm": 2.0279767513275146,
"learning_rate": 2.4128634340224767e-07,
"loss": 0.5309,
"step": 64900
},
{
"epoch": 0.6806924212753034,
"grad_norm": 2.4952125549316406,
"learning_rate": 2.3985144534463507e-07,
"loss": 0.5253,
"step": 65000
},
{
"epoch": 0.6817396403849577,
"grad_norm": 1.7526471614837646,
"learning_rate": 2.3841947909896675e-07,
"loss": 0.4919,
"step": 65100
},
{
"epoch": 0.682786859494612,
"grad_norm": 2.78068208694458,
"learning_rate": 2.369904608031591e-07,
"loss": 0.4678,
"step": 65200
},
{
"epoch": 0.6838340786042664,
"grad_norm": 1.9609248638153076,
"learning_rate": 2.3556440656190675e-07,
"loss": 0.5004,
"step": 65300
},
{
"epoch": 0.6848812977139207,
"grad_norm": 1.8966784477233887,
"learning_rate": 2.3414133244649965e-07,
"loss": 0.4609,
"step": 65400
},
{
"epoch": 0.685928516823575,
"grad_norm": 1.7883254289627075,
"learning_rate": 2.3272125449464197e-07,
"loss": 0.5053,
"step": 65500
},
{
"epoch": 0.6869757359332294,
"grad_norm": 2.0737862586975098,
"learning_rate": 2.3130418871027285e-07,
"loss": 0.5126,
"step": 65600
},
{
"epoch": 0.6880229550428836,
"grad_norm": 2.2858548164367676,
"learning_rate": 2.2989015106338456e-07,
"loss": 0.4954,
"step": 65700
},
{
"epoch": 0.6890701741525379,
"grad_norm": 2.121546506881714,
"learning_rate": 2.284791574898423e-07,
"loss": 0.5017,
"step": 65800
},
{
"epoch": 0.6901173932621922,
"grad_norm": 1.6191834211349487,
"learning_rate": 2.270712238912067e-07,
"loss": 0.4721,
"step": 65900
},
{
"epoch": 0.6911646123718466,
"grad_norm": 2.482290506362915,
"learning_rate": 2.2566636613455185e-07,
"loss": 0.5003,
"step": 66000
},
{
"epoch": 0.6922118314815009,
"grad_norm": 2.413865089416504,
"learning_rate": 2.242646000522885e-07,
"loss": 0.4864,
"step": 66100
},
{
"epoch": 0.6932590505911552,
"grad_norm": 2.390326738357544,
"learning_rate": 2.228659414419853e-07,
"loss": 0.5155,
"step": 66200
},
{
"epoch": 0.6943062697008096,
"grad_norm": 2.158834457397461,
"learning_rate": 2.2147040606618956e-07,
"loss": 0.4972,
"step": 66300
},
{
"epoch": 0.6953534888104638,
"grad_norm": 2.767620086669922,
"learning_rate": 2.2007800965225087e-07,
"loss": 0.4651,
"step": 66400
},
{
"epoch": 0.6964007079201181,
"grad_norm": 3.050821542739868,
"learning_rate": 2.1868876789214418e-07,
"loss": 0.5146,
"step": 66500
},
{
"epoch": 0.6974479270297724,
"grad_norm": 2.7702839374542236,
"learning_rate": 2.1730269644229104e-07,
"loss": 0.5143,
"step": 66600
},
{
"epoch": 0.6984951461394268,
"grad_norm": 2.543748140335083,
"learning_rate": 2.159198109233849e-07,
"loss": 0.5028,
"step": 66700
},
{
"epoch": 0.6995423652490811,
"grad_norm": 3.739572048187256,
"learning_rate": 2.1454012692021505e-07,
"loss": 0.5471,
"step": 66800
},
{
"epoch": 0.7005895843587354,
"grad_norm": 2.372471809387207,
"learning_rate": 2.131636599814896e-07,
"loss": 0.4978,
"step": 66900
},
{
"epoch": 0.7016368034683896,
"grad_norm": 2.276508092880249,
"learning_rate": 2.1179042561966154e-07,
"loss": 0.5153,
"step": 67000
},
{
"epoch": 0.702684022578044,
"grad_norm": 2.0715689659118652,
"learning_rate": 2.1042043931075342e-07,
"loss": 0.5127,
"step": 67100
},
{
"epoch": 0.7037312416876983,
"grad_norm": 1.9307739734649658,
"learning_rate": 2.0905371649418318e-07,
"loss": 0.4746,
"step": 67200
},
{
"epoch": 0.7047784607973526,
"grad_norm": 2.039501905441284,
"learning_rate": 2.076902725725897e-07,
"loss": 0.4952,
"step": 67300
},
{
"epoch": 0.705825679907007,
"grad_norm": 2.397334575653076,
"learning_rate": 2.063301229116597e-07,
"loss": 0.4728,
"step": 67400
},
{
"epoch": 0.7068728990166613,
"grad_norm": 3.5085904598236084,
"learning_rate": 2.0497328283995425e-07,
"loss": 0.5176,
"step": 67500
},
{
"epoch": 0.7079201181263156,
"grad_norm": 2.772425651550293,
"learning_rate": 2.0361976764873623e-07,
"loss": 0.5159,
"step": 67600
},
{
"epoch": 0.7089673372359698,
"grad_norm": 1.3938500881195068,
"learning_rate": 2.0226959259179794e-07,
"loss": 0.4949,
"step": 67700
},
{
"epoch": 0.7100145563456242,
"grad_norm": 2.1697475910186768,
"learning_rate": 2.0092277288528898e-07,
"loss": 0.466,
"step": 67800
},
{
"epoch": 0.7110617754552785,
"grad_norm": 1.512786865234375,
"learning_rate": 1.995793237075452e-07,
"loss": 0.5185,
"step": 67900
},
{
"epoch": 0.7121089945649328,
"grad_norm": 1.7060164213180542,
"learning_rate": 1.9823926019891724e-07,
"loss": 0.4649,
"step": 68000
},
{
"epoch": 0.7131562136745871,
"grad_norm": 2.2003238201141357,
"learning_rate": 1.9690259746160005e-07,
"loss": 0.4921,
"step": 68100
},
{
"epoch": 0.7142034327842415,
"grad_norm": 2.538870096206665,
"learning_rate": 1.9556935055946277e-07,
"loss": 0.5164,
"step": 68200
},
{
"epoch": 0.7152506518938958,
"grad_norm": 3.6677184104919434,
"learning_rate": 1.9423953451787888e-07,
"loss": 0.5299,
"step": 68300
},
{
"epoch": 0.71629787100355,
"grad_norm": 1.810766339302063,
"learning_rate": 1.929131643235569e-07,
"loss": 0.4917,
"step": 68400
},
{
"epoch": 0.7173450901132044,
"grad_norm": 1.973241925239563,
"learning_rate": 1.9159025492437143e-07,
"loss": 0.4827,
"step": 68500
},
{
"epoch": 0.7183923092228587,
"grad_norm": 2.1515488624572754,
"learning_rate": 1.9027082122919474e-07,
"loss": 0.4748,
"step": 68600
},
{
"epoch": 0.719439528332513,
"grad_norm": 1.521958827972412,
"learning_rate": 1.8895487810772882e-07,
"loss": 0.5087,
"step": 68700
},
{
"epoch": 0.7204867474421673,
"grad_norm": 2.1833043098449707,
"learning_rate": 1.876424403903376e-07,
"loss": 0.4784,
"step": 68800
},
{
"epoch": 0.7215339665518217,
"grad_norm": 2.8621373176574707,
"learning_rate": 1.8633352286788011e-07,
"loss": 0.5077,
"step": 68900
},
{
"epoch": 0.722581185661476,
"grad_norm": 1.9079474210739136,
"learning_rate": 1.8502814029154367e-07,
"loss": 0.5052,
"step": 69000
},
{
"epoch": 0.7236284047711302,
"grad_norm": 2.184054374694824,
"learning_rate": 1.837263073726769e-07,
"loss": 0.5109,
"step": 69100
},
{
"epoch": 0.7246756238807845,
"grad_norm": 2.0883328914642334,
"learning_rate": 1.824280387826258e-07,
"loss": 0.4888,
"step": 69200
},
{
"epoch": 0.7257228429904389,
"grad_norm": 2.368727207183838,
"learning_rate": 1.8113334915256663e-07,
"loss": 0.4963,
"step": 69300
},
{
"epoch": 0.7267700621000932,
"grad_norm": 2.7945289611816406,
"learning_rate": 1.7984225307334106e-07,
"loss": 0.4927,
"step": 69400
},
{
"epoch": 0.7278172812097475,
"grad_norm": 1.937376856803894,
"learning_rate": 1.7855476509529337e-07,
"loss": 0.4741,
"step": 69500
},
{
"epoch": 0.7288645003194019,
"grad_norm": 3.4460761547088623,
"learning_rate": 1.7727089972810505e-07,
"loss": 0.569,
"step": 69600
},
{
"epoch": 0.7299117194290562,
"grad_norm": 3.9340882301330566,
"learning_rate": 1.7599067144063086e-07,
"loss": 0.5028,
"step": 69700
},
{
"epoch": 0.7309589385387104,
"grad_norm": 3.2756307125091553,
"learning_rate": 1.7471409466073772e-07,
"loss": 0.5238,
"step": 69800
},
{
"epoch": 0.7320061576483647,
"grad_norm": 2.0363681316375732,
"learning_rate": 1.7344118377514044e-07,
"loss": 0.5528,
"step": 69900
},
{
"epoch": 0.7330533767580191,
"grad_norm": 2.6508500576019287,
"learning_rate": 1.7217195312923944e-07,
"loss": 0.4733,
"step": 70000
},
{
"epoch": 0.7341005958676734,
"grad_norm": 1.832088828086853,
"learning_rate": 1.7090641702696102e-07,
"loss": 0.4909,
"step": 70100
},
{
"epoch": 0.7351478149773277,
"grad_norm": 2.644780158996582,
"learning_rate": 1.6964458973059358e-07,
"loss": 0.4928,
"step": 70200
},
{
"epoch": 0.7361950340869821,
"grad_norm": 2.407883644104004,
"learning_rate": 1.683864854606289e-07,
"loss": 0.4497,
"step": 70300
},
{
"epoch": 0.7372422531966363,
"grad_norm": 2.3634557723999023,
"learning_rate": 1.6713211839560125e-07,
"loss": 0.4738,
"step": 70400
},
{
"epoch": 0.7382894723062906,
"grad_norm": 2.401092052459717,
"learning_rate": 1.658815026719269e-07,
"loss": 0.5084,
"step": 70500
},
{
"epoch": 0.7393366914159449,
"grad_norm": 2.105447292327881,
"learning_rate": 1.6463465238374568e-07,
"loss": 0.4681,
"step": 70600
},
{
"epoch": 0.7403839105255993,
"grad_norm": 2.5298540592193604,
"learning_rate": 1.633915815827623e-07,
"loss": 0.5149,
"step": 70700
},
{
"epoch": 0.7414311296352536,
"grad_norm": 2.3362057209014893,
"learning_rate": 1.621523042780868e-07,
"loss": 0.5225,
"step": 70800
},
{
"epoch": 0.7424783487449079,
"grad_norm": 3.7627904415130615,
"learning_rate": 1.6091683443607767e-07,
"loss": 0.4967,
"step": 70900
},
{
"epoch": 0.7435255678545623,
"grad_norm": 2.4007790088653564,
"learning_rate": 1.5968518598018483e-07,
"loss": 0.4878,
"step": 71000
},
{
"epoch": 0.7445727869642165,
"grad_norm": 2.1650781631469727,
"learning_rate": 1.5845737279079118e-07,
"loss": 0.502,
"step": 71100
},
{
"epoch": 0.7456200060738708,
"grad_norm": 1.9574668407440186,
"learning_rate": 1.5723340870505753e-07,
"loss": 0.4843,
"step": 71200
},
{
"epoch": 0.7466672251835251,
"grad_norm": 2.2389516830444336,
"learning_rate": 1.5601330751676624e-07,
"loss": 0.519,
"step": 71300
},
{
"epoch": 0.7477144442931795,
"grad_norm": 1.7965580224990845,
"learning_rate": 1.5479708297616567e-07,
"loss": 0.4676,
"step": 71400
},
{
"epoch": 0.7487616634028338,
"grad_norm": 2.057460069656372,
"learning_rate": 1.5358474878981526e-07,
"loss": 0.5106,
"step": 71500
},
{
"epoch": 0.7498088825124881,
"grad_norm": 2.1372034549713135,
"learning_rate": 1.5237631862043115e-07,
"loss": 0.4786,
"step": 71600
},
{
"epoch": 0.7508561016221424,
"grad_norm": 2.0700478553771973,
"learning_rate": 1.5117180608673203e-07,
"loss": 0.4855,
"step": 71700
},
{
"epoch": 0.7519033207317967,
"grad_norm": 1.7832368612289429,
"learning_rate": 1.4997122476328593e-07,
"loss": 0.5188,
"step": 71800
},
{
"epoch": 0.752950539841451,
"grad_norm": 3.6390135288238525,
"learning_rate": 1.4877458818035705e-07,
"loss": 0.5304,
"step": 71900
},
{
"epoch": 0.7539977589511053,
"grad_norm": 3.022871732711792,
"learning_rate": 1.4758190982375295e-07,
"loss": 0.4648,
"step": 72000
},
{
"epoch": 0.7550449780607597,
"grad_norm": 1.6055036783218384,
"learning_rate": 1.463932031346739e-07,
"loss": 0.5118,
"step": 72100
},
{
"epoch": 0.756092197170414,
"grad_norm": 4.166171550750732,
"learning_rate": 1.4520848150955912e-07,
"loss": 0.4986,
"step": 72200
},
{
"epoch": 0.7571394162800683,
"grad_norm": 3.3419265747070312,
"learning_rate": 1.44027758299938e-07,
"loss": 0.5049,
"step": 72300
},
{
"epoch": 0.7581866353897226,
"grad_norm": 3.171034336090088,
"learning_rate": 1.4285104681227854e-07,
"loss": 0.5091,
"step": 72400
},
{
"epoch": 0.7592338544993769,
"grad_norm": 2.6404178142547607,
"learning_rate": 1.4167836030783752e-07,
"loss": 0.5208,
"step": 72500
},
{
"epoch": 0.7602810736090312,
"grad_norm": 2.8442752361297607,
"learning_rate": 1.4050971200251115e-07,
"loss": 0.475,
"step": 72600
},
{
"epoch": 0.7613282927186855,
"grad_norm": 1.9694572687149048,
"learning_rate": 1.3934511506668616e-07,
"loss": 0.4477,
"step": 72700
},
{
"epoch": 0.7623755118283398,
"grad_norm": 3.6044440269470215,
"learning_rate": 1.3818458262509119e-07,
"loss": 0.4972,
"step": 72800
},
{
"epoch": 0.7634227309379942,
"grad_norm": 1.7680317163467407,
"learning_rate": 1.3702812775664917e-07,
"loss": 0.4964,
"step": 72900
},
{
"epoch": 0.7644699500476485,
"grad_norm": 1.948326587677002,
"learning_rate": 1.358757634943296e-07,
"loss": 0.4733,
"step": 73000
},
{
"epoch": 0.7655171691573028,
"grad_norm": 2.4567108154296875,
"learning_rate": 1.3472750282500195e-07,
"loss": 0.5247,
"step": 73100
},
{
"epoch": 0.766564388266957,
"grad_norm": 1.3387149572372437,
"learning_rate": 1.3358335868928906e-07,
"loss": 0.4894,
"step": 73200
},
{
"epoch": 0.7676116073766114,
"grad_norm": 1.793434977531433,
"learning_rate": 1.3244334398142154e-07,
"loss": 0.5103,
"step": 73300
},
{
"epoch": 0.7686588264862657,
"grad_norm": 2.429433822631836,
"learning_rate": 1.3130747154909227e-07,
"loss": 0.5304,
"step": 73400
},
{
"epoch": 0.76970604559592,
"grad_norm": 2.3653488159179688,
"learning_rate": 1.3017575419331173e-07,
"loss": 0.5092,
"step": 73500
},
{
"epoch": 0.7707532647055744,
"grad_norm": 3.5659842491149902,
"learning_rate": 1.2904820466826355e-07,
"loss": 0.4835,
"step": 73600
},
{
"epoch": 0.7718004838152287,
"grad_norm": 2.952862501144409,
"learning_rate": 1.279248356811611e-07,
"loss": 0.5015,
"step": 73700
},
{
"epoch": 0.7728477029248829,
"grad_norm": 2.398303508758545,
"learning_rate": 1.2680565989210385e-07,
"loss": 0.4938,
"step": 73800
},
{
"epoch": 0.7738949220345372,
"grad_norm": 2.317095994949341,
"learning_rate": 1.2569068991393523e-07,
"loss": 0.4617,
"step": 73900
},
{
"epoch": 0.7749421411441916,
"grad_norm": 2.453432559967041,
"learning_rate": 1.2457993831209989e-07,
"loss": 0.5198,
"step": 74000
},
{
"epoch": 0.7759893602538459,
"grad_norm": 1.8672329187393188,
"learning_rate": 1.2347341760450263e-07,
"loss": 0.4742,
"step": 74100
},
{
"epoch": 0.7770365793635002,
"grad_norm": 3.076641798019409,
"learning_rate": 1.223711402613669e-07,
"loss": 0.4928,
"step": 74200
},
{
"epoch": 0.7780837984731546,
"grad_norm": 2.7013864517211914,
"learning_rate": 1.212731187050946e-07,
"loss": 0.4565,
"step": 74300
},
{
"epoch": 0.7791310175828089,
"grad_norm": 3.7489242553710938,
"learning_rate": 1.2017936531012574e-07,
"loss": 0.5017,
"step": 74400
},
{
"epoch": 0.7801782366924631,
"grad_norm": 2.7046327590942383,
"learning_rate": 1.1908989240279938e-07,
"loss": 0.4551,
"step": 74500
},
{
"epoch": 0.7812254558021174,
"grad_norm": 1.9993566274642944,
"learning_rate": 1.1800471226121456e-07,
"loss": 0.4742,
"step": 74600
},
{
"epoch": 0.7822726749117718,
"grad_norm": 2.9598634243011475,
"learning_rate": 1.1692383711509129e-07,
"loss": 0.5121,
"step": 74700
},
{
"epoch": 0.7833198940214261,
"grad_norm": 3.2795605659484863,
"learning_rate": 1.158472791456342e-07,
"loss": 0.5344,
"step": 74800
},
{
"epoch": 0.7843671131310804,
"grad_norm": 1.8576877117156982,
"learning_rate": 1.1477505048539387e-07,
"loss": 0.4924,
"step": 74900
},
{
"epoch": 0.7854143322407348,
"grad_norm": 1.8820946216583252,
"learning_rate": 1.1370716321813029e-07,
"loss": 0.4794,
"step": 75000
},
{
"epoch": 0.7864615513503891,
"grad_norm": 3.3854475021362305,
"learning_rate": 1.1264362937867784e-07,
"loss": 0.4841,
"step": 75100
},
{
"epoch": 0.7875087704600433,
"grad_norm": 3.2768609523773193,
"learning_rate": 1.1158446095280821e-07,
"loss": 0.4802,
"step": 75200
},
{
"epoch": 0.7885559895696976,
"grad_norm": 2.02317476272583,
"learning_rate": 1.1052966987709572e-07,
"loss": 0.4762,
"step": 75300
},
{
"epoch": 0.789603208679352,
"grad_norm": 2.08528208732605,
"learning_rate": 1.0947926803878366e-07,
"loss": 0.5083,
"step": 75400
},
{
"epoch": 0.7906504277890063,
"grad_norm": 2.0258214473724365,
"learning_rate": 1.0843326727564945e-07,
"loss": 0.4927,
"step": 75500
},
{
"epoch": 0.7916976468986606,
"grad_norm": 3.184265375137329,
"learning_rate": 1.0739167937587079e-07,
"loss": 0.5066,
"step": 75600
},
{
"epoch": 0.792744866008315,
"grad_norm": 2.808084011077881,
"learning_rate": 1.0635451607789469e-07,
"loss": 0.5172,
"step": 75700
},
{
"epoch": 0.7937920851179693,
"grad_norm": 2.172506332397461,
"learning_rate": 1.0532178907030275e-07,
"loss": 0.4797,
"step": 75800
},
{
"epoch": 0.7948393042276235,
"grad_norm": 1.9276924133300781,
"learning_rate": 1.0429350999168119e-07,
"loss": 0.5057,
"step": 75900
},
{
"epoch": 0.7958865233372778,
"grad_norm": 2.1610867977142334,
"learning_rate": 1.0326969043048955e-07,
"loss": 0.4964,
"step": 76000
},
{
"epoch": 0.7969337424469322,
"grad_norm": 2.5907599925994873,
"learning_rate": 1.0225034192492876e-07,
"loss": 0.4886,
"step": 76100
},
{
"epoch": 0.7979809615565865,
"grad_norm": 1.8623499870300293,
"learning_rate": 1.0123547596281257e-07,
"loss": 0.5151,
"step": 76200
},
{
"epoch": 0.7990281806662408,
"grad_norm": 1.7319766283035278,
"learning_rate": 1.0022510398143785e-07,
"loss": 0.4983,
"step": 76300
},
{
"epoch": 0.8000753997758951,
"grad_norm": 3.9193685054779053,
"learning_rate": 9.921923736745452e-08,
"loss": 0.5011,
"step": 76400
},
{
"epoch": 0.8011226188855495,
"grad_norm": 1.8976281881332397,
"learning_rate": 9.821788745673864e-08,
"loss": 0.5036,
"step": 76500
},
{
"epoch": 0.8021698379952037,
"grad_norm": 2.426635980606079,
"learning_rate": 9.722106553426446e-08,
"loss": 0.4993,
"step": 76600
},
{
"epoch": 0.803217057104858,
"grad_norm": 1.929158329963684,
"learning_rate": 9.622878283397596e-08,
"loss": 0.515,
"step": 76700
},
{
"epoch": 0.8042642762145124,
"grad_norm": 3.309342622756958,
"learning_rate": 9.524105053866182e-08,
"loss": 0.5395,
"step": 76800
},
{
"epoch": 0.8053114953241667,
"grad_norm": 1.8991940021514893,
"learning_rate": 9.425787977982869e-08,
"loss": 0.5079,
"step": 76900
},
{
"epoch": 0.806358714433821,
"grad_norm": 2.271533250808716,
"learning_rate": 9.32792816375756e-08,
"loss": 0.4579,
"step": 77000
},
{
"epoch": 0.8074059335434753,
"grad_norm": 2.1554083824157715,
"learning_rate": 9.230526714046944e-08,
"loss": 0.4556,
"step": 77100
},
{
"epoch": 0.8084531526531297,
"grad_norm": 1.8269262313842773,
"learning_rate": 9.133584726542037e-08,
"loss": 0.4883,
"step": 77200
},
{
"epoch": 0.8095003717627839,
"grad_norm": 2.5304064750671387,
"learning_rate": 9.037103293755849e-08,
"loss": 0.4977,
"step": 77300
},
{
"epoch": 0.8105475908724382,
"grad_norm": 2.8901185989379883,
"learning_rate": 8.941083503011021e-08,
"loss": 0.5063,
"step": 77400
},
{
"epoch": 0.8115948099820925,
"grad_norm": 2.2524912357330322,
"learning_rate": 8.845526436427625e-08,
"loss": 0.5144,
"step": 77500
},
{
"epoch": 0.8126420290917469,
"grad_norm": 2.046915292739868,
"learning_rate": 8.750433170910915e-08,
"loss": 0.4933,
"step": 77600
},
{
"epoch": 0.8136892482014012,
"grad_norm": 2.644960641860962,
"learning_rate": 8.655804778139247e-08,
"loss": 0.4962,
"step": 77700
},
{
"epoch": 0.8147364673110555,
"grad_norm": 2.299511432647705,
"learning_rate": 8.561642324551954e-08,
"loss": 0.4546,
"step": 77800
},
{
"epoch": 0.8157836864207098,
"grad_norm": 2.5044310092926025,
"learning_rate": 8.467946871337344e-08,
"loss": 0.4768,
"step": 77900
},
{
"epoch": 0.8168309055303641,
"grad_norm": 1.8609235286712646,
"learning_rate": 8.374719474420749e-08,
"loss": 0.4724,
"step": 78000
},
{
"epoch": 0.8178781246400184,
"grad_norm": 1.9416966438293457,
"learning_rate": 8.281961184452629e-08,
"loss": 0.4956,
"step": 78100
},
{
"epoch": 0.8189253437496727,
"grad_norm": 2.851625919342041,
"learning_rate": 8.189673046796702e-08,
"loss": 0.5068,
"step": 78200
},
{
"epoch": 0.8199725628593271,
"grad_norm": 2.262005567550659,
"learning_rate": 8.097856101518186e-08,
"loss": 0.4846,
"step": 78300
},
{
"epoch": 0.8210197819689814,
"grad_norm": 2.1528186798095703,
"learning_rate": 8.00651138337209e-08,
"loss": 0.4776,
"step": 78400
},
{
"epoch": 0.8220670010786357,
"grad_norm": 2.505295991897583,
"learning_rate": 7.915639921791511e-08,
"loss": 0.5012,
"step": 78500
},
{
"epoch": 0.82311422018829,
"grad_norm": 2.5964581966400146,
"learning_rate": 7.825242740876081e-08,
"loss": 0.5111,
"step": 78600
},
{
"epoch": 0.8241614392979443,
"grad_norm": 2.3113765716552734,
"learning_rate": 7.735320859380384e-08,
"loss": 0.5262,
"step": 78700
},
{
"epoch": 0.8252086584075986,
"grad_norm": 1.8016088008880615,
"learning_rate": 7.645875290702519e-08,
"loss": 0.4794,
"step": 78800
},
{
"epoch": 0.8262558775172529,
"grad_norm": 2.7183265686035156,
"learning_rate": 7.556907042872601e-08,
"loss": 0.5013,
"step": 78900
},
{
"epoch": 0.8273030966269073,
"grad_norm": 1.6194109916687012,
"learning_rate": 7.46841711854152e-08,
"loss": 0.4662,
"step": 79000
},
{
"epoch": 0.8283503157365616,
"grad_norm": 1.8583705425262451,
"learning_rate": 7.38040651496955e-08,
"loss": 0.4602,
"step": 79100
},
{
"epoch": 0.8293975348462159,
"grad_norm": 2.0989129543304443,
"learning_rate": 7.292876224015082e-08,
"loss": 0.4922,
"step": 79200
},
{
"epoch": 0.8304447539558701,
"grad_norm": 2.0418784618377686,
"learning_rate": 7.205827232123585e-08,
"loss": 0.5032,
"step": 79300
},
{
"epoch": 0.8314919730655245,
"grad_norm": 2.34555983543396,
"learning_rate": 7.119260520316368e-08,
"loss": 0.4912,
"step": 79400
},
{
"epoch": 0.8325391921751788,
"grad_norm": 2.5016937255859375,
"learning_rate": 7.033177064179507e-08,
"loss": 0.4792,
"step": 79500
},
{
"epoch": 0.8335864112848331,
"grad_norm": 2.4543182849884033,
"learning_rate": 6.947577833852991e-08,
"loss": 0.4713,
"step": 79600
},
{
"epoch": 0.8346336303944875,
"grad_norm": 2.092000961303711,
"learning_rate": 6.862463794019657e-08,
"loss": 0.4607,
"step": 79700
},
{
"epoch": 0.8356808495041418,
"grad_norm": 2.430490255355835,
"learning_rate": 6.777835903894324e-08,
"loss": 0.5018,
"step": 79800
},
{
"epoch": 0.8367280686137961,
"grad_norm": 1.815276026725769,
"learning_rate": 6.69369511721311e-08,
"loss": 0.4967,
"step": 79900
},
{
"epoch": 0.8377752877234503,
"grad_norm": 2.1097006797790527,
"learning_rate": 6.610042382222497e-08,
"loss": 0.4601,
"step": 80000
},
{
"epoch": 0.8388225068331047,
"grad_norm": 3.367506504058838,
"learning_rate": 6.526878641668798e-08,
"loss": 0.4913,
"step": 80100
},
{
"epoch": 0.839869725942759,
"grad_norm": 1.4861557483673096,
"learning_rate": 6.444204832787486e-08,
"loss": 0.485,
"step": 80200
},
{
"epoch": 0.8409169450524133,
"grad_norm": 2.3718228340148926,
"learning_rate": 6.362021887292578e-08,
"loss": 0.4941,
"step": 80300
},
{
"epoch": 0.8419641641620677,
"grad_norm": 2.2200145721435547,
"learning_rate": 6.28033073136619e-08,
"loss": 0.4928,
"step": 80400
},
{
"epoch": 0.843011383271722,
"grad_norm": 2.4420855045318604,
"learning_rate": 6.199132285648129e-08,
"loss": 0.515,
"step": 80500
},
{
"epoch": 0.8440586023813763,
"grad_norm": 2.225245714187622,
"learning_rate": 6.118427465225418e-08,
"loss": 0.5029,
"step": 80600
},
{
"epoch": 0.8451058214910305,
"grad_norm": 2.7253527641296387,
"learning_rate": 6.038217179622057e-08,
"loss": 0.4898,
"step": 80700
},
{
"epoch": 0.8461530406006849,
"grad_norm": 1.8062297105789185,
"learning_rate": 5.958502332788806e-08,
"loss": 0.5089,
"step": 80800
},
{
"epoch": 0.8472002597103392,
"grad_norm": 3.0290756225585938,
"learning_rate": 5.8792838230928734e-08,
"loss": 0.4988,
"step": 80900
},
{
"epoch": 0.8482474788199935,
"grad_norm": 2.042731523513794,
"learning_rate": 5.800562543307913e-08,
"loss": 0.493,
"step": 81000
},
{
"epoch": 0.8492946979296478,
"grad_norm": 2.5578713417053223,
"learning_rate": 5.722339380603908e-08,
"loss": 0.475,
"step": 81100
},
{
"epoch": 0.8503419170393022,
"grad_norm": 3.2866199016571045,
"learning_rate": 5.6446152165371685e-08,
"loss": 0.5102,
"step": 81200
},
{
"epoch": 0.8513891361489564,
"grad_norm": 2.475862979888916,
"learning_rate": 5.5673909270404495e-08,
"loss": 0.4896,
"step": 81300
},
{
"epoch": 0.8524363552586107,
"grad_norm": 4.128602027893066,
"learning_rate": 5.490667382412978e-08,
"loss": 0.4781,
"step": 81400
},
{
"epoch": 0.853483574368265,
"grad_norm": 2.8154897689819336,
"learning_rate": 5.414445447310745e-08,
"loss": 0.5034,
"step": 81500
},
{
"epoch": 0.8545307934779194,
"grad_norm": 2.5624399185180664,
"learning_rate": 5.338725980736736e-08,
"loss": 0.4997,
"step": 81600
},
{
"epoch": 0.8555780125875737,
"grad_norm": 2.6771199703216553,
"learning_rate": 5.263509836031193e-08,
"loss": 0.5214,
"step": 81700
},
{
"epoch": 0.856625231697228,
"grad_norm": 2.225013494491577,
"learning_rate": 5.1887978608620596e-08,
"loss": 0.4838,
"step": 81800
},
{
"epoch": 0.8576724508068824,
"grad_norm": 2.8142294883728027,
"learning_rate": 5.114590897215448e-08,
"loss": 0.5037,
"step": 81900
},
{
"epoch": 0.8587196699165366,
"grad_norm": 2.071779727935791,
"learning_rate": 5.040889781386043e-08,
"loss": 0.4689,
"step": 82000
},
{
"epoch": 0.8597668890261909,
"grad_norm": 2.6963651180267334,
"learning_rate": 4.9676953439677925e-08,
"loss": 0.489,
"step": 82100
},
{
"epoch": 0.8608141081358452,
"grad_norm": 2.4148457050323486,
"learning_rate": 4.895008409844481e-08,
"loss": 0.4816,
"step": 82200
},
{
"epoch": 0.8618613272454996,
"grad_norm": 2.611649513244629,
"learning_rate": 4.822829798180467e-08,
"loss": 0.5531,
"step": 82300
},
{
"epoch": 0.8629085463551539,
"grad_norm": 1.8031556606292725,
"learning_rate": 4.751160322411418e-08,
"loss": 0.454,
"step": 82400
},
{
"epoch": 0.8639557654648082,
"grad_norm": 2.0377116203308105,
"learning_rate": 4.680000790235178e-08,
"loss": 0.5212,
"step": 82500
},
{
"epoch": 0.8650029845744626,
"grad_norm": 1.7090651988983154,
"learning_rate": 4.609352003602646e-08,
"loss": 0.4721,
"step": 82600
},
{
"epoch": 0.8660502036841168,
"grad_norm": 0.9355291724205017,
"learning_rate": 4.5392147587087315e-08,
"loss": 0.4535,
"step": 82700
},
{
"epoch": 0.8670974227937711,
"grad_norm": 2.991403579711914,
"learning_rate": 4.4695898459834016e-08,
"loss": 0.5108,
"step": 82800
},
{
"epoch": 0.8681446419034254,
"grad_norm": 2.0942938327789307,
"learning_rate": 4.400478050082751e-08,
"loss": 0.4919,
"step": 82900
},
{
"epoch": 0.8691918610130798,
"grad_norm": 1.971248745918274,
"learning_rate": 4.331880149880179e-08,
"loss": 0.4981,
"step": 83000
},
{
"epoch": 0.8702390801227341,
"grad_norm": 2.0472984313964844,
"learning_rate": 4.263796918457613e-08,
"loss": 0.4663,
"step": 83100
},
{
"epoch": 0.8712862992323884,
"grad_norm": 2.9207637310028076,
"learning_rate": 4.196229123096762e-08,
"loss": 0.4723,
"step": 83200
},
{
"epoch": 0.8723335183420428,
"grad_norm": 2.6545724868774414,
"learning_rate": 4.129177525270511e-08,
"loss": 0.5042,
"step": 83300
},
{
"epoch": 0.873380737451697,
"grad_norm": 2.008007526397705,
"learning_rate": 4.0626428806343205e-08,
"loss": 0.4904,
"step": 83400
},
{
"epoch": 0.8744279565613513,
"grad_norm": 1.2464555501937866,
"learning_rate": 3.996625939017711e-08,
"loss": 0.5248,
"step": 83500
},
{
"epoch": 0.8754751756710056,
"grad_norm": 3.1436216831207275,
"learning_rate": 3.9311274444158106e-08,
"loss": 0.4924,
"step": 83600
},
{
"epoch": 0.87652239478066,
"grad_norm": 3.0234928131103516,
"learning_rate": 3.8661481349809786e-08,
"loss": 0.493,
"step": 83700
},
{
"epoch": 0.8775696138903143,
"grad_norm": 2.1175239086151123,
"learning_rate": 3.8016887430144754e-08,
"loss": 0.4933,
"step": 83800
},
{
"epoch": 0.8786168329999686,
"grad_norm": 2.497673749923706,
"learning_rate": 3.737749994958228e-08,
"loss": 0.5146,
"step": 83900
},
{
"epoch": 0.879664052109623,
"grad_norm": 1.5378285646438599,
"learning_rate": 3.674332611386616e-08,
"loss": 0.4628,
"step": 84000
},
{
"epoch": 0.8807112712192772,
"grad_norm": 3.481321334838867,
"learning_rate": 3.6114373069983885e-08,
"loss": 0.513,
"step": 84100
},
{
"epoch": 0.8817584903289315,
"grad_norm": 3.8998842239379883,
"learning_rate": 3.549064790608536e-08,
"loss": 0.5157,
"step": 84200
},
{
"epoch": 0.8828057094385858,
"grad_norm": 4.254595756530762,
"learning_rate": 3.487215765140422e-08,
"loss": 0.503,
"step": 84300
},
{
"epoch": 0.8838529285482402,
"grad_norm": 1.633023977279663,
"learning_rate": 3.4258909276177584e-08,
"loss": 0.4763,
"step": 84400
},
{
"epoch": 0.8849001476578945,
"grad_norm": 2.1271402835845947,
"learning_rate": 3.365090969156764e-08,
"loss": 0.514,
"step": 84500
},
{
"epoch": 0.8859473667675488,
"grad_norm": 2.325639009475708,
"learning_rate": 3.304816574958441e-08,
"loss": 0.5295,
"step": 84600
},
{
"epoch": 0.886994585877203,
"grad_norm": 3.336534261703491,
"learning_rate": 3.2450684243007786e-08,
"loss": 0.498,
"step": 84700
},
{
"epoch": 0.8880418049868574,
"grad_norm": 2.818937301635742,
"learning_rate": 3.185847190531121e-08,
"loss": 0.4621,
"step": 84800
},
{
"epoch": 0.8890890240965117,
"grad_norm": 2.3609235286712646,
"learning_rate": 3.1271535410586136e-08,
"loss": 0.4536,
"step": 84900
},
{
"epoch": 0.890136243206166,
"grad_norm": 2.134856939315796,
"learning_rate": 3.06898813734664e-08,
"loss": 0.4955,
"step": 85000
},
{
"epoch": 0.8911834623158204,
"grad_norm": 2.349867105484009,
"learning_rate": 3.011351634905357e-08,
"loss": 0.5,
"step": 85100
},
{
"epoch": 0.8922306814254747,
"grad_norm": 2.3223259449005127,
"learning_rate": 2.9542446832843793e-08,
"loss": 0.5176,
"step": 85200
},
{
"epoch": 0.893277900535129,
"grad_norm": 2.8934836387634277,
"learning_rate": 2.8976679260653613e-08,
"loss": 0.5069,
"step": 85300
},
{
"epoch": 0.8943251196447832,
"grad_norm": 2.5627784729003906,
"learning_rate": 2.8416220008548152e-08,
"loss": 0.5019,
"step": 85400
},
{
"epoch": 0.8953723387544376,
"grad_norm": 4.0183796882629395,
"learning_rate": 2.7861075392769275e-08,
"loss": 0.4907,
"step": 85500
},
{
"epoch": 0.8964195578640919,
"grad_norm": 2.2696878910064697,
"learning_rate": 2.7311251669663692e-08,
"loss": 0.4785,
"step": 85600
},
{
"epoch": 0.8974667769737462,
"grad_norm": 2.5743296146392822,
"learning_rate": 2.6766755035613155e-08,
"loss": 0.4707,
"step": 85700
},
{
"epoch": 0.8985139960834005,
"grad_norm": 2.059088945388794,
"learning_rate": 2.622759162696464e-08,
"loss": 0.5246,
"step": 85800
},
{
"epoch": 0.8995612151930549,
"grad_norm": 1.2305697202682495,
"learning_rate": 2.5693767519960496e-08,
"loss": 0.4841,
"step": 85900
},
{
"epoch": 0.9006084343027092,
"grad_norm": 3.181995153427124,
"learning_rate": 2.5165288730670585e-08,
"loss": 0.4882,
"step": 86000
},
{
"epoch": 0.9016556534123634,
"grad_norm": 2.311540365219116,
"learning_rate": 2.464216121492463e-08,
"loss": 0.4918,
"step": 86100
},
{
"epoch": 0.9027028725220178,
"grad_norm": 1.5216143131256104,
"learning_rate": 2.412439086824436e-08,
"loss": 0.4877,
"step": 86200
},
{
"epoch": 0.9037500916316721,
"grad_norm": 1.816412091255188,
"learning_rate": 2.361198352577759e-08,
"loss": 0.495,
"step": 86300
},
{
"epoch": 0.9047973107413264,
"grad_norm": 1.8467931747436523,
"learning_rate": 2.310494496223253e-08,
"loss": 0.517,
"step": 86400
},
{
"epoch": 0.9058445298509807,
"grad_norm": 1.95524001121521,
"learning_rate": 2.260328089181246e-08,
"loss": 0.4702,
"step": 86500
},
{
"epoch": 0.9068917489606351,
"grad_norm": 2.4727303981781006,
"learning_rate": 2.210699696815127e-08,
"loss": 0.498,
"step": 86600
},
{
"epoch": 0.9079389680702894,
"grad_norm": 3.1941773891448975,
"learning_rate": 2.1616098784250082e-08,
"loss": 0.4655,
"step": 86700
},
{
"epoch": 0.9089861871799436,
"grad_norm": 3.8430733680725098,
"learning_rate": 2.1130591872413837e-08,
"loss": 0.5178,
"step": 86800
},
{
"epoch": 0.910033406289598,
"grad_norm": 1.787541151046753,
"learning_rate": 2.0650481704189315e-08,
"loss": 0.4858,
"step": 86900
},
{
"epoch": 0.9110806253992523,
"grad_norm": 1.8147176504135132,
"learning_rate": 2.017577369030321e-08,
"loss": 0.4997,
"step": 87000
},
{
"epoch": 0.9121278445089066,
"grad_norm": 2.207904100418091,
"learning_rate": 1.9706473180601145e-08,
"loss": 0.4998,
"step": 87100
},
{
"epoch": 0.9131750636185609,
"grad_norm": 2.220478057861328,
"learning_rate": 1.9242585463987548e-08,
"loss": 0.4939,
"step": 87200
},
{
"epoch": 0.9142222827282153,
"grad_norm": 2.459459066390991,
"learning_rate": 1.878411576836597e-08,
"loss": 0.5106,
"step": 87300
},
{
"epoch": 0.9152695018378696,
"grad_norm": 1.8161354064941406,
"learning_rate": 1.8331069260580147e-08,
"loss": 0.4519,
"step": 87400
},
{
"epoch": 0.9163167209475238,
"grad_norm": 2.2104363441467285,
"learning_rate": 1.78834510463558e-08,
"loss": 0.4841,
"step": 87500
},
{
"epoch": 0.9173639400571781,
"grad_norm": 3.3614344596862793,
"learning_rate": 1.744126617024305e-08,
"loss": 0.4699,
"step": 87600
},
{
"epoch": 0.9184111591668325,
"grad_norm": 1.9489402770996094,
"learning_rate": 1.70045196155596e-08,
"loss": 0.4884,
"step": 87700
},
{
"epoch": 0.9194583782764868,
"grad_norm": 2.2660348415374756,
"learning_rate": 1.6573216304334615e-08,
"loss": 0.4971,
"step": 87800
},
{
"epoch": 0.9205055973861411,
"grad_norm": 1.9117883443832397,
"learning_rate": 1.6147361097253122e-08,
"loss": 0.5133,
"step": 87900
},
{
"epoch": 0.9215528164957955,
"grad_norm": 2.3087127208709717,
"learning_rate": 1.5726958793601476e-08,
"loss": 0.481,
"step": 88000
},
{
"epoch": 0.9226000356054497,
"grad_norm": 2.1353018283843994,
"learning_rate": 1.5312014131212914e-08,
"loss": 0.4618,
"step": 88100
},
{
"epoch": 0.923647254715104,
"grad_norm": 2.694920778274536,
"learning_rate": 1.4902531786414542e-08,
"loss": 0.4633,
"step": 88200
},
{
"epoch": 0.9246944738247583,
"grad_norm": 2.070590019226074,
"learning_rate": 1.4498516373974312e-08,
"loss": 0.5069,
"step": 88300
},
{
"epoch": 0.9257416929344127,
"grad_norm": 1.7129287719726562,
"learning_rate": 1.4099972447049246e-08,
"loss": 0.479,
"step": 88400
},
{
"epoch": 0.926788912044067,
"grad_norm": 2.0258448123931885,
"learning_rate": 1.3706904497133964e-08,
"loss": 0.5026,
"step": 88500
},
{
"epoch": 0.9278361311537213,
"grad_norm": 2.2771730422973633,
"learning_rate": 1.331931695401034e-08,
"loss": 0.4739,
"step": 88600
},
{
"epoch": 0.9288833502633757,
"grad_norm": 2.1517481803894043,
"learning_rate": 1.2937214185696988e-08,
"loss": 0.5027,
"step": 88700
},
{
"epoch": 0.9299305693730299,
"grad_norm": 2.0524544715881348,
"learning_rate": 1.2560600498400852e-08,
"loss": 0.459,
"step": 88800
},
{
"epoch": 0.9309777884826842,
"grad_norm": 2.0591094493865967,
"learning_rate": 1.2189480136467978e-08,
"loss": 0.512,
"step": 88900
},
{
"epoch": 0.9320250075923385,
"grad_norm": 1.7868990898132324,
"learning_rate": 1.1823857282335869e-08,
"loss": 0.4755,
"step": 89000
},
{
"epoch": 0.9330722267019929,
"grad_norm": 2.4516055583953857,
"learning_rate": 1.146373605648676e-08,
"loss": 0.5004,
"step": 89100
},
{
"epoch": 0.9341194458116472,
"grad_norm": 2.602165699005127,
"learning_rate": 1.1109120517400704e-08,
"loss": 0.5163,
"step": 89200
},
{
"epoch": 0.9351666649213015,
"grad_norm": 4.763970851898193,
"learning_rate": 1.076001466150972e-08,
"loss": 0.5095,
"step": 89300
},
{
"epoch": 0.9362138840309558,
"grad_norm": 2.463984966278076,
"learning_rate": 1.0416422423153547e-08,
"loss": 0.5034,
"step": 89400
},
{
"epoch": 0.9372611031406101,
"grad_norm": 2.4041192531585693,
"learning_rate": 1.0078347674534194e-08,
"loss": 0.4741,
"step": 89500
},
{
"epoch": 0.9383083222502644,
"grad_norm": 3.2481226921081543,
"learning_rate": 9.745794225673288e-09,
"loss": 0.5558,
"step": 89600
},
{
"epoch": 0.9393555413599187,
"grad_norm": 2.0538644790649414,
"learning_rate": 9.418765824368625e-09,
"loss": 0.5126,
"step": 89700
},
{
"epoch": 0.940402760469573,
"grad_norm": 3.1280417442321777,
"learning_rate": 9.097266156151972e-09,
"loss": 0.4813,
"step": 89800
},
{
"epoch": 0.9414499795792274,
"grad_norm": 2.6181859970092773,
"learning_rate": 8.781298844247608e-09,
"loss": 0.4985,
"step": 89900
},
{
"epoch": 0.9424971986888817,
"grad_norm": 2.8424460887908936,
"learning_rate": 8.470867449531627e-09,
"loss": 0.5032,
"step": 90000
},
{
"epoch": 0.943544417798536,
"grad_norm": 1.8021912574768066,
"learning_rate": 8.165975470491416e-09,
"loss": 0.5082,
"step": 90100
},
{
"epoch": 0.9445916369081903,
"grad_norm": 2.1348044872283936,
"learning_rate": 7.866626343186577e-09,
"loss": 0.4811,
"step": 90200
},
{
"epoch": 0.9456388560178446,
"grad_norm": 1.665382981300354,
"learning_rate": 7.572823441210353e-09,
"loss": 0.5137,
"step": 90300
},
{
"epoch": 0.9466860751274989,
"grad_norm": 1.782528281211853,
"learning_rate": 7.284570075650864e-09,
"loss": 0.4861,
"step": 90400
},
{
"epoch": 0.9477332942371532,
"grad_norm": 2.0802054405212402,
"learning_rate": 7.001869495054713e-09,
"loss": 0.5201,
"step": 90500
},
{
"epoch": 0.9487805133468076,
"grad_norm": 2.515943765640259,
"learning_rate": 6.724724885389721e-09,
"loss": 0.4863,
"step": 90600
},
{
"epoch": 0.9498277324564619,
"grad_norm": 1.7922004461288452,
"learning_rate": 6.4531393700092415e-09,
"loss": 0.4858,
"step": 90700
},
{
"epoch": 0.9508749515661162,
"grad_norm": 1.5402792692184448,
"learning_rate": 6.187116009617188e-09,
"loss": 0.5174,
"step": 90800
},
{
"epoch": 0.9519221706757705,
"grad_norm": 2.370882987976074,
"learning_rate": 5.926657802233004e-09,
"loss": 0.5299,
"step": 90900
},
{
"epoch": 0.9529693897854248,
"grad_norm": 2.1812610626220703,
"learning_rate": 5.671767683158357e-09,
"loss": 0.5078,
"step": 91000
},
{
"epoch": 0.9540166088950791,
"grad_norm": 1.9076416492462158,
"learning_rate": 5.422448524944057e-09,
"loss": 0.4871,
"step": 91100
},
{
"epoch": 0.9550638280047334,
"grad_norm": 2.5718798637390137,
"learning_rate": 5.1787031373571326e-09,
"loss": 0.5,
"step": 91200
},
{
"epoch": 0.9561110471143878,
"grad_norm": 1.7200427055358887,
"learning_rate": 4.940534267349861e-09,
"loss": 0.4824,
"step": 91300
},
{
"epoch": 0.9571582662240421,
"grad_norm": 2.0528995990753174,
"learning_rate": 4.7079445990284015e-09,
"loss": 0.4893,
"step": 91400
},
{
"epoch": 0.9582054853336963,
"grad_norm": 2.170036554336548,
"learning_rate": 4.4809367536226e-09,
"loss": 0.5468,
"step": 91500
},
{
"epoch": 0.9592527044433506,
"grad_norm": 2.4191830158233643,
"learning_rate": 4.2595132894565625e-09,
"loss": 0.496,
"step": 91600
},
{
"epoch": 0.960299923553005,
"grad_norm": 3.8748281002044678,
"learning_rate": 4.043676701919741e-09,
"loss": 0.52,
"step": 91700
},
{
"epoch": 0.9613471426626593,
"grad_norm": 2.9865217208862305,
"learning_rate": 3.833429423438838e-09,
"loss": 0.4729,
"step": 91800
},
{
"epoch": 0.9623943617723136,
"grad_norm": 3.5876505374908447,
"learning_rate": 3.628773823450337e-09,
"loss": 0.4557,
"step": 91900
},
{
"epoch": 0.963441580881968,
"grad_norm": 2.007694959640503,
"learning_rate": 3.429712208373847e-09,
"loss": 0.5197,
"step": 92000
},
{
"epoch": 0.9644887999916223,
"grad_norm": 1.564520239830017,
"learning_rate": 3.2362468215861306e-09,
"loss": 0.4519,
"step": 92100
},
{
"epoch": 0.9655360191012765,
"grad_norm": 2.6633753776550293,
"learning_rate": 3.0483798433957876e-09,
"loss": 0.5247,
"step": 92200
},
{
"epoch": 0.9665832382109308,
"grad_norm": 2.7909083366394043,
"learning_rate": 2.8661133910187206e-09,
"loss": 0.4981,
"step": 92300
},
{
"epoch": 0.9676304573205852,
"grad_norm": 2.7965500354766846,
"learning_rate": 2.68944951855421e-09,
"loss": 0.4982,
"step": 92400
},
{
"epoch": 0.9686776764302395,
"grad_norm": 2.164356231689453,
"learning_rate": 2.5183902169618187e-09,
"loss": 0.4926,
"step": 92500
},
{
"epoch": 0.9697248955398938,
"grad_norm": 2.378080368041992,
"learning_rate": 2.352937414038969e-09,
"loss": 0.4796,
"step": 92600
},
{
"epoch": 0.9707721146495482,
"grad_norm": 2.3100953102111816,
"learning_rate": 2.1930929743990136e-09,
"loss": 0.511,
"step": 92700
},
{
"epoch": 0.9718193337592025,
"grad_norm": 1.154026985168457,
"learning_rate": 2.0388586994506964e-09,
"loss": 0.5297,
"step": 92800
},
{
"epoch": 0.9728665528688567,
"grad_norm": 2.432117462158203,
"learning_rate": 1.8902363273772815e-09,
"loss": 0.4869,
"step": 92900
},
{
"epoch": 0.973913771978511,
"grad_norm": 2.1382997035980225,
"learning_rate": 1.7472275331173459e-09,
"loss": 0.5253,
"step": 93000
},
{
"epoch": 0.9749609910881654,
"grad_norm": 2.517921209335327,
"learning_rate": 1.609833928345794e-09,
"loss": 0.4989,
"step": 93100
},
{
"epoch": 0.9760082101978197,
"grad_norm": 2.1486592292785645,
"learning_rate": 1.4780570614556508e-09,
"loss": 0.5392,
"step": 93200
},
{
"epoch": 0.977055429307474,
"grad_norm": 2.8666563034057617,
"learning_rate": 1.3518984175406312e-09,
"loss": 0.4899,
"step": 93300
},
{
"epoch": 0.9781026484171284,
"grad_norm": 2.0608692169189453,
"learning_rate": 1.231359418378486e-09,
"loss": 0.5013,
"step": 93400
},
{
"epoch": 0.9791498675267827,
"grad_norm": 2.5256223678588867,
"learning_rate": 1.1164414224149598e-09,
"loss": 0.506,
"step": 93500
},
{
"epoch": 0.9801970866364369,
"grad_norm": 1.9714406728744507,
"learning_rate": 1.0071457247482485e-09,
"loss": 0.5306,
"step": 93600
},
{
"epoch": 0.9812443057460912,
"grad_norm": 2.5823991298675537,
"learning_rate": 9.034735571147312e-10,
"loss": 0.4887,
"step": 93700
},
{
"epoch": 0.9822915248557456,
"grad_norm": 2.48111891746521,
"learning_rate": 8.054260878749275e-10,
"loss": 0.5309,
"step": 93800
},
{
"epoch": 0.9833387439653999,
"grad_norm": 3.824676752090454,
"learning_rate": 7.130044220003962e-10,
"loss": 0.4919,
"step": 93900
},
{
"epoch": 0.9843859630750542,
"grad_norm": 2.073537588119507,
"learning_rate": 6.26209601061134e-10,
"loss": 0.4679,
"step": 94000
},
{
"epoch": 0.9854331821847085,
"grad_norm": 2.32852840423584,
"learning_rate": 5.450426032140298e-10,
"loss": 0.4893,
"step": 94100
},
{
"epoch": 0.9864804012943629,
"grad_norm": 3.0331838130950928,
"learning_rate": 4.695043431917068e-10,
"loss": 0.4837,
"step": 94200
},
{
"epoch": 0.9875276204040171,
"grad_norm": 2.3463919162750244,
"learning_rate": 3.995956722922522e-10,
"loss": 0.4748,
"step": 94300
},
{
"epoch": 0.9885748395136714,
"grad_norm": 3.0472140312194824,
"learning_rate": 3.3531737836967054e-10,
"loss": 0.5212,
"step": 94400
},
{
"epoch": 0.9896220586233258,
"grad_norm": 1.4455373287200928,
"learning_rate": 2.766701858250009e-10,
"loss": 0.4858,
"step": 94500
},
{
"epoch": 0.9906692777329801,
"grad_norm": 2.5533838272094727,
"learning_rate": 2.2365475559799064e-10,
"loss": 0.5016,
"step": 94600
},
{
"epoch": 0.9917164968426344,
"grad_norm": 2.4406557083129883,
"learning_rate": 1.762716851599344e-10,
"loss": 0.4551,
"step": 94700
},
{
"epoch": 0.9927637159522887,
"grad_norm": 2.5848546028137207,
"learning_rate": 1.3452150850656872e-10,
"loss": 0.4797,
"step": 94800
},
{
"epoch": 0.993810935061943,
"grad_norm": 2.0372912883758545,
"learning_rate": 9.84046961525209e-11,
"loss": 0.4646,
"step": 94900
},
{
"epoch": 0.9948581541715973,
"grad_norm": 2.8523876667022705,
"learning_rate": 6.792165512553571e-11,
"loss": 0.4876,
"step": 95000
},
{
"epoch": 0.9959053732812516,
"grad_norm": 2.202986001968384,
"learning_rate": 4.3072728962256774e-11,
"loss": 0.5156,
"step": 95100
},
{
"epoch": 0.996952592390906,
"grad_norm": 2.1548354625701904,
"learning_rate": 2.3858197704063055e-11,
"loss": 0.5241,
"step": 95200
},
{
"epoch": 0.9979998115005603,
"grad_norm": 1.8615128993988037,
"learning_rate": 1.0278277894182342e-11,
"loss": 0.4658,
"step": 95300
},
{
"epoch": 0.9990470306102146,
"grad_norm": 2.989764928817749,
"learning_rate": 2.3331225750267137e-12,
"loss": 0.5486,
"step": 95400
}
],
"logging_steps": 100,
"max_steps": 95491,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.299285301826683e+17,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}