deberta-v3-base-zyda-2 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 174828,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028599537831468643,
"grad_norm": 8.294376373291016,
"learning_rate": 4.985700231084266e-05,
"loss": 6.665,
"step": 500
},
{
"epoch": 0.005719907566293729,
"grad_norm": 8.308354377746582,
"learning_rate": 4.971400462168532e-05,
"loss": 5.1044,
"step": 1000
},
{
"epoch": 0.008579861349440594,
"grad_norm": 7.017335891723633,
"learning_rate": 4.9571006932527974e-05,
"loss": 4.6249,
"step": 1500
},
{
"epoch": 0.011439815132587457,
"grad_norm": 7.528384685516357,
"learning_rate": 4.942800924337063e-05,
"loss": 4.3456,
"step": 2000
},
{
"epoch": 0.014299768915734323,
"grad_norm": 7.852795600891113,
"learning_rate": 4.928501155421328e-05,
"loss": 4.0772,
"step": 2500
},
{
"epoch": 0.017159722698881188,
"grad_norm": 7.606760025024414,
"learning_rate": 4.914201386505594e-05,
"loss": 3.974,
"step": 3000
},
{
"epoch": 0.02001967648202805,
"grad_norm": 7.45611572265625,
"learning_rate": 4.8999016175898596e-05,
"loss": 3.8115,
"step": 3500
},
{
"epoch": 0.022879630265174915,
"grad_norm": 7.1126861572265625,
"learning_rate": 4.885601848674126e-05,
"loss": 3.7529,
"step": 4000
},
{
"epoch": 0.02573958404832178,
"grad_norm": 7.052072525024414,
"learning_rate": 4.871302079758392e-05,
"loss": 3.654,
"step": 4500
},
{
"epoch": 0.028599537831468645,
"grad_norm": 7.367290019989014,
"learning_rate": 4.8570023108426574e-05,
"loss": 3.5671,
"step": 5000
},
{
"epoch": 0.03145949161461551,
"grad_norm": 8.534124374389648,
"learning_rate": 4.842702541926923e-05,
"loss": 3.5032,
"step": 5500
},
{
"epoch": 0.034319445397762376,
"grad_norm": 7.498807907104492,
"learning_rate": 4.828402773011189e-05,
"loss": 3.5244,
"step": 6000
},
{
"epoch": 0.037179399180909234,
"grad_norm": 6.6923298835754395,
"learning_rate": 4.814103004095454e-05,
"loss": 3.4814,
"step": 6500
},
{
"epoch": 0.0400393529640561,
"grad_norm": 6.853496551513672,
"learning_rate": 4.7998032351797196e-05,
"loss": 3.3781,
"step": 7000
},
{
"epoch": 0.042899306747202964,
"grad_norm": 7.179901599884033,
"learning_rate": 4.785503466263985e-05,
"loss": 3.3461,
"step": 7500
},
{
"epoch": 0.04575926053034983,
"grad_norm": 7.566349506378174,
"learning_rate": 4.771203697348251e-05,
"loss": 3.3024,
"step": 8000
},
{
"epoch": 0.048619214313496695,
"grad_norm": 7.144839763641357,
"learning_rate": 4.756903928432517e-05,
"loss": 3.2582,
"step": 8500
},
{
"epoch": 0.05147916809664356,
"grad_norm": 6.929514408111572,
"learning_rate": 4.7426041595167824e-05,
"loss": 3.2371,
"step": 9000
},
{
"epoch": 0.054339121879790425,
"grad_norm": 6.699374198913574,
"learning_rate": 4.728304390601048e-05,
"loss": 3.201,
"step": 9500
},
{
"epoch": 0.05719907566293729,
"grad_norm": 5.757383346557617,
"learning_rate": 4.714004621685314e-05,
"loss": 3.1877,
"step": 10000
},
{
"epoch": 0.06005902944608415,
"grad_norm": 7.66983699798584,
"learning_rate": 4.6997048527695796e-05,
"loss": 3.1514,
"step": 10500
},
{
"epoch": 0.06291898322923102,
"grad_norm": 7.166614532470703,
"learning_rate": 4.6854050838538446e-05,
"loss": 3.1203,
"step": 11000
},
{
"epoch": 0.06577893701237789,
"grad_norm": 8.80114459991455,
"learning_rate": 4.671105314938111e-05,
"loss": 3.1312,
"step": 11500
},
{
"epoch": 0.06863889079552475,
"grad_norm": 29.41587257385254,
"learning_rate": 4.656805546022377e-05,
"loss": 3.1019,
"step": 12000
},
{
"epoch": 0.0714988445786716,
"grad_norm": 6.1705145835876465,
"learning_rate": 4.6425057771066424e-05,
"loss": 3.0878,
"step": 12500
},
{
"epoch": 0.07435879836181847,
"grad_norm": 7.218475341796875,
"learning_rate": 4.628206008190908e-05,
"loss": 3.0758,
"step": 13000
},
{
"epoch": 0.07721875214496533,
"grad_norm": 6.435647964477539,
"learning_rate": 4.613906239275174e-05,
"loss": 3.0396,
"step": 13500
},
{
"epoch": 0.0800787059281122,
"grad_norm": 7.471750736236572,
"learning_rate": 4.5996064703594396e-05,
"loss": 3.0454,
"step": 14000
},
{
"epoch": 0.08293865971125906,
"grad_norm": 6.561801910400391,
"learning_rate": 4.585306701443705e-05,
"loss": 3.0095,
"step": 14500
},
{
"epoch": 0.08579861349440593,
"grad_norm": 7.1273369789123535,
"learning_rate": 4.57100693252797e-05,
"loss": 3.0249,
"step": 15000
},
{
"epoch": 0.0886585672775528,
"grad_norm": 6.540430545806885,
"learning_rate": 4.556707163612236e-05,
"loss": 3.0156,
"step": 15500
},
{
"epoch": 0.09151852106069966,
"grad_norm": 6.394286632537842,
"learning_rate": 4.542407394696502e-05,
"loss": 2.9634,
"step": 16000
},
{
"epoch": 0.09437847484384652,
"grad_norm": 7.856606960296631,
"learning_rate": 4.5281076257807674e-05,
"loss": 2.9866,
"step": 16500
},
{
"epoch": 0.09723842862699339,
"grad_norm": 7.8352861404418945,
"learning_rate": 4.513807856865033e-05,
"loss": 2.9864,
"step": 17000
},
{
"epoch": 0.10009838241014025,
"grad_norm": 6.253101348876953,
"learning_rate": 4.499508087949299e-05,
"loss": 2.9592,
"step": 17500
},
{
"epoch": 0.10295833619328712,
"grad_norm": 6.485994815826416,
"learning_rate": 4.4852083190335646e-05,
"loss": 2.9405,
"step": 18000
},
{
"epoch": 0.10581828997643399,
"grad_norm": 6.409724712371826,
"learning_rate": 4.47090855011783e-05,
"loss": 2.9567,
"step": 18500
},
{
"epoch": 0.10867824375958085,
"grad_norm": 7.388598918914795,
"learning_rate": 4.456608781202096e-05,
"loss": 2.9123,
"step": 19000
},
{
"epoch": 0.11153819754272772,
"grad_norm": 7.503371715545654,
"learning_rate": 4.442309012286362e-05,
"loss": 2.922,
"step": 19500
},
{
"epoch": 0.11439815132587458,
"grad_norm": 6.702953338623047,
"learning_rate": 4.4280092433706274e-05,
"loss": 2.9321,
"step": 20000
},
{
"epoch": 0.11725810510902143,
"grad_norm": 7.328106880187988,
"learning_rate": 4.413709474454893e-05,
"loss": 2.8965,
"step": 20500
},
{
"epoch": 0.1201180588921683,
"grad_norm": 6.787193775177002,
"learning_rate": 4.399409705539159e-05,
"loss": 2.9204,
"step": 21000
},
{
"epoch": 0.12297801267531516,
"grad_norm": 5.832542896270752,
"learning_rate": 4.3851099366234246e-05,
"loss": 2.8724,
"step": 21500
},
{
"epoch": 0.12583796645846204,
"grad_norm": 6.784033298492432,
"learning_rate": 4.37081016770769e-05,
"loss": 2.889,
"step": 22000
},
{
"epoch": 0.1286979202416089,
"grad_norm": 7.457705497741699,
"learning_rate": 4.356510398791956e-05,
"loss": 2.8845,
"step": 22500
},
{
"epoch": 0.13155787402475577,
"grad_norm": 7.377457141876221,
"learning_rate": 4.342210629876222e-05,
"loss": 2.876,
"step": 23000
},
{
"epoch": 0.13441782780790262,
"grad_norm": 6.810230731964111,
"learning_rate": 4.327910860960487e-05,
"loss": 2.8881,
"step": 23500
},
{
"epoch": 0.1372777815910495,
"grad_norm": 6.137091636657715,
"learning_rate": 4.3136110920447525e-05,
"loss": 2.8599,
"step": 24000
},
{
"epoch": 0.14013773537419635,
"grad_norm": 27.535808563232422,
"learning_rate": 4.299311323129018e-05,
"loss": 2.8432,
"step": 24500
},
{
"epoch": 0.1429976891573432,
"grad_norm": 6.044827461242676,
"learning_rate": 4.285011554213284e-05,
"loss": 2.8644,
"step": 25000
},
{
"epoch": 0.14585764294049008,
"grad_norm": 6.300295829772949,
"learning_rate": 4.2707117852975496e-05,
"loss": 2.8269,
"step": 25500
},
{
"epoch": 0.14871759672363694,
"grad_norm": 5.811293125152588,
"learning_rate": 4.256412016381815e-05,
"loss": 2.8308,
"step": 26000
},
{
"epoch": 0.15157755050678381,
"grad_norm": 6.52765417098999,
"learning_rate": 4.242112247466081e-05,
"loss": 2.8309,
"step": 26500
},
{
"epoch": 0.15443750428993067,
"grad_norm": 6.731512546539307,
"learning_rate": 4.227812478550347e-05,
"loss": 2.8066,
"step": 27000
},
{
"epoch": 0.15729745807307755,
"grad_norm": 6.837157249450684,
"learning_rate": 4.2135127096346125e-05,
"loss": 2.8282,
"step": 27500
},
{
"epoch": 0.1601574118562244,
"grad_norm": 5.657121181488037,
"learning_rate": 4.199212940718878e-05,
"loss": 2.7904,
"step": 28000
},
{
"epoch": 0.16301736563937128,
"grad_norm": 8.501928329467773,
"learning_rate": 4.184913171803144e-05,
"loss": 2.8224,
"step": 28500
},
{
"epoch": 0.16587731942251813,
"grad_norm": 6.447242736816406,
"learning_rate": 4.1706134028874096e-05,
"loss": 2.7731,
"step": 29000
},
{
"epoch": 0.168737273205665,
"grad_norm": 6.049993991851807,
"learning_rate": 4.156313633971675e-05,
"loss": 2.7726,
"step": 29500
},
{
"epoch": 0.17159722698881186,
"grad_norm": 5.747082710266113,
"learning_rate": 4.142013865055941e-05,
"loss": 2.799,
"step": 30000
},
{
"epoch": 0.17445718077195874,
"grad_norm": 6.7925615310668945,
"learning_rate": 4.127714096140207e-05,
"loss": 2.7704,
"step": 30500
},
{
"epoch": 0.1773171345551056,
"grad_norm": 7.164943218231201,
"learning_rate": 4.1134143272244725e-05,
"loss": 2.7611,
"step": 31000
},
{
"epoch": 0.18017708833825247,
"grad_norm": 6.813632011413574,
"learning_rate": 4.099114558308738e-05,
"loss": 2.7637,
"step": 31500
},
{
"epoch": 0.18303704212139932,
"grad_norm": 5.981168270111084,
"learning_rate": 4.084814789393003e-05,
"loss": 2.7495,
"step": 32000
},
{
"epoch": 0.18589699590454617,
"grad_norm": 6.125492095947266,
"learning_rate": 4.070515020477269e-05,
"loss": 2.7592,
"step": 32500
},
{
"epoch": 0.18875694968769305,
"grad_norm": 44.21103286743164,
"learning_rate": 4.0562152515615347e-05,
"loss": 2.7711,
"step": 33000
},
{
"epoch": 0.1916169034708399,
"grad_norm": 5.714451789855957,
"learning_rate": 4.0419154826458004e-05,
"loss": 2.7456,
"step": 33500
},
{
"epoch": 0.19447685725398678,
"grad_norm": 5.732424736022949,
"learning_rate": 4.027615713730066e-05,
"loss": 2.7412,
"step": 34000
},
{
"epoch": 0.19733681103713363,
"grad_norm": 7.989277362823486,
"learning_rate": 4.013315944814332e-05,
"loss": 2.7429,
"step": 34500
},
{
"epoch": 0.2001967648202805,
"grad_norm": 6.200708389282227,
"learning_rate": 3.9990161758985975e-05,
"loss": 2.7497,
"step": 35000
},
{
"epoch": 0.20305671860342736,
"grad_norm": 6.867748260498047,
"learning_rate": 3.984716406982863e-05,
"loss": 2.7387,
"step": 35500
},
{
"epoch": 0.20591667238657424,
"grad_norm": 5.795921325683594,
"learning_rate": 3.970416638067129e-05,
"loss": 2.7262,
"step": 36000
},
{
"epoch": 0.2087766261697211,
"grad_norm": 6.110116958618164,
"learning_rate": 3.9561168691513947e-05,
"loss": 2.7416,
"step": 36500
},
{
"epoch": 0.21163657995286797,
"grad_norm": 6.253924369812012,
"learning_rate": 3.9418171002356604e-05,
"loss": 2.7215,
"step": 37000
},
{
"epoch": 0.21449653373601482,
"grad_norm": 6.117007732391357,
"learning_rate": 3.927517331319926e-05,
"loss": 2.7231,
"step": 37500
},
{
"epoch": 0.2173564875191617,
"grad_norm": 8.227131843566895,
"learning_rate": 3.913217562404192e-05,
"loss": 2.71,
"step": 38000
},
{
"epoch": 0.22021644130230855,
"grad_norm": 6.146326541900635,
"learning_rate": 3.8989177934884575e-05,
"loss": 2.7076,
"step": 38500
},
{
"epoch": 0.22307639508545543,
"grad_norm": 6.7277398109436035,
"learning_rate": 3.884618024572723e-05,
"loss": 2.7226,
"step": 39000
},
{
"epoch": 0.22593634886860228,
"grad_norm": 6.300662994384766,
"learning_rate": 3.870318255656989e-05,
"loss": 2.7025,
"step": 39500
},
{
"epoch": 0.22879630265174916,
"grad_norm": 5.755123138427734,
"learning_rate": 3.8560184867412547e-05,
"loss": 2.7025,
"step": 40000
},
{
"epoch": 0.231656256434896,
"grad_norm": 6.393768310546875,
"learning_rate": 3.8417187178255204e-05,
"loss": 2.7113,
"step": 40500
},
{
"epoch": 0.23451621021804286,
"grad_norm": 5.855433464050293,
"learning_rate": 3.8274189489097854e-05,
"loss": 2.721,
"step": 41000
},
{
"epoch": 0.23737616400118974,
"grad_norm": 4.719547271728516,
"learning_rate": 3.813119179994051e-05,
"loss": 2.6774,
"step": 41500
},
{
"epoch": 0.2402361177843366,
"grad_norm": 5.75437068939209,
"learning_rate": 3.798819411078317e-05,
"loss": 2.6922,
"step": 42000
},
{
"epoch": 0.24309607156748347,
"grad_norm": 6.258277416229248,
"learning_rate": 3.7845196421625825e-05,
"loss": 2.701,
"step": 42500
},
{
"epoch": 0.24595602535063033,
"grad_norm": 5.8440165519714355,
"learning_rate": 3.770219873246848e-05,
"loss": 2.697,
"step": 43000
},
{
"epoch": 0.2488159791337772,
"grad_norm": 5.4940009117126465,
"learning_rate": 3.755920104331114e-05,
"loss": 2.6826,
"step": 43500
},
{
"epoch": 0.2516759329169241,
"grad_norm": 8.00302791595459,
"learning_rate": 3.7416203354153804e-05,
"loss": 2.6782,
"step": 44000
},
{
"epoch": 0.25453588670007093,
"grad_norm": 6.31597375869751,
"learning_rate": 3.727320566499646e-05,
"loss": 2.7113,
"step": 44500
},
{
"epoch": 0.2573958404832178,
"grad_norm": 6.734432697296143,
"learning_rate": 3.713020797583911e-05,
"loss": 2.6883,
"step": 45000
},
{
"epoch": 0.26025579426636464,
"grad_norm": 8.607872009277344,
"learning_rate": 3.698721028668177e-05,
"loss": 2.675,
"step": 45500
},
{
"epoch": 0.26311574804951154,
"grad_norm": 6.785426139831543,
"learning_rate": 3.6844212597524425e-05,
"loss": 2.6662,
"step": 46000
},
{
"epoch": 0.2659757018326584,
"grad_norm": 5.7255072593688965,
"learning_rate": 3.670121490836708e-05,
"loss": 2.6566,
"step": 46500
},
{
"epoch": 0.26883565561580525,
"grad_norm": 5.778408527374268,
"learning_rate": 3.655821721920974e-05,
"loss": 2.6869,
"step": 47000
},
{
"epoch": 0.2716956093989521,
"grad_norm": 7.3644490242004395,
"learning_rate": 3.64152195300524e-05,
"loss": 2.6548,
"step": 47500
},
{
"epoch": 0.274555563182099,
"grad_norm": 9.922218322753906,
"learning_rate": 3.6272221840895054e-05,
"loss": 2.6548,
"step": 48000
},
{
"epoch": 0.27741551696524586,
"grad_norm": 6.6563944816589355,
"learning_rate": 3.612922415173771e-05,
"loss": 2.6466,
"step": 48500
},
{
"epoch": 0.2802754707483927,
"grad_norm": 5.308610439300537,
"learning_rate": 3.598622646258037e-05,
"loss": 2.6744,
"step": 49000
},
{
"epoch": 0.28313542453153956,
"grad_norm": 6.213603973388672,
"learning_rate": 3.584322877342302e-05,
"loss": 2.6484,
"step": 49500
},
{
"epoch": 0.2859953783146864,
"grad_norm": 5.715392589569092,
"learning_rate": 3.5700231084265676e-05,
"loss": 2.6573,
"step": 50000
},
{
"epoch": 0.2888553320978333,
"grad_norm": 6.067576885223389,
"learning_rate": 3.555723339510833e-05,
"loss": 2.6487,
"step": 50500
},
{
"epoch": 0.29171528588098017,
"grad_norm": 6.300750255584717,
"learning_rate": 3.541423570595099e-05,
"loss": 2.6445,
"step": 51000
},
{
"epoch": 0.294575239664127,
"grad_norm": 6.036895275115967,
"learning_rate": 3.5271238016793654e-05,
"loss": 2.6756,
"step": 51500
},
{
"epoch": 0.29743519344727387,
"grad_norm": 5.856159687042236,
"learning_rate": 3.512824032763631e-05,
"loss": 2.6415,
"step": 52000
},
{
"epoch": 0.3002951472304208,
"grad_norm": 12.173583984375,
"learning_rate": 3.498524263847897e-05,
"loss": 2.6386,
"step": 52500
},
{
"epoch": 0.30315510101356763,
"grad_norm": 5.493927478790283,
"learning_rate": 3.4842244949321625e-05,
"loss": 2.6515,
"step": 53000
},
{
"epoch": 0.3060150547967145,
"grad_norm": 5.786694526672363,
"learning_rate": 3.4699247260164276e-05,
"loss": 2.6449,
"step": 53500
},
{
"epoch": 0.30887500857986133,
"grad_norm": 5.755667686462402,
"learning_rate": 3.455624957100693e-05,
"loss": 2.6357,
"step": 54000
},
{
"epoch": 0.31173496236300824,
"grad_norm": 5.9297027587890625,
"learning_rate": 3.441325188184959e-05,
"loss": 2.6493,
"step": 54500
},
{
"epoch": 0.3145949161461551,
"grad_norm": 6.182466983795166,
"learning_rate": 3.427025419269225e-05,
"loss": 2.6298,
"step": 55000
},
{
"epoch": 0.31745486992930194,
"grad_norm": 6.565801620483398,
"learning_rate": 3.4127256503534904e-05,
"loss": 2.6611,
"step": 55500
},
{
"epoch": 0.3203148237124488,
"grad_norm": 5.94129753112793,
"learning_rate": 3.398425881437756e-05,
"loss": 2.6201,
"step": 56000
},
{
"epoch": 0.32317477749559564,
"grad_norm": 6.72519063949585,
"learning_rate": 3.384126112522022e-05,
"loss": 2.5961,
"step": 56500
},
{
"epoch": 0.32603473127874255,
"grad_norm": 6.440931797027588,
"learning_rate": 3.3698263436062876e-05,
"loss": 2.6322,
"step": 57000
},
{
"epoch": 0.3288946850618894,
"grad_norm": 6.059328079223633,
"learning_rate": 3.355526574690553e-05,
"loss": 2.615,
"step": 57500
},
{
"epoch": 0.33175463884503625,
"grad_norm": 6.007944107055664,
"learning_rate": 3.341226805774818e-05,
"loss": 2.5992,
"step": 58000
},
{
"epoch": 0.3346145926281831,
"grad_norm": 6.9386982917785645,
"learning_rate": 3.326927036859084e-05,
"loss": 2.6317,
"step": 58500
},
{
"epoch": 0.33747454641133,
"grad_norm": 5.493308067321777,
"learning_rate": 3.3126272679433504e-05,
"loss": 2.5975,
"step": 59000
},
{
"epoch": 0.34033450019447686,
"grad_norm": 7.026157855987549,
"learning_rate": 3.298327499027616e-05,
"loss": 2.6139,
"step": 59500
},
{
"epoch": 0.3431944539776237,
"grad_norm": 5.790646553039551,
"learning_rate": 3.284027730111882e-05,
"loss": 2.5916,
"step": 60000
},
{
"epoch": 0.34605440776077057,
"grad_norm": 5.980741024017334,
"learning_rate": 3.2697279611961476e-05,
"loss": 2.5811,
"step": 60500
},
{
"epoch": 0.3489143615439175,
"grad_norm": 6.555883407592773,
"learning_rate": 3.255428192280413e-05,
"loss": 2.5909,
"step": 61000
},
{
"epoch": 0.3517743153270643,
"grad_norm": 5.8480706214904785,
"learning_rate": 3.241128423364679e-05,
"loss": 2.6127,
"step": 61500
},
{
"epoch": 0.3546342691102112,
"grad_norm": 6.341095924377441,
"learning_rate": 3.226828654448944e-05,
"loss": 2.5959,
"step": 62000
},
{
"epoch": 0.357494222893358,
"grad_norm": 5.832342147827148,
"learning_rate": 3.21252888553321e-05,
"loss": 2.5946,
"step": 62500
},
{
"epoch": 0.36035417667650493,
"grad_norm": 6.495291709899902,
"learning_rate": 3.1982291166174755e-05,
"loss": 2.6122,
"step": 63000
},
{
"epoch": 0.3632141304596518,
"grad_norm": 6.527446746826172,
"learning_rate": 3.183929347701741e-05,
"loss": 2.573,
"step": 63500
},
{
"epoch": 0.36607408424279864,
"grad_norm": 6.4324951171875,
"learning_rate": 3.169629578786007e-05,
"loss": 2.6119,
"step": 64000
},
{
"epoch": 0.3689340380259455,
"grad_norm": 7.166018009185791,
"learning_rate": 3.1553298098702726e-05,
"loss": 2.6124,
"step": 64500
},
{
"epoch": 0.37179399180909234,
"grad_norm": 6.462119102478027,
"learning_rate": 3.141030040954538e-05,
"loss": 2.5552,
"step": 65000
},
{
"epoch": 0.37465394559223925,
"grad_norm": 6.0564703941345215,
"learning_rate": 3.126730272038804e-05,
"loss": 2.5672,
"step": 65500
},
{
"epoch": 0.3775138993753861,
"grad_norm": 5.307662487030029,
"learning_rate": 3.11243050312307e-05,
"loss": 2.5611,
"step": 66000
},
{
"epoch": 0.38037385315853295,
"grad_norm": 5.18694543838501,
"learning_rate": 3.0981307342073355e-05,
"loss": 2.5691,
"step": 66500
},
{
"epoch": 0.3832338069416798,
"grad_norm": 5.568657398223877,
"learning_rate": 3.083830965291601e-05,
"loss": 2.575,
"step": 67000
},
{
"epoch": 0.3860937607248267,
"grad_norm": 10.616528511047363,
"learning_rate": 3.069531196375867e-05,
"loss": 2.5886,
"step": 67500
},
{
"epoch": 0.38895371450797356,
"grad_norm": 6.7568206787109375,
"learning_rate": 3.0552314274601326e-05,
"loss": 2.5822,
"step": 68000
},
{
"epoch": 0.3918136682911204,
"grad_norm": 6.087740421295166,
"learning_rate": 3.040931658544398e-05,
"loss": 2.5472,
"step": 68500
},
{
"epoch": 0.39467362207426726,
"grad_norm": 6.702504634857178,
"learning_rate": 3.0266318896286637e-05,
"loss": 2.5897,
"step": 69000
},
{
"epoch": 0.39753357585741417,
"grad_norm": 6.2178053855896,
"learning_rate": 3.0123321207129297e-05,
"loss": 2.5698,
"step": 69500
},
{
"epoch": 0.400393529640561,
"grad_norm": 6.559543609619141,
"learning_rate": 2.9980323517971955e-05,
"loss": 2.5725,
"step": 70000
},
{
"epoch": 0.40325348342370787,
"grad_norm": 5.918066501617432,
"learning_rate": 2.9837325828814605e-05,
"loss": 2.5847,
"step": 70500
},
{
"epoch": 0.4061134372068547,
"grad_norm": 5.602575778961182,
"learning_rate": 2.9694328139657262e-05,
"loss": 2.583,
"step": 71000
},
{
"epoch": 0.40897339099000163,
"grad_norm": 5.304308891296387,
"learning_rate": 2.955133045049992e-05,
"loss": 2.5632,
"step": 71500
},
{
"epoch": 0.4118333447731485,
"grad_norm": 5.540666103363037,
"learning_rate": 2.9408332761342576e-05,
"loss": 2.5756,
"step": 72000
},
{
"epoch": 0.41469329855629533,
"grad_norm": 6.2000861167907715,
"learning_rate": 2.9265335072185234e-05,
"loss": 2.5357,
"step": 72500
},
{
"epoch": 0.4175532523394422,
"grad_norm": 5.1564459800720215,
"learning_rate": 2.912233738302789e-05,
"loss": 2.5516,
"step": 73000
},
{
"epoch": 0.42041320612258903,
"grad_norm": 6.008329391479492,
"learning_rate": 2.897933969387055e-05,
"loss": 2.5738,
"step": 73500
},
{
"epoch": 0.42327315990573594,
"grad_norm": 6.52450704574585,
"learning_rate": 2.883634200471321e-05,
"loss": 2.578,
"step": 74000
},
{
"epoch": 0.4261331136888828,
"grad_norm": 5.788220405578613,
"learning_rate": 2.8693344315555866e-05,
"loss": 2.5578,
"step": 74500
},
{
"epoch": 0.42899306747202964,
"grad_norm": 5.5810112953186035,
"learning_rate": 2.8550346626398516e-05,
"loss": 2.5643,
"step": 75000
},
{
"epoch": 0.4318530212551765,
"grad_norm": 5.334226608276367,
"learning_rate": 2.8407348937241173e-05,
"loss": 2.5376,
"step": 75500
},
{
"epoch": 0.4347129750383234,
"grad_norm": 5.804100513458252,
"learning_rate": 2.826435124808383e-05,
"loss": 2.541,
"step": 76000
},
{
"epoch": 0.43757292882147025,
"grad_norm": 5.555410385131836,
"learning_rate": 2.8121353558926487e-05,
"loss": 2.5364,
"step": 76500
},
{
"epoch": 0.4404328826046171,
"grad_norm": 5.454427719116211,
"learning_rate": 2.7978355869769148e-05,
"loss": 2.5602,
"step": 77000
},
{
"epoch": 0.44329283638776396,
"grad_norm": 16.772747039794922,
"learning_rate": 2.7835358180611805e-05,
"loss": 2.5674,
"step": 77500
},
{
"epoch": 0.44615279017091086,
"grad_norm": 8.047761917114258,
"learning_rate": 2.7692360491454462e-05,
"loss": 2.5334,
"step": 78000
},
{
"epoch": 0.4490127439540577,
"grad_norm": 6.612277507781982,
"learning_rate": 2.754936280229712e-05,
"loss": 2.5525,
"step": 78500
},
{
"epoch": 0.45187269773720457,
"grad_norm": 6.439370632171631,
"learning_rate": 2.740636511313977e-05,
"loss": 2.5349,
"step": 79000
},
{
"epoch": 0.4547326515203514,
"grad_norm": 6.890873908996582,
"learning_rate": 2.7263367423982427e-05,
"loss": 2.5145,
"step": 79500
},
{
"epoch": 0.4575926053034983,
"grad_norm": 5.4768500328063965,
"learning_rate": 2.7120369734825084e-05,
"loss": 2.5277,
"step": 80000
},
{
"epoch": 0.4604525590866452,
"grad_norm": 5.825018405914307,
"learning_rate": 2.697737204566774e-05,
"loss": 2.5505,
"step": 80500
},
{
"epoch": 0.463312512869792,
"grad_norm": 6.583479881286621,
"learning_rate": 2.68343743565104e-05,
"loss": 2.5562,
"step": 81000
},
{
"epoch": 0.4661724666529389,
"grad_norm": 6.420114040374756,
"learning_rate": 2.669137666735306e-05,
"loss": 2.5094,
"step": 81500
},
{
"epoch": 0.46903242043608573,
"grad_norm": 6.8168110847473145,
"learning_rate": 2.6548378978195716e-05,
"loss": 2.5347,
"step": 82000
},
{
"epoch": 0.47189237421923264,
"grad_norm": 6.224096298217773,
"learning_rate": 2.6405381289038373e-05,
"loss": 2.5154,
"step": 82500
},
{
"epoch": 0.4747523280023795,
"grad_norm": 6.240240097045898,
"learning_rate": 2.626238359988103e-05,
"loss": 2.535,
"step": 83000
},
{
"epoch": 0.47761228178552634,
"grad_norm": 6.053983211517334,
"learning_rate": 2.611938591072368e-05,
"loss": 2.5275,
"step": 83500
},
{
"epoch": 0.4804722355686732,
"grad_norm": 5.546879768371582,
"learning_rate": 2.5976388221566338e-05,
"loss": 2.5329,
"step": 84000
},
{
"epoch": 0.4833321893518201,
"grad_norm": 6.190423011779785,
"learning_rate": 2.5833390532408995e-05,
"loss": 2.5174,
"step": 84500
},
{
"epoch": 0.48619214313496695,
"grad_norm": 5.437402248382568,
"learning_rate": 2.5690392843251655e-05,
"loss": 2.49,
"step": 85000
},
{
"epoch": 0.4890520969181138,
"grad_norm": 6.8163557052612305,
"learning_rate": 2.5547395154094312e-05,
"loss": 2.524,
"step": 85500
},
{
"epoch": 0.49191205070126065,
"grad_norm": 6.754604816436768,
"learning_rate": 2.540439746493697e-05,
"loss": 2.5041,
"step": 86000
},
{
"epoch": 0.49477200448440756,
"grad_norm": 5.496472358703613,
"learning_rate": 2.5261399775779627e-05,
"loss": 2.5277,
"step": 86500
},
{
"epoch": 0.4976319582675544,
"grad_norm": 5.616280555725098,
"learning_rate": 2.5118402086622284e-05,
"loss": 2.5061,
"step": 87000
},
{
"epoch": 0.5004919120507013,
"grad_norm": 6.141283988952637,
"learning_rate": 2.4975404397464938e-05,
"loss": 2.5214,
"step": 87500
},
{
"epoch": 0.5033518658338482,
"grad_norm": 6.124631404876709,
"learning_rate": 2.4832406708307595e-05,
"loss": 2.4854,
"step": 88000
},
{
"epoch": 0.506211819616995,
"grad_norm": 6.740499496459961,
"learning_rate": 2.4689409019150252e-05,
"loss": 2.5054,
"step": 88500
},
{
"epoch": 0.5090717734001419,
"grad_norm": 6.040327548980713,
"learning_rate": 2.454641132999291e-05,
"loss": 2.5042,
"step": 89000
},
{
"epoch": 0.5119317271832887,
"grad_norm": 5.564330577850342,
"learning_rate": 2.4403413640835566e-05,
"loss": 2.5021,
"step": 89500
},
{
"epoch": 0.5147916809664356,
"grad_norm": 6.915059566497803,
"learning_rate": 2.4260415951678223e-05,
"loss": 2.5227,
"step": 90000
},
{
"epoch": 0.5176516347495824,
"grad_norm": 6.181910991668701,
"learning_rate": 2.411741826252088e-05,
"loss": 2.5098,
"step": 90500
},
{
"epoch": 0.5205115885327293,
"grad_norm": 5.829164505004883,
"learning_rate": 2.3974420573363534e-05,
"loss": 2.5133,
"step": 91000
},
{
"epoch": 0.5233715423158761,
"grad_norm": 14.621573448181152,
"learning_rate": 2.383142288420619e-05,
"loss": 2.503,
"step": 91500
},
{
"epoch": 0.5262314960990231,
"grad_norm": 6.3930511474609375,
"learning_rate": 2.368842519504885e-05,
"loss": 2.5124,
"step": 92000
},
{
"epoch": 0.5290914498821699,
"grad_norm": 5.840575695037842,
"learning_rate": 2.3545427505891506e-05,
"loss": 2.5177,
"step": 92500
},
{
"epoch": 0.5319514036653168,
"grad_norm": 6.612518787384033,
"learning_rate": 2.3402429816734163e-05,
"loss": 2.4881,
"step": 93000
},
{
"epoch": 0.5348113574484636,
"grad_norm": 6.505732536315918,
"learning_rate": 2.325943212757682e-05,
"loss": 2.4872,
"step": 93500
},
{
"epoch": 0.5376713112316105,
"grad_norm": 7.19988489151001,
"learning_rate": 2.3116434438419477e-05,
"loss": 2.4958,
"step": 94000
},
{
"epoch": 0.5405312650147573,
"grad_norm": 5.988187789916992,
"learning_rate": 2.2973436749262134e-05,
"loss": 2.5094,
"step": 94500
},
{
"epoch": 0.5433912187979042,
"grad_norm": 5.709506511688232,
"learning_rate": 2.2830439060104788e-05,
"loss": 2.4882,
"step": 95000
},
{
"epoch": 0.546251172581051,
"grad_norm": 5.567132949829102,
"learning_rate": 2.2687441370947445e-05,
"loss": 2.4909,
"step": 95500
},
{
"epoch": 0.549111126364198,
"grad_norm": 11.825920104980469,
"learning_rate": 2.2544443681790102e-05,
"loss": 2.4944,
"step": 96000
},
{
"epoch": 0.5519710801473449,
"grad_norm": 5.969587802886963,
"learning_rate": 2.240144599263276e-05,
"loss": 2.4912,
"step": 96500
},
{
"epoch": 0.5548310339304917,
"grad_norm": 6.31153678894043,
"learning_rate": 2.225844830347542e-05,
"loss": 2.4901,
"step": 97000
},
{
"epoch": 0.5576909877136386,
"grad_norm": 7.130558013916016,
"learning_rate": 2.2115450614318074e-05,
"loss": 2.4768,
"step": 97500
},
{
"epoch": 0.5605509414967854,
"grad_norm": 5.947187900543213,
"learning_rate": 2.197245292516073e-05,
"loss": 2.4971,
"step": 98000
},
{
"epoch": 0.5634108952799323,
"grad_norm": 6.830575466156006,
"learning_rate": 2.1829455236003388e-05,
"loss": 2.4901,
"step": 98500
},
{
"epoch": 0.5662708490630791,
"grad_norm": 5.682921409606934,
"learning_rate": 2.1686457546846045e-05,
"loss": 2.4946,
"step": 99000
},
{
"epoch": 0.569130802846226,
"grad_norm": 5.174154758453369,
"learning_rate": 2.15434598576887e-05,
"loss": 2.4813,
"step": 99500
},
{
"epoch": 0.5719907566293728,
"grad_norm": 5.400365352630615,
"learning_rate": 2.1400462168531356e-05,
"loss": 2.4498,
"step": 100000
},
{
"epoch": 0.5748507104125198,
"grad_norm": 5.433869361877441,
"learning_rate": 2.1257464479374013e-05,
"loss": 2.523,
"step": 100500
},
{
"epoch": 0.5777106641956666,
"grad_norm": 6.321377754211426,
"learning_rate": 2.1114466790216674e-05,
"loss": 2.4731,
"step": 101000
},
{
"epoch": 0.5805706179788135,
"grad_norm": 6.643988609313965,
"learning_rate": 2.0971469101059327e-05,
"loss": 2.4837,
"step": 101500
},
{
"epoch": 0.5834305717619603,
"grad_norm": 6.258885383605957,
"learning_rate": 2.0828471411901985e-05,
"loss": 2.4735,
"step": 102000
},
{
"epoch": 0.5862905255451072,
"grad_norm": 5.747689723968506,
"learning_rate": 2.068547372274464e-05,
"loss": 2.4742,
"step": 102500
},
{
"epoch": 0.589150479328254,
"grad_norm": 6.016144275665283,
"learning_rate": 2.05424760335873e-05,
"loss": 2.4633,
"step": 103000
},
{
"epoch": 0.5920104331114009,
"grad_norm": 5.250337600708008,
"learning_rate": 2.0399478344429953e-05,
"loss": 2.467,
"step": 103500
},
{
"epoch": 0.5948703868945477,
"grad_norm": 5.667397975921631,
"learning_rate": 2.025648065527261e-05,
"loss": 2.4709,
"step": 104000
},
{
"epoch": 0.5977303406776946,
"grad_norm": 6.414941310882568,
"learning_rate": 2.0113482966115267e-05,
"loss": 2.4805,
"step": 104500
},
{
"epoch": 0.6005902944608416,
"grad_norm": 6.118762493133545,
"learning_rate": 1.9970485276957927e-05,
"loss": 2.46,
"step": 105000
},
{
"epoch": 0.6034502482439884,
"grad_norm": 7.456865310668945,
"learning_rate": 1.9827487587800584e-05,
"loss": 2.4863,
"step": 105500
},
{
"epoch": 0.6063102020271353,
"grad_norm": 7.2666096687316895,
"learning_rate": 1.9684489898643238e-05,
"loss": 2.431,
"step": 106000
},
{
"epoch": 0.6091701558102821,
"grad_norm": 6.135725975036621,
"learning_rate": 1.9541492209485895e-05,
"loss": 2.4833,
"step": 106500
},
{
"epoch": 0.612030109593429,
"grad_norm": 6.930655002593994,
"learning_rate": 1.9398494520328553e-05,
"loss": 2.4791,
"step": 107000
},
{
"epoch": 0.6148900633765758,
"grad_norm": 5.848691940307617,
"learning_rate": 1.925549683117121e-05,
"loss": 2.4744,
"step": 107500
},
{
"epoch": 0.6177500171597227,
"grad_norm": 6.593609809875488,
"learning_rate": 1.9112499142013863e-05,
"loss": 2.4818,
"step": 108000
},
{
"epoch": 0.6206099709428695,
"grad_norm": 5.148362636566162,
"learning_rate": 1.8969501452856524e-05,
"loss": 2.4863,
"step": 108500
},
{
"epoch": 0.6234699247260165,
"grad_norm": 6.264626979827881,
"learning_rate": 1.882650376369918e-05,
"loss": 2.4896,
"step": 109000
},
{
"epoch": 0.6263298785091633,
"grad_norm": 7.046905040740967,
"learning_rate": 1.8683506074541838e-05,
"loss": 2.4746,
"step": 109500
},
{
"epoch": 0.6291898322923102,
"grad_norm": 6.274538993835449,
"learning_rate": 1.8540508385384492e-05,
"loss": 2.4395,
"step": 110000
},
{
"epoch": 0.632049786075457,
"grad_norm": 5.889391899108887,
"learning_rate": 1.839751069622715e-05,
"loss": 2.4307,
"step": 110500
},
{
"epoch": 0.6349097398586039,
"grad_norm": 5.6989030838012695,
"learning_rate": 1.8254513007069806e-05,
"loss": 2.4297,
"step": 111000
},
{
"epoch": 0.6377696936417507,
"grad_norm": 6.275044918060303,
"learning_rate": 1.8111515317912463e-05,
"loss": 2.4504,
"step": 111500
},
{
"epoch": 0.6406296474248976,
"grad_norm": 6.444321155548096,
"learning_rate": 1.7968517628755117e-05,
"loss": 2.4286,
"step": 112000
},
{
"epoch": 0.6434896012080444,
"grad_norm": 6.624863147735596,
"learning_rate": 1.7825519939597778e-05,
"loss": 2.463,
"step": 112500
},
{
"epoch": 0.6463495549911913,
"grad_norm": 7.994183540344238,
"learning_rate": 1.7682522250440435e-05,
"loss": 2.4362,
"step": 113000
},
{
"epoch": 0.6492095087743383,
"grad_norm": 5.6794257164001465,
"learning_rate": 1.7539524561283092e-05,
"loss": 2.4355,
"step": 113500
},
{
"epoch": 0.6520694625574851,
"grad_norm": 5.606757164001465,
"learning_rate": 1.739652687212575e-05,
"loss": 2.4525,
"step": 114000
},
{
"epoch": 0.654929416340632,
"grad_norm": 6.253554344177246,
"learning_rate": 1.7253529182968403e-05,
"loss": 2.4511,
"step": 114500
},
{
"epoch": 0.6577893701237788,
"grad_norm": 6.014497756958008,
"learning_rate": 1.711053149381106e-05,
"loss": 2.4571,
"step": 115000
},
{
"epoch": 0.6606493239069257,
"grad_norm": 6.601302146911621,
"learning_rate": 1.6967533804653717e-05,
"loss": 2.4505,
"step": 115500
},
{
"epoch": 0.6635092776900725,
"grad_norm": 7.215948104858398,
"learning_rate": 1.6824536115496374e-05,
"loss": 2.4351,
"step": 116000
},
{
"epoch": 0.6663692314732194,
"grad_norm": 5.974714279174805,
"learning_rate": 1.668153842633903e-05,
"loss": 2.4435,
"step": 116500
},
{
"epoch": 0.6692291852563662,
"grad_norm": 6.903178691864014,
"learning_rate": 1.653854073718169e-05,
"loss": 2.4388,
"step": 117000
},
{
"epoch": 0.6720891390395132,
"grad_norm": 6.214517116546631,
"learning_rate": 1.6395543048024346e-05,
"loss": 2.4405,
"step": 117500
},
{
"epoch": 0.67494909282266,
"grad_norm": 6.263461589813232,
"learning_rate": 1.6252545358867003e-05,
"loss": 2.4496,
"step": 118000
},
{
"epoch": 0.6778090466058069,
"grad_norm": 8.066364288330078,
"learning_rate": 1.6109547669709657e-05,
"loss": 2.4368,
"step": 118500
},
{
"epoch": 0.6806690003889537,
"grad_norm": 5.834959506988525,
"learning_rate": 1.5966549980552314e-05,
"loss": 2.4481,
"step": 119000
},
{
"epoch": 0.6835289541721006,
"grad_norm": 6.710206031799316,
"learning_rate": 1.582355229139497e-05,
"loss": 2.4325,
"step": 119500
},
{
"epoch": 0.6863889079552474,
"grad_norm": 5.984834671020508,
"learning_rate": 1.5680554602237628e-05,
"loss": 2.4454,
"step": 120000
},
{
"epoch": 0.6892488617383943,
"grad_norm": 5.370354652404785,
"learning_rate": 1.5537556913080285e-05,
"loss": 2.4279,
"step": 120500
},
{
"epoch": 0.6921088155215411,
"grad_norm": 6.09434175491333,
"learning_rate": 1.5394559223922942e-05,
"loss": 2.4314,
"step": 121000
},
{
"epoch": 0.694968769304688,
"grad_norm": 6.878710746765137,
"learning_rate": 1.52515615347656e-05,
"loss": 2.4191,
"step": 121500
},
{
"epoch": 0.697828723087835,
"grad_norm": 5.660272121429443,
"learning_rate": 1.5108563845608257e-05,
"loss": 2.433,
"step": 122000
},
{
"epoch": 0.7006886768709818,
"grad_norm": 6.489835739135742,
"learning_rate": 1.4965566156450914e-05,
"loss": 2.4491,
"step": 122500
},
{
"epoch": 0.7035486306541286,
"grad_norm": 5.600217819213867,
"learning_rate": 1.4822568467293567e-05,
"loss": 2.4235,
"step": 123000
},
{
"epoch": 0.7064085844372755,
"grad_norm": 5.281232833862305,
"learning_rate": 1.4679570778136226e-05,
"loss": 2.4219,
"step": 123500
},
{
"epoch": 0.7092685382204224,
"grad_norm": 5.651204586029053,
"learning_rate": 1.4536573088978883e-05,
"loss": 2.448,
"step": 124000
},
{
"epoch": 0.7121284920035692,
"grad_norm": 5.520606994628906,
"learning_rate": 1.439357539982154e-05,
"loss": 2.4118,
"step": 124500
},
{
"epoch": 0.714988445786716,
"grad_norm": 6.359561920166016,
"learning_rate": 1.4250577710664196e-05,
"loss": 2.4502,
"step": 125000
},
{
"epoch": 0.7178483995698629,
"grad_norm": 6.264361381530762,
"learning_rate": 1.4107580021506853e-05,
"loss": 2.4214,
"step": 125500
},
{
"epoch": 0.7207083533530099,
"grad_norm": 15.211498260498047,
"learning_rate": 1.396458233234951e-05,
"loss": 2.4476,
"step": 126000
},
{
"epoch": 0.7235683071361567,
"grad_norm": 6.165014266967773,
"learning_rate": 1.3821584643192167e-05,
"loss": 2.4255,
"step": 126500
},
{
"epoch": 0.7264282609193036,
"grad_norm": 5.279512882232666,
"learning_rate": 1.3678586954034823e-05,
"loss": 2.4458,
"step": 127000
},
{
"epoch": 0.7292882147024504,
"grad_norm": 6.13384485244751,
"learning_rate": 1.353558926487748e-05,
"loss": 2.4022,
"step": 127500
},
{
"epoch": 0.7321481684855973,
"grad_norm": 5.577615261077881,
"learning_rate": 1.3392591575720137e-05,
"loss": 2.4174,
"step": 128000
},
{
"epoch": 0.7350081222687441,
"grad_norm": 5.860058784484863,
"learning_rate": 1.3249593886562794e-05,
"loss": 2.4043,
"step": 128500
},
{
"epoch": 0.737868076051891,
"grad_norm": 6.8798065185546875,
"learning_rate": 1.3106596197405451e-05,
"loss": 2.3858,
"step": 129000
},
{
"epoch": 0.7407280298350378,
"grad_norm": 7.996329307556152,
"learning_rate": 1.2963598508248107e-05,
"loss": 2.3993,
"step": 129500
},
{
"epoch": 0.7435879836181847,
"grad_norm": 6.488850116729736,
"learning_rate": 1.2820600819090764e-05,
"loss": 2.4204,
"step": 130000
},
{
"epoch": 0.7464479374013316,
"grad_norm": 5.177313804626465,
"learning_rate": 1.2677603129933421e-05,
"loss": 2.433,
"step": 130500
},
{
"epoch": 0.7493078911844785,
"grad_norm": 6.9536895751953125,
"learning_rate": 1.2534605440776078e-05,
"loss": 2.4145,
"step": 131000
},
{
"epoch": 0.7521678449676253,
"grad_norm": 5.639203071594238,
"learning_rate": 1.2391607751618735e-05,
"loss": 2.3906,
"step": 131500
},
{
"epoch": 0.7550277987507722,
"grad_norm": 5.76200532913208,
"learning_rate": 1.2248610062461391e-05,
"loss": 2.4065,
"step": 132000
},
{
"epoch": 0.757887752533919,
"grad_norm": 7.033239364624023,
"learning_rate": 1.2105612373304048e-05,
"loss": 2.4045,
"step": 132500
},
{
"epoch": 0.7607477063170659,
"grad_norm": 6.319807529449463,
"learning_rate": 1.1962614684146704e-05,
"loss": 2.3646,
"step": 133000
},
{
"epoch": 0.7636076601002127,
"grad_norm": 6.506091117858887,
"learning_rate": 1.1819616994989362e-05,
"loss": 2.4247,
"step": 133500
},
{
"epoch": 0.7664676138833596,
"grad_norm": 6.245853424072266,
"learning_rate": 1.1676619305832018e-05,
"loss": 2.3998,
"step": 134000
},
{
"epoch": 0.7693275676665066,
"grad_norm": 6.403684616088867,
"learning_rate": 1.1533621616674675e-05,
"loss": 2.4072,
"step": 134500
},
{
"epoch": 0.7721875214496534,
"grad_norm": 6.385560035705566,
"learning_rate": 1.1390623927517332e-05,
"loss": 2.4078,
"step": 135000
},
{
"epoch": 0.7750474752328003,
"grad_norm": 6.857175350189209,
"learning_rate": 1.124762623835999e-05,
"loss": 2.4167,
"step": 135500
},
{
"epoch": 0.7779074290159471,
"grad_norm": 5.734222888946533,
"learning_rate": 1.1104628549202645e-05,
"loss": 2.411,
"step": 136000
},
{
"epoch": 0.780767382799094,
"grad_norm": 6.311659812927246,
"learning_rate": 1.0961630860045302e-05,
"loss": 2.4232,
"step": 136500
},
{
"epoch": 0.7836273365822408,
"grad_norm": 6.344162940979004,
"learning_rate": 1.0818633170887959e-05,
"loss": 2.3997,
"step": 137000
},
{
"epoch": 0.7864872903653877,
"grad_norm": 5.971358776092529,
"learning_rate": 1.0675635481730616e-05,
"loss": 2.4181,
"step": 137500
},
{
"epoch": 0.7893472441485345,
"grad_norm": 5.663905620574951,
"learning_rate": 1.0532637792573273e-05,
"loss": 2.3939,
"step": 138000
},
{
"epoch": 0.7922071979316814,
"grad_norm": 5.739428520202637,
"learning_rate": 1.0389640103415929e-05,
"loss": 2.3803,
"step": 138500
},
{
"epoch": 0.7950671517148283,
"grad_norm": 6.558109760284424,
"learning_rate": 1.0246642414258586e-05,
"loss": 2.3794,
"step": 139000
},
{
"epoch": 0.7979271054979752,
"grad_norm": 7.577678203582764,
"learning_rate": 1.0103644725101243e-05,
"loss": 2.4035,
"step": 139500
},
{
"epoch": 0.800787059281122,
"grad_norm": 6.890414237976074,
"learning_rate": 9.9606470359439e-06,
"loss": 2.3791,
"step": 140000
},
{
"epoch": 0.8036470130642689,
"grad_norm": 6.212318420410156,
"learning_rate": 9.817649346786556e-06,
"loss": 2.363,
"step": 140500
},
{
"epoch": 0.8065069668474157,
"grad_norm": 6.501023292541504,
"learning_rate": 9.674651657629213e-06,
"loss": 2.3794,
"step": 141000
},
{
"epoch": 0.8093669206305626,
"grad_norm": 6.136830806732178,
"learning_rate": 9.53165396847187e-06,
"loss": 2.3835,
"step": 141500
},
{
"epoch": 0.8122268744137094,
"grad_norm": 6.386491298675537,
"learning_rate": 9.388656279314527e-06,
"loss": 2.3836,
"step": 142000
},
{
"epoch": 0.8150868281968563,
"grad_norm": 6.060532093048096,
"learning_rate": 9.245658590157182e-06,
"loss": 2.3714,
"step": 142500
},
{
"epoch": 0.8179467819800033,
"grad_norm": 6.481443405151367,
"learning_rate": 9.10266090099984e-06,
"loss": 2.3842,
"step": 143000
},
{
"epoch": 0.8208067357631501,
"grad_norm": 6.378634929656982,
"learning_rate": 8.959663211842497e-06,
"loss": 2.4011,
"step": 143500
},
{
"epoch": 0.823666689546297,
"grad_norm": 7.321898937225342,
"learning_rate": 8.816665522685154e-06,
"loss": 2.3874,
"step": 144000
},
{
"epoch": 0.8265266433294438,
"grad_norm": 5.878232479095459,
"learning_rate": 8.673667833527811e-06,
"loss": 2.3747,
"step": 144500
},
{
"epoch": 0.8293865971125907,
"grad_norm": 6.182088375091553,
"learning_rate": 8.530670144370468e-06,
"loss": 2.3928,
"step": 145000
},
{
"epoch": 0.8322465508957375,
"grad_norm": 6.2058258056640625,
"learning_rate": 8.387672455213125e-06,
"loss": 2.3784,
"step": 145500
},
{
"epoch": 0.8351065046788844,
"grad_norm": 6.231584072113037,
"learning_rate": 8.24467476605578e-06,
"loss": 2.3715,
"step": 146000
},
{
"epoch": 0.8379664584620312,
"grad_norm": 6.14652156829834,
"learning_rate": 8.101677076898438e-06,
"loss": 2.3789,
"step": 146500
},
{
"epoch": 0.8408264122451781,
"grad_norm": 6.431158065795898,
"learning_rate": 7.958679387741095e-06,
"loss": 2.3792,
"step": 147000
},
{
"epoch": 0.843686366028325,
"grad_norm": 5.822235584259033,
"learning_rate": 7.815681698583752e-06,
"loss": 2.4062,
"step": 147500
},
{
"epoch": 0.8465463198114719,
"grad_norm": 5.64607048034668,
"learning_rate": 7.672684009426408e-06,
"loss": 2.368,
"step": 148000
},
{
"epoch": 0.8494062735946187,
"grad_norm": 6.182931900024414,
"learning_rate": 7.5296863202690655e-06,
"loss": 2.3877,
"step": 148500
},
{
"epoch": 0.8522662273777656,
"grad_norm": 6.151760578155518,
"learning_rate": 7.386688631111721e-06,
"loss": 2.3915,
"step": 149000
},
{
"epoch": 0.8551261811609124,
"grad_norm": 6.303664684295654,
"learning_rate": 7.243690941954379e-06,
"loss": 2.3565,
"step": 149500
},
{
"epoch": 0.8579861349440593,
"grad_norm": 6.381216526031494,
"learning_rate": 7.100693252797034e-06,
"loss": 2.3697,
"step": 150000
},
{
"epoch": 0.8608460887272061,
"grad_norm": 5.706302165985107,
"learning_rate": 6.957695563639692e-06,
"loss": 2.4026,
"step": 150500
},
{
"epoch": 0.863706042510353,
"grad_norm": 7.22359561920166,
"learning_rate": 6.814697874482348e-06,
"loss": 2.3759,
"step": 151000
},
{
"epoch": 0.8665659962935,
"grad_norm": 5.458381652832031,
"learning_rate": 6.671700185325006e-06,
"loss": 2.3836,
"step": 151500
},
{
"epoch": 0.8694259500766468,
"grad_norm": 5.785479545593262,
"learning_rate": 6.528702496167661e-06,
"loss": 2.3655,
"step": 152000
},
{
"epoch": 0.8722859038597937,
"grad_norm": 5.856048583984375,
"learning_rate": 6.385704807010319e-06,
"loss": 2.3669,
"step": 152500
},
{
"epoch": 0.8751458576429405,
"grad_norm": 5.491500377655029,
"learning_rate": 6.2427071178529756e-06,
"loss": 2.4154,
"step": 153000
},
{
"epoch": 0.8780058114260874,
"grad_norm": 5.936758518218994,
"learning_rate": 6.099709428695633e-06,
"loss": 2.3702,
"step": 153500
},
{
"epoch": 0.8808657652092342,
"grad_norm": 7.138918399810791,
"learning_rate": 5.956711739538289e-06,
"loss": 2.3582,
"step": 154000
},
{
"epoch": 0.8837257189923811,
"grad_norm": 6.457569122314453,
"learning_rate": 5.813714050380946e-06,
"loss": 2.381,
"step": 154500
},
{
"epoch": 0.8865856727755279,
"grad_norm": 6.026115894317627,
"learning_rate": 5.6707163612236024e-06,
"loss": 2.385,
"step": 155000
},
{
"epoch": 0.8894456265586748,
"grad_norm": 6.851065158843994,
"learning_rate": 5.52771867206626e-06,
"loss": 2.3664,
"step": 155500
},
{
"epoch": 0.8923055803418217,
"grad_norm": 6.16819953918457,
"learning_rate": 5.384720982908916e-06,
"loss": 2.3814,
"step": 156000
},
{
"epoch": 0.8951655341249686,
"grad_norm": 5.917440891265869,
"learning_rate": 5.241723293751574e-06,
"loss": 2.3701,
"step": 156500
},
{
"epoch": 0.8980254879081154,
"grad_norm": 10.217552185058594,
"learning_rate": 5.09872560459423e-06,
"loss": 2.3516,
"step": 157000
},
{
"epoch": 0.9008854416912623,
"grad_norm": 7.088205814361572,
"learning_rate": 4.955727915436887e-06,
"loss": 2.3936,
"step": 157500
},
{
"epoch": 0.9037453954744091,
"grad_norm": 6.357458591461182,
"learning_rate": 4.812730226279544e-06,
"loss": 2.3672,
"step": 158000
},
{
"epoch": 0.906605349257556,
"grad_norm": 6.871440887451172,
"learning_rate": 4.669732537122201e-06,
"loss": 2.3691,
"step": 158500
},
{
"epoch": 0.9094653030407028,
"grad_norm": 6.192137718200684,
"learning_rate": 4.526734847964857e-06,
"loss": 2.3608,
"step": 159000
},
{
"epoch": 0.9123252568238497,
"grad_norm": 6.265544414520264,
"learning_rate": 4.383737158807514e-06,
"loss": 2.3682,
"step": 159500
},
{
"epoch": 0.9151852106069966,
"grad_norm": 5.907118320465088,
"learning_rate": 4.2407394696501705e-06,
"loss": 2.3423,
"step": 160000
},
{
"epoch": 0.9180451643901435,
"grad_norm": 6.204267501831055,
"learning_rate": 4.097741780492828e-06,
"loss": 2.3605,
"step": 160500
},
{
"epoch": 0.9209051181732903,
"grad_norm": 6.978556156158447,
"learning_rate": 3.954744091335484e-06,
"loss": 2.3594,
"step": 161000
},
{
"epoch": 0.9237650719564372,
"grad_norm": 6.3842082023620605,
"learning_rate": 3.811746402178141e-06,
"loss": 2.3677,
"step": 161500
},
{
"epoch": 0.926625025739584,
"grad_norm": 6.20996618270874,
"learning_rate": 3.6687487130207977e-06,
"loss": 2.3538,
"step": 162000
},
{
"epoch": 0.9294849795227309,
"grad_norm": 6.184482574462891,
"learning_rate": 3.5257510238634545e-06,
"loss": 2.3787,
"step": 162500
},
{
"epoch": 0.9323449333058778,
"grad_norm": 6.219623565673828,
"learning_rate": 3.382753334706111e-06,
"loss": 2.3774,
"step": 163000
},
{
"epoch": 0.9352048870890246,
"grad_norm": 6.634711742401123,
"learning_rate": 3.239755645548768e-06,
"loss": 2.3671,
"step": 163500
},
{
"epoch": 0.9380648408721715,
"grad_norm": 7.119485855102539,
"learning_rate": 3.096757956391425e-06,
"loss": 2.356,
"step": 164000
},
{
"epoch": 0.9409247946553184,
"grad_norm": 6.833123207092285,
"learning_rate": 2.9537602672340818e-06,
"loss": 2.3451,
"step": 164500
},
{
"epoch": 0.9437847484384653,
"grad_norm": 6.631540298461914,
"learning_rate": 2.8107625780767385e-06,
"loss": 2.3324,
"step": 165000
},
{
"epoch": 0.9466447022216121,
"grad_norm": 6.187737941741943,
"learning_rate": 2.667764888919395e-06,
"loss": 2.3573,
"step": 165500
},
{
"epoch": 0.949504656004759,
"grad_norm": 5.523457050323486,
"learning_rate": 2.524767199762052e-06,
"loss": 2.3468,
"step": 166000
},
{
"epoch": 0.9523646097879058,
"grad_norm": 6.898806095123291,
"learning_rate": 2.381769510604709e-06,
"loss": 2.3534,
"step": 166500
},
{
"epoch": 0.9552245635710527,
"grad_norm": 6.348108291625977,
"learning_rate": 2.2387718214473658e-06,
"loss": 2.3588,
"step": 167000
},
{
"epoch": 0.9580845173541995,
"grad_norm": 6.188412189483643,
"learning_rate": 2.0957741322900225e-06,
"loss": 2.3607,
"step": 167500
},
{
"epoch": 0.9609444711373464,
"grad_norm": 6.769163608551025,
"learning_rate": 1.952776443132679e-06,
"loss": 2.3721,
"step": 168000
},
{
"epoch": 0.9638044249204932,
"grad_norm": 6.389153957366943,
"learning_rate": 1.8097787539753357e-06,
"loss": 2.381,
"step": 168500
},
{
"epoch": 0.9666643787036402,
"grad_norm": 5.625518798828125,
"learning_rate": 1.6667810648179926e-06,
"loss": 2.3656,
"step": 169000
},
{
"epoch": 0.969524332486787,
"grad_norm": 6.03477144241333,
"learning_rate": 1.5237833756606493e-06,
"loss": 2.3796,
"step": 169500
},
{
"epoch": 0.9723842862699339,
"grad_norm": 6.034476280212402,
"learning_rate": 1.3807856865033063e-06,
"loss": 2.3407,
"step": 170000
},
{
"epoch": 0.9752442400530807,
"grad_norm": 6.318973541259766,
"learning_rate": 1.237787997345963e-06,
"loss": 2.3537,
"step": 170500
},
{
"epoch": 0.9781041938362276,
"grad_norm": 6.3570237159729,
"learning_rate": 1.0947903081886197e-06,
"loss": 2.3744,
"step": 171000
},
{
"epoch": 0.9809641476193744,
"grad_norm": 5.440378189086914,
"learning_rate": 9.517926190312765e-07,
"loss": 2.3775,
"step": 171500
},
{
"epoch": 0.9838241014025213,
"grad_norm": 7.5823655128479,
"learning_rate": 8.087949298739332e-07,
"loss": 2.3301,
"step": 172000
},
{
"epoch": 0.9866840551856682,
"grad_norm": 6.07295560836792,
"learning_rate": 6.6579724071659e-07,
"loss": 2.3347,
"step": 172500
},
{
"epoch": 0.9895440089688151,
"grad_norm": 7.158942222595215,
"learning_rate": 5.227995515592468e-07,
"loss": 2.3567,
"step": 173000
},
{
"epoch": 0.992403962751962,
"grad_norm": 6.406834125518799,
"learning_rate": 3.798018624019036e-07,
"loss": 2.3204,
"step": 173500
},
{
"epoch": 0.9952639165351088,
"grad_norm": 5.863027572631836,
"learning_rate": 2.3680417324456038e-07,
"loss": 2.3569,
"step": 174000
},
{
"epoch": 0.9981238703182557,
"grad_norm": 6.552116394042969,
"learning_rate": 9.380648408721716e-08,
"loss": 2.3332,
"step": 174500
},
{
"epoch": 1.0,
"step": 174828,
"total_flos": 1.8427441878551347e+17,
"train_loss": 1.5726176189089465,
"train_runtime": 27622.4465,
"train_samples_per_second": 25.317,
"train_steps_per_second": 6.329
}
],
"logging_steps": 500,
"max_steps": 174828,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.8427441878551347e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}
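
The log above follows the trainer_state.json layout written by the Hugging Face Trainer: each entry in "log_history" records the global step, epoch fraction, gradient norm, learning rate, and training loss at every "logging_steps" interval (500 here), and the final entry summarizes the whole run (runtime, throughput, total FLOs). Below is a minimal sketch of how one might load the file and plot the loss curve and the linear learning-rate decay it records; it assumes Python with matplotlib installed, and the file path used is hypothetical (point it at wherever this checkpoint's trainer_state.json lives).

    import json
    import matplotlib.pyplot as plt

    # Hypothetical path to this file inside the downloaded checkpoint directory.
    with open("deberta-v3-base-zyda-2/trainer_state.json") as f:
        state = json.load(f)

    # Keep only the per-interval logging records; the final summary entry
    # (train_runtime, total_flos, ...) has no "loss" key and is skipped.
    records = [r for r in state["log_history"] if "loss" in r]

    steps = [r["step"] for r in records]
    losses = [r["loss"] for r in records]
    lrs = [r["learning_rate"] for r in records]

    # Training loss on top, linearly decaying learning-rate schedule below.
    fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
    ax_loss.plot(steps, losses)
    ax_loss.set_ylabel("training loss")
    ax_lr.plot(steps, lrs)
    ax_lr.set_ylabel("learning rate")
    ax_lr.set_xlabel("global step")
    fig.tight_layout()
    plt.show()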