ovis-agent / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 25.0,
"eval_steps": 500,
"global_step": 132825,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.09410878976096368,
"grad_norm": 7.552914619445801,
"learning_rate": 4.843152017065061e-05,
"loss": 1.9573,
"step": 500
},
{
"epoch": 0.18821757952192736,
"grad_norm": 10.377354621887207,
"learning_rate": 4.686304034130121e-05,
"loss": 1.6637,
"step": 1000
},
{
"epoch": 0.282326369282891,
"grad_norm": 10.336068153381348,
"learning_rate": 4.529456051195182e-05,
"loss": 1.6903,
"step": 1500
},
{
"epoch": 0.3764351590438547,
"grad_norm": 4.373341083526611,
"learning_rate": 4.372608068260242e-05,
"loss": 1.7066,
"step": 2000
},
{
"epoch": 0.47054394880481837,
"grad_norm": 6.67123556137085,
"learning_rate": 4.2157600853253026e-05,
"loss": 1.6262,
"step": 2500
},
{
"epoch": 0.564652738565782,
"grad_norm": 7.045924663543701,
"learning_rate": 4.0589121023903634e-05,
"loss": 1.6133,
"step": 3000
},
{
"epoch": 0.6587615283267457,
"grad_norm": 8.646467208862305,
"learning_rate": 3.902064119455424e-05,
"loss": 1.5958,
"step": 3500
},
{
"epoch": 0.7528703180877094,
"grad_norm": 17.767257690429688,
"learning_rate": 3.745216136520485e-05,
"loss": 1.5926,
"step": 4000
},
{
"epoch": 0.846979107848673,
"grad_norm": 6.851809978485107,
"learning_rate": 3.588368153585545e-05,
"loss": 1.5898,
"step": 4500
},
{
"epoch": 0.9410878976096367,
"grad_norm": 6.367198467254639,
"learning_rate": 3.431520170650606e-05,
"loss": 1.5842,
"step": 5000
},
{
"epoch": 1.0351966873706004,
"grad_norm": 3.8963205814361572,
"learning_rate": 3.274672187715666e-05,
"loss": 1.6023,
"step": 5500
},
{
"epoch": 1.129305477131564,
"grad_norm": 3.492241382598877,
"learning_rate": 3.1178242047807265e-05,
"loss": 1.5835,
"step": 6000
},
{
"epoch": 1.2234142668925279,
"grad_norm": 6.9010396003723145,
"learning_rate": 2.960976221845787e-05,
"loss": 1.5804,
"step": 6500
},
{
"epoch": 1.3175230566534915,
"grad_norm": 7.097581386566162,
"learning_rate": 2.8041282389108477e-05,
"loss": 1.5685,
"step": 7000
},
{
"epoch": 1.411631846414455,
"grad_norm": 4.106161117553711,
"learning_rate": 2.647280255975908e-05,
"loss": 1.5557,
"step": 7500
},
{
"epoch": 1.5057406361754189,
"grad_norm": 5.339535236358643,
"learning_rate": 2.490432273040969e-05,
"loss": 1.5266,
"step": 8000
},
{
"epoch": 1.5998494259363825,
"grad_norm": 5.467094421386719,
"learning_rate": 2.3335842901060293e-05,
"loss": 1.5394,
"step": 8500
},
{
"epoch": 1.693958215697346,
"grad_norm": 5.166747093200684,
"learning_rate": 2.17673630717109e-05,
"loss": 1.5131,
"step": 9000
},
{
"epoch": 1.7880670054583097,
"grad_norm": 3.2604665756225586,
"learning_rate": 2.0198883242361504e-05,
"loss": 1.4695,
"step": 9500
},
{
"epoch": 1.8821757952192735,
"grad_norm": 6.9691362380981445,
"learning_rate": 1.8630403413012108e-05,
"loss": 1.4532,
"step": 10000
},
{
"epoch": 1.9762845849802373,
"grad_norm": 3.044600248336792,
"learning_rate": 1.7061923583662716e-05,
"loss": 1.4349,
"step": 10500
},
{
"epoch": 2.070393374741201,
"grad_norm": 5.485976219177246,
"learning_rate": 1.549344375431332e-05,
"loss": 1.4098,
"step": 11000
},
{
"epoch": 2.1645021645021645,
"grad_norm": 3.2020225524902344,
"learning_rate": 1.3924963924963927e-05,
"loss": 1.4025,
"step": 11500
},
{
"epoch": 2.258610954263128,
"grad_norm": 4.406314849853516,
"learning_rate": 1.235648409561453e-05,
"loss": 1.384,
"step": 12000
},
{
"epoch": 2.3527197440240917,
"grad_norm": 7.23325252532959,
"learning_rate": 1.0788004266265137e-05,
"loss": 1.3831,
"step": 12500
},
{
"epoch": 2.4468285337850557,
"grad_norm": 6.645949840545654,
"learning_rate": 9.219524436915741e-06,
"loss": 1.3611,
"step": 13000
},
{
"epoch": 2.5409373235460193,
"grad_norm": 4.476417064666748,
"learning_rate": 7.651044607566347e-06,
"loss": 1.3478,
"step": 13500
},
{
"epoch": 2.635046113306983,
"grad_norm": 4.497859954833984,
"learning_rate": 6.082564778216952e-06,
"loss": 1.3354,
"step": 14000
},
{
"epoch": 2.7291549030679465,
"grad_norm": 4.2654008865356445,
"learning_rate": 4.514084948867558e-06,
"loss": 1.3229,
"step": 14500
},
{
"epoch": 2.82326369282891,
"grad_norm": 3.607623338699341,
"learning_rate": 2.945605119518163e-06,
"loss": 1.3001,
"step": 15000
},
{
"epoch": 2.917372482589874,
"grad_norm": 3.7838690280914307,
"learning_rate": 1.3771252901687685e-06,
"loss": 1.2967,
"step": 15500
},
{
"epoch": 3.0114812723508377,
"grad_norm": 7.810318946838379,
"learning_rate": 3.494259363824581e-05,
"loss": 1.4147,
"step": 16000
},
{
"epoch": 3.1055900621118013,
"grad_norm": 3.653393030166626,
"learning_rate": 3.4472049689440996e-05,
"loss": 1.4375,
"step": 16500
},
{
"epoch": 3.199698851872765,
"grad_norm": 6.7401018142700195,
"learning_rate": 3.400150574063617e-05,
"loss": 1.3906,
"step": 17000
},
{
"epoch": 3.2938076416337285,
"grad_norm": 4.238823890686035,
"learning_rate": 3.3530961791831364e-05,
"loss": 1.3716,
"step": 17500
},
{
"epoch": 3.387916431394692,
"grad_norm": 2.793461561203003,
"learning_rate": 3.306041784302654e-05,
"loss": 1.3764,
"step": 18000
},
{
"epoch": 3.4820252211556557,
"grad_norm": 3.1460959911346436,
"learning_rate": 3.2589873894221726e-05,
"loss": 1.3557,
"step": 18500
},
{
"epoch": 3.5761340109166198,
"grad_norm": 6.384432792663574,
"learning_rate": 3.2119329945416903e-05,
"loss": 1.3503,
"step": 19000
},
{
"epoch": 3.6702428006775834,
"grad_norm": 3.8678534030914307,
"learning_rate": 3.164878599661209e-05,
"loss": 1.3407,
"step": 19500
},
{
"epoch": 3.764351590438547,
"grad_norm": 2.776543140411377,
"learning_rate": 3.1178242047807265e-05,
"loss": 1.3593,
"step": 20000
},
{
"epoch": 3.8584603801995105,
"grad_norm": 8.332947731018066,
"learning_rate": 3.070769809900245e-05,
"loss": 1.3323,
"step": 20500
},
{
"epoch": 3.9525691699604746,
"grad_norm": 5.115898609161377,
"learning_rate": 3.0237154150197627e-05,
"loss": 1.3184,
"step": 21000
},
{
"epoch": 4.046677959721438,
"grad_norm": 5.099429607391357,
"learning_rate": 2.9766610201392815e-05,
"loss": 1.3082,
"step": 21500
},
{
"epoch": 4.140786749482402,
"grad_norm": 3.901289939880371,
"learning_rate": 2.9296066252587996e-05,
"loss": 1.3038,
"step": 22000
},
{
"epoch": 4.234895539243365,
"grad_norm": 3.7559354305267334,
"learning_rate": 2.8825522303783176e-05,
"loss": 1.2892,
"step": 22500
},
{
"epoch": 4.329004329004329,
"grad_norm": 4.156054973602295,
"learning_rate": 2.8354978354978357e-05,
"loss": 1.2893,
"step": 23000
},
{
"epoch": 4.423113118765293,
"grad_norm": 3.1980538368225098,
"learning_rate": 2.7884434406173538e-05,
"loss": 1.288,
"step": 23500
},
{
"epoch": 4.517221908526256,
"grad_norm": 2.7643442153930664,
"learning_rate": 2.741389045736872e-05,
"loss": 1.2666,
"step": 24000
},
{
"epoch": 4.61133069828722,
"grad_norm": 2.195343017578125,
"learning_rate": 2.69433465085639e-05,
"loss": 1.2632,
"step": 24500
},
{
"epoch": 4.705439488048183,
"grad_norm": 2.6611695289611816,
"learning_rate": 2.647280255975908e-05,
"loss": 1.262,
"step": 25000
},
{
"epoch": 4.799548277809148,
"grad_norm": 5.055428981781006,
"learning_rate": 2.6002258610954265e-05,
"loss": 1.2687,
"step": 25500
},
{
"epoch": 4.893657067570111,
"grad_norm": 2.993502616882324,
"learning_rate": 2.5531714662149446e-05,
"loss": 1.2439,
"step": 26000
},
{
"epoch": 4.987765857331075,
"grad_norm": 2.6108858585357666,
"learning_rate": 2.5061170713344627e-05,
"loss": 1.2315,
"step": 26500
},
{
"epoch": 5.081874647092039,
"grad_norm": 3.422744035720825,
"learning_rate": 2.4590626764539808e-05,
"loss": 1.2216,
"step": 27000
},
{
"epoch": 5.175983436853002,
"grad_norm": 2.409943103790283,
"learning_rate": 2.412008281573499e-05,
"loss": 1.214,
"step": 27500
},
{
"epoch": 5.270092226613966,
"grad_norm": 2.378814458847046,
"learning_rate": 2.3649538866930173e-05,
"loss": 1.206,
"step": 28000
},
{
"epoch": 5.364201016374929,
"grad_norm": 2.6459624767303467,
"learning_rate": 2.3178994918125354e-05,
"loss": 1.2078,
"step": 28500
},
{
"epoch": 5.458309806135893,
"grad_norm": 3.1709744930267334,
"learning_rate": 2.2708450969320535e-05,
"loss": 1.1908,
"step": 29000
},
{
"epoch": 5.552418595896857,
"grad_norm": 3.47601056098938,
"learning_rate": 2.2237907020515716e-05,
"loss": 1.192,
"step": 29500
},
{
"epoch": 5.64652738565782,
"grad_norm": 3.5006909370422363,
"learning_rate": 2.17673630717109e-05,
"loss": 1.1913,
"step": 30000
},
{
"epoch": 5.740636175418784,
"grad_norm": 2.8024189472198486,
"learning_rate": 2.129681912290608e-05,
"loss": 1.1815,
"step": 30500
},
{
"epoch": 5.834744965179748,
"grad_norm": 4.2450947761535645,
"learning_rate": 2.0826275174101262e-05,
"loss": 1.1793,
"step": 31000
},
{
"epoch": 5.928853754940711,
"grad_norm": 3.6110267639160156,
"learning_rate": 2.0355731225296443e-05,
"loss": 1.1734,
"step": 31500
},
{
"epoch": 6.0229625447016755,
"grad_norm": 4.730632305145264,
"learning_rate": 1.9885187276491627e-05,
"loss": 1.1447,
"step": 32000
},
{
"epoch": 6.117071334462639,
"grad_norm": 3.0380797386169434,
"learning_rate": 1.9414643327686808e-05,
"loss": 1.1206,
"step": 32500
},
{
"epoch": 6.211180124223603,
"grad_norm": 4.4165358543396,
"learning_rate": 1.894409937888199e-05,
"loss": 1.1295,
"step": 33000
},
{
"epoch": 6.305288913984566,
"grad_norm": 3.3100738525390625,
"learning_rate": 1.847355543007717e-05,
"loss": 1.1214,
"step": 33500
},
{
"epoch": 6.39939770374553,
"grad_norm": 4.085879325866699,
"learning_rate": 1.8003011481272354e-05,
"loss": 1.1128,
"step": 34000
},
{
"epoch": 6.4935064935064934,
"grad_norm": 3.0867068767547607,
"learning_rate": 1.7532467532467535e-05,
"loss": 1.114,
"step": 34500
},
{
"epoch": 6.587615283267457,
"grad_norm": 3.721590757369995,
"learning_rate": 1.7061923583662716e-05,
"loss": 1.1037,
"step": 35000
},
{
"epoch": 6.681724073028421,
"grad_norm": 3.4121475219726562,
"learning_rate": 1.6591379634857897e-05,
"loss": 1.107,
"step": 35500
},
{
"epoch": 6.775832862789384,
"grad_norm": 3.0844948291778564,
"learning_rate": 1.6120835686053078e-05,
"loss": 1.1005,
"step": 36000
},
{
"epoch": 6.869941652550348,
"grad_norm": 3.7161357402801514,
"learning_rate": 1.565029173724826e-05,
"loss": 1.0948,
"step": 36500
},
{
"epoch": 6.964050442311311,
"grad_norm": 3.217207193374634,
"learning_rate": 1.5179747788443441e-05,
"loss": 1.0877,
"step": 37000
},
{
"epoch": 7.058159232072276,
"grad_norm": 4.4285078048706055,
"learning_rate": 1.4709203839638622e-05,
"loss": 1.0559,
"step": 37500
},
{
"epoch": 7.1522680218332395,
"grad_norm": 3.0102875232696533,
"learning_rate": 1.4238659890833805e-05,
"loss": 1.0322,
"step": 38000
},
{
"epoch": 7.246376811594203,
"grad_norm": 3.4103572368621826,
"learning_rate": 1.3768115942028985e-05,
"loss": 1.0217,
"step": 38500
},
{
"epoch": 7.340485601355167,
"grad_norm": 3.9534976482391357,
"learning_rate": 1.3297571993224166e-05,
"loss": 1.0248,
"step": 39000
},
{
"epoch": 7.43459439111613,
"grad_norm": 5.0660719871521,
"learning_rate": 1.2827028044419347e-05,
"loss": 1.0233,
"step": 39500
},
{
"epoch": 7.528703180877094,
"grad_norm": 4.05812931060791,
"learning_rate": 1.235648409561453e-05,
"loss": 1.0254,
"step": 40000
},
{
"epoch": 7.6228119706380575,
"grad_norm": 2.9366817474365234,
"learning_rate": 1.1885940146809712e-05,
"loss": 1.0154,
"step": 40500
},
{
"epoch": 7.716920760399021,
"grad_norm": 3.6969943046569824,
"learning_rate": 1.1415396198004895e-05,
"loss": 1.0023,
"step": 41000
},
{
"epoch": 7.811029550159985,
"grad_norm": 3.294569969177246,
"learning_rate": 3.4377940899680034e-05,
"loss": 1.0861,
"step": 41500
},
{
"epoch": 7.905138339920948,
"grad_norm": 1.961162805557251,
"learning_rate": 3.418972332015811e-05,
"loss": 1.105,
"step": 42000
},
{
"epoch": 7.999247129681912,
"grad_norm": 4.648848056793213,
"learning_rate": 3.400150574063617e-05,
"loss": 1.1171,
"step": 42500
},
{
"epoch": 8.093355919442876,
"grad_norm": 4.888842582702637,
"learning_rate": 3.381328816111425e-05,
"loss": 1.0555,
"step": 43000
},
{
"epoch": 8.187464709203839,
"grad_norm": 3.3256289958953857,
"learning_rate": 3.362507058159232e-05,
"loss": 1.059,
"step": 43500
},
{
"epoch": 8.281573498964804,
"grad_norm": 2.744330406188965,
"learning_rate": 3.3436853002070396e-05,
"loss": 1.0818,
"step": 44000
},
{
"epoch": 8.375682288725766,
"grad_norm": 2.7452945709228516,
"learning_rate": 3.3248635422548465e-05,
"loss": 1.0772,
"step": 44500
},
{
"epoch": 8.46979107848673,
"grad_norm": 2.993236780166626,
"learning_rate": 3.306041784302654e-05,
"loss": 1.0896,
"step": 45000
},
{
"epoch": 8.563899868247695,
"grad_norm": 4.128504276275635,
"learning_rate": 3.287220026350462e-05,
"loss": 1.0806,
"step": 45500
},
{
"epoch": 8.658008658008658,
"grad_norm": 3.3763933181762695,
"learning_rate": 3.268398268398268e-05,
"loss": 1.0713,
"step": 46000
},
{
"epoch": 8.752117447769622,
"grad_norm": 2.213223457336426,
"learning_rate": 3.249576510446076e-05,
"loss": 1.0698,
"step": 46500
},
{
"epoch": 8.846226237530585,
"grad_norm": 3.0853641033172607,
"learning_rate": 3.230754752493883e-05,
"loss": 1.0724,
"step": 47000
},
{
"epoch": 8.94033502729155,
"grad_norm": 3.031791925430298,
"learning_rate": 3.2119329945416903e-05,
"loss": 1.0672,
"step": 47500
},
{
"epoch": 9.034443817052512,
"grad_norm": 3.0765955448150635,
"learning_rate": 3.193111236589498e-05,
"loss": 1.0545,
"step": 48000
},
{
"epoch": 9.128552606813477,
"grad_norm": 4.204183101654053,
"learning_rate": 3.174289478637305e-05,
"loss": 1.0037,
"step": 48500
},
{
"epoch": 9.22266139657444,
"grad_norm": 5.794823169708252,
"learning_rate": 3.155467720685112e-05,
"loss": 1.0069,
"step": 49000
},
{
"epoch": 9.316770186335404,
"grad_norm": 3.246511459350586,
"learning_rate": 3.136645962732919e-05,
"loss": 1.0151,
"step": 49500
},
{
"epoch": 9.410878976096367,
"grad_norm": 2.7034826278686523,
"learning_rate": 3.1178242047807265e-05,
"loss": 1.0183,
"step": 50000
},
{
"epoch": 9.504987765857331,
"grad_norm": 2.402880907058716,
"learning_rate": 3.099002446828534e-05,
"loss": 1.0193,
"step": 50500
},
{
"epoch": 9.599096555618296,
"grad_norm": 5.0139336585998535,
"learning_rate": 3.080180688876341e-05,
"loss": 1.0145,
"step": 51000
},
{
"epoch": 9.693205345379258,
"grad_norm": 3.907541275024414,
"learning_rate": 3.061358930924149e-05,
"loss": 1.011,
"step": 51500
},
{
"epoch": 9.787314135140223,
"grad_norm": 4.411949157714844,
"learning_rate": 3.042537172971956e-05,
"loss": 1.0063,
"step": 52000
},
{
"epoch": 9.881422924901186,
"grad_norm": 4.035211563110352,
"learning_rate": 3.0237154150197627e-05,
"loss": 1.0149,
"step": 52500
},
{
"epoch": 9.97553171466215,
"grad_norm": 3.6050124168395996,
"learning_rate": 3.00489365706757e-05,
"loss": 1.0027,
"step": 53000
},
{
"epoch": 10.069640504423113,
"grad_norm": 3.40191388130188,
"learning_rate": 2.9860718991153773e-05,
"loss": 0.9468,
"step": 53500
},
{
"epoch": 10.163749294184077,
"grad_norm": 2.8000032901763916,
"learning_rate": 2.9672501411631846e-05,
"loss": 0.9248,
"step": 54000
},
{
"epoch": 10.25785808394504,
"grad_norm": 4.177460193634033,
"learning_rate": 2.9484283832109923e-05,
"loss": 0.9389,
"step": 54500
},
{
"epoch": 10.351966873706004,
"grad_norm": 3.88246750831604,
"learning_rate": 2.9296066252587996e-05,
"loss": 0.9326,
"step": 55000
},
{
"epoch": 10.446075663466967,
"grad_norm": 4.002796173095703,
"learning_rate": 2.910784867306607e-05,
"loss": 0.9378,
"step": 55500
},
{
"epoch": 10.540184453227932,
"grad_norm": 4.069864749908447,
"learning_rate": 2.8919631093544135e-05,
"loss": 0.9405,
"step": 56000
},
{
"epoch": 10.634293242988894,
"grad_norm": 3.493865966796875,
"learning_rate": 2.8731413514022208e-05,
"loss": 0.9347,
"step": 56500
},
{
"epoch": 10.728402032749859,
"grad_norm": 3.725019693374634,
"learning_rate": 2.8543195934500284e-05,
"loss": 0.9428,
"step": 57000
},
{
"epoch": 10.822510822510823,
"grad_norm": 3.557591438293457,
"learning_rate": 2.8354978354978357e-05,
"loss": 0.9397,
"step": 57500
},
{
"epoch": 10.916619612271786,
"grad_norm": 3.370312213897705,
"learning_rate": 2.816676077545643e-05,
"loss": 0.939,
"step": 58000
},
{
"epoch": 11.01072840203275,
"grad_norm": 2.937887191772461,
"learning_rate": 2.7978543195934503e-05,
"loss": 0.9225,
"step": 58500
},
{
"epoch": 11.104837191793713,
"grad_norm": 2.403350353240967,
"learning_rate": 2.7790325616412576e-05,
"loss": 0.827,
"step": 59000
},
{
"epoch": 11.198945981554678,
"grad_norm": 2.4876935482025146,
"learning_rate": 2.7602108036890646e-05,
"loss": 0.8461,
"step": 59500
},
{
"epoch": 11.29305477131564,
"grad_norm": 3.663511276245117,
"learning_rate": 2.741389045736872e-05,
"loss": 0.8434,
"step": 60000
},
{
"epoch": 11.387163561076605,
"grad_norm": 4.3568806648254395,
"learning_rate": 2.7225672877846792e-05,
"loss": 0.8496,
"step": 60500
},
{
"epoch": 11.481272350837568,
"grad_norm": 3.4234845638275146,
"learning_rate": 2.7037455298324865e-05,
"loss": 0.8549,
"step": 61000
},
{
"epoch": 11.575381140598532,
"grad_norm": 2.537666082382202,
"learning_rate": 2.6849237718802938e-05,
"loss": 0.8548,
"step": 61500
},
{
"epoch": 11.669489930359495,
"grad_norm": 3.7678942680358887,
"learning_rate": 2.666102013928101e-05,
"loss": 0.8501,
"step": 62000
},
{
"epoch": 11.76359872012046,
"grad_norm": 3.4384429454803467,
"learning_rate": 2.647280255975908e-05,
"loss": 0.8546,
"step": 62500
},
{
"epoch": 11.857707509881424,
"grad_norm": 2.883970022201538,
"learning_rate": 2.6284584980237154e-05,
"loss": 0.8494,
"step": 63000
},
{
"epoch": 11.951816299642386,
"grad_norm": 3.2041871547698975,
"learning_rate": 2.6096367400715227e-05,
"loss": 0.8473,
"step": 63500
},
{
"epoch": 12.045925089403351,
"grad_norm": 3.7006897926330566,
"learning_rate": 2.59081498211933e-05,
"loss": 0.7851,
"step": 64000
},
{
"epoch": 12.140033879164314,
"grad_norm": 3.4774985313415527,
"learning_rate": 2.5719932241671373e-05,
"loss": 0.7241,
"step": 64500
},
{
"epoch": 12.234142668925278,
"grad_norm": 3.615403413772583,
"learning_rate": 2.5531714662149446e-05,
"loss": 0.7296,
"step": 65000
},
{
"epoch": 12.32825145868624,
"grad_norm": 4.831261157989502,
"learning_rate": 2.534349708262752e-05,
"loss": 0.7382,
"step": 65500
},
{
"epoch": 12.422360248447205,
"grad_norm": 3.072159767150879,
"learning_rate": 2.515527950310559e-05,
"loss": 0.733,
"step": 66000
},
{
"epoch": 12.516469038208168,
"grad_norm": 3.442324161529541,
"learning_rate": 2.4967061923583665e-05,
"loss": 0.7453,
"step": 66500
},
{
"epoch": 12.610577827969133,
"grad_norm": 2.71148419380188,
"learning_rate": 2.4778844344061735e-05,
"loss": 0.7388,
"step": 67000
},
{
"epoch": 12.704686617730095,
"grad_norm": 3.6085212230682373,
"learning_rate": 2.4590626764539808e-05,
"loss": 0.7444,
"step": 67500
},
{
"epoch": 12.79879540749106,
"grad_norm": 3.9403274059295654,
"learning_rate": 2.440240918501788e-05,
"loss": 0.7291,
"step": 68000
},
{
"epoch": 12.892904197252022,
"grad_norm": 3.322840929031372,
"learning_rate": 2.4214191605495954e-05,
"loss": 0.7523,
"step": 68500
},
{
"epoch": 12.987012987012987,
"grad_norm": 4.849690914154053,
"learning_rate": 2.4025974025974027e-05,
"loss": 0.7477,
"step": 69000
},
{
"epoch": 13.081121776773951,
"grad_norm": 4.5327959060668945,
"learning_rate": 2.38377564464521e-05,
"loss": 0.5957,
"step": 69500
},
{
"epoch": 13.175230566534914,
"grad_norm": 2.9526569843292236,
"learning_rate": 2.3649538866930173e-05,
"loss": 0.5829,
"step": 70000
},
{
"epoch": 13.269339356295879,
"grad_norm": 2.7245306968688965,
"learning_rate": 2.3461321287408243e-05,
"loss": 0.5948,
"step": 70500
},
{
"epoch": 13.363448146056841,
"grad_norm": 3.7524588108062744,
"learning_rate": 2.327310370788632e-05,
"loss": 0.5956,
"step": 71000
},
{
"epoch": 13.457556935817806,
"grad_norm": 4.387008190155029,
"learning_rate": 2.3084886128364392e-05,
"loss": 0.605,
"step": 71500
},
{
"epoch": 13.551665725578768,
"grad_norm": 3.449723482131958,
"learning_rate": 2.2896668548842462e-05,
"loss": 0.6087,
"step": 72000
},
{
"epoch": 13.645774515339733,
"grad_norm": 3.2176854610443115,
"learning_rate": 2.2708450969320535e-05,
"loss": 0.6141,
"step": 72500
},
{
"epoch": 13.739883305100696,
"grad_norm": 4.136612415313721,
"learning_rate": 2.2520233389798608e-05,
"loss": 0.6251,
"step": 73000
},
{
"epoch": 13.83399209486166,
"grad_norm": 4.145909786224365,
"learning_rate": 2.233201581027668e-05,
"loss": 0.6094,
"step": 73500
},
{
"epoch": 13.928100884622623,
"grad_norm": 3.8660261631011963,
"learning_rate": 2.2143798230754754e-05,
"loss": 0.6194,
"step": 74000
},
{
"epoch": 14.022209674383587,
"grad_norm": 3.511117696762085,
"learning_rate": 2.1955580651232827e-05,
"loss": 0.5712,
"step": 74500
},
{
"epoch": 14.116318464144552,
"grad_norm": 2.771902084350586,
"learning_rate": 2.17673630717109e-05,
"loss": 0.4559,
"step": 75000
},
{
"epoch": 14.210427253905515,
"grad_norm": 5.087037086486816,
"learning_rate": 2.157914549218897e-05,
"loss": 0.4529,
"step": 75500
},
{
"epoch": 14.304536043666479,
"grad_norm": 4.842881679534912,
"learning_rate": 2.1390927912667043e-05,
"loss": 0.4567,
"step": 76000
},
{
"epoch": 14.398644833427442,
"grad_norm": 3.444424629211426,
"learning_rate": 2.120271033314512e-05,
"loss": 0.4618,
"step": 76500
},
{
"epoch": 14.492753623188406,
"grad_norm": 3.934549331665039,
"learning_rate": 2.101449275362319e-05,
"loss": 0.455,
"step": 77000
},
{
"epoch": 14.586862412949369,
"grad_norm": 4.338078498840332,
"learning_rate": 2.0826275174101262e-05,
"loss": 0.472,
"step": 77500
},
{
"epoch": 14.680971202710333,
"grad_norm": 3.781182050704956,
"learning_rate": 2.0638057594579335e-05,
"loss": 0.4613,
"step": 78000
},
{
"epoch": 14.775079992471296,
"grad_norm": 3.866917371749878,
"learning_rate": 2.0449840015057405e-05,
"loss": 0.4791,
"step": 78500
},
{
"epoch": 14.86918878223226,
"grad_norm": 3.5265841484069824,
"learning_rate": 2.026162243553548e-05,
"loss": 0.4709,
"step": 79000
},
{
"epoch": 14.963297571993223,
"grad_norm": 3.3175408840179443,
"learning_rate": 2.0073404856013554e-05,
"loss": 0.4847,
"step": 79500
},
{
"epoch": 15.057406361754188,
"grad_norm": 2.848555564880371,
"learning_rate": 1.9885187276491627e-05,
"loss": 0.3788,
"step": 80000
},
{
"epoch": 15.151515151515152,
"grad_norm": 3.355653762817383,
"learning_rate": 1.9696969696969697e-05,
"loss": 0.3235,
"step": 80500
},
{
"epoch": 15.245623941276115,
"grad_norm": 3.260960817337036,
"learning_rate": 1.950875211744777e-05,
"loss": 0.3162,
"step": 81000
},
{
"epoch": 15.33973273103708,
"grad_norm": 3.9132673740386963,
"learning_rate": 1.9320534537925843e-05,
"loss": 0.3209,
"step": 81500
},
{
"epoch": 15.433841520798042,
"grad_norm": 4.693389892578125,
"learning_rate": 1.9132316958403916e-05,
"loss": 0.3421,
"step": 82000
},
{
"epoch": 15.527950310559007,
"grad_norm": 3.7237417697906494,
"learning_rate": 1.894409937888199e-05,
"loss": 0.3441,
"step": 82500
},
{
"epoch": 15.62205910031997,
"grad_norm": 3.9301464557647705,
"learning_rate": 1.8755881799360062e-05,
"loss": 0.3424,
"step": 83000
},
{
"epoch": 15.716167890080934,
"grad_norm": 4.186377048492432,
"learning_rate": 1.8567664219838135e-05,
"loss": 0.3503,
"step": 83500
},
{
"epoch": 15.810276679841897,
"grad_norm": 3.6368534564971924,
"learning_rate": 1.8379446640316205e-05,
"loss": 0.3347,
"step": 84000
},
{
"epoch": 15.904385469602861,
"grad_norm": 2.6787190437316895,
"learning_rate": 1.8191229060794278e-05,
"loss": 0.3445,
"step": 84500
},
{
"epoch": 15.998494259363824,
"grad_norm": 3.942444324493408,
"learning_rate": 1.8003011481272354e-05,
"loss": 0.343,
"step": 85000
},
{
"epoch": 16.09260304912479,
"grad_norm": 3.3087995052337646,
"learning_rate": 1.7814793901750424e-05,
"loss": 0.2093,
"step": 85500
},
{
"epoch": 16.186711838885753,
"grad_norm": 4.419996738433838,
"learning_rate": 1.7626576322228497e-05,
"loss": 0.2209,
"step": 86000
},
{
"epoch": 16.280820628646715,
"grad_norm": 4.352416515350342,
"learning_rate": 1.743835874270657e-05,
"loss": 0.2282,
"step": 86500
},
{
"epoch": 16.374929418407678,
"grad_norm": 3.7437074184417725,
"learning_rate": 1.725014116318464e-05,
"loss": 0.232,
"step": 87000
},
{
"epoch": 16.469038208168644,
"grad_norm": 6.4077534675598145,
"learning_rate": 1.7061923583662716e-05,
"loss": 0.2291,
"step": 87500
},
{
"epoch": 16.563146997929607,
"grad_norm": 3.0419087409973145,
"learning_rate": 1.687370600414079e-05,
"loss": 0.2401,
"step": 88000
},
{
"epoch": 16.65725578769057,
"grad_norm": 4.095304012298584,
"learning_rate": 1.6685488424618862e-05,
"loss": 0.233,
"step": 88500
},
{
"epoch": 16.751364577451533,
"grad_norm": 3.372295618057251,
"learning_rate": 1.649727084509693e-05,
"loss": 0.2354,
"step": 89000
},
{
"epoch": 16.8454733672125,
"grad_norm": 2.918405771255493,
"learning_rate": 1.6309053265575005e-05,
"loss": 0.2421,
"step": 89500
},
{
"epoch": 16.93958215697346,
"grad_norm": 2.4548110961914062,
"learning_rate": 1.6120835686053078e-05,
"loss": 0.2374,
"step": 90000
},
{
"epoch": 17.033690946734424,
"grad_norm": 4.15058708190918,
"learning_rate": 1.593261810653115e-05,
"loss": 0.2135,
"step": 90500
},
{
"epoch": 17.12779973649539,
"grad_norm": 2.979170083999634,
"learning_rate": 1.5744400527009224e-05,
"loss": 0.1495,
"step": 91000
},
{
"epoch": 17.221908526256353,
"grad_norm": 3.950493574142456,
"learning_rate": 1.5556182947487297e-05,
"loss": 0.1503,
"step": 91500
},
{
"epoch": 17.316017316017316,
"grad_norm": 3.174896001815796,
"learning_rate": 1.5367965367965366e-05,
"loss": 0.1559,
"step": 92000
},
{
"epoch": 17.41012610577828,
"grad_norm": 3.4672741889953613,
"learning_rate": 1.5179747788443441e-05,
"loss": 0.1606,
"step": 92500
},
{
"epoch": 17.504234895539245,
"grad_norm": 3.950160026550293,
"learning_rate": 1.4991530208921514e-05,
"loss": 0.1561,
"step": 93000
},
{
"epoch": 17.598343685300208,
"grad_norm": 3.546109199523926,
"learning_rate": 1.4803312629399587e-05,
"loss": 0.1622,
"step": 93500
},
{
"epoch": 17.69245247506117,
"grad_norm": 6.592913627624512,
"learning_rate": 1.4615095049877658e-05,
"loss": 0.1644,
"step": 94000
},
{
"epoch": 17.786561264822133,
"grad_norm": 2.762545347213745,
"learning_rate": 1.4426877470355732e-05,
"loss": 0.1614,
"step": 94500
},
{
"epoch": 17.8806700545831,
"grad_norm": 2.9720265865325928,
"learning_rate": 1.4238659890833805e-05,
"loss": 0.1577,
"step": 95000
},
{
"epoch": 17.974778844344062,
"grad_norm": 3.006880283355713,
"learning_rate": 1.4050442311311876e-05,
"loss": 0.1577,
"step": 95500
},
{
"epoch": 18.068887634105025,
"grad_norm": 2.86936616897583,
"learning_rate": 1.3862224731789949e-05,
"loss": 0.1162,
"step": 96000
},
{
"epoch": 18.162996423865987,
"grad_norm": 3.608914852142334,
"learning_rate": 1.3674007152268024e-05,
"loss": 0.1001,
"step": 96500
},
{
"epoch": 18.257105213626954,
"grad_norm": 3.126116991043091,
"learning_rate": 1.3485789572746097e-05,
"loss": 0.1082,
"step": 97000
},
{
"epoch": 18.351214003387916,
"grad_norm": 2.9935250282287598,
"learning_rate": 1.3297571993224166e-05,
"loss": 0.109,
"step": 97500
},
{
"epoch": 18.44532279314888,
"grad_norm": 1.9776825904846191,
"learning_rate": 1.3109354413702241e-05,
"loss": 0.1031,
"step": 98000
},
{
"epoch": 18.539431582909845,
"grad_norm": 4.182958602905273,
"learning_rate": 1.2921136834180314e-05,
"loss": 0.1084,
"step": 98500
},
{
"epoch": 18.633540372670808,
"grad_norm": 3.405510902404785,
"learning_rate": 1.2732919254658385e-05,
"loss": 0.1069,
"step": 99000
},
{
"epoch": 18.72764916243177,
"grad_norm": 2.7036936283111572,
"learning_rate": 1.2544701675136458e-05,
"loss": 0.1084,
"step": 99500
},
{
"epoch": 18.821757952192733,
"grad_norm": 2.601555347442627,
"learning_rate": 1.235648409561453e-05,
"loss": 0.1097,
"step": 100000
},
{
"epoch": 18.9158667419537,
"grad_norm": 2.9937756061553955,
"learning_rate": 1.2168266516092605e-05,
"loss": 0.1004,
"step": 100500
},
{
"epoch": 19.009975531714662,
"grad_norm": 1.493790864944458,
"learning_rate": 1.1980048936570676e-05,
"loss": 0.1032,
"step": 101000
},
{
"epoch": 19.104084321475625,
"grad_norm": 1.7063292264938354,
"learning_rate": 1.1791831357048749e-05,
"loss": 0.0667,
"step": 101500
},
{
"epoch": 19.198193111236588,
"grad_norm": 2.8355624675750732,
"learning_rate": 1.1603613777526822e-05,
"loss": 0.0678,
"step": 102000
},
{
"epoch": 19.292301900997554,
"grad_norm": 1.7610359191894531,
"learning_rate": 1.1415396198004895e-05,
"loss": 0.0654,
"step": 102500
},
{
"epoch": 19.386410690758517,
"grad_norm": 2.759197950363159,
"learning_rate": 1.1227178618482966e-05,
"loss": 0.0717,
"step": 103000
},
{
"epoch": 19.48051948051948,
"grad_norm": 2.657435417175293,
"learning_rate": 1.103896103896104e-05,
"loss": 0.0722,
"step": 103500
},
{
"epoch": 19.574628270280446,
"grad_norm": 2.5865299701690674,
"learning_rate": 1.0850743459439112e-05,
"loss": 0.0689,
"step": 104000
},
{
"epoch": 19.66873706004141,
"grad_norm": 4.484961986541748,
"learning_rate": 1.0662525879917184e-05,
"loss": 0.0706,
"step": 104500
},
{
"epoch": 19.76284584980237,
"grad_norm": 2.431190252304077,
"learning_rate": 1.0474308300395258e-05,
"loss": 0.0691,
"step": 105000
},
{
"epoch": 19.856954639563334,
"grad_norm": 2.6577088832855225,
"learning_rate": 1.028609072087333e-05,
"loss": 0.0704,
"step": 105500
},
{
"epoch": 19.9510634293243,
"grad_norm": 3.1382012367248535,
"learning_rate": 1.0097873141351403e-05,
"loss": 0.0673,
"step": 106000
},
{
"epoch": 20.045172219085263,
"grad_norm": 1.5920716524124146,
"learning_rate": 9.909655561829476e-06,
"loss": 0.0605,
"step": 106500
},
{
"epoch": 20.139281008846226,
"grad_norm": 0.49824026226997375,
"learning_rate": 9.721437982307547e-06,
"loss": 0.0427,
"step": 107000
},
{
"epoch": 20.23338979860719,
"grad_norm": 1.5648902654647827,
"learning_rate": 9.533220402785622e-06,
"loss": 0.042,
"step": 107500
},
{
"epoch": 20.327498588368154,
"grad_norm": 2.084714889526367,
"learning_rate": 9.345002823263693e-06,
"loss": 0.0442,
"step": 108000
},
{
"epoch": 20.421607378129117,
"grad_norm": 2.1811583042144775,
"learning_rate": 9.156785243741765e-06,
"loss": 0.0424,
"step": 108500
},
{
"epoch": 20.51571616789008,
"grad_norm": 3.9864232540130615,
"learning_rate": 8.96856766421984e-06,
"loss": 0.0439,
"step": 109000
},
{
"epoch": 20.609824957651046,
"grad_norm": 4.495816707611084,
"learning_rate": 8.78035008469791e-06,
"loss": 0.0433,
"step": 109500
},
{
"epoch": 20.70393374741201,
"grad_norm": 1.6480516195297241,
"learning_rate": 8.592132505175984e-06,
"loss": 0.0454,
"step": 110000
},
{
"epoch": 20.79804253717297,
"grad_norm": 1.1842634677886963,
"learning_rate": 8.403914925654057e-06,
"loss": 0.0425,
"step": 110500
},
{
"epoch": 20.892151326933934,
"grad_norm": 4.60665225982666,
"learning_rate": 8.215697346132128e-06,
"loss": 0.0419,
"step": 111000
},
{
"epoch": 20.9862601166949,
"grad_norm": 3.4669153690338135,
"learning_rate": 8.027479766610201e-06,
"loss": 0.0412,
"step": 111500
},
{
"epoch": 21.080368906455863,
"grad_norm": 0.7074203491210938,
"learning_rate": 7.839262187088274e-06,
"loss": 0.0285,
"step": 112000
},
{
"epoch": 21.174477696216826,
"grad_norm": 2.58770489692688,
"learning_rate": 7.651044607566347e-06,
"loss": 0.0275,
"step": 112500
},
{
"epoch": 21.26858648597779,
"grad_norm": 0.6419113874435425,
"learning_rate": 7.462827028044419e-06,
"loss": 0.0263,
"step": 113000
},
{
"epoch": 21.362695275738755,
"grad_norm": 2.158191204071045,
"learning_rate": 7.274609448522493e-06,
"loss": 0.0267,
"step": 113500
},
{
"epoch": 21.456804065499718,
"grad_norm": 1.1450761556625366,
"learning_rate": 7.0863918690005655e-06,
"loss": 0.024,
"step": 114000
},
{
"epoch": 21.55091285526068,
"grad_norm": 0.9204089045524597,
"learning_rate": 6.898174289478637e-06,
"loss": 0.0273,
"step": 114500
},
{
"epoch": 21.645021645021647,
"grad_norm": 1.3897809982299805,
"learning_rate": 6.709956709956711e-06,
"loss": 0.0276,
"step": 115000
},
{
"epoch": 21.73913043478261,
"grad_norm": 2.818786382675171,
"learning_rate": 6.521739130434783e-06,
"loss": 0.027,
"step": 115500
},
{
"epoch": 21.833239224543572,
"grad_norm": 1.8107503652572632,
"learning_rate": 6.333521550912856e-06,
"loss": 0.025,
"step": 116000
},
{
"epoch": 21.927348014304535,
"grad_norm": 2.591801881790161,
"learning_rate": 6.145303971390928e-06,
"loss": 0.0254,
"step": 116500
},
{
"epoch": 22.0214568040655,
"grad_norm": 1.22541344165802,
"learning_rate": 5.957086391869001e-06,
"loss": 0.0207,
"step": 117000
},
{
"epoch": 22.115565593826464,
"grad_norm": 2.6778624057769775,
"learning_rate": 5.768868812347074e-06,
"loss": 0.0148,
"step": 117500
},
{
"epoch": 22.209674383587426,
"grad_norm": 1.2167950868606567,
"learning_rate": 5.5806512328251455e-06,
"loss": 0.0134,
"step": 118000
},
{
"epoch": 22.30378317334839,
"grad_norm": 0.1222626119852066,
"learning_rate": 5.3924336533032186e-06,
"loss": 0.0175,
"step": 118500
},
{
"epoch": 22.397891963109355,
"grad_norm": 0.3822714686393738,
"learning_rate": 5.204216073781292e-06,
"loss": 0.0156,
"step": 119000
},
{
"epoch": 22.492000752870318,
"grad_norm": 0.542395293712616,
"learning_rate": 5.015998494259365e-06,
"loss": 0.0141,
"step": 119500
},
{
"epoch": 22.58610954263128,
"grad_norm": 1.051392674446106,
"learning_rate": 4.827780914737437e-06,
"loss": 0.0138,
"step": 120000
},
{
"epoch": 22.680218332392247,
"grad_norm": 1.7265046834945679,
"learning_rate": 4.639563335215509e-06,
"loss": 0.0139,
"step": 120500
},
{
"epoch": 22.77432712215321,
"grad_norm": 0.8157036304473877,
"learning_rate": 4.451345755693582e-06,
"loss": 0.0147,
"step": 121000
},
{
"epoch": 22.868435911914172,
"grad_norm": 2.212116003036499,
"learning_rate": 4.263128176171654e-06,
"loss": 0.0157,
"step": 121500
},
{
"epoch": 22.962544701675135,
"grad_norm": 1.5762394666671753,
"learning_rate": 4.074910596649727e-06,
"loss": 0.0139,
"step": 122000
},
{
"epoch": 23.0566534914361,
"grad_norm": 0.16070736944675446,
"learning_rate": 3.8866930171278e-06,
"loss": 0.0113,
"step": 122500
},
{
"epoch": 23.150762281197064,
"grad_norm": 0.6197986602783203,
"learning_rate": 3.698475437605872e-06,
"loss": 0.0066,
"step": 123000
},
{
"epoch": 23.244871070958027,
"grad_norm": 0.2145221084356308,
"learning_rate": 3.510257858083945e-06,
"loss": 0.0061,
"step": 123500
},
{
"epoch": 23.33897986071899,
"grad_norm": 0.1779479682445526,
"learning_rate": 3.3220402785620177e-06,
"loss": 0.0059,
"step": 124000
},
{
"epoch": 23.433088650479956,
"grad_norm": 0.18615959584712982,
"learning_rate": 3.1338226990400907e-06,
"loss": 0.0057,
"step": 124500
},
{
"epoch": 23.52719744024092,
"grad_norm": 1.9545246362686157,
"learning_rate": 2.945605119518163e-06,
"loss": 0.0067,
"step": 125000
},
{
"epoch": 23.62130623000188,
"grad_norm": 2.255216598510742,
"learning_rate": 2.7573875399962355e-06,
"loss": 0.0062,
"step": 125500
},
{
"epoch": 23.715415019762847,
"grad_norm": 0.28629258275032043,
"learning_rate": 2.5691699604743086e-06,
"loss": 0.0054,
"step": 126000
},
{
"epoch": 23.80952380952381,
"grad_norm": 0.1650991588830948,
"learning_rate": 2.3809523809523808e-06,
"loss": 0.0056,
"step": 126500
},
{
"epoch": 23.903632599284773,
"grad_norm": 0.45735275745391846,
"learning_rate": 2.192734801430454e-06,
"loss": 0.006,
"step": 127000
},
{
"epoch": 23.997741389045736,
"grad_norm": 1.3017574548721313,
"learning_rate": 2.0045172219085264e-06,
"loss": 0.0047,
"step": 127500
},
{
"epoch": 24.091850178806702,
"grad_norm": 0.15063992142677307,
"learning_rate": 1.8162996423865988e-06,
"loss": 0.0037,
"step": 128000
},
{
"epoch": 24.185958968567665,
"grad_norm": 0.09415856748819351,
"learning_rate": 1.6280820628646716e-06,
"loss": 0.0021,
"step": 128500
},
{
"epoch": 24.280067758328627,
"grad_norm": 0.06203685700893402,
"learning_rate": 1.4398644833427442e-06,
"loss": 0.0017,
"step": 129000
},
{
"epoch": 24.37417654808959,
"grad_norm": 1.0590204000473022,
"learning_rate": 1.2516469038208169e-06,
"loss": 0.0016,
"step": 129500
},
{
"epoch": 24.468285337850556,
"grad_norm": 0.05048515647649765,
"learning_rate": 1.0634293242988897e-06,
"loss": 0.0014,
"step": 130000
},
{
"epoch": 24.56239412761152,
"grad_norm": 0.10998225957155228,
"learning_rate": 8.752117447769622e-07,
"loss": 0.0014,
"step": 130500
},
{
"epoch": 24.65650291737248,
"grad_norm": 0.03882027417421341,
"learning_rate": 6.869941652550348e-07,
"loss": 0.0013,
"step": 131000
},
{
"epoch": 24.750611707133448,
"grad_norm": 0.09357800334692001,
"learning_rate": 4.987765857331075e-07,
"loss": 0.0016,
"step": 131500
},
{
"epoch": 24.84472049689441,
"grad_norm": 0.18081815540790558,
"learning_rate": 3.1055900621118013e-07,
"loss": 0.0016,
"step": 132000
},
{
"epoch": 24.938829286655373,
"grad_norm": 0.04123268648982048,
"learning_rate": 1.223414266892528e-07,
"loss": 0.0025,
"step": 132500
},
{
"epoch": 25.0,
"step": 132825,
"total_flos": 2.908084043402707e+18,
"train_loss": 0.022898558901232692,
"train_runtime": 71987.3815,
"train_samples_per_second": 29.519,
"train_steps_per_second": 1.845
}
],
"logging_steps": 500,
"max_steps": 132825,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.908084043402707e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}