|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 25.0, |
|
"eval_steps": 500, |
|
"global_step": 132825, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09410878976096368, |
|
"grad_norm": 7.552914619445801, |
|
"learning_rate": 4.843152017065061e-05, |
|
"loss": 1.9573, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.18821757952192736, |
|
"grad_norm": 10.377354621887207, |
|
"learning_rate": 4.686304034130121e-05, |
|
"loss": 1.6637, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.282326369282891, |
|
"grad_norm": 10.336068153381348, |
|
"learning_rate": 4.529456051195182e-05, |
|
"loss": 1.6903, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3764351590438547, |
|
"grad_norm": 4.373341083526611, |
|
"learning_rate": 4.372608068260242e-05, |
|
"loss": 1.7066, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.47054394880481837, |
|
"grad_norm": 6.67123556137085, |
|
"learning_rate": 4.2157600853253026e-05, |
|
"loss": 1.6262, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.564652738565782, |
|
"grad_norm": 7.045924663543701, |
|
"learning_rate": 4.0589121023903634e-05, |
|
"loss": 1.6133, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.6587615283267457, |
|
"grad_norm": 8.646467208862305, |
|
"learning_rate": 3.902064119455424e-05, |
|
"loss": 1.5958, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.7528703180877094, |
|
"grad_norm": 17.767257690429688, |
|
"learning_rate": 3.745216136520485e-05, |
|
"loss": 1.5926, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.846979107848673, |
|
"grad_norm": 6.851809978485107, |
|
"learning_rate": 3.588368153585545e-05, |
|
"loss": 1.5898, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.9410878976096367, |
|
"grad_norm": 6.367198467254639, |
|
"learning_rate": 3.431520170650606e-05, |
|
"loss": 1.5842, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.0351966873706004, |
|
"grad_norm": 3.8963205814361572, |
|
"learning_rate": 3.274672187715666e-05, |
|
"loss": 1.6023, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.129305477131564, |
|
"grad_norm": 3.492241382598877, |
|
"learning_rate": 3.1178242047807265e-05, |
|
"loss": 1.5835, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.2234142668925279, |
|
"grad_norm": 6.9010396003723145, |
|
"learning_rate": 2.960976221845787e-05, |
|
"loss": 1.5804, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.3175230566534915, |
|
"grad_norm": 7.097581386566162, |
|
"learning_rate": 2.8041282389108477e-05, |
|
"loss": 1.5685, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.411631846414455, |
|
"grad_norm": 4.106161117553711, |
|
"learning_rate": 2.647280255975908e-05, |
|
"loss": 1.5557, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.5057406361754189, |
|
"grad_norm": 5.339535236358643, |
|
"learning_rate": 2.490432273040969e-05, |
|
"loss": 1.5266, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.5998494259363825, |
|
"grad_norm": 5.467094421386719, |
|
"learning_rate": 2.3335842901060293e-05, |
|
"loss": 1.5394, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.693958215697346, |
|
"grad_norm": 5.166747093200684, |
|
"learning_rate": 2.17673630717109e-05, |
|
"loss": 1.5131, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.7880670054583097, |
|
"grad_norm": 3.2604665756225586, |
|
"learning_rate": 2.0198883242361504e-05, |
|
"loss": 1.4695, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.8821757952192735, |
|
"grad_norm": 6.9691362380981445, |
|
"learning_rate": 1.8630403413012108e-05, |
|
"loss": 1.4532, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.9762845849802373, |
|
"grad_norm": 3.044600248336792, |
|
"learning_rate": 1.7061923583662716e-05, |
|
"loss": 1.4349, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.070393374741201, |
|
"grad_norm": 5.485976219177246, |
|
"learning_rate": 1.549344375431332e-05, |
|
"loss": 1.4098, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.1645021645021645, |
|
"grad_norm": 3.2020225524902344, |
|
"learning_rate": 1.3924963924963927e-05, |
|
"loss": 1.4025, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.258610954263128, |
|
"grad_norm": 4.406314849853516, |
|
"learning_rate": 1.235648409561453e-05, |
|
"loss": 1.384, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.3527197440240917, |
|
"grad_norm": 7.23325252532959, |
|
"learning_rate": 1.0788004266265137e-05, |
|
"loss": 1.3831, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.4468285337850557, |
|
"grad_norm": 6.645949840545654, |
|
"learning_rate": 9.219524436915741e-06, |
|
"loss": 1.3611, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.5409373235460193, |
|
"grad_norm": 4.476417064666748, |
|
"learning_rate": 7.651044607566347e-06, |
|
"loss": 1.3478, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.635046113306983, |
|
"grad_norm": 4.497859954833984, |
|
"learning_rate": 6.082564778216952e-06, |
|
"loss": 1.3354, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.7291549030679465, |
|
"grad_norm": 4.2654008865356445, |
|
"learning_rate": 4.514084948867558e-06, |
|
"loss": 1.3229, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.82326369282891, |
|
"grad_norm": 3.607623338699341, |
|
"learning_rate": 2.945605119518163e-06, |
|
"loss": 1.3001, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.917372482589874, |
|
"grad_norm": 3.7838690280914307, |
|
"learning_rate": 1.3771252901687685e-06, |
|
"loss": 1.2967, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.0114812723508377, |
|
"grad_norm": 7.810318946838379, |
|
"learning_rate": 3.494259363824581e-05, |
|
"loss": 1.4147, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 3.1055900621118013, |
|
"grad_norm": 3.653393030166626, |
|
"learning_rate": 3.4472049689440996e-05, |
|
"loss": 1.4375, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 3.199698851872765, |
|
"grad_norm": 6.7401018142700195, |
|
"learning_rate": 3.400150574063617e-05, |
|
"loss": 1.3906, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 3.2938076416337285, |
|
"grad_norm": 4.238823890686035, |
|
"learning_rate": 3.3530961791831364e-05, |
|
"loss": 1.3716, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 3.387916431394692, |
|
"grad_norm": 2.793461561203003, |
|
"learning_rate": 3.306041784302654e-05, |
|
"loss": 1.3764, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 3.4820252211556557, |
|
"grad_norm": 3.1460959911346436, |
|
"learning_rate": 3.2589873894221726e-05, |
|
"loss": 1.3557, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 3.5761340109166198, |
|
"grad_norm": 6.384432792663574, |
|
"learning_rate": 3.2119329945416903e-05, |
|
"loss": 1.3503, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 3.6702428006775834, |
|
"grad_norm": 3.8678534030914307, |
|
"learning_rate": 3.164878599661209e-05, |
|
"loss": 1.3407, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 3.764351590438547, |
|
"grad_norm": 2.776543140411377, |
|
"learning_rate": 3.1178242047807265e-05, |
|
"loss": 1.3593, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 3.8584603801995105, |
|
"grad_norm": 8.332947731018066, |
|
"learning_rate": 3.070769809900245e-05, |
|
"loss": 1.3323, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 3.9525691699604746, |
|
"grad_norm": 5.115898609161377, |
|
"learning_rate": 3.0237154150197627e-05, |
|
"loss": 1.3184, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 4.046677959721438, |
|
"grad_norm": 5.099429607391357, |
|
"learning_rate": 2.9766610201392815e-05, |
|
"loss": 1.3082, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 4.140786749482402, |
|
"grad_norm": 3.901289939880371, |
|
"learning_rate": 2.9296066252587996e-05, |
|
"loss": 1.3038, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 4.234895539243365, |
|
"grad_norm": 3.7559354305267334, |
|
"learning_rate": 2.8825522303783176e-05, |
|
"loss": 1.2892, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 4.329004329004329, |
|
"grad_norm": 4.156054973602295, |
|
"learning_rate": 2.8354978354978357e-05, |
|
"loss": 1.2893, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 4.423113118765293, |
|
"grad_norm": 3.1980538368225098, |
|
"learning_rate": 2.7884434406173538e-05, |
|
"loss": 1.288, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 4.517221908526256, |
|
"grad_norm": 2.7643442153930664, |
|
"learning_rate": 2.741389045736872e-05, |
|
"loss": 1.2666, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 4.61133069828722, |
|
"grad_norm": 2.195343017578125, |
|
"learning_rate": 2.69433465085639e-05, |
|
"loss": 1.2632, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 4.705439488048183, |
|
"grad_norm": 2.6611695289611816, |
|
"learning_rate": 2.647280255975908e-05, |
|
"loss": 1.262, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 4.799548277809148, |
|
"grad_norm": 5.055428981781006, |
|
"learning_rate": 2.6002258610954265e-05, |
|
"loss": 1.2687, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 4.893657067570111, |
|
"grad_norm": 2.993502616882324, |
|
"learning_rate": 2.5531714662149446e-05, |
|
"loss": 1.2439, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 4.987765857331075, |
|
"grad_norm": 2.6108858585357666, |
|
"learning_rate": 2.5061170713344627e-05, |
|
"loss": 1.2315, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 5.081874647092039, |
|
"grad_norm": 3.422744035720825, |
|
"learning_rate": 2.4590626764539808e-05, |
|
"loss": 1.2216, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 5.175983436853002, |
|
"grad_norm": 2.409943103790283, |
|
"learning_rate": 2.412008281573499e-05, |
|
"loss": 1.214, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 5.270092226613966, |
|
"grad_norm": 2.378814458847046, |
|
"learning_rate": 2.3649538866930173e-05, |
|
"loss": 1.206, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 5.364201016374929, |
|
"grad_norm": 2.6459624767303467, |
|
"learning_rate": 2.3178994918125354e-05, |
|
"loss": 1.2078, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 5.458309806135893, |
|
"grad_norm": 3.1709744930267334, |
|
"learning_rate": 2.2708450969320535e-05, |
|
"loss": 1.1908, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 5.552418595896857, |
|
"grad_norm": 3.47601056098938, |
|
"learning_rate": 2.2237907020515716e-05, |
|
"loss": 1.192, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 5.64652738565782, |
|
"grad_norm": 3.5006909370422363, |
|
"learning_rate": 2.17673630717109e-05, |
|
"loss": 1.1913, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 5.740636175418784, |
|
"grad_norm": 2.8024189472198486, |
|
"learning_rate": 2.129681912290608e-05, |
|
"loss": 1.1815, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 5.834744965179748, |
|
"grad_norm": 4.2450947761535645, |
|
"learning_rate": 2.0826275174101262e-05, |
|
"loss": 1.1793, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 5.928853754940711, |
|
"grad_norm": 3.6110267639160156, |
|
"learning_rate": 2.0355731225296443e-05, |
|
"loss": 1.1734, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 6.0229625447016755, |
|
"grad_norm": 4.730632305145264, |
|
"learning_rate": 1.9885187276491627e-05, |
|
"loss": 1.1447, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 6.117071334462639, |
|
"grad_norm": 3.0380797386169434, |
|
"learning_rate": 1.9414643327686808e-05, |
|
"loss": 1.1206, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 6.211180124223603, |
|
"grad_norm": 4.4165358543396, |
|
"learning_rate": 1.894409937888199e-05, |
|
"loss": 1.1295, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 6.305288913984566, |
|
"grad_norm": 3.3100738525390625, |
|
"learning_rate": 1.847355543007717e-05, |
|
"loss": 1.1214, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 6.39939770374553, |
|
"grad_norm": 4.085879325866699, |
|
"learning_rate": 1.8003011481272354e-05, |
|
"loss": 1.1128, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 6.4935064935064934, |
|
"grad_norm": 3.0867068767547607, |
|
"learning_rate": 1.7532467532467535e-05, |
|
"loss": 1.114, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 6.587615283267457, |
|
"grad_norm": 3.721590757369995, |
|
"learning_rate": 1.7061923583662716e-05, |
|
"loss": 1.1037, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 6.681724073028421, |
|
"grad_norm": 3.4121475219726562, |
|
"learning_rate": 1.6591379634857897e-05, |
|
"loss": 1.107, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 6.775832862789384, |
|
"grad_norm": 3.0844948291778564, |
|
"learning_rate": 1.6120835686053078e-05, |
|
"loss": 1.1005, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 6.869941652550348, |
|
"grad_norm": 3.7161357402801514, |
|
"learning_rate": 1.565029173724826e-05, |
|
"loss": 1.0948, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 6.964050442311311, |
|
"grad_norm": 3.217207193374634, |
|
"learning_rate": 1.5179747788443441e-05, |
|
"loss": 1.0877, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 7.058159232072276, |
|
"grad_norm": 4.4285078048706055, |
|
"learning_rate": 1.4709203839638622e-05, |
|
"loss": 1.0559, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 7.1522680218332395, |
|
"grad_norm": 3.0102875232696533, |
|
"learning_rate": 1.4238659890833805e-05, |
|
"loss": 1.0322, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 7.246376811594203, |
|
"grad_norm": 3.4103572368621826, |
|
"learning_rate": 1.3768115942028985e-05, |
|
"loss": 1.0217, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 7.340485601355167, |
|
"grad_norm": 3.9534976482391357, |
|
"learning_rate": 1.3297571993224166e-05, |
|
"loss": 1.0248, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 7.43459439111613, |
|
"grad_norm": 5.0660719871521, |
|
"learning_rate": 1.2827028044419347e-05, |
|
"loss": 1.0233, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 7.528703180877094, |
|
"grad_norm": 4.05812931060791, |
|
"learning_rate": 1.235648409561453e-05, |
|
"loss": 1.0254, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 7.6228119706380575, |
|
"grad_norm": 2.9366817474365234, |
|
"learning_rate": 1.1885940146809712e-05, |
|
"loss": 1.0154, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 7.716920760399021, |
|
"grad_norm": 3.6969943046569824, |
|
"learning_rate": 1.1415396198004895e-05, |
|
"loss": 1.0023, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 7.811029550159985, |
|
"grad_norm": 3.294569969177246, |
|
"learning_rate": 3.4377940899680034e-05, |
|
"loss": 1.0861, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 7.905138339920948, |
|
"grad_norm": 1.961162805557251, |
|
"learning_rate": 3.418972332015811e-05, |
|
"loss": 1.105, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 7.999247129681912, |
|
"grad_norm": 4.648848056793213, |
|
"learning_rate": 3.400150574063617e-05, |
|
"loss": 1.1171, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 8.093355919442876, |
|
"grad_norm": 4.888842582702637, |
|
"learning_rate": 3.381328816111425e-05, |
|
"loss": 1.0555, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 8.187464709203839, |
|
"grad_norm": 3.3256289958953857, |
|
"learning_rate": 3.362507058159232e-05, |
|
"loss": 1.059, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 8.281573498964804, |
|
"grad_norm": 2.744330406188965, |
|
"learning_rate": 3.3436853002070396e-05, |
|
"loss": 1.0818, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 8.375682288725766, |
|
"grad_norm": 2.7452945709228516, |
|
"learning_rate": 3.3248635422548465e-05, |
|
"loss": 1.0772, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 8.46979107848673, |
|
"grad_norm": 2.993236780166626, |
|
"learning_rate": 3.306041784302654e-05, |
|
"loss": 1.0896, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 8.563899868247695, |
|
"grad_norm": 4.128504276275635, |
|
"learning_rate": 3.287220026350462e-05, |
|
"loss": 1.0806, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 8.658008658008658, |
|
"grad_norm": 3.3763933181762695, |
|
"learning_rate": 3.268398268398268e-05, |
|
"loss": 1.0713, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 8.752117447769622, |
|
"grad_norm": 2.213223457336426, |
|
"learning_rate": 3.249576510446076e-05, |
|
"loss": 1.0698, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 8.846226237530585, |
|
"grad_norm": 3.0853641033172607, |
|
"learning_rate": 3.230754752493883e-05, |
|
"loss": 1.0724, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 8.94033502729155, |
|
"grad_norm": 3.031791925430298, |
|
"learning_rate": 3.2119329945416903e-05, |
|
"loss": 1.0672, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 9.034443817052512, |
|
"grad_norm": 3.0765955448150635, |
|
"learning_rate": 3.193111236589498e-05, |
|
"loss": 1.0545, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 9.128552606813477, |
|
"grad_norm": 4.204183101654053, |
|
"learning_rate": 3.174289478637305e-05, |
|
"loss": 1.0037, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 9.22266139657444, |
|
"grad_norm": 5.794823169708252, |
|
"learning_rate": 3.155467720685112e-05, |
|
"loss": 1.0069, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 9.316770186335404, |
|
"grad_norm": 3.246511459350586, |
|
"learning_rate": 3.136645962732919e-05, |
|
"loss": 1.0151, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 9.410878976096367, |
|
"grad_norm": 2.7034826278686523, |
|
"learning_rate": 3.1178242047807265e-05, |
|
"loss": 1.0183, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 9.504987765857331, |
|
"grad_norm": 2.402880907058716, |
|
"learning_rate": 3.099002446828534e-05, |
|
"loss": 1.0193, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 9.599096555618296, |
|
"grad_norm": 5.0139336585998535, |
|
"learning_rate": 3.080180688876341e-05, |
|
"loss": 1.0145, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 9.693205345379258, |
|
"grad_norm": 3.907541275024414, |
|
"learning_rate": 3.061358930924149e-05, |
|
"loss": 1.011, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 9.787314135140223, |
|
"grad_norm": 4.411949157714844, |
|
"learning_rate": 3.042537172971956e-05, |
|
"loss": 1.0063, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 9.881422924901186, |
|
"grad_norm": 4.035211563110352, |
|
"learning_rate": 3.0237154150197627e-05, |
|
"loss": 1.0149, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 9.97553171466215, |
|
"grad_norm": 3.6050124168395996, |
|
"learning_rate": 3.00489365706757e-05, |
|
"loss": 1.0027, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 10.069640504423113, |
|
"grad_norm": 3.40191388130188, |
|
"learning_rate": 2.9860718991153773e-05, |
|
"loss": 0.9468, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 10.163749294184077, |
|
"grad_norm": 2.8000032901763916, |
|
"learning_rate": 2.9672501411631846e-05, |
|
"loss": 0.9248, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 10.25785808394504, |
|
"grad_norm": 4.177460193634033, |
|
"learning_rate": 2.9484283832109923e-05, |
|
"loss": 0.9389, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 10.351966873706004, |
|
"grad_norm": 3.88246750831604, |
|
"learning_rate": 2.9296066252587996e-05, |
|
"loss": 0.9326, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 10.446075663466967, |
|
"grad_norm": 4.002796173095703, |
|
"learning_rate": 2.910784867306607e-05, |
|
"loss": 0.9378, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 10.540184453227932, |
|
"grad_norm": 4.069864749908447, |
|
"learning_rate": 2.8919631093544135e-05, |
|
"loss": 0.9405, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 10.634293242988894, |
|
"grad_norm": 3.493865966796875, |
|
"learning_rate": 2.8731413514022208e-05, |
|
"loss": 0.9347, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 10.728402032749859, |
|
"grad_norm": 3.725019693374634, |
|
"learning_rate": 2.8543195934500284e-05, |
|
"loss": 0.9428, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 10.822510822510823, |
|
"grad_norm": 3.557591438293457, |
|
"learning_rate": 2.8354978354978357e-05, |
|
"loss": 0.9397, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 10.916619612271786, |
|
"grad_norm": 3.370312213897705, |
|
"learning_rate": 2.816676077545643e-05, |
|
"loss": 0.939, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 11.01072840203275, |
|
"grad_norm": 2.937887191772461, |
|
"learning_rate": 2.7978543195934503e-05, |
|
"loss": 0.9225, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 11.104837191793713, |
|
"grad_norm": 2.403350353240967, |
|
"learning_rate": 2.7790325616412576e-05, |
|
"loss": 0.827, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 11.198945981554678, |
|
"grad_norm": 2.4876935482025146, |
|
"learning_rate": 2.7602108036890646e-05, |
|
"loss": 0.8461, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 11.29305477131564, |
|
"grad_norm": 3.663511276245117, |
|
"learning_rate": 2.741389045736872e-05, |
|
"loss": 0.8434, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 11.387163561076605, |
|
"grad_norm": 4.3568806648254395, |
|
"learning_rate": 2.7225672877846792e-05, |
|
"loss": 0.8496, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 11.481272350837568, |
|
"grad_norm": 3.4234845638275146, |
|
"learning_rate": 2.7037455298324865e-05, |
|
"loss": 0.8549, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 11.575381140598532, |
|
"grad_norm": 2.537666082382202, |
|
"learning_rate": 2.6849237718802938e-05, |
|
"loss": 0.8548, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 11.669489930359495, |
|
"grad_norm": 3.7678942680358887, |
|
"learning_rate": 2.666102013928101e-05, |
|
"loss": 0.8501, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 11.76359872012046, |
|
"grad_norm": 3.4384429454803467, |
|
"learning_rate": 2.647280255975908e-05, |
|
"loss": 0.8546, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 11.857707509881424, |
|
"grad_norm": 2.883970022201538, |
|
"learning_rate": 2.6284584980237154e-05, |
|
"loss": 0.8494, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 11.951816299642386, |
|
"grad_norm": 3.2041871547698975, |
|
"learning_rate": 2.6096367400715227e-05, |
|
"loss": 0.8473, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 12.045925089403351, |
|
"grad_norm": 3.7006897926330566, |
|
"learning_rate": 2.59081498211933e-05, |
|
"loss": 0.7851, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 12.140033879164314, |
|
"grad_norm": 3.4774985313415527, |
|
"learning_rate": 2.5719932241671373e-05, |
|
"loss": 0.7241, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 12.234142668925278, |
|
"grad_norm": 3.615403413772583, |
|
"learning_rate": 2.5531714662149446e-05, |
|
"loss": 0.7296, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 12.32825145868624, |
|
"grad_norm": 4.831261157989502, |
|
"learning_rate": 2.534349708262752e-05, |
|
"loss": 0.7382, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 12.422360248447205, |
|
"grad_norm": 3.072159767150879, |
|
"learning_rate": 2.515527950310559e-05, |
|
"loss": 0.733, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 12.516469038208168, |
|
"grad_norm": 3.442324161529541, |
|
"learning_rate": 2.4967061923583665e-05, |
|
"loss": 0.7453, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 12.610577827969133, |
|
"grad_norm": 2.71148419380188, |
|
"learning_rate": 2.4778844344061735e-05, |
|
"loss": 0.7388, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 12.704686617730095, |
|
"grad_norm": 3.6085212230682373, |
|
"learning_rate": 2.4590626764539808e-05, |
|
"loss": 0.7444, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 12.79879540749106, |
|
"grad_norm": 3.9403274059295654, |
|
"learning_rate": 2.440240918501788e-05, |
|
"loss": 0.7291, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 12.892904197252022, |
|
"grad_norm": 3.322840929031372, |
|
"learning_rate": 2.4214191605495954e-05, |
|
"loss": 0.7523, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 12.987012987012987, |
|
"grad_norm": 4.849690914154053, |
|
"learning_rate": 2.4025974025974027e-05, |
|
"loss": 0.7477, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 13.081121776773951, |
|
"grad_norm": 4.5327959060668945, |
|
"learning_rate": 2.38377564464521e-05, |
|
"loss": 0.5957, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 13.175230566534914, |
|
"grad_norm": 2.9526569843292236, |
|
"learning_rate": 2.3649538866930173e-05, |
|
"loss": 0.5829, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 13.269339356295879, |
|
"grad_norm": 2.7245306968688965, |
|
"learning_rate": 2.3461321287408243e-05, |
|
"loss": 0.5948, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 13.363448146056841, |
|
"grad_norm": 3.7524588108062744, |
|
"learning_rate": 2.327310370788632e-05, |
|
"loss": 0.5956, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 13.457556935817806, |
|
"grad_norm": 4.387008190155029, |
|
"learning_rate": 2.3084886128364392e-05, |
|
"loss": 0.605, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 13.551665725578768, |
|
"grad_norm": 3.449723482131958, |
|
"learning_rate": 2.2896668548842462e-05, |
|
"loss": 0.6087, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 13.645774515339733, |
|
"grad_norm": 3.2176854610443115, |
|
"learning_rate": 2.2708450969320535e-05, |
|
"loss": 0.6141, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 13.739883305100696, |
|
"grad_norm": 4.136612415313721, |
|
"learning_rate": 2.2520233389798608e-05, |
|
"loss": 0.6251, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 13.83399209486166, |
|
"grad_norm": 4.145909786224365, |
|
"learning_rate": 2.233201581027668e-05, |
|
"loss": 0.6094, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 13.928100884622623, |
|
"grad_norm": 3.8660261631011963, |
|
"learning_rate": 2.2143798230754754e-05, |
|
"loss": 0.6194, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 14.022209674383587, |
|
"grad_norm": 3.511117696762085, |
|
"learning_rate": 2.1955580651232827e-05, |
|
"loss": 0.5712, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 14.116318464144552, |
|
"grad_norm": 2.771902084350586, |
|
"learning_rate": 2.17673630717109e-05, |
|
"loss": 0.4559, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 14.210427253905515, |
|
"grad_norm": 5.087037086486816, |
|
"learning_rate": 2.157914549218897e-05, |
|
"loss": 0.4529, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 14.304536043666479, |
|
"grad_norm": 4.842881679534912, |
|
"learning_rate": 2.1390927912667043e-05, |
|
"loss": 0.4567, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 14.398644833427442, |
|
"grad_norm": 3.444424629211426, |
|
"learning_rate": 2.120271033314512e-05, |
|
"loss": 0.4618, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 14.492753623188406, |
|
"grad_norm": 3.934549331665039, |
|
"learning_rate": 2.101449275362319e-05, |
|
"loss": 0.455, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 14.586862412949369, |
|
"grad_norm": 4.338078498840332, |
|
"learning_rate": 2.0826275174101262e-05, |
|
"loss": 0.472, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 14.680971202710333, |
|
"grad_norm": 3.781182050704956, |
|
"learning_rate": 2.0638057594579335e-05, |
|
"loss": 0.4613, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 14.775079992471296, |
|
"grad_norm": 3.866917371749878, |
|
"learning_rate": 2.0449840015057405e-05, |
|
"loss": 0.4791, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 14.86918878223226, |
|
"grad_norm": 3.5265841484069824, |
|
"learning_rate": 2.026162243553548e-05, |
|
"loss": 0.4709, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 14.963297571993223, |
|
"grad_norm": 3.3175408840179443, |
|
"learning_rate": 2.0073404856013554e-05, |
|
"loss": 0.4847, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 15.057406361754188, |
|
"grad_norm": 2.848555564880371, |
|
"learning_rate": 1.9885187276491627e-05, |
|
"loss": 0.3788, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 15.151515151515152, |
|
"grad_norm": 3.355653762817383, |
|
"learning_rate": 1.9696969696969697e-05, |
|
"loss": 0.3235, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 15.245623941276115, |
|
"grad_norm": 3.260960817337036, |
|
"learning_rate": 1.950875211744777e-05, |
|
"loss": 0.3162, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 15.33973273103708, |
|
"grad_norm": 3.9132673740386963, |
|
"learning_rate": 1.9320534537925843e-05, |
|
"loss": 0.3209, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 15.433841520798042, |
|
"grad_norm": 4.693389892578125, |
|
"learning_rate": 1.9132316958403916e-05, |
|
"loss": 0.3421, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 15.527950310559007, |
|
"grad_norm": 3.7237417697906494, |
|
"learning_rate": 1.894409937888199e-05, |
|
"loss": 0.3441, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 15.62205910031997, |
|
"grad_norm": 3.9301464557647705, |
|
"learning_rate": 1.8755881799360062e-05, |
|
"loss": 0.3424, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 15.716167890080934, |
|
"grad_norm": 4.186377048492432, |
|
"learning_rate": 1.8567664219838135e-05, |
|
"loss": 0.3503, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 15.810276679841897, |
|
"grad_norm": 3.6368534564971924, |
|
"learning_rate": 1.8379446640316205e-05, |
|
"loss": 0.3347, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 15.904385469602861, |
|
"grad_norm": 2.6787190437316895, |
|
"learning_rate": 1.8191229060794278e-05, |
|
"loss": 0.3445, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 15.998494259363824, |
|
"grad_norm": 3.942444324493408, |
|
"learning_rate": 1.8003011481272354e-05, |
|
"loss": 0.343, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 16.09260304912479, |
|
"grad_norm": 3.3087995052337646, |
|
"learning_rate": 1.7814793901750424e-05, |
|
"loss": 0.2093, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 16.186711838885753, |
|
"grad_norm": 4.419996738433838, |
|
"learning_rate": 1.7626576322228497e-05, |
|
"loss": 0.2209, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 16.280820628646715, |
|
"grad_norm": 4.352416515350342, |
|
"learning_rate": 1.743835874270657e-05, |
|
"loss": 0.2282, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 16.374929418407678, |
|
"grad_norm": 3.7437074184417725, |
|
"learning_rate": 1.725014116318464e-05, |
|
"loss": 0.232, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 16.469038208168644, |
|
"grad_norm": 6.4077534675598145, |
|
"learning_rate": 1.7061923583662716e-05, |
|
"loss": 0.2291, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 16.563146997929607, |
|
"grad_norm": 3.0419087409973145, |
|
"learning_rate": 1.687370600414079e-05, |
|
"loss": 0.2401, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 16.65725578769057, |
|
"grad_norm": 4.095304012298584, |
|
"learning_rate": 1.6685488424618862e-05, |
|
"loss": 0.233, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 16.751364577451533, |
|
"grad_norm": 3.372295618057251, |
|
"learning_rate": 1.649727084509693e-05, |
|
"loss": 0.2354, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 16.8454733672125, |
|
"grad_norm": 2.918405771255493, |
|
"learning_rate": 1.6309053265575005e-05, |
|
"loss": 0.2421, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 16.93958215697346, |
|
"grad_norm": 2.4548110961914062, |
|
"learning_rate": 1.6120835686053078e-05, |
|
"loss": 0.2374, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 17.033690946734424, |
|
"grad_norm": 4.15058708190918, |
|
"learning_rate": 1.593261810653115e-05, |
|
"loss": 0.2135, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 17.12779973649539, |
|
"grad_norm": 2.979170083999634, |
|
"learning_rate": 1.5744400527009224e-05, |
|
"loss": 0.1495, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 17.221908526256353, |
|
"grad_norm": 3.950493574142456, |
|
"learning_rate": 1.5556182947487297e-05, |
|
"loss": 0.1503, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 17.316017316017316, |
|
"grad_norm": 3.174896001815796, |
|
"learning_rate": 1.5367965367965366e-05, |
|
"loss": 0.1559, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 17.41012610577828, |
|
"grad_norm": 3.4672741889953613, |
|
"learning_rate": 1.5179747788443441e-05, |
|
"loss": 0.1606, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 17.504234895539245, |
|
"grad_norm": 3.950160026550293, |
|
"learning_rate": 1.4991530208921514e-05, |
|
"loss": 0.1561, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 17.598343685300208, |
|
"grad_norm": 3.546109199523926, |
|
"learning_rate": 1.4803312629399587e-05, |
|
"loss": 0.1622, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 17.69245247506117, |
|
"grad_norm": 6.592913627624512, |
|
"learning_rate": 1.4615095049877658e-05, |
|
"loss": 0.1644, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 17.786561264822133, |
|
"grad_norm": 2.762545347213745, |
|
"learning_rate": 1.4426877470355732e-05, |
|
"loss": 0.1614, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 17.8806700545831, |
|
"grad_norm": 2.9720265865325928, |
|
"learning_rate": 1.4238659890833805e-05, |
|
"loss": 0.1577, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 17.974778844344062, |
|
"grad_norm": 3.006880283355713, |
|
"learning_rate": 1.4050442311311876e-05, |
|
"loss": 0.1577, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 18.068887634105025, |
|
"grad_norm": 2.86936616897583, |
|
"learning_rate": 1.3862224731789949e-05, |
|
"loss": 0.1162, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 18.162996423865987, |
|
"grad_norm": 3.608914852142334, |
|
"learning_rate": 1.3674007152268024e-05, |
|
"loss": 0.1001, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 18.257105213626954, |
|
"grad_norm": 3.126116991043091, |
|
"learning_rate": 1.3485789572746097e-05, |
|
"loss": 0.1082, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 18.351214003387916, |
|
"grad_norm": 2.9935250282287598, |
|
"learning_rate": 1.3297571993224166e-05, |
|
"loss": 0.109, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 18.44532279314888, |
|
"grad_norm": 1.9776825904846191, |
|
"learning_rate": 1.3109354413702241e-05, |
|
"loss": 0.1031, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 18.539431582909845, |
|
"grad_norm": 4.182958602905273, |
|
"learning_rate": 1.2921136834180314e-05, |
|
"loss": 0.1084, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 18.633540372670808, |
|
"grad_norm": 3.405510902404785, |
|
"learning_rate": 1.2732919254658385e-05, |
|
"loss": 0.1069, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 18.72764916243177, |
|
"grad_norm": 2.7036936283111572, |
|
"learning_rate": 1.2544701675136458e-05, |
|
"loss": 0.1084, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 18.821757952192733, |
|
"grad_norm": 2.601555347442627, |
|
"learning_rate": 1.235648409561453e-05, |
|
"loss": 0.1097, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 18.9158667419537, |
|
"grad_norm": 2.9937756061553955, |
|
"learning_rate": 1.2168266516092605e-05, |
|
"loss": 0.1004, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 19.009975531714662, |
|
"grad_norm": 1.493790864944458, |
|
"learning_rate": 1.1980048936570676e-05, |
|
"loss": 0.1032, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 19.104084321475625, |
|
"grad_norm": 1.7063292264938354, |
|
"learning_rate": 1.1791831357048749e-05, |
|
"loss": 0.0667, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 19.198193111236588, |
|
"grad_norm": 2.8355624675750732, |
|
"learning_rate": 1.1603613777526822e-05, |
|
"loss": 0.0678, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 19.292301900997554, |
|
"grad_norm": 1.7610359191894531, |
|
"learning_rate": 1.1415396198004895e-05, |
|
"loss": 0.0654, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 19.386410690758517, |
|
"grad_norm": 2.759197950363159, |
|
"learning_rate": 1.1227178618482966e-05, |
|
"loss": 0.0717, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 19.48051948051948, |
|
"grad_norm": 2.657435417175293, |
|
"learning_rate": 1.103896103896104e-05, |
|
"loss": 0.0722, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 19.574628270280446, |
|
"grad_norm": 2.5865299701690674, |
|
"learning_rate": 1.0850743459439112e-05, |
|
"loss": 0.0689, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 19.66873706004141, |
|
"grad_norm": 4.484961986541748, |
|
"learning_rate": 1.0662525879917184e-05, |
|
"loss": 0.0706, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 19.76284584980237, |
|
"grad_norm": 2.431190252304077, |
|
"learning_rate": 1.0474308300395258e-05, |
|
"loss": 0.0691, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 19.856954639563334, |
|
"grad_norm": 2.6577088832855225, |
|
"learning_rate": 1.028609072087333e-05, |
|
"loss": 0.0704, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 19.9510634293243, |
|
"grad_norm": 3.1382012367248535, |
|
"learning_rate": 1.0097873141351403e-05, |
|
"loss": 0.0673, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 20.045172219085263, |
|
"grad_norm": 1.5920716524124146, |
|
"learning_rate": 9.909655561829476e-06, |
|
"loss": 0.0605, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 20.139281008846226, |
|
"grad_norm": 0.49824026226997375, |
|
"learning_rate": 9.721437982307547e-06, |
|
"loss": 0.0427, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 20.23338979860719, |
|
"grad_norm": 1.5648902654647827, |
|
"learning_rate": 9.533220402785622e-06, |
|
"loss": 0.042, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 20.327498588368154, |
|
"grad_norm": 2.084714889526367, |
|
"learning_rate": 9.345002823263693e-06, |
|
"loss": 0.0442, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 20.421607378129117, |
|
"grad_norm": 2.1811583042144775, |
|
"learning_rate": 9.156785243741765e-06, |
|
"loss": 0.0424, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 20.51571616789008, |
|
"grad_norm": 3.9864232540130615, |
|
"learning_rate": 8.96856766421984e-06, |
|
"loss": 0.0439, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 20.609824957651046, |
|
"grad_norm": 4.495816707611084, |
|
"learning_rate": 8.78035008469791e-06, |
|
"loss": 0.0433, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 20.70393374741201, |
|
"grad_norm": 1.6480516195297241, |
|
"learning_rate": 8.592132505175984e-06, |
|
"loss": 0.0454, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 20.79804253717297, |
|
"grad_norm": 1.1842634677886963, |
|
"learning_rate": 8.403914925654057e-06, |
|
"loss": 0.0425, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 20.892151326933934, |
|
"grad_norm": 4.60665225982666, |
|
"learning_rate": 8.215697346132128e-06, |
|
"loss": 0.0419, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 20.9862601166949, |
|
"grad_norm": 3.4669153690338135, |
|
"learning_rate": 8.027479766610201e-06, |
|
"loss": 0.0412, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 21.080368906455863, |
|
"grad_norm": 0.7074203491210938, |
|
"learning_rate": 7.839262187088274e-06, |
|
"loss": 0.0285, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 21.174477696216826, |
|
"grad_norm": 2.58770489692688, |
|
"learning_rate": 7.651044607566347e-06, |
|
"loss": 0.0275, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 21.26858648597779, |
|
"grad_norm": 0.6419113874435425, |
|
"learning_rate": 7.462827028044419e-06, |
|
"loss": 0.0263, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 21.362695275738755, |
|
"grad_norm": 2.158191204071045, |
|
"learning_rate": 7.274609448522493e-06, |
|
"loss": 0.0267, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 21.456804065499718, |
|
"grad_norm": 1.1450761556625366, |
|
"learning_rate": 7.0863918690005655e-06, |
|
"loss": 0.024, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 21.55091285526068, |
|
"grad_norm": 0.9204089045524597, |
|
"learning_rate": 6.898174289478637e-06, |
|
"loss": 0.0273, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 21.645021645021647, |
|
"grad_norm": 1.3897809982299805, |
|
"learning_rate": 6.709956709956711e-06, |
|
"loss": 0.0276, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 21.73913043478261, |
|
"grad_norm": 2.818786382675171, |
|
"learning_rate": 6.521739130434783e-06, |
|
"loss": 0.027, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 21.833239224543572, |
|
"grad_norm": 1.8107503652572632, |
|
"learning_rate": 6.333521550912856e-06, |
|
"loss": 0.025, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 21.927348014304535, |
|
"grad_norm": 2.591801881790161, |
|
"learning_rate": 6.145303971390928e-06, |
|
"loss": 0.0254, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 22.0214568040655, |
|
"grad_norm": 1.22541344165802, |
|
"learning_rate": 5.957086391869001e-06, |
|
"loss": 0.0207, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 22.115565593826464, |
|
"grad_norm": 2.6778624057769775, |
|
"learning_rate": 5.768868812347074e-06, |
|
"loss": 0.0148, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 22.209674383587426, |
|
"grad_norm": 1.2167950868606567, |
|
"learning_rate": 5.5806512328251455e-06, |
|
"loss": 0.0134, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 22.30378317334839, |
|
"grad_norm": 0.1222626119852066, |
|
"learning_rate": 5.3924336533032186e-06, |
|
"loss": 0.0175, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 22.397891963109355, |
|
"grad_norm": 0.3822714686393738, |
|
"learning_rate": 5.204216073781292e-06, |
|
"loss": 0.0156, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 22.492000752870318, |
|
"grad_norm": 0.542395293712616, |
|
"learning_rate": 5.015998494259365e-06, |
|
"loss": 0.0141, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 22.58610954263128, |
|
"grad_norm": 1.051392674446106, |
|
"learning_rate": 4.827780914737437e-06, |
|
"loss": 0.0138, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 22.680218332392247, |
|
"grad_norm": 1.7265046834945679, |
|
"learning_rate": 4.639563335215509e-06, |
|
"loss": 0.0139, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 22.77432712215321, |
|
"grad_norm": 0.8157036304473877, |
|
"learning_rate": 4.451345755693582e-06, |
|
"loss": 0.0147, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 22.868435911914172, |
|
"grad_norm": 2.212116003036499, |
|
"learning_rate": 4.263128176171654e-06, |
|
"loss": 0.0157, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 22.962544701675135, |
|
"grad_norm": 1.5762394666671753, |
|
"learning_rate": 4.074910596649727e-06, |
|
"loss": 0.0139, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 23.0566534914361, |
|
"grad_norm": 0.16070736944675446, |
|
"learning_rate": 3.8866930171278e-06, |
|
"loss": 0.0113, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 23.150762281197064, |
|
"grad_norm": 0.6197986602783203, |
|
"learning_rate": 3.698475437605872e-06, |
|
"loss": 0.0066, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 23.244871070958027, |
|
"grad_norm": 0.2145221084356308, |
|
"learning_rate": 3.510257858083945e-06, |
|
"loss": 0.0061, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 23.33897986071899, |
|
"grad_norm": 0.1779479682445526, |
|
"learning_rate": 3.3220402785620177e-06, |
|
"loss": 0.0059, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 23.433088650479956, |
|
"grad_norm": 0.18615959584712982, |
|
"learning_rate": 3.1338226990400907e-06, |
|
"loss": 0.0057, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 23.52719744024092, |
|
"grad_norm": 1.9545246362686157, |
|
"learning_rate": 2.945605119518163e-06, |
|
"loss": 0.0067, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 23.62130623000188, |
|
"grad_norm": 2.255216598510742, |
|
"learning_rate": 2.7573875399962355e-06, |
|
"loss": 0.0062, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 23.715415019762847, |
|
"grad_norm": 0.28629258275032043, |
|
"learning_rate": 2.5691699604743086e-06, |
|
"loss": 0.0054, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 23.80952380952381, |
|
"grad_norm": 0.1650991588830948, |
|
"learning_rate": 2.3809523809523808e-06, |
|
"loss": 0.0056, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 23.903632599284773, |
|
"grad_norm": 0.45735275745391846, |
|
"learning_rate": 2.192734801430454e-06, |
|
"loss": 0.006, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 23.997741389045736, |
|
"grad_norm": 1.3017574548721313, |
|
"learning_rate": 2.0045172219085264e-06, |
|
"loss": 0.0047, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 24.091850178806702, |
|
"grad_norm": 0.15063992142677307, |
|
"learning_rate": 1.8162996423865988e-06, |
|
"loss": 0.0037, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 24.185958968567665, |
|
"grad_norm": 0.09415856748819351, |
|
"learning_rate": 1.6280820628646716e-06, |
|
"loss": 0.0021, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 24.280067758328627, |
|
"grad_norm": 0.06203685700893402, |
|
"learning_rate": 1.4398644833427442e-06, |
|
"loss": 0.0017, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 24.37417654808959, |
|
"grad_norm": 1.0590204000473022, |
|
"learning_rate": 1.2516469038208169e-06, |
|
"loss": 0.0016, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 24.468285337850556, |
|
"grad_norm": 0.05048515647649765, |
|
"learning_rate": 1.0634293242988897e-06, |
|
"loss": 0.0014, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 24.56239412761152, |
|
"grad_norm": 0.10998225957155228, |
|
"learning_rate": 8.752117447769622e-07, |
|
"loss": 0.0014, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 24.65650291737248, |
|
"grad_norm": 0.03882027417421341, |
|
"learning_rate": 6.869941652550348e-07, |
|
"loss": 0.0013, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 24.750611707133448, |
|
"grad_norm": 0.09357800334692001, |
|
"learning_rate": 4.987765857331075e-07, |
|
"loss": 0.0016, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 24.84472049689441, |
|
"grad_norm": 0.18081815540790558, |
|
"learning_rate": 3.1055900621118013e-07, |
|
"loss": 0.0016, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 24.938829286655373, |
|
"grad_norm": 0.04123268648982048, |
|
"learning_rate": 1.223414266892528e-07, |
|
"loss": 0.0025, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"step": 132825, |
|
"total_flos": 2.908084043402707e+18, |
|
"train_loss": 0.022898558901232692, |
|
"train_runtime": 71987.3815, |
|
"train_samples_per_second": 29.519, |
|
"train_steps_per_second": 1.845 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 132825, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 25, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.908084043402707e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|