{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9992108378271736,
  "eval_steps": 500,
  "global_step": 1900,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 21.429368641007674,
      "learning_rate": 7.017543859649123e-08,
      "loss": 1.3492,
      "step": 1
    },
    {
      "epoch": 0.01,
      "grad_norm": 7.31312382727859,
      "learning_rate": 7.017543859649123e-07,
      "loss": 1.3613,
      "step": 10
    },
    {
      "epoch": 0.02,
      "grad_norm": 3.007423000393036,
      "learning_rate": 1.4035087719298246e-06,
      "loss": 1.2748,
      "step": 20
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.0403045598401235,
      "learning_rate": 2.105263157894737e-06,
      "loss": 1.2139,
      "step": 30
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.0964386063482452,
      "learning_rate": 2.8070175438596493e-06,
      "loss": 1.1675,
      "step": 40
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.8673269208557858,
      "learning_rate": 3.5087719298245615e-06,
      "loss": 1.1426,
      "step": 50
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.9321512205829348,
      "learning_rate": 4.210526315789474e-06,
      "loss": 1.1429,
      "step": 60
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.8679633177498763,
      "learning_rate": 4.912280701754386e-06,
      "loss": 1.1172,
      "step": 70
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.8837362567367552,
      "learning_rate": 5.6140350877192985e-06,
      "loss": 1.1265,
      "step": 80
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.8764206258248486,
      "learning_rate": 6.31578947368421e-06,
      "loss": 1.1018,
      "step": 90
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.7946842003727972,
      "learning_rate": 7.017543859649123e-06,
      "loss": 1.0954,
      "step": 100
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.8556672031028142,
      "learning_rate": 7.719298245614036e-06,
      "loss": 1.1026,
      "step": 110
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.9218462405321642,
      "learning_rate": 8.421052631578948e-06,
      "loss": 1.106,
      "step": 120
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.8068032094497395,
      "learning_rate": 9.12280701754386e-06,
      "loss": 1.09,
      "step": 130
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.8339454651398522,
      "learning_rate": 9.824561403508772e-06,
      "loss": 1.0831,
      "step": 140
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.9036107262137951,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 1.0992,
      "step": 150
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.9507432313402697,
      "learning_rate": 1.1228070175438597e-05,
      "loss": 1.0784,
      "step": 160
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.8907962932508753,
      "learning_rate": 1.192982456140351e-05,
      "loss": 1.0945,
      "step": 170
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.9738408667871853,
      "learning_rate": 1.263157894736842e-05,
      "loss": 1.0942,
      "step": 180
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.9062994953351728,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.0805,
      "step": 190
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.876947526057646,
      "learning_rate": 1.4035087719298246e-05,
      "loss": 1.0913,
      "step": 200
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.0277290799105536,
      "learning_rate": 1.4736842105263159e-05,
      "loss": 1.0876,
      "step": 210
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.885527804467122,
      "learning_rate": 1.543859649122807e-05,
      "loss": 1.0815,
      "step": 220
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.9859422411610012,
      "learning_rate": 1.6140350877192984e-05,
      "loss": 1.0908,
      "step": 230
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.0229418982903158,
      "learning_rate": 1.6842105263157896e-05,
      "loss": 1.0791,
      "step": 240
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.8569947258085523,
      "learning_rate": 1.754385964912281e-05,
      "loss": 1.0853,
      "step": 250
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.8783916684369713,
      "learning_rate": 1.824561403508772e-05,
      "loss": 1.0941,
      "step": 260
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.8738383070305185,
      "learning_rate": 1.894736842105263e-05,
      "loss": 1.0838,
      "step": 270
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.9534322549204314,
      "learning_rate": 1.9649122807017544e-05,
      "loss": 1.0959,
      "step": 280
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.9650066100194556,
      "learning_rate": 1.9999812486015525e-05,
      "loss": 1.0874,
      "step": 290
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.9216993889093766,
      "learning_rate": 1.999831241633323e-05,
      "loss": 1.1062,
      "step": 300
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.8877068992817276,
      "learning_rate": 1.9995312501993765e-05,
      "loss": 1.0869,
      "step": 310
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.7994033406223618,
      "learning_rate": 1.9990813193013625e-05,
      "loss": 1.1007,
      "step": 320
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.8897446412058863,
      "learning_rate": 1.9984815164333163e-05,
      "loss": 1.1015,
      "step": 330
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.8778722190072454,
      "learning_rate": 1.997731931571535e-05,
      "loss": 1.0809,
      "step": 340
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.8678685090268201,
      "learning_rate": 1.9968326771610797e-05,
      "loss": 1.0842,
      "step": 350
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.8650925218664476,
      "learning_rate": 1.9957838880989076e-05,
      "loss": 1.0873,
      "step": 360
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.8697128878351399,
      "learning_rate": 1.9945857217136365e-05,
      "loss": 1.09,
      "step": 370
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.8530013045101023,
      "learning_rate": 1.9932383577419432e-05,
      "loss": 1.0796,
      "step": 380
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.8894110888486104,
      "learning_rate": 1.9917419983016025e-05,
      "loss": 1.0865,
      "step": 390
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.741007374421567,
      "learning_rate": 1.9900968678611664e-05,
      "loss": 1.0737,
      "step": 400
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.8681690517112644,
      "learning_rate": 1.9883032132062926e-05,
      "loss": 1.0926,
      "step": 410
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.8833251327405618,
      "learning_rate": 1.9863613034027224e-05,
      "loss": 1.0749,
      "step": 420
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.8981504263742338,
      "learning_rate": 1.9842714297559212e-05,
      "loss": 1.0816,
      "step": 430
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.978688604732774,
      "learning_rate": 1.9820339057673773e-05,
      "loss": 1.0935,
      "step": 440
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.8996937553022554,
      "learning_rate": 1.979649067087574e-05,
      "loss": 1.0826,
      "step": 450
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.8811418408051818,
      "learning_rate": 1.97711727146564e-05,
      "loss": 1.0879,
      "step": 460
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.0101190939103164,
      "learning_rate": 1.9744388986956824e-05,
      "loss": 1.0877,
      "step": 470
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.7613467454313384,
      "learning_rate": 1.971614350559814e-05,
      "loss": 1.0727,
      "step": 480
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.7703159126101999,
      "learning_rate": 1.9686440507678827e-05,
      "loss": 1.0733,
      "step": 490
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.7868241714503517,
      "learning_rate": 1.9655284448939094e-05,
      "loss": 1.0712,
      "step": 500
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.7897482799062521,
      "learning_rate": 1.9622680003092503e-05,
      "loss": 1.0802,
      "step": 510
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.7588882568608744,
      "learning_rate": 1.9588632061124837e-05,
      "loss": 1.0922,
      "step": 520
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.7382743506650484,
      "learning_rate": 1.9553145730560415e-05,
      "loss": 1.077,
      "step": 530
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.8453793493894306,
      "learning_rate": 1.951622633469592e-05,
      "loss": 1.0754,
      "step": 540
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.7110389209689899,
      "learning_rate": 1.9477879411801843e-05,
      "loss": 1.078,
      "step": 550
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.8056008052980671,
      "learning_rate": 1.9438110714291697e-05,
      "loss": 1.0728,
      "step": 560
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.9240547335105442,
      "learning_rate": 1.9396926207859085e-05,
      "loss": 1.0653,
      "step": 570
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.781437426142816,
      "learning_rate": 1.935433207058281e-05,
      "loss": 1.0901,
      "step": 580
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.8686654994863429,
      "learning_rate": 1.9310334692000077e-05,
      "loss": 1.0983,
      "step": 590
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.7772396469297038,
      "learning_rate": 1.9264940672148018e-05,
      "loss": 1.0708,
      "step": 600
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.7651697115565855,
      "learning_rate": 1.9218156820573618e-05,
      "loss": 1.0817,
      "step": 610
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.7565300585910141,
      "learning_rate": 1.916999015531221e-05,
      "loss": 1.0915,
      "step": 620
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.8420338788100721,
      "learning_rate": 1.9120447901834708e-05,
      "loss": 1.0719,
      "step": 630
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.7588794329097819,
      "learning_rate": 1.906953749196371e-05,
      "loss": 1.0698,
      "step": 640
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.8810609158187063,
      "learning_rate": 1.901726656275866e-05,
      "loss": 1.0734,
      "step": 650
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.848783007432763,
      "learning_rate": 1.8963642955370203e-05,
      "loss": 1.0761,
      "step": 660
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.8595204312733032,
      "learning_rate": 1.890867471386395e-05,
      "loss": 1.0751,
      "step": 670
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.8035640869223637,
      "learning_rate": 1.8852370084013783e-05,
      "loss": 1.0674,
      "step": 680
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.7926712110270574,
      "learning_rate": 1.879473751206489e-05,
      "loss": 1.0788,
      "step": 690
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.7472134613955423,
      "learning_rate": 1.8735785643466786e-05,
      "loss": 1.0711,
      "step": 700
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.7535511662219013,
      "learning_rate": 1.867552332157637e-05,
      "loss": 1.0786,
      "step": 710
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.7620302080300618,
      "learning_rate": 1.8613959586331364e-05,
      "loss": 1.0748,
      "step": 720
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.7771293813366091,
      "learning_rate": 1.855110367289421e-05,
      "loss": 1.0734,
      "step": 730
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.7520618932539609,
      "learning_rate": 1.8486965010266726e-05,
      "loss": 1.0843,
      "step": 740
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.7523029137721694,
      "learning_rate": 1.842155321987566e-05,
      "loss": 1.0759,
      "step": 750
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7323944181966383,
      "learning_rate": 1.8354878114129368e-05,
      "loss": 1.0577,
      "step": 760
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.8216142632382807,
      "learning_rate": 1.8286949694945864e-05,
      "loss": 1.063,
      "step": 770
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.9331761680038722,
      "learning_rate": 1.821777815225245e-05,
      "loss": 1.0653,
      "step": 780
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.7463038001394499,
      "learning_rate": 1.8147373862457107e-05,
      "loss": 1.0796,
      "step": 790
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.8497591810239704,
      "learning_rate": 1.807574738689193e-05,
      "loss": 1.0646,
      "step": 800
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.7706758595869727,
      "learning_rate": 1.800290947022884e-05,
      "loss": 1.0751,
      "step": 810
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.7043413482735349,
      "learning_rate": 1.7928871038867785e-05,
      "loss": 1.0608,
      "step": 820
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.8329829504677462,
      "learning_rate": 1.7853643199297632e-05,
      "loss": 1.0809,
      "step": 830
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.749386798044592,
      "learning_rate": 1.777723723643014e-05,
      "loss": 1.0744,
      "step": 840
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.7517374580777705,
      "learning_rate": 1.769966461190707e-05,
      "loss": 1.0588,
      "step": 850
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.9289661993883984,
      "learning_rate": 1.762093696238086e-05,
      "loss": 1.0592,
      "step": 860
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.702861524628992,
      "learning_rate": 1.7541066097768965e-05,
      "loss": 1.0671,
      "step": 870
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.733335804526001,
      "learning_rate": 1.7460063999482314e-05,
      "loss": 1.0718,
      "step": 880
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.7062706365131609,
      "learning_rate": 1.737794281862794e-05,
      "loss": 1.0688,
      "step": 890
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.7356207256793295,
      "learning_rate": 1.729471487418621e-05,
      "loss": 1.0658,
      "step": 900
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.7572566444671317,
      "learning_rate": 1.721039265116285e-05,
      "loss": 1.0655,
      "step": 910
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.735805885071863,
      "learning_rate": 1.7124988798716084e-05,
      "loss": 1.0545,
      "step": 920
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.7251156689638848,
      "learning_rate": 1.7038516128259118e-05,
      "loss": 1.0668,
      "step": 930
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.8306937180459297,
      "learning_rate": 1.695098761153832e-05,
      "loss": 1.0575,
      "step": 940
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.7410203095059583,
      "learning_rate": 1.686241637868734e-05,
      "loss": 1.0608,
      "step": 950
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.0695801973342896,
      "eval_runtime": 1107.1718,
      "eval_samples_per_second": 12.154,
      "eval_steps_per_second": 0.76,
      "step": 950
    },
    {
      "epoch": 1.01,
      "grad_norm": 1.16058288449513,
      "learning_rate": 1.6772815716257414e-05,
      "loss": 0.9274,
      "step": 960
    },
    {
      "epoch": 1.02,
      "grad_norm": 1.0275862927417232,
      "learning_rate": 1.6682199065224307e-05,
      "loss": 0.9092,
      "step": 970
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.8572569275851243,
      "learning_rate": 1.6590580018972012e-05,
      "loss": 0.9214,
      "step": 980
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.8103196292462057,
      "learning_rate": 1.64979723212536e-05,
      "loss": 0.922,
      "step": 990
    },
    {
      "epoch": 1.05,
      "grad_norm": 0.9343043639973103,
      "learning_rate": 1.6404389864129533e-05,
      "loss": 0.9143,
      "step": 1000
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.8577834657562019,
      "learning_rate": 1.6309846685883726e-05,
      "loss": 0.9108,
      "step": 1010
    },
    {
      "epoch": 1.07,
      "grad_norm": 0.7694790034514357,
      "learning_rate": 1.621435696891765e-05,
      "loss": 0.9112,
      "step": 1020
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.7881582670286174,
      "learning_rate": 1.6117935037622848e-05,
      "loss": 0.9166,
      "step": 1030
    },
    {
      "epoch": 1.09,
      "grad_norm": 0.8163445575344302,
      "learning_rate": 1.6020595356232137e-05,
      "loss": 0.9168,
      "step": 1040
    },
    {
      "epoch": 1.1,
      "grad_norm": 0.7808590648116421,
      "learning_rate": 1.5922352526649803e-05,
      "loss": 0.9183,
      "step": 1050
    },
    {
      "epoch": 1.12,
      "grad_norm": 0.8694235628482853,
      "learning_rate": 1.5823221286261217e-05,
      "loss": 0.9295,
      "step": 1060
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.7509497034921186,
      "learning_rate": 1.572321650572205e-05,
      "loss": 0.92,
      "step": 1070
    },
    {
      "epoch": 1.14,
      "grad_norm": 0.8147167421280597,
      "learning_rate": 1.5622353186727542e-05,
      "loss": 0.9341,
      "step": 1080
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.8078856301262012,
      "learning_rate": 1.5520646459762102e-05,
      "loss": 0.9225,
      "step": 1090
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.7215279351968892,
      "learning_rate": 1.5418111581829575e-05,
      "loss": 0.9116,
      "step": 1100
    },
    {
      "epoch": 1.17,
      "grad_norm": 0.7295126696154695,
      "learning_rate": 1.531476393416456e-05,
      "loss": 0.9104,
      "step": 1110
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.8135765592831766,
      "learning_rate": 1.5210619019925066e-05,
      "loss": 0.9315,
      "step": 1120
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.8840016217189187,
      "learning_rate": 1.5105692461866874e-05,
      "loss": 0.9197,
      "step": 1130
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.8104545240762417,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.9159,
      "step": 1140
    },
    {
      "epoch": 1.21,
      "grad_norm": 0.7692322199629117,
      "learning_rate": 1.4893557489227518e-05,
      "loss": 0.9191,
      "step": 1150
    },
    {
      "epoch": 1.22,
      "grad_norm": 0.8223799154663355,
      "learning_rate": 1.478638089696716e-05,
      "loss": 0.9102,
      "step": 1160
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.7443893394953662,
      "learning_rate": 1.467848630075608e-05,
      "loss": 0.9212,
      "step": 1170
    },
    {
      "epoch": 1.24,
      "grad_norm": 0.7900297431396095,
      "learning_rate": 1.456988988583904e-05,
      "loss": 0.9274,
      "step": 1180
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.8344369201219162,
      "learning_rate": 1.4460607942740468e-05,
      "loss": 0.923,
      "step": 1190
    },
    {
      "epoch": 1.26,
      "grad_norm": 0.9319670376335052,
      "learning_rate": 1.4350656864820733e-05,
      "loss": 0.9164,
      "step": 1200
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.7526336939340502,
      "learning_rate": 1.4240053145816968e-05,
      "loss": 0.9157,
      "step": 1210
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.9004879689820842,
      "learning_rate": 1.4128813377368851e-05,
      "loss": 0.9124,
      "step": 1220
    },
    {
      "epoch": 1.29,
      "grad_norm": 0.7963894514570174,
      "learning_rate": 1.4016954246529697e-05,
      "loss": 0.9208,
      "step": 1230
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.867489112759267,
      "learning_rate": 1.3904492533263243e-05,
      "loss": 0.9257,
      "step": 1240
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.7815367295791487,
      "learning_rate": 1.3791445107926478e-05,
      "loss": 0.9182,
      "step": 1250
    },
    {
      "epoch": 1.33,
      "grad_norm": 0.8678431722314973,
      "learning_rate": 1.3677828928738934e-05,
      "loss": 0.9218,
      "step": 1260
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.7941451121772987,
      "learning_rate": 1.3563661039238785e-05,
      "loss": 0.9304,
      "step": 1270
    },
    {
      "epoch": 1.35,
      "grad_norm": 0.7742431211390169,
      "learning_rate": 1.3448958565726144e-05,
      "loss": 0.9234,
      "step": 1280
    },
    {
      "epoch": 1.36,
      "grad_norm": 0.7634049344229958,
      "learning_rate": 1.3333738714693958e-05,
      "loss": 0.9176,
      "step": 1290
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.8108629000712599,
      "learning_rate": 1.3218018770246858e-05,
      "loss": 0.9035,
      "step": 1300
    },
    {
      "epoch": 1.38,
      "grad_norm": 0.747630866975614,
      "learning_rate": 1.3101816091508389e-05,
      "loss": 0.9125,
      "step": 1310
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.7793584345888945,
      "learning_rate": 1.2985148110016947e-05,
      "loss": 0.9128,
      "step": 1320
    },
    {
      "epoch": 1.4,
      "grad_norm": 0.788003717890724,
      "learning_rate": 1.2868032327110904e-05,
      "loss": 0.9156,
      "step": 1330
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.7602070916944652,
      "learning_rate": 1.2750486311303218e-05,
      "loss": 0.9178,
      "step": 1340
    },
    {
      "epoch": 1.42,
      "grad_norm": 0.7526823803299355,
      "learning_rate": 1.2632527695645993e-05,
      "loss": 0.9275,
      "step": 1350
    },
    {
      "epoch": 1.43,
      "grad_norm": 0.7855296800390148,
      "learning_rate": 1.2514174175085346e-05,
      "loss": 0.9248,
      "step": 1360
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.7554953629554423,
      "learning_rate": 1.239544350380699e-05,
      "loss": 0.9173,
      "step": 1370
    },
    {
      "epoch": 1.45,
      "grad_norm": 0.8450238703674462,
      "learning_rate": 1.2276353492572937e-05,
      "loss": 0.9053,
      "step": 1380
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.7798006206733635,
      "learning_rate": 1.2156922006049703e-05,
      "loss": 0.9198,
      "step": 1390
    },
    {
      "epoch": 1.47,
      "grad_norm": 0.8364819432425579,
      "learning_rate": 1.2037166960128443e-05,
      "loss": 0.9223,
      "step": 1400
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.7977694082425033,
      "learning_rate": 1.1917106319237386e-05,
      "loss": 0.9295,
      "step": 1410
    },
    {
      "epoch": 1.49,
      "grad_norm": 0.7952817409309066,
      "learning_rate": 1.1796758093646989e-05,
      "loss": 0.9049,
      "step": 1420
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.7515124212430284,
      "learning_rate": 1.1676140336768236e-05,
      "loss": 0.9161,
      "step": 1430
    },
    {
      "epoch": 1.52,
      "grad_norm": 0.8244844518102562,
      "learning_rate": 1.1555271142444433e-05,
      "loss": 0.9306,
      "step": 1440
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.7369175193722303,
      "learning_rate": 1.1434168642236964e-05,
      "loss": 0.9038,
      "step": 1450
    },
    {
      "epoch": 1.54,
      "grad_norm": 0.8027738354510138,
      "learning_rate": 1.1312851002705383e-05,
      "loss": 0.9158,
      "step": 1460
    },
    {
      "epoch": 1.55,
      "grad_norm": 0.7062360581582109,
      "learning_rate": 1.1191336422682237e-05,
      "loss": 0.9089,
      "step": 1470
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.8268879803104501,
      "learning_rate": 1.1069643130543084e-05,
      "loss": 0.9092,
      "step": 1480
    },
    {
      "epoch": 1.57,
      "grad_norm": 0.8718489443695471,
      "learning_rate": 1.0947789381472035e-05,
      "loss": 0.9198,
      "step": 1490
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.8442661853821835,
      "learning_rate": 1.0825793454723325e-05,
      "loss": 0.9103,
      "step": 1500
    },
    {
      "epoch": 1.59,
      "grad_norm": 0.7829036502503139,
      "learning_rate": 1.0703673650879219e-05,
      "loss": 0.9113,
      "step": 1510
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.8311518421239111,
      "learning_rate": 1.0581448289104759e-05,
      "loss": 0.9136,
      "step": 1520
    },
    {
      "epoch": 1.61,
      "grad_norm": 0.7744326648665285,
      "learning_rate": 1.045913570439972e-05,
      "loss": 0.9151,
      "step": 1530
    },
    {
      "epoch": 1.62,
      "grad_norm": 0.7826514271975142,
      "learning_rate": 1.0336754244848156e-05,
      "loss": 0.9121,
      "step": 1540
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.7486809723755371,
      "learning_rate": 1.0214322268866033e-05,
      "loss": 0.912,
      "step": 1550
    },
    {
      "epoch": 1.64,
      "grad_norm": 0.7664528918396255,
      "learning_rate": 1.0091858142447266e-05,
      "loss": 0.9127,
      "step": 1560
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.7472624299086522,
      "learning_rate": 9.969380236408656e-06,
      "loss": 0.908,
      "step": 1570
    },
    {
      "epoch": 1.66,
      "grad_norm": 0.7555438137387003,
      "learning_rate": 9.846906923634079e-06,
      "loss": 0.9116,
      "step": 1580
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.7650426487217997,
      "learning_rate": 9.724456576318383e-06,
      "loss": 0.9105,
      "step": 1590
    },
    {
      "epoch": 1.68,
      "grad_norm": 0.7159833295095184,
      "learning_rate": 9.602047563211359e-06,
      "loss": 0.9063,
      "step": 1600
    },
    {
      "epoch": 1.69,
      "grad_norm": 0.7715082848797897,
      "learning_rate": 9.479698246862277e-06,
      "loss": 0.8971,
      "step": 1610
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.7894584602411355,
      "learning_rate": 9.3574269808653e-06,
      "loss": 0.9016,
      "step": 1620
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.7317225702124043,
      "learning_rate": 9.23525210710628e-06,
      "loss": 0.8926,
      "step": 1630
    },
    {
      "epoch": 1.73,
      "grad_norm": 0.752304934068834,
      "learning_rate": 9.113191953011287e-06,
      "loss": 0.9186,
      "step": 1640
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.7875568347573355,
      "learning_rate": 8.991264828797319e-06,
      "loss": 0.9127,
      "step": 1650
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.8678707556431731,
      "learning_rate": 8.869489024725595e-06,
      "loss": 0.9058,
      "step": 1660
    },
    {
      "epoch": 1.76,
      "grad_norm": 0.7381008984105847,
      "learning_rate": 8.747882808357828e-06,
      "loss": 0.9021,
      "step": 1670
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.7113811353954564,
      "learning_rate": 8.626464421815919e-06,
      "loss": 0.9098,
      "step": 1680
    },
    {
      "epoch": 1.78,
      "grad_norm": 0.767953884904161,
      "learning_rate": 8.505252079045459e-06,
      "loss": 0.8992,
      "step": 1690
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.7985189315270093,
      "learning_rate": 8.384263963083453e-06,
      "loss": 0.9252,
      "step": 1700
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.7698958288877892,
      "learning_rate": 8.263518223330698e-06,
      "loss": 0.9103,
      "step": 1710
    },
    {
      "epoch": 1.81,
      "grad_norm": 0.7090644308000087,
      "learning_rate": 8.143032972829184e-06,
      "loss": 0.8954,
      "step": 1720
    },
    {
      "epoch": 1.82,
      "grad_norm": 0.7912315130376889,
      "learning_rate": 8.022826285544967e-06,
      "loss": 0.9055,
      "step": 1730
    },
    {
      "epoch": 1.83,
      "grad_norm": 0.7324512866653731,
      "learning_rate": 7.902916193656898e-06,
      "loss": 0.905,
      "step": 1740
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.7403537916049381,
      "learning_rate": 7.783320684851613e-06,
      "loss": 0.9078,
      "step": 1750
    },
    {
      "epoch": 1.85,
      "grad_norm": 0.7614179219478668,
      "learning_rate": 7.664057699625215e-06,
      "loss": 0.8939,
      "step": 1760
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.7487787946919199,
      "learning_rate": 7.545145128592009e-06,
      "loss": 0.9038,
      "step": 1770
    },
    {
      "epoch": 1.87,
      "grad_norm": 0.7493118620205345,
      "learning_rate": 7.426600809800753e-06,
      "loss": 0.8987,
      "step": 1780
    },
    {
      "epoch": 1.88,
      "grad_norm": 0.7660149988776317,
      "learning_rate": 7.308442526058757e-06,
      "loss": 0.9103,
      "step": 1790
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.75782079047253,
      "learning_rate": 7.190688002264308e-06,
      "loss": 0.8889,
      "step": 1800
    },
    {
      "epoch": 1.9,
      "grad_norm": 0.7314903237351683,
      "learning_rate": 7.073354902747742e-06,
      "loss": 0.8971,
      "step": 1810
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.7196432526110507,
      "learning_rate": 6.956460828621641e-06,
      "loss": 0.9006,
      "step": 1820
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.7534330383661013,
      "learning_rate": 6.840023315140476e-06,
      "loss": 0.906,
      "step": 1830
    },
    {
      "epoch": 1.94,
      "grad_norm": 0.7482833703640341,
      "learning_rate": 6.7240598290701585e-06,
      "loss": 0.9039,
      "step": 1840
    },
    {
      "epoch": 1.95,
      "grad_norm": 0.7448122469908642,
      "learning_rate": 6.608587766067853e-06,
      "loss": 0.8922,
      "step": 1850
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.8717367397632231,
      "learning_rate": 6.4936244480724575e-06,
      "loss": 0.9057,
      "step": 1860
    },
    {
      "epoch": 1.97,
      "grad_norm": 0.7109756502386223,
      "learning_rate": 6.379187120706138e-06,
      "loss": 0.9042,
      "step": 1870
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.7373476878859979,
      "learning_rate": 6.265292950687329e-06,
      "loss": 0.9013,
      "step": 1880
    },
    {
      "epoch": 1.99,
      "grad_norm": 0.7521628525759111,
      "learning_rate": 6.151959023255545e-06,
      "loss": 0.8942,
      "step": 1890
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.7784149654946333,
      "learning_rate": 6.039202339608432e-06,
      "loss": 0.9014,
      "step": 1900
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.0405230522155762,
      "eval_runtime": 1106.6966,
      "eval_samples_per_second": 12.16,
      "eval_steps_per_second": 0.761,
      "step": 1900
    }
  ],
  "logging_steps": 10,
  "max_steps": 2850,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 795904416153600.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}