{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 1850,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02702702702702703,
      "grad_norm": 3.48477044162232,
      "learning_rate": 5e-06,
      "loss": 0.8622,
      "step": 10
    },
    {
      "epoch": 0.05405405405405406,
      "grad_norm": 2.360261766882162,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 20
    },
    {
      "epoch": 0.08108108108108109,
      "grad_norm": 2.861337031562045,
      "learning_rate": 5e-06,
      "loss": 0.6496,
      "step": 30
    },
    {
      "epoch": 0.10810810810810811,
      "grad_norm": 2.110955586370757,
      "learning_rate": 5e-06,
      "loss": 0.6379,
      "step": 40
    },
    {
      "epoch": 0.13513513513513514,
      "grad_norm": 2.6603965950552375,
      "learning_rate": 5e-06,
      "loss": 0.6224,
      "step": 50
    },
    {
      "epoch": 0.16216216216216217,
      "grad_norm": 2.3295005659015637,
      "learning_rate": 5e-06,
      "loss": 0.6158,
      "step": 60
    },
    {
      "epoch": 0.1891891891891892,
      "grad_norm": 1.4436743736286393,
      "learning_rate": 5e-06,
      "loss": 0.6121,
      "step": 70
    },
    {
      "epoch": 0.21621621621621623,
      "grad_norm": 1.480185038939668,
      "learning_rate": 5e-06,
      "loss": 0.6048,
      "step": 80
    },
    {
      "epoch": 0.24324324324324326,
      "grad_norm": 1.7946241900816966,
      "learning_rate": 5e-06,
      "loss": 0.6027,
      "step": 90
    },
    {
      "epoch": 0.2702702702702703,
      "grad_norm": 2.953041989671829,
      "learning_rate": 5e-06,
      "loss": 0.6057,
      "step": 100
    },
    {
      "epoch": 0.2972972972972973,
      "grad_norm": 2.3672617838763292,
      "learning_rate": 5e-06,
      "loss": 0.603,
      "step": 110
    },
    {
      "epoch": 0.32432432432432434,
      "grad_norm": 2.570180323617788,
      "learning_rate": 5e-06,
      "loss": 0.5968,
      "step": 120
    },
    {
      "epoch": 0.35135135135135137,
      "grad_norm": 2.4395069709384405,
      "learning_rate": 5e-06,
      "loss": 0.5971,
      "step": 130
    },
    {
      "epoch": 0.3783783783783784,
      "grad_norm": 2.139806105177032,
      "learning_rate": 5e-06,
      "loss": 0.5967,
      "step": 140
    },
    {
      "epoch": 0.40540540540540543,
      "grad_norm": 1.8058881040832764,
      "learning_rate": 5e-06,
      "loss": 0.588,
      "step": 150
    },
    {
      "epoch": 0.43243243243243246,
      "grad_norm": 1.906112223883072,
      "learning_rate": 5e-06,
      "loss": 0.5911,
      "step": 160
    },
    {
      "epoch": 0.4594594594594595,
      "grad_norm": 1.7152010217915816,
      "learning_rate": 5e-06,
      "loss": 0.5929,
      "step": 170
    },
    {
      "epoch": 0.4864864864864865,
      "grad_norm": 1.4638072286809918,
      "learning_rate": 5e-06,
      "loss": 0.592,
      "step": 180
    },
    {
      "epoch": 0.5135135135135135,
      "grad_norm": 1.4052441733855927,
      "learning_rate": 5e-06,
      "loss": 0.5881,
      "step": 190
    },
    {
      "epoch": 0.5405405405405406,
      "grad_norm": 1.3473253327847512,
      "learning_rate": 5e-06,
      "loss": 0.5863,
      "step": 200
    },
    {
      "epoch": 0.5675675675675675,
      "grad_norm": 1.6317553588784817,
      "learning_rate": 5e-06,
      "loss": 0.5888,
      "step": 210
    },
    {
      "epoch": 0.5945945945945946,
      "grad_norm": 1.4742582489032408,
      "learning_rate": 5e-06,
      "loss": 0.5846,
      "step": 220
    },
    {
      "epoch": 0.6216216216216216,
      "grad_norm": 1.3786710575715564,
      "learning_rate": 5e-06,
      "loss": 0.5858,
      "step": 230
    },
    {
      "epoch": 0.6486486486486487,
      "grad_norm": 1.3706145590107932,
      "learning_rate": 5e-06,
      "loss": 0.5823,
      "step": 240
    },
    {
      "epoch": 0.6756756756756757,
      "grad_norm": 2.9406828647293684,
      "learning_rate": 5e-06,
      "loss": 0.5759,
      "step": 250
    },
    {
      "epoch": 0.7027027027027027,
      "grad_norm": 1.874695043037241,
      "learning_rate": 5e-06,
      "loss": 0.5788,
      "step": 260
    },
    {
      "epoch": 0.7297297297297297,
      "grad_norm": 1.6258297787646745,
      "learning_rate": 5e-06,
      "loss": 0.5807,
      "step": 270
    },
    {
      "epoch": 0.7567567567567568,
      "grad_norm": 1.3370174946548334,
      "learning_rate": 5e-06,
      "loss": 0.5753,
      "step": 280
    },
    {
      "epoch": 0.7837837837837838,
      "grad_norm": 1.4879061874672985,
      "learning_rate": 5e-06,
      "loss": 0.5751,
      "step": 290
    },
    {
      "epoch": 0.8108108108108109,
      "grad_norm": 1.4797771423522537,
      "learning_rate": 5e-06,
      "loss": 0.5752,
      "step": 300
    },
    {
      "epoch": 0.8378378378378378,
      "grad_norm": 1.349831755648445,
      "learning_rate": 5e-06,
      "loss": 0.5731,
      "step": 310
    },
    {
      "epoch": 0.8648648648648649,
      "grad_norm": 1.487488462220457,
      "learning_rate": 5e-06,
      "loss": 0.5719,
      "step": 320
    },
    {
      "epoch": 0.8918918918918919,
      "grad_norm": 1.3196199401162108,
      "learning_rate": 5e-06,
      "loss": 0.5736,
      "step": 330
    },
    {
      "epoch": 0.918918918918919,
      "grad_norm": 1.2813534915454334,
      "learning_rate": 5e-06,
      "loss": 0.581,
      "step": 340
    },
    {
      "epoch": 0.9459459459459459,
      "grad_norm": 1.2493996252660173,
      "learning_rate": 5e-06,
      "loss": 0.5726,
      "step": 350
    },
    {
      "epoch": 0.972972972972973,
      "grad_norm": 1.3169592287464336,
      "learning_rate": 5e-06,
      "loss": 0.5717,
      "step": 360
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.6256059905335656,
      "learning_rate": 5e-06,
      "loss": 0.5735,
      "step": 370
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.28570452332496643,
      "eval_runtime": 96.5261,
      "eval_samples_per_second": 103.236,
      "eval_steps_per_second": 0.404,
      "step": 370
    },
    {
      "epoch": 1.027027027027027,
      "grad_norm": 1.7234414441453492,
      "learning_rate": 5e-06,
      "loss": 0.4828,
      "step": 380
    },
    {
      "epoch": 1.054054054054054,
      "grad_norm": 1.542357388344519,
      "learning_rate": 5e-06,
      "loss": 0.4753,
      "step": 390
    },
    {
      "epoch": 1.0810810810810811,
      "grad_norm": 1.4861195617131548,
      "learning_rate": 5e-06,
      "loss": 0.4799,
      "step": 400
    },
    {
      "epoch": 1.1081081081081081,
      "grad_norm": 1.4134885594853346,
      "learning_rate": 5e-06,
      "loss": 0.4728,
      "step": 410
    },
    {
      "epoch": 1.135135135135135,
      "grad_norm": 1.8446040240127095,
      "learning_rate": 5e-06,
      "loss": 0.4775,
      "step": 420
    },
    {
      "epoch": 1.1621621621621623,
      "grad_norm": 1.8467841301436136,
      "learning_rate": 5e-06,
      "loss": 0.484,
      "step": 430
    },
    {
      "epoch": 1.1891891891891893,
      "grad_norm": 2.42665171779706,
      "learning_rate": 5e-06,
      "loss": 0.4749,
      "step": 440
    },
    {
      "epoch": 1.2162162162162162,
      "grad_norm": 1.8049502845392928,
      "learning_rate": 5e-06,
      "loss": 0.48,
      "step": 450
    },
    {
      "epoch": 1.2432432432432432,
      "grad_norm": 2.0761863031810375,
      "learning_rate": 5e-06,
      "loss": 0.4809,
      "step": 460
    },
    {
      "epoch": 1.2702702702702702,
      "grad_norm": 2.0655543660952476,
      "learning_rate": 5e-06,
      "loss": 0.481,
      "step": 470
    },
    {
      "epoch": 1.2972972972972974,
      "grad_norm": 1.6759566925998968,
      "learning_rate": 5e-06,
      "loss": 0.4764,
      "step": 480
    },
    {
      "epoch": 1.3243243243243243,
      "grad_norm": 1.396353193003371,
      "learning_rate": 5e-06,
      "loss": 0.4761,
      "step": 490
    },
    {
      "epoch": 1.3513513513513513,
      "grad_norm": 1.397142398326265,
      "learning_rate": 5e-06,
      "loss": 0.482,
      "step": 500
    },
    {
      "epoch": 1.3783783783783785,
      "grad_norm": 1.5036245472795235,
      "learning_rate": 5e-06,
      "loss": 0.4817,
      "step": 510
    },
    {
      "epoch": 1.4054054054054055,
      "grad_norm": 1.5607076153200972,
      "learning_rate": 5e-06,
      "loss": 0.4861,
      "step": 520
    },
    {
      "epoch": 1.4324324324324325,
      "grad_norm": 1.7026202176545633,
      "learning_rate": 5e-06,
      "loss": 0.4801,
      "step": 530
    },
    {
      "epoch": 1.4594594594594594,
      "grad_norm": 1.694705438287125,
      "learning_rate": 5e-06,
      "loss": 0.4834,
      "step": 540
    },
    {
      "epoch": 1.4864864864864864,
      "grad_norm": 1.8913973826320096,
      "learning_rate": 5e-06,
      "loss": 0.4815,
      "step": 550
    },
    {
      "epoch": 1.5135135135135136,
      "grad_norm": 1.3860850720137203,
      "learning_rate": 5e-06,
      "loss": 0.4853,
      "step": 560
    },
    {
      "epoch": 1.5405405405405406,
      "grad_norm": 1.6545389203896799,
      "learning_rate": 5e-06,
      "loss": 0.4775,
      "step": 570
    },
    {
      "epoch": 1.5675675675675675,
      "grad_norm": 1.6330744148753316,
      "learning_rate": 5e-06,
      "loss": 0.484,
      "step": 580
    },
    {
      "epoch": 1.5945945945945947,
      "grad_norm": 1.749422453289866,
      "learning_rate": 5e-06,
      "loss": 0.4842,
      "step": 590
    },
    {
      "epoch": 1.6216216216216215,
      "grad_norm": 1.4714668834139133,
      "learning_rate": 5e-06,
      "loss": 0.4815,
      "step": 600
    },
    {
      "epoch": 1.6486486486486487,
      "grad_norm": 1.7987332350786904,
      "learning_rate": 5e-06,
      "loss": 0.4803,
      "step": 610
    },
    {
      "epoch": 1.6756756756756757,
      "grad_norm": 1.4458046351621974,
      "learning_rate": 5e-06,
      "loss": 0.4867,
      "step": 620
    },
    {
      "epoch": 1.7027027027027026,
      "grad_norm": 1.5250625163041274,
      "learning_rate": 5e-06,
      "loss": 0.4898,
      "step": 630
    },
    {
      "epoch": 1.7297297297297298,
      "grad_norm": 1.682061525596803,
      "learning_rate": 5e-06,
      "loss": 0.4903,
      "step": 640
    },
    {
      "epoch": 1.7567567567567568,
      "grad_norm": 1.725604337448065,
      "learning_rate": 5e-06,
      "loss": 0.4832,
      "step": 650
    },
    {
      "epoch": 1.7837837837837838,
      "grad_norm": 1.451360087475008,
      "learning_rate": 5e-06,
      "loss": 0.4825,
      "step": 660
    },
    {
      "epoch": 1.810810810810811,
      "grad_norm": 1.6609825444761357,
      "learning_rate": 5e-06,
      "loss": 0.4826,
      "step": 670
    },
    {
      "epoch": 1.8378378378378377,
      "grad_norm": 1.9683314087008068,
      "learning_rate": 5e-06,
      "loss": 0.4864,
      "step": 680
    },
    {
      "epoch": 1.864864864864865,
      "grad_norm": 1.6782355490104197,
      "learning_rate": 5e-06,
      "loss": 0.485,
      "step": 690
    },
    {
      "epoch": 1.8918918918918919,
      "grad_norm": 1.9940863134776443,
      "learning_rate": 5e-06,
      "loss": 0.4884,
      "step": 700
    },
    {
      "epoch": 1.9189189189189189,
      "grad_norm": 1.558943375608973,
      "learning_rate": 5e-06,
      "loss": 0.4926,
      "step": 710
    },
    {
      "epoch": 1.945945945945946,
      "grad_norm": 1.2455247588814846,
      "learning_rate": 5e-06,
      "loss": 0.4855,
      "step": 720
    },
    {
      "epoch": 1.972972972972973,
      "grad_norm": 1.4348648241338484,
      "learning_rate": 5e-06,
      "loss": 0.4845,
      "step": 730
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.2414831162937485,
      "learning_rate": 5e-06,
      "loss": 0.4841,
      "step": 740
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.2870478928089142,
      "eval_runtime": 96.6141,
      "eval_samples_per_second": 103.142,
      "eval_steps_per_second": 0.404,
      "step": 740
    },
    {
      "epoch": 2.027027027027027,
      "grad_norm": 2.327501959947698,
      "learning_rate": 5e-06,
      "loss": 0.3857,
      "step": 750
    },
    {
      "epoch": 2.054054054054054,
      "grad_norm": 1.7853879802404622,
      "learning_rate": 5e-06,
      "loss": 0.3828,
      "step": 760
    },
    {
      "epoch": 2.081081081081081,
      "grad_norm": 1.6129872542969907,
      "learning_rate": 5e-06,
      "loss": 0.3811,
      "step": 770
    },
    {
      "epoch": 2.108108108108108,
      "grad_norm": 1.6822958801213699,
      "learning_rate": 5e-06,
      "loss": 0.379,
      "step": 780
    },
    {
      "epoch": 2.135135135135135,
      "grad_norm": 1.5765921764422512,
      "learning_rate": 5e-06,
      "loss": 0.3804,
      "step": 790
    },
    {
      "epoch": 2.1621621621621623,
      "grad_norm": 1.4901475659604642,
      "learning_rate": 5e-06,
      "loss": 0.3877,
      "step": 800
    },
    {
      "epoch": 2.189189189189189,
      "grad_norm": 1.6201893013251834,
      "learning_rate": 5e-06,
      "loss": 0.3865,
      "step": 810
    },
    {
      "epoch": 2.2162162162162162,
      "grad_norm": 1.5437549027848532,
      "learning_rate": 5e-06,
      "loss": 0.3847,
      "step": 820
    },
    {
      "epoch": 2.2432432432432434,
      "grad_norm": 1.8632952369866993,
      "learning_rate": 5e-06,
      "loss": 0.392,
      "step": 830
    },
    {
      "epoch": 2.27027027027027,
      "grad_norm": 1.7661982815559183,
      "learning_rate": 5e-06,
      "loss": 0.394,
      "step": 840
    },
    {
      "epoch": 2.2972972972972974,
      "grad_norm": 1.7332904607598014,
      "learning_rate": 5e-06,
      "loss": 0.3879,
      "step": 850
    },
    {
      "epoch": 2.3243243243243246,
      "grad_norm": 2.252072555591542,
      "learning_rate": 5e-06,
      "loss": 0.3904,
      "step": 860
    },
    {
      "epoch": 2.3513513513513513,
      "grad_norm": 1.6253901818998244,
      "learning_rate": 5e-06,
      "loss": 0.397,
      "step": 870
    },
    {
      "epoch": 2.3783783783783785,
      "grad_norm": 2.211017287566556,
      "learning_rate": 5e-06,
      "loss": 0.3931,
      "step": 880
    },
    {
      "epoch": 2.4054054054054053,
      "grad_norm": 2.398472470432874,
      "learning_rate": 5e-06,
      "loss": 0.3938,
      "step": 890
    },
    {
      "epoch": 2.4324324324324325,
      "grad_norm": 1.5682213825969358,
      "learning_rate": 5e-06,
      "loss": 0.3916,
      "step": 900
    },
    {
      "epoch": 2.4594594594594597,
      "grad_norm": 2.1044553282329077,
      "learning_rate": 5e-06,
      "loss": 0.3881,
      "step": 910
    },
    {
      "epoch": 2.4864864864864864,
      "grad_norm": 1.7564401087246682,
      "learning_rate": 5e-06,
      "loss": 0.391,
      "step": 920
    },
    {
      "epoch": 2.5135135135135136,
      "grad_norm": 1.6382220101052676,
      "learning_rate": 5e-06,
      "loss": 0.391,
      "step": 930
    },
    {
      "epoch": 2.5405405405405403,
      "grad_norm": 1.4798359424940517,
      "learning_rate": 5e-06,
      "loss": 0.3979,
      "step": 940
    },
    {
      "epoch": 2.5675675675675675,
      "grad_norm": 1.4676265261223125,
      "learning_rate": 5e-06,
      "loss": 0.3963,
      "step": 950
    },
    {
      "epoch": 2.5945945945945947,
      "grad_norm": 1.6295486852060186,
      "learning_rate": 5e-06,
      "loss": 0.399,
      "step": 960
    },
    {
      "epoch": 2.6216216216216215,
      "grad_norm": 1.6192977742906878,
      "learning_rate": 5e-06,
      "loss": 0.4023,
      "step": 970
    },
    {
      "epoch": 2.6486486486486487,
      "grad_norm": 1.4805451072812197,
      "learning_rate": 5e-06,
      "loss": 0.4037,
      "step": 980
    },
    {
      "epoch": 2.6756756756756754,
      "grad_norm": 1.6037076836111144,
      "learning_rate": 5e-06,
      "loss": 0.4037,
      "step": 990
    },
    {
      "epoch": 2.7027027027027026,
      "grad_norm": 1.5225333016382139,
      "learning_rate": 5e-06,
      "loss": 0.3945,
      "step": 1000
    },
    {
      "epoch": 2.72972972972973,
      "grad_norm": 1.6212777115845565,
      "learning_rate": 5e-06,
      "loss": 0.4051,
      "step": 1010
    },
    {
      "epoch": 2.756756756756757,
      "grad_norm": 1.5016437573712755,
      "learning_rate": 5e-06,
      "loss": 0.406,
      "step": 1020
    },
    {
      "epoch": 2.7837837837837838,
      "grad_norm": 1.5902877384401621,
      "learning_rate": 5e-06,
      "loss": 0.4029,
      "step": 1030
    },
    {
      "epoch": 2.810810810810811,
      "grad_norm": 1.3912722358421692,
      "learning_rate": 5e-06,
      "loss": 0.4055,
      "step": 1040
    },
    {
      "epoch": 2.8378378378378377,
      "grad_norm": 1.5492106652733648,
      "learning_rate": 5e-06,
      "loss": 0.4046,
      "step": 1050
    },
    {
      "epoch": 2.864864864864865,
      "grad_norm": 1.5663312579126816,
      "learning_rate": 5e-06,
      "loss": 0.4016,
      "step": 1060
    },
    {
      "epoch": 2.891891891891892,
      "grad_norm": 1.40383114587804,
      "learning_rate": 5e-06,
      "loss": 0.4079,
      "step": 1070
    },
    {
      "epoch": 2.918918918918919,
      "grad_norm": 1.6190187870975588,
      "learning_rate": 5e-06,
      "loss": 0.4049,
      "step": 1080
    },
    {
      "epoch": 2.945945945945946,
      "grad_norm": 1.640627877455668,
      "learning_rate": 5e-06,
      "loss": 0.4065,
      "step": 1090
    },
    {
      "epoch": 2.972972972972973,
      "grad_norm": 1.4897415581729503,
      "learning_rate": 5e-06,
      "loss": 0.4066,
      "step": 1100
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.4711135774031523,
      "learning_rate": 5e-06,
      "loss": 0.408,
      "step": 1110
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.3071748912334442,
      "eval_runtime": 96.698,
      "eval_samples_per_second": 103.053,
      "eval_steps_per_second": 0.403,
      "step": 1110
    },
    {
      "epoch": 3.027027027027027,
      "grad_norm": 2.6131814809186613,
      "learning_rate": 5e-06,
      "loss": 0.2928,
      "step": 1120
    },
    {
      "epoch": 3.054054054054054,
      "grad_norm": 2.2485831495990847,
      "learning_rate": 5e-06,
      "loss": 0.2851,
      "step": 1130
    },
    {
      "epoch": 3.081081081081081,
      "grad_norm": 1.928587031956844,
      "learning_rate": 5e-06,
      "loss": 0.2828,
      "step": 1140
    },
    {
      "epoch": 3.108108108108108,
      "grad_norm": 1.743854950076673,
      "learning_rate": 5e-06,
      "loss": 0.2809,
      "step": 1150
    },
    {
      "epoch": 3.135135135135135,
      "grad_norm": 1.8950206315568565,
      "learning_rate": 5e-06,
      "loss": 0.2814,
      "step": 1160
    },
    {
      "epoch": 3.1621621621621623,
      "grad_norm": 1.9383522044886479,
      "learning_rate": 5e-06,
      "loss": 0.2865,
      "step": 1170
    },
    {
      "epoch": 3.189189189189189,
      "grad_norm": 1.9203849142483256,
      "learning_rate": 5e-06,
      "loss": 0.2851,
      "step": 1180
    },
    {
      "epoch": 3.2162162162162162,
      "grad_norm": 1.9070432540284974,
      "learning_rate": 5e-06,
      "loss": 0.2874,
      "step": 1190
    },
    {
      "epoch": 3.2432432432432434,
      "grad_norm": 1.9642273546651734,
      "learning_rate": 5e-06,
      "loss": 0.2874,
      "step": 1200
    },
    {
      "epoch": 3.27027027027027,
      "grad_norm": 2.371293282521119,
      "learning_rate": 5e-06,
      "loss": 0.2921,
      "step": 1210
    },
    {
      "epoch": 3.2972972972972974,
      "grad_norm": 2.69733157052314,
      "learning_rate": 5e-06,
      "loss": 0.2891,
      "step": 1220
    },
    {
      "epoch": 3.3243243243243246,
      "grad_norm": 2.4462972161470073,
      "learning_rate": 5e-06,
      "loss": 0.287,
      "step": 1230
    },
    {
      "epoch": 3.3513513513513513,
      "grad_norm": 2.09306288923572,
      "learning_rate": 5e-06,
      "loss": 0.2895,
      "step": 1240
    },
    {
      "epoch": 3.3783783783783785,
      "grad_norm": 2.260854207612739,
      "learning_rate": 5e-06,
      "loss": 0.2865,
      "step": 1250
    },
    {
      "epoch": 3.4054054054054053,
      "grad_norm": 2.692817562351766,
      "learning_rate": 5e-06,
      "loss": 0.2899,
      "step": 1260
    },
    {
      "epoch": 3.4324324324324325,
      "grad_norm": 3.125964731548184,
      "learning_rate": 5e-06,
      "loss": 0.2907,
      "step": 1270
    },
    {
      "epoch": 3.4594594594594597,
      "grad_norm": 2.442243961667573,
      "learning_rate": 5e-06,
      "loss": 0.291,
      "step": 1280
    },
    {
      "epoch": 3.4864864864864864,
      "grad_norm": 2.0307566267966344,
      "learning_rate": 5e-06,
      "loss": 0.2934,
      "step": 1290
    },
    {
      "epoch": 3.5135135135135136,
      "grad_norm": 2.098894756875353,
      "learning_rate": 5e-06,
      "loss": 0.2878,
      "step": 1300
    },
    {
      "epoch": 3.5405405405405403,
      "grad_norm": 2.1484942546619985,
      "learning_rate": 5e-06,
      "loss": 0.2918,
      "step": 1310
    },
    {
      "epoch": 3.5675675675675675,
      "grad_norm": 1.8339116955180341,
      "learning_rate": 5e-06,
      "loss": 0.2943,
      "step": 1320
    },
    {
      "epoch": 3.5945945945945947,
      "grad_norm": 1.9887758275282583,
      "learning_rate": 5e-06,
      "loss": 0.2925,
      "step": 1330
    },
    {
      "epoch": 3.6216216216216215,
      "grad_norm": 1.9845485688385418,
      "learning_rate": 5e-06,
      "loss": 0.2955,
      "step": 1340
    },
    {
      "epoch": 3.6486486486486487,
      "grad_norm": 1.969672930852857,
      "learning_rate": 5e-06,
      "loss": 0.2988,
      "step": 1350
    },
    {
      "epoch": 3.6756756756756754,
      "grad_norm": 1.7604665753983029,
      "learning_rate": 5e-06,
      "loss": 0.2954,
      "step": 1360
    },
    {
      "epoch": 3.7027027027027026,
      "grad_norm": 1.8509109071012708,
      "learning_rate": 5e-06,
      "loss": 0.2962,
      "step": 1370
    },
    {
      "epoch": 3.72972972972973,
      "grad_norm": 1.6839851527778709,
      "learning_rate": 5e-06,
      "loss": 0.2991,
      "step": 1380
    },
    {
      "epoch": 3.756756756756757,
      "grad_norm": 1.9195065233001387,
      "learning_rate": 5e-06,
      "loss": 0.2997,
      "step": 1390
    },
    {
      "epoch": 3.7837837837837838,
      "grad_norm": 2.0352568345546262,
      "learning_rate": 5e-06,
      "loss": 0.3012,
      "step": 1400
    },
    {
      "epoch": 3.810810810810811,
      "grad_norm": 1.8158735733043123,
      "learning_rate": 5e-06,
      "loss": 0.3005,
      "step": 1410
    },
    {
      "epoch": 3.8378378378378377,
      "grad_norm": 1.9516953165324797,
      "learning_rate": 5e-06,
      "loss": 0.3022,
      "step": 1420
    },
    {
      "epoch": 3.864864864864865,
      "grad_norm": 1.863346672591352,
      "learning_rate": 5e-06,
      "loss": 0.3018,
      "step": 1430
    },
    {
      "epoch": 3.891891891891892,
      "grad_norm": 1.8751695571197564,
      "learning_rate": 5e-06,
      "loss": 0.3019,
      "step": 1440
    },
    {
      "epoch": 3.918918918918919,
      "grad_norm": 1.9801631273484928,
      "learning_rate": 5e-06,
      "loss": 0.307,
      "step": 1450
    },
    {
      "epoch": 3.945945945945946,
      "grad_norm": 2.003514563060868,
      "learning_rate": 5e-06,
      "loss": 0.3058,
      "step": 1460
    },
    {
      "epoch": 3.972972972972973,
      "grad_norm": 2.0763911612682837,
      "learning_rate": 5e-06,
      "loss": 0.3032,
      "step": 1470
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.336230273567409,
      "learning_rate": 5e-06,
      "loss": 0.3066,
      "step": 1480
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.3498997688293457,
      "eval_runtime": 96.4986,
      "eval_samples_per_second": 103.266,
      "eval_steps_per_second": 0.404,
      "step": 1480
    },
    {
      "epoch": 4.027027027027027,
      "grad_norm": 3.0158703873058617,
      "learning_rate": 5e-06,
      "loss": 0.1986,
      "step": 1490
    },
    {
      "epoch": 4.054054054054054,
      "grad_norm": 2.369544566010332,
      "learning_rate": 5e-06,
      "loss": 0.1914,
      "step": 1500
    },
    {
      "epoch": 4.081081081081081,
      "grad_norm": 2.183015942133232,
      "learning_rate": 5e-06,
      "loss": 0.1906,
      "step": 1510
    },
    {
      "epoch": 4.108108108108108,
      "grad_norm": 2.158995110143315,
      "learning_rate": 5e-06,
      "loss": 0.1894,
      "step": 1520
    },
    {
      "epoch": 4.135135135135135,
      "grad_norm": 2.15897438632279,
      "learning_rate": 5e-06,
      "loss": 0.1923,
      "step": 1530
    },
    {
      "epoch": 4.162162162162162,
      "grad_norm": 2.3012908533453316,
      "learning_rate": 5e-06,
      "loss": 0.1918,
      "step": 1540
    },
    {
      "epoch": 4.1891891891891895,
      "grad_norm": 2.3397981392039133,
      "learning_rate": 5e-06,
      "loss": 0.1955,
      "step": 1550
    },
    {
      "epoch": 4.216216216216216,
      "grad_norm": 2.3318950522877544,
      "learning_rate": 5e-06,
      "loss": 0.1967,
      "step": 1560
    },
    {
      "epoch": 4.243243243243243,
      "grad_norm": 2.2558016269733017,
      "learning_rate": 5e-06,
      "loss": 0.1973,
      "step": 1570
    },
    {
      "epoch": 4.27027027027027,
      "grad_norm": 2.0308981605993557,
      "learning_rate": 5e-06,
      "loss": 0.2001,
      "step": 1580
    },
    {
      "epoch": 4.297297297297297,
      "grad_norm": 2.2105682816818604,
      "learning_rate": 5e-06,
      "loss": 0.1999,
      "step": 1590
    },
    {
      "epoch": 4.324324324324325,
      "grad_norm": 2.0611643054991777,
      "learning_rate": 5e-06,
      "loss": 0.1977,
      "step": 1600
    },
    {
      "epoch": 4.351351351351352,
      "grad_norm": 2.244438029380733,
      "learning_rate": 5e-06,
      "loss": 0.1988,
      "step": 1610
    },
    {
      "epoch": 4.378378378378378,
      "grad_norm": 2.1171487595846683,
      "learning_rate": 5e-06,
      "loss": 0.2016,
      "step": 1620
    },
    {
      "epoch": 4.405405405405405,
      "grad_norm": 2.129014172266156,
      "learning_rate": 5e-06,
      "loss": 0.2045,
      "step": 1630
    },
    {
      "epoch": 4.4324324324324325,
      "grad_norm": 2.0849845815435284,
      "learning_rate": 5e-06,
      "loss": 0.2049,
      "step": 1640
    },
    {
      "epoch": 4.45945945945946,
      "grad_norm": 2.107058909660221,
      "learning_rate": 5e-06,
      "loss": 0.2058,
      "step": 1650
    },
    {
      "epoch": 4.486486486486487,
      "grad_norm": 2.4419601951125274,
      "learning_rate": 5e-06,
      "loss": 0.2063,
      "step": 1660
    },
    {
      "epoch": 4.513513513513513,
      "grad_norm": 2.242862620136703,
      "learning_rate": 5e-06,
      "loss": 0.2079,
      "step": 1670
    },
    {
      "epoch": 4.54054054054054,
      "grad_norm": 2.1964969220779498,
      "learning_rate": 5e-06,
      "loss": 0.2063,
      "step": 1680
    },
    {
      "epoch": 4.5675675675675675,
      "grad_norm": 2.144927893368169,
      "learning_rate": 5e-06,
      "loss": 0.2073,
      "step": 1690
    },
    {
      "epoch": 4.594594594594595,
      "grad_norm": 2.4089173007864986,
      "learning_rate": 5e-06,
      "loss": 0.2064,
      "step": 1700
    },
    {
      "epoch": 4.621621621621622,
      "grad_norm": 2.236620269865094,
      "learning_rate": 5e-06,
      "loss": 0.208,
      "step": 1710
    },
    {
      "epoch": 4.648648648648649,
      "grad_norm": 2.2273265321363964,
      "learning_rate": 5e-06,
      "loss": 0.2091,
      "step": 1720
    },
    {
      "epoch": 4.675675675675675,
      "grad_norm": 2.043537478772687,
      "learning_rate": 5e-06,
      "loss": 0.2083,
      "step": 1730
    },
    {
      "epoch": 4.702702702702703,
      "grad_norm": 2.2566729233438516,
      "learning_rate": 5e-06,
      "loss": 0.2099,
      "step": 1740
    },
    {
      "epoch": 4.72972972972973,
      "grad_norm": 2.3414348548757853,
      "learning_rate": 5e-06,
      "loss": 0.2107,
      "step": 1750
    },
    {
      "epoch": 4.756756756756757,
      "grad_norm": 2.231457361542356,
      "learning_rate": 5e-06,
      "loss": 0.2096,
      "step": 1760
    },
    {
      "epoch": 4.783783783783784,
      "grad_norm": 2.182368419604419,
      "learning_rate": 5e-06,
      "loss": 0.2111,
      "step": 1770
    },
    {
      "epoch": 4.8108108108108105,
      "grad_norm": 2.3022522015009526,
      "learning_rate": 5e-06,
      "loss": 0.2097,
      "step": 1780
    },
    {
      "epoch": 4.837837837837838,
      "grad_norm": 2.154576582443656,
      "learning_rate": 5e-06,
      "loss": 0.2132,
      "step": 1790
    },
    {
      "epoch": 4.864864864864865,
      "grad_norm": 2.185745168624507,
      "learning_rate": 5e-06,
      "loss": 0.212,
      "step": 1800
    },
    {
      "epoch": 4.891891891891892,
      "grad_norm": 2.1651574566981395,
      "learning_rate": 5e-06,
      "loss": 0.2143,
      "step": 1810
    },
    {
      "epoch": 4.918918918918919,
      "grad_norm": 2.2200846915512127,
      "learning_rate": 5e-06,
      "loss": 0.2137,
      "step": 1820
    },
    {
      "epoch": 4.945945945945946,
      "grad_norm": 2.344248992891754,
      "learning_rate": 5e-06,
      "loss": 0.2162,
      "step": 1830
    },
    {
      "epoch": 4.972972972972973,
      "grad_norm": 2.150842096847866,
      "learning_rate": 5e-06,
      "loss": 0.215,
      "step": 1840
    },
    {
      "epoch": 5.0,
      "grad_norm": 2.3030280373590335,
      "learning_rate": 5e-06,
      "loss": 0.2144,
      "step": 1850
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.4100167155265808,
      "eval_runtime": 95.571,
      "eval_samples_per_second": 104.268,
      "eval_steps_per_second": 0.408,
      "step": 1850
    },
    {
      "epoch": 5.0,
      "step": 1850,
      "total_flos": 3097981385441280.0,
      "train_loss": 0.3954814358659693,
      "train_runtime": 27395.2752,
      "train_samples_per_second": 34.555,
      "train_steps_per_second": 0.068
    }
  ],
  "logging_steps": 10,
  "max_steps": 1850,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3097981385441280.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}