|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.9984947315604615, |
|
"eval_steps": 500, |
|
"global_step": 2988, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.010035122930255895, |
|
"grad_norm": 3.0791230568719863, |
|
"learning_rate": 5e-06, |
|
"loss": 1.0584, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02007024586051179, |
|
"grad_norm": 1.9010262387699988, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9369, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.030105368790767688, |
|
"grad_norm": 3.9302140807930486, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8954, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04014049172102358, |
|
"grad_norm": 1.1954643789588726, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8743, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.050175614651279475, |
|
"grad_norm": 1.1581266418383889, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8604, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.060210737581535376, |
|
"grad_norm": 1.5383829915522733, |
|
"learning_rate": 5e-06, |
|
"loss": 0.844, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.07024586051179127, |
|
"grad_norm": 1.5977753412538256, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8329, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08028098344204716, |
|
"grad_norm": 1.5289565466827575, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8265, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09031610637230306, |
|
"grad_norm": 1.3386469754796255, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8147, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10035122930255895, |
|
"grad_norm": 1.0647477486272434, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8113, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.11038635223281486, |
|
"grad_norm": 0.7070463503515779, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8026, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12042147516307075, |
|
"grad_norm": 0.7381734885268878, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7989, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13045659809332663, |
|
"grad_norm": 0.8946901348596374, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8007, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14049172102358254, |
|
"grad_norm": 0.7080206896455782, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7937, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.15052684395383845, |
|
"grad_norm": 0.7872021804288697, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7964, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16056196688409433, |
|
"grad_norm": 0.6344742687953677, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7938, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17059708981435023, |
|
"grad_norm": 0.8040310396952577, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7867, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.1806322127446061, |
|
"grad_norm": 0.5889599293110972, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7868, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.19066733567486202, |
|
"grad_norm": 0.708534731132967, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7854, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2007024586051179, |
|
"grad_norm": 0.590241380971299, |
|
"learning_rate": 5e-06, |
|
"loss": 0.782, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.2107375815353738, |
|
"grad_norm": 0.6386623963841482, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7802, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.22077270446562972, |
|
"grad_norm": 0.8643148756886396, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7766, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.2308078273958856, |
|
"grad_norm": 0.6869876976216545, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7811, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.2408429503261415, |
|
"grad_norm": 0.5947006434799368, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7785, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.2508780732563974, |
|
"grad_norm": 0.5988389120535884, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7736, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.26091319618665326, |
|
"grad_norm": 0.7015845489442423, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7683, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2709483191169092, |
|
"grad_norm": 0.7899101098197423, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7735, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.2809834420471651, |
|
"grad_norm": 0.6594638076973581, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7718, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.29101856497742096, |
|
"grad_norm": 0.7466372083749109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7729, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3010536879076769, |
|
"grad_norm": 0.7256825478194775, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7692, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.31108881083793277, |
|
"grad_norm": 0.6222207642465774, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7664, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.32112393376818865, |
|
"grad_norm": 0.6646085367912792, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7648, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.33115905669844453, |
|
"grad_norm": 0.7893620341431038, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7624, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.34119417962870047, |
|
"grad_norm": 0.6231595108266089, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7714, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.35122930255895635, |
|
"grad_norm": 0.668351154817616, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7632, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3612644254892122, |
|
"grad_norm": 0.6343439838317185, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7626, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.37129954841946816, |
|
"grad_norm": 0.7056146316204847, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7628, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.38133467134972404, |
|
"grad_norm": 0.7902937779981405, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7689, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.3913697942799799, |
|
"grad_norm": 0.6918763236938501, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7592, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4014049172102358, |
|
"grad_norm": 0.7358230335616606, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7577, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.41144004014049174, |
|
"grad_norm": 0.6126046734368374, |
|
"learning_rate": 5e-06, |
|
"loss": 0.761, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4214751630707476, |
|
"grad_norm": 0.6317827551022122, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7598, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.4315102860010035, |
|
"grad_norm": 0.6003042486796623, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7613, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.44154540893125943, |
|
"grad_norm": 0.5703662549001378, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7602, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4515805318615153, |
|
"grad_norm": 0.6096409131095752, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7496, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4616156547917712, |
|
"grad_norm": 0.8305089106013069, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7553, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.47165077772202707, |
|
"grad_norm": 0.5896793508236663, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7503, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.481685900652283, |
|
"grad_norm": 0.6181255276560262, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7573, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.4917210235825389, |
|
"grad_norm": 0.818946770368422, |
|
"learning_rate": 5e-06, |
|
"loss": 0.752, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5017561465127948, |
|
"grad_norm": 0.6056931157441836, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7537, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5117912694430506, |
|
"grad_norm": 0.5810131329440165, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7559, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5218263923733065, |
|
"grad_norm": 0.5475586575226008, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7502, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5318615153035625, |
|
"grad_norm": 0.5857098250554217, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7486, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5418966382338184, |
|
"grad_norm": 0.71215741030445, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7453, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5519317611640743, |
|
"grad_norm": 0.6801576099304811, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7476, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5619668840943302, |
|
"grad_norm": 0.7375590297607938, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7511, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.572002007024586, |
|
"grad_norm": 0.6187827311828052, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7484, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.5820371299548419, |
|
"grad_norm": 0.5878218056763826, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7441, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.5920722528850978, |
|
"grad_norm": 0.5969510290233113, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7462, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6021073758153538, |
|
"grad_norm": 0.5535464540372343, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7497, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6121424987456097, |
|
"grad_norm": 0.5509670875952559, |
|
"learning_rate": 5e-06, |
|
"loss": 0.747, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6221776216758655, |
|
"grad_norm": 0.558690698251435, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7433, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6322127446061214, |
|
"grad_norm": 0.559060237211832, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7482, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6422478675363773, |
|
"grad_norm": 0.993216287837658, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7482, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.6522829904666332, |
|
"grad_norm": 0.8850325099442093, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7428, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.6623181133968891, |
|
"grad_norm": 0.6458169799733141, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7454, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.672353236327145, |
|
"grad_norm": 0.5757378815184032, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7462, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.6823883592574009, |
|
"grad_norm": 0.8278456155470433, |
|
"learning_rate": 5e-06, |
|
"loss": 0.742, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.6924234821876568, |
|
"grad_norm": 0.8087863092750499, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7392, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7024586051179127, |
|
"grad_norm": 0.5920912292564408, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7425, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7124937280481686, |
|
"grad_norm": 0.6733638944211415, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7408, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7225288509784245, |
|
"grad_norm": 0.6330098467703786, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7424, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.7325639739086803, |
|
"grad_norm": 0.7212451311927113, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7429, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7425990968389363, |
|
"grad_norm": 0.7896101501841413, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7419, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7526342197691922, |
|
"grad_norm": 0.8026558529242067, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7383, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.7626693426994481, |
|
"grad_norm": 0.5426060774366821, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7425, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.772704465629704, |
|
"grad_norm": 0.8110802533740097, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7406, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.7827395885599598, |
|
"grad_norm": 0.5470009049474683, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7413, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.7927747114902157, |
|
"grad_norm": 0.5433159754299082, |
|
"learning_rate": 5e-06, |
|
"loss": 0.737, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8028098344204716, |
|
"grad_norm": 0.5469593174447279, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7381, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.8128449573507276, |
|
"grad_norm": 0.5831597485374533, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7374, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8228800802809835, |
|
"grad_norm": 0.5117459329458333, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7386, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8329152032112394, |
|
"grad_norm": 0.6427232877089865, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7356, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8429503261414952, |
|
"grad_norm": 0.5821269785394396, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7347, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.8529854490717511, |
|
"grad_norm": 0.7120331561544883, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7363, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.863020572002007, |
|
"grad_norm": 0.6454042518783881, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7371, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.8730556949322629, |
|
"grad_norm": 0.5420399494529005, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7349, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.8830908178625189, |
|
"grad_norm": 0.5989777798696867, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7346, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.8931259407927747, |
|
"grad_norm": 0.5399005419446141, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7333, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.9031610637230306, |
|
"grad_norm": 0.5484451725519497, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7369, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9131961866532865, |
|
"grad_norm": 0.6023327403789546, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7332, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9232313095835424, |
|
"grad_norm": 0.570469676089088, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7332, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.9332664325137983, |
|
"grad_norm": 0.5358735783509466, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7364, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9433015554440541, |
|
"grad_norm": 0.596570834196872, |
|
"learning_rate": 5e-06, |
|
"loss": 0.734, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.9533366783743101, |
|
"grad_norm": 0.8858750188622682, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7299, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.963371801304566, |
|
"grad_norm": 0.7057998358035058, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7296, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.9734069242348219, |
|
"grad_norm": 0.6567689691351051, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7383, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.9834420471650778, |
|
"grad_norm": 0.5723166821895646, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7329, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.9934771700953337, |
|
"grad_norm": 0.5989872298413202, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7331, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.9994982438534872, |
|
"eval_loss": 0.7304001450538635, |
|
"eval_runtime": 533.1509, |
|
"eval_samples_per_second": 50.359, |
|
"eval_steps_per_second": 0.394, |
|
"step": 996 |
|
}, |
|
{ |
|
"epoch": 1.0035122930255895, |
|
"grad_norm": 0.7229493528687063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.754, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0135474159558455, |
|
"grad_norm": 0.7232729261961888, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6938, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.0235825388861013, |
|
"grad_norm": 0.5558461578633823, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6963, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.0336176618163573, |
|
"grad_norm": 0.7260986137213504, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6892, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.043652784746613, |
|
"grad_norm": 0.5639681058736989, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6922, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.053687907676869, |
|
"grad_norm": 0.7762004415565651, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6893, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.063723030607125, |
|
"grad_norm": 0.5656634035888263, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6903, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.0737581535373808, |
|
"grad_norm": 0.5622021467734256, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6975, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.0837932764676368, |
|
"grad_norm": 0.5747675890837022, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6953, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.0938283993978926, |
|
"grad_norm": 0.7477336699942959, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6924, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.1038635223281485, |
|
"grad_norm": 0.6002636077929234, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6933, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.1138986452584043, |
|
"grad_norm": 0.5094594079799682, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6909, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.1239337681886603, |
|
"grad_norm": 0.7393831172107309, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6962, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.1339688911189163, |
|
"grad_norm": 0.8841355043993067, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6909, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.144004014049172, |
|
"grad_norm": 0.6844280192067772, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6909, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.154039136979428, |
|
"grad_norm": 0.5968326130517227, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6959, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.1640742599096838, |
|
"grad_norm": 0.6164015759188082, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6898, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.1741093828399398, |
|
"grad_norm": 0.6358359825338351, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6944, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.1841445057701956, |
|
"grad_norm": 0.6829806654257828, |
|
"learning_rate": 5e-06, |
|
"loss": 0.697, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.1941796287004516, |
|
"grad_norm": 0.794769579664283, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6983, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.2042147516307076, |
|
"grad_norm": 0.5535487901068424, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6917, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.2142498745609633, |
|
"grad_norm": 0.5121320501008007, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6931, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.2242849974912193, |
|
"grad_norm": 0.77177684811206, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6959, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.234320120421475, |
|
"grad_norm": 0.5890956733651708, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6909, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.244355243351731, |
|
"grad_norm": 0.6143309157442813, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6928, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.2543903662819869, |
|
"grad_norm": 0.5522142360609605, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6947, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 1.2644254892122428, |
|
"grad_norm": 0.49945028630806676, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6948, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.2744606121424988, |
|
"grad_norm": 0.6318729529808568, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6948, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 1.2844957350727546, |
|
"grad_norm": 0.571168433475203, |
|
"learning_rate": 5e-06, |
|
"loss": 0.691, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 1.2945308580030106, |
|
"grad_norm": 0.586240165635217, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6867, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.3045659809332664, |
|
"grad_norm": 0.5371646553547392, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6959, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.3146011038635224, |
|
"grad_norm": 0.7305933258706494, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6965, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.3246362267937783, |
|
"grad_norm": 0.5749853775138927, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6916, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.3346713497240341, |
|
"grad_norm": 0.5364405695908858, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6879, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.3447064726542899, |
|
"grad_norm": 0.6583337401074435, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6941, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.3547415955845459, |
|
"grad_norm": 0.5510351142365912, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6881, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.3647767185148019, |
|
"grad_norm": 0.6470013078598107, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6963, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.3748118414450576, |
|
"grad_norm": 0.6435465190281996, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6848, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.3848469643753136, |
|
"grad_norm": 0.5592954158689228, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6933, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.3948820873055694, |
|
"grad_norm": 0.5565093643287172, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6956, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.4049172102358254, |
|
"grad_norm": 0.5429384088118309, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6913, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.4149523331660814, |
|
"grad_norm": 0.5259873486414379, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6905, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.4249874560963371, |
|
"grad_norm": 0.5791809050097347, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6913, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.4350225790265931, |
|
"grad_norm": 0.5409946987555362, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6944, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.445057701956849, |
|
"grad_norm": 0.6258615837964843, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6906, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.455092824887105, |
|
"grad_norm": 0.5255664654202296, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6915, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.4651279478173609, |
|
"grad_norm": 0.5157724202212534, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6911, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.4751630707476167, |
|
"grad_norm": 0.6020158761130551, |
|
"learning_rate": 5e-06, |
|
"loss": 0.692, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.4851981936778724, |
|
"grad_norm": 0.5268622104694118, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6884, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.4952333166081284, |
|
"grad_norm": 0.6743718354665602, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6886, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.5052684395383844, |
|
"grad_norm": 0.5526860667382173, |
|
"learning_rate": 5e-06, |
|
"loss": 0.695, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.5153035624686404, |
|
"grad_norm": 0.5659897839907012, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6962, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.5253386853988962, |
|
"grad_norm": 0.566194976984101, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6953, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.535373808329152, |
|
"grad_norm": 0.6884023263714835, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6912, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.545408931259408, |
|
"grad_norm": 0.6152652274544539, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6881, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.555444054189664, |
|
"grad_norm": 0.7014542676012229, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6874, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.5654791771199197, |
|
"grad_norm": 0.765002797379268, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6923, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.5755143000501755, |
|
"grad_norm": 0.5516453996184308, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6896, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.5855494229804314, |
|
"grad_norm": 0.5432337667581397, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6903, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.5955845459106874, |
|
"grad_norm": 0.5346527907725022, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6917, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.6056196688409434, |
|
"grad_norm": 0.5603099472061689, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6913, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.6156547917711992, |
|
"grad_norm": 0.6065488834635239, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6855, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.625689914701455, |
|
"grad_norm": 0.5274199389654457, |
|
"learning_rate": 5e-06, |
|
"loss": 0.689, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.635725037631711, |
|
"grad_norm": 0.5120395521998542, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6917, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.645760160561967, |
|
"grad_norm": 0.5519879159635278, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6844, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.655795283492223, |
|
"grad_norm": 0.504366123389348, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6894, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.6658304064224787, |
|
"grad_norm": 0.5435020519283282, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6985, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.6758655293527345, |
|
"grad_norm": 0.6023059143733316, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6899, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.6859006522829905, |
|
"grad_norm": 0.6127499530451496, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6901, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.6959357752132465, |
|
"grad_norm": 0.584858369961145, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6926, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.7059708981435022, |
|
"grad_norm": 0.6802752379981577, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6889, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.716006021073758, |
|
"grad_norm": 0.6145503442957706, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6938, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.726041144004014, |
|
"grad_norm": 0.5806125711805925, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6892, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.73607626693427, |
|
"grad_norm": 0.5164505361265845, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6876, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.746111389864526, |
|
"grad_norm": 0.518409684362953, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6914, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.7561465127947817, |
|
"grad_norm": 0.5798099508033144, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6936, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.7661816357250375, |
|
"grad_norm": 0.5397031230350496, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6884, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.7762167586552935, |
|
"grad_norm": 0.5101108640235097, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6861, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.7862518815855495, |
|
"grad_norm": 0.5577428782679603, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6884, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.7962870045158055, |
|
"grad_norm": 0.5088658014753255, |
|
"learning_rate": 5e-06, |
|
"loss": 0.696, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.8063221274460612, |
|
"grad_norm": 0.5834405015876643, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6936, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.816357250376317, |
|
"grad_norm": 0.5223134455064544, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6894, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.826392373306573, |
|
"grad_norm": 0.5218318397593602, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6901, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.836427496236829, |
|
"grad_norm": 0.6595429027005055, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6878, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.8464626191670848, |
|
"grad_norm": 0.5041069952524779, |
|
"learning_rate": 5e-06, |
|
"loss": 0.691, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.8564977420973405, |
|
"grad_norm": 0.6147969409931642, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6925, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.8665328650275965, |
|
"grad_norm": 0.5123520217070617, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6864, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.8765679879578525, |
|
"grad_norm": 0.5168471950711845, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6885, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.8866031108881085, |
|
"grad_norm": 0.5930018846461471, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6861, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.8966382338183643, |
|
"grad_norm": 0.5616874309006329, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6895, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.90667335674862, |
|
"grad_norm": 0.5473178004684522, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6891, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.916708479678876, |
|
"grad_norm": 0.498555944884513, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6867, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.926743602609132, |
|
"grad_norm": 0.5781525233352894, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6936, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.936778725539388, |
|
"grad_norm": 0.6304670711001885, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6882, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.9468138484696438, |
|
"grad_norm": 0.59342699382766, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6875, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.9568489713998996, |
|
"grad_norm": 0.6911703835056067, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6879, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.9668840943301555, |
|
"grad_norm": 0.7568921959517525, |
|
"learning_rate": 5e-06, |
|
"loss": 0.687, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.9769192172604115, |
|
"grad_norm": 0.541514579573604, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6891, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.9869543401906673, |
|
"grad_norm": 0.5249988740865994, |
|
"learning_rate": 5e-06, |
|
"loss": 0.688, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.996989463120923, |
|
"grad_norm": 0.518981463681463, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6916, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.7170566320419312, |
|
"eval_runtime": 534.2939, |
|
"eval_samples_per_second": 50.251, |
|
"eval_steps_per_second": 0.393, |
|
"step": 1993 |
|
}, |
|
{ |
|
"epoch": 2.007024586051179, |
|
"grad_norm": 0.6758178418746282, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6927, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.017059708981435, |
|
"grad_norm": 0.6409143667352257, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6458, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.027094831911691, |
|
"grad_norm": 0.6399574034808028, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6452, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.037129954841947, |
|
"grad_norm": 0.6398494060833938, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6464, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.0471650777722026, |
|
"grad_norm": 0.6225102571911315, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6441, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.0572002007024586, |
|
"grad_norm": 0.607220858160029, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6488, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.0672353236327146, |
|
"grad_norm": 0.6296613660669085, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6469, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.0772704465629706, |
|
"grad_norm": 0.6997962877873833, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6442, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.087305569493226, |
|
"grad_norm": 0.5565086168492744, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6501, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.097340692423482, |
|
"grad_norm": 0.5570529145692124, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6509, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 2.107375815353738, |
|
"grad_norm": 0.5387202610597639, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6476, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.117410938283994, |
|
"grad_norm": 0.5555720032343792, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6451, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 2.12744606121425, |
|
"grad_norm": 0.549298003440315, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6452, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 2.1374811841445056, |
|
"grad_norm": 0.7025357677002765, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6501, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 2.1475163070747616, |
|
"grad_norm": 0.6933827951492744, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6483, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 2.1575514300050176, |
|
"grad_norm": 0.5931050285459404, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6488, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 2.1675865529352736, |
|
"grad_norm": 0.5619532741142755, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6448, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 2.177621675865529, |
|
"grad_norm": 0.5451448701863834, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6444, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.187656798795785, |
|
"grad_norm": 0.48428533410635616, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6495, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 2.197691921726041, |
|
"grad_norm": 0.5479783043613699, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6525, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.207727044656297, |
|
"grad_norm": 0.5880029735313238, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6492, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.217762167586553, |
|
"grad_norm": 0.518071247803138, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6531, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.2277972905168086, |
|
"grad_norm": 0.5525479511919598, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6486, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.2378324134470646, |
|
"grad_norm": 0.5930736784074986, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6477, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.2478675363773206, |
|
"grad_norm": 0.5191764875817738, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6476, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.2579026593075766, |
|
"grad_norm": 0.5226745835573395, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6468, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.2679377822378326, |
|
"grad_norm": 0.594787801985729, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6493, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.277972905168088, |
|
"grad_norm": 0.6593644795439105, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6494, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.288008028098344, |
|
"grad_norm": 0.5809194962718293, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6488, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.2980431510286, |
|
"grad_norm": 0.5769100613769682, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6443, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.308078273958856, |
|
"grad_norm": 0.5798022574438232, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6511, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.318113396889112, |
|
"grad_norm": 0.5500840976868079, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6521, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.3281485198193677, |
|
"grad_norm": 0.557151649596109, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6499, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.3381836427496236, |
|
"grad_norm": 0.5206053660885596, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6497, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.3482187656798796, |
|
"grad_norm": 0.5151956126043011, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6465, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.3582538886101356, |
|
"grad_norm": 0.6243860622771431, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6471, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.368289011540391, |
|
"grad_norm": 0.5551415160151735, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6499, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.378324134470647, |
|
"grad_norm": 0.5399591771385078, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6542, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.388359257400903, |
|
"grad_norm": 0.7051292001377757, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6519, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.398394380331159, |
|
"grad_norm": 0.7249734533415666, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6453, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.408429503261415, |
|
"grad_norm": 0.5667898165098878, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6502, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.4184646261916707, |
|
"grad_norm": 0.5714404629239772, |
|
"learning_rate": 5e-06, |
|
"loss": 0.651, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.4284997491219267, |
|
"grad_norm": 0.5776379885611598, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6533, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.4385348720521827, |
|
"grad_norm": 0.5720433924252879, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6495, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.4485699949824387, |
|
"grad_norm": 0.5385010214829424, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6535, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.458605117912694, |
|
"grad_norm": 0.7858581581353575, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6501, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.46864024084295, |
|
"grad_norm": 0.5254373337828374, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6514, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.478675363773206, |
|
"grad_norm": 0.5927951089454341, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6567, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.488710486703462, |
|
"grad_norm": 0.576333054967198, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6468, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.498745609633718, |
|
"grad_norm": 0.5997091587436316, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6494, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.5087807325639737, |
|
"grad_norm": 0.5774761407070679, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6575, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.5188158554942297, |
|
"grad_norm": 0.5697277316039128, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6529, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.5288509784244857, |
|
"grad_norm": 0.5523271421741578, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6486, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.5388861013547417, |
|
"grad_norm": 0.5308441673869406, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6479, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.5489212242849977, |
|
"grad_norm": 0.611658706018765, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6515, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.5589563472152532, |
|
"grad_norm": 0.5882952161403756, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6529, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.568991470145509, |
|
"grad_norm": 0.5944397200737397, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6486, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.579026593075765, |
|
"grad_norm": 0.6211121511651304, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6511, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.589061716006021, |
|
"grad_norm": 0.5687379330786366, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6599, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.599096838936277, |
|
"grad_norm": 0.5264826150946973, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6475, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.6091319618665327, |
|
"grad_norm": 0.546285649845265, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6531, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.6191670847967887, |
|
"grad_norm": 0.5016403997355949, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6532, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.6292022077270447, |
|
"grad_norm": 0.514946202260049, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6476, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.6392373306573007, |
|
"grad_norm": 0.5768998926592247, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6499, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.6492724535875567, |
|
"grad_norm": 0.56824673619337, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6515, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.6593075765178122, |
|
"grad_norm": 0.5743894544574858, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6528, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.6693426994480682, |
|
"grad_norm": 0.6566059771585782, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6482, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.6793778223783242, |
|
"grad_norm": 0.6222476495314778, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6521, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.6894129453085798, |
|
"grad_norm": 0.5352433226802004, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6511, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.6994480682388358, |
|
"grad_norm": 0.5593740996298296, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6502, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.7094831911690918, |
|
"grad_norm": 0.5870512541625679, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6509, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.7195183140993477, |
|
"grad_norm": 0.5194458048924404, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6522, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.7295534370296037, |
|
"grad_norm": 0.5207337998079029, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6513, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.7395885599598593, |
|
"grad_norm": 0.543790751828564, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6551, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.7496236828901153, |
|
"grad_norm": 0.5863597318330472, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6545, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.7596588058203713, |
|
"grad_norm": 0.5388170321853544, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6538, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.7696939287506273, |
|
"grad_norm": 0.5742236049971658, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6583, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.7797290516808832, |
|
"grad_norm": 0.5384373778506647, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6545, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.789764174611139, |
|
"grad_norm": 0.5345134774106539, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6508, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.799799297541395, |
|
"grad_norm": 0.6099706934146881, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6534, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.8098344204716508, |
|
"grad_norm": 0.7684214994129063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6552, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.8198695434019068, |
|
"grad_norm": 0.6812429404931887, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6541, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.8299046663321628, |
|
"grad_norm": 0.5551480308045381, |
|
"learning_rate": 5e-06, |
|
"loss": 0.648, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.8399397892624183, |
|
"grad_norm": 0.5783011582533801, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6538, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.8499749121926743, |
|
"grad_norm": 0.6083930556812827, |
|
"learning_rate": 5e-06, |
|
"loss": 0.652, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.8600100351229303, |
|
"grad_norm": 0.6412044060911565, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6574, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.8700451580531863, |
|
"grad_norm": 0.6641263630618335, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6533, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.8800802809834423, |
|
"grad_norm": 0.7128854398954438, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6516, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.890115403913698, |
|
"grad_norm": 0.6732370461622078, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6542, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.900150526843954, |
|
"grad_norm": 0.5615551950157813, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6517, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.91018564977421, |
|
"grad_norm": 0.5360864764156098, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6529, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.920220772704466, |
|
"grad_norm": 0.5686095181581269, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6522, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.9302558956347218, |
|
"grad_norm": 0.49222391395442017, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6508, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.9402910185649773, |
|
"grad_norm": 0.5723888567358063, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6497, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.9503261414952333, |
|
"grad_norm": 0.6277028838019034, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6548, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.9603612644254893, |
|
"grad_norm": 0.6499248602518872, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6539, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.970396387355745, |
|
"grad_norm": 0.5523665140419113, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6567, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.980431510286001, |
|
"grad_norm": 0.5253539559863383, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6547, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.990466633216257, |
|
"grad_norm": 0.5665529328640058, |
|
"learning_rate": 5e-06, |
|
"loss": 0.6516, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.9984947315604615, |
|
"eval_loss": 0.714939534664154, |
|
"eval_runtime": 532.5929, |
|
"eval_samples_per_second": 50.412, |
|
"eval_steps_per_second": 0.394, |
|
"step": 2988 |
|
}, |
|
{ |
|
"epoch": 2.9984947315604615, |
|
"step": 2988, |
|
"total_flos": 5004592530063360.0, |
|
"train_loss": 0.7032094593826866, |
|
"train_runtime": 89126.24, |
|
"train_samples_per_second": 17.171, |
|
"train_steps_per_second": 0.034 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2988, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5004592530063360.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|