{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9984947315604615, "eval_steps": 500, "global_step": 2988, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010035122930255895, "grad_norm": 3.0791230568719863, "learning_rate": 5e-06, "loss": 1.0584, "step": 10 }, { "epoch": 0.02007024586051179, "grad_norm": 1.9010262387699988, "learning_rate": 5e-06, "loss": 0.9369, "step": 20 }, { "epoch": 0.030105368790767688, "grad_norm": 3.9302140807930486, "learning_rate": 5e-06, "loss": 0.8954, "step": 30 }, { "epoch": 0.04014049172102358, "grad_norm": 1.1954643789588726, "learning_rate": 5e-06, "loss": 0.8743, "step": 40 }, { "epoch": 0.050175614651279475, "grad_norm": 1.1581266418383889, "learning_rate": 5e-06, "loss": 0.8604, "step": 50 }, { "epoch": 0.060210737581535376, "grad_norm": 1.5383829915522733, "learning_rate": 5e-06, "loss": 0.844, "step": 60 }, { "epoch": 0.07024586051179127, "grad_norm": 1.5977753412538256, "learning_rate": 5e-06, "loss": 0.8329, "step": 70 }, { "epoch": 0.08028098344204716, "grad_norm": 1.5289565466827575, "learning_rate": 5e-06, "loss": 0.8265, "step": 80 }, { "epoch": 0.09031610637230306, "grad_norm": 1.3386469754796255, "learning_rate": 5e-06, "loss": 0.8147, "step": 90 }, { "epoch": 0.10035122930255895, "grad_norm": 1.0647477486272434, "learning_rate": 5e-06, "loss": 0.8113, "step": 100 }, { "epoch": 0.11038635223281486, "grad_norm": 0.7070463503515779, "learning_rate": 5e-06, "loss": 0.8026, "step": 110 }, { "epoch": 0.12042147516307075, "grad_norm": 0.7381734885268878, "learning_rate": 5e-06, "loss": 0.7989, "step": 120 }, { "epoch": 0.13045659809332663, "grad_norm": 0.8946901348596374, "learning_rate": 5e-06, "loss": 0.8007, "step": 130 }, { "epoch": 0.14049172102358254, "grad_norm": 0.7080206896455782, "learning_rate": 5e-06, "loss": 0.7937, "step": 140 }, { "epoch": 0.15052684395383845, "grad_norm": 0.7872021804288697, "learning_rate": 5e-06, "loss": 0.7964, "step": 150 }, { "epoch": 0.16056196688409433, "grad_norm": 0.6344742687953677, "learning_rate": 5e-06, "loss": 0.7938, "step": 160 }, { "epoch": 0.17059708981435023, "grad_norm": 0.8040310396952577, "learning_rate": 5e-06, "loss": 0.7867, "step": 170 }, { "epoch": 0.1806322127446061, "grad_norm": 0.5889599293110972, "learning_rate": 5e-06, "loss": 0.7868, "step": 180 }, { "epoch": 0.19066733567486202, "grad_norm": 0.708534731132967, "learning_rate": 5e-06, "loss": 0.7854, "step": 190 }, { "epoch": 0.2007024586051179, "grad_norm": 0.590241380971299, "learning_rate": 5e-06, "loss": 0.782, "step": 200 }, { "epoch": 0.2107375815353738, "grad_norm": 0.6386623963841482, "learning_rate": 5e-06, "loss": 0.7802, "step": 210 }, { "epoch": 0.22077270446562972, "grad_norm": 0.8643148756886396, "learning_rate": 5e-06, "loss": 0.7766, "step": 220 }, { "epoch": 0.2308078273958856, "grad_norm": 0.6869876976216545, "learning_rate": 5e-06, "loss": 0.7811, "step": 230 }, { "epoch": 0.2408429503261415, "grad_norm": 0.5947006434799368, "learning_rate": 5e-06, "loss": 0.7785, "step": 240 }, { "epoch": 0.2508780732563974, "grad_norm": 0.5988389120535884, "learning_rate": 5e-06, "loss": 0.7736, "step": 250 }, { "epoch": 0.26091319618665326, "grad_norm": 0.7015845489442423, "learning_rate": 5e-06, "loss": 0.7683, "step": 260 }, { "epoch": 0.2709483191169092, "grad_norm": 0.7899101098197423, "learning_rate": 5e-06, "loss": 0.7735, "step": 270 }, { "epoch": 0.2809834420471651, "grad_norm": 0.6594638076973581, "learning_rate": 5e-06, "loss": 0.7718, "step": 280 }, { "epoch": 0.29101856497742096, "grad_norm": 0.7466372083749109, "learning_rate": 5e-06, "loss": 0.7729, "step": 290 }, { "epoch": 0.3010536879076769, "grad_norm": 0.7256825478194775, "learning_rate": 5e-06, "loss": 0.7692, "step": 300 }, { "epoch": 0.31108881083793277, "grad_norm": 0.6222207642465774, "learning_rate": 5e-06, "loss": 0.7664, "step": 310 }, { "epoch": 0.32112393376818865, "grad_norm": 0.6646085367912792, "learning_rate": 5e-06, "loss": 0.7648, "step": 320 }, { "epoch": 0.33115905669844453, "grad_norm": 0.7893620341431038, "learning_rate": 5e-06, "loss": 0.7624, "step": 330 }, { "epoch": 0.34119417962870047, "grad_norm": 0.6231595108266089, "learning_rate": 5e-06, "loss": 0.7714, "step": 340 }, { "epoch": 0.35122930255895635, "grad_norm": 0.668351154817616, "learning_rate": 5e-06, "loss": 0.7632, "step": 350 }, { "epoch": 0.3612644254892122, "grad_norm": 0.6343439838317185, "learning_rate": 5e-06, "loss": 0.7626, "step": 360 }, { "epoch": 0.37129954841946816, "grad_norm": 0.7056146316204847, "learning_rate": 5e-06, "loss": 0.7628, "step": 370 }, { "epoch": 0.38133467134972404, "grad_norm": 0.7902937779981405, "learning_rate": 5e-06, "loss": 0.7689, "step": 380 }, { "epoch": 0.3913697942799799, "grad_norm": 0.6918763236938501, "learning_rate": 5e-06, "loss": 0.7592, "step": 390 }, { "epoch": 0.4014049172102358, "grad_norm": 0.7358230335616606, "learning_rate": 5e-06, "loss": 0.7577, "step": 400 }, { "epoch": 0.41144004014049174, "grad_norm": 0.6126046734368374, "learning_rate": 5e-06, "loss": 0.761, "step": 410 }, { "epoch": 0.4214751630707476, "grad_norm": 0.6317827551022122, "learning_rate": 5e-06, "loss": 0.7598, "step": 420 }, { "epoch": 0.4315102860010035, "grad_norm": 0.6003042486796623, "learning_rate": 5e-06, "loss": 0.7613, "step": 430 }, { "epoch": 0.44154540893125943, "grad_norm": 0.5703662549001378, "learning_rate": 5e-06, "loss": 0.7602, "step": 440 }, { "epoch": 0.4515805318615153, "grad_norm": 0.6096409131095752, "learning_rate": 5e-06, "loss": 0.7496, "step": 450 }, { "epoch": 0.4616156547917712, "grad_norm": 0.8305089106013069, "learning_rate": 5e-06, "loss": 0.7553, "step": 460 }, { "epoch": 0.47165077772202707, "grad_norm": 0.5896793508236663, "learning_rate": 5e-06, "loss": 0.7503, "step": 470 }, { "epoch": 0.481685900652283, "grad_norm": 0.6181255276560262, "learning_rate": 5e-06, "loss": 0.7573, "step": 480 }, { "epoch": 0.4917210235825389, "grad_norm": 0.818946770368422, "learning_rate": 5e-06, "loss": 0.752, "step": 490 }, { "epoch": 0.5017561465127948, "grad_norm": 0.6056931157441836, "learning_rate": 5e-06, "loss": 0.7537, "step": 500 }, { "epoch": 0.5117912694430506, "grad_norm": 0.5810131329440165, "learning_rate": 5e-06, "loss": 0.7559, "step": 510 }, { "epoch": 0.5218263923733065, "grad_norm": 0.5475586575226008, "learning_rate": 5e-06, "loss": 0.7502, "step": 520 }, { "epoch": 0.5318615153035625, "grad_norm": 0.5857098250554217, "learning_rate": 5e-06, "loss": 0.7486, "step": 530 }, { "epoch": 0.5418966382338184, "grad_norm": 0.71215741030445, "learning_rate": 5e-06, "loss": 0.7453, "step": 540 }, { "epoch": 0.5519317611640743, "grad_norm": 0.6801576099304811, "learning_rate": 5e-06, "loss": 0.7476, "step": 550 }, { "epoch": 0.5619668840943302, "grad_norm": 0.7375590297607938, "learning_rate": 5e-06, "loss": 0.7511, "step": 560 }, { "epoch": 0.572002007024586, "grad_norm": 0.6187827311828052, "learning_rate": 5e-06, "loss": 0.7484, "step": 570 }, { "epoch": 0.5820371299548419, "grad_norm": 0.5878218056763826, "learning_rate": 5e-06, "loss": 0.7441, "step": 580 }, { "epoch": 0.5920722528850978, "grad_norm": 0.5969510290233113, "learning_rate": 5e-06, "loss": 0.7462, "step": 590 }, { "epoch": 0.6021073758153538, "grad_norm": 0.5535464540372343, "learning_rate": 5e-06, "loss": 0.7497, "step": 600 }, { "epoch": 0.6121424987456097, "grad_norm": 0.5509670875952559, "learning_rate": 5e-06, "loss": 0.747, "step": 610 }, { "epoch": 0.6221776216758655, "grad_norm": 0.558690698251435, "learning_rate": 5e-06, "loss": 0.7433, "step": 620 }, { "epoch": 0.6322127446061214, "grad_norm": 0.559060237211832, "learning_rate": 5e-06, "loss": 0.7482, "step": 630 }, { "epoch": 0.6422478675363773, "grad_norm": 0.993216287837658, "learning_rate": 5e-06, "loss": 0.7482, "step": 640 }, { "epoch": 0.6522829904666332, "grad_norm": 0.8850325099442093, "learning_rate": 5e-06, "loss": 0.7428, "step": 650 }, { "epoch": 0.6623181133968891, "grad_norm": 0.6458169799733141, "learning_rate": 5e-06, "loss": 0.7454, "step": 660 }, { "epoch": 0.672353236327145, "grad_norm": 0.5757378815184032, "learning_rate": 5e-06, "loss": 0.7462, "step": 670 }, { "epoch": 0.6823883592574009, "grad_norm": 0.8278456155470433, "learning_rate": 5e-06, "loss": 0.742, "step": 680 }, { "epoch": 0.6924234821876568, "grad_norm": 0.8087863092750499, "learning_rate": 5e-06, "loss": 0.7392, "step": 690 }, { "epoch": 0.7024586051179127, "grad_norm": 0.5920912292564408, "learning_rate": 5e-06, "loss": 0.7425, "step": 700 }, { "epoch": 0.7124937280481686, "grad_norm": 0.6733638944211415, "learning_rate": 5e-06, "loss": 0.7408, "step": 710 }, { "epoch": 0.7225288509784245, "grad_norm": 0.6330098467703786, "learning_rate": 5e-06, "loss": 0.7424, "step": 720 }, { "epoch": 0.7325639739086803, "grad_norm": 0.7212451311927113, "learning_rate": 5e-06, "loss": 0.7429, "step": 730 }, { "epoch": 0.7425990968389363, "grad_norm": 0.7896101501841413, "learning_rate": 5e-06, "loss": 0.7419, "step": 740 }, { "epoch": 0.7526342197691922, "grad_norm": 0.8026558529242067, "learning_rate": 5e-06, "loss": 0.7383, "step": 750 }, { "epoch": 0.7626693426994481, "grad_norm": 0.5426060774366821, "learning_rate": 5e-06, "loss": 0.7425, "step": 760 }, { "epoch": 0.772704465629704, "grad_norm": 0.8110802533740097, "learning_rate": 5e-06, "loss": 0.7406, "step": 770 }, { "epoch": 0.7827395885599598, "grad_norm": 0.5470009049474683, "learning_rate": 5e-06, "loss": 0.7413, "step": 780 }, { "epoch": 0.7927747114902157, "grad_norm": 0.5433159754299082, "learning_rate": 5e-06, "loss": 0.737, "step": 790 }, { "epoch": 0.8028098344204716, "grad_norm": 0.5469593174447279, "learning_rate": 5e-06, "loss": 0.7381, "step": 800 }, { "epoch": 0.8128449573507276, "grad_norm": 0.5831597485374533, "learning_rate": 5e-06, "loss": 0.7374, "step": 810 }, { "epoch": 0.8228800802809835, "grad_norm": 0.5117459329458333, "learning_rate": 5e-06, "loss": 0.7386, "step": 820 }, { "epoch": 0.8329152032112394, "grad_norm": 0.6427232877089865, "learning_rate": 5e-06, "loss": 0.7356, "step": 830 }, { "epoch": 0.8429503261414952, "grad_norm": 0.5821269785394396, "learning_rate": 5e-06, "loss": 0.7347, "step": 840 }, { "epoch": 0.8529854490717511, "grad_norm": 0.7120331561544883, "learning_rate": 5e-06, "loss": 0.7363, "step": 850 }, { "epoch": 0.863020572002007, "grad_norm": 0.6454042518783881, "learning_rate": 5e-06, "loss": 0.7371, "step": 860 }, { "epoch": 0.8730556949322629, "grad_norm": 0.5420399494529005, "learning_rate": 5e-06, "loss": 0.7349, "step": 870 }, { "epoch": 0.8830908178625189, "grad_norm": 0.5989777798696867, "learning_rate": 5e-06, "loss": 0.7346, "step": 880 }, { "epoch": 0.8931259407927747, "grad_norm": 0.5399005419446141, "learning_rate": 5e-06, "loss": 0.7333, "step": 890 }, { "epoch": 0.9031610637230306, "grad_norm": 0.5484451725519497, "learning_rate": 5e-06, "loss": 0.7369, "step": 900 }, { "epoch": 0.9131961866532865, "grad_norm": 0.6023327403789546, "learning_rate": 5e-06, "loss": 0.7332, "step": 910 }, { "epoch": 0.9232313095835424, "grad_norm": 0.570469676089088, "learning_rate": 5e-06, "loss": 0.7332, "step": 920 }, { "epoch": 0.9332664325137983, "grad_norm": 0.5358735783509466, "learning_rate": 5e-06, "loss": 0.7364, "step": 930 }, { "epoch": 0.9433015554440541, "grad_norm": 0.596570834196872, "learning_rate": 5e-06, "loss": 0.734, "step": 940 }, { "epoch": 0.9533366783743101, "grad_norm": 0.8858750188622682, "learning_rate": 5e-06, "loss": 0.7299, "step": 950 }, { "epoch": 0.963371801304566, "grad_norm": 0.7057998358035058, "learning_rate": 5e-06, "loss": 0.7296, "step": 960 }, { "epoch": 0.9734069242348219, "grad_norm": 0.6567689691351051, "learning_rate": 5e-06, "loss": 0.7383, "step": 970 }, { "epoch": 0.9834420471650778, "grad_norm": 0.5723166821895646, "learning_rate": 5e-06, "loss": 0.7329, "step": 980 }, { "epoch": 0.9934771700953337, "grad_norm": 0.5989872298413202, "learning_rate": 5e-06, "loss": 0.7331, "step": 990 }, { "epoch": 0.9994982438534872, "eval_loss": 0.7304001450538635, "eval_runtime": 533.1509, "eval_samples_per_second": 50.359, "eval_steps_per_second": 0.394, "step": 996 }, { "epoch": 1.0035122930255895, "grad_norm": 0.7229493528687063, "learning_rate": 5e-06, "loss": 0.754, "step": 1000 }, { "epoch": 1.0135474159558455, "grad_norm": 0.7232729261961888, "learning_rate": 5e-06, "loss": 0.6938, "step": 1010 }, { "epoch": 1.0235825388861013, "grad_norm": 0.5558461578633823, "learning_rate": 5e-06, "loss": 0.6963, "step": 1020 }, { "epoch": 1.0336176618163573, "grad_norm": 0.7260986137213504, "learning_rate": 5e-06, "loss": 0.6892, "step": 1030 }, { "epoch": 1.043652784746613, "grad_norm": 0.5639681058736989, "learning_rate": 5e-06, "loss": 0.6922, "step": 1040 }, { "epoch": 1.053687907676869, "grad_norm": 0.7762004415565651, "learning_rate": 5e-06, "loss": 0.6893, "step": 1050 }, { "epoch": 1.063723030607125, "grad_norm": 0.5656634035888263, "learning_rate": 5e-06, "loss": 0.6903, "step": 1060 }, { "epoch": 1.0737581535373808, "grad_norm": 0.5622021467734256, "learning_rate": 5e-06, "loss": 0.6975, "step": 1070 }, { "epoch": 1.0837932764676368, "grad_norm": 0.5747675890837022, "learning_rate": 5e-06, "loss": 0.6953, "step": 1080 }, { "epoch": 1.0938283993978926, "grad_norm": 0.7477336699942959, "learning_rate": 5e-06, "loss": 0.6924, "step": 1090 }, { "epoch": 1.1038635223281485, "grad_norm": 0.6002636077929234, "learning_rate": 5e-06, "loss": 0.6933, "step": 1100 }, { "epoch": 1.1138986452584043, "grad_norm": 0.5094594079799682, "learning_rate": 5e-06, "loss": 0.6909, "step": 1110 }, { "epoch": 1.1239337681886603, "grad_norm": 0.7393831172107309, "learning_rate": 5e-06, "loss": 0.6962, "step": 1120 }, { "epoch": 1.1339688911189163, "grad_norm": 0.8841355043993067, "learning_rate": 5e-06, "loss": 0.6909, "step": 1130 }, { "epoch": 1.144004014049172, "grad_norm": 0.6844280192067772, "learning_rate": 5e-06, "loss": 0.6909, "step": 1140 }, { "epoch": 1.154039136979428, "grad_norm": 0.5968326130517227, "learning_rate": 5e-06, "loss": 0.6959, "step": 1150 }, { "epoch": 1.1640742599096838, "grad_norm": 0.6164015759188082, "learning_rate": 5e-06, "loss": 0.6898, "step": 1160 }, { "epoch": 1.1741093828399398, "grad_norm": 0.6358359825338351, "learning_rate": 5e-06, "loss": 0.6944, "step": 1170 }, { "epoch": 1.1841445057701956, "grad_norm": 0.6829806654257828, "learning_rate": 5e-06, "loss": 0.697, "step": 1180 }, { "epoch": 1.1941796287004516, "grad_norm": 0.794769579664283, "learning_rate": 5e-06, "loss": 0.6983, "step": 1190 }, { "epoch": 1.2042147516307076, "grad_norm": 0.5535487901068424, "learning_rate": 5e-06, "loss": 0.6917, "step": 1200 }, { "epoch": 1.2142498745609633, "grad_norm": 0.5121320501008007, "learning_rate": 5e-06, "loss": 0.6931, "step": 1210 }, { "epoch": 1.2242849974912193, "grad_norm": 0.77177684811206, "learning_rate": 5e-06, "loss": 0.6959, "step": 1220 }, { "epoch": 1.234320120421475, "grad_norm": 0.5890956733651708, "learning_rate": 5e-06, "loss": 0.6909, "step": 1230 }, { "epoch": 1.244355243351731, "grad_norm": 0.6143309157442813, "learning_rate": 5e-06, "loss": 0.6928, "step": 1240 }, { "epoch": 1.2543903662819869, "grad_norm": 0.5522142360609605, "learning_rate": 5e-06, "loss": 0.6947, "step": 1250 }, { "epoch": 1.2644254892122428, "grad_norm": 0.49945028630806676, "learning_rate": 5e-06, "loss": 0.6948, "step": 1260 }, { "epoch": 1.2744606121424988, "grad_norm": 0.6318729529808568, "learning_rate": 5e-06, "loss": 0.6948, "step": 1270 }, { "epoch": 1.2844957350727546, "grad_norm": 0.571168433475203, "learning_rate": 5e-06, "loss": 0.691, "step": 1280 }, { "epoch": 1.2945308580030106, "grad_norm": 0.586240165635217, "learning_rate": 5e-06, "loss": 0.6867, "step": 1290 }, { "epoch": 1.3045659809332664, "grad_norm": 0.5371646553547392, "learning_rate": 5e-06, "loss": 0.6959, "step": 1300 }, { "epoch": 1.3146011038635224, "grad_norm": 0.7305933258706494, "learning_rate": 5e-06, "loss": 0.6965, "step": 1310 }, { "epoch": 1.3246362267937783, "grad_norm": 0.5749853775138927, "learning_rate": 5e-06, "loss": 0.6916, "step": 1320 }, { "epoch": 1.3346713497240341, "grad_norm": 0.5364405695908858, "learning_rate": 5e-06, "loss": 0.6879, "step": 1330 }, { "epoch": 1.3447064726542899, "grad_norm": 0.6583337401074435, "learning_rate": 5e-06, "loss": 0.6941, "step": 1340 }, { "epoch": 1.3547415955845459, "grad_norm": 0.5510351142365912, "learning_rate": 5e-06, "loss": 0.6881, "step": 1350 }, { "epoch": 1.3647767185148019, "grad_norm": 0.6470013078598107, "learning_rate": 5e-06, "loss": 0.6963, "step": 1360 }, { "epoch": 1.3748118414450576, "grad_norm": 0.6435465190281996, "learning_rate": 5e-06, "loss": 0.6848, "step": 1370 }, { "epoch": 1.3848469643753136, "grad_norm": 0.5592954158689228, "learning_rate": 5e-06, "loss": 0.6933, "step": 1380 }, { "epoch": 1.3948820873055694, "grad_norm": 0.5565093643287172, "learning_rate": 5e-06, "loss": 0.6956, "step": 1390 }, { "epoch": 1.4049172102358254, "grad_norm": 0.5429384088118309, "learning_rate": 5e-06, "loss": 0.6913, "step": 1400 }, { "epoch": 1.4149523331660814, "grad_norm": 0.5259873486414379, "learning_rate": 5e-06, "loss": 0.6905, "step": 1410 }, { "epoch": 1.4249874560963371, "grad_norm": 0.5791809050097347, "learning_rate": 5e-06, "loss": 0.6913, "step": 1420 }, { "epoch": 1.4350225790265931, "grad_norm": 0.5409946987555362, "learning_rate": 5e-06, "loss": 0.6944, "step": 1430 }, { "epoch": 1.445057701956849, "grad_norm": 0.6258615837964843, "learning_rate": 5e-06, "loss": 0.6906, "step": 1440 }, { "epoch": 1.455092824887105, "grad_norm": 0.5255664654202296, "learning_rate": 5e-06, "loss": 0.6915, "step": 1450 }, { "epoch": 1.4651279478173609, "grad_norm": 0.5157724202212534, "learning_rate": 5e-06, "loss": 0.6911, "step": 1460 }, { "epoch": 1.4751630707476167, "grad_norm": 0.6020158761130551, "learning_rate": 5e-06, "loss": 0.692, "step": 1470 }, { "epoch": 1.4851981936778724, "grad_norm": 0.5268622104694118, "learning_rate": 5e-06, "loss": 0.6884, "step": 1480 }, { "epoch": 1.4952333166081284, "grad_norm": 0.6743718354665602, "learning_rate": 5e-06, "loss": 0.6886, "step": 1490 }, { "epoch": 1.5052684395383844, "grad_norm": 0.5526860667382173, "learning_rate": 5e-06, "loss": 0.695, "step": 1500 }, { "epoch": 1.5153035624686404, "grad_norm": 0.5659897839907012, "learning_rate": 5e-06, "loss": 0.6962, "step": 1510 }, { "epoch": 1.5253386853988962, "grad_norm": 0.566194976984101, "learning_rate": 5e-06, "loss": 0.6953, "step": 1520 }, { "epoch": 1.535373808329152, "grad_norm": 0.6884023263714835, "learning_rate": 5e-06, "loss": 0.6912, "step": 1530 }, { "epoch": 1.545408931259408, "grad_norm": 0.6152652274544539, "learning_rate": 5e-06, "loss": 0.6881, "step": 1540 }, { "epoch": 1.555444054189664, "grad_norm": 0.7014542676012229, "learning_rate": 5e-06, "loss": 0.6874, "step": 1550 }, { "epoch": 1.5654791771199197, "grad_norm": 0.765002797379268, "learning_rate": 5e-06, "loss": 0.6923, "step": 1560 }, { "epoch": 1.5755143000501755, "grad_norm": 0.5516453996184308, "learning_rate": 5e-06, "loss": 0.6896, "step": 1570 }, { "epoch": 1.5855494229804314, "grad_norm": 0.5432337667581397, "learning_rate": 5e-06, "loss": 0.6903, "step": 1580 }, { "epoch": 1.5955845459106874, "grad_norm": 0.5346527907725022, "learning_rate": 5e-06, "loss": 0.6917, "step": 1590 }, { "epoch": 1.6056196688409434, "grad_norm": 0.5603099472061689, "learning_rate": 5e-06, "loss": 0.6913, "step": 1600 }, { "epoch": 1.6156547917711992, "grad_norm": 0.6065488834635239, "learning_rate": 5e-06, "loss": 0.6855, "step": 1610 }, { "epoch": 1.625689914701455, "grad_norm": 0.5274199389654457, "learning_rate": 5e-06, "loss": 0.689, "step": 1620 }, { "epoch": 1.635725037631711, "grad_norm": 0.5120395521998542, "learning_rate": 5e-06, "loss": 0.6917, "step": 1630 }, { "epoch": 1.645760160561967, "grad_norm": 0.5519879159635278, "learning_rate": 5e-06, "loss": 0.6844, "step": 1640 }, { "epoch": 1.655795283492223, "grad_norm": 0.504366123389348, "learning_rate": 5e-06, "loss": 0.6894, "step": 1650 }, { "epoch": 1.6658304064224787, "grad_norm": 0.5435020519283282, "learning_rate": 5e-06, "loss": 0.6985, "step": 1660 }, { "epoch": 1.6758655293527345, "grad_norm": 0.6023059143733316, "learning_rate": 5e-06, "loss": 0.6899, "step": 1670 }, { "epoch": 1.6859006522829905, "grad_norm": 0.6127499530451496, "learning_rate": 5e-06, "loss": 0.6901, "step": 1680 }, { "epoch": 1.6959357752132465, "grad_norm": 0.584858369961145, "learning_rate": 5e-06, "loss": 0.6926, "step": 1690 }, { "epoch": 1.7059708981435022, "grad_norm": 0.6802752379981577, "learning_rate": 5e-06, "loss": 0.6889, "step": 1700 }, { "epoch": 1.716006021073758, "grad_norm": 0.6145503442957706, "learning_rate": 5e-06, "loss": 0.6938, "step": 1710 }, { "epoch": 1.726041144004014, "grad_norm": 0.5806125711805925, "learning_rate": 5e-06, "loss": 0.6892, "step": 1720 }, { "epoch": 1.73607626693427, "grad_norm": 0.5164505361265845, "learning_rate": 5e-06, "loss": 0.6876, "step": 1730 }, { "epoch": 1.746111389864526, "grad_norm": 0.518409684362953, "learning_rate": 5e-06, "loss": 0.6914, "step": 1740 }, { "epoch": 1.7561465127947817, "grad_norm": 0.5798099508033144, "learning_rate": 5e-06, "loss": 0.6936, "step": 1750 }, { "epoch": 1.7661816357250375, "grad_norm": 0.5397031230350496, "learning_rate": 5e-06, "loss": 0.6884, "step": 1760 }, { "epoch": 1.7762167586552935, "grad_norm": 0.5101108640235097, "learning_rate": 5e-06, "loss": 0.6861, "step": 1770 }, { "epoch": 1.7862518815855495, "grad_norm": 0.5577428782679603, "learning_rate": 5e-06, "loss": 0.6884, "step": 1780 }, { "epoch": 1.7962870045158055, "grad_norm": 0.5088658014753255, "learning_rate": 5e-06, "loss": 0.696, "step": 1790 }, { "epoch": 1.8063221274460612, "grad_norm": 0.5834405015876643, "learning_rate": 5e-06, "loss": 0.6936, "step": 1800 }, { "epoch": 1.816357250376317, "grad_norm": 0.5223134455064544, "learning_rate": 5e-06, "loss": 0.6894, "step": 1810 }, { "epoch": 1.826392373306573, "grad_norm": 0.5218318397593602, "learning_rate": 5e-06, "loss": 0.6901, "step": 1820 }, { "epoch": 1.836427496236829, "grad_norm": 0.6595429027005055, "learning_rate": 5e-06, "loss": 0.6878, "step": 1830 }, { "epoch": 1.8464626191670848, "grad_norm": 0.5041069952524779, "learning_rate": 5e-06, "loss": 0.691, "step": 1840 }, { "epoch": 1.8564977420973405, "grad_norm": 0.6147969409931642, "learning_rate": 5e-06, "loss": 0.6925, "step": 1850 }, { "epoch": 1.8665328650275965, "grad_norm": 0.5123520217070617, "learning_rate": 5e-06, "loss": 0.6864, "step": 1860 }, { "epoch": 1.8765679879578525, "grad_norm": 0.5168471950711845, "learning_rate": 5e-06, "loss": 0.6885, "step": 1870 }, { "epoch": 1.8866031108881085, "grad_norm": 0.5930018846461471, "learning_rate": 5e-06, "loss": 0.6861, "step": 1880 }, { "epoch": 1.8966382338183643, "grad_norm": 0.5616874309006329, "learning_rate": 5e-06, "loss": 0.6895, "step": 1890 }, { "epoch": 1.90667335674862, "grad_norm": 0.5473178004684522, "learning_rate": 5e-06, "loss": 0.6891, "step": 1900 }, { "epoch": 1.916708479678876, "grad_norm": 0.498555944884513, "learning_rate": 5e-06, "loss": 0.6867, "step": 1910 }, { "epoch": 1.926743602609132, "grad_norm": 0.5781525233352894, "learning_rate": 5e-06, "loss": 0.6936, "step": 1920 }, { "epoch": 1.936778725539388, "grad_norm": 0.6304670711001885, "learning_rate": 5e-06, "loss": 0.6882, "step": 1930 }, { "epoch": 1.9468138484696438, "grad_norm": 0.59342699382766, "learning_rate": 5e-06, "loss": 0.6875, "step": 1940 }, { "epoch": 1.9568489713998996, "grad_norm": 0.6911703835056067, "learning_rate": 5e-06, "loss": 0.6879, "step": 1950 }, { "epoch": 1.9668840943301555, "grad_norm": 0.7568921959517525, "learning_rate": 5e-06, "loss": 0.687, "step": 1960 }, { "epoch": 1.9769192172604115, "grad_norm": 0.541514579573604, "learning_rate": 5e-06, "loss": 0.6891, "step": 1970 }, { "epoch": 1.9869543401906673, "grad_norm": 0.5249988740865994, "learning_rate": 5e-06, "loss": 0.688, "step": 1980 }, { "epoch": 1.996989463120923, "grad_norm": 0.518981463681463, "learning_rate": 5e-06, "loss": 0.6916, "step": 1990 }, { "epoch": 2.0, "eval_loss": 0.7170566320419312, "eval_runtime": 534.2939, "eval_samples_per_second": 50.251, "eval_steps_per_second": 0.393, "step": 1993 }, { "epoch": 2.007024586051179, "grad_norm": 0.6758178418746282, "learning_rate": 5e-06, "loss": 0.6927, "step": 2000 }, { "epoch": 2.017059708981435, "grad_norm": 0.6409143667352257, "learning_rate": 5e-06, "loss": 0.6458, "step": 2010 }, { "epoch": 2.027094831911691, "grad_norm": 0.6399574034808028, "learning_rate": 5e-06, "loss": 0.6452, "step": 2020 }, { "epoch": 2.037129954841947, "grad_norm": 0.6398494060833938, "learning_rate": 5e-06, "loss": 0.6464, "step": 2030 }, { "epoch": 2.0471650777722026, "grad_norm": 0.6225102571911315, "learning_rate": 5e-06, "loss": 0.6441, "step": 2040 }, { "epoch": 2.0572002007024586, "grad_norm": 0.607220858160029, "learning_rate": 5e-06, "loss": 0.6488, "step": 2050 }, { "epoch": 2.0672353236327146, "grad_norm": 0.6296613660669085, "learning_rate": 5e-06, "loss": 0.6469, "step": 2060 }, { "epoch": 2.0772704465629706, "grad_norm": 0.6997962877873833, "learning_rate": 5e-06, "loss": 0.6442, "step": 2070 }, { "epoch": 2.087305569493226, "grad_norm": 0.5565086168492744, "learning_rate": 5e-06, "loss": 0.6501, "step": 2080 }, { "epoch": 2.097340692423482, "grad_norm": 0.5570529145692124, "learning_rate": 5e-06, "loss": 0.6509, "step": 2090 }, { "epoch": 2.107375815353738, "grad_norm": 0.5387202610597639, "learning_rate": 5e-06, "loss": 0.6476, "step": 2100 }, { "epoch": 2.117410938283994, "grad_norm": 0.5555720032343792, "learning_rate": 5e-06, "loss": 0.6451, "step": 2110 }, { "epoch": 2.12744606121425, "grad_norm": 0.549298003440315, "learning_rate": 5e-06, "loss": 0.6452, "step": 2120 }, { "epoch": 2.1374811841445056, "grad_norm": 0.7025357677002765, "learning_rate": 5e-06, "loss": 0.6501, "step": 2130 }, { "epoch": 2.1475163070747616, "grad_norm": 0.6933827951492744, "learning_rate": 5e-06, "loss": 0.6483, "step": 2140 }, { "epoch": 2.1575514300050176, "grad_norm": 0.5931050285459404, "learning_rate": 5e-06, "loss": 0.6488, "step": 2150 }, { "epoch": 2.1675865529352736, "grad_norm": 0.5619532741142755, "learning_rate": 5e-06, "loss": 0.6448, "step": 2160 }, { "epoch": 2.177621675865529, "grad_norm": 0.5451448701863834, "learning_rate": 5e-06, "loss": 0.6444, "step": 2170 }, { "epoch": 2.187656798795785, "grad_norm": 0.48428533410635616, "learning_rate": 5e-06, "loss": 0.6495, "step": 2180 }, { "epoch": 2.197691921726041, "grad_norm": 0.5479783043613699, "learning_rate": 5e-06, "loss": 0.6525, "step": 2190 }, { "epoch": 2.207727044656297, "grad_norm": 0.5880029735313238, "learning_rate": 5e-06, "loss": 0.6492, "step": 2200 }, { "epoch": 2.217762167586553, "grad_norm": 0.518071247803138, "learning_rate": 5e-06, "loss": 0.6531, "step": 2210 }, { "epoch": 2.2277972905168086, "grad_norm": 0.5525479511919598, "learning_rate": 5e-06, "loss": 0.6486, "step": 2220 }, { "epoch": 2.2378324134470646, "grad_norm": 0.5930736784074986, "learning_rate": 5e-06, "loss": 0.6477, "step": 2230 }, { "epoch": 2.2478675363773206, "grad_norm": 0.5191764875817738, "learning_rate": 5e-06, "loss": 0.6476, "step": 2240 }, { "epoch": 2.2579026593075766, "grad_norm": 0.5226745835573395, "learning_rate": 5e-06, "loss": 0.6468, "step": 2250 }, { "epoch": 2.2679377822378326, "grad_norm": 0.594787801985729, "learning_rate": 5e-06, "loss": 0.6493, "step": 2260 }, { "epoch": 2.277972905168088, "grad_norm": 0.6593644795439105, "learning_rate": 5e-06, "loss": 0.6494, "step": 2270 }, { "epoch": 2.288008028098344, "grad_norm": 0.5809194962718293, "learning_rate": 5e-06, "loss": 0.6488, "step": 2280 }, { "epoch": 2.2980431510286, "grad_norm": 0.5769100613769682, "learning_rate": 5e-06, "loss": 0.6443, "step": 2290 }, { "epoch": 2.308078273958856, "grad_norm": 0.5798022574438232, "learning_rate": 5e-06, "loss": 0.6511, "step": 2300 }, { "epoch": 2.318113396889112, "grad_norm": 0.5500840976868079, "learning_rate": 5e-06, "loss": 0.6521, "step": 2310 }, { "epoch": 2.3281485198193677, "grad_norm": 0.557151649596109, "learning_rate": 5e-06, "loss": 0.6499, "step": 2320 }, { "epoch": 2.3381836427496236, "grad_norm": 0.5206053660885596, "learning_rate": 5e-06, "loss": 0.6497, "step": 2330 }, { "epoch": 2.3482187656798796, "grad_norm": 0.5151956126043011, "learning_rate": 5e-06, "loss": 0.6465, "step": 2340 }, { "epoch": 2.3582538886101356, "grad_norm": 0.6243860622771431, "learning_rate": 5e-06, "loss": 0.6471, "step": 2350 }, { "epoch": 2.368289011540391, "grad_norm": 0.5551415160151735, "learning_rate": 5e-06, "loss": 0.6499, "step": 2360 }, { "epoch": 2.378324134470647, "grad_norm": 0.5399591771385078, "learning_rate": 5e-06, "loss": 0.6542, "step": 2370 }, { "epoch": 2.388359257400903, "grad_norm": 0.7051292001377757, "learning_rate": 5e-06, "loss": 0.6519, "step": 2380 }, { "epoch": 2.398394380331159, "grad_norm": 0.7249734533415666, "learning_rate": 5e-06, "loss": 0.6453, "step": 2390 }, { "epoch": 2.408429503261415, "grad_norm": 0.5667898165098878, "learning_rate": 5e-06, "loss": 0.6502, "step": 2400 }, { "epoch": 2.4184646261916707, "grad_norm": 0.5714404629239772, "learning_rate": 5e-06, "loss": 0.651, "step": 2410 }, { "epoch": 2.4284997491219267, "grad_norm": 0.5776379885611598, "learning_rate": 5e-06, "loss": 0.6533, "step": 2420 }, { "epoch": 2.4385348720521827, "grad_norm": 0.5720433924252879, "learning_rate": 5e-06, "loss": 0.6495, "step": 2430 }, { "epoch": 2.4485699949824387, "grad_norm": 0.5385010214829424, "learning_rate": 5e-06, "loss": 0.6535, "step": 2440 }, { "epoch": 2.458605117912694, "grad_norm": 0.7858581581353575, "learning_rate": 5e-06, "loss": 0.6501, "step": 2450 }, { "epoch": 2.46864024084295, "grad_norm": 0.5254373337828374, "learning_rate": 5e-06, "loss": 0.6514, "step": 2460 }, { "epoch": 2.478675363773206, "grad_norm": 0.5927951089454341, "learning_rate": 5e-06, "loss": 0.6567, "step": 2470 }, { "epoch": 2.488710486703462, "grad_norm": 0.576333054967198, "learning_rate": 5e-06, "loss": 0.6468, "step": 2480 }, { "epoch": 2.498745609633718, "grad_norm": 0.5997091587436316, "learning_rate": 5e-06, "loss": 0.6494, "step": 2490 }, { "epoch": 2.5087807325639737, "grad_norm": 0.5774761407070679, "learning_rate": 5e-06, "loss": 0.6575, "step": 2500 }, { "epoch": 2.5188158554942297, "grad_norm": 0.5697277316039128, "learning_rate": 5e-06, "loss": 0.6529, "step": 2510 }, { "epoch": 2.5288509784244857, "grad_norm": 0.5523271421741578, "learning_rate": 5e-06, "loss": 0.6486, "step": 2520 }, { "epoch": 2.5388861013547417, "grad_norm": 0.5308441673869406, "learning_rate": 5e-06, "loss": 0.6479, "step": 2530 }, { "epoch": 2.5489212242849977, "grad_norm": 0.611658706018765, "learning_rate": 5e-06, "loss": 0.6515, "step": 2540 }, { "epoch": 2.5589563472152532, "grad_norm": 0.5882952161403756, "learning_rate": 5e-06, "loss": 0.6529, "step": 2550 }, { "epoch": 2.568991470145509, "grad_norm": 0.5944397200737397, "learning_rate": 5e-06, "loss": 0.6486, "step": 2560 }, { "epoch": 2.579026593075765, "grad_norm": 0.6211121511651304, "learning_rate": 5e-06, "loss": 0.6511, "step": 2570 }, { "epoch": 2.589061716006021, "grad_norm": 0.5687379330786366, "learning_rate": 5e-06, "loss": 0.6599, "step": 2580 }, { "epoch": 2.599096838936277, "grad_norm": 0.5264826150946973, "learning_rate": 5e-06, "loss": 0.6475, "step": 2590 }, { "epoch": 2.6091319618665327, "grad_norm": 0.546285649845265, "learning_rate": 5e-06, "loss": 0.6531, "step": 2600 }, { "epoch": 2.6191670847967887, "grad_norm": 0.5016403997355949, "learning_rate": 5e-06, "loss": 0.6532, "step": 2610 }, { "epoch": 2.6292022077270447, "grad_norm": 0.514946202260049, "learning_rate": 5e-06, "loss": 0.6476, "step": 2620 }, { "epoch": 2.6392373306573007, "grad_norm": 0.5768998926592247, "learning_rate": 5e-06, "loss": 0.6499, "step": 2630 }, { "epoch": 2.6492724535875567, "grad_norm": 0.56824673619337, "learning_rate": 5e-06, "loss": 0.6515, "step": 2640 }, { "epoch": 2.6593075765178122, "grad_norm": 0.5743894544574858, "learning_rate": 5e-06, "loss": 0.6528, "step": 2650 }, { "epoch": 2.6693426994480682, "grad_norm": 0.6566059771585782, "learning_rate": 5e-06, "loss": 0.6482, "step": 2660 }, { "epoch": 2.6793778223783242, "grad_norm": 0.6222476495314778, "learning_rate": 5e-06, "loss": 0.6521, "step": 2670 }, { "epoch": 2.6894129453085798, "grad_norm": 0.5352433226802004, "learning_rate": 5e-06, "loss": 0.6511, "step": 2680 }, { "epoch": 2.6994480682388358, "grad_norm": 0.5593740996298296, "learning_rate": 5e-06, "loss": 0.6502, "step": 2690 }, { "epoch": 2.7094831911690918, "grad_norm": 0.5870512541625679, "learning_rate": 5e-06, "loss": 0.6509, "step": 2700 }, { "epoch": 2.7195183140993477, "grad_norm": 0.5194458048924404, "learning_rate": 5e-06, "loss": 0.6522, "step": 2710 }, { "epoch": 2.7295534370296037, "grad_norm": 0.5207337998079029, "learning_rate": 5e-06, "loss": 0.6513, "step": 2720 }, { "epoch": 2.7395885599598593, "grad_norm": 0.543790751828564, "learning_rate": 5e-06, "loss": 0.6551, "step": 2730 }, { "epoch": 2.7496236828901153, "grad_norm": 0.5863597318330472, "learning_rate": 5e-06, "loss": 0.6545, "step": 2740 }, { "epoch": 2.7596588058203713, "grad_norm": 0.5388170321853544, "learning_rate": 5e-06, "loss": 0.6538, "step": 2750 }, { "epoch": 2.7696939287506273, "grad_norm": 0.5742236049971658, "learning_rate": 5e-06, "loss": 0.6583, "step": 2760 }, { "epoch": 2.7797290516808832, "grad_norm": 0.5384373778506647, "learning_rate": 5e-06, "loss": 0.6545, "step": 2770 }, { "epoch": 2.789764174611139, "grad_norm": 0.5345134774106539, "learning_rate": 5e-06, "loss": 0.6508, "step": 2780 }, { "epoch": 2.799799297541395, "grad_norm": 0.6099706934146881, "learning_rate": 5e-06, "loss": 0.6534, "step": 2790 }, { "epoch": 2.8098344204716508, "grad_norm": 0.7684214994129063, "learning_rate": 5e-06, "loss": 0.6552, "step": 2800 }, { "epoch": 2.8198695434019068, "grad_norm": 0.6812429404931887, "learning_rate": 5e-06, "loss": 0.6541, "step": 2810 }, { "epoch": 2.8299046663321628, "grad_norm": 0.5551480308045381, "learning_rate": 5e-06, "loss": 0.648, "step": 2820 }, { "epoch": 2.8399397892624183, "grad_norm": 0.5783011582533801, "learning_rate": 5e-06, "loss": 0.6538, "step": 2830 }, { "epoch": 2.8499749121926743, "grad_norm": 0.6083930556812827, "learning_rate": 5e-06, "loss": 0.652, "step": 2840 }, { "epoch": 2.8600100351229303, "grad_norm": 0.6412044060911565, "learning_rate": 5e-06, "loss": 0.6574, "step": 2850 }, { "epoch": 2.8700451580531863, "grad_norm": 0.6641263630618335, "learning_rate": 5e-06, "loss": 0.6533, "step": 2860 }, { "epoch": 2.8800802809834423, "grad_norm": 0.7128854398954438, "learning_rate": 5e-06, "loss": 0.6516, "step": 2870 }, { "epoch": 2.890115403913698, "grad_norm": 0.6732370461622078, "learning_rate": 5e-06, "loss": 0.6542, "step": 2880 }, { "epoch": 2.900150526843954, "grad_norm": 0.5615551950157813, "learning_rate": 5e-06, "loss": 0.6517, "step": 2890 }, { "epoch": 2.91018564977421, "grad_norm": 0.5360864764156098, "learning_rate": 5e-06, "loss": 0.6529, "step": 2900 }, { "epoch": 2.920220772704466, "grad_norm": 0.5686095181581269, "learning_rate": 5e-06, "loss": 0.6522, "step": 2910 }, { "epoch": 2.9302558956347218, "grad_norm": 0.49222391395442017, "learning_rate": 5e-06, "loss": 0.6508, "step": 2920 }, { "epoch": 2.9402910185649773, "grad_norm": 0.5723888567358063, "learning_rate": 5e-06, "loss": 0.6497, "step": 2930 }, { "epoch": 2.9503261414952333, "grad_norm": 0.6277028838019034, "learning_rate": 5e-06, "loss": 0.6548, "step": 2940 }, { "epoch": 2.9603612644254893, "grad_norm": 0.6499248602518872, "learning_rate": 5e-06, "loss": 0.6539, "step": 2950 }, { "epoch": 2.970396387355745, "grad_norm": 0.5523665140419113, "learning_rate": 5e-06, "loss": 0.6567, "step": 2960 }, { "epoch": 2.980431510286001, "grad_norm": 0.5253539559863383, "learning_rate": 5e-06, "loss": 0.6547, "step": 2970 }, { "epoch": 2.990466633216257, "grad_norm": 0.5665529328640058, "learning_rate": 5e-06, "loss": 0.6516, "step": 2980 }, { "epoch": 2.9984947315604615, "eval_loss": 0.714939534664154, "eval_runtime": 532.5929, "eval_samples_per_second": 50.412, "eval_steps_per_second": 0.394, "step": 2988 }, { "epoch": 2.9984947315604615, "step": 2988, "total_flos": 5004592530063360.0, "train_loss": 0.7032094593826866, "train_runtime": 89126.24, "train_samples_per_second": 17.171, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 2988, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5004592530063360.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }