{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9984947315604615,
"eval_steps": 500,
"global_step": 2988,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010035122930255895,
"grad_norm": 3.0791230568719863,
"learning_rate": 5e-06,
"loss": 1.0584,
"step": 10
},
{
"epoch": 0.02007024586051179,
"grad_norm": 1.9010262387699988,
"learning_rate": 5e-06,
"loss": 0.9369,
"step": 20
},
{
"epoch": 0.030105368790767688,
"grad_norm": 3.9302140807930486,
"learning_rate": 5e-06,
"loss": 0.8954,
"step": 30
},
{
"epoch": 0.04014049172102358,
"grad_norm": 1.1954643789588726,
"learning_rate": 5e-06,
"loss": 0.8743,
"step": 40
},
{
"epoch": 0.050175614651279475,
"grad_norm": 1.1581266418383889,
"learning_rate": 5e-06,
"loss": 0.8604,
"step": 50
},
{
"epoch": 0.060210737581535376,
"grad_norm": 1.5383829915522733,
"learning_rate": 5e-06,
"loss": 0.844,
"step": 60
},
{
"epoch": 0.07024586051179127,
"grad_norm": 1.5977753412538256,
"learning_rate": 5e-06,
"loss": 0.8329,
"step": 70
},
{
"epoch": 0.08028098344204716,
"grad_norm": 1.5289565466827575,
"learning_rate": 5e-06,
"loss": 0.8265,
"step": 80
},
{
"epoch": 0.09031610637230306,
"grad_norm": 1.3386469754796255,
"learning_rate": 5e-06,
"loss": 0.8147,
"step": 90
},
{
"epoch": 0.10035122930255895,
"grad_norm": 1.0647477486272434,
"learning_rate": 5e-06,
"loss": 0.8113,
"step": 100
},
{
"epoch": 0.11038635223281486,
"grad_norm": 0.7070463503515779,
"learning_rate": 5e-06,
"loss": 0.8026,
"step": 110
},
{
"epoch": 0.12042147516307075,
"grad_norm": 0.7381734885268878,
"learning_rate": 5e-06,
"loss": 0.7989,
"step": 120
},
{
"epoch": 0.13045659809332663,
"grad_norm": 0.8946901348596374,
"learning_rate": 5e-06,
"loss": 0.8007,
"step": 130
},
{
"epoch": 0.14049172102358254,
"grad_norm": 0.7080206896455782,
"learning_rate": 5e-06,
"loss": 0.7937,
"step": 140
},
{
"epoch": 0.15052684395383845,
"grad_norm": 0.7872021804288697,
"learning_rate": 5e-06,
"loss": 0.7964,
"step": 150
},
{
"epoch": 0.16056196688409433,
"grad_norm": 0.6344742687953677,
"learning_rate": 5e-06,
"loss": 0.7938,
"step": 160
},
{
"epoch": 0.17059708981435023,
"grad_norm": 0.8040310396952577,
"learning_rate": 5e-06,
"loss": 0.7867,
"step": 170
},
{
"epoch": 0.1806322127446061,
"grad_norm": 0.5889599293110972,
"learning_rate": 5e-06,
"loss": 0.7868,
"step": 180
},
{
"epoch": 0.19066733567486202,
"grad_norm": 0.708534731132967,
"learning_rate": 5e-06,
"loss": 0.7854,
"step": 190
},
{
"epoch": 0.2007024586051179,
"grad_norm": 0.590241380971299,
"learning_rate": 5e-06,
"loss": 0.782,
"step": 200
},
{
"epoch": 0.2107375815353738,
"grad_norm": 0.6386623963841482,
"learning_rate": 5e-06,
"loss": 0.7802,
"step": 210
},
{
"epoch": 0.22077270446562972,
"grad_norm": 0.8643148756886396,
"learning_rate": 5e-06,
"loss": 0.7766,
"step": 220
},
{
"epoch": 0.2308078273958856,
"grad_norm": 0.6869876976216545,
"learning_rate": 5e-06,
"loss": 0.7811,
"step": 230
},
{
"epoch": 0.2408429503261415,
"grad_norm": 0.5947006434799368,
"learning_rate": 5e-06,
"loss": 0.7785,
"step": 240
},
{
"epoch": 0.2508780732563974,
"grad_norm": 0.5988389120535884,
"learning_rate": 5e-06,
"loss": 0.7736,
"step": 250
},
{
"epoch": 0.26091319618665326,
"grad_norm": 0.7015845489442423,
"learning_rate": 5e-06,
"loss": 0.7683,
"step": 260
},
{
"epoch": 0.2709483191169092,
"grad_norm": 0.7899101098197423,
"learning_rate": 5e-06,
"loss": 0.7735,
"step": 270
},
{
"epoch": 0.2809834420471651,
"grad_norm": 0.6594638076973581,
"learning_rate": 5e-06,
"loss": 0.7718,
"step": 280
},
{
"epoch": 0.29101856497742096,
"grad_norm": 0.7466372083749109,
"learning_rate": 5e-06,
"loss": 0.7729,
"step": 290
},
{
"epoch": 0.3010536879076769,
"grad_norm": 0.7256825478194775,
"learning_rate": 5e-06,
"loss": 0.7692,
"step": 300
},
{
"epoch": 0.31108881083793277,
"grad_norm": 0.6222207642465774,
"learning_rate": 5e-06,
"loss": 0.7664,
"step": 310
},
{
"epoch": 0.32112393376818865,
"grad_norm": 0.6646085367912792,
"learning_rate": 5e-06,
"loss": 0.7648,
"step": 320
},
{
"epoch": 0.33115905669844453,
"grad_norm": 0.7893620341431038,
"learning_rate": 5e-06,
"loss": 0.7624,
"step": 330
},
{
"epoch": 0.34119417962870047,
"grad_norm": 0.6231595108266089,
"learning_rate": 5e-06,
"loss": 0.7714,
"step": 340
},
{
"epoch": 0.35122930255895635,
"grad_norm": 0.668351154817616,
"learning_rate": 5e-06,
"loss": 0.7632,
"step": 350
},
{
"epoch": 0.3612644254892122,
"grad_norm": 0.6343439838317185,
"learning_rate": 5e-06,
"loss": 0.7626,
"step": 360
},
{
"epoch": 0.37129954841946816,
"grad_norm": 0.7056146316204847,
"learning_rate": 5e-06,
"loss": 0.7628,
"step": 370
},
{
"epoch": 0.38133467134972404,
"grad_norm": 0.7902937779981405,
"learning_rate": 5e-06,
"loss": 0.7689,
"step": 380
},
{
"epoch": 0.3913697942799799,
"grad_norm": 0.6918763236938501,
"learning_rate": 5e-06,
"loss": 0.7592,
"step": 390
},
{
"epoch": 0.4014049172102358,
"grad_norm": 0.7358230335616606,
"learning_rate": 5e-06,
"loss": 0.7577,
"step": 400
},
{
"epoch": 0.41144004014049174,
"grad_norm": 0.6126046734368374,
"learning_rate": 5e-06,
"loss": 0.761,
"step": 410
},
{
"epoch": 0.4214751630707476,
"grad_norm": 0.6317827551022122,
"learning_rate": 5e-06,
"loss": 0.7598,
"step": 420
},
{
"epoch": 0.4315102860010035,
"grad_norm": 0.6003042486796623,
"learning_rate": 5e-06,
"loss": 0.7613,
"step": 430
},
{
"epoch": 0.44154540893125943,
"grad_norm": 0.5703662549001378,
"learning_rate": 5e-06,
"loss": 0.7602,
"step": 440
},
{
"epoch": 0.4515805318615153,
"grad_norm": 0.6096409131095752,
"learning_rate": 5e-06,
"loss": 0.7496,
"step": 450
},
{
"epoch": 0.4616156547917712,
"grad_norm": 0.8305089106013069,
"learning_rate": 5e-06,
"loss": 0.7553,
"step": 460
},
{
"epoch": 0.47165077772202707,
"grad_norm": 0.5896793508236663,
"learning_rate": 5e-06,
"loss": 0.7503,
"step": 470
},
{
"epoch": 0.481685900652283,
"grad_norm": 0.6181255276560262,
"learning_rate": 5e-06,
"loss": 0.7573,
"step": 480
},
{
"epoch": 0.4917210235825389,
"grad_norm": 0.818946770368422,
"learning_rate": 5e-06,
"loss": 0.752,
"step": 490
},
{
"epoch": 0.5017561465127948,
"grad_norm": 0.6056931157441836,
"learning_rate": 5e-06,
"loss": 0.7537,
"step": 500
},
{
"epoch": 0.5117912694430506,
"grad_norm": 0.5810131329440165,
"learning_rate": 5e-06,
"loss": 0.7559,
"step": 510
},
{
"epoch": 0.5218263923733065,
"grad_norm": 0.5475586575226008,
"learning_rate": 5e-06,
"loss": 0.7502,
"step": 520
},
{
"epoch": 0.5318615153035625,
"grad_norm": 0.5857098250554217,
"learning_rate": 5e-06,
"loss": 0.7486,
"step": 530
},
{
"epoch": 0.5418966382338184,
"grad_norm": 0.71215741030445,
"learning_rate": 5e-06,
"loss": 0.7453,
"step": 540
},
{
"epoch": 0.5519317611640743,
"grad_norm": 0.6801576099304811,
"learning_rate": 5e-06,
"loss": 0.7476,
"step": 550
},
{
"epoch": 0.5619668840943302,
"grad_norm": 0.7375590297607938,
"learning_rate": 5e-06,
"loss": 0.7511,
"step": 560
},
{
"epoch": 0.572002007024586,
"grad_norm": 0.6187827311828052,
"learning_rate": 5e-06,
"loss": 0.7484,
"step": 570
},
{
"epoch": 0.5820371299548419,
"grad_norm": 0.5878218056763826,
"learning_rate": 5e-06,
"loss": 0.7441,
"step": 580
},
{
"epoch": 0.5920722528850978,
"grad_norm": 0.5969510290233113,
"learning_rate": 5e-06,
"loss": 0.7462,
"step": 590
},
{
"epoch": 0.6021073758153538,
"grad_norm": 0.5535464540372343,
"learning_rate": 5e-06,
"loss": 0.7497,
"step": 600
},
{
"epoch": 0.6121424987456097,
"grad_norm": 0.5509670875952559,
"learning_rate": 5e-06,
"loss": 0.747,
"step": 610
},
{
"epoch": 0.6221776216758655,
"grad_norm": 0.558690698251435,
"learning_rate": 5e-06,
"loss": 0.7433,
"step": 620
},
{
"epoch": 0.6322127446061214,
"grad_norm": 0.559060237211832,
"learning_rate": 5e-06,
"loss": 0.7482,
"step": 630
},
{
"epoch": 0.6422478675363773,
"grad_norm": 0.993216287837658,
"learning_rate": 5e-06,
"loss": 0.7482,
"step": 640
},
{
"epoch": 0.6522829904666332,
"grad_norm": 0.8850325099442093,
"learning_rate": 5e-06,
"loss": 0.7428,
"step": 650
},
{
"epoch": 0.6623181133968891,
"grad_norm": 0.6458169799733141,
"learning_rate": 5e-06,
"loss": 0.7454,
"step": 660
},
{
"epoch": 0.672353236327145,
"grad_norm": 0.5757378815184032,
"learning_rate": 5e-06,
"loss": 0.7462,
"step": 670
},
{
"epoch": 0.6823883592574009,
"grad_norm": 0.8278456155470433,
"learning_rate": 5e-06,
"loss": 0.742,
"step": 680
},
{
"epoch": 0.6924234821876568,
"grad_norm": 0.8087863092750499,
"learning_rate": 5e-06,
"loss": 0.7392,
"step": 690
},
{
"epoch": 0.7024586051179127,
"grad_norm": 0.5920912292564408,
"learning_rate": 5e-06,
"loss": 0.7425,
"step": 700
},
{
"epoch": 0.7124937280481686,
"grad_norm": 0.6733638944211415,
"learning_rate": 5e-06,
"loss": 0.7408,
"step": 710
},
{
"epoch": 0.7225288509784245,
"grad_norm": 0.6330098467703786,
"learning_rate": 5e-06,
"loss": 0.7424,
"step": 720
},
{
"epoch": 0.7325639739086803,
"grad_norm": 0.7212451311927113,
"learning_rate": 5e-06,
"loss": 0.7429,
"step": 730
},
{
"epoch": 0.7425990968389363,
"grad_norm": 0.7896101501841413,
"learning_rate": 5e-06,
"loss": 0.7419,
"step": 740
},
{
"epoch": 0.7526342197691922,
"grad_norm": 0.8026558529242067,
"learning_rate": 5e-06,
"loss": 0.7383,
"step": 750
},
{
"epoch": 0.7626693426994481,
"grad_norm": 0.5426060774366821,
"learning_rate": 5e-06,
"loss": 0.7425,
"step": 760
},
{
"epoch": 0.772704465629704,
"grad_norm": 0.8110802533740097,
"learning_rate": 5e-06,
"loss": 0.7406,
"step": 770
},
{
"epoch": 0.7827395885599598,
"grad_norm": 0.5470009049474683,
"learning_rate": 5e-06,
"loss": 0.7413,
"step": 780
},
{
"epoch": 0.7927747114902157,
"grad_norm": 0.5433159754299082,
"learning_rate": 5e-06,
"loss": 0.737,
"step": 790
},
{
"epoch": 0.8028098344204716,
"grad_norm": 0.5469593174447279,
"learning_rate": 5e-06,
"loss": 0.7381,
"step": 800
},
{
"epoch": 0.8128449573507276,
"grad_norm": 0.5831597485374533,
"learning_rate": 5e-06,
"loss": 0.7374,
"step": 810
},
{
"epoch": 0.8228800802809835,
"grad_norm": 0.5117459329458333,
"learning_rate": 5e-06,
"loss": 0.7386,
"step": 820
},
{
"epoch": 0.8329152032112394,
"grad_norm": 0.6427232877089865,
"learning_rate": 5e-06,
"loss": 0.7356,
"step": 830
},
{
"epoch": 0.8429503261414952,
"grad_norm": 0.5821269785394396,
"learning_rate": 5e-06,
"loss": 0.7347,
"step": 840
},
{
"epoch": 0.8529854490717511,
"grad_norm": 0.7120331561544883,
"learning_rate": 5e-06,
"loss": 0.7363,
"step": 850
},
{
"epoch": 0.863020572002007,
"grad_norm": 0.6454042518783881,
"learning_rate": 5e-06,
"loss": 0.7371,
"step": 860
},
{
"epoch": 0.8730556949322629,
"grad_norm": 0.5420399494529005,
"learning_rate": 5e-06,
"loss": 0.7349,
"step": 870
},
{
"epoch": 0.8830908178625189,
"grad_norm": 0.5989777798696867,
"learning_rate": 5e-06,
"loss": 0.7346,
"step": 880
},
{
"epoch": 0.8931259407927747,
"grad_norm": 0.5399005419446141,
"learning_rate": 5e-06,
"loss": 0.7333,
"step": 890
},
{
"epoch": 0.9031610637230306,
"grad_norm": 0.5484451725519497,
"learning_rate": 5e-06,
"loss": 0.7369,
"step": 900
},
{
"epoch": 0.9131961866532865,
"grad_norm": 0.6023327403789546,
"learning_rate": 5e-06,
"loss": 0.7332,
"step": 910
},
{
"epoch": 0.9232313095835424,
"grad_norm": 0.570469676089088,
"learning_rate": 5e-06,
"loss": 0.7332,
"step": 920
},
{
"epoch": 0.9332664325137983,
"grad_norm": 0.5358735783509466,
"learning_rate": 5e-06,
"loss": 0.7364,
"step": 930
},
{
"epoch": 0.9433015554440541,
"grad_norm": 0.596570834196872,
"learning_rate": 5e-06,
"loss": 0.734,
"step": 940
},
{
"epoch": 0.9533366783743101,
"grad_norm": 0.8858750188622682,
"learning_rate": 5e-06,
"loss": 0.7299,
"step": 950
},
{
"epoch": 0.963371801304566,
"grad_norm": 0.7057998358035058,
"learning_rate": 5e-06,
"loss": 0.7296,
"step": 960
},
{
"epoch": 0.9734069242348219,
"grad_norm": 0.6567689691351051,
"learning_rate": 5e-06,
"loss": 0.7383,
"step": 970
},
{
"epoch": 0.9834420471650778,
"grad_norm": 0.5723166821895646,
"learning_rate": 5e-06,
"loss": 0.7329,
"step": 980
},
{
"epoch": 0.9934771700953337,
"grad_norm": 0.5989872298413202,
"learning_rate": 5e-06,
"loss": 0.7331,
"step": 990
},
{
"epoch": 0.9994982438534872,
"eval_loss": 0.7304001450538635,
"eval_runtime": 533.1509,
"eval_samples_per_second": 50.359,
"eval_steps_per_second": 0.394,
"step": 996
},
{
"epoch": 1.0035122930255895,
"grad_norm": 0.7229493528687063,
"learning_rate": 5e-06,
"loss": 0.754,
"step": 1000
},
{
"epoch": 1.0135474159558455,
"grad_norm": 0.7232729261961888,
"learning_rate": 5e-06,
"loss": 0.6938,
"step": 1010
},
{
"epoch": 1.0235825388861013,
"grad_norm": 0.5558461578633823,
"learning_rate": 5e-06,
"loss": 0.6963,
"step": 1020
},
{
"epoch": 1.0336176618163573,
"grad_norm": 0.7260986137213504,
"learning_rate": 5e-06,
"loss": 0.6892,
"step": 1030
},
{
"epoch": 1.043652784746613,
"grad_norm": 0.5639681058736989,
"learning_rate": 5e-06,
"loss": 0.6922,
"step": 1040
},
{
"epoch": 1.053687907676869,
"grad_norm": 0.7762004415565651,
"learning_rate": 5e-06,
"loss": 0.6893,
"step": 1050
},
{
"epoch": 1.063723030607125,
"grad_norm": 0.5656634035888263,
"learning_rate": 5e-06,
"loss": 0.6903,
"step": 1060
},
{
"epoch": 1.0737581535373808,
"grad_norm": 0.5622021467734256,
"learning_rate": 5e-06,
"loss": 0.6975,
"step": 1070
},
{
"epoch": 1.0837932764676368,
"grad_norm": 0.5747675890837022,
"learning_rate": 5e-06,
"loss": 0.6953,
"step": 1080
},
{
"epoch": 1.0938283993978926,
"grad_norm": 0.7477336699942959,
"learning_rate": 5e-06,
"loss": 0.6924,
"step": 1090
},
{
"epoch": 1.1038635223281485,
"grad_norm": 0.6002636077929234,
"learning_rate": 5e-06,
"loss": 0.6933,
"step": 1100
},
{
"epoch": 1.1138986452584043,
"grad_norm": 0.5094594079799682,
"learning_rate": 5e-06,
"loss": 0.6909,
"step": 1110
},
{
"epoch": 1.1239337681886603,
"grad_norm": 0.7393831172107309,
"learning_rate": 5e-06,
"loss": 0.6962,
"step": 1120
},
{
"epoch": 1.1339688911189163,
"grad_norm": 0.8841355043993067,
"learning_rate": 5e-06,
"loss": 0.6909,
"step": 1130
},
{
"epoch": 1.144004014049172,
"grad_norm": 0.6844280192067772,
"learning_rate": 5e-06,
"loss": 0.6909,
"step": 1140
},
{
"epoch": 1.154039136979428,
"grad_norm": 0.5968326130517227,
"learning_rate": 5e-06,
"loss": 0.6959,
"step": 1150
},
{
"epoch": 1.1640742599096838,
"grad_norm": 0.6164015759188082,
"learning_rate": 5e-06,
"loss": 0.6898,
"step": 1160
},
{
"epoch": 1.1741093828399398,
"grad_norm": 0.6358359825338351,
"learning_rate": 5e-06,
"loss": 0.6944,
"step": 1170
},
{
"epoch": 1.1841445057701956,
"grad_norm": 0.6829806654257828,
"learning_rate": 5e-06,
"loss": 0.697,
"step": 1180
},
{
"epoch": 1.1941796287004516,
"grad_norm": 0.794769579664283,
"learning_rate": 5e-06,
"loss": 0.6983,
"step": 1190
},
{
"epoch": 1.2042147516307076,
"grad_norm": 0.5535487901068424,
"learning_rate": 5e-06,
"loss": 0.6917,
"step": 1200
},
{
"epoch": 1.2142498745609633,
"grad_norm": 0.5121320501008007,
"learning_rate": 5e-06,
"loss": 0.6931,
"step": 1210
},
{
"epoch": 1.2242849974912193,
"grad_norm": 0.77177684811206,
"learning_rate": 5e-06,
"loss": 0.6959,
"step": 1220
},
{
"epoch": 1.234320120421475,
"grad_norm": 0.5890956733651708,
"learning_rate": 5e-06,
"loss": 0.6909,
"step": 1230
},
{
"epoch": 1.244355243351731,
"grad_norm": 0.6143309157442813,
"learning_rate": 5e-06,
"loss": 0.6928,
"step": 1240
},
{
"epoch": 1.2543903662819869,
"grad_norm": 0.5522142360609605,
"learning_rate": 5e-06,
"loss": 0.6947,
"step": 1250
},
{
"epoch": 1.2644254892122428,
"grad_norm": 0.49945028630806676,
"learning_rate": 5e-06,
"loss": 0.6948,
"step": 1260
},
{
"epoch": 1.2744606121424988,
"grad_norm": 0.6318729529808568,
"learning_rate": 5e-06,
"loss": 0.6948,
"step": 1270
},
{
"epoch": 1.2844957350727546,
"grad_norm": 0.571168433475203,
"learning_rate": 5e-06,
"loss": 0.691,
"step": 1280
},
{
"epoch": 1.2945308580030106,
"grad_norm": 0.586240165635217,
"learning_rate": 5e-06,
"loss": 0.6867,
"step": 1290
},
{
"epoch": 1.3045659809332664,
"grad_norm": 0.5371646553547392,
"learning_rate": 5e-06,
"loss": 0.6959,
"step": 1300
},
{
"epoch": 1.3146011038635224,
"grad_norm": 0.7305933258706494,
"learning_rate": 5e-06,
"loss": 0.6965,
"step": 1310
},
{
"epoch": 1.3246362267937783,
"grad_norm": 0.5749853775138927,
"learning_rate": 5e-06,
"loss": 0.6916,
"step": 1320
},
{
"epoch": 1.3346713497240341,
"grad_norm": 0.5364405695908858,
"learning_rate": 5e-06,
"loss": 0.6879,
"step": 1330
},
{
"epoch": 1.3447064726542899,
"grad_norm": 0.6583337401074435,
"learning_rate": 5e-06,
"loss": 0.6941,
"step": 1340
},
{
"epoch": 1.3547415955845459,
"grad_norm": 0.5510351142365912,
"learning_rate": 5e-06,
"loss": 0.6881,
"step": 1350
},
{
"epoch": 1.3647767185148019,
"grad_norm": 0.6470013078598107,
"learning_rate": 5e-06,
"loss": 0.6963,
"step": 1360
},
{
"epoch": 1.3748118414450576,
"grad_norm": 0.6435465190281996,
"learning_rate": 5e-06,
"loss": 0.6848,
"step": 1370
},
{
"epoch": 1.3848469643753136,
"grad_norm": 0.5592954158689228,
"learning_rate": 5e-06,
"loss": 0.6933,
"step": 1380
},
{
"epoch": 1.3948820873055694,
"grad_norm": 0.5565093643287172,
"learning_rate": 5e-06,
"loss": 0.6956,
"step": 1390
},
{
"epoch": 1.4049172102358254,
"grad_norm": 0.5429384088118309,
"learning_rate": 5e-06,
"loss": 0.6913,
"step": 1400
},
{
"epoch": 1.4149523331660814,
"grad_norm": 0.5259873486414379,
"learning_rate": 5e-06,
"loss": 0.6905,
"step": 1410
},
{
"epoch": 1.4249874560963371,
"grad_norm": 0.5791809050097347,
"learning_rate": 5e-06,
"loss": 0.6913,
"step": 1420
},
{
"epoch": 1.4350225790265931,
"grad_norm": 0.5409946987555362,
"learning_rate": 5e-06,
"loss": 0.6944,
"step": 1430
},
{
"epoch": 1.445057701956849,
"grad_norm": 0.6258615837964843,
"learning_rate": 5e-06,
"loss": 0.6906,
"step": 1440
},
{
"epoch": 1.455092824887105,
"grad_norm": 0.5255664654202296,
"learning_rate": 5e-06,
"loss": 0.6915,
"step": 1450
},
{
"epoch": 1.4651279478173609,
"grad_norm": 0.5157724202212534,
"learning_rate": 5e-06,
"loss": 0.6911,
"step": 1460
},
{
"epoch": 1.4751630707476167,
"grad_norm": 0.6020158761130551,
"learning_rate": 5e-06,
"loss": 0.692,
"step": 1470
},
{
"epoch": 1.4851981936778724,
"grad_norm": 0.5268622104694118,
"learning_rate": 5e-06,
"loss": 0.6884,
"step": 1480
},
{
"epoch": 1.4952333166081284,
"grad_norm": 0.6743718354665602,
"learning_rate": 5e-06,
"loss": 0.6886,
"step": 1490
},
{
"epoch": 1.5052684395383844,
"grad_norm": 0.5526860667382173,
"learning_rate": 5e-06,
"loss": 0.695,
"step": 1500
},
{
"epoch": 1.5153035624686404,
"grad_norm": 0.5659897839907012,
"learning_rate": 5e-06,
"loss": 0.6962,
"step": 1510
},
{
"epoch": 1.5253386853988962,
"grad_norm": 0.566194976984101,
"learning_rate": 5e-06,
"loss": 0.6953,
"step": 1520
},
{
"epoch": 1.535373808329152,
"grad_norm": 0.6884023263714835,
"learning_rate": 5e-06,
"loss": 0.6912,
"step": 1530
},
{
"epoch": 1.545408931259408,
"grad_norm": 0.6152652274544539,
"learning_rate": 5e-06,
"loss": 0.6881,
"step": 1540
},
{
"epoch": 1.555444054189664,
"grad_norm": 0.7014542676012229,
"learning_rate": 5e-06,
"loss": 0.6874,
"step": 1550
},
{
"epoch": 1.5654791771199197,
"grad_norm": 0.765002797379268,
"learning_rate": 5e-06,
"loss": 0.6923,
"step": 1560
},
{
"epoch": 1.5755143000501755,
"grad_norm": 0.5516453996184308,
"learning_rate": 5e-06,
"loss": 0.6896,
"step": 1570
},
{
"epoch": 1.5855494229804314,
"grad_norm": 0.5432337667581397,
"learning_rate": 5e-06,
"loss": 0.6903,
"step": 1580
},
{
"epoch": 1.5955845459106874,
"grad_norm": 0.5346527907725022,
"learning_rate": 5e-06,
"loss": 0.6917,
"step": 1590
},
{
"epoch": 1.6056196688409434,
"grad_norm": 0.5603099472061689,
"learning_rate": 5e-06,
"loss": 0.6913,
"step": 1600
},
{
"epoch": 1.6156547917711992,
"grad_norm": 0.6065488834635239,
"learning_rate": 5e-06,
"loss": 0.6855,
"step": 1610
},
{
"epoch": 1.625689914701455,
"grad_norm": 0.5274199389654457,
"learning_rate": 5e-06,
"loss": 0.689,
"step": 1620
},
{
"epoch": 1.635725037631711,
"grad_norm": 0.5120395521998542,
"learning_rate": 5e-06,
"loss": 0.6917,
"step": 1630
},
{
"epoch": 1.645760160561967,
"grad_norm": 0.5519879159635278,
"learning_rate": 5e-06,
"loss": 0.6844,
"step": 1640
},
{
"epoch": 1.655795283492223,
"grad_norm": 0.504366123389348,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 1650
},
{
"epoch": 1.6658304064224787,
"grad_norm": 0.5435020519283282,
"learning_rate": 5e-06,
"loss": 0.6985,
"step": 1660
},
{
"epoch": 1.6758655293527345,
"grad_norm": 0.6023059143733316,
"learning_rate": 5e-06,
"loss": 0.6899,
"step": 1670
},
{
"epoch": 1.6859006522829905,
"grad_norm": 0.6127499530451496,
"learning_rate": 5e-06,
"loss": 0.6901,
"step": 1680
},
{
"epoch": 1.6959357752132465,
"grad_norm": 0.584858369961145,
"learning_rate": 5e-06,
"loss": 0.6926,
"step": 1690
},
{
"epoch": 1.7059708981435022,
"grad_norm": 0.6802752379981577,
"learning_rate": 5e-06,
"loss": 0.6889,
"step": 1700
},
{
"epoch": 1.716006021073758,
"grad_norm": 0.6145503442957706,
"learning_rate": 5e-06,
"loss": 0.6938,
"step": 1710
},
{
"epoch": 1.726041144004014,
"grad_norm": 0.5806125711805925,
"learning_rate": 5e-06,
"loss": 0.6892,
"step": 1720
},
{
"epoch": 1.73607626693427,
"grad_norm": 0.5164505361265845,
"learning_rate": 5e-06,
"loss": 0.6876,
"step": 1730
},
{
"epoch": 1.746111389864526,
"grad_norm": 0.518409684362953,
"learning_rate": 5e-06,
"loss": 0.6914,
"step": 1740
},
{
"epoch": 1.7561465127947817,
"grad_norm": 0.5798099508033144,
"learning_rate": 5e-06,
"loss": 0.6936,
"step": 1750
},
{
"epoch": 1.7661816357250375,
"grad_norm": 0.5397031230350496,
"learning_rate": 5e-06,
"loss": 0.6884,
"step": 1760
},
{
"epoch": 1.7762167586552935,
"grad_norm": 0.5101108640235097,
"learning_rate": 5e-06,
"loss": 0.6861,
"step": 1770
},
{
"epoch": 1.7862518815855495,
"grad_norm": 0.5577428782679603,
"learning_rate": 5e-06,
"loss": 0.6884,
"step": 1780
},
{
"epoch": 1.7962870045158055,
"grad_norm": 0.5088658014753255,
"learning_rate": 5e-06,
"loss": 0.696,
"step": 1790
},
{
"epoch": 1.8063221274460612,
"grad_norm": 0.5834405015876643,
"learning_rate": 5e-06,
"loss": 0.6936,
"step": 1800
},
{
"epoch": 1.816357250376317,
"grad_norm": 0.5223134455064544,
"learning_rate": 5e-06,
"loss": 0.6894,
"step": 1810
},
{
"epoch": 1.826392373306573,
"grad_norm": 0.5218318397593602,
"learning_rate": 5e-06,
"loss": 0.6901,
"step": 1820
},
{
"epoch": 1.836427496236829,
"grad_norm": 0.6595429027005055,
"learning_rate": 5e-06,
"loss": 0.6878,
"step": 1830
},
{
"epoch": 1.8464626191670848,
"grad_norm": 0.5041069952524779,
"learning_rate": 5e-06,
"loss": 0.691,
"step": 1840
},
{
"epoch": 1.8564977420973405,
"grad_norm": 0.6147969409931642,
"learning_rate": 5e-06,
"loss": 0.6925,
"step": 1850
},
{
"epoch": 1.8665328650275965,
"grad_norm": 0.5123520217070617,
"learning_rate": 5e-06,
"loss": 0.6864,
"step": 1860
},
{
"epoch": 1.8765679879578525,
"grad_norm": 0.5168471950711845,
"learning_rate": 5e-06,
"loss": 0.6885,
"step": 1870
},
{
"epoch": 1.8866031108881085,
"grad_norm": 0.5930018846461471,
"learning_rate": 5e-06,
"loss": 0.6861,
"step": 1880
},
{
"epoch": 1.8966382338183643,
"grad_norm": 0.5616874309006329,
"learning_rate": 5e-06,
"loss": 0.6895,
"step": 1890
},
{
"epoch": 1.90667335674862,
"grad_norm": 0.5473178004684522,
"learning_rate": 5e-06,
"loss": 0.6891,
"step": 1900
},
{
"epoch": 1.916708479678876,
"grad_norm": 0.498555944884513,
"learning_rate": 5e-06,
"loss": 0.6867,
"step": 1910
},
{
"epoch": 1.926743602609132,
"grad_norm": 0.5781525233352894,
"learning_rate": 5e-06,
"loss": 0.6936,
"step": 1920
},
{
"epoch": 1.936778725539388,
"grad_norm": 0.6304670711001885,
"learning_rate": 5e-06,
"loss": 0.6882,
"step": 1930
},
{
"epoch": 1.9468138484696438,
"grad_norm": 0.59342699382766,
"learning_rate": 5e-06,
"loss": 0.6875,
"step": 1940
},
{
"epoch": 1.9568489713998996,
"grad_norm": 0.6911703835056067,
"learning_rate": 5e-06,
"loss": 0.6879,
"step": 1950
},
{
"epoch": 1.9668840943301555,
"grad_norm": 0.7568921959517525,
"learning_rate": 5e-06,
"loss": 0.687,
"step": 1960
},
{
"epoch": 1.9769192172604115,
"grad_norm": 0.541514579573604,
"learning_rate": 5e-06,
"loss": 0.6891,
"step": 1970
},
{
"epoch": 1.9869543401906673,
"grad_norm": 0.5249988740865994,
"learning_rate": 5e-06,
"loss": 0.688,
"step": 1980
},
{
"epoch": 1.996989463120923,
"grad_norm": 0.518981463681463,
"learning_rate": 5e-06,
"loss": 0.6916,
"step": 1990
},
{
"epoch": 2.0,
"eval_loss": 0.7170566320419312,
"eval_runtime": 534.2939,
"eval_samples_per_second": 50.251,
"eval_steps_per_second": 0.393,
"step": 1993
},
{
"epoch": 2.007024586051179,
"grad_norm": 0.6758178418746282,
"learning_rate": 5e-06,
"loss": 0.6927,
"step": 2000
},
{
"epoch": 2.017059708981435,
"grad_norm": 0.6409143667352257,
"learning_rate": 5e-06,
"loss": 0.6458,
"step": 2010
},
{
"epoch": 2.027094831911691,
"grad_norm": 0.6399574034808028,
"learning_rate": 5e-06,
"loss": 0.6452,
"step": 2020
},
{
"epoch": 2.037129954841947,
"grad_norm": 0.6398494060833938,
"learning_rate": 5e-06,
"loss": 0.6464,
"step": 2030
},
{
"epoch": 2.0471650777722026,
"grad_norm": 0.6225102571911315,
"learning_rate": 5e-06,
"loss": 0.6441,
"step": 2040
},
{
"epoch": 2.0572002007024586,
"grad_norm": 0.607220858160029,
"learning_rate": 5e-06,
"loss": 0.6488,
"step": 2050
},
{
"epoch": 2.0672353236327146,
"grad_norm": 0.6296613660669085,
"learning_rate": 5e-06,
"loss": 0.6469,
"step": 2060
},
{
"epoch": 2.0772704465629706,
"grad_norm": 0.6997962877873833,
"learning_rate": 5e-06,
"loss": 0.6442,
"step": 2070
},
{
"epoch": 2.087305569493226,
"grad_norm": 0.5565086168492744,
"learning_rate": 5e-06,
"loss": 0.6501,
"step": 2080
},
{
"epoch": 2.097340692423482,
"grad_norm": 0.5570529145692124,
"learning_rate": 5e-06,
"loss": 0.6509,
"step": 2090
},
{
"epoch": 2.107375815353738,
"grad_norm": 0.5387202610597639,
"learning_rate": 5e-06,
"loss": 0.6476,
"step": 2100
},
{
"epoch": 2.117410938283994,
"grad_norm": 0.5555720032343792,
"learning_rate": 5e-06,
"loss": 0.6451,
"step": 2110
},
{
"epoch": 2.12744606121425,
"grad_norm": 0.549298003440315,
"learning_rate": 5e-06,
"loss": 0.6452,
"step": 2120
},
{
"epoch": 2.1374811841445056,
"grad_norm": 0.7025357677002765,
"learning_rate": 5e-06,
"loss": 0.6501,
"step": 2130
},
{
"epoch": 2.1475163070747616,
"grad_norm": 0.6933827951492744,
"learning_rate": 5e-06,
"loss": 0.6483,
"step": 2140
},
{
"epoch": 2.1575514300050176,
"grad_norm": 0.5931050285459404,
"learning_rate": 5e-06,
"loss": 0.6488,
"step": 2150
},
{
"epoch": 2.1675865529352736,
"grad_norm": 0.5619532741142755,
"learning_rate": 5e-06,
"loss": 0.6448,
"step": 2160
},
{
"epoch": 2.177621675865529,
"grad_norm": 0.5451448701863834,
"learning_rate": 5e-06,
"loss": 0.6444,
"step": 2170
},
{
"epoch": 2.187656798795785,
"grad_norm": 0.48428533410635616,
"learning_rate": 5e-06,
"loss": 0.6495,
"step": 2180
},
{
"epoch": 2.197691921726041,
"grad_norm": 0.5479783043613699,
"learning_rate": 5e-06,
"loss": 0.6525,
"step": 2190
},
{
"epoch": 2.207727044656297,
"grad_norm": 0.5880029735313238,
"learning_rate": 5e-06,
"loss": 0.6492,
"step": 2200
},
{
"epoch": 2.217762167586553,
"grad_norm": 0.518071247803138,
"learning_rate": 5e-06,
"loss": 0.6531,
"step": 2210
},
{
"epoch": 2.2277972905168086,
"grad_norm": 0.5525479511919598,
"learning_rate": 5e-06,
"loss": 0.6486,
"step": 2220
},
{
"epoch": 2.2378324134470646,
"grad_norm": 0.5930736784074986,
"learning_rate": 5e-06,
"loss": 0.6477,
"step": 2230
},
{
"epoch": 2.2478675363773206,
"grad_norm": 0.5191764875817738,
"learning_rate": 5e-06,
"loss": 0.6476,
"step": 2240
},
{
"epoch": 2.2579026593075766,
"grad_norm": 0.5226745835573395,
"learning_rate": 5e-06,
"loss": 0.6468,
"step": 2250
},
{
"epoch": 2.2679377822378326,
"grad_norm": 0.594787801985729,
"learning_rate": 5e-06,
"loss": 0.6493,
"step": 2260
},
{
"epoch": 2.277972905168088,
"grad_norm": 0.6593644795439105,
"learning_rate": 5e-06,
"loss": 0.6494,
"step": 2270
},
{
"epoch": 2.288008028098344,
"grad_norm": 0.5809194962718293,
"learning_rate": 5e-06,
"loss": 0.6488,
"step": 2280
},
{
"epoch": 2.2980431510286,
"grad_norm": 0.5769100613769682,
"learning_rate": 5e-06,
"loss": 0.6443,
"step": 2290
},
{
"epoch": 2.308078273958856,
"grad_norm": 0.5798022574438232,
"learning_rate": 5e-06,
"loss": 0.6511,
"step": 2300
},
{
"epoch": 2.318113396889112,
"grad_norm": 0.5500840976868079,
"learning_rate": 5e-06,
"loss": 0.6521,
"step": 2310
},
{
"epoch": 2.3281485198193677,
"grad_norm": 0.557151649596109,
"learning_rate": 5e-06,
"loss": 0.6499,
"step": 2320
},
{
"epoch": 2.3381836427496236,
"grad_norm": 0.5206053660885596,
"learning_rate": 5e-06,
"loss": 0.6497,
"step": 2330
},
{
"epoch": 2.3482187656798796,
"grad_norm": 0.5151956126043011,
"learning_rate": 5e-06,
"loss": 0.6465,
"step": 2340
},
{
"epoch": 2.3582538886101356,
"grad_norm": 0.6243860622771431,
"learning_rate": 5e-06,
"loss": 0.6471,
"step": 2350
},
{
"epoch": 2.368289011540391,
"grad_norm": 0.5551415160151735,
"learning_rate": 5e-06,
"loss": 0.6499,
"step": 2360
},
{
"epoch": 2.378324134470647,
"grad_norm": 0.5399591771385078,
"learning_rate": 5e-06,
"loss": 0.6542,
"step": 2370
},
{
"epoch": 2.388359257400903,
"grad_norm": 0.7051292001377757,
"learning_rate": 5e-06,
"loss": 0.6519,
"step": 2380
},
{
"epoch": 2.398394380331159,
"grad_norm": 0.7249734533415666,
"learning_rate": 5e-06,
"loss": 0.6453,
"step": 2390
},
{
"epoch": 2.408429503261415,
"grad_norm": 0.5667898165098878,
"learning_rate": 5e-06,
"loss": 0.6502,
"step": 2400
},
{
"epoch": 2.4184646261916707,
"grad_norm": 0.5714404629239772,
"learning_rate": 5e-06,
"loss": 0.651,
"step": 2410
},
{
"epoch": 2.4284997491219267,
"grad_norm": 0.5776379885611598,
"learning_rate": 5e-06,
"loss": 0.6533,
"step": 2420
},
{
"epoch": 2.4385348720521827,
"grad_norm": 0.5720433924252879,
"learning_rate": 5e-06,
"loss": 0.6495,
"step": 2430
},
{
"epoch": 2.4485699949824387,
"grad_norm": 0.5385010214829424,
"learning_rate": 5e-06,
"loss": 0.6535,
"step": 2440
},
{
"epoch": 2.458605117912694,
"grad_norm": 0.7858581581353575,
"learning_rate": 5e-06,
"loss": 0.6501,
"step": 2450
},
{
"epoch": 2.46864024084295,
"grad_norm": 0.5254373337828374,
"learning_rate": 5e-06,
"loss": 0.6514,
"step": 2460
},
{
"epoch": 2.478675363773206,
"grad_norm": 0.5927951089454341,
"learning_rate": 5e-06,
"loss": 0.6567,
"step": 2470
},
{
"epoch": 2.488710486703462,
"grad_norm": 0.576333054967198,
"learning_rate": 5e-06,
"loss": 0.6468,
"step": 2480
},
{
"epoch": 2.498745609633718,
"grad_norm": 0.5997091587436316,
"learning_rate": 5e-06,
"loss": 0.6494,
"step": 2490
},
{
"epoch": 2.5087807325639737,
"grad_norm": 0.5774761407070679,
"learning_rate": 5e-06,
"loss": 0.6575,
"step": 2500
},
{
"epoch": 2.5188158554942297,
"grad_norm": 0.5697277316039128,
"learning_rate": 5e-06,
"loss": 0.6529,
"step": 2510
},
{
"epoch": 2.5288509784244857,
"grad_norm": 0.5523271421741578,
"learning_rate": 5e-06,
"loss": 0.6486,
"step": 2520
},
{
"epoch": 2.5388861013547417,
"grad_norm": 0.5308441673869406,
"learning_rate": 5e-06,
"loss": 0.6479,
"step": 2530
},
{
"epoch": 2.5489212242849977,
"grad_norm": 0.611658706018765,
"learning_rate": 5e-06,
"loss": 0.6515,
"step": 2540
},
{
"epoch": 2.5589563472152532,
"grad_norm": 0.5882952161403756,
"learning_rate": 5e-06,
"loss": 0.6529,
"step": 2550
},
{
"epoch": 2.568991470145509,
"grad_norm": 0.5944397200737397,
"learning_rate": 5e-06,
"loss": 0.6486,
"step": 2560
},
{
"epoch": 2.579026593075765,
"grad_norm": 0.6211121511651304,
"learning_rate": 5e-06,
"loss": 0.6511,
"step": 2570
},
{
"epoch": 2.589061716006021,
"grad_norm": 0.5687379330786366,
"learning_rate": 5e-06,
"loss": 0.6599,
"step": 2580
},
{
"epoch": 2.599096838936277,
"grad_norm": 0.5264826150946973,
"learning_rate": 5e-06,
"loss": 0.6475,
"step": 2590
},
{
"epoch": 2.6091319618665327,
"grad_norm": 0.546285649845265,
"learning_rate": 5e-06,
"loss": 0.6531,
"step": 2600
},
{
"epoch": 2.6191670847967887,
"grad_norm": 0.5016403997355949,
"learning_rate": 5e-06,
"loss": 0.6532,
"step": 2610
},
{
"epoch": 2.6292022077270447,
"grad_norm": 0.514946202260049,
"learning_rate": 5e-06,
"loss": 0.6476,
"step": 2620
},
{
"epoch": 2.6392373306573007,
"grad_norm": 0.5768998926592247,
"learning_rate": 5e-06,
"loss": 0.6499,
"step": 2630
},
{
"epoch": 2.6492724535875567,
"grad_norm": 0.56824673619337,
"learning_rate": 5e-06,
"loss": 0.6515,
"step": 2640
},
{
"epoch": 2.6593075765178122,
"grad_norm": 0.5743894544574858,
"learning_rate": 5e-06,
"loss": 0.6528,
"step": 2650
},
{
"epoch": 2.6693426994480682,
"grad_norm": 0.6566059771585782,
"learning_rate": 5e-06,
"loss": 0.6482,
"step": 2660
},
{
"epoch": 2.6793778223783242,
"grad_norm": 0.6222476495314778,
"learning_rate": 5e-06,
"loss": 0.6521,
"step": 2670
},
{
"epoch": 2.6894129453085798,
"grad_norm": 0.5352433226802004,
"learning_rate": 5e-06,
"loss": 0.6511,
"step": 2680
},
{
"epoch": 2.6994480682388358,
"grad_norm": 0.5593740996298296,
"learning_rate": 5e-06,
"loss": 0.6502,
"step": 2690
},
{
"epoch": 2.7094831911690918,
"grad_norm": 0.5870512541625679,
"learning_rate": 5e-06,
"loss": 0.6509,
"step": 2700
},
{
"epoch": 2.7195183140993477,
"grad_norm": 0.5194458048924404,
"learning_rate": 5e-06,
"loss": 0.6522,
"step": 2710
},
{
"epoch": 2.7295534370296037,
"grad_norm": 0.5207337998079029,
"learning_rate": 5e-06,
"loss": 0.6513,
"step": 2720
},
{
"epoch": 2.7395885599598593,
"grad_norm": 0.543790751828564,
"learning_rate": 5e-06,
"loss": 0.6551,
"step": 2730
},
{
"epoch": 2.7496236828901153,
"grad_norm": 0.5863597318330472,
"learning_rate": 5e-06,
"loss": 0.6545,
"step": 2740
},
{
"epoch": 2.7596588058203713,
"grad_norm": 0.5388170321853544,
"learning_rate": 5e-06,
"loss": 0.6538,
"step": 2750
},
{
"epoch": 2.7696939287506273,
"grad_norm": 0.5742236049971658,
"learning_rate": 5e-06,
"loss": 0.6583,
"step": 2760
},
{
"epoch": 2.7797290516808832,
"grad_norm": 0.5384373778506647,
"learning_rate": 5e-06,
"loss": 0.6545,
"step": 2770
},
{
"epoch": 2.789764174611139,
"grad_norm": 0.5345134774106539,
"learning_rate": 5e-06,
"loss": 0.6508,
"step": 2780
},
{
"epoch": 2.799799297541395,
"grad_norm": 0.6099706934146881,
"learning_rate": 5e-06,
"loss": 0.6534,
"step": 2790
},
{
"epoch": 2.8098344204716508,
"grad_norm": 0.7684214994129063,
"learning_rate": 5e-06,
"loss": 0.6552,
"step": 2800
},
{
"epoch": 2.8198695434019068,
"grad_norm": 0.6812429404931887,
"learning_rate": 5e-06,
"loss": 0.6541,
"step": 2810
},
{
"epoch": 2.8299046663321628,
"grad_norm": 0.5551480308045381,
"learning_rate": 5e-06,
"loss": 0.648,
"step": 2820
},
{
"epoch": 2.8399397892624183,
"grad_norm": 0.5783011582533801,
"learning_rate": 5e-06,
"loss": 0.6538,
"step": 2830
},
{
"epoch": 2.8499749121926743,
"grad_norm": 0.6083930556812827,
"learning_rate": 5e-06,
"loss": 0.652,
"step": 2840
},
{
"epoch": 2.8600100351229303,
"grad_norm": 0.6412044060911565,
"learning_rate": 5e-06,
"loss": 0.6574,
"step": 2850
},
{
"epoch": 2.8700451580531863,
"grad_norm": 0.6641263630618335,
"learning_rate": 5e-06,
"loss": 0.6533,
"step": 2860
},
{
"epoch": 2.8800802809834423,
"grad_norm": 0.7128854398954438,
"learning_rate": 5e-06,
"loss": 0.6516,
"step": 2870
},
{
"epoch": 2.890115403913698,
"grad_norm": 0.6732370461622078,
"learning_rate": 5e-06,
"loss": 0.6542,
"step": 2880
},
{
"epoch": 2.900150526843954,
"grad_norm": 0.5615551950157813,
"learning_rate": 5e-06,
"loss": 0.6517,
"step": 2890
},
{
"epoch": 2.91018564977421,
"grad_norm": 0.5360864764156098,
"learning_rate": 5e-06,
"loss": 0.6529,
"step": 2900
},
{
"epoch": 2.920220772704466,
"grad_norm": 0.5686095181581269,
"learning_rate": 5e-06,
"loss": 0.6522,
"step": 2910
},
{
"epoch": 2.9302558956347218,
"grad_norm": 0.49222391395442017,
"learning_rate": 5e-06,
"loss": 0.6508,
"step": 2920
},
{
"epoch": 2.9402910185649773,
"grad_norm": 0.5723888567358063,
"learning_rate": 5e-06,
"loss": 0.6497,
"step": 2930
},
{
"epoch": 2.9503261414952333,
"grad_norm": 0.6277028838019034,
"learning_rate": 5e-06,
"loss": 0.6548,
"step": 2940
},
{
"epoch": 2.9603612644254893,
"grad_norm": 0.6499248602518872,
"learning_rate": 5e-06,
"loss": 0.6539,
"step": 2950
},
{
"epoch": 2.970396387355745,
"grad_norm": 0.5523665140419113,
"learning_rate": 5e-06,
"loss": 0.6567,
"step": 2960
},
{
"epoch": 2.980431510286001,
"grad_norm": 0.5253539559863383,
"learning_rate": 5e-06,
"loss": 0.6547,
"step": 2970
},
{
"epoch": 2.990466633216257,
"grad_norm": 0.5665529328640058,
"learning_rate": 5e-06,
"loss": 0.6516,
"step": 2980
},
{
"epoch": 2.9984947315604615,
"eval_loss": 0.714939534664154,
"eval_runtime": 532.5929,
"eval_samples_per_second": 50.412,
"eval_steps_per_second": 0.394,
"step": 2988
},
{
"epoch": 2.9984947315604615,
"step": 2988,
"total_flos": 5004592530063360.0,
"train_loss": 0.7032094593826866,
"train_runtime": 89126.24,
"train_samples_per_second": 17.171,
"train_steps_per_second": 0.034
}
],
"logging_steps": 10,
"max_steps": 2988,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5004592530063360.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}