{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 312,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00641025641025641,
      "grad_norm": 3.8148568052575884,
      "learning_rate": 1.282051282051282e-07,
      "loss": 4.889,
      "step": 1
    },
    {
      "epoch": 0.01282051282051282,
      "grad_norm": 4.453444589892027,
      "learning_rate": 2.564102564102564e-07,
      "loss": 4.9097,
      "step": 2
    },
    {
      "epoch": 0.02564102564102564,
      "grad_norm": 4.896614258621833,
      "learning_rate": 5.128205128205128e-07,
      "loss": 4.9099,
      "step": 4
    },
    {
      "epoch": 0.038461538461538464,
      "grad_norm": 4.456576485464451,
      "learning_rate": 7.692307692307694e-07,
      "loss": 4.9102,
      "step": 6
    },
    {
      "epoch": 0.05128205128205128,
      "grad_norm": 4.193427815120892,
      "learning_rate": 1.0256410256410257e-06,
      "loss": 4.8924,
      "step": 8
    },
    {
      "epoch": 0.0641025641025641,
      "grad_norm": 3.6726747534666555,
      "learning_rate": 1.282051282051282e-06,
      "loss": 4.8372,
      "step": 10
    },
    {
      "epoch": 0.07692307692307693,
      "grad_norm": 3.337981680961211,
      "learning_rate": 1.5384615384615387e-06,
      "loss": 4.7794,
      "step": 12
    },
    {
      "epoch": 0.08974358974358974,
      "grad_norm": 2.675890453922504,
      "learning_rate": 1.794871794871795e-06,
      "loss": 4.6191,
      "step": 14
    },
    {
      "epoch": 0.10256410256410256,
      "grad_norm": 2.398848700299253,
      "learning_rate": 2.0512820512820513e-06,
      "loss": 4.5723,
      "step": 16
    },
    {
      "epoch": 0.11538461538461539,
      "grad_norm": 1.8159784961859098,
      "learning_rate": 2.307692307692308e-06,
      "loss": 4.3568,
      "step": 18
    },
    {
      "epoch": 0.1282051282051282,
      "grad_norm": 1.6094220673057946,
      "learning_rate": 2.564102564102564e-06,
      "loss": 4.2686,
      "step": 20
    },
    {
      "epoch": 0.14102564102564102,
      "grad_norm": 1.4349818434671497,
      "learning_rate": 2.8205128205128207e-06,
      "loss": 4.169,
      "step": 22
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 1.4412559958198408,
      "learning_rate": 3.0769230769230774e-06,
      "loss": 4.0415,
      "step": 24
    },
    {
      "epoch": 0.16666666666666666,
      "grad_norm": 1.3626982007755366,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 3.8569,
      "step": 26
    },
    {
      "epoch": 0.1794871794871795,
      "grad_norm": 1.3679096739652512,
      "learning_rate": 3.58974358974359e-06,
      "loss": 3.7409,
      "step": 28
    },
    {
      "epoch": 0.19230769230769232,
      "grad_norm": 1.3396391976584703,
      "learning_rate": 3.846153846153847e-06,
      "loss": 3.6585,
      "step": 30
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 1.294876480457606,
      "learning_rate": 4.102564102564103e-06,
      "loss": 3.4961,
      "step": 32
    },
    {
      "epoch": 0.21794871794871795,
      "grad_norm": 1.103820056614455,
      "learning_rate": 4.358974358974359e-06,
      "loss": 3.3518,
      "step": 34
    },
    {
      "epoch": 0.23076923076923078,
      "grad_norm": 1.0522131115906572,
      "learning_rate": 4.615384615384616e-06,
      "loss": 3.1984,
      "step": 36
    },
    {
      "epoch": 0.24358974358974358,
      "grad_norm": 1.0081732884085817,
      "learning_rate": 4.871794871794872e-06,
      "loss": 3.054,
      "step": 38
    },
    {
      "epoch": 0.2564102564102564,
      "grad_norm": 0.9214039999549644,
      "learning_rate": 5.128205128205128e-06,
      "loss": 2.8628,
      "step": 40
    },
    {
      "epoch": 0.2692307692307692,
      "grad_norm": 0.8143994876297143,
      "learning_rate": 5.384615384615385e-06,
      "loss": 2.7475,
      "step": 42
    },
    {
      "epoch": 0.28205128205128205,
      "grad_norm": 0.700891765547207,
      "learning_rate": 5.641025641025641e-06,
      "loss": 2.5869,
      "step": 44
    },
    {
      "epoch": 0.2948717948717949,
      "grad_norm": 0.7510674065754775,
      "learning_rate": 5.897435897435898e-06,
      "loss": 2.4461,
      "step": 46
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.6794074940373539,
      "learning_rate": 6.153846153846155e-06,
      "loss": 2.3477,
      "step": 48
    },
    {
      "epoch": 0.32051282051282054,
      "grad_norm": 0.5162215042692575,
      "learning_rate": 6.410256410256412e-06,
      "loss": 2.2152,
      "step": 50
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 0.5146975027904754,
      "learning_rate": 6.666666666666667e-06,
      "loss": 2.1975,
      "step": 52
    },
    {
      "epoch": 0.34615384615384615,
      "grad_norm": 0.4474574545979082,
      "learning_rate": 6.923076923076923e-06,
      "loss": 2.0824,
      "step": 54
    },
    {
      "epoch": 0.358974358974359,
      "grad_norm": 0.40379510918119965,
      "learning_rate": 7.17948717948718e-06,
      "loss": 2.0388,
      "step": 56
    },
    {
      "epoch": 0.3717948717948718,
      "grad_norm": 0.4109144194248555,
      "learning_rate": 7.435897435897437e-06,
      "loss": 1.9699,
      "step": 58
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 0.36878556755849573,
      "learning_rate": 7.692307692307694e-06,
      "loss": 1.9252,
      "step": 60
    },
    {
      "epoch": 0.3974358974358974,
      "grad_norm": 0.33951214974325605,
      "learning_rate": 7.948717948717949e-06,
      "loss": 1.8773,
      "step": 62
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.31625266306424027,
      "learning_rate": 8.205128205128205e-06,
      "loss": 1.7966,
      "step": 64
    },
    {
      "epoch": 0.4230769230769231,
      "grad_norm": 0.7180890498799148,
      "learning_rate": 8.461538461538462e-06,
      "loss": 1.8108,
      "step": 66
    },
    {
      "epoch": 0.4358974358974359,
      "grad_norm": 0.33704662479371716,
      "learning_rate": 8.717948717948719e-06,
      "loss": 1.7498,
      "step": 68
    },
    {
      "epoch": 0.44871794871794873,
      "grad_norm": 0.2761824271642518,
      "learning_rate": 8.974358974358976e-06,
      "loss": 1.7124,
      "step": 70
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 0.24386286193528572,
      "learning_rate": 9.230769230769232e-06,
      "loss": 1.6382,
      "step": 72
    },
    {
      "epoch": 0.47435897435897434,
      "grad_norm": 0.25885451676676363,
      "learning_rate": 9.487179487179487e-06,
      "loss": 1.6588,
      "step": 74
    },
    {
      "epoch": 0.48717948717948717,
      "grad_norm": 0.3040030663690383,
      "learning_rate": 9.743589743589744e-06,
      "loss": 1.6209,
      "step": 76
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.26598080566137733,
      "learning_rate": 1e-05,
      "loss": 1.6294,
      "step": 78
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 0.22696288673824674,
      "learning_rate": 9.99995506314361e-06,
      "loss": 1.58,
      "step": 80
    },
    {
      "epoch": 0.5256410256410257,
      "grad_norm": 0.21242259411358655,
      "learning_rate": 9.99982025338217e-06,
      "loss": 1.5439,
      "step": 82
    },
    {
      "epoch": 0.5384615384615384,
      "grad_norm": 0.20291826899403465,
      "learning_rate": 9.999595573138845e-06,
      "loss": 1.5274,
      "step": 84
    },
    {
      "epoch": 0.5512820512820513,
      "grad_norm": 0.1855444412322797,
      "learning_rate": 9.99928102645221e-06,
      "loss": 1.5161,
      "step": 86
    },
    {
      "epoch": 0.5641025641025641,
      "grad_norm": 0.17883874148398324,
      "learning_rate": 9.99887661897616e-06,
      "loss": 1.4916,
      "step": 88
    },
    {
      "epoch": 0.5769230769230769,
      "grad_norm": 0.17041478792908024,
      "learning_rate": 9.99838235797981e-06,
      "loss": 1.4679,
      "step": 90
    },
    {
      "epoch": 0.5897435897435898,
      "grad_norm": 0.1904762198987749,
      "learning_rate": 9.997798252347382e-06,
      "loss": 1.471,
      "step": 92
    },
    {
      "epoch": 0.6025641025641025,
      "grad_norm": 0.19077041355708335,
      "learning_rate": 9.99712431257802e-06,
      "loss": 1.4672,
      "step": 94
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.1702104328191874,
      "learning_rate": 9.996360550785619e-06,
      "loss": 1.4455,
      "step": 96
    },
    {
      "epoch": 0.6282051282051282,
      "grad_norm": 0.19039133859515542,
      "learning_rate": 9.9955069806986e-06,
      "loss": 1.4727,
      "step": 98
    },
    {
      "epoch": 0.6410256410256411,
      "grad_norm": 0.15448238517128507,
      "learning_rate": 9.994563617659665e-06,
      "loss": 1.4257,
      "step": 100
    },
    {
      "epoch": 0.6538461538461539,
      "grad_norm": 0.15202351051018634,
      "learning_rate": 9.993530478625524e-06,
      "loss": 1.4214,
      "step": 102
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.16296598133044526,
      "learning_rate": 9.992407582166582e-06,
      "loss": 1.4213,
      "step": 104
    },
    {
      "epoch": 0.6794871794871795,
      "grad_norm": 0.1462038294164801,
      "learning_rate": 9.991194948466615e-06,
      "loss": 1.3993,
      "step": 106
    },
    {
      "epoch": 0.6923076923076923,
      "grad_norm": 0.14470989191451086,
      "learning_rate": 9.989892599322404e-06,
      "loss": 1.4014,
      "step": 108
    },
    {
      "epoch": 0.7051282051282052,
      "grad_norm": 0.15440545758233384,
      "learning_rate": 9.988500558143337e-06,
      "loss": 1.3878,
      "step": 110
    },
    {
      "epoch": 0.717948717948718,
      "grad_norm": 0.1412948019214843,
      "learning_rate": 9.987018849950996e-06,
      "loss": 1.355,
      "step": 112
    },
    {
      "epoch": 0.7307692307692307,
      "grad_norm": 0.15156074653795895,
      "learning_rate": 9.985447501378706e-06,
      "loss": 1.3642,
      "step": 114
    },
    {
      "epoch": 0.7435897435897436,
      "grad_norm": 0.3875845143038168,
      "learning_rate": 9.983786540671052e-06,
      "loss": 1.3797,
      "step": 116
    },
    {
      "epoch": 0.7564102564102564,
      "grad_norm": 0.15788537547887518,
      "learning_rate": 9.982035997683372e-06,
      "loss": 1.3388,
      "step": 118
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.15056320914445512,
      "learning_rate": 9.980195903881231e-06,
      "loss": 1.343,
      "step": 120
    },
    {
      "epoch": 0.782051282051282,
      "grad_norm": 0.1555129283317706,
      "learning_rate": 9.978266292339838e-06,
      "loss": 1.328,
      "step": 122
    },
    {
      "epoch": 0.7948717948717948,
      "grad_norm": 0.14999182496915453,
      "learning_rate": 9.976247197743465e-06,
      "loss": 1.352,
      "step": 124
    },
    {
      "epoch": 0.8076923076923077,
      "grad_norm": 0.14124313426191026,
      "learning_rate": 9.974138656384815e-06,
      "loss": 1.3243,
      "step": 126
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.1378326204862212,
      "learning_rate": 9.97194070616438e-06,
      "loss": 1.3241,
      "step": 128
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 0.14227960534974604,
      "learning_rate": 9.969653386589749e-06,
      "loss": 1.3219,
      "step": 130
    },
    {
      "epoch": 0.8461538461538461,
      "grad_norm": 0.12713543749272155,
      "learning_rate": 9.967276738774897e-06,
      "loss": 1.3096,
      "step": 132
    },
    {
      "epoch": 0.8589743589743589,
      "grad_norm": 0.15061232362563903,
      "learning_rate": 9.964810805439464e-06,
      "loss": 1.3011,
      "step": 134
    },
    {
      "epoch": 0.8717948717948718,
      "grad_norm": 0.14361563348990292,
      "learning_rate": 9.962255630907964e-06,
      "loss": 1.2827,
      "step": 136
    },
    {
      "epoch": 0.8846153846153846,
      "grad_norm": 0.17754387209035652,
      "learning_rate": 9.959611261108999e-06,
      "loss": 1.3185,
      "step": 138
    },
    {
      "epoch": 0.8974358974358975,
      "grad_norm": 0.1458623897430443,
      "learning_rate": 9.956877743574437e-06,
      "loss": 1.3286,
      "step": 140
    },
    {
      "epoch": 0.9102564102564102,
      "grad_norm": 0.14084398418567437,
      "learning_rate": 9.954055127438554e-06,
      "loss": 1.3005,
      "step": 142
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.13580861113069753,
      "learning_rate": 9.951143463437145e-06,
      "loss": 1.3165,
      "step": 144
    },
    {
      "epoch": 0.9358974358974359,
      "grad_norm": 0.13622051889734035,
      "learning_rate": 9.948142803906623e-06,
      "loss": 1.2929,
      "step": 146
    },
    {
      "epoch": 0.9487179487179487,
      "grad_norm": 0.12679082371935066,
      "learning_rate": 9.94505320278307e-06,
      "loss": 1.2833,
      "step": 148
    },
    {
      "epoch": 0.9615384615384616,
      "grad_norm": 0.11939382079952243,
      "learning_rate": 9.94187471560127e-06,
      "loss": 1.2851,
      "step": 150
    },
    {
      "epoch": 0.9743589743589743,
      "grad_norm": 0.11752490134274678,
      "learning_rate": 9.938607399493714e-06,
      "loss": 1.2559,
      "step": 152
    },
    {
      "epoch": 0.9871794871794872,
      "grad_norm": 0.11807212671773365,
      "learning_rate": 9.935251313189564e-06,
      "loss": 1.285,
      "step": 154
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.1120761333795772,
      "learning_rate": 9.931806517013612e-06,
      "loss": 1.2491,
      "step": 156
    },
    {
      "epoch": 1.0128205128205128,
      "grad_norm": 0.10750345822189263,
      "learning_rate": 9.92827307288518e-06,
      "loss": 1.2442,
      "step": 158
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.10918642022881683,
      "learning_rate": 9.924651044317017e-06,
      "loss": 1.2286,
      "step": 160
    },
    {
      "epoch": 1.0384615384615385,
      "grad_norm": 0.11225330042691335,
      "learning_rate": 9.920940496414153e-06,
      "loss": 1.2158,
      "step": 162
    },
    {
      "epoch": 1.0512820512820513,
      "grad_norm": 0.11366482652198566,
      "learning_rate": 9.917141495872733e-06,
      "loss": 1.2074,
      "step": 164
    },
    {
      "epoch": 1.064102564102564,
      "grad_norm": 0.12295651003296312,
      "learning_rate": 9.913254110978812e-06,
      "loss": 1.2003,
      "step": 166
    },
    {
      "epoch": 1.0769230769230769,
      "grad_norm": 0.1144456030840293,
      "learning_rate": 9.909278411607134e-06,
      "loss": 1.206,
      "step": 168
    },
    {
      "epoch": 1.0897435897435896,
      "grad_norm": 0.2468334129961725,
      "learning_rate": 9.90521446921987e-06,
      "loss": 1.2235,
      "step": 170
    },
    {
      "epoch": 1.1025641025641026,
      "grad_norm": 0.127278158070263,
      "learning_rate": 9.90106235686534e-06,
      "loss": 1.1928,
      "step": 172
    },
    {
      "epoch": 1.1153846153846154,
      "grad_norm": 0.1280282060730887,
      "learning_rate": 9.896822149176695e-06,
      "loss": 1.2068,
      "step": 174
    },
    {
      "epoch": 1.1282051282051282,
      "grad_norm": 0.1142922422404122,
      "learning_rate": 9.892493922370575e-06,
      "loss": 1.217,
      "step": 176
    },
    {
      "epoch": 1.141025641025641,
      "grad_norm": 0.17470470224878323,
      "learning_rate": 9.888077754245741e-06,
      "loss": 1.2099,
      "step": 178
    },
    {
      "epoch": 1.1538461538461537,
      "grad_norm": 0.10477882692325258,
      "learning_rate": 9.883573724181683e-06,
      "loss": 1.1944,
      "step": 180
    },
    {
      "epoch": 1.1666666666666667,
      "grad_norm": 0.114790034377695,
      "learning_rate": 9.878981913137178e-06,
      "loss": 1.172,
      "step": 182
    },
    {
      "epoch": 1.1794871794871795,
      "grad_norm": 0.1044922535107306,
      "learning_rate": 9.87430240364885e-06,
      "loss": 1.2147,
      "step": 184
    },
    {
      "epoch": 1.1923076923076923,
      "grad_norm": 0.09771283060341285,
      "learning_rate": 9.869535279829674e-06,
      "loss": 1.173,
      "step": 186
    },
    {
      "epoch": 1.205128205128205,
      "grad_norm": 0.1013995999635824,
      "learning_rate": 9.864680627367476e-06,
      "loss": 1.2023,
      "step": 188
    },
    {
      "epoch": 1.217948717948718,
      "grad_norm": 0.10273326452887067,
      "learning_rate": 9.859738533523384e-06,
      "loss": 1.1732,
      "step": 190
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 0.09684048616936082,
      "learning_rate": 9.854709087130261e-06,
      "loss": 1.1952,
      "step": 192
    },
    {
      "epoch": 1.2435897435897436,
      "grad_norm": 0.10827760658070901,
      "learning_rate": 9.849592378591113e-06,
      "loss": 1.1864,
      "step": 194
    },
    {
      "epoch": 1.2564102564102564,
      "grad_norm": 0.09989527940011267,
      "learning_rate": 9.844388499877457e-06,
      "loss": 1.2016,
      "step": 196
    },
    {
      "epoch": 1.2692307692307692,
      "grad_norm": 0.09930771667309381,
      "learning_rate": 9.839097544527674e-06,
      "loss": 1.1738,
      "step": 198
    },
    {
      "epoch": 1.282051282051282,
      "grad_norm": 0.1032001919164007,
      "learning_rate": 9.833719607645325e-06,
      "loss": 1.176,
      "step": 200
    },
    {
      "epoch": 1.294871794871795,
      "grad_norm": 0.09859412157061716,
      "learning_rate": 9.82825478589744e-06,
      "loss": 1.1682,
      "step": 202
    },
    {
      "epoch": 1.3076923076923077,
      "grad_norm": 0.09558235334437347,
      "learning_rate": 9.822703177512783e-06,
      "loss": 1.181,
      "step": 204
    },
    {
      "epoch": 1.3205128205128205,
      "grad_norm": 0.08733478657745303,
      "learning_rate": 9.817064882280085e-06,
      "loss": 1.1686,
      "step": 206
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.09397505343456257,
      "learning_rate": 9.811340001546252e-06,
      "loss": 1.1778,
      "step": 208
    },
    {
      "epoch": 1.3461538461538463,
      "grad_norm": 0.09590407825516856,
      "learning_rate": 9.805528638214543e-06,
      "loss": 1.1542,
      "step": 210
    },
    {
      "epoch": 1.358974358974359,
      "grad_norm": 0.0912508440064145,
      "learning_rate": 9.799630896742716e-06,
      "loss": 1.1643,
      "step": 212
    },
    {
      "epoch": 1.3717948717948718,
      "grad_norm": 0.09258955107744923,
      "learning_rate": 9.793646883141155e-06,
      "loss": 1.1686,
      "step": 214
    },
    {
      "epoch": 1.3846153846153846,
      "grad_norm": 0.09889457149777804,
      "learning_rate": 9.787576704970965e-06,
      "loss": 1.1677,
      "step": 216
    },
    {
      "epoch": 1.3974358974358974,
      "grad_norm": 0.09374670756166416,
      "learning_rate": 9.781420471342035e-06,
      "loss": 1.146,
      "step": 218
    },
    {
      "epoch": 1.4102564102564101,
      "grad_norm": 0.09136677460744856,
      "learning_rate": 9.77517829291108e-06,
      "loss": 1.1594,
      "step": 220
    },
    {
      "epoch": 1.4230769230769231,
      "grad_norm": 0.10584946030378292,
      "learning_rate": 9.768850281879651e-06,
      "loss": 1.1865,
      "step": 222
    },
    {
      "epoch": 1.435897435897436,
      "grad_norm": 0.09187981607301214,
      "learning_rate": 9.762436551992117e-06,
      "loss": 1.1606,
      "step": 224
    },
    {
      "epoch": 1.4487179487179487,
      "grad_norm": 0.09880449655805854,
      "learning_rate": 9.755937218533622e-06,
      "loss": 1.1586,
      "step": 226
    },
    {
      "epoch": 1.4615384615384617,
      "grad_norm": 0.08704607108972029,
      "learning_rate": 9.74935239832801e-06,
      "loss": 1.1746,
      "step": 228
    },
    {
      "epoch": 1.4743589743589745,
      "grad_norm": 0.08909112778091671,
      "learning_rate": 9.742682209735727e-06,
      "loss": 1.1575,
      "step": 230
    },
    {
      "epoch": 1.4871794871794872,
      "grad_norm": 0.09035998053799675,
      "learning_rate": 9.735926772651703e-06,
      "loss": 1.1678,
      "step": 232
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.09500864788295198,
      "learning_rate": 9.729086208503174e-06,
      "loss": 1.1466,
      "step": 234
    },
    {
      "epoch": 1.5128205128205128,
      "grad_norm": 0.09247434213683463,
      "learning_rate": 9.722160640247523e-06,
      "loss": 1.1687,
      "step": 236
    },
    {
      "epoch": 1.5256410256410255,
      "grad_norm": 0.09322212100100113,
      "learning_rate": 9.715150192370054e-06,
      "loss": 1.1376,
      "step": 238
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.08824919508271642,
      "learning_rate": 9.708054990881763e-06,
      "loss": 1.1523,
      "step": 240
    },
    {
      "epoch": 1.5512820512820513,
      "grad_norm": 0.25559730635424294,
      "learning_rate": 9.700875163317072e-06,
      "loss": 1.1488,
      "step": 242
    },
    {
      "epoch": 1.564102564102564,
      "grad_norm": 0.2487505162861363,
      "learning_rate": 9.693610838731532e-06,
      "loss": 1.1481,
      "step": 244
    },
    {
      "epoch": 1.5769230769230769,
      "grad_norm": 0.12151469789600829,
      "learning_rate": 9.686262147699507e-06,
      "loss": 1.1483,
      "step": 246
    },
    {
      "epoch": 1.5897435897435899,
      "grad_norm": 0.10407519891252137,
      "learning_rate": 9.678829222311827e-06,
      "loss": 1.13,
      "step": 248
    },
    {
      "epoch": 1.6025641025641026,
      "grad_norm": 0.11236395690738615,
      "learning_rate": 9.671312196173413e-06,
      "loss": 1.1493,
      "step": 250
    },
    {
      "epoch": 1.6153846153846154,
      "grad_norm": 0.1012523372817843,
      "learning_rate": 9.663711204400872e-06,
      "loss": 1.148,
      "step": 252
    },
    {
      "epoch": 1.6282051282051282,
      "grad_norm": 0.09652583778417714,
      "learning_rate": 9.656026383620076e-06,
      "loss": 1.1074,
      "step": 254
    },
    {
      "epoch": 1.641025641025641,
      "grad_norm": 0.09448533541138639,
      "learning_rate": 9.6482578719637e-06,
      "loss": 1.1486,
      "step": 256
    },
    {
      "epoch": 1.6538461538461537,
      "grad_norm": 0.09453430664055591,
      "learning_rate": 9.640405809068743e-06,
      "loss": 1.1197,
      "step": 258
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 0.0952812616531032,
      "learning_rate": 9.632470336074009e-06,
      "loss": 1.1337,
      "step": 260
    },
    {
      "epoch": 1.6794871794871795,
      "grad_norm": 0.09048018082770859,
      "learning_rate": 9.624451595617588e-06,
      "loss": 1.0885,
      "step": 262
    },
    {
      "epoch": 1.6923076923076923,
      "grad_norm": 0.0922717302732401,
      "learning_rate": 9.616349731834271e-06,
      "loss": 1.1294,
      "step": 264
    },
    {
      "epoch": 1.7051282051282053,
      "grad_norm": 0.09113342238000427,
      "learning_rate": 9.608164890352977e-06,
      "loss": 1.0871,
      "step": 266
    },
    {
      "epoch": 1.717948717948718,
      "grad_norm": 0.10188653395954697,
      "learning_rate": 9.599897218294122e-06,
      "loss": 1.1237,
      "step": 268
    },
    {
      "epoch": 1.7307692307692308,
      "grad_norm": 0.08946291041522332,
      "learning_rate": 9.591546864266983e-06,
      "loss": 1.1129,
      "step": 270
    },
    {
      "epoch": 1.7435897435897436,
      "grad_norm": 0.092702242157672,
      "learning_rate": 9.583113978367026e-06,
      "loss": 1.1089,
      "step": 272
    },
    {
      "epoch": 1.7564102564102564,
      "grad_norm": 0.1140491779513373,
      "learning_rate": 9.574598712173202e-06,
      "loss": 1.1286,
      "step": 274
    },
    {
      "epoch": 1.7692307692307692,
      "grad_norm": 0.09516237353719291,
      "learning_rate": 9.56600121874523e-06,
      "loss": 1.1122,
      "step": 276
    },
    {
      "epoch": 1.782051282051282,
      "grad_norm": 0.08916708413619781,
      "learning_rate": 9.557321652620839e-06,
      "loss": 1.1048,
      "step": 278
    },
    {
      "epoch": 1.7948717948717947,
      "grad_norm": 0.09140805156925046,
      "learning_rate": 9.548560169812997e-06,
      "loss": 1.1058,
      "step": 280
    },
    {
      "epoch": 1.8076923076923077,
      "grad_norm": 0.08683635001330178,
      "learning_rate": 9.539716927807102e-06,
      "loss": 1.0925,
      "step": 282
    },
    {
      "epoch": 1.8205128205128205,
      "grad_norm": 0.09284148179598711,
      "learning_rate": 9.530792085558151e-06,
      "loss": 1.0948,
      "step": 284
    },
    {
      "epoch": 1.8333333333333335,
      "grad_norm": 0.08800610945553744,
      "learning_rate": 9.521785803487888e-06,
      "loss": 1.1116,
      "step": 286
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 0.08758546749473674,
      "learning_rate": 9.512698243481914e-06,
      "loss": 1.1059,
      "step": 288
    },
    {
      "epoch": 1.858974358974359,
      "grad_norm": 0.08336608124209365,
      "learning_rate": 9.50352956888678e-06,
      "loss": 1.1015,
      "step": 290
    },
    {
      "epoch": 1.8717948717948718,
      "grad_norm": 0.09199580396288136,
      "learning_rate": 9.49427994450705e-06,
      "loss": 1.0828,
      "step": 292
    },
    {
      "epoch": 1.8846153846153846,
      "grad_norm": 0.5410940704298627,
      "learning_rate": 9.484949536602343e-06,
      "loss": 1.1412,
      "step": 294
    },
    {
      "epoch": 1.8974358974358974,
      "grad_norm": 0.08913430120295451,
      "learning_rate": 9.47553851288434e-06,
      "loss": 1.1073,
      "step": 296
    },
    {
      "epoch": 1.9102564102564101,
      "grad_norm": 0.09420167495815907,
      "learning_rate": 9.466047042513767e-06,
      "loss": 1.0957,
      "step": 298
    },
    {
      "epoch": 1.9230769230769231,
      "grad_norm": 0.08189970955203785,
      "learning_rate": 9.45647529609736e-06,
      "loss": 1.0909,
      "step": 300
    },
    {
      "epoch": 1.935897435897436,
      "grad_norm": 0.09065809775757692,
      "learning_rate": 9.4468234456848e-06,
      "loss": 1.0896,
      "step": 302
    },
    {
      "epoch": 1.9487179487179487,
      "grad_norm": 0.08763498764491487,
      "learning_rate": 9.437091664765611e-06,
      "loss": 1.1099,
      "step": 304
    },
    {
      "epoch": 1.9615384615384617,
      "grad_norm": 0.09257403574026254,
      "learning_rate": 9.427280128266049e-06,
      "loss": 1.1236,
      "step": 306
    },
    {
      "epoch": 1.9743589743589745,
      "grad_norm": 0.08983923370086075,
      "learning_rate": 9.41738901254596e-06,
      "loss": 1.0909,
      "step": 308
    },
    {
      "epoch": 1.9871794871794872,
      "grad_norm": 0.086289850522152,
      "learning_rate": 9.4074184953956e-06,
      "loss": 1.0942,
      "step": 310
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.0874296283040965,
      "learning_rate": 9.397368756032445e-06,
      "loss": 1.0651,
      "step": 312
    }
  ],
  "logging_steps": 2,
  "max_steps": 1560,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.263134196533035e+19,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}