|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 12.04, |
|
"eval_steps": 500000000, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0005, |
|
"loss": 3.4406, |
|
"loss/crossentropy": 3.14038622379303, |
|
"loss/logits": 0.30017508566379547, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.001, |
|
"loss": 3.4222, |
|
"loss/crossentropy": 3.1114853620529175, |
|
"loss/logits": 0.31074509024620056, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0015, |
|
"loss": 3.4379, |
|
"loss/crossentropy": 3.1400939226150513, |
|
"loss/logits": 0.29777538776397705, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.002, |
|
"loss": 3.3201, |
|
"loss/crossentropy": 3.0177063941955566, |
|
"loss/logits": 0.30243340134620667, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0025, |
|
"loss": 3.3921, |
|
"loss/crossentropy": 3.0849010944366455, |
|
"loss/logits": 0.307217076420784, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.003, |
|
"loss": 3.4715, |
|
"loss/crossentropy": 3.13341748714447, |
|
"loss/logits": 0.33807775378227234, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0034999999999999996, |
|
"loss": 3.4079, |
|
"loss/crossentropy": 3.0802475214004517, |
|
"loss/logits": 0.3276752084493637, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.004, |
|
"loss": 3.2953, |
|
"loss/crossentropy": 2.987769603729248, |
|
"loss/logits": 0.3075404167175293, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0045000000000000005, |
|
"loss": 3.2869, |
|
"loss/crossentropy": 2.9777328968048096, |
|
"loss/logits": 0.3091176152229309, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.005, |
|
"loss": 3.2259, |
|
"loss/crossentropy": 2.9359546899795532, |
|
"loss/logits": 0.28997449576854706, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.0055000000000000005, |
|
"loss": 3.3682, |
|
"loss/crossentropy": 3.0672762393951416, |
|
"loss/logits": 0.3009057492017746, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.006, |
|
"loss": 3.2457, |
|
"loss/crossentropy": 2.9338029623031616, |
|
"loss/logits": 0.31185680627822876, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.006500000000000001, |
|
"loss": 3.5126, |
|
"loss/crossentropy": 3.1620391607284546, |
|
"loss/logits": 0.3505771607160568, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.006999999999999999, |
|
"loss": 3.3082, |
|
"loss/crossentropy": 2.9693878889083862, |
|
"loss/logits": 0.3388153314590454, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.7109375, |
|
"grad_norm_var": 0.02401172320048014, |
|
"learning_rate": 0.0075, |
|
"loss": 3.1669, |
|
"loss/crossentropy": 2.8395652770996094, |
|
"loss/logits": 0.3273603916168213, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.62109375, |
|
"grad_norm_var": 0.02595494588216146, |
|
"learning_rate": 0.008, |
|
"loss": 3.435, |
|
"loss/crossentropy": 3.0843794345855713, |
|
"loss/logits": 0.35064929723739624, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.49609375, |
|
"grad_norm_var": 0.02552026112874349, |
|
"learning_rate": 0.0085, |
|
"loss": 3.3402, |
|
"loss/crossentropy": 3.018695592880249, |
|
"loss/logits": 0.32146573066711426, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.77734375, |
|
"grad_norm_var": 0.029073317845662434, |
|
"learning_rate": 0.009000000000000001, |
|
"loss": 3.3812, |
|
"loss/crossentropy": 2.9771525859832764, |
|
"loss/logits": 0.4040682762861252, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.671875, |
|
"grad_norm_var": 0.028688796361287437, |
|
"learning_rate": 0.0095, |
|
"loss": 3.226, |
|
"loss/crossentropy": 2.9090826511383057, |
|
"loss/logits": 0.31695474684238434, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.63671875, |
|
"grad_norm_var": 0.027660115559895834, |
|
"learning_rate": 0.01, |
|
"loss": 2.9946, |
|
"loss/crossentropy": 2.6920872926712036, |
|
"loss/logits": 0.30253250896930695, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.6171875, |
|
"grad_norm_var": 0.024438222249348957, |
|
"learning_rate": 0.009996145181203616, |
|
"loss": 3.2839, |
|
"loss/crossentropy": 2.9436020851135254, |
|
"loss/logits": 0.3402951508760452, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.515625, |
|
"grad_norm_var": 0.019991048177083335, |
|
"learning_rate": 0.00998458666866564, |
|
"loss": 3.2061, |
|
"loss/crossentropy": 2.8926981687545776, |
|
"loss/logits": 0.313431978225708, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.40234375, |
|
"grad_norm_var": 0.016751543680826823, |
|
"learning_rate": 0.009965342284774633, |
|
"loss": 3.1526, |
|
"loss/crossentropy": 2.831883192062378, |
|
"loss/logits": 0.3207142651081085, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.373046875, |
|
"grad_norm_var": 0.015084314346313476, |
|
"learning_rate": 0.009938441702975689, |
|
"loss": 3.0691, |
|
"loss/crossentropy": 2.7710957527160645, |
|
"loss/logits": 0.2979845702648163, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.3671875, |
|
"grad_norm_var": 0.01682891845703125, |
|
"learning_rate": 0.009903926402016152, |
|
"loss": 3.0935, |
|
"loss/crossentropy": 2.800452947616577, |
|
"loss/logits": 0.2930735796689987, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.328125, |
|
"grad_norm_var": 0.019060516357421876, |
|
"learning_rate": 0.009861849601988383, |
|
"loss": 2.972, |
|
"loss/crossentropy": 2.6579891443252563, |
|
"loss/logits": 0.31396760046482086, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.31640625, |
|
"grad_norm_var": 0.021935526529947916, |
|
"learning_rate": 0.009812276182268235, |
|
"loss": 3.0729, |
|
"loss/crossentropy": 2.7841025590896606, |
|
"loss/logits": 0.2887759953737259, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.31640625, |
|
"grad_norm_var": 0.02453505198160807, |
|
"learning_rate": 0.009755282581475769, |
|
"loss": 2.9013, |
|
"loss/crossentropy": 2.6322743892669678, |
|
"loss/logits": 0.2690378725528717, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.349609375, |
|
"grad_norm_var": 0.022967767715454102, |
|
"learning_rate": 0.00969095667961242, |
|
"loss": 2.9966, |
|
"loss/crossentropy": 2.718039870262146, |
|
"loss/logits": 0.2785477787256241, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.337890625, |
|
"grad_norm_var": 0.022896321614583333, |
|
"learning_rate": 0.009619397662556433, |
|
"loss": 2.9799, |
|
"loss/crossentropy": 2.687403917312622, |
|
"loss/logits": 0.2925274521112442, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.294921875, |
|
"grad_norm_var": 0.02292021115620931, |
|
"learning_rate": 0.009540715869125407, |
|
"loss": 2.8545, |
|
"loss/crossentropy": 2.56766939163208, |
|
"loss/logits": 0.2868521362543106, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.232421875, |
|
"grad_norm_var": 0.02558739980061849, |
|
"learning_rate": 0.00945503262094184, |
|
"loss": 3.0801, |
|
"loss/crossentropy": 2.790625810623169, |
|
"loss/logits": 0.28946465253829956, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.2197265625, |
|
"grad_norm_var": 0.01530239979426066, |
|
"learning_rate": 0.009362480035363985, |
|
"loss": 2.8576, |
|
"loss/crossentropy": 2.584877848625183, |
|
"loss/logits": 0.2727508991956711, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 0.2431640625, |
|
"grad_norm_var": 0.010964266459147136, |
|
"learning_rate": 0.009263200821770462, |
|
"loss": 2.8391, |
|
"loss/crossentropy": 2.565540909767151, |
|
"loss/logits": 0.2735111564397812, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.205078125, |
|
"grad_norm_var": 0.006621154149373373, |
|
"learning_rate": 0.009157348061512726, |
|
"loss": 2.8551, |
|
"loss/crossentropy": 2.577111601829529, |
|
"loss/logits": 0.27798011898994446, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.208984375, |
|
"grad_norm_var": 0.004467582702636719, |
|
"learning_rate": 0.009045084971874737, |
|
"loss": 2.8021, |
|
"loss/crossentropy": 2.5365407466888428, |
|
"loss/logits": 0.26556773483753204, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 0.2392578125, |
|
"grad_norm_var": 0.00390551487604777, |
|
"learning_rate": 0.008926584654403725, |
|
"loss": 3.0098, |
|
"loss/crossentropy": 2.7032312154769897, |
|
"loss/logits": 0.30657848715782166, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 0.19921875, |
|
"grad_norm_var": 0.0038659056027730305, |
|
"learning_rate": 0.008802029828000156, |
|
"loss": 2.6913, |
|
"loss/crossentropy": 2.424975633621216, |
|
"loss/logits": 0.2662935256958008, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.2197265625, |
|
"grad_norm_var": 0.0034123579661051433, |
|
"learning_rate": 0.008671612547178428, |
|
"loss": 2.8371, |
|
"loss/crossentropy": 2.559100031852722, |
|
"loss/logits": 0.27803806960582733, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.23046875, |
|
"grad_norm_var": 0.0027981917063395183, |
|
"learning_rate": 0.008535533905932738, |
|
"loss": 2.7885, |
|
"loss/crossentropy": 2.5171408653259277, |
|
"loss/logits": 0.271391361951828, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"grad_norm": 0.166015625, |
|
"grad_norm_var": 0.002695910135904948, |
|
"learning_rate": 0.00839400372766471, |
|
"loss": 2.6022, |
|
"loss/crossentropy": 2.342413067817688, |
|
"loss/logits": 0.259748637676239, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"grad_norm": 0.16796875, |
|
"grad_norm_var": 0.002666918436686198, |
|
"learning_rate": 0.008247240241650917, |
|
"loss": 2.6642, |
|
"loss/crossentropy": 2.3876761198043823, |
|
"loss/logits": 0.27653323113918304, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 5.03, |
|
"grad_norm": 0.1689453125, |
|
"grad_norm_var": 0.00197222630182902, |
|
"learning_rate": 0.00809546974654917, |
|
"loss": 2.7753, |
|
"loss/crossentropy": 2.5002858638763428, |
|
"loss/logits": 0.2750103175640106, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"grad_norm": 0.201171875, |
|
"grad_norm_var": 0.001078792413075765, |
|
"learning_rate": 0.007938926261462366, |
|
"loss": 2.9022, |
|
"loss/crossentropy": 2.592116355895996, |
|
"loss/logits": 0.31007957458496094, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 5.05, |
|
"grad_norm": 0.185546875, |
|
"grad_norm_var": 0.0006791392962137859, |
|
"learning_rate": 0.007777851165098011, |
|
"loss": 2.6588, |
|
"loss/crossentropy": 2.39296555519104, |
|
"loss/logits": 0.2658107429742813, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 5.06, |
|
"grad_norm": 0.158203125, |
|
"grad_norm_var": 0.0007957100868225097, |
|
"learning_rate": 0.0076124928235797445, |
|
"loss": 2.6616, |
|
"loss/crossentropy": 2.383183717727661, |
|
"loss/logits": 0.2784254252910614, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 5.07, |
|
"grad_norm": 0.181640625, |
|
"grad_norm_var": 0.0007696747779846191, |
|
"learning_rate": 0.007443106207484776, |
|
"loss": 2.696, |
|
"loss/crossentropy": 2.416291117668152, |
|
"loss/logits": 0.2797371596097946, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 5.08, |
|
"grad_norm": 0.181640625, |
|
"grad_norm_var": 0.000768280029296875, |
|
"learning_rate": 0.007269952498697734, |
|
"loss": 2.733, |
|
"loss/crossentropy": 2.4512546062469482, |
|
"loss/logits": 0.28178632259368896, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 0.181640625, |
|
"grad_norm_var": 0.0006483674049377441, |
|
"learning_rate": 0.007093298687687141, |
|
"loss": 2.7384, |
|
"loss/crossentropy": 2.44217312335968, |
|
"loss/logits": 0.2962687909603119, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 0.1552734375, |
|
"grad_norm_var": 0.0007207075754801433, |
|
"learning_rate": 0.00691341716182545, |
|
"loss": 2.5692, |
|
"loss/crossentropy": 2.2908589839935303, |
|
"loss/logits": 0.2783200442790985, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 6.03, |
|
"grad_norm": 0.1494140625, |
|
"grad_norm_var": 0.0006417433420817057, |
|
"learning_rate": 0.006730585285387465, |
|
"loss": 2.6463, |
|
"loss/crossentropy": 2.3550511598587036, |
|
"loss/logits": 0.29120513796806335, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 6.04, |
|
"grad_norm": 0.1552734375, |
|
"grad_norm_var": 0.0006787578264872233, |
|
"learning_rate": 0.006545084971874737, |
|
"loss": 2.7074, |
|
"loss/crossentropy": 2.42005717754364, |
|
"loss/logits": 0.2873295992612839, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 6.05, |
|
"grad_norm": 0.14453125, |
|
"grad_norm_var": 0.0006558100382486979, |
|
"learning_rate": 0.006357202249325371, |
|
"loss": 2.5666, |
|
"loss/crossentropy": 2.284132957458496, |
|
"loss/logits": 0.2824671268463135, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 6.06, |
|
"grad_norm": 0.15234375, |
|
"grad_norm_var": 0.00048542022705078125, |
|
"learning_rate": 0.0061672268192795275, |
|
"loss": 2.644, |
|
"loss/crossentropy": 2.3572909832000732, |
|
"loss/logits": 0.286677747964859, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 6.07, |
|
"grad_norm": 0.1533203125, |
|
"grad_norm_var": 0.0002622246742248535, |
|
"learning_rate": 0.005975451610080642, |
|
"loss": 2.5268, |
|
"loss/crossentropy": 2.2449907064437866, |
|
"loss/logits": 0.28181955218315125, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 6.08, |
|
"grad_norm": 0.1298828125, |
|
"grad_norm_var": 0.00035235087076822914, |
|
"learning_rate": 0.0057821723252011546, |
|
"loss": 2.503, |
|
"loss/crossentropy": 2.2291653156280518, |
|
"loss/logits": 0.27385060489177704, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 0.1337890625, |
|
"grad_norm_var": 0.00048067967096964516, |
|
"learning_rate": 0.0055876869872891885, |
|
"loss": 2.5122, |
|
"loss/crossentropy": 2.2260255813598633, |
|
"loss/logits": 0.2862100601196289, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"grad_norm": 0.1318359375, |
|
"grad_norm_var": 0.0004093011220296224, |
|
"learning_rate": 0.0053922954786392256, |
|
"loss": 2.5321, |
|
"loss/crossentropy": 2.243638038635254, |
|
"loss/logits": 0.2884128838777542, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 7.03, |
|
"grad_norm": 0.130859375, |
|
"grad_norm_var": 0.0003852685292561849, |
|
"learning_rate": 0.005196299078795343, |
|
"loss": 2.4881, |
|
"loss/crossentropy": 2.1921509504318237, |
|
"loss/logits": 0.2959805279970169, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 7.04, |
|
"grad_norm": 0.1357421875, |
|
"grad_norm_var": 0.0004018108050028483, |
|
"learning_rate": 0.005, |
|
"loss": 2.6882, |
|
"loss/crossentropy": 2.393338203430176, |
|
"loss/logits": 0.2948644757270813, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 7.05, |
|
"grad_norm": 0.138671875, |
|
"grad_norm_var": 0.0003462115923563639, |
|
"learning_rate": 0.004803700921204659, |
|
"loss": 2.6864, |
|
"loss/crossentropy": 2.387941598892212, |
|
"loss/logits": 0.29849502444267273, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 7.06, |
|
"grad_norm": 0.1279296875, |
|
"grad_norm_var": 0.00029354095458984376, |
|
"learning_rate": 0.004607704521360776, |
|
"loss": 2.446, |
|
"loss/crossentropy": 2.1593087911605835, |
|
"loss/logits": 0.28673939406871796, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 7.07, |
|
"grad_norm": 0.1298828125, |
|
"grad_norm_var": 0.00021330118179321289, |
|
"learning_rate": 0.004412313012710813, |
|
"loss": 2.5145, |
|
"loss/crossentropy": 2.2218284606933594, |
|
"loss/logits": 0.29268868267536163, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 7.08, |
|
"grad_norm": 0.1240234375, |
|
"grad_norm_var": 0.00012022654215494791, |
|
"learning_rate": 0.004217827674798845, |
|
"loss": 2.4267, |
|
"loss/crossentropy": 2.1349563598632812, |
|
"loss/logits": 0.29178011417388916, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.12451171875, |
|
"grad_norm_var": 0.00011052191257476807, |
|
"learning_rate": 0.004024548389919359, |
|
"loss": 2.5903, |
|
"loss/crossentropy": 2.2803404331207275, |
|
"loss/logits": 0.30999650061130524, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 8.02, |
|
"grad_norm": 0.11376953125, |
|
"grad_norm_var": 0.00010824203491210938, |
|
"learning_rate": 0.003832773180720475, |
|
"loss": 2.4998, |
|
"loss/crossentropy": 2.2094690799713135, |
|
"loss/logits": 0.2903156131505966, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 8.03, |
|
"grad_norm": 0.11865234375, |
|
"grad_norm_var": 0.00010966360569000244, |
|
"learning_rate": 0.003642797750674629, |
|
"loss": 2.4863, |
|
"loss/crossentropy": 2.1871025562286377, |
|
"loss/logits": 0.29923413693904877, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 8.04, |
|
"grad_norm": 0.11572265625, |
|
"grad_norm_var": 9.021759033203124e-05, |
|
"learning_rate": 0.003454915028125263, |
|
"loss": 2.3923, |
|
"loss/crossentropy": 2.0989878177642822, |
|
"loss/logits": 0.2932642847299576, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 8.05, |
|
"grad_norm": 0.1181640625, |
|
"grad_norm_var": 5.30242919921875e-05, |
|
"learning_rate": 0.003269414714612534, |
|
"loss": 2.563, |
|
"loss/crossentropy": 2.2577706575393677, |
|
"loss/logits": 0.30524544417858124, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 8.06, |
|
"grad_norm": 0.11083984375, |
|
"grad_norm_var": 6.763041019439698e-05, |
|
"learning_rate": 0.0030865828381745515, |
|
"loss": 2.4157, |
|
"loss/crossentropy": 2.1293630599975586, |
|
"loss/logits": 0.2863532304763794, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"grad_norm": 0.1181640625, |
|
"grad_norm_var": 6.939470767974854e-05, |
|
"learning_rate": 0.002906701312312861, |
|
"loss": 2.4613, |
|
"loss/crossentropy": 2.155823588371277, |
|
"loss/logits": 0.3054400086402893, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 8.08, |
|
"grad_norm": 0.1142578125, |
|
"grad_norm_var": 6.979207197825114e-05, |
|
"learning_rate": 0.0027300475013022664, |
|
"loss": 2.4934, |
|
"loss/crossentropy": 2.1984496116638184, |
|
"loss/logits": 0.29494835436344147, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"grad_norm": 0.10546875, |
|
"grad_norm_var": 8.160173892974853e-05, |
|
"learning_rate": 0.002556893792515227, |
|
"loss": 2.436, |
|
"loss/crossentropy": 2.138706684112549, |
|
"loss/logits": 0.29732728004455566, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 9.02, |
|
"grad_norm": 0.1103515625, |
|
"grad_norm_var": 7.158021132151286e-05, |
|
"learning_rate": 0.002387507176420256, |
|
"loss": 2.5448, |
|
"loss/crossentropy": 2.2357265949249268, |
|
"loss/logits": 0.3090888559818268, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 9.03, |
|
"grad_norm": 0.10888671875, |
|
"grad_norm_var": 5.0067901611328125e-05, |
|
"learning_rate": 0.00222214883490199, |
|
"loss": 2.4924, |
|
"loss/crossentropy": 2.195006251335144, |
|
"loss/logits": 0.29741397500038147, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"grad_norm": 0.1083984375, |
|
"grad_norm_var": 4.657109578450521e-05, |
|
"learning_rate": 0.0020610737385376348, |
|
"loss": 2.3857, |
|
"loss/crossentropy": 2.0854870080947876, |
|
"loss/logits": 0.3002435863018036, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 9.05, |
|
"grad_norm": 0.11279296875, |
|
"grad_norm_var": 3.367165724436442e-05, |
|
"learning_rate": 0.0019045302534508297, |
|
"loss": 2.3901, |
|
"loss/crossentropy": 2.0945165157318115, |
|
"loss/logits": 0.2955891638994217, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 9.06, |
|
"grad_norm": 0.1083984375, |
|
"grad_norm_var": 3.042916456858317e-05, |
|
"learning_rate": 0.0017527597583490823, |
|
"loss": 2.5162, |
|
"loss/crossentropy": 2.212076425552368, |
|
"loss/logits": 0.3041383922100067, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 9.07, |
|
"grad_norm": 0.1083984375, |
|
"grad_norm_var": 2.5152166684468588e-05, |
|
"learning_rate": 0.0016059962723352912, |
|
"loss": 2.5047, |
|
"loss/crossentropy": 2.201292634010315, |
|
"loss/logits": 0.30345703661441803, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 9.08, |
|
"grad_norm": 0.107421875, |
|
"grad_norm_var": 1.760721206665039e-05, |
|
"learning_rate": 0.0014644660940672626, |
|
"loss": 2.4349, |
|
"loss/crossentropy": 2.1304625272750854, |
|
"loss/logits": 0.30440980195999146, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 10.01, |
|
"grad_norm": 0.1015625, |
|
"grad_norm_var": 2.1199385325113933e-05, |
|
"learning_rate": 0.0013283874528215733, |
|
"loss": 2.5581, |
|
"loss/crossentropy": 2.2433000802993774, |
|
"loss/logits": 0.31478117406368256, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 10.02, |
|
"grad_norm": 0.10107421875, |
|
"grad_norm_var": 2.471605936686198e-05, |
|
"learning_rate": 0.0011979701719998454, |
|
"loss": 2.4003, |
|
"loss/crossentropy": 2.098269462585449, |
|
"loss/logits": 0.30198484659194946, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 10.03, |
|
"grad_norm": 0.1005859375, |
|
"grad_norm_var": 2.4286905924479165e-05, |
|
"learning_rate": 0.0010734153455962765, |
|
"loss": 2.5084, |
|
"loss/crossentropy": 2.2013310194015503, |
|
"loss/logits": 0.30710338056087494, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 10.04, |
|
"grad_norm": 0.10205078125, |
|
"grad_norm_var": 2.653996149698893e-05, |
|
"learning_rate": 0.0009549150281252633, |
|
"loss": 2.5173, |
|
"loss/crossentropy": 2.213552951812744, |
|
"loss/logits": 0.3037945479154587, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 10.05, |
|
"grad_norm": 0.0986328125, |
|
"grad_norm_var": 2.4155775705973306e-05, |
|
"learning_rate": 0.0008426519384872733, |
|
"loss": 2.3663, |
|
"loss/crossentropy": 2.0724344849586487, |
|
"loss/logits": 0.2938496023416519, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 10.06, |
|
"grad_norm": 0.10205078125, |
|
"grad_norm_var": 2.1448731422424318e-05, |
|
"learning_rate": 0.0007367991782295391, |
|
"loss": 2.3785, |
|
"loss/crossentropy": 2.0789949893951416, |
|
"loss/logits": 0.2995530366897583, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 10.07, |
|
"grad_norm": 0.10009765625, |
|
"grad_norm_var": 1.8596649169921875e-05, |
|
"learning_rate": 0.0006375199646360141, |
|
"loss": 2.4332, |
|
"loss/crossentropy": 2.1326472759246826, |
|
"loss/logits": 0.30053600668907166, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 10.08, |
|
"grad_norm": 0.10009765625, |
|
"grad_norm_var": 2.0224849383036295e-05, |
|
"learning_rate": 0.0005449673790581611, |
|
"loss": 2.494, |
|
"loss/crossentropy": 2.1958537101745605, |
|
"loss/logits": 0.2981116622686386, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 11.01, |
|
"grad_norm": 0.0966796875, |
|
"grad_norm_var": 2.1091103553771974e-05, |
|
"learning_rate": 0.0004592841308745932, |
|
"loss": 2.4855, |
|
"loss/crossentropy": 2.183037042617798, |
|
"loss/logits": 0.3025069534778595, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"grad_norm": 0.09814453125, |
|
"grad_norm_var": 2.0945072174072264e-05, |
|
"learning_rate": 0.0003806023374435663, |
|
"loss": 2.3937, |
|
"loss/crossentropy": 2.0915567874908447, |
|
"loss/logits": 0.3020932674407959, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 11.03, |
|
"grad_norm": 0.0966796875, |
|
"grad_norm_var": 1.5798211097717286e-05, |
|
"learning_rate": 0.00030904332038757975, |
|
"loss": 2.4012, |
|
"loss/crossentropy": 2.1043936014175415, |
|
"loss/logits": 0.29680541157722473, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 11.04, |
|
"grad_norm": 0.09716796875, |
|
"grad_norm_var": 1.3856093088785807e-05, |
|
"learning_rate": 0.00024471741852423234, |
|
"loss": 2.4577, |
|
"loss/crossentropy": 2.1379482746124268, |
|
"loss/logits": 0.3198002427816391, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 11.05, |
|
"grad_norm": 0.095703125, |
|
"grad_norm_var": 1.163482666015625e-05, |
|
"learning_rate": 0.00018772381773176416, |
|
"loss": 2.4786, |
|
"loss/crossentropy": 2.173209071159363, |
|
"loss/logits": 0.3054308444261551, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 11.06, |
|
"grad_norm": 0.0966796875, |
|
"grad_norm_var": 8.706251780192057e-06, |
|
"learning_rate": 0.0001381503980116172, |
|
"loss": 2.5107, |
|
"loss/crossentropy": 2.2105352878570557, |
|
"loss/logits": 0.30013974010944366, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 11.07, |
|
"grad_norm": 0.095703125, |
|
"grad_norm_var": 5.177656809488932e-06, |
|
"learning_rate": 9.607359798384785e-05, |
|
"loss": 2.4619, |
|
"loss/crossentropy": 2.167112946510315, |
|
"loss/logits": 0.29475128650665283, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 11.08, |
|
"grad_norm": 0.095703125, |
|
"grad_norm_var": 5.273024241129557e-06, |
|
"learning_rate": 6.15582970243117e-05, |
|
"loss": 2.4592, |
|
"loss/crossentropy": 2.160895347595215, |
|
"loss/logits": 0.298322930932045, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 12.01, |
|
"grad_norm": 0.09912109375, |
|
"grad_norm_var": 4.9591064453125e-06, |
|
"learning_rate": 3.465771522536854e-05, |
|
"loss": 2.5121, |
|
"loss/crossentropy": 2.1986340284347534, |
|
"loss/logits": 0.31342923641204834, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 12.02, |
|
"grad_norm": 0.0947265625, |
|
"grad_norm_var": 4.497170448303223e-06, |
|
"learning_rate": 1.541333133436018e-05, |
|
"loss": 2.4629, |
|
"loss/crossentropy": 2.156323194503784, |
|
"loss/logits": 0.30655351281166077, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 12.03, |
|
"grad_norm": 0.09228515625, |
|
"grad_norm_var": 6.214777628580729e-06, |
|
"learning_rate": 3.854818796385496e-06, |
|
"loss": 2.5164, |
|
"loss/crossentropy": 2.2150908708572388, |
|
"loss/logits": 0.30133919417858124, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 12.04, |
|
"grad_norm": 0.09228515625, |
|
"grad_norm_var": 5.976359049479167e-06, |
|
"learning_rate": 0.0, |
|
"loss": 2.378, |
|
"loss/crossentropy": 2.089114785194397, |
|
"loss/logits": 0.2889086455106735, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.16881259823104e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|