{ "best_metric": null, "best_model_checkpoint": null, "epoch": 12.04, "eval_steps": 500000000, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.271484375, "learning_rate": 0.0005, "loss": 3.4406, "loss/crossentropy": 3.14038622379303, "loss/logits": 0.30017508566379547, "step": 1 }, { "epoch": 0.02, "grad_norm": 0.384765625, "learning_rate": 0.001, "loss": 3.4222, "loss/crossentropy": 3.1114853620529175, "loss/logits": 0.31074509024620056, "step": 2 }, { "epoch": 0.03, "grad_norm": 0.220703125, "learning_rate": 0.0015, "loss": 3.4379, "loss/crossentropy": 3.1400939226150513, "loss/logits": 0.29777538776397705, "step": 3 }, { "epoch": 0.04, "grad_norm": 0.251953125, "learning_rate": 0.002, "loss": 3.3201, "loss/crossentropy": 3.0177063941955566, "loss/logits": 0.30243340134620667, "step": 4 }, { "epoch": 0.05, "grad_norm": 0.296875, "learning_rate": 0.0025, "loss": 3.3921, "loss/crossentropy": 3.0849010944366455, "loss/logits": 0.307217076420784, "step": 5 }, { "epoch": 0.06, "grad_norm": 0.337890625, "learning_rate": 0.003, "loss": 3.4715, "loss/crossentropy": 3.13341748714447, "loss/logits": 0.33807775378227234, "step": 6 }, { "epoch": 0.07, "grad_norm": 0.296875, "learning_rate": 0.0034999999999999996, "loss": 3.4079, "loss/crossentropy": 3.0802475214004517, "loss/logits": 0.3276752084493637, "step": 7 }, { "epoch": 0.08, "grad_norm": 0.2890625, "learning_rate": 0.004, "loss": 3.2953, "loss/crossentropy": 2.987769603729248, "loss/logits": 0.3075404167175293, "step": 8 }, { "epoch": 1.01, "grad_norm": 0.31640625, "learning_rate": 0.0045000000000000005, "loss": 3.2869, "loss/crossentropy": 2.9777328968048096, "loss/logits": 0.3091176152229309, "step": 9 }, { "epoch": 1.02, "grad_norm": 0.390625, "learning_rate": 0.005, "loss": 3.2259, "loss/crossentropy": 2.9359546899795532, "loss/logits": 0.28997449576854706, "step": 10 }, { "epoch": 1.03, "grad_norm": 0.42578125, "learning_rate": 0.0055000000000000005, "loss": 3.3682, "loss/crossentropy": 3.0672762393951416, "loss/logits": 0.3009057492017746, "step": 11 }, { "epoch": 1.04, "grad_norm": 0.6796875, "learning_rate": 0.006, "loss": 3.2457, "loss/crossentropy": 2.9338029623031616, "loss/logits": 0.31185680627822876, "step": 12 }, { "epoch": 1.05, "grad_norm": 0.58984375, "learning_rate": 0.006500000000000001, "loss": 3.5126, "loss/crossentropy": 3.1620391607284546, "loss/logits": 0.3505771607160568, "step": 13 }, { "epoch": 1.06, "grad_norm": 0.546875, "learning_rate": 0.006999999999999999, "loss": 3.3082, "loss/crossentropy": 2.9693878889083862, "loss/logits": 0.3388153314590454, "step": 14 }, { "epoch": 1.07, "grad_norm": 0.7109375, "grad_norm_var": 0.02401172320048014, "learning_rate": 0.0075, "loss": 3.1669, "loss/crossentropy": 2.8395652770996094, "loss/logits": 0.3273603916168213, "step": 15 }, { "epoch": 1.08, "grad_norm": 0.62109375, "grad_norm_var": 0.02595494588216146, "learning_rate": 0.008, "loss": 3.435, "loss/crossentropy": 3.0843794345855713, "loss/logits": 0.35064929723739624, "step": 16 }, { "epoch": 2.01, "grad_norm": 0.49609375, "grad_norm_var": 0.02552026112874349, "learning_rate": 0.0085, "loss": 3.3402, "loss/crossentropy": 3.018695592880249, "loss/logits": 0.32146573066711426, "step": 17 }, { "epoch": 2.02, "grad_norm": 0.77734375, "grad_norm_var": 0.029073317845662434, "learning_rate": 0.009000000000000001, "loss": 3.3812, "loss/crossentropy": 2.9771525859832764, "loss/logits": 0.4040682762861252, "step": 18 }, { "epoch": 2.03, "grad_norm": 0.671875, "grad_norm_var": 0.028688796361287437, "learning_rate": 0.0095, "loss": 3.226, "loss/crossentropy": 2.9090826511383057, "loss/logits": 0.31695474684238434, "step": 19 }, { "epoch": 2.04, "grad_norm": 0.63671875, "grad_norm_var": 0.027660115559895834, "learning_rate": 0.01, "loss": 2.9946, "loss/crossentropy": 2.6920872926712036, "loss/logits": 0.30253250896930695, "step": 20 }, { "epoch": 2.05, "grad_norm": 0.6171875, "grad_norm_var": 0.024438222249348957, "learning_rate": 0.009996145181203616, "loss": 3.2839, "loss/crossentropy": 2.9436020851135254, "loss/logits": 0.3402951508760452, "step": 21 }, { "epoch": 2.06, "grad_norm": 0.515625, "grad_norm_var": 0.019991048177083335, "learning_rate": 0.00998458666866564, "loss": 3.2061, "loss/crossentropy": 2.8926981687545776, "loss/logits": 0.313431978225708, "step": 22 }, { "epoch": 2.07, "grad_norm": 0.40234375, "grad_norm_var": 0.016751543680826823, "learning_rate": 0.009965342284774633, "loss": 3.1526, "loss/crossentropy": 2.831883192062378, "loss/logits": 0.3207142651081085, "step": 23 }, { "epoch": 2.08, "grad_norm": 0.373046875, "grad_norm_var": 0.015084314346313476, "learning_rate": 0.009938441702975689, "loss": 3.0691, "loss/crossentropy": 2.7710957527160645, "loss/logits": 0.2979845702648163, "step": 24 }, { "epoch": 3.01, "grad_norm": 0.3671875, "grad_norm_var": 0.01682891845703125, "learning_rate": 0.009903926402016152, "loss": 3.0935, "loss/crossentropy": 2.800452947616577, "loss/logits": 0.2930735796689987, "step": 25 }, { "epoch": 3.02, "grad_norm": 0.328125, "grad_norm_var": 0.019060516357421876, "learning_rate": 0.009861849601988383, "loss": 2.972, "loss/crossentropy": 2.6579891443252563, "loss/logits": 0.31396760046482086, "step": 26 }, { "epoch": 3.03, "grad_norm": 0.31640625, "grad_norm_var": 0.021935526529947916, "learning_rate": 0.009812276182268235, "loss": 3.0729, "loss/crossentropy": 2.7841025590896606, "loss/logits": 0.2887759953737259, "step": 27 }, { "epoch": 3.04, "grad_norm": 0.31640625, "grad_norm_var": 0.02453505198160807, "learning_rate": 0.009755282581475769, "loss": 2.9013, "loss/crossentropy": 2.6322743892669678, "loss/logits": 0.2690378725528717, "step": 28 }, { "epoch": 3.05, "grad_norm": 0.349609375, "grad_norm_var": 0.022967767715454102, "learning_rate": 0.00969095667961242, "loss": 2.9966, "loss/crossentropy": 2.718039870262146, "loss/logits": 0.2785477787256241, "step": 29 }, { "epoch": 3.06, "grad_norm": 0.337890625, "grad_norm_var": 0.022896321614583333, "learning_rate": 0.009619397662556433, "loss": 2.9799, "loss/crossentropy": 2.687403917312622, "loss/logits": 0.2925274521112442, "step": 30 }, { "epoch": 3.07, "grad_norm": 0.294921875, "grad_norm_var": 0.02292021115620931, "learning_rate": 0.009540715869125407, "loss": 2.8545, "loss/crossentropy": 2.56766939163208, "loss/logits": 0.2868521362543106, "step": 31 }, { "epoch": 3.08, "grad_norm": 0.232421875, "grad_norm_var": 0.02558739980061849, "learning_rate": 0.00945503262094184, "loss": 3.0801, "loss/crossentropy": 2.790625810623169, "loss/logits": 0.28946465253829956, "step": 32 }, { "epoch": 4.01, "grad_norm": 0.2197265625, "grad_norm_var": 0.01530239979426066, "learning_rate": 0.009362480035363985, "loss": 2.8576, "loss/crossentropy": 2.584877848625183, "loss/logits": 0.2727508991956711, "step": 33 }, { "epoch": 4.02, "grad_norm": 0.2431640625, "grad_norm_var": 0.010964266459147136, "learning_rate": 0.009263200821770462, "loss": 2.8391, "loss/crossentropy": 2.565540909767151, "loss/logits": 0.2735111564397812, "step": 34 }, { "epoch": 4.03, "grad_norm": 0.205078125, "grad_norm_var": 0.006621154149373373, "learning_rate": 0.009157348061512726, "loss": 2.8551, "loss/crossentropy": 2.577111601829529, "loss/logits": 0.27798011898994446, "step": 35 }, { "epoch": 4.04, "grad_norm": 0.208984375, "grad_norm_var": 0.004467582702636719, "learning_rate": 0.009045084971874737, "loss": 2.8021, "loss/crossentropy": 2.5365407466888428, "loss/logits": 0.26556773483753204, "step": 36 }, { "epoch": 4.05, "grad_norm": 0.2392578125, "grad_norm_var": 0.00390551487604777, "learning_rate": 0.008926584654403725, "loss": 3.0098, "loss/crossentropy": 2.7032312154769897, "loss/logits": 0.30657848715782166, "step": 37 }, { "epoch": 4.06, "grad_norm": 0.19921875, "grad_norm_var": 0.0038659056027730305, "learning_rate": 0.008802029828000156, "loss": 2.6913, "loss/crossentropy": 2.424975633621216, "loss/logits": 0.2662935256958008, "step": 38 }, { "epoch": 4.07, "grad_norm": 0.2197265625, "grad_norm_var": 0.0034123579661051433, "learning_rate": 0.008671612547178428, "loss": 2.8371, "loss/crossentropy": 2.559100031852722, "loss/logits": 0.27803806960582733, "step": 39 }, { "epoch": 4.08, "grad_norm": 0.23046875, "grad_norm_var": 0.0027981917063395183, "learning_rate": 0.008535533905932738, "loss": 2.7885, "loss/crossentropy": 2.5171408653259277, "loss/logits": 0.271391361951828, "step": 40 }, { "epoch": 5.01, "grad_norm": 0.166015625, "grad_norm_var": 0.002695910135904948, "learning_rate": 0.00839400372766471, "loss": 2.6022, "loss/crossentropy": 2.342413067817688, "loss/logits": 0.259748637676239, "step": 41 }, { "epoch": 5.02, "grad_norm": 0.16796875, "grad_norm_var": 0.002666918436686198, "learning_rate": 0.008247240241650917, "loss": 2.6642, "loss/crossentropy": 2.3876761198043823, "loss/logits": 0.27653323113918304, "step": 42 }, { "epoch": 5.03, "grad_norm": 0.1689453125, "grad_norm_var": 0.00197222630182902, "learning_rate": 0.00809546974654917, "loss": 2.7753, "loss/crossentropy": 2.5002858638763428, "loss/logits": 0.2750103175640106, "step": 43 }, { "epoch": 5.04, "grad_norm": 0.201171875, "grad_norm_var": 0.001078792413075765, "learning_rate": 0.007938926261462366, "loss": 2.9022, "loss/crossentropy": 2.592116355895996, "loss/logits": 0.31007957458496094, "step": 44 }, { "epoch": 5.05, "grad_norm": 0.185546875, "grad_norm_var": 0.0006791392962137859, "learning_rate": 0.007777851165098011, "loss": 2.6588, "loss/crossentropy": 2.39296555519104, "loss/logits": 0.2658107429742813, "step": 45 }, { "epoch": 5.06, "grad_norm": 0.158203125, "grad_norm_var": 0.0007957100868225097, "learning_rate": 0.0076124928235797445, "loss": 2.6616, "loss/crossentropy": 2.383183717727661, "loss/logits": 0.2784254252910614, "step": 46 }, { "epoch": 5.07, "grad_norm": 0.181640625, "grad_norm_var": 0.0007696747779846191, "learning_rate": 0.007443106207484776, "loss": 2.696, "loss/crossentropy": 2.416291117668152, "loss/logits": 0.2797371596097946, "step": 47 }, { "epoch": 5.08, "grad_norm": 0.181640625, "grad_norm_var": 0.000768280029296875, "learning_rate": 0.007269952498697734, "loss": 2.733, "loss/crossentropy": 2.4512546062469482, "loss/logits": 0.28178632259368896, "step": 48 }, { "epoch": 6.01, "grad_norm": 0.181640625, "grad_norm_var": 0.0006483674049377441, "learning_rate": 0.007093298687687141, "loss": 2.7384, "loss/crossentropy": 2.44217312335968, "loss/logits": 0.2962687909603119, "step": 49 }, { "epoch": 6.02, "grad_norm": 0.1552734375, "grad_norm_var": 0.0007207075754801433, "learning_rate": 0.00691341716182545, "loss": 2.5692, "loss/crossentropy": 2.2908589839935303, "loss/logits": 0.2783200442790985, "step": 50 }, { "epoch": 6.03, "grad_norm": 0.1494140625, "grad_norm_var": 0.0006417433420817057, "learning_rate": 0.006730585285387465, "loss": 2.6463, "loss/crossentropy": 2.3550511598587036, "loss/logits": 0.29120513796806335, "step": 51 }, { "epoch": 6.04, "grad_norm": 0.1552734375, "grad_norm_var": 0.0006787578264872233, "learning_rate": 0.006545084971874737, "loss": 2.7074, "loss/crossentropy": 2.42005717754364, "loss/logits": 0.2873295992612839, "step": 52 }, { "epoch": 6.05, "grad_norm": 0.14453125, "grad_norm_var": 0.0006558100382486979, "learning_rate": 0.006357202249325371, "loss": 2.5666, "loss/crossentropy": 2.284132957458496, "loss/logits": 0.2824671268463135, "step": 53 }, { "epoch": 6.06, "grad_norm": 0.15234375, "grad_norm_var": 0.00048542022705078125, "learning_rate": 0.0061672268192795275, "loss": 2.644, "loss/crossentropy": 2.3572909832000732, "loss/logits": 0.286677747964859, "step": 54 }, { "epoch": 6.07, "grad_norm": 0.1533203125, "grad_norm_var": 0.0002622246742248535, "learning_rate": 0.005975451610080642, "loss": 2.5268, "loss/crossentropy": 2.2449907064437866, "loss/logits": 0.28181955218315125, "step": 55 }, { "epoch": 6.08, "grad_norm": 0.1298828125, "grad_norm_var": 0.00035235087076822914, "learning_rate": 0.0057821723252011546, "loss": 2.503, "loss/crossentropy": 2.2291653156280518, "loss/logits": 0.27385060489177704, "step": 56 }, { "epoch": 7.01, "grad_norm": 0.1337890625, "grad_norm_var": 0.00048067967096964516, "learning_rate": 0.0055876869872891885, "loss": 2.5122, "loss/crossentropy": 2.2260255813598633, "loss/logits": 0.2862100601196289, "step": 57 }, { "epoch": 7.02, "grad_norm": 0.1318359375, "grad_norm_var": 0.0004093011220296224, "learning_rate": 0.0053922954786392256, "loss": 2.5321, "loss/crossentropy": 2.243638038635254, "loss/logits": 0.2884128838777542, "step": 58 }, { "epoch": 7.03, "grad_norm": 0.130859375, "grad_norm_var": 0.0003852685292561849, "learning_rate": 0.005196299078795343, "loss": 2.4881, "loss/crossentropy": 2.1921509504318237, "loss/logits": 0.2959805279970169, "step": 59 }, { "epoch": 7.04, "grad_norm": 0.1357421875, "grad_norm_var": 0.0004018108050028483, "learning_rate": 0.005, "loss": 2.6882, "loss/crossentropy": 2.393338203430176, "loss/logits": 0.2948644757270813, "step": 60 }, { "epoch": 7.05, "grad_norm": 0.138671875, "grad_norm_var": 0.0003462115923563639, "learning_rate": 0.004803700921204659, "loss": 2.6864, "loss/crossentropy": 2.387941598892212, "loss/logits": 0.29849502444267273, "step": 61 }, { "epoch": 7.06, "grad_norm": 0.1279296875, "grad_norm_var": 0.00029354095458984376, "learning_rate": 0.004607704521360776, "loss": 2.446, "loss/crossentropy": 2.1593087911605835, "loss/logits": 0.28673939406871796, "step": 62 }, { "epoch": 7.07, "grad_norm": 0.1298828125, "grad_norm_var": 0.00021330118179321289, "learning_rate": 0.004412313012710813, "loss": 2.5145, "loss/crossentropy": 2.2218284606933594, "loss/logits": 0.29268868267536163, "step": 63 }, { "epoch": 7.08, "grad_norm": 0.1240234375, "grad_norm_var": 0.00012022654215494791, "learning_rate": 0.004217827674798845, "loss": 2.4267, "loss/crossentropy": 2.1349563598632812, "loss/logits": 0.29178011417388916, "step": 64 }, { "epoch": 8.01, "grad_norm": 0.12451171875, "grad_norm_var": 0.00011052191257476807, "learning_rate": 0.004024548389919359, "loss": 2.5903, "loss/crossentropy": 2.2803404331207275, "loss/logits": 0.30999650061130524, "step": 65 }, { "epoch": 8.02, "grad_norm": 0.11376953125, "grad_norm_var": 0.00010824203491210938, "learning_rate": 0.003832773180720475, "loss": 2.4998, "loss/crossentropy": 2.2094690799713135, "loss/logits": 0.2903156131505966, "step": 66 }, { "epoch": 8.03, "grad_norm": 0.11865234375, "grad_norm_var": 0.00010966360569000244, "learning_rate": 0.003642797750674629, "loss": 2.4863, "loss/crossentropy": 2.1871025562286377, "loss/logits": 0.29923413693904877, "step": 67 }, { "epoch": 8.04, "grad_norm": 0.11572265625, "grad_norm_var": 9.021759033203124e-05, "learning_rate": 0.003454915028125263, "loss": 2.3923, "loss/crossentropy": 2.0989878177642822, "loss/logits": 0.2932642847299576, "step": 68 }, { "epoch": 8.05, "grad_norm": 0.1181640625, "grad_norm_var": 5.30242919921875e-05, "learning_rate": 0.003269414714612534, "loss": 2.563, "loss/crossentropy": 2.2577706575393677, "loss/logits": 0.30524544417858124, "step": 69 }, { "epoch": 8.06, "grad_norm": 0.11083984375, "grad_norm_var": 6.763041019439698e-05, "learning_rate": 0.0030865828381745515, "loss": 2.4157, "loss/crossentropy": 2.1293630599975586, "loss/logits": 0.2863532304763794, "step": 70 }, { "epoch": 8.07, "grad_norm": 0.1181640625, "grad_norm_var": 6.939470767974854e-05, "learning_rate": 0.002906701312312861, "loss": 2.4613, "loss/crossentropy": 2.155823588371277, "loss/logits": 0.3054400086402893, "step": 71 }, { "epoch": 8.08, "grad_norm": 0.1142578125, "grad_norm_var": 6.979207197825114e-05, "learning_rate": 0.0027300475013022664, "loss": 2.4934, "loss/crossentropy": 2.1984496116638184, "loss/logits": 0.29494835436344147, "step": 72 }, { "epoch": 9.01, "grad_norm": 0.10546875, "grad_norm_var": 8.160173892974853e-05, "learning_rate": 0.002556893792515227, "loss": 2.436, "loss/crossentropy": 2.138706684112549, "loss/logits": 0.29732728004455566, "step": 73 }, { "epoch": 9.02, "grad_norm": 0.1103515625, "grad_norm_var": 7.158021132151286e-05, "learning_rate": 0.002387507176420256, "loss": 2.5448, "loss/crossentropy": 2.2357265949249268, "loss/logits": 0.3090888559818268, "step": 74 }, { "epoch": 9.03, "grad_norm": 0.10888671875, "grad_norm_var": 5.0067901611328125e-05, "learning_rate": 0.00222214883490199, "loss": 2.4924, "loss/crossentropy": 2.195006251335144, "loss/logits": 0.29741397500038147, "step": 75 }, { "epoch": 9.04, "grad_norm": 0.1083984375, "grad_norm_var": 4.657109578450521e-05, "learning_rate": 0.0020610737385376348, "loss": 2.3857, "loss/crossentropy": 2.0854870080947876, "loss/logits": 0.3002435863018036, "step": 76 }, { "epoch": 9.05, "grad_norm": 0.11279296875, "grad_norm_var": 3.367165724436442e-05, "learning_rate": 0.0019045302534508297, "loss": 2.3901, "loss/crossentropy": 2.0945165157318115, "loss/logits": 0.2955891638994217, "step": 77 }, { "epoch": 9.06, "grad_norm": 0.1083984375, "grad_norm_var": 3.042916456858317e-05, "learning_rate": 0.0017527597583490823, "loss": 2.5162, "loss/crossentropy": 2.212076425552368, "loss/logits": 0.3041383922100067, "step": 78 }, { "epoch": 9.07, "grad_norm": 0.1083984375, "grad_norm_var": 2.5152166684468588e-05, "learning_rate": 0.0016059962723352912, "loss": 2.5047, "loss/crossentropy": 2.201292634010315, "loss/logits": 0.30345703661441803, "step": 79 }, { "epoch": 9.08, "grad_norm": 0.107421875, "grad_norm_var": 1.760721206665039e-05, "learning_rate": 0.0014644660940672626, "loss": 2.4349, "loss/crossentropy": 2.1304625272750854, "loss/logits": 0.30440980195999146, "step": 80 }, { "epoch": 10.01, "grad_norm": 0.1015625, "grad_norm_var": 2.1199385325113933e-05, "learning_rate": 0.0013283874528215733, "loss": 2.5581, "loss/crossentropy": 2.2433000802993774, "loss/logits": 0.31478117406368256, "step": 81 }, { "epoch": 10.02, "grad_norm": 0.10107421875, "grad_norm_var": 2.471605936686198e-05, "learning_rate": 0.0011979701719998454, "loss": 2.4003, "loss/crossentropy": 2.098269462585449, "loss/logits": 0.30198484659194946, "step": 82 }, { "epoch": 10.03, "grad_norm": 0.1005859375, "grad_norm_var": 2.4286905924479165e-05, "learning_rate": 0.0010734153455962765, "loss": 2.5084, "loss/crossentropy": 2.2013310194015503, "loss/logits": 0.30710338056087494, "step": 83 }, { "epoch": 10.04, "grad_norm": 0.10205078125, "grad_norm_var": 2.653996149698893e-05, "learning_rate": 0.0009549150281252633, "loss": 2.5173, "loss/crossentropy": 2.213552951812744, "loss/logits": 0.3037945479154587, "step": 84 }, { "epoch": 10.05, "grad_norm": 0.0986328125, "grad_norm_var": 2.4155775705973306e-05, "learning_rate": 0.0008426519384872733, "loss": 2.3663, "loss/crossentropy": 2.0724344849586487, "loss/logits": 0.2938496023416519, "step": 85 }, { "epoch": 10.06, "grad_norm": 0.10205078125, "grad_norm_var": 2.1448731422424318e-05, "learning_rate": 0.0007367991782295391, "loss": 2.3785, "loss/crossentropy": 2.0789949893951416, "loss/logits": 0.2995530366897583, "step": 86 }, { "epoch": 10.07, "grad_norm": 0.10009765625, "grad_norm_var": 1.8596649169921875e-05, "learning_rate": 0.0006375199646360141, "loss": 2.4332, "loss/crossentropy": 2.1326472759246826, "loss/logits": 0.30053600668907166, "step": 87 }, { "epoch": 10.08, "grad_norm": 0.10009765625, "grad_norm_var": 2.0224849383036295e-05, "learning_rate": 0.0005449673790581611, "loss": 2.494, "loss/crossentropy": 2.1958537101745605, "loss/logits": 0.2981116622686386, "step": 88 }, { "epoch": 11.01, "grad_norm": 0.0966796875, "grad_norm_var": 2.1091103553771974e-05, "learning_rate": 0.0004592841308745932, "loss": 2.4855, "loss/crossentropy": 2.183037042617798, "loss/logits": 0.3025069534778595, "step": 89 }, { "epoch": 11.02, "grad_norm": 0.09814453125, "grad_norm_var": 2.0945072174072264e-05, "learning_rate": 0.0003806023374435663, "loss": 2.3937, "loss/crossentropy": 2.0915567874908447, "loss/logits": 0.3020932674407959, "step": 90 }, { "epoch": 11.03, "grad_norm": 0.0966796875, "grad_norm_var": 1.5798211097717286e-05, "learning_rate": 0.00030904332038757975, "loss": 2.4012, "loss/crossentropy": 2.1043936014175415, "loss/logits": 0.29680541157722473, "step": 91 }, { "epoch": 11.04, "grad_norm": 0.09716796875, "grad_norm_var": 1.3856093088785807e-05, "learning_rate": 0.00024471741852423234, "loss": 2.4577, "loss/crossentropy": 2.1379482746124268, "loss/logits": 0.3198002427816391, "step": 92 }, { "epoch": 11.05, "grad_norm": 0.095703125, "grad_norm_var": 1.163482666015625e-05, "learning_rate": 0.00018772381773176416, "loss": 2.4786, "loss/crossentropy": 2.173209071159363, "loss/logits": 0.3054308444261551, "step": 93 }, { "epoch": 11.06, "grad_norm": 0.0966796875, "grad_norm_var": 8.706251780192057e-06, "learning_rate": 0.0001381503980116172, "loss": 2.5107, "loss/crossentropy": 2.2105352878570557, "loss/logits": 0.30013974010944366, "step": 94 }, { "epoch": 11.07, "grad_norm": 0.095703125, "grad_norm_var": 5.177656809488932e-06, "learning_rate": 9.607359798384785e-05, "loss": 2.4619, "loss/crossentropy": 2.167112946510315, "loss/logits": 0.29475128650665283, "step": 95 }, { "epoch": 11.08, "grad_norm": 0.095703125, "grad_norm_var": 5.273024241129557e-06, "learning_rate": 6.15582970243117e-05, "loss": 2.4592, "loss/crossentropy": 2.160895347595215, "loss/logits": 0.298322930932045, "step": 96 }, { "epoch": 12.01, "grad_norm": 0.09912109375, "grad_norm_var": 4.9591064453125e-06, "learning_rate": 3.465771522536854e-05, "loss": 2.5121, "loss/crossentropy": 2.1986340284347534, "loss/logits": 0.31342923641204834, "step": 97 }, { "epoch": 12.02, "grad_norm": 0.0947265625, "grad_norm_var": 4.497170448303223e-06, "learning_rate": 1.541333133436018e-05, "loss": 2.4629, "loss/crossentropy": 2.156323194503784, "loss/logits": 0.30655351281166077, "step": 98 }, { "epoch": 12.03, "grad_norm": 0.09228515625, "grad_norm_var": 6.214777628580729e-06, "learning_rate": 3.854818796385496e-06, "loss": 2.5164, "loss/crossentropy": 2.2150908708572388, "loss/logits": 0.30133919417858124, "step": 99 }, { "epoch": 12.04, "grad_norm": 0.09228515625, "grad_norm_var": 5.976359049479167e-06, "learning_rate": 0.0, "loss": 2.378, "loss/crossentropy": 2.089114785194397, "loss/logits": 0.2889086455106735, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.16881259823104e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }