{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 12.04,
"eval_steps": 500000000,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 0.271484375,
"learning_rate": 0.0005,
"loss": 3.4406,
"loss/crossentropy": 3.14038622379303,
"loss/logits": 0.30017508566379547,
"step": 1
},
{
"epoch": 0.02,
"grad_norm": 0.384765625,
"learning_rate": 0.001,
"loss": 3.4222,
"loss/crossentropy": 3.1114853620529175,
"loss/logits": 0.31074509024620056,
"step": 2
},
{
"epoch": 0.03,
"grad_norm": 0.220703125,
"learning_rate": 0.0015,
"loss": 3.4379,
"loss/crossentropy": 3.1400939226150513,
"loss/logits": 0.29777538776397705,
"step": 3
},
{
"epoch": 0.04,
"grad_norm": 0.251953125,
"learning_rate": 0.002,
"loss": 3.3201,
"loss/crossentropy": 3.0177063941955566,
"loss/logits": 0.30243340134620667,
"step": 4
},
{
"epoch": 0.05,
"grad_norm": 0.296875,
"learning_rate": 0.0025,
"loss": 3.3921,
"loss/crossentropy": 3.0849010944366455,
"loss/logits": 0.307217076420784,
"step": 5
},
{
"epoch": 0.06,
"grad_norm": 0.337890625,
"learning_rate": 0.003,
"loss": 3.4715,
"loss/crossentropy": 3.13341748714447,
"loss/logits": 0.33807775378227234,
"step": 6
},
{
"epoch": 0.07,
"grad_norm": 0.296875,
"learning_rate": 0.0034999999999999996,
"loss": 3.4079,
"loss/crossentropy": 3.0802475214004517,
"loss/logits": 0.3276752084493637,
"step": 7
},
{
"epoch": 0.08,
"grad_norm": 0.2890625,
"learning_rate": 0.004,
"loss": 3.2953,
"loss/crossentropy": 2.987769603729248,
"loss/logits": 0.3075404167175293,
"step": 8
},
{
"epoch": 1.01,
"grad_norm": 0.31640625,
"learning_rate": 0.0045000000000000005,
"loss": 3.2869,
"loss/crossentropy": 2.9777328968048096,
"loss/logits": 0.3091176152229309,
"step": 9
},
{
"epoch": 1.02,
"grad_norm": 0.390625,
"learning_rate": 0.005,
"loss": 3.2259,
"loss/crossentropy": 2.9359546899795532,
"loss/logits": 0.28997449576854706,
"step": 10
},
{
"epoch": 1.03,
"grad_norm": 0.42578125,
"learning_rate": 0.0055000000000000005,
"loss": 3.3682,
"loss/crossentropy": 3.0672762393951416,
"loss/logits": 0.3009057492017746,
"step": 11
},
{
"epoch": 1.04,
"grad_norm": 0.6796875,
"learning_rate": 0.006,
"loss": 3.2457,
"loss/crossentropy": 2.9338029623031616,
"loss/logits": 0.31185680627822876,
"step": 12
},
{
"epoch": 1.05,
"grad_norm": 0.58984375,
"learning_rate": 0.006500000000000001,
"loss": 3.5126,
"loss/crossentropy": 3.1620391607284546,
"loss/logits": 0.3505771607160568,
"step": 13
},
{
"epoch": 1.06,
"grad_norm": 0.546875,
"learning_rate": 0.006999999999999999,
"loss": 3.3082,
"loss/crossentropy": 2.9693878889083862,
"loss/logits": 0.3388153314590454,
"step": 14
},
{
"epoch": 1.07,
"grad_norm": 0.7109375,
"grad_norm_var": 0.02401172320048014,
"learning_rate": 0.0075,
"loss": 3.1669,
"loss/crossentropy": 2.8395652770996094,
"loss/logits": 0.3273603916168213,
"step": 15
},
{
"epoch": 1.08,
"grad_norm": 0.62109375,
"grad_norm_var": 0.02595494588216146,
"learning_rate": 0.008,
"loss": 3.435,
"loss/crossentropy": 3.0843794345855713,
"loss/logits": 0.35064929723739624,
"step": 16
},
{
"epoch": 2.01,
"grad_norm": 0.49609375,
"grad_norm_var": 0.02552026112874349,
"learning_rate": 0.0085,
"loss": 3.3402,
"loss/crossentropy": 3.018695592880249,
"loss/logits": 0.32146573066711426,
"step": 17
},
{
"epoch": 2.02,
"grad_norm": 0.77734375,
"grad_norm_var": 0.029073317845662434,
"learning_rate": 0.009000000000000001,
"loss": 3.3812,
"loss/crossentropy": 2.9771525859832764,
"loss/logits": 0.4040682762861252,
"step": 18
},
{
"epoch": 2.03,
"grad_norm": 0.671875,
"grad_norm_var": 0.028688796361287437,
"learning_rate": 0.0095,
"loss": 3.226,
"loss/crossentropy": 2.9090826511383057,
"loss/logits": 0.31695474684238434,
"step": 19
},
{
"epoch": 2.04,
"grad_norm": 0.63671875,
"grad_norm_var": 0.027660115559895834,
"learning_rate": 0.01,
"loss": 2.9946,
"loss/crossentropy": 2.6920872926712036,
"loss/logits": 0.30253250896930695,
"step": 20
},
{
"epoch": 2.05,
"grad_norm": 0.6171875,
"grad_norm_var": 0.024438222249348957,
"learning_rate": 0.009996145181203616,
"loss": 3.2839,
"loss/crossentropy": 2.9436020851135254,
"loss/logits": 0.3402951508760452,
"step": 21
},
{
"epoch": 2.06,
"grad_norm": 0.515625,
"grad_norm_var": 0.019991048177083335,
"learning_rate": 0.00998458666866564,
"loss": 3.2061,
"loss/crossentropy": 2.8926981687545776,
"loss/logits": 0.313431978225708,
"step": 22
},
{
"epoch": 2.07,
"grad_norm": 0.40234375,
"grad_norm_var": 0.016751543680826823,
"learning_rate": 0.009965342284774633,
"loss": 3.1526,
"loss/crossentropy": 2.831883192062378,
"loss/logits": 0.3207142651081085,
"step": 23
},
{
"epoch": 2.08,
"grad_norm": 0.373046875,
"grad_norm_var": 0.015084314346313476,
"learning_rate": 0.009938441702975689,
"loss": 3.0691,
"loss/crossentropy": 2.7710957527160645,
"loss/logits": 0.2979845702648163,
"step": 24
},
{
"epoch": 3.01,
"grad_norm": 0.3671875,
"grad_norm_var": 0.01682891845703125,
"learning_rate": 0.009903926402016152,
"loss": 3.0935,
"loss/crossentropy": 2.800452947616577,
"loss/logits": 0.2930735796689987,
"step": 25
},
{
"epoch": 3.02,
"grad_norm": 0.328125,
"grad_norm_var": 0.019060516357421876,
"learning_rate": 0.009861849601988383,
"loss": 2.972,
"loss/crossentropy": 2.6579891443252563,
"loss/logits": 0.31396760046482086,
"step": 26
},
{
"epoch": 3.03,
"grad_norm": 0.31640625,
"grad_norm_var": 0.021935526529947916,
"learning_rate": 0.009812276182268235,
"loss": 3.0729,
"loss/crossentropy": 2.7841025590896606,
"loss/logits": 0.2887759953737259,
"step": 27
},
{
"epoch": 3.04,
"grad_norm": 0.31640625,
"grad_norm_var": 0.02453505198160807,
"learning_rate": 0.009755282581475769,
"loss": 2.9013,
"loss/crossentropy": 2.6322743892669678,
"loss/logits": 0.2690378725528717,
"step": 28
},
{
"epoch": 3.05,
"grad_norm": 0.349609375,
"grad_norm_var": 0.022967767715454102,
"learning_rate": 0.00969095667961242,
"loss": 2.9966,
"loss/crossentropy": 2.718039870262146,
"loss/logits": 0.2785477787256241,
"step": 29
},
{
"epoch": 3.06,
"grad_norm": 0.337890625,
"grad_norm_var": 0.022896321614583333,
"learning_rate": 0.009619397662556433,
"loss": 2.9799,
"loss/crossentropy": 2.687403917312622,
"loss/logits": 0.2925274521112442,
"step": 30
},
{
"epoch": 3.07,
"grad_norm": 0.294921875,
"grad_norm_var": 0.02292021115620931,
"learning_rate": 0.009540715869125407,
"loss": 2.8545,
"loss/crossentropy": 2.56766939163208,
"loss/logits": 0.2868521362543106,
"step": 31
},
{
"epoch": 3.08,
"grad_norm": 0.232421875,
"grad_norm_var": 0.02558739980061849,
"learning_rate": 0.00945503262094184,
"loss": 3.0801,
"loss/crossentropy": 2.790625810623169,
"loss/logits": 0.28946465253829956,
"step": 32
},
{
"epoch": 4.01,
"grad_norm": 0.2197265625,
"grad_norm_var": 0.01530239979426066,
"learning_rate": 0.009362480035363985,
"loss": 2.8576,
"loss/crossentropy": 2.584877848625183,
"loss/logits": 0.2727508991956711,
"step": 33
},
{
"epoch": 4.02,
"grad_norm": 0.2431640625,
"grad_norm_var": 0.010964266459147136,
"learning_rate": 0.009263200821770462,
"loss": 2.8391,
"loss/crossentropy": 2.565540909767151,
"loss/logits": 0.2735111564397812,
"step": 34
},
{
"epoch": 4.03,
"grad_norm": 0.205078125,
"grad_norm_var": 0.006621154149373373,
"learning_rate": 0.009157348061512726,
"loss": 2.8551,
"loss/crossentropy": 2.577111601829529,
"loss/logits": 0.27798011898994446,
"step": 35
},
{
"epoch": 4.04,
"grad_norm": 0.208984375,
"grad_norm_var": 0.004467582702636719,
"learning_rate": 0.009045084971874737,
"loss": 2.8021,
"loss/crossentropy": 2.5365407466888428,
"loss/logits": 0.26556773483753204,
"step": 36
},
{
"epoch": 4.05,
"grad_norm": 0.2392578125,
"grad_norm_var": 0.00390551487604777,
"learning_rate": 0.008926584654403725,
"loss": 3.0098,
"loss/crossentropy": 2.7032312154769897,
"loss/logits": 0.30657848715782166,
"step": 37
},
{
"epoch": 4.06,
"grad_norm": 0.19921875,
"grad_norm_var": 0.0038659056027730305,
"learning_rate": 0.008802029828000156,
"loss": 2.6913,
"loss/crossentropy": 2.424975633621216,
"loss/logits": 0.2662935256958008,
"step": 38
},
{
"epoch": 4.07,
"grad_norm": 0.2197265625,
"grad_norm_var": 0.0034123579661051433,
"learning_rate": 0.008671612547178428,
"loss": 2.8371,
"loss/crossentropy": 2.559100031852722,
"loss/logits": 0.27803806960582733,
"step": 39
},
{
"epoch": 4.08,
"grad_norm": 0.23046875,
"grad_norm_var": 0.0027981917063395183,
"learning_rate": 0.008535533905932738,
"loss": 2.7885,
"loss/crossentropy": 2.5171408653259277,
"loss/logits": 0.271391361951828,
"step": 40
},
{
"epoch": 5.01,
"grad_norm": 0.166015625,
"grad_norm_var": 0.002695910135904948,
"learning_rate": 0.00839400372766471,
"loss": 2.6022,
"loss/crossentropy": 2.342413067817688,
"loss/logits": 0.259748637676239,
"step": 41
},
{
"epoch": 5.02,
"grad_norm": 0.16796875,
"grad_norm_var": 0.002666918436686198,
"learning_rate": 0.008247240241650917,
"loss": 2.6642,
"loss/crossentropy": 2.3876761198043823,
"loss/logits": 0.27653323113918304,
"step": 42
},
{
"epoch": 5.03,
"grad_norm": 0.1689453125,
"grad_norm_var": 0.00197222630182902,
"learning_rate": 0.00809546974654917,
"loss": 2.7753,
"loss/crossentropy": 2.5002858638763428,
"loss/logits": 0.2750103175640106,
"step": 43
},
{
"epoch": 5.04,
"grad_norm": 0.201171875,
"grad_norm_var": 0.001078792413075765,
"learning_rate": 0.007938926261462366,
"loss": 2.9022,
"loss/crossentropy": 2.592116355895996,
"loss/logits": 0.31007957458496094,
"step": 44
},
{
"epoch": 5.05,
"grad_norm": 0.185546875,
"grad_norm_var": 0.0006791392962137859,
"learning_rate": 0.007777851165098011,
"loss": 2.6588,
"loss/crossentropy": 2.39296555519104,
"loss/logits": 0.2658107429742813,
"step": 45
},
{
"epoch": 5.06,
"grad_norm": 0.158203125,
"grad_norm_var": 0.0007957100868225097,
"learning_rate": 0.0076124928235797445,
"loss": 2.6616,
"loss/crossentropy": 2.383183717727661,
"loss/logits": 0.2784254252910614,
"step": 46
},
{
"epoch": 5.07,
"grad_norm": 0.181640625,
"grad_norm_var": 0.0007696747779846191,
"learning_rate": 0.007443106207484776,
"loss": 2.696,
"loss/crossentropy": 2.416291117668152,
"loss/logits": 0.2797371596097946,
"step": 47
},
{
"epoch": 5.08,
"grad_norm": 0.181640625,
"grad_norm_var": 0.000768280029296875,
"learning_rate": 0.007269952498697734,
"loss": 2.733,
"loss/crossentropy": 2.4512546062469482,
"loss/logits": 0.28178632259368896,
"step": 48
},
{
"epoch": 6.01,
"grad_norm": 0.181640625,
"grad_norm_var": 0.0006483674049377441,
"learning_rate": 0.007093298687687141,
"loss": 2.7384,
"loss/crossentropy": 2.44217312335968,
"loss/logits": 0.2962687909603119,
"step": 49
},
{
"epoch": 6.02,
"grad_norm": 0.1552734375,
"grad_norm_var": 0.0007207075754801433,
"learning_rate": 0.00691341716182545,
"loss": 2.5692,
"loss/crossentropy": 2.2908589839935303,
"loss/logits": 0.2783200442790985,
"step": 50
},
{
"epoch": 6.03,
"grad_norm": 0.1494140625,
"grad_norm_var": 0.0006417433420817057,
"learning_rate": 0.006730585285387465,
"loss": 2.6463,
"loss/crossentropy": 2.3550511598587036,
"loss/logits": 0.29120513796806335,
"step": 51
},
{
"epoch": 6.04,
"grad_norm": 0.1552734375,
"grad_norm_var": 0.0006787578264872233,
"learning_rate": 0.006545084971874737,
"loss": 2.7074,
"loss/crossentropy": 2.42005717754364,
"loss/logits": 0.2873295992612839,
"step": 52
},
{
"epoch": 6.05,
"grad_norm": 0.14453125,
"grad_norm_var": 0.0006558100382486979,
"learning_rate": 0.006357202249325371,
"loss": 2.5666,
"loss/crossentropy": 2.284132957458496,
"loss/logits": 0.2824671268463135,
"step": 53
},
{
"epoch": 6.06,
"grad_norm": 0.15234375,
"grad_norm_var": 0.00048542022705078125,
"learning_rate": 0.0061672268192795275,
"loss": 2.644,
"loss/crossentropy": 2.3572909832000732,
"loss/logits": 0.286677747964859,
"step": 54
},
{
"epoch": 6.07,
"grad_norm": 0.1533203125,
"grad_norm_var": 0.0002622246742248535,
"learning_rate": 0.005975451610080642,
"loss": 2.5268,
"loss/crossentropy": 2.2449907064437866,
"loss/logits": 0.28181955218315125,
"step": 55
},
{
"epoch": 6.08,
"grad_norm": 0.1298828125,
"grad_norm_var": 0.00035235087076822914,
"learning_rate": 0.0057821723252011546,
"loss": 2.503,
"loss/crossentropy": 2.2291653156280518,
"loss/logits": 0.27385060489177704,
"step": 56
},
{
"epoch": 7.01,
"grad_norm": 0.1337890625,
"grad_norm_var": 0.00048067967096964516,
"learning_rate": 0.0055876869872891885,
"loss": 2.5122,
"loss/crossentropy": 2.2260255813598633,
"loss/logits": 0.2862100601196289,
"step": 57
},
{
"epoch": 7.02,
"grad_norm": 0.1318359375,
"grad_norm_var": 0.0004093011220296224,
"learning_rate": 0.0053922954786392256,
"loss": 2.5321,
"loss/crossentropy": 2.243638038635254,
"loss/logits": 0.2884128838777542,
"step": 58
},
{
"epoch": 7.03,
"grad_norm": 0.130859375,
"grad_norm_var": 0.0003852685292561849,
"learning_rate": 0.005196299078795343,
"loss": 2.4881,
"loss/crossentropy": 2.1921509504318237,
"loss/logits": 0.2959805279970169,
"step": 59
},
{
"epoch": 7.04,
"grad_norm": 0.1357421875,
"grad_norm_var": 0.0004018108050028483,
"learning_rate": 0.005,
"loss": 2.6882,
"loss/crossentropy": 2.393338203430176,
"loss/logits": 0.2948644757270813,
"step": 60
},
{
"epoch": 7.05,
"grad_norm": 0.138671875,
"grad_norm_var": 0.0003462115923563639,
"learning_rate": 0.004803700921204659,
"loss": 2.6864,
"loss/crossentropy": 2.387941598892212,
"loss/logits": 0.29849502444267273,
"step": 61
},
{
"epoch": 7.06,
"grad_norm": 0.1279296875,
"grad_norm_var": 0.00029354095458984376,
"learning_rate": 0.004607704521360776,
"loss": 2.446,
"loss/crossentropy": 2.1593087911605835,
"loss/logits": 0.28673939406871796,
"step": 62
},
{
"epoch": 7.07,
"grad_norm": 0.1298828125,
"grad_norm_var": 0.00021330118179321289,
"learning_rate": 0.004412313012710813,
"loss": 2.5145,
"loss/crossentropy": 2.2218284606933594,
"loss/logits": 0.29268868267536163,
"step": 63
},
{
"epoch": 7.08,
"grad_norm": 0.1240234375,
"grad_norm_var": 0.00012022654215494791,
"learning_rate": 0.004217827674798845,
"loss": 2.4267,
"loss/crossentropy": 2.1349563598632812,
"loss/logits": 0.29178011417388916,
"step": 64
},
{
"epoch": 8.01,
"grad_norm": 0.12451171875,
"grad_norm_var": 0.00011052191257476807,
"learning_rate": 0.004024548389919359,
"loss": 2.5903,
"loss/crossentropy": 2.2803404331207275,
"loss/logits": 0.30999650061130524,
"step": 65
},
{
"epoch": 8.02,
"grad_norm": 0.11376953125,
"grad_norm_var": 0.00010824203491210938,
"learning_rate": 0.003832773180720475,
"loss": 2.4998,
"loss/crossentropy": 2.2094690799713135,
"loss/logits": 0.2903156131505966,
"step": 66
},
{
"epoch": 8.03,
"grad_norm": 0.11865234375,
"grad_norm_var": 0.00010966360569000244,
"learning_rate": 0.003642797750674629,
"loss": 2.4863,
"loss/crossentropy": 2.1871025562286377,
"loss/logits": 0.29923413693904877,
"step": 67
},
{
"epoch": 8.04,
"grad_norm": 0.11572265625,
"grad_norm_var": 9.021759033203124e-05,
"learning_rate": 0.003454915028125263,
"loss": 2.3923,
"loss/crossentropy": 2.0989878177642822,
"loss/logits": 0.2932642847299576,
"step": 68
},
{
"epoch": 8.05,
"grad_norm": 0.1181640625,
"grad_norm_var": 5.30242919921875e-05,
"learning_rate": 0.003269414714612534,
"loss": 2.563,
"loss/crossentropy": 2.2577706575393677,
"loss/logits": 0.30524544417858124,
"step": 69
},
{
"epoch": 8.06,
"grad_norm": 0.11083984375,
"grad_norm_var": 6.763041019439698e-05,
"learning_rate": 0.0030865828381745515,
"loss": 2.4157,
"loss/crossentropy": 2.1293630599975586,
"loss/logits": 0.2863532304763794,
"step": 70
},
{
"epoch": 8.07,
"grad_norm": 0.1181640625,
"grad_norm_var": 6.939470767974854e-05,
"learning_rate": 0.002906701312312861,
"loss": 2.4613,
"loss/crossentropy": 2.155823588371277,
"loss/logits": 0.3054400086402893,
"step": 71
},
{
"epoch": 8.08,
"grad_norm": 0.1142578125,
"grad_norm_var": 6.979207197825114e-05,
"learning_rate": 0.0027300475013022664,
"loss": 2.4934,
"loss/crossentropy": 2.1984496116638184,
"loss/logits": 0.29494835436344147,
"step": 72
},
{
"epoch": 9.01,
"grad_norm": 0.10546875,
"grad_norm_var": 8.160173892974853e-05,
"learning_rate": 0.002556893792515227,
"loss": 2.436,
"loss/crossentropy": 2.138706684112549,
"loss/logits": 0.29732728004455566,
"step": 73
},
{
"epoch": 9.02,
"grad_norm": 0.1103515625,
"grad_norm_var": 7.158021132151286e-05,
"learning_rate": 0.002387507176420256,
"loss": 2.5448,
"loss/crossentropy": 2.2357265949249268,
"loss/logits": 0.3090888559818268,
"step": 74
},
{
"epoch": 9.03,
"grad_norm": 0.10888671875,
"grad_norm_var": 5.0067901611328125e-05,
"learning_rate": 0.00222214883490199,
"loss": 2.4924,
"loss/crossentropy": 2.195006251335144,
"loss/logits": 0.29741397500038147,
"step": 75
},
{
"epoch": 9.04,
"grad_norm": 0.1083984375,
"grad_norm_var": 4.657109578450521e-05,
"learning_rate": 0.0020610737385376348,
"loss": 2.3857,
"loss/crossentropy": 2.0854870080947876,
"loss/logits": 0.3002435863018036,
"step": 76
},
{
"epoch": 9.05,
"grad_norm": 0.11279296875,
"grad_norm_var": 3.367165724436442e-05,
"learning_rate": 0.0019045302534508297,
"loss": 2.3901,
"loss/crossentropy": 2.0945165157318115,
"loss/logits": 0.2955891638994217,
"step": 77
},
{
"epoch": 9.06,
"grad_norm": 0.1083984375,
"grad_norm_var": 3.042916456858317e-05,
"learning_rate": 0.0017527597583490823,
"loss": 2.5162,
"loss/crossentropy": 2.212076425552368,
"loss/logits": 0.3041383922100067,
"step": 78
},
{
"epoch": 9.07,
"grad_norm": 0.1083984375,
"grad_norm_var": 2.5152166684468588e-05,
"learning_rate": 0.0016059962723352912,
"loss": 2.5047,
"loss/crossentropy": 2.201292634010315,
"loss/logits": 0.30345703661441803,
"step": 79
},
{
"epoch": 9.08,
"grad_norm": 0.107421875,
"grad_norm_var": 1.760721206665039e-05,
"learning_rate": 0.0014644660940672626,
"loss": 2.4349,
"loss/crossentropy": 2.1304625272750854,
"loss/logits": 0.30440980195999146,
"step": 80
},
{
"epoch": 10.01,
"grad_norm": 0.1015625,
"grad_norm_var": 2.1199385325113933e-05,
"learning_rate": 0.0013283874528215733,
"loss": 2.5581,
"loss/crossentropy": 2.2433000802993774,
"loss/logits": 0.31478117406368256,
"step": 81
},
{
"epoch": 10.02,
"grad_norm": 0.10107421875,
"grad_norm_var": 2.471605936686198e-05,
"learning_rate": 0.0011979701719998454,
"loss": 2.4003,
"loss/crossentropy": 2.098269462585449,
"loss/logits": 0.30198484659194946,
"step": 82
},
{
"epoch": 10.03,
"grad_norm": 0.1005859375,
"grad_norm_var": 2.4286905924479165e-05,
"learning_rate": 0.0010734153455962765,
"loss": 2.5084,
"loss/crossentropy": 2.2013310194015503,
"loss/logits": 0.30710338056087494,
"step": 83
},
{
"epoch": 10.04,
"grad_norm": 0.10205078125,
"grad_norm_var": 2.653996149698893e-05,
"learning_rate": 0.0009549150281252633,
"loss": 2.5173,
"loss/crossentropy": 2.213552951812744,
"loss/logits": 0.3037945479154587,
"step": 84
},
{
"epoch": 10.05,
"grad_norm": 0.0986328125,
"grad_norm_var": 2.4155775705973306e-05,
"learning_rate": 0.0008426519384872733,
"loss": 2.3663,
"loss/crossentropy": 2.0724344849586487,
"loss/logits": 0.2938496023416519,
"step": 85
},
{
"epoch": 10.06,
"grad_norm": 0.10205078125,
"grad_norm_var": 2.1448731422424318e-05,
"learning_rate": 0.0007367991782295391,
"loss": 2.3785,
"loss/crossentropy": 2.0789949893951416,
"loss/logits": 0.2995530366897583,
"step": 86
},
{
"epoch": 10.07,
"grad_norm": 0.10009765625,
"grad_norm_var": 1.8596649169921875e-05,
"learning_rate": 0.0006375199646360141,
"loss": 2.4332,
"loss/crossentropy": 2.1326472759246826,
"loss/logits": 0.30053600668907166,
"step": 87
},
{
"epoch": 10.08,
"grad_norm": 0.10009765625,
"grad_norm_var": 2.0224849383036295e-05,
"learning_rate": 0.0005449673790581611,
"loss": 2.494,
"loss/crossentropy": 2.1958537101745605,
"loss/logits": 0.2981116622686386,
"step": 88
},
{
"epoch": 11.01,
"grad_norm": 0.0966796875,
"grad_norm_var": 2.1091103553771974e-05,
"learning_rate": 0.0004592841308745932,
"loss": 2.4855,
"loss/crossentropy": 2.183037042617798,
"loss/logits": 0.3025069534778595,
"step": 89
},
{
"epoch": 11.02,
"grad_norm": 0.09814453125,
"grad_norm_var": 2.0945072174072264e-05,
"learning_rate": 0.0003806023374435663,
"loss": 2.3937,
"loss/crossentropy": 2.0915567874908447,
"loss/logits": 0.3020932674407959,
"step": 90
},
{
"epoch": 11.03,
"grad_norm": 0.0966796875,
"grad_norm_var": 1.5798211097717286e-05,
"learning_rate": 0.00030904332038757975,
"loss": 2.4012,
"loss/crossentropy": 2.1043936014175415,
"loss/logits": 0.29680541157722473,
"step": 91
},
{
"epoch": 11.04,
"grad_norm": 0.09716796875,
"grad_norm_var": 1.3856093088785807e-05,
"learning_rate": 0.00024471741852423234,
"loss": 2.4577,
"loss/crossentropy": 2.1379482746124268,
"loss/logits": 0.3198002427816391,
"step": 92
},
{
"epoch": 11.05,
"grad_norm": 0.095703125,
"grad_norm_var": 1.163482666015625e-05,
"learning_rate": 0.00018772381773176416,
"loss": 2.4786,
"loss/crossentropy": 2.173209071159363,
"loss/logits": 0.3054308444261551,
"step": 93
},
{
"epoch": 11.06,
"grad_norm": 0.0966796875,
"grad_norm_var": 8.706251780192057e-06,
"learning_rate": 0.0001381503980116172,
"loss": 2.5107,
"loss/crossentropy": 2.2105352878570557,
"loss/logits": 0.30013974010944366,
"step": 94
},
{
"epoch": 11.07,
"grad_norm": 0.095703125,
"grad_norm_var": 5.177656809488932e-06,
"learning_rate": 9.607359798384785e-05,
"loss": 2.4619,
"loss/crossentropy": 2.167112946510315,
"loss/logits": 0.29475128650665283,
"step": 95
},
{
"epoch": 11.08,
"grad_norm": 0.095703125,
"grad_norm_var": 5.273024241129557e-06,
"learning_rate": 6.15582970243117e-05,
"loss": 2.4592,
"loss/crossentropy": 2.160895347595215,
"loss/logits": 0.298322930932045,
"step": 96
},
{
"epoch": 12.01,
"grad_norm": 0.09912109375,
"grad_norm_var": 4.9591064453125e-06,
"learning_rate": 3.465771522536854e-05,
"loss": 2.5121,
"loss/crossentropy": 2.1986340284347534,
"loss/logits": 0.31342923641204834,
"step": 97
},
{
"epoch": 12.02,
"grad_norm": 0.0947265625,
"grad_norm_var": 4.497170448303223e-06,
"learning_rate": 1.541333133436018e-05,
"loss": 2.4629,
"loss/crossentropy": 2.156323194503784,
"loss/logits": 0.30655351281166077,
"step": 98
},
{
"epoch": 12.03,
"grad_norm": 0.09228515625,
"grad_norm_var": 6.214777628580729e-06,
"learning_rate": 3.854818796385496e-06,
"loss": 2.5164,
"loss/crossentropy": 2.2150908708572388,
"loss/logits": 0.30133919417858124,
"step": 99
},
{
"epoch": 12.04,
"grad_norm": 0.09228515625,
"grad_norm_var": 5.976359049479167e-06,
"learning_rate": 0.0,
"loss": 2.378,
"loss/crossentropy": 2.089114785194397,
"loss/logits": 0.2889086455106735,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.16881259823104e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}