{
"best_metric": 10.490081787109375,
"best_model_checkpoint": "miner_id_24/checkpoint-200",
"epoch": 0.176056338028169,
"eval_steps": 50,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008802816901408451,
"grad_norm": 1.0416910648345947,
"learning_rate": 1.1000000000000001e-05,
"loss": 10.8354,
"step": 1
},
{
"epoch": 0.0008802816901408451,
"eval_loss": 10.828083038330078,
"eval_runtime": 40.666,
"eval_samples_per_second": 47.042,
"eval_steps_per_second": 11.779,
"step": 1
},
{
"epoch": 0.0017605633802816902,
"grad_norm": 1.0834181308746338,
"learning_rate": 2.2000000000000003e-05,
"loss": 10.8404,
"step": 2
},
{
"epoch": 0.002640845070422535,
"grad_norm": 0.968176007270813,
"learning_rate": 3.3e-05,
"loss": 10.8331,
"step": 3
},
{
"epoch": 0.0035211267605633804,
"grad_norm": 0.827340304851532,
"learning_rate": 4.4000000000000006e-05,
"loss": 10.8353,
"step": 4
},
{
"epoch": 0.0044014084507042256,
"grad_norm": 0.8887184858322144,
"learning_rate": 5.5e-05,
"loss": 10.8322,
"step": 5
},
{
"epoch": 0.00528169014084507,
"grad_norm": 0.8796689510345459,
"learning_rate": 6.6e-05,
"loss": 10.8238,
"step": 6
},
{
"epoch": 0.006161971830985915,
"grad_norm": 0.8759607672691345,
"learning_rate": 7.7e-05,
"loss": 10.8217,
"step": 7
},
{
"epoch": 0.007042253521126761,
"grad_norm": 0.9419655203819275,
"learning_rate": 8.800000000000001e-05,
"loss": 10.8036,
"step": 8
},
{
"epoch": 0.007922535211267605,
"grad_norm": 0.8953964710235596,
"learning_rate": 9.900000000000001e-05,
"loss": 10.8025,
"step": 9
},
{
"epoch": 0.008802816901408451,
"grad_norm": 0.8648999333381653,
"learning_rate": 0.00011,
"loss": 10.7915,
"step": 10
},
{
"epoch": 0.009683098591549295,
"grad_norm": 0.9457488059997559,
"learning_rate": 0.0001099924817745858,
"loss": 10.7801,
"step": 11
},
{
"epoch": 0.01056338028169014,
"grad_norm": 0.7545454502105713,
"learning_rate": 0.00010996992915375093,
"loss": 10.7735,
"step": 12
},
{
"epoch": 0.011443661971830986,
"grad_norm": 0.7653600573539734,
"learning_rate": 0.00010993234830315676,
"loss": 10.7706,
"step": 13
},
{
"epoch": 0.01232394366197183,
"grad_norm": 0.8165815472602844,
"learning_rate": 0.0001098797494970326,
"loss": 10.7605,
"step": 14
},
{
"epoch": 0.013204225352112676,
"grad_norm": 0.7317748069763184,
"learning_rate": 0.00010981214711536684,
"loss": 10.7501,
"step": 15
},
{
"epoch": 0.014084507042253521,
"grad_norm": 0.8150272369384766,
"learning_rate": 0.00010972955963997563,
"loss": 10.7434,
"step": 16
},
{
"epoch": 0.014964788732394365,
"grad_norm": 0.7701242566108704,
"learning_rate": 0.00010963200964945011,
"loss": 10.7278,
"step": 17
},
{
"epoch": 0.01584507042253521,
"grad_norm": 0.7349888682365417,
"learning_rate": 0.00010951952381298364,
"loss": 10.7288,
"step": 18
},
{
"epoch": 0.016725352112676055,
"grad_norm": 0.6739762425422668,
"learning_rate": 0.00010939213288308077,
"loss": 10.7154,
"step": 19
},
{
"epoch": 0.017605633802816902,
"grad_norm": 0.6542978286743164,
"learning_rate": 0.00010924987168714973,
"loss": 10.7205,
"step": 20
},
{
"epoch": 0.018485915492957746,
"grad_norm": 0.6472326517105103,
"learning_rate": 0.00010909277911798103,
"loss": 10.7262,
"step": 21
},
{
"epoch": 0.01936619718309859,
"grad_norm": 0.6118732690811157,
"learning_rate": 0.00010892089812311451,
"loss": 10.6898,
"step": 22
},
{
"epoch": 0.020246478873239437,
"grad_norm": 0.6045047640800476,
"learning_rate": 0.00010873427569309797,
"loss": 10.7162,
"step": 23
},
{
"epoch": 0.02112676056338028,
"grad_norm": 0.7520612478256226,
"learning_rate": 0.00010853296284864032,
"loss": 10.6945,
"step": 24
},
{
"epoch": 0.022007042253521125,
"grad_norm": 0.6803677678108215,
"learning_rate": 0.00010831701462666318,
"loss": 10.7144,
"step": 25
},
{
"epoch": 0.022887323943661973,
"grad_norm": 0.6361639499664307,
"learning_rate": 0.00010808649006525419,
"loss": 10.702,
"step": 26
},
{
"epoch": 0.023767605633802816,
"grad_norm": 0.5413468480110168,
"learning_rate": 0.00010784145218752665,
"loss": 10.7203,
"step": 27
},
{
"epoch": 0.02464788732394366,
"grad_norm": 0.5481899976730347,
"learning_rate": 0.00010758196798438968,
"loss": 10.6863,
"step": 28
},
{
"epoch": 0.025528169014084508,
"grad_norm": 0.5831198692321777,
"learning_rate": 0.00010730810839623346,
"loss": 10.6874,
"step": 29
},
{
"epoch": 0.02640845070422535,
"grad_norm": 0.5215359926223755,
"learning_rate": 0.0001070199482935349,
"loss": 10.6734,
"step": 30
},
{
"epoch": 0.027288732394366196,
"grad_norm": 0.48211073875427246,
"learning_rate": 0.00010671756645638888,
"loss": 10.6803,
"step": 31
},
{
"epoch": 0.028169014084507043,
"grad_norm": 0.5410356521606445,
"learning_rate": 0.00010640104555297034,
"loss": 10.6592,
"step": 32
},
{
"epoch": 0.029049295774647887,
"grad_norm": 0.4434191882610321,
"learning_rate": 0.00010607047211693389,
"loss": 10.6771,
"step": 33
},
{
"epoch": 0.02992957746478873,
"grad_norm": 0.5003845691680908,
"learning_rate": 0.00010572593652375616,
"loss": 10.6756,
"step": 34
},
{
"epoch": 0.030809859154929578,
"grad_norm": 0.44623205065727234,
"learning_rate": 0.00010536753296602816,
"loss": 10.6654,
"step": 35
},
{
"epoch": 0.03169014084507042,
"grad_norm": 0.48966875672340393,
"learning_rate": 0.00010499535942770394,
"loss": 10.6103,
"step": 36
},
{
"epoch": 0.032570422535211266,
"grad_norm": 0.42709842324256897,
"learning_rate": 0.00010460951765731275,
"loss": 10.6529,
"step": 37
},
{
"epoch": 0.03345070422535211,
"grad_norm": 0.3827378451824188,
"learning_rate": 0.000104210113140142,
"loss": 10.6431,
"step": 38
},
{
"epoch": 0.03433098591549296,
"grad_norm": 0.39901039004325867,
"learning_rate": 0.00010379725506939865,
"loss": 10.622,
"step": 39
},
{
"epoch": 0.035211267605633804,
"grad_norm": 0.43945929408073425,
"learning_rate": 0.0001033710563163569,
"loss": 10.6221,
"step": 40
},
{
"epoch": 0.03609154929577465,
"grad_norm": 0.3692222237586975,
"learning_rate": 0.00010293163339950024,
"loss": 10.6182,
"step": 41
},
{
"epoch": 0.03697183098591549,
"grad_norm": 0.37782034277915955,
"learning_rate": 0.00010247910645266658,
"loss": 10.6313,
"step": 42
},
{
"epoch": 0.037852112676056336,
"grad_norm": 0.4363635182380676,
"learning_rate": 0.00010201359919220464,
"loss": 10.606,
"step": 43
},
{
"epoch": 0.03873239436619718,
"grad_norm": 0.3427577614784241,
"learning_rate": 0.00010153523888315144,
"loss": 10.6184,
"step": 44
},
{
"epoch": 0.03961267605633803,
"grad_norm": 0.4304777681827545,
"learning_rate": 0.00010104415630443907,
"loss": 10.6178,
"step": 45
},
{
"epoch": 0.040492957746478875,
"grad_norm": 0.41288602352142334,
"learning_rate": 0.0001005404857131411,
"loss": 10.6375,
"step": 46
},
{
"epoch": 0.04137323943661972,
"grad_norm": 0.4251928925514221,
"learning_rate": 0.00010002436480776809,
"loss": 10.6102,
"step": 47
},
{
"epoch": 0.04225352112676056,
"grad_norm": 0.4017108976840973,
"learning_rate": 9.949593469062211e-05,
"loss": 10.6077,
"step": 48
},
{
"epoch": 0.043133802816901406,
"grad_norm": 0.4274183511734009,
"learning_rate": 9.895533982922087e-05,
"loss": 10.6156,
"step": 49
},
{
"epoch": 0.04401408450704225,
"grad_norm": 0.5586544871330261,
"learning_rate": 9.840272801680165e-05,
"loss": 10.5981,
"step": 50
},
{
"epoch": 0.04401408450704225,
"eval_loss": 10.611796379089355,
"eval_runtime": 33.253,
"eval_samples_per_second": 57.529,
"eval_steps_per_second": 14.405,
"step": 50
},
{
"epoch": 0.0448943661971831,
"grad_norm": 0.8303582668304443,
"learning_rate": 9.783825033191619e-05,
"loss": 10.6168,
"step": 51
},
{
"epoch": 0.045774647887323945,
"grad_norm": 0.7027772665023804,
"learning_rate": 9.726206109712725e-05,
"loss": 10.623,
"step": 52
},
{
"epoch": 0.04665492957746479,
"grad_norm": 0.7661027908325195,
"learning_rate": 9.667431783681842e-05,
"loss": 10.627,
"step": 53
},
{
"epoch": 0.04753521126760563,
"grad_norm": 0.7784258723258972,
"learning_rate": 9.607518123412847e-05,
"loss": 10.6067,
"step": 54
},
{
"epoch": 0.04841549295774648,
"grad_norm": 0.70957350730896,
"learning_rate": 9.546481508702224e-05,
"loss": 10.6065,
"step": 55
},
{
"epoch": 0.04929577464788732,
"grad_norm": 0.7046729326248169,
"learning_rate": 9.48433862635099e-05,
"loss": 10.5984,
"step": 56
},
{
"epoch": 0.05017605633802817,
"grad_norm": 0.6503332853317261,
"learning_rate": 9.421106465602684e-05,
"loss": 10.5944,
"step": 57
},
{
"epoch": 0.051056338028169015,
"grad_norm": 0.6239966154098511,
"learning_rate": 9.356802313498687e-05,
"loss": 10.6094,
"step": 58
},
{
"epoch": 0.05193661971830986,
"grad_norm": 0.6146323680877686,
"learning_rate": 9.291443750152112e-05,
"loss": 10.579,
"step": 59
},
{
"epoch": 0.0528169014084507,
"grad_norm": 0.5953949689865112,
"learning_rate": 9.225048643941577e-05,
"loss": 10.5865,
"step": 60
},
{
"epoch": 0.05369718309859155,
"grad_norm": 0.5488158464431763,
"learning_rate": 9.157635146626164e-05,
"loss": 10.5657,
"step": 61
},
{
"epoch": 0.05457746478873239,
"grad_norm": 0.468292236328125,
"learning_rate": 9.089221688382928e-05,
"loss": 10.5856,
"step": 62
},
{
"epoch": 0.05545774647887324,
"grad_norm": 0.484653115272522,
"learning_rate": 9.019826972768242e-05,
"loss": 10.5658,
"step": 63
},
{
"epoch": 0.056338028169014086,
"grad_norm": 0.3967801332473755,
"learning_rate": 8.949469971604454e-05,
"loss": 10.5907,
"step": 64
},
{
"epoch": 0.05721830985915493,
"grad_norm": 0.41012856364250183,
"learning_rate": 8.878169919793173e-05,
"loss": 10.5643,
"step": 65
},
{
"epoch": 0.058098591549295774,
"grad_norm": 0.3663182556629181,
"learning_rate": 8.805946310056638e-05,
"loss": 10.5643,
"step": 66
},
{
"epoch": 0.05897887323943662,
"grad_norm": 0.40321776270866394,
"learning_rate": 8.732818887608602e-05,
"loss": 10.5693,
"step": 67
},
{
"epoch": 0.05985915492957746,
"grad_norm": 0.31598755717277527,
"learning_rate": 8.65880764475619e-05,
"loss": 10.5841,
"step": 68
},
{
"epoch": 0.06073943661971831,
"grad_norm": 0.2976154386997223,
"learning_rate": 8.583932815434201e-05,
"loss": 10.5584,
"step": 69
},
{
"epoch": 0.061619718309859156,
"grad_norm": 0.2813892662525177,
"learning_rate": 8.50821486967335e-05,
"loss": 10.558,
"step": 70
},
{
"epoch": 0.0625,
"grad_norm": 0.2954387366771698,
"learning_rate": 8.431674508003966e-05,
"loss": 10.5793,
"step": 71
},
{
"epoch": 0.06338028169014084,
"grad_norm": 0.32197305560112,
"learning_rate": 8.354332655796683e-05,
"loss": 10.5817,
"step": 72
},
{
"epoch": 0.06426056338028169,
"grad_norm": 0.43186065554618835,
"learning_rate": 8.276210457541642e-05,
"loss": 10.5966,
"step": 73
},
{
"epoch": 0.06514084507042253,
"grad_norm": 0.34619054198265076,
"learning_rate": 8.197329271067796e-05,
"loss": 10.5901,
"step": 74
},
{
"epoch": 0.06602112676056338,
"grad_norm": 0.3864317238330841,
"learning_rate": 8.117710661703905e-05,
"loss": 10.6212,
"step": 75
},
{
"epoch": 0.06690140845070422,
"grad_norm": 0.4301004111766815,
"learning_rate": 8.037376396382784e-05,
"loss": 10.606,
"step": 76
},
{
"epoch": 0.06778169014084508,
"grad_norm": 0.3685256838798523,
"learning_rate": 7.956348437690437e-05,
"loss": 10.58,
"step": 77
},
{
"epoch": 0.06866197183098592,
"grad_norm": 0.4085758328437805,
"learning_rate": 7.87464893786171e-05,
"loss": 10.5759,
"step": 78
},
{
"epoch": 0.06954225352112677,
"grad_norm": 0.38861989974975586,
"learning_rate": 7.792300232724097e-05,
"loss": 10.5768,
"step": 79
},
{
"epoch": 0.07042253521126761,
"grad_norm": 0.4298862814903259,
"learning_rate": 7.709324835591332e-05,
"loss": 10.5884,
"step": 80
},
{
"epoch": 0.07130281690140845,
"grad_norm": 0.3438059091567993,
"learning_rate": 7.625745431108487e-05,
"loss": 10.5703,
"step": 81
},
{
"epoch": 0.0721830985915493,
"grad_norm": 0.5023322105407715,
"learning_rate": 7.541584869050213e-05,
"loss": 10.5616,
"step": 82
},
{
"epoch": 0.07306338028169014,
"grad_norm": 0.34021085500717163,
"learning_rate": 7.456866158073842e-05,
"loss": 10.5553,
"step": 83
},
{
"epoch": 0.07394366197183098,
"grad_norm": 0.4679986536502838,
"learning_rate": 7.371612459429037e-05,
"loss": 10.5767,
"step": 84
},
{
"epoch": 0.07482394366197183,
"grad_norm": 0.4663142263889313,
"learning_rate": 7.28584708062576e-05,
"loss": 10.5478,
"step": 85
},
{
"epoch": 0.07570422535211267,
"grad_norm": 0.3878236413002014,
"learning_rate": 7.19959346906221e-05,
"loss": 10.5509,
"step": 86
},
{
"epoch": 0.07658450704225352,
"grad_norm": 0.3434685170650482,
"learning_rate": 7.112875205614558e-05,
"loss": 10.5629,
"step": 87
},
{
"epoch": 0.07746478873239436,
"grad_norm": 0.40229496359825134,
"learning_rate": 7.025715998190145e-05,
"loss": 10.5896,
"step": 88
},
{
"epoch": 0.07834507042253522,
"grad_norm": 0.4033350944519043,
"learning_rate": 6.938139675246009e-05,
"loss": 10.5509,
"step": 89
},
{
"epoch": 0.07922535211267606,
"grad_norm": 0.36661842465400696,
"learning_rate": 6.850170179274395e-05,
"loss": 10.558,
"step": 90
},
{
"epoch": 0.0801056338028169,
"grad_norm": 0.3692269027233124,
"learning_rate": 6.761831560257134e-05,
"loss": 10.5177,
"step": 91
},
{
"epoch": 0.08098591549295775,
"grad_norm": 0.37002119421958923,
"learning_rate": 6.673147969090608e-05,
"loss": 10.539,
"step": 92
},
{
"epoch": 0.0818661971830986,
"grad_norm": 0.3126441240310669,
"learning_rate": 6.584143650983141e-05,
"loss": 10.5218,
"step": 93
},
{
"epoch": 0.08274647887323944,
"grad_norm": 0.3806772828102112,
"learning_rate": 6.494842938826605e-05,
"loss": 10.5072,
"step": 94
},
{
"epoch": 0.08362676056338028,
"grad_norm": 0.34609004855155945,
"learning_rate": 6.405270246544037e-05,
"loss": 10.5073,
"step": 95
},
{
"epoch": 0.08450704225352113,
"grad_norm": 0.3094387352466583,
"learning_rate": 6.31545006241513e-05,
"loss": 10.5837,
"step": 96
},
{
"epoch": 0.08538732394366197,
"grad_norm": 0.3375723659992218,
"learning_rate": 6.22540694238138e-05,
"loss": 10.5428,
"step": 97
},
{
"epoch": 0.08626760563380281,
"grad_norm": 0.3610488772392273,
"learning_rate": 6.135165503332725e-05,
"loss": 10.5195,
"step": 98
},
{
"epoch": 0.08714788732394366,
"grad_norm": 0.3436351418495178,
"learning_rate": 6.0447504163775465e-05,
"loss": 10.5591,
"step": 99
},
{
"epoch": 0.0880281690140845,
"grad_norm": 0.46384698152542114,
"learning_rate": 5.954186400097829e-05,
"loss": 10.5287,
"step": 100
},
{
"epoch": 0.0880281690140845,
"eval_loss": 10.527392387390137,
"eval_runtime": 28.8434,
"eval_samples_per_second": 66.324,
"eval_steps_per_second": 16.607,
"step": 100
},
{
"epoch": 0.08890845070422536,
"grad_norm": 0.5941808819770813,
"learning_rate": 5.8634982137913465e-05,
"loss": 10.5435,
"step": 101
},
{
"epoch": 0.0897887323943662,
"grad_norm": 0.5891147255897522,
"learning_rate": 5.772710650702723e-05,
"loss": 10.5219,
"step": 102
},
{
"epoch": 0.09066901408450705,
"grad_norm": 0.6356647610664368,
"learning_rate": 5.681848531245195e-05,
"loss": 10.5299,
"step": 103
},
{
"epoch": 0.09154929577464789,
"grad_norm": 0.5486959218978882,
"learning_rate": 5.590936696214972e-05,
"loss": 10.5153,
"step": 104
},
{
"epoch": 0.09242957746478873,
"grad_norm": 0.5351731181144714,
"learning_rate": 5.5e-05,
"loss": 10.5248,
"step": 105
},
{
"epoch": 0.09330985915492958,
"grad_norm": 0.5519973635673523,
"learning_rate": 5.409063303785029e-05,
"loss": 10.5358,
"step": 106
},
{
"epoch": 0.09419014084507042,
"grad_norm": 0.5336778163909912,
"learning_rate": 5.318151468754805e-05,
"loss": 10.5088,
"step": 107
},
{
"epoch": 0.09507042253521127,
"grad_norm": 0.4727165997028351,
"learning_rate": 5.227289349297277e-05,
"loss": 10.5292,
"step": 108
},
{
"epoch": 0.09595070422535211,
"grad_norm": 0.5302690267562866,
"learning_rate": 5.136501786208654e-05,
"loss": 10.5202,
"step": 109
},
{
"epoch": 0.09683098591549295,
"grad_norm": 0.5168509483337402,
"learning_rate": 5.045813599902173e-05,
"loss": 10.5157,
"step": 110
},
{
"epoch": 0.0977112676056338,
"grad_norm": 0.4725133776664734,
"learning_rate": 4.955249583622455e-05,
"loss": 10.4946,
"step": 111
},
{
"epoch": 0.09859154929577464,
"grad_norm": 0.45433416962623596,
"learning_rate": 4.8648344966672767e-05,
"loss": 10.5199,
"step": 112
},
{
"epoch": 0.0994718309859155,
"grad_norm": 0.42494186758995056,
"learning_rate": 4.774593057618621e-05,
"loss": 10.4857,
"step": 113
},
{
"epoch": 0.10035211267605634,
"grad_norm": 0.35899239778518677,
"learning_rate": 4.6845499375848686e-05,
"loss": 10.5143,
"step": 114
},
{
"epoch": 0.10123239436619719,
"grad_norm": 0.3459785282611847,
"learning_rate": 4.5947297534559625e-05,
"loss": 10.5069,
"step": 115
},
{
"epoch": 0.10211267605633803,
"grad_norm": 0.3599611222743988,
"learning_rate": 4.5051570611733976e-05,
"loss": 10.5378,
"step": 116
},
{
"epoch": 0.10299295774647887,
"grad_norm": 0.3270639479160309,
"learning_rate": 4.415856349016859e-05,
"loss": 10.5354,
"step": 117
},
{
"epoch": 0.10387323943661972,
"grad_norm": 0.3274960219860077,
"learning_rate": 4.326852030909393e-05,
"loss": 10.4849,
"step": 118
},
{
"epoch": 0.10475352112676056,
"grad_norm": 0.3062029182910919,
"learning_rate": 4.238168439742867e-05,
"loss": 10.5315,
"step": 119
},
{
"epoch": 0.1056338028169014,
"grad_norm": 0.3626440763473511,
"learning_rate": 4.149829820725605e-05,
"loss": 10.5202,
"step": 120
},
{
"epoch": 0.10651408450704225,
"grad_norm": 0.3809753656387329,
"learning_rate": 4.0618603247539916e-05,
"loss": 10.5317,
"step": 121
},
{
"epoch": 0.1073943661971831,
"grad_norm": 0.3086736798286438,
"learning_rate": 3.9742840018098564e-05,
"loss": 10.5318,
"step": 122
},
{
"epoch": 0.10827464788732394,
"grad_norm": 0.3391195237636566,
"learning_rate": 3.887124794385445e-05,
"loss": 10.5107,
"step": 123
},
{
"epoch": 0.10915492957746478,
"grad_norm": 0.34638404846191406,
"learning_rate": 3.80040653093779e-05,
"loss": 10.5355,
"step": 124
},
{
"epoch": 0.11003521126760564,
"grad_norm": 0.45098984241485596,
"learning_rate": 3.714152919374241e-05,
"loss": 10.55,
"step": 125
},
{
"epoch": 0.11091549295774648,
"grad_norm": 0.39215147495269775,
"learning_rate": 3.628387540570963e-05,
"loss": 10.5317,
"step": 126
},
{
"epoch": 0.11179577464788733,
"grad_norm": 0.3315400779247284,
"learning_rate": 3.543133841926159e-05,
"loss": 10.5396,
"step": 127
},
{
"epoch": 0.11267605633802817,
"grad_norm": 0.3427654504776001,
"learning_rate": 3.458415130949785e-05,
"loss": 10.5269,
"step": 128
},
{
"epoch": 0.11355633802816902,
"grad_norm": 0.33311086893081665,
"learning_rate": 3.374254568891514e-05,
"loss": 10.5672,
"step": 129
},
{
"epoch": 0.11443661971830986,
"grad_norm": 0.31714025139808655,
"learning_rate": 3.290675164408669e-05,
"loss": 10.5272,
"step": 130
},
{
"epoch": 0.1153169014084507,
"grad_norm": 0.37221792340278625,
"learning_rate": 3.207699767275904e-05,
"loss": 10.5114,
"step": 131
},
{
"epoch": 0.11619718309859155,
"grad_norm": 0.298801451921463,
"learning_rate": 3.12535106213829e-05,
"loss": 10.5184,
"step": 132
},
{
"epoch": 0.11707746478873239,
"grad_norm": 0.3310137093067169,
"learning_rate": 3.0436515623095647e-05,
"loss": 10.5289,
"step": 133
},
{
"epoch": 0.11795774647887323,
"grad_norm": 0.3380754590034485,
"learning_rate": 2.962623603617218e-05,
"loss": 10.5012,
"step": 134
},
{
"epoch": 0.11883802816901408,
"grad_norm": 0.32100123167037964,
"learning_rate": 2.8822893382960955e-05,
"loss": 10.5141,
"step": 135
},
{
"epoch": 0.11971830985915492,
"grad_norm": 0.3254896402359009,
"learning_rate": 2.802670728932207e-05,
"loss": 10.5539,
"step": 136
},
{
"epoch": 0.12059859154929578,
"grad_norm": 0.3702283799648285,
"learning_rate": 2.723789542458361e-05,
"loss": 10.512,
"step": 137
},
{
"epoch": 0.12147887323943662,
"grad_norm": 0.32699835300445557,
"learning_rate": 2.6456673442033183e-05,
"loss": 10.5026,
"step": 138
},
{
"epoch": 0.12235915492957747,
"grad_norm": 0.33314529061317444,
"learning_rate": 2.5683254919960356e-05,
"loss": 10.5311,
"step": 139
},
{
"epoch": 0.12323943661971831,
"grad_norm": 0.3764292597770691,
"learning_rate": 2.4917851303266533e-05,
"loss": 10.4995,
"step": 140
},
{
"epoch": 0.12411971830985916,
"grad_norm": 0.3277135491371155,
"learning_rate": 2.4160671845658007e-05,
"loss": 10.5025,
"step": 141
},
{
"epoch": 0.125,
"grad_norm": 0.3547554314136505,
"learning_rate": 2.3411923552438105e-05,
"loss": 10.5012,
"step": 142
},
{
"epoch": 0.12588028169014084,
"grad_norm": 0.3986217677593231,
"learning_rate": 2.2671811123913983e-05,
"loss": 10.451,
"step": 143
},
{
"epoch": 0.1267605633802817,
"grad_norm": 0.37957799434661865,
"learning_rate": 2.194053689943362e-05,
"loss": 10.4798,
"step": 144
},
{
"epoch": 0.12764084507042253,
"grad_norm": 0.3498072028160095,
"learning_rate": 2.121830080206827e-05,
"loss": 10.5078,
"step": 145
},
{
"epoch": 0.12852112676056338,
"grad_norm": 0.36097705364227295,
"learning_rate": 2.0505300283955464e-05,
"loss": 10.4758,
"step": 146
},
{
"epoch": 0.12940140845070422,
"grad_norm": 0.30697742104530334,
"learning_rate": 1.9801730272317585e-05,
"loss": 10.5026,
"step": 147
},
{
"epoch": 0.13028169014084506,
"grad_norm": 0.42456531524658203,
"learning_rate": 1.910778311617072e-05,
"loss": 10.4967,
"step": 148
},
{
"epoch": 0.1311619718309859,
"grad_norm": 0.47736701369285583,
"learning_rate": 1.8423648533738342e-05,
"loss": 10.5141,
"step": 149
},
{
"epoch": 0.13204225352112675,
"grad_norm": 0.5607922673225403,
"learning_rate": 1.7749513560584252e-05,
"loss": 10.5202,
"step": 150
},
{
"epoch": 0.13204225352112675,
"eval_loss": 10.495588302612305,
"eval_runtime": 26.6017,
"eval_samples_per_second": 71.913,
"eval_steps_per_second": 18.006,
"step": 150
},
{
"epoch": 0.1329225352112676,
"grad_norm": 0.655456006526947,
"learning_rate": 1.7085562498478883e-05,
"loss": 10.4826,
"step": 151
},
{
"epoch": 0.13380281690140844,
"grad_norm": 0.4750436246395111,
"learning_rate": 1.6431976865013128e-05,
"loss": 10.5067,
"step": 152
},
{
"epoch": 0.13468309859154928,
"grad_norm": 0.5088787078857422,
"learning_rate": 1.5788935343973164e-05,
"loss": 10.4799,
"step": 153
},
{
"epoch": 0.13556338028169015,
"grad_norm": 0.4471653401851654,
"learning_rate": 1.5156613736490108e-05,
"loss": 10.5016,
"step": 154
},
{
"epoch": 0.136443661971831,
"grad_norm": 0.4595582187175751,
"learning_rate": 1.4535184912977763e-05,
"loss": 10.4885,
"step": 155
},
{
"epoch": 0.13732394366197184,
"grad_norm": 0.5350455641746521,
"learning_rate": 1.3924818765871553e-05,
"loss": 10.4808,
"step": 156
},
{
"epoch": 0.1382042253521127,
"grad_norm": 0.5852507948875427,
"learning_rate": 1.3325682163181601e-05,
"loss": 10.4695,
"step": 157
},
{
"epoch": 0.13908450704225353,
"grad_norm": 0.5205310583114624,
"learning_rate": 1.2737938902872767e-05,
"loss": 10.4588,
"step": 158
},
{
"epoch": 0.13996478873239437,
"grad_norm": 0.5743169784545898,
"learning_rate": 1.2161749668083823e-05,
"loss": 10.4531,
"step": 159
},
{
"epoch": 0.14084507042253522,
"grad_norm": 0.4958171844482422,
"learning_rate": 1.159727198319836e-05,
"loss": 10.4593,
"step": 160
},
{
"epoch": 0.14172535211267606,
"grad_norm": 0.43265965580940247,
"learning_rate": 1.1044660170779142e-05,
"loss": 10.4656,
"step": 161
},
{
"epoch": 0.1426056338028169,
"grad_norm": 0.5412198901176453,
"learning_rate": 1.0504065309377897e-05,
"loss": 10.4804,
"step": 162
},
{
"epoch": 0.14348591549295775,
"grad_norm": 0.4394540786743164,
"learning_rate": 9.97563519223192e-06,
"loss": 10.4911,
"step": 163
},
{
"epoch": 0.1443661971830986,
"grad_norm": 0.4882407784461975,
"learning_rate": 9.459514286858898e-06,
"loss": 10.5067,
"step": 164
},
{
"epoch": 0.14524647887323944,
"grad_norm": 0.3944062888622284,
"learning_rate": 8.95584369556093e-06,
"loss": 10.4899,
"step": 165
},
{
"epoch": 0.14612676056338028,
"grad_norm": 0.3780195415019989,
"learning_rate": 8.464761116848546e-06,
"loss": 10.4926,
"step": 166
},
{
"epoch": 0.14700704225352113,
"grad_norm": 0.31584465503692627,
"learning_rate": 7.986400807795349e-06,
"loss": 10.4902,
"step": 167
},
{
"epoch": 0.14788732394366197,
"grad_norm": 0.2965972125530243,
"learning_rate": 7.520893547333436e-06,
"loss": 10.4713,
"step": 168
},
{
"epoch": 0.1487676056338028,
"grad_norm": 0.3396422266960144,
"learning_rate": 7.068366600499744e-06,
"loss": 10.5113,
"step": 169
},
{
"epoch": 0.14964788732394366,
"grad_norm": 0.27547672390937805,
"learning_rate": 6.6289436836431076e-06,
"loss": 10.4863,
"step": 170
},
{
"epoch": 0.1505281690140845,
"grad_norm": 0.21695564687252045,
"learning_rate": 6.20274493060135e-06,
"loss": 10.5045,
"step": 171
},
{
"epoch": 0.15140845070422534,
"grad_norm": 0.2346537858247757,
"learning_rate": 5.789886859858009e-06,
"loss": 10.5259,
"step": 172
},
{
"epoch": 0.1522887323943662,
"grad_norm": 0.23305346071720123,
"learning_rate": 5.3904823426872605e-06,
"loss": 10.5179,
"step": 173
},
{
"epoch": 0.15316901408450703,
"grad_norm": 0.3447682559490204,
"learning_rate": 5.004640572296062e-06,
"loss": 10.5116,
"step": 174
},
{
"epoch": 0.15404929577464788,
"grad_norm": 0.2727600038051605,
"learning_rate": 4.632467033971838e-06,
"loss": 10.5402,
"step": 175
},
{
"epoch": 0.15492957746478872,
"grad_norm": 0.3000151813030243,
"learning_rate": 4.274063476243839e-06,
"loss": 10.5252,
"step": 176
},
{
"epoch": 0.15580985915492956,
"grad_norm": 0.24301907420158386,
"learning_rate": 3.929527883066117e-06,
"loss": 10.5167,
"step": 177
},
{
"epoch": 0.15669014084507044,
"grad_norm": 0.29026785492897034,
"learning_rate": 3.5989544470296595e-06,
"loss": 10.5109,
"step": 178
},
{
"epoch": 0.15757042253521128,
"grad_norm": 0.30544570088386536,
"learning_rate": 3.282433543611136e-06,
"loss": 10.4965,
"step": 179
},
{
"epoch": 0.15845070422535212,
"grad_norm": 0.3145160973072052,
"learning_rate": 2.980051706465095e-06,
"loss": 10.4923,
"step": 180
},
{
"epoch": 0.15933098591549297,
"grad_norm": 0.29084667563438416,
"learning_rate": 2.691891603766556e-06,
"loss": 10.5384,
"step": 181
},
{
"epoch": 0.1602112676056338,
"grad_norm": 0.2924867570400238,
"learning_rate": 2.4180320156103298e-06,
"loss": 10.5349,
"step": 182
},
{
"epoch": 0.16109154929577466,
"grad_norm": 0.31382471323013306,
"learning_rate": 2.158547812473352e-06,
"loss": 10.5328,
"step": 183
},
{
"epoch": 0.1619718309859155,
"grad_norm": 0.2887389361858368,
"learning_rate": 1.9135099347458293e-06,
"loss": 10.5209,
"step": 184
},
{
"epoch": 0.16285211267605634,
"grad_norm": 0.35892584919929504,
"learning_rate": 1.6829853733368294e-06,
"loss": 10.5239,
"step": 185
},
{
"epoch": 0.1637323943661972,
"grad_norm": 0.24830038845539093,
"learning_rate": 1.4670371513596842e-06,
"loss": 10.5013,
"step": 186
},
{
"epoch": 0.16461267605633803,
"grad_norm": 0.33427461981773376,
"learning_rate": 1.2657243069020402e-06,
"loss": 10.4891,
"step": 187
},
{
"epoch": 0.16549295774647887,
"grad_norm": 0.3475538492202759,
"learning_rate": 1.0791018768854855e-06,
"loss": 10.4896,
"step": 188
},
{
"epoch": 0.16637323943661972,
"grad_norm": 0.3785645365715027,
"learning_rate": 9.072208820189698e-07,
"loss": 10.5247,
"step": 189
},
{
"epoch": 0.16725352112676056,
"grad_norm": 0.3462846279144287,
"learning_rate": 7.501283128502722e-07,
"loss": 10.5091,
"step": 190
},
{
"epoch": 0.1681338028169014,
"grad_norm": 0.3662095367908478,
"learning_rate": 6.07867116919233e-07,
"loss": 10.465,
"step": 191
},
{
"epoch": 0.16901408450704225,
"grad_norm": 0.46672433614730835,
"learning_rate": 4.804761870163643e-07,
"loss": 10.4353,
"step": 192
},
{
"epoch": 0.1698943661971831,
"grad_norm": 0.3511195778846741,
"learning_rate": 3.6799035054990215e-07,
"loss": 10.4936,
"step": 193
},
{
"epoch": 0.17077464788732394,
"grad_norm": 0.3752671778202057,
"learning_rate": 2.704403600243721e-07,
"loss": 10.4624,
"step": 194
},
{
"epoch": 0.17165492957746478,
"grad_norm": 0.3640790283679962,
"learning_rate": 1.878528846331584e-07,
"loss": 10.495,
"step": 195
},
{
"epoch": 0.17253521126760563,
"grad_norm": 0.37574926018714905,
"learning_rate": 1.202505029674006e-07,
"loss": 10.4919,
"step": 196
},
{
"epoch": 0.17341549295774647,
"grad_norm": 0.32661178708076477,
"learning_rate": 6.765169684323947e-08,
"loss": 10.4605,
"step": 197
},
{
"epoch": 0.1742957746478873,
"grad_norm": 0.415720671415329,
"learning_rate": 3.007084624906731e-08,
"loss": 10.4822,
"step": 198
},
{
"epoch": 0.17517605633802816,
"grad_norm": 0.4906235337257385,
"learning_rate": 7.518225414204771e-09,
"loss": 10.5168,
"step": 199
},
{
"epoch": 0.176056338028169,
"grad_norm": 0.49893417954444885,
"learning_rate": 0.0,
"loss": 10.4814,
"step": 200
},
{
"epoch": 0.176056338028169,
"eval_loss": 10.490081787109375,
"eval_runtime": 30.7966,
"eval_samples_per_second": 62.117,
"eval_steps_per_second": 15.554,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 136620225331200.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}