{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0257510729613735, "eval_steps": 500, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004291845493562232, "grad_norm": 0.48923006653785706, "learning_rate": 2.4000000000000003e-06, "loss": 3.0401, "step": 1 }, { "epoch": 0.008583690987124463, "grad_norm": 0.4999135434627533, "learning_rate": 4.800000000000001e-06, "loss": 3.0045, "step": 2 }, { "epoch": 0.012875536480686695, "grad_norm": 0.5719347596168518, "learning_rate": 7.2e-06, "loss": 3.233, "step": 3 }, { "epoch": 0.017167381974248927, "grad_norm": 0.4990224242210388, "learning_rate": 9.600000000000001e-06, "loss": 3.0813, "step": 4 }, { "epoch": 0.02145922746781116, "grad_norm": 0.5448071360588074, "learning_rate": 1.2e-05, "loss": 3.1513, "step": 5 }, { "epoch": 0.02575107296137339, "grad_norm": 0.4316995441913605, "learning_rate": 1.44e-05, "loss": 2.9247, "step": 6 }, { "epoch": 0.030042918454935622, "grad_norm": 0.39892056584358215, "learning_rate": 1.6800000000000002e-05, "loss": 3.0529, "step": 7 }, { "epoch": 0.034334763948497854, "grad_norm": 0.3437541723251343, "learning_rate": 1.9200000000000003e-05, "loss": 2.742, "step": 8 }, { "epoch": 0.03862660944206009, "grad_norm": 0.3232952356338501, "learning_rate": 2.16e-05, "loss": 2.8807, "step": 9 }, { "epoch": 0.04291845493562232, "grad_norm": 0.2316465526819229, "learning_rate": 2.4e-05, "loss": 2.8284, "step": 10 }, { "epoch": 0.04721030042918455, "grad_norm": 0.1895979940891266, "learning_rate": 2.64e-05, "loss": 2.846, "step": 11 }, { "epoch": 0.05150214592274678, "grad_norm": 0.18949951231479645, "learning_rate": 2.88e-05, "loss": 2.6129, "step": 12 }, { "epoch": 0.055793991416309016, "grad_norm": 0.29111406207084656, "learning_rate": 3.12e-05, "loss": 2.8553, "step": 13 }, { "epoch": 0.060085836909871244, "grad_norm": 0.3171645998954773, "learning_rate": 3.3600000000000004e-05, "loss": 2.7663, "step": 14 }, { "epoch": 0.06437768240343347, "grad_norm": 0.3853575587272644, "learning_rate": 3.6e-05, "loss": 2.8468, "step": 15 }, { "epoch": 0.06866952789699571, "grad_norm": 0.33584126830101013, "learning_rate": 3.8400000000000005e-05, "loss": 2.8286, "step": 16 }, { "epoch": 0.07296137339055794, "grad_norm": 0.22323380410671234, "learning_rate": 4.08e-05, "loss": 2.9605, "step": 17 }, { "epoch": 0.07725321888412018, "grad_norm": 0.20761723816394806, "learning_rate": 4.32e-05, "loss": 2.5953, "step": 18 }, { "epoch": 0.0815450643776824, "grad_norm": 0.18277060985565186, "learning_rate": 4.5600000000000004e-05, "loss": 2.5327, "step": 19 }, { "epoch": 0.08583690987124463, "grad_norm": 0.25170719623565674, "learning_rate": 4.8e-05, "loss": 2.7127, "step": 20 }, { "epoch": 0.09012875536480687, "grad_norm": 0.2634769678115845, "learning_rate": 5.04e-05, "loss": 2.7035, "step": 21 }, { "epoch": 0.0944206008583691, "grad_norm": 0.1749686598777771, "learning_rate": 5.28e-05, "loss": 2.7313, "step": 22 }, { "epoch": 0.09871244635193133, "grad_norm": 0.19000954926013947, "learning_rate": 5.520000000000001e-05, "loss": 2.6551, "step": 23 }, { "epoch": 0.10300429184549356, "grad_norm": 0.16048619151115417, "learning_rate": 5.76e-05, "loss": 2.6901, "step": 24 }, { "epoch": 0.1072961373390558, "grad_norm": 0.1346961259841919, "learning_rate": 6e-05, "loss": 2.6762, "step": 25 }, { "epoch": 0.11158798283261803, "grad_norm": 0.10837673395872116, "learning_rate": 5.9999238776847435e-05, "loss": 2.5797, "step": 26 }, { "epoch": 0.11587982832618025, "grad_norm": 0.12566141784191132, "learning_rate": 5.9996955146020456e-05, "loss": 2.6367, "step": 27 }, { "epoch": 0.12017167381974249, "grad_norm": 0.10796438157558441, "learning_rate": 5.999314922340924e-05, "loss": 2.6986, "step": 28 }, { "epoch": 0.12446351931330472, "grad_norm": 0.1174619048833847, "learning_rate": 5.9987821202157545e-05, "loss": 2.4532, "step": 29 }, { "epoch": 0.12875536480686695, "grad_norm": 0.12774387001991272, "learning_rate": 5.998097135265291e-05, "loss": 2.7154, "step": 30 }, { "epoch": 0.13304721030042918, "grad_norm": 0.12038439512252808, "learning_rate": 5.9972600022512946e-05, "loss": 2.6464, "step": 31 }, { "epoch": 0.13733905579399142, "grad_norm": 0.1207851693034172, "learning_rate": 5.996270763656767e-05, "loss": 2.745, "step": 32 }, { "epoch": 0.14163090128755365, "grad_norm": 0.12519803643226624, "learning_rate": 5.9951294696837966e-05, "loss": 2.4953, "step": 33 }, { "epoch": 0.1459227467811159, "grad_norm": 0.11793860048055649, "learning_rate": 5.993836178251009e-05, "loss": 2.5615, "step": 34 }, { "epoch": 0.15021459227467812, "grad_norm": 0.10079298913478851, "learning_rate": 5.99239095499063e-05, "loss": 2.5886, "step": 35 }, { "epoch": 0.15450643776824036, "grad_norm": 0.1022179052233696, "learning_rate": 5.990793873245154e-05, "loss": 2.4649, "step": 36 }, { "epoch": 0.15879828326180256, "grad_norm": 0.1409706175327301, "learning_rate": 5.989045014063621e-05, "loss": 2.7143, "step": 37 }, { "epoch": 0.1630901287553648, "grad_norm": 0.11522237211465836, "learning_rate": 5.9871444661975037e-05, "loss": 2.6908, "step": 38 }, { "epoch": 0.16738197424892703, "grad_norm": 0.11969179660081863, "learning_rate": 5.9850923260962045e-05, "loss": 2.6429, "step": 39 }, { "epoch": 0.17167381974248927, "grad_norm": 0.13146190345287323, "learning_rate": 5.982888697902161e-05, "loss": 2.5398, "step": 40 }, { "epoch": 0.1759656652360515, "grad_norm": 0.08940872550010681, "learning_rate": 5.98053369344556e-05, "loss": 2.6048, "step": 41 }, { "epoch": 0.18025751072961374, "grad_norm": 0.0957597866654396, "learning_rate": 5.978027432238662e-05, "loss": 2.758, "step": 42 }, { "epoch": 0.18454935622317598, "grad_norm": 0.12192820012569427, "learning_rate": 5.975370041469738e-05, "loss": 2.6317, "step": 43 }, { "epoch": 0.1888412017167382, "grad_norm": 0.11058337986469269, "learning_rate": 5.972561655996614e-05, "loss": 2.631, "step": 44 }, { "epoch": 0.19313304721030042, "grad_norm": 0.0976683646440506, "learning_rate": 5.969602418339825e-05, "loss": 2.5953, "step": 45 }, { "epoch": 0.19742489270386265, "grad_norm": 0.07907278090715408, "learning_rate": 5.966492478675385e-05, "loss": 2.5996, "step": 46 }, { "epoch": 0.2017167381974249, "grad_norm": 0.0848410427570343, "learning_rate": 5.963231994827169e-05, "loss": 2.5325, "step": 47 }, { "epoch": 0.20600858369098712, "grad_norm": 0.0901293084025383, "learning_rate": 5.9598211322588925e-05, "loss": 2.4015, "step": 48 }, { "epoch": 0.21030042918454936, "grad_norm": 0.09382897615432739, "learning_rate": 5.956260064065728e-05, "loss": 2.6818, "step": 49 }, { "epoch": 0.2145922746781116, "grad_norm": 0.10944745689630508, "learning_rate": 5.952548970965513e-05, "loss": 2.4901, "step": 50 }, { "epoch": 0.21888412017167383, "grad_norm": 0.1057049036026001, "learning_rate": 5.948688041289578e-05, "loss": 2.5959, "step": 51 }, { "epoch": 0.22317596566523606, "grad_norm": 0.10103785246610641, "learning_rate": 5.944677470973196e-05, "loss": 2.493, "step": 52 }, { "epoch": 0.22746781115879827, "grad_norm": 0.11647158861160278, "learning_rate": 5.9405174635456315e-05, "loss": 2.6496, "step": 53 }, { "epoch": 0.2317596566523605, "grad_norm": 0.09211481362581253, "learning_rate": 5.9362082301198156e-05, "loss": 2.6669, "step": 54 }, { "epoch": 0.23605150214592274, "grad_norm": 0.09014479070901871, "learning_rate": 5.931749989381632e-05, "loss": 2.6098, "step": 55 }, { "epoch": 0.24034334763948498, "grad_norm": 0.16046330332756042, "learning_rate": 5.9271429675788184e-05, "loss": 2.4713, "step": 56 }, { "epoch": 0.2446351931330472, "grad_norm": 0.1069481372833252, "learning_rate": 5.9223873985094866e-05, "loss": 2.3664, "step": 57 }, { "epoch": 0.24892703862660945, "grad_norm": 0.10389809310436249, "learning_rate": 5.9174835235102536e-05, "loss": 2.5278, "step": 58 }, { "epoch": 0.2532188841201717, "grad_norm": 0.11560378223657608, "learning_rate": 5.912431591443999e-05, "loss": 2.4859, "step": 59 }, { "epoch": 0.2575107296137339, "grad_norm": 0.09909158945083618, "learning_rate": 5.9072318586872344e-05, "loss": 2.4819, "step": 60 }, { "epoch": 0.26180257510729615, "grad_norm": 0.08729315549135208, "learning_rate": 5.901884589117089e-05, "loss": 2.5432, "step": 61 }, { "epoch": 0.26609442060085836, "grad_norm": 0.11550580710172653, "learning_rate": 5.896390054097922e-05, "loss": 2.592, "step": 62 }, { "epoch": 0.2703862660944206, "grad_norm": 0.10853075236082077, "learning_rate": 5.8907485324675545e-05, "loss": 2.3256, "step": 63 }, { "epoch": 0.27467811158798283, "grad_norm": 0.09386483579874039, "learning_rate": 5.884960310523109e-05, "loss": 2.676, "step": 64 }, { "epoch": 0.27896995708154504, "grad_norm": 0.12031152844429016, "learning_rate": 5.879025682006491e-05, "loss": 2.5849, "step": 65 }, { "epoch": 0.2832618025751073, "grad_norm": 0.09846463054418564, "learning_rate": 5.872944948089474e-05, "loss": 2.542, "step": 66 }, { "epoch": 0.2875536480686695, "grad_norm": 0.09881763160228729, "learning_rate": 5.8667184173584226e-05, "loss": 2.6389, "step": 67 }, { "epoch": 0.2918454935622318, "grad_norm": 0.12872399389743805, "learning_rate": 5.860346405798625e-05, "loss": 2.5906, "step": 68 }, { "epoch": 0.296137339055794, "grad_norm": 0.0927918404340744, "learning_rate": 5.853829236778266e-05, "loss": 2.3233, "step": 69 }, { "epoch": 0.30042918454935624, "grad_norm": 0.1097516193985939, "learning_rate": 5.847167241032006e-05, "loss": 2.3085, "step": 70 }, { "epoch": 0.30472103004291845, "grad_norm": 0.12274881452322006, "learning_rate": 5.8403607566442066e-05, "loss": 2.4758, "step": 71 }, { "epoch": 0.3090128755364807, "grad_norm": 0.10112845152616501, "learning_rate": 5.833410129031768e-05, "loss": 2.6385, "step": 72 }, { "epoch": 0.3133047210300429, "grad_norm": 0.107538603246212, "learning_rate": 5.8263157109266e-05, "loss": 2.4394, "step": 73 }, { "epoch": 0.31759656652360513, "grad_norm": 0.15851671993732452, "learning_rate": 5.819077862357725e-05, "loss": 2.348, "step": 74 }, { "epoch": 0.3218884120171674, "grad_norm": 0.09074151515960693, "learning_rate": 5.811696950633003e-05, "loss": 2.4884, "step": 75 }, { "epoch": 0.3261802575107296, "grad_norm": 0.10217540711164474, "learning_rate": 5.8041733503204934e-05, "loss": 2.5566, "step": 76 }, { "epoch": 0.33047210300429186, "grad_norm": 0.12177087366580963, "learning_rate": 5.796507443229445e-05, "loss": 2.4852, "step": 77 }, { "epoch": 0.33476394849785407, "grad_norm": 0.10884562134742737, "learning_rate": 5.788699618390924e-05, "loss": 2.6596, "step": 78 }, { "epoch": 0.33905579399141633, "grad_norm": 0.09898590296506882, "learning_rate": 5.7807502720380655e-05, "loss": 2.5299, "step": 79 }, { "epoch": 0.34334763948497854, "grad_norm": 0.16023777425289154, "learning_rate": 5.772659807585968e-05, "loss": 2.5859, "step": 80 }, { "epoch": 0.34763948497854075, "grad_norm": 0.1152929812669754, "learning_rate": 5.764428635611223e-05, "loss": 2.6133, "step": 81 }, { "epoch": 0.351931330472103, "grad_norm": 0.11540824174880981, "learning_rate": 5.756057173831075e-05, "loss": 2.4024, "step": 82 }, { "epoch": 0.3562231759656652, "grad_norm": 0.10451588034629822, "learning_rate": 5.7475458470822275e-05, "loss": 2.4877, "step": 83 }, { "epoch": 0.3605150214592275, "grad_norm": 0.09626314043998718, "learning_rate": 5.7388950872992764e-05, "loss": 2.67, "step": 84 }, { "epoch": 0.3648068669527897, "grad_norm": 0.11973975598812103, "learning_rate": 5.7301053334928e-05, "loss": 2.3961, "step": 85 }, { "epoch": 0.36909871244635195, "grad_norm": 0.09656506031751633, "learning_rate": 5.7211770317270696e-05, "loss": 2.5181, "step": 86 }, { "epoch": 0.37339055793991416, "grad_norm": 0.09045439958572388, "learning_rate": 5.712110635097422e-05, "loss": 2.6467, "step": 87 }, { "epoch": 0.3776824034334764, "grad_norm": 0.11997085064649582, "learning_rate": 5.702906603707257e-05, "loss": 2.6225, "step": 88 }, { "epoch": 0.38197424892703863, "grad_norm": 0.10198812931776047, "learning_rate": 5.6935654046446955e-05, "loss": 2.5756, "step": 89 }, { "epoch": 0.38626609442060084, "grad_norm": 0.12353700399398804, "learning_rate": 5.684087511958869e-05, "loss": 2.5561, "step": 90 }, { "epoch": 0.3905579399141631, "grad_norm": 0.09727780520915985, "learning_rate": 5.674473406635869e-05, "loss": 2.3674, "step": 91 }, { "epoch": 0.3948497854077253, "grad_norm": 0.13741353154182434, "learning_rate": 5.664723576574332e-05, "loss": 2.4249, "step": 92 }, { "epoch": 0.39914163090128757, "grad_norm": 0.09821099787950516, "learning_rate": 5.6548385165606835e-05, "loss": 2.5469, "step": 93 }, { "epoch": 0.4034334763948498, "grad_norm": 0.12704302370548248, "learning_rate": 5.644818728244027e-05, "loss": 2.5243, "step": 94 }, { "epoch": 0.40772532188841204, "grad_norm": 0.14363928139209747, "learning_rate": 5.634664720110686e-05, "loss": 2.3621, "step": 95 }, { "epoch": 0.41201716738197425, "grad_norm": 0.09331246465444565, "learning_rate": 5.6243770074583985e-05, "loss": 2.4098, "step": 96 }, { "epoch": 0.41630901287553645, "grad_norm": 0.13661201298236847, "learning_rate": 5.613956112370168e-05, "loss": 2.3998, "step": 97 }, { "epoch": 0.4206008583690987, "grad_norm": 0.15692031383514404, "learning_rate": 5.60340256368777e-05, "loss": 2.6108, "step": 98 }, { "epoch": 0.4248927038626609, "grad_norm": 0.1194879561662674, "learning_rate": 5.59271689698491e-05, "loss": 2.609, "step": 99 }, { "epoch": 0.4291845493562232, "grad_norm": 0.14594995975494385, "learning_rate": 5.581899654540049e-05, "loss": 2.3956, "step": 100 }, { "epoch": 0.4334763948497854, "grad_norm": 0.11137986928224564, "learning_rate": 5.570951385308879e-05, "loss": 2.6656, "step": 101 }, { "epoch": 0.43776824034334766, "grad_norm": 0.16587291657924652, "learning_rate": 5.559872644896467e-05, "loss": 2.4815, "step": 102 }, { "epoch": 0.44206008583690987, "grad_norm": 0.1582924723625183, "learning_rate": 5.548663995529062e-05, "loss": 2.4112, "step": 103 }, { "epoch": 0.44635193133047213, "grad_norm": 0.12027094513177872, "learning_rate": 5.5373260060255563e-05, "loss": 2.4611, "step": 104 }, { "epoch": 0.45064377682403434, "grad_norm": 0.17147387564182281, "learning_rate": 5.525859251768625e-05, "loss": 2.4084, "step": 105 }, { "epoch": 0.45493562231759654, "grad_norm": 0.138417050242424, "learning_rate": 5.5142643146755215e-05, "loss": 2.655, "step": 106 }, { "epoch": 0.4592274678111588, "grad_norm": 0.15900184214115143, "learning_rate": 5.5025417831685533e-05, "loss": 2.4029, "step": 107 }, { "epoch": 0.463519313304721, "grad_norm": 0.1893249750137329, "learning_rate": 5.4906922521452105e-05, "loss": 2.367, "step": 108 }, { "epoch": 0.4678111587982833, "grad_norm": 0.13178469240665436, "learning_rate": 5.478716322947985e-05, "loss": 2.4976, "step": 109 }, { "epoch": 0.4721030042918455, "grad_norm": 0.22058036923408508, "learning_rate": 5.466614603333848e-05, "loss": 2.4154, "step": 110 }, { "epoch": 0.47639484978540775, "grad_norm": 0.10874748975038528, "learning_rate": 5.4543877074434106e-05, "loss": 2.3568, "step": 111 }, { "epoch": 0.48068669527896996, "grad_norm": 0.1645105481147766, "learning_rate": 5.4420362557697546e-05, "loss": 2.5891, "step": 112 }, { "epoch": 0.48497854077253216, "grad_norm": 0.10352698713541031, "learning_rate": 5.429560875126946e-05, "loss": 2.2487, "step": 113 }, { "epoch": 0.4892703862660944, "grad_norm": 0.10524651408195496, "learning_rate": 5.4169621986182234e-05, "loss": 2.5208, "step": 114 }, { "epoch": 0.49356223175965663, "grad_norm": 0.10389380156993866, "learning_rate": 5.40424086560387e-05, "loss": 2.3373, "step": 115 }, { "epoch": 0.4978540772532189, "grad_norm": 0.11282224953174591, "learning_rate": 5.3913975216687675e-05, "loss": 2.4336, "step": 116 }, { "epoch": 0.5021459227467812, "grad_norm": 0.14653658866882324, "learning_rate": 5.378432818589633e-05, "loss": 2.5201, "step": 117 }, { "epoch": 0.5064377682403434, "grad_norm": 0.10697951167821884, "learning_rate": 5.365347414301943e-05, "loss": 2.5, "step": 118 }, { "epoch": 0.5107296137339056, "grad_norm": 0.08726029098033905, "learning_rate": 5.352141972866545e-05, "loss": 2.4833, "step": 119 }, { "epoch": 0.5150214592274678, "grad_norm": 0.1065029427409172, "learning_rate": 5.3388171644359565e-05, "loss": 2.3921, "step": 120 }, { "epoch": 0.51931330472103, "grad_norm": 0.12069697678089142, "learning_rate": 5.325373665220356e-05, "loss": 2.4335, "step": 121 }, { "epoch": 0.5236051502145923, "grad_norm": 0.11373002082109451, "learning_rate": 5.311812157453266e-05, "loss": 2.5685, "step": 122 }, { "epoch": 0.5278969957081545, "grad_norm": 0.13439607620239258, "learning_rate": 5.298133329356934e-05, "loss": 2.4198, "step": 123 }, { "epoch": 0.5321888412017167, "grad_norm": 0.10372010618448257, "learning_rate": 5.284337875107403e-05, "loss": 2.4413, "step": 124 }, { "epoch": 0.5364806866952789, "grad_norm": 0.1336512565612793, "learning_rate": 5.2704264947992855e-05, "loss": 2.5762, "step": 125 }, { "epoch": 0.5407725321888412, "grad_norm": 0.12671151757240295, "learning_rate": 5.256399894410232e-05, "loss": 2.5616, "step": 126 }, { "epoch": 0.5450643776824035, "grad_norm": 0.10133809596300125, "learning_rate": 5.242258785765106e-05, "loss": 2.3512, "step": 127 }, { "epoch": 0.5493562231759657, "grad_norm": 0.10729261487722397, "learning_rate": 5.228003886499863e-05, "loss": 2.4487, "step": 128 }, { "epoch": 0.5536480686695279, "grad_norm": 0.10313385725021362, "learning_rate": 5.213635920025127e-05, "loss": 2.3446, "step": 129 }, { "epoch": 0.5579399141630901, "grad_norm": 0.11684712022542953, "learning_rate": 5.1991556154894786e-05, "loss": 2.4377, "step": 130 }, { "epoch": 0.5622317596566524, "grad_norm": 0.12217995524406433, "learning_rate": 5.1845637077424576e-05, "loss": 2.4721, "step": 131 }, { "epoch": 0.5665236051502146, "grad_norm": 0.1254771202802658, "learning_rate": 5.169860937297264e-05, "loss": 2.3565, "step": 132 }, { "epoch": 0.5708154506437768, "grad_norm": 0.10808940976858139, "learning_rate": 5.155048050293183e-05, "loss": 2.464, "step": 133 }, { "epoch": 0.575107296137339, "grad_norm": 0.1136014387011528, "learning_rate": 5.140125798457716e-05, "loss": 2.4179, "step": 134 }, { "epoch": 0.5793991416309013, "grad_norm": 0.12022742629051208, "learning_rate": 5.125094939068439e-05, "loss": 2.6821, "step": 135 }, { "epoch": 0.5836909871244635, "grad_norm": 0.11005590111017227, "learning_rate": 5.109956234914558e-05, "loss": 2.5086, "step": 136 }, { "epoch": 0.5879828326180258, "grad_norm": 0.14693163335323334, "learning_rate": 5.0947104542582184e-05, "loss": 2.5458, "step": 137 }, { "epoch": 0.592274678111588, "grad_norm": 0.09363128244876862, "learning_rate": 5.0793583707954984e-05, "loss": 2.5145, "step": 138 }, { "epoch": 0.5965665236051502, "grad_norm": 0.19561366736888885, "learning_rate": 5.063900763617156e-05, "loss": 2.5379, "step": 139 }, { "epoch": 0.6008583690987125, "grad_norm": 0.10757976770401001, "learning_rate": 5.04833841716909e-05, "loss": 2.5076, "step": 140 }, { "epoch": 0.6051502145922747, "grad_norm": 0.13970068097114563, "learning_rate": 5.032672121212529e-05, "loss": 2.472, "step": 141 }, { "epoch": 0.6094420600858369, "grad_norm": 0.13635942339897156, "learning_rate": 5.0169026707839506e-05, "loss": 2.4703, "step": 142 }, { "epoch": 0.6137339055793991, "grad_norm": 0.10295607149600983, "learning_rate": 5.001030866154741e-05, "loss": 2.3987, "step": 143 }, { "epoch": 0.6180257510729614, "grad_norm": 0.1324918419122696, "learning_rate": 4.985057512790579e-05, "loss": 2.4456, "step": 144 }, { "epoch": 0.6223175965665236, "grad_norm": 0.10353874415159225, "learning_rate": 4.968983421310555e-05, "loss": 2.3827, "step": 145 }, { "epoch": 0.6266094420600858, "grad_norm": 0.10886164009571075, "learning_rate": 4.952809407446043e-05, "loss": 2.4664, "step": 146 }, { "epoch": 0.630901287553648, "grad_norm": 0.1082598865032196, "learning_rate": 4.9365362919992994e-05, "loss": 2.423, "step": 147 }, { "epoch": 0.6351931330472103, "grad_norm": 0.14654968678951263, "learning_rate": 4.9201649008018055e-05, "loss": 2.3452, "step": 148 }, { "epoch": 0.6394849785407726, "grad_norm": 0.10624393820762634, "learning_rate": 4.9036960646723617e-05, "loss": 2.4337, "step": 149 }, { "epoch": 0.6437768240343348, "grad_norm": 0.10619154572486877, "learning_rate": 4.887130619374927e-05, "loss": 2.2317, "step": 150 }, { "epoch": 0.648068669527897, "grad_norm": 0.1240013837814331, "learning_rate": 4.870469405576201e-05, "loss": 2.4641, "step": 151 }, { "epoch": 0.6523605150214592, "grad_norm": 0.11749018728733063, "learning_rate": 4.853713268802962e-05, "loss": 2.6394, "step": 152 }, { "epoch": 0.6566523605150214, "grad_norm": 0.11559685319662094, "learning_rate": 4.836863059399161e-05, "loss": 2.6346, "step": 153 }, { "epoch": 0.6609442060085837, "grad_norm": 0.12158619612455368, "learning_rate": 4.819919632482766e-05, "loss": 2.5024, "step": 154 }, { "epoch": 0.6652360515021459, "grad_norm": 0.10826972126960754, "learning_rate": 4.802883847902368e-05, "loss": 2.5089, "step": 155 }, { "epoch": 0.6695278969957081, "grad_norm": 0.12029755115509033, "learning_rate": 4.785756570193543e-05, "loss": 2.5017, "step": 156 }, { "epoch": 0.6738197424892703, "grad_norm": 0.11460699141025543, "learning_rate": 4.76853866853498e-05, "loss": 2.4295, "step": 157 }, { "epoch": 0.6781115879828327, "grad_norm": 0.10718906670808792, "learning_rate": 4.75123101670437e-05, "loss": 2.4886, "step": 158 }, { "epoch": 0.6824034334763949, "grad_norm": 0.11268241703510284, "learning_rate": 4.733834493034066e-05, "loss": 2.5896, "step": 159 }, { "epoch": 0.6866952789699571, "grad_norm": 0.12579713761806488, "learning_rate": 4.716349980366509e-05, "loss": 2.3132, "step": 160 }, { "epoch": 0.6909871244635193, "grad_norm": 0.11618609726428986, "learning_rate": 4.698778366009421e-05, "loss": 2.4478, "step": 161 }, { "epoch": 0.6952789699570815, "grad_norm": 0.12247326225042343, "learning_rate": 4.681120541690781e-05, "loss": 2.4638, "step": 162 }, { "epoch": 0.6995708154506438, "grad_norm": 0.11064168810844421, "learning_rate": 4.663377403513569e-05, "loss": 2.6396, "step": 163 }, { "epoch": 0.703862660944206, "grad_norm": 0.1138993427157402, "learning_rate": 4.64554985191029e-05, "loss": 2.3691, "step": 164 }, { "epoch": 0.7081545064377682, "grad_norm": 0.11352220177650452, "learning_rate": 4.6276387915972783e-05, "loss": 2.5472, "step": 165 }, { "epoch": 0.7124463519313304, "grad_norm": 0.13823212683200836, "learning_rate": 4.609645131528789e-05, "loss": 2.3696, "step": 166 }, { "epoch": 0.7167381974248928, "grad_norm": 0.16149644553661346, "learning_rate": 4.5915697848508645e-05, "loss": 2.3184, "step": 167 }, { "epoch": 0.721030042918455, "grad_norm": 0.10547155141830444, "learning_rate": 4.5734136688549964e-05, "loss": 2.4221, "step": 168 }, { "epoch": 0.7253218884120172, "grad_norm": 0.1176508441567421, "learning_rate": 4.555177704931576e-05, "loss": 2.4223, "step": 169 }, { "epoch": 0.7296137339055794, "grad_norm": 0.12838046252727509, "learning_rate": 4.5368628185231314e-05, "loss": 2.5376, "step": 170 }, { "epoch": 0.7339055793991416, "grad_norm": 0.1296073943376541, "learning_rate": 4.518469939077369e-05, "loss": 2.4453, "step": 171 }, { "epoch": 0.7381974248927039, "grad_norm": 0.13209278881549835, "learning_rate": 4.5e-05, "loss": 2.4902, "step": 172 }, { "epoch": 0.7424892703862661, "grad_norm": 0.13168583810329437, "learning_rate": 4.4814539386073744e-05, "loss": 2.2446, "step": 173 }, { "epoch": 0.7467811158798283, "grad_norm": 0.12592634558677673, "learning_rate": 4.462832696078915e-05, "loss": 2.5433, "step": 174 }, { "epoch": 0.7510729613733905, "grad_norm": 0.12793345749378204, "learning_rate": 4.4441372174093495e-05, "loss": 2.4155, "step": 175 }, { "epoch": 0.7553648068669528, "grad_norm": 0.12817440927028656, "learning_rate": 4.4253684513607585e-05, "loss": 2.4488, "step": 176 }, { "epoch": 0.759656652360515, "grad_norm": 0.12905055284500122, "learning_rate": 4.406527350414427e-05, "loss": 2.3792, "step": 177 }, { "epoch": 0.7639484978540773, "grad_norm": 0.11602860689163208, "learning_rate": 4.3876148707225067e-05, "loss": 2.2747, "step": 178 }, { "epoch": 0.7682403433476395, "grad_norm": 0.1343417763710022, "learning_rate": 4.368631972059489e-05, "loss": 2.3509, "step": 179 }, { "epoch": 0.7725321888412017, "grad_norm": 0.17558203637599945, "learning_rate": 4.349579617773507e-05, "loss": 2.2545, "step": 180 }, { "epoch": 0.776824034334764, "grad_norm": 0.11374380439519882, "learning_rate": 4.33045877473744e-05, "loss": 2.4592, "step": 181 }, { "epoch": 0.7811158798283262, "grad_norm": 0.16170468926429749, "learning_rate": 4.31127041329985e-05, "loss": 2.5294, "step": 182 }, { "epoch": 0.7854077253218884, "grad_norm": 0.1064993217587471, "learning_rate": 4.2920155072357335e-05, "loss": 2.4469, "step": 183 }, { "epoch": 0.7896995708154506, "grad_norm": 0.14201977849006653, "learning_rate": 4.2726950336971115e-05, "loss": 2.4416, "step": 184 }, { "epoch": 0.7939914163090128, "grad_norm": 0.12264000624418259, "learning_rate": 4.2533099731634376e-05, "loss": 2.5181, "step": 185 }, { "epoch": 0.7982832618025751, "grad_norm": 0.12580525875091553, "learning_rate": 4.233861309391835e-05, "loss": 2.3767, "step": 186 }, { "epoch": 0.8025751072961373, "grad_norm": 0.15109586715698242, "learning_rate": 4.214350029367181e-05, "loss": 2.3987, "step": 187 }, { "epoch": 0.8068669527896996, "grad_norm": 0.15967005491256714, "learning_rate": 4.1947771232520165e-05, "loss": 2.4359, "step": 188 }, { "epoch": 0.8111587982832618, "grad_norm": 0.10725586116313934, "learning_rate": 4.175143584336295e-05, "loss": 2.4294, "step": 189 }, { "epoch": 0.8154506437768241, "grad_norm": 0.12302330881357193, "learning_rate": 4.155450408986972e-05, "loss": 2.4109, "step": 190 }, { "epoch": 0.8197424892703863, "grad_norm": 0.13229899108409882, "learning_rate": 4.1356985965974536e-05, "loss": 2.5981, "step": 191 }, { "epoch": 0.8240343347639485, "grad_norm": 0.17274914681911469, "learning_rate": 4.115889149536863e-05, "loss": 2.3754, "step": 192 }, { "epoch": 0.8283261802575107, "grad_norm": 0.10760139673948288, "learning_rate": 4.0960230730991856e-05, "loss": 2.3867, "step": 193 }, { "epoch": 0.8326180257510729, "grad_norm": 0.17432305216789246, "learning_rate": 4.076101375452241e-05, "loss": 2.4265, "step": 194 }, { "epoch": 0.8369098712446352, "grad_norm": 0.15838249027729034, "learning_rate": 4.05612506758653e-05, "loss": 2.4993, "step": 195 }, { "epoch": 0.8412017167381974, "grad_norm": 0.12234227359294891, "learning_rate": 4.0360951632639226e-05, "loss": 2.3877, "step": 196 }, { "epoch": 0.8454935622317596, "grad_norm": 0.1838800013065338, "learning_rate": 4.016012678966213e-05, "loss": 2.2853, "step": 197 }, { "epoch": 0.8497854077253219, "grad_norm": 0.15433116257190704, "learning_rate": 3.995878633843535e-05, "loss": 2.4563, "step": 198 }, { "epoch": 0.8540772532188842, "grad_norm": 0.1247561052441597, "learning_rate": 3.9756940496626416e-05, "loss": 2.4479, "step": 199 }, { "epoch": 0.8583690987124464, "grad_norm": 0.1395588368177414, "learning_rate": 3.955459950755054e-05, "loss": 2.5016, "step": 200 }, { "epoch": 0.8626609442060086, "grad_norm": 0.1456160992383957, "learning_rate": 3.9351773639650745e-05, "loss": 2.3222, "step": 201 }, { "epoch": 0.8669527896995708, "grad_norm": 0.1138000562787056, "learning_rate": 3.914847318597682e-05, "loss": 2.503, "step": 202 }, { "epoch": 0.871244635193133, "grad_norm": 0.18854430317878723, "learning_rate": 3.894470846366291e-05, "loss": 2.3799, "step": 203 }, { "epoch": 0.8755364806866953, "grad_norm": 0.14914338290691376, "learning_rate": 3.874048981340397e-05, "loss": 2.4057, "step": 204 }, { "epoch": 0.8798283261802575, "grad_norm": 0.1462559849023819, "learning_rate": 3.853582759893097e-05, "loss": 2.3917, "step": 205 }, { "epoch": 0.8841201716738197, "grad_norm": 0.13625341653823853, "learning_rate": 3.833073220648501e-05, "loss": 2.3863, "step": 206 }, { "epoch": 0.8884120171673819, "grad_norm": 0.1319255530834198, "learning_rate": 3.812521404429016e-05, "loss": 2.3733, "step": 207 }, { "epoch": 0.8927038626609443, "grad_norm": 0.12159706652164459, "learning_rate": 3.7919283542025295e-05, "loss": 2.402, "step": 208 }, { "epoch": 0.8969957081545065, "grad_norm": 0.12551629543304443, "learning_rate": 3.7712951150294845e-05, "loss": 2.3653, "step": 209 }, { "epoch": 0.9012875536480687, "grad_norm": 0.12352804839611053, "learning_rate": 3.7506227340098376e-05, "loss": 2.2787, "step": 210 }, { "epoch": 0.9055793991416309, "grad_norm": 0.1033705621957779, "learning_rate": 3.729912260229926e-05, "loss": 2.2657, "step": 211 }, { "epoch": 0.9098712446351931, "grad_norm": 0.13322466611862183, "learning_rate": 3.7091647447092265e-05, "loss": 2.4372, "step": 212 }, { "epoch": 0.9141630901287554, "grad_norm": 0.1130763366818428, "learning_rate": 3.6883812403470165e-05, "loss": 2.3048, "step": 213 }, { "epoch": 0.9184549356223176, "grad_norm": 0.13779044151306152, "learning_rate": 3.667562801868943e-05, "loss": 2.4289, "step": 214 }, { "epoch": 0.9227467811158798, "grad_norm": 0.11905843019485474, "learning_rate": 3.646710485773499e-05, "loss": 2.513, "step": 215 }, { "epoch": 0.927038626609442, "grad_norm": 0.14390388131141663, "learning_rate": 3.625825350278403e-05, "loss": 2.3498, "step": 216 }, { "epoch": 0.9313304721030042, "grad_norm": 0.11969032138586044, "learning_rate": 3.6049084552669e-05, "loss": 2.3459, "step": 217 }, { "epoch": 0.9356223175965666, "grad_norm": 0.12738005816936493, "learning_rate": 3.5839608622339755e-05, "loss": 2.2709, "step": 218 }, { "epoch": 0.9399141630901288, "grad_norm": 0.12393856048583984, "learning_rate": 3.562983634232483e-05, "loss": 2.3483, "step": 219 }, { "epoch": 0.944206008583691, "grad_norm": 0.12841208279132843, "learning_rate": 3.541977835819197e-05, "loss": 2.3226, "step": 220 }, { "epoch": 0.9484978540772532, "grad_norm": 0.12215293943881989, "learning_rate": 3.520944533000792e-05, "loss": 2.4929, "step": 221 }, { "epoch": 0.9527896995708155, "grad_norm": 0.10832389444112778, "learning_rate": 3.4998847931797374e-05, "loss": 2.4507, "step": 222 }, { "epoch": 0.9570815450643777, "grad_norm": 0.10891756415367126, "learning_rate": 3.478799685100138e-05, "loss": 2.4425, "step": 223 }, { "epoch": 0.9613733905579399, "grad_norm": 0.124311164021492, "learning_rate": 3.457690278793489e-05, "loss": 2.5339, "step": 224 }, { "epoch": 0.9656652360515021, "grad_norm": 0.17225196957588196, "learning_rate": 3.436557645524379e-05, "loss": 2.4238, "step": 225 }, { "epoch": 0.9699570815450643, "grad_norm": 0.12012173235416412, "learning_rate": 3.415402857736122e-05, "loss": 2.5209, "step": 226 }, { "epoch": 0.9742489270386266, "grad_norm": 0.17129479348659515, "learning_rate": 3.394226988996336e-05, "loss": 2.5539, "step": 227 }, { "epoch": 0.9785407725321889, "grad_norm": 0.13525094091892242, "learning_rate": 3.373031113942456e-05, "loss": 2.2249, "step": 228 }, { "epoch": 0.9828326180257511, "grad_norm": 0.11054141074419022, "learning_rate": 3.351816308227206e-05, "loss": 2.4088, "step": 229 }, { "epoch": 0.9871244635193133, "grad_norm": 0.11343766748905182, "learning_rate": 3.330583648464004e-05, "loss": 2.0978, "step": 230 }, { "epoch": 0.9914163090128756, "grad_norm": 0.22153422236442566, "learning_rate": 3.309334212172331e-05, "loss": 2.5402, "step": 231 }, { "epoch": 0.9957081545064378, "grad_norm": 0.11208418756723404, "learning_rate": 3.288069077723045e-05, "loss": 2.3747, "step": 232 }, { "epoch": 1.0, "grad_norm": 0.1252536177635193, "learning_rate": 3.26678932428366e-05, "loss": 2.4297, "step": 233 }, { "epoch": 1.0042918454935623, "grad_norm": 0.23914779722690582, "learning_rate": 3.2454960317635747e-05, "loss": 2.5602, "step": 234 }, { "epoch": 1.0042918454935623, "grad_norm": 0.15806083381175995, "learning_rate": 3.2241902807592734e-05, "loss": 2.3144, "step": 235 }, { "epoch": 1.0085836909871244, "grad_norm": 0.11224311590194702, "learning_rate": 3.202873152499485e-05, "loss": 2.3083, "step": 236 }, { "epoch": 1.0128755364806867, "grad_norm": 0.12876267731189728, "learning_rate": 3.181545728790317e-05, "loss": 2.2468, "step": 237 }, { "epoch": 1.0171673819742488, "grad_norm": 0.16172175109386444, "learning_rate": 3.160209091960347e-05, "loss": 2.4206, "step": 238 }, { "epoch": 1.0214592274678111, "grad_norm": 0.10915794968605042, "learning_rate": 3.138864324805707e-05, "loss": 2.3789, "step": 239 }, { "epoch": 1.0257510729613735, "grad_norm": 0.18216541409492493, "learning_rate": 3.117512510535128e-05, "loss": 2.3726, "step": 240 } ], "logging_steps": 1, "max_steps": 466, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 24, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3380404450615624e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }