{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.471956224350206, "eval_steps": 500, "global_step": 12000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022799817601459188, "grad_norm": 2.5625, "learning_rate": 3.3333333333333335e-05, "loss": 4.9225, "step": 50 }, { "epoch": 0.045599635202918376, "grad_norm": 3.546875, "learning_rate": 6.666666666666667e-05, "loss": 2.1478, "step": 100 }, { "epoch": 0.06839945280437756, "grad_norm": 1.8671875, "learning_rate": 0.0001, "loss": 0.6361, "step": 150 }, { "epoch": 0.09119927040583675, "grad_norm": 1.46875, "learning_rate": 0.00013333333333333334, "loss": 0.467, "step": 200 }, { "epoch": 0.11399908800729594, "grad_norm": 2.15625, "learning_rate": 0.0001666666666666667, "loss": 0.4381, "step": 250 }, { "epoch": 0.13679890560875513, "grad_norm": 1.8203125, "learning_rate": 0.0002, "loss": 0.3365, "step": 300 }, { "epoch": 0.15959872321021432, "grad_norm": 1.4765625, "learning_rate": 0.00019988560970029743, "loss": 0.3255, "step": 350 }, { "epoch": 0.1823985408116735, "grad_norm": 1.40625, "learning_rate": 0.00019977121940059484, "loss": 0.2946, "step": 400 }, { "epoch": 0.2051983584131327, "grad_norm": 1.1328125, "learning_rate": 0.00019965682910089226, "loss": 0.3023, "step": 450 }, { "epoch": 0.22799817601459188, "grad_norm": 1.1640625, "learning_rate": 0.00019954243880118967, "loss": 0.2708, "step": 500 }, { "epoch": 0.2507979936160511, "grad_norm": 0.9921875, "learning_rate": 0.0001994280485014871, "loss": 0.3645, "step": 550 }, { "epoch": 0.27359781121751026, "grad_norm": 0.82421875, "learning_rate": 0.00019931365820178448, "loss": 0.3072, "step": 600 }, { "epoch": 0.29639762881896947, "grad_norm": 1.2265625, "learning_rate": 0.00019919926790208192, "loss": 0.2891, "step": 650 }, { "epoch": 0.31919744642042863, "grad_norm": 2.015625, "learning_rate": 0.00019908487760237934, "loss": 0.2763, "step": 700 }, { "epoch": 0.34199726402188785, "grad_norm": 1.5078125, "learning_rate": 0.00019897048730267673, "loss": 0.2824, "step": 750 }, { "epoch": 0.364797081623347, "grad_norm": 1.4140625, "learning_rate": 0.00019885609700297417, "loss": 0.3007, "step": 800 }, { "epoch": 0.3875968992248062, "grad_norm": 1.359375, "learning_rate": 0.00019874170670327156, "loss": 0.2955, "step": 850 }, { "epoch": 0.4103967168262654, "grad_norm": 1.078125, "learning_rate": 0.000198627316403569, "loss": 0.2385, "step": 900 }, { "epoch": 0.4331965344277246, "grad_norm": 0.97265625, "learning_rate": 0.0001985129261038664, "loss": 0.2461, "step": 950 }, { "epoch": 0.45599635202918376, "grad_norm": 1.421875, "learning_rate": 0.0001983985358041638, "loss": 0.2567, "step": 1000 }, { "epoch": 0.478796169630643, "grad_norm": 1.25, "learning_rate": 0.00019828414550446125, "loss": 0.2189, "step": 1050 }, { "epoch": 0.5015959872321022, "grad_norm": 1.0703125, "learning_rate": 0.00019816975520475864, "loss": 0.272, "step": 1100 }, { "epoch": 0.5243958048335613, "grad_norm": 1.78125, "learning_rate": 0.00019805536490505606, "loss": 0.2615, "step": 1150 }, { "epoch": 0.5471956224350205, "grad_norm": 1.0859375, "learning_rate": 0.00019794097460535347, "loss": 0.2476, "step": 1200 }, { "epoch": 0.5699954400364797, "grad_norm": 1.328125, "learning_rate": 0.0001978265843056509, "loss": 0.2327, "step": 1250 }, { "epoch": 0.5927952576379389, "grad_norm": 1.1953125, "learning_rate": 0.0001977121940059483, "loss": 0.2664, "step": 1300 }, { "epoch": 0.615595075239398, "grad_norm": 1.109375, "learning_rate": 0.00019759780370624572, "loss": 0.2266, "step": 1350 }, { "epoch": 0.6383948928408573, "grad_norm": 1.2265625, "learning_rate": 0.00019748341340654314, "loss": 0.2234, "step": 1400 }, { "epoch": 0.6611947104423165, "grad_norm": 1.1875, "learning_rate": 0.00019736902310684055, "loss": 0.2463, "step": 1450 }, { "epoch": 0.6839945280437757, "grad_norm": 1.265625, "learning_rate": 0.00019725463280713797, "loss": 0.2311, "step": 1500 }, { "epoch": 0.7067943456452348, "grad_norm": 1.09375, "learning_rate": 0.00019714024250743536, "loss": 0.2251, "step": 1550 }, { "epoch": 0.729594163246694, "grad_norm": 1.046875, "learning_rate": 0.0001970258522077328, "loss": 0.2492, "step": 1600 }, { "epoch": 0.7523939808481532, "grad_norm": 1.3984375, "learning_rate": 0.00019691146190803022, "loss": 0.2554, "step": 1650 }, { "epoch": 0.7751937984496124, "grad_norm": 0.8125, "learning_rate": 0.00019679707160832763, "loss": 0.2384, "step": 1700 }, { "epoch": 0.7979936160510716, "grad_norm": 1.3125, "learning_rate": 0.00019668268130862505, "loss": 0.2583, "step": 1750 }, { "epoch": 0.8207934336525308, "grad_norm": 0.625, "learning_rate": 0.00019656829100892244, "loss": 0.2343, "step": 1800 }, { "epoch": 0.84359325125399, "grad_norm": 1.0234375, "learning_rate": 0.00019645390070921988, "loss": 0.2193, "step": 1850 }, { "epoch": 0.8663930688554492, "grad_norm": 0.875, "learning_rate": 0.00019633951040951727, "loss": 0.2212, "step": 1900 }, { "epoch": 0.8891928864569083, "grad_norm": 1.578125, "learning_rate": 0.00019622512010981468, "loss": 0.2178, "step": 1950 }, { "epoch": 0.9119927040583675, "grad_norm": 0.8125, "learning_rate": 0.00019611072981011213, "loss": 0.2135, "step": 2000 }, { "epoch": 0.9347925216598267, "grad_norm": 0.92578125, "learning_rate": 0.00019599633951040952, "loss": 0.2522, "step": 2050 }, { "epoch": 0.957592339261286, "grad_norm": 0.8828125, "learning_rate": 0.00019588194921070696, "loss": 0.2138, "step": 2100 }, { "epoch": 0.9803921568627451, "grad_norm": 1.3125, "learning_rate": 0.00019576755891100435, "loss": 0.2043, "step": 2150 }, { "epoch": 1.0031919744642044, "grad_norm": 1.0546875, "learning_rate": 0.00019565316861130177, "loss": 0.1855, "step": 2200 }, { "epoch": 1.0259917920656634, "grad_norm": 1.1796875, "learning_rate": 0.00019553877831159918, "loss": 0.2099, "step": 2250 }, { "epoch": 1.0487916096671226, "grad_norm": 0.99609375, "learning_rate": 0.0001954243880118966, "loss": 0.2032, "step": 2300 }, { "epoch": 1.0715914272685818, "grad_norm": 0.91015625, "learning_rate": 0.000195309997712194, "loss": 0.1753, "step": 2350 }, { "epoch": 1.094391244870041, "grad_norm": 1.359375, "learning_rate": 0.00019519560741249143, "loss": 0.1997, "step": 2400 }, { "epoch": 1.1171910624715002, "grad_norm": 1.3203125, "learning_rate": 0.00019508121711278885, "loss": 0.2101, "step": 2450 }, { "epoch": 1.1399908800729595, "grad_norm": 0.8046875, "learning_rate": 0.00019496682681308626, "loss": 0.1734, "step": 2500 }, { "epoch": 1.1627906976744187, "grad_norm": 1.0078125, "learning_rate": 0.00019485243651338368, "loss": 0.1716, "step": 2550 }, { "epoch": 1.1855905152758779, "grad_norm": 0.78515625, "learning_rate": 0.0001947380462136811, "loss": 0.1588, "step": 2600 }, { "epoch": 1.2083903328773369, "grad_norm": 0.85546875, "learning_rate": 0.0001946236559139785, "loss": 0.166, "step": 2650 }, { "epoch": 1.231190150478796, "grad_norm": 1.296875, "learning_rate": 0.00019450926561427593, "loss": 0.1514, "step": 2700 }, { "epoch": 1.2539899680802553, "grad_norm": 1.3125, "learning_rate": 0.00019439487531457334, "loss": 0.2103, "step": 2750 }, { "epoch": 1.2767897856817145, "grad_norm": 1.3984375, "learning_rate": 0.00019428048501487076, "loss": 0.185, "step": 2800 }, { "epoch": 1.2995896032831737, "grad_norm": 1.1015625, "learning_rate": 0.00019416609471516815, "loss": 0.1498, "step": 2850 }, { "epoch": 1.322389420884633, "grad_norm": 1.28125, "learning_rate": 0.0001940517044154656, "loss": 0.1706, "step": 2900 }, { "epoch": 1.3451892384860922, "grad_norm": 1.515625, "learning_rate": 0.00019393731411576298, "loss": 0.1579, "step": 2950 }, { "epoch": 1.3679890560875512, "grad_norm": 1.28125, "learning_rate": 0.0001938229238160604, "loss": 0.1777, "step": 3000 }, { "epoch": 1.3907888736890106, "grad_norm": 0.734375, "learning_rate": 0.00019370853351635784, "loss": 0.168, "step": 3050 }, { "epoch": 1.4135886912904696, "grad_norm": 0.5546875, "learning_rate": 0.00019359414321665523, "loss": 0.1325, "step": 3100 }, { "epoch": 1.4363885088919288, "grad_norm": 0.4453125, "learning_rate": 0.00019347975291695267, "loss": 0.1483, "step": 3150 }, { "epoch": 1.459188326493388, "grad_norm": 0.953125, "learning_rate": 0.00019336536261725006, "loss": 0.1424, "step": 3200 }, { "epoch": 1.4819881440948472, "grad_norm": 1.03125, "learning_rate": 0.00019325097231754748, "loss": 0.1352, "step": 3250 }, { "epoch": 1.5047879616963065, "grad_norm": 1.7421875, "learning_rate": 0.0001931365820178449, "loss": 0.1583, "step": 3300 }, { "epoch": 1.5275877792977655, "grad_norm": 1.40625, "learning_rate": 0.0001930221917181423, "loss": 0.1496, "step": 3350 }, { "epoch": 1.550387596899225, "grad_norm": 1.578125, "learning_rate": 0.00019290780141843972, "loss": 0.1551, "step": 3400 }, { "epoch": 1.573187414500684, "grad_norm": 0.59375, "learning_rate": 0.00019279341111873714, "loss": 0.1275, "step": 3450 }, { "epoch": 1.5959872321021433, "grad_norm": 1.25, "learning_rate": 0.00019267902081903456, "loss": 0.1685, "step": 3500 }, { "epoch": 1.6187870497036023, "grad_norm": 1.171875, "learning_rate": 0.00019256463051933197, "loss": 0.1271, "step": 3550 }, { "epoch": 1.6415868673050615, "grad_norm": 0.98828125, "learning_rate": 0.0001924502402196294, "loss": 0.1344, "step": 3600 }, { "epoch": 1.6643866849065208, "grad_norm": 0.72265625, "learning_rate": 0.0001923358499199268, "loss": 0.1516, "step": 3650 }, { "epoch": 1.68718650250798, "grad_norm": 0.7109375, "learning_rate": 0.00019222145962022422, "loss": 0.1379, "step": 3700 }, { "epoch": 1.7099863201094392, "grad_norm": 0.37890625, "learning_rate": 0.00019210706932052164, "loss": 0.1396, "step": 3750 }, { "epoch": 1.7327861377108982, "grad_norm": 1.25, "learning_rate": 0.00019199267902081902, "loss": 0.1544, "step": 3800 }, { "epoch": 1.7555859553123576, "grad_norm": 1.390625, "learning_rate": 0.00019187828872111647, "loss": 0.1639, "step": 3850 }, { "epoch": 1.7783857729138166, "grad_norm": 1.734375, "learning_rate": 0.00019176389842141386, "loss": 0.1473, "step": 3900 }, { "epoch": 1.8011855905152758, "grad_norm": 0.9765625, "learning_rate": 0.0001916495081217113, "loss": 0.1606, "step": 3950 }, { "epoch": 1.823985408116735, "grad_norm": 0.326171875, "learning_rate": 0.00019153511782200872, "loss": 0.155, "step": 4000 }, { "epoch": 1.8467852257181943, "grad_norm": 1.1171875, "learning_rate": 0.0001914207275223061, "loss": 0.1222, "step": 4050 }, { "epoch": 1.8695850433196535, "grad_norm": 1.828125, "learning_rate": 0.00019130633722260355, "loss": 0.1476, "step": 4100 }, { "epoch": 1.8923848609211125, "grad_norm": 0.67578125, "learning_rate": 0.00019119194692290094, "loss": 0.1358, "step": 4150 }, { "epoch": 1.915184678522572, "grad_norm": 1.59375, "learning_rate": 0.00019107755662319838, "loss": 0.1439, "step": 4200 }, { "epoch": 1.937984496124031, "grad_norm": 1.2109375, "learning_rate": 0.00019096316632349577, "loss": 0.1699, "step": 4250 }, { "epoch": 1.9607843137254903, "grad_norm": 1.09375, "learning_rate": 0.00019084877602379319, "loss": 0.1364, "step": 4300 }, { "epoch": 1.9835841313269493, "grad_norm": 1.328125, "learning_rate": 0.0001907343857240906, "loss": 0.1242, "step": 4350 }, { "epoch": 2.0063839489284088, "grad_norm": 1.6328125, "learning_rate": 0.00019061999542438802, "loss": 0.1146, "step": 4400 }, { "epoch": 2.0291837665298678, "grad_norm": 0.9609375, "learning_rate": 0.00019050560512468543, "loss": 0.1327, "step": 4450 }, { "epoch": 2.0519835841313268, "grad_norm": 1.0390625, "learning_rate": 0.00019039121482498285, "loss": 0.1278, "step": 4500 }, { "epoch": 2.074783401732786, "grad_norm": 1.484375, "learning_rate": 0.00019027682452528027, "loss": 0.1069, "step": 4550 }, { "epoch": 2.097583219334245, "grad_norm": 1.3125, "learning_rate": 0.00019016243422557768, "loss": 0.1351, "step": 4600 }, { "epoch": 2.1203830369357046, "grad_norm": 1.7421875, "learning_rate": 0.0001900480439258751, "loss": 0.1372, "step": 4650 }, { "epoch": 2.1431828545371636, "grad_norm": 1.4140625, "learning_rate": 0.0001899336536261725, "loss": 0.1093, "step": 4700 }, { "epoch": 2.165982672138623, "grad_norm": 0.8046875, "learning_rate": 0.00018981926332646993, "loss": 0.1, "step": 4750 }, { "epoch": 2.188782489740082, "grad_norm": 1.109375, "learning_rate": 0.00018970487302676735, "loss": 0.1016, "step": 4800 }, { "epoch": 2.2115823073415415, "grad_norm": 0.76953125, "learning_rate": 0.00018959048272706473, "loss": 0.0976, "step": 4850 }, { "epoch": 2.2343821249430005, "grad_norm": 1.7734375, "learning_rate": 0.00018947609242736218, "loss": 0.1132, "step": 4900 }, { "epoch": 2.2571819425444595, "grad_norm": 1.0625, "learning_rate": 0.00018936170212765957, "loss": 0.1174, "step": 4950 }, { "epoch": 2.279981760145919, "grad_norm": 1.59375, "learning_rate": 0.000189247311827957, "loss": 0.1191, "step": 5000 }, { "epoch": 2.302781577747378, "grad_norm": 0.7734375, "learning_rate": 0.00018913292152825443, "loss": 0.0933, "step": 5050 }, { "epoch": 2.3255813953488373, "grad_norm": 1.3984375, "learning_rate": 0.00018901853122855181, "loss": 0.1065, "step": 5100 }, { "epoch": 2.3483812129502963, "grad_norm": 0.8125, "learning_rate": 0.00018890414092884926, "loss": 0.0993, "step": 5150 }, { "epoch": 2.3711810305517558, "grad_norm": 1.265625, "learning_rate": 0.00018878975062914665, "loss": 0.1179, "step": 5200 }, { "epoch": 2.3939808481532148, "grad_norm": 0.8671875, "learning_rate": 0.00018867536032944406, "loss": 0.0982, "step": 5250 }, { "epoch": 2.4167806657546738, "grad_norm": 1.203125, "learning_rate": 0.00018856097002974148, "loss": 0.0831, "step": 5300 }, { "epoch": 2.439580483356133, "grad_norm": 0.9453125, "learning_rate": 0.0001884465797300389, "loss": 0.0943, "step": 5350 }, { "epoch": 2.462380300957592, "grad_norm": 1.484375, "learning_rate": 0.00018833218943033634, "loss": 0.088, "step": 5400 }, { "epoch": 2.4851801185590516, "grad_norm": 1.59375, "learning_rate": 0.00018821779913063373, "loss": 0.0952, "step": 5450 }, { "epoch": 2.5079799361605106, "grad_norm": 1.7421875, "learning_rate": 0.00018810340883093114, "loss": 0.0952, "step": 5500 }, { "epoch": 2.53077975376197, "grad_norm": 0.76171875, "learning_rate": 0.00018798901853122856, "loss": 0.0901, "step": 5550 }, { "epoch": 2.553579571363429, "grad_norm": 0.58984375, "learning_rate": 0.00018787462823152598, "loss": 0.0989, "step": 5600 }, { "epoch": 2.576379388964888, "grad_norm": 0.87890625, "learning_rate": 0.0001877602379318234, "loss": 0.0794, "step": 5650 }, { "epoch": 2.5991792065663475, "grad_norm": 0.9921875, "learning_rate": 0.0001876458476321208, "loss": 0.1076, "step": 5700 }, { "epoch": 2.621979024167807, "grad_norm": 0.64453125, "learning_rate": 0.00018753145733241822, "loss": 0.0745, "step": 5750 }, { "epoch": 2.644778841769266, "grad_norm": 0.6875, "learning_rate": 0.00018741706703271564, "loss": 0.0938, "step": 5800 }, { "epoch": 2.667578659370725, "grad_norm": 0.478515625, "learning_rate": 0.00018730267673301306, "loss": 0.0888, "step": 5850 }, { "epoch": 2.6903784769721844, "grad_norm": 0.98828125, "learning_rate": 0.00018718828643331044, "loss": 0.0836, "step": 5900 }, { "epoch": 2.7131782945736433, "grad_norm": 0.53125, "learning_rate": 0.0001870738961336079, "loss": 0.0821, "step": 5950 }, { "epoch": 2.7359781121751023, "grad_norm": 0.875, "learning_rate": 0.0001869595058339053, "loss": 0.1043, "step": 6000 }, { "epoch": 2.7587779297765618, "grad_norm": 0.32421875, "learning_rate": 0.00018684511553420272, "loss": 0.1005, "step": 6050 }, { "epoch": 2.781577747378021, "grad_norm": 2.3125, "learning_rate": 0.00018673072523450014, "loss": 0.1028, "step": 6100 }, { "epoch": 2.80437756497948, "grad_norm": 1.9453125, "learning_rate": 0.00018661633493479752, "loss": 0.1046, "step": 6150 }, { "epoch": 2.827177382580939, "grad_norm": 0.875, "learning_rate": 0.00018650194463509497, "loss": 0.0964, "step": 6200 }, { "epoch": 2.8499772001823986, "grad_norm": 1.421875, "learning_rate": 0.00018638755433539236, "loss": 0.0775, "step": 6250 }, { "epoch": 2.8727770177838576, "grad_norm": 0.75390625, "learning_rate": 0.00018627316403568977, "loss": 0.0963, "step": 6300 }, { "epoch": 2.895576835385317, "grad_norm": 0.7421875, "learning_rate": 0.0001861587737359872, "loss": 0.0903, "step": 6350 }, { "epoch": 2.918376652986776, "grad_norm": 1.078125, "learning_rate": 0.0001860443834362846, "loss": 0.0914, "step": 6400 }, { "epoch": 2.9411764705882355, "grad_norm": 1.2109375, "learning_rate": 0.00018592999313658205, "loss": 0.108, "step": 6450 }, { "epoch": 2.9639762881896945, "grad_norm": 0.83203125, "learning_rate": 0.00018581560283687944, "loss": 0.0876, "step": 6500 }, { "epoch": 2.9867761057911535, "grad_norm": 0.73046875, "learning_rate": 0.00018570121253717685, "loss": 0.0834, "step": 6550 }, { "epoch": 3.009575923392613, "grad_norm": 0.3828125, "learning_rate": 0.00018558682223747427, "loss": 0.0786, "step": 6600 }, { "epoch": 3.032375740994072, "grad_norm": 1.2109375, "learning_rate": 0.00018547243193777169, "loss": 0.0803, "step": 6650 }, { "epoch": 3.0551755585955314, "grad_norm": 1.59375, "learning_rate": 0.0001853580416380691, "loss": 0.0846, "step": 6700 }, { "epoch": 3.0779753761969904, "grad_norm": 0.5234375, "learning_rate": 0.00018524365133836652, "loss": 0.0562, "step": 6750 }, { "epoch": 3.10077519379845, "grad_norm": 1.9765625, "learning_rate": 0.00018512926103866393, "loss": 0.0934, "step": 6800 }, { "epoch": 3.123575011399909, "grad_norm": 0.7578125, "learning_rate": 0.00018501487073896135, "loss": 0.0813, "step": 6850 }, { "epoch": 3.146374829001368, "grad_norm": 1.171875, "learning_rate": 0.00018490048043925877, "loss": 0.0661, "step": 6900 }, { "epoch": 3.169174646602827, "grad_norm": 0.5234375, "learning_rate": 0.00018478609013955615, "loss": 0.0589, "step": 6950 }, { "epoch": 3.191974464204286, "grad_norm": 1.4609375, "learning_rate": 0.0001846716998398536, "loss": 0.0679, "step": 7000 }, { "epoch": 3.2147742818057456, "grad_norm": 1.515625, "learning_rate": 0.000184557309540151, "loss": 0.0564, "step": 7050 }, { "epoch": 3.2375740994072046, "grad_norm": 1.4453125, "learning_rate": 0.0001844429192404484, "loss": 0.0763, "step": 7100 }, { "epoch": 3.260373917008664, "grad_norm": 1.953125, "learning_rate": 0.00018432852894074585, "loss": 0.0768, "step": 7150 }, { "epoch": 3.283173734610123, "grad_norm": 0.56640625, "learning_rate": 0.00018421413864104323, "loss": 0.0686, "step": 7200 }, { "epoch": 3.305973552211582, "grad_norm": 1.0703125, "learning_rate": 0.00018409974834134068, "loss": 0.0638, "step": 7250 }, { "epoch": 3.3287733698130415, "grad_norm": 0.275390625, "learning_rate": 0.00018398535804163807, "loss": 0.0643, "step": 7300 }, { "epoch": 3.3515731874145005, "grad_norm": 0.8984375, "learning_rate": 0.00018387096774193548, "loss": 0.067, "step": 7350 }, { "epoch": 3.37437300501596, "grad_norm": 1.28125, "learning_rate": 0.00018375657744223293, "loss": 0.0756, "step": 7400 }, { "epoch": 3.397172822617419, "grad_norm": 1.1640625, "learning_rate": 0.00018364218714253031, "loss": 0.0579, "step": 7450 }, { "epoch": 3.4199726402188784, "grad_norm": 0.62890625, "learning_rate": 0.00018352779684282773, "loss": 0.0533, "step": 7500 }, { "epoch": 3.4427724578203374, "grad_norm": 0.62890625, "learning_rate": 0.00018341340654312515, "loss": 0.0569, "step": 7550 }, { "epoch": 3.465572275421797, "grad_norm": 1.328125, "learning_rate": 0.00018329901624342256, "loss": 0.0593, "step": 7600 }, { "epoch": 3.488372093023256, "grad_norm": 1.859375, "learning_rate": 0.00018318462594371998, "loss": 0.0664, "step": 7650 }, { "epoch": 3.5111719106247152, "grad_norm": 1.109375, "learning_rate": 0.0001830702356440174, "loss": 0.0593, "step": 7700 }, { "epoch": 3.5339717282261742, "grad_norm": 0.6640625, "learning_rate": 0.0001829558453443148, "loss": 0.063, "step": 7750 }, { "epoch": 3.556771545827633, "grad_norm": 0.2333984375, "learning_rate": 0.00018284145504461223, "loss": 0.0553, "step": 7800 }, { "epoch": 3.5795713634290927, "grad_norm": 0.60546875, "learning_rate": 0.00018272706474490964, "loss": 0.0513, "step": 7850 }, { "epoch": 3.6023711810305517, "grad_norm": 0.421875, "learning_rate": 0.00018261267444520706, "loss": 0.0678, "step": 7900 }, { "epoch": 3.625170998632011, "grad_norm": 0.87890625, "learning_rate": 0.00018249828414550448, "loss": 0.0446, "step": 7950 }, { "epoch": 3.64797081623347, "grad_norm": 0.6640625, "learning_rate": 0.0001823838938458019, "loss": 0.0588, "step": 8000 }, { "epoch": 3.6707706338349295, "grad_norm": 0.2734375, "learning_rate": 0.0001822695035460993, "loss": 0.0564, "step": 8050 }, { "epoch": 3.6935704514363885, "grad_norm": 0.294921875, "learning_rate": 0.00018215511324639672, "loss": 0.0526, "step": 8100 }, { "epoch": 3.7163702690378475, "grad_norm": 1.765625, "learning_rate": 0.0001820407229466941, "loss": 0.0609, "step": 8150 }, { "epoch": 3.739170086639307, "grad_norm": 0.76953125, "learning_rate": 0.00018192633264699156, "loss": 0.0679, "step": 8200 }, { "epoch": 3.761969904240766, "grad_norm": 0.48828125, "learning_rate": 0.00018181194234728894, "loss": 0.0638, "step": 8250 }, { "epoch": 3.7847697218422254, "grad_norm": 0.9765625, "learning_rate": 0.0001816975520475864, "loss": 0.0647, "step": 8300 }, { "epoch": 3.8075695394436844, "grad_norm": 0.84375, "learning_rate": 0.00018158316174788378, "loss": 0.0747, "step": 8350 }, { "epoch": 3.830369357045144, "grad_norm": 0.53515625, "learning_rate": 0.0001814687714481812, "loss": 0.0586, "step": 8400 }, { "epoch": 3.853169174646603, "grad_norm": 1.3828125, "learning_rate": 0.00018135438114847864, "loss": 0.0544, "step": 8450 }, { "epoch": 3.875968992248062, "grad_norm": 1.1640625, "learning_rate": 0.00018123999084877602, "loss": 0.0644, "step": 8500 }, { "epoch": 3.8987688098495212, "grad_norm": 0.94140625, "learning_rate": 0.00018112560054907344, "loss": 0.0559, "step": 8550 }, { "epoch": 3.9215686274509802, "grad_norm": 0.72265625, "learning_rate": 0.00018101121024937086, "loss": 0.0608, "step": 8600 }, { "epoch": 3.9443684450524397, "grad_norm": 1.578125, "learning_rate": 0.00018089681994966827, "loss": 0.0688, "step": 8650 }, { "epoch": 3.9671682626538987, "grad_norm": 1.046875, "learning_rate": 0.0001807824296499657, "loss": 0.0614, "step": 8700 }, { "epoch": 3.989968080255358, "grad_norm": 0.79296875, "learning_rate": 0.0001806680393502631, "loss": 0.0515, "step": 8750 }, { "epoch": 4.0127678978568175, "grad_norm": 1.046875, "learning_rate": 0.00018055364905056052, "loss": 0.0527, "step": 8800 }, { "epoch": 4.035567715458276, "grad_norm": 0.8515625, "learning_rate": 0.00018043925875085794, "loss": 0.0567, "step": 8850 }, { "epoch": 4.0583675330597355, "grad_norm": 0.92578125, "learning_rate": 0.00018032486845115535, "loss": 0.0501, "step": 8900 }, { "epoch": 4.081167350661195, "grad_norm": 1.109375, "learning_rate": 0.00018021047815145274, "loss": 0.0329, "step": 8950 }, { "epoch": 4.1039671682626535, "grad_norm": 1.453125, "learning_rate": 0.00018009608785175019, "loss": 0.0615, "step": 9000 }, { "epoch": 4.126766985864113, "grad_norm": 0.32421875, "learning_rate": 0.0001799816975520476, "loss": 0.0471, "step": 9050 }, { "epoch": 4.149566803465572, "grad_norm": 0.63671875, "learning_rate": 0.00017986730725234502, "loss": 0.0467, "step": 9100 }, { "epoch": 4.172366621067032, "grad_norm": 0.462890625, "learning_rate": 0.00017975291695264243, "loss": 0.0381, "step": 9150 }, { "epoch": 4.19516643866849, "grad_norm": 1.2890625, "learning_rate": 0.00017963852665293982, "loss": 0.0414, "step": 9200 }, { "epoch": 4.21796625626995, "grad_norm": 0.66015625, "learning_rate": 0.00017952413635323727, "loss": 0.0412, "step": 9250 }, { "epoch": 4.240766073871409, "grad_norm": 0.96875, "learning_rate": 0.00017940974605353465, "loss": 0.0492, "step": 9300 }, { "epoch": 4.263565891472869, "grad_norm": 0.5703125, "learning_rate": 0.00017929535575383207, "loss": 0.0526, "step": 9350 }, { "epoch": 4.286365709074327, "grad_norm": 0.51953125, "learning_rate": 0.0001791809654541295, "loss": 0.0403, "step": 9400 }, { "epoch": 4.309165526675787, "grad_norm": 0.984375, "learning_rate": 0.0001790665751544269, "loss": 0.0415, "step": 9450 }, { "epoch": 4.331965344277246, "grad_norm": 0.7734375, "learning_rate": 0.00017895218485472435, "loss": 0.04, "step": 9500 }, { "epoch": 4.354765161878705, "grad_norm": 0.349609375, "learning_rate": 0.00017883779455502173, "loss": 0.0461, "step": 9550 }, { "epoch": 4.377564979480164, "grad_norm": 0.7890625, "learning_rate": 0.00017872340425531915, "loss": 0.0499, "step": 9600 }, { "epoch": 4.4003647970816235, "grad_norm": 0.416015625, "learning_rate": 0.00017860901395561657, "loss": 0.038, "step": 9650 }, { "epoch": 4.423164614683083, "grad_norm": 0.84765625, "learning_rate": 0.00017849462365591398, "loss": 0.0383, "step": 9700 }, { "epoch": 4.4459644322845415, "grad_norm": 0.9765625, "learning_rate": 0.00017838023335621143, "loss": 0.0385, "step": 9750 }, { "epoch": 4.468764249886001, "grad_norm": 0.482421875, "learning_rate": 0.00017826584305650881, "loss": 0.0361, "step": 9800 }, { "epoch": 4.49156406748746, "grad_norm": 0.73046875, "learning_rate": 0.00017815145275680623, "loss": 0.0485, "step": 9850 }, { "epoch": 4.514363885088919, "grad_norm": 2.125, "learning_rate": 0.00017803706245710365, "loss": 0.0372, "step": 9900 }, { "epoch": 4.537163702690378, "grad_norm": 0.451171875, "learning_rate": 0.00017792267215740106, "loss": 0.0433, "step": 9950 }, { "epoch": 4.559963520291838, "grad_norm": 0.63671875, "learning_rate": 0.00017780828185769848, "loss": 0.0337, "step": 10000 }, { "epoch": 4.582763337893297, "grad_norm": 1.375, "learning_rate": 0.0001776938915579959, "loss": 0.0392, "step": 10050 }, { "epoch": 4.605563155494756, "grad_norm": 0.8359375, "learning_rate": 0.0001775795012582933, "loss": 0.0404, "step": 10100 }, { "epoch": 4.628362973096215, "grad_norm": 0.65234375, "learning_rate": 0.00017746511095859073, "loss": 0.0312, "step": 10150 }, { "epoch": 4.651162790697675, "grad_norm": 0.609375, "learning_rate": 0.00017735072065888814, "loss": 0.0398, "step": 10200 }, { "epoch": 4.673962608299133, "grad_norm": 0.45703125, "learning_rate": 0.00017723633035918553, "loss": 0.0386, "step": 10250 }, { "epoch": 4.696762425900593, "grad_norm": 0.3828125, "learning_rate": 0.00017712194005948298, "loss": 0.0343, "step": 10300 }, { "epoch": 4.719562243502052, "grad_norm": 0.921875, "learning_rate": 0.0001770075497597804, "loss": 0.0472, "step": 10350 }, { "epoch": 4.7423620611035116, "grad_norm": 0.8984375, "learning_rate": 0.00017689315946007778, "loss": 0.042, "step": 10400 }, { "epoch": 4.76516187870497, "grad_norm": 1.78125, "learning_rate": 0.00017677876916037522, "loss": 0.0412, "step": 10450 }, { "epoch": 4.7879616963064295, "grad_norm": 0.5234375, "learning_rate": 0.0001766643788606726, "loss": 0.0432, "step": 10500 }, { "epoch": 4.810761513907889, "grad_norm": 0.2265625, "learning_rate": 0.00017654998856097006, "loss": 0.0525, "step": 10550 }, { "epoch": 4.8335613315093475, "grad_norm": 0.89453125, "learning_rate": 0.00017643559826126744, "loss": 0.0395, "step": 10600 }, { "epoch": 4.856361149110807, "grad_norm": 1.0859375, "learning_rate": 0.00017632120796156486, "loss": 0.0387, "step": 10650 }, { "epoch": 4.879160966712266, "grad_norm": 0.8671875, "learning_rate": 0.00017620681766186228, "loss": 0.041, "step": 10700 }, { "epoch": 4.901960784313726, "grad_norm": 1.640625, "learning_rate": 0.0001760924273621597, "loss": 0.038, "step": 10750 }, { "epoch": 4.924760601915184, "grad_norm": 1.3125, "learning_rate": 0.0001759780370624571, "loss": 0.0448, "step": 10800 }, { "epoch": 4.947560419516644, "grad_norm": 1.3046875, "learning_rate": 0.00017586364676275452, "loss": 0.0417, "step": 10850 }, { "epoch": 4.970360237118103, "grad_norm": 0.423828125, "learning_rate": 0.00017574925646305194, "loss": 0.0393, "step": 10900 }, { "epoch": 4.993160054719562, "grad_norm": 1.1171875, "learning_rate": 0.00017563486616334936, "loss": 0.0353, "step": 10950 }, { "epoch": 5.015959872321021, "grad_norm": 0.421875, "learning_rate": 0.00017552047586364677, "loss": 0.0376, "step": 11000 }, { "epoch": 5.038759689922481, "grad_norm": 1.828125, "learning_rate": 0.0001754060855639442, "loss": 0.0388, "step": 11050 }, { "epoch": 5.06155950752394, "grad_norm": 0.32421875, "learning_rate": 0.0001752916952642416, "loss": 0.0318, "step": 11100 }, { "epoch": 5.084359325125399, "grad_norm": 1.0859375, "learning_rate": 0.00017517730496453902, "loss": 0.0251, "step": 11150 }, { "epoch": 5.107159142726858, "grad_norm": 0.373046875, "learning_rate": 0.0001750629146648364, "loss": 0.0443, "step": 11200 }, { "epoch": 5.1299589603283176, "grad_norm": 0.51171875, "learning_rate": 0.00017494852436513385, "loss": 0.0285, "step": 11250 }, { "epoch": 5.152758777929777, "grad_norm": 0.75, "learning_rate": 0.00017483413406543124, "loss": 0.0319, "step": 11300 }, { "epoch": 5.1755585955312355, "grad_norm": 0.375, "learning_rate": 0.00017471974376572869, "loss": 0.0292, "step": 11350 }, { "epoch": 5.198358413132695, "grad_norm": 0.28515625, "learning_rate": 0.0001746053534660261, "loss": 0.0281, "step": 11400 }, { "epoch": 5.221158230734154, "grad_norm": 0.361328125, "learning_rate": 0.0001744909631663235, "loss": 0.029, "step": 11450 }, { "epoch": 5.243958048335613, "grad_norm": 0.265625, "learning_rate": 0.00017437657286662093, "loss": 0.0382, "step": 11500 }, { "epoch": 5.266757865937072, "grad_norm": 0.2177734375, "learning_rate": 0.00017426218256691832, "loss": 0.0328, "step": 11550 }, { "epoch": 5.289557683538532, "grad_norm": 1.6015625, "learning_rate": 0.00017414779226721577, "loss": 0.03, "step": 11600 }, { "epoch": 5.312357501139991, "grad_norm": 1.109375, "learning_rate": 0.00017403340196751315, "loss": 0.0251, "step": 11650 }, { "epoch": 5.33515731874145, "grad_norm": 0.70703125, "learning_rate": 0.00017391901166781057, "loss": 0.0267, "step": 11700 }, { "epoch": 5.357957136342909, "grad_norm": 0.6171875, "learning_rate": 0.000173804621368108, "loss": 0.0357, "step": 11750 }, { "epoch": 5.380756953944369, "grad_norm": 1.6015625, "learning_rate": 0.0001736902310684054, "loss": 0.035, "step": 11800 }, { "epoch": 5.403556771545827, "grad_norm": 0.5390625, "learning_rate": 0.00017357584076870282, "loss": 0.0235, "step": 11850 }, { "epoch": 5.426356589147287, "grad_norm": 0.58984375, "learning_rate": 0.00017346145046900023, "loss": 0.0277, "step": 11900 }, { "epoch": 5.449156406748746, "grad_norm": 0.78515625, "learning_rate": 0.00017334706016929765, "loss": 0.0294, "step": 11950 }, { "epoch": 5.471956224350206, "grad_norm": 0.859375, "learning_rate": 0.00017323266986959507, "loss": 0.0245, "step": 12000 } ], "logging_steps": 50, "max_steps": 87720, "num_input_tokens_seen": 0, "num_train_epochs": 40, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }