|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 7.934336525307797, |
|
"eval_steps": 500, |
|
"global_step": 17400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.022799817601459188, |
|
"grad_norm": 2.5625, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 4.9225, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.045599635202918376, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 2.1478, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06839945280437756, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6361, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09119927040583675, |
|
"grad_norm": 1.46875, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.467, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11399908800729594, |
|
"grad_norm": 2.15625, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.4381, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.13679890560875513, |
|
"grad_norm": 1.8203125, |
|
"learning_rate": 0.0002, |
|
"loss": 0.3365, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.15959872321021432, |
|
"grad_norm": 1.4765625, |
|
"learning_rate": 0.00019988560970029743, |
|
"loss": 0.3255, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1823985408116735, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.00019977121940059484, |
|
"loss": 0.2946, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2051983584131327, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 0.00019965682910089226, |
|
"loss": 0.3023, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.22799817601459188, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00019954243880118967, |
|
"loss": 0.2708, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2507979936160511, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.0001994280485014871, |
|
"loss": 0.3645, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.27359781121751026, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.00019931365820178448, |
|
"loss": 0.3072, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.29639762881896947, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.00019919926790208192, |
|
"loss": 0.2891, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.31919744642042863, |
|
"grad_norm": 2.015625, |
|
"learning_rate": 0.00019908487760237934, |
|
"loss": 0.2763, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.34199726402188785, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.00019897048730267673, |
|
"loss": 0.2824, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.364797081623347, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.00019885609700297417, |
|
"loss": 0.3007, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.3875968992248062, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.00019874170670327156, |
|
"loss": 0.2955, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.4103967168262654, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.000198627316403569, |
|
"loss": 0.2385, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.4331965344277246, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 0.0001985129261038664, |
|
"loss": 0.2461, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.45599635202918376, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.0001983985358041638, |
|
"loss": 0.2567, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.478796169630643, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.00019828414550446125, |
|
"loss": 0.2189, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5015959872321022, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00019816975520475864, |
|
"loss": 0.272, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5243958048335613, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 0.00019805536490505606, |
|
"loss": 0.2615, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.5471956224350205, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00019794097460535347, |
|
"loss": 0.2476, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5699954400364797, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.0001978265843056509, |
|
"loss": 0.2327, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.5927952576379389, |
|
"grad_norm": 1.1953125, |
|
"learning_rate": 0.0001977121940059483, |
|
"loss": 0.2664, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.615595075239398, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00019759780370624572, |
|
"loss": 0.2266, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.6383948928408573, |
|
"grad_norm": 1.2265625, |
|
"learning_rate": 0.00019748341340654314, |
|
"loss": 0.2234, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.6611947104423165, |
|
"grad_norm": 1.1875, |
|
"learning_rate": 0.00019736902310684055, |
|
"loss": 0.2463, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.6839945280437757, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00019725463280713797, |
|
"loss": 0.2311, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7067943456452348, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00019714024250743536, |
|
"loss": 0.2251, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.729594163246694, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0001970258522077328, |
|
"loss": 0.2492, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.7523939808481532, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.00019691146190803022, |
|
"loss": 0.2554, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.7751937984496124, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00019679707160832763, |
|
"loss": 0.2384, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7979936160510716, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00019668268130862505, |
|
"loss": 0.2583, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.8207934336525308, |
|
"grad_norm": 0.625, |
|
"learning_rate": 0.00019656829100892244, |
|
"loss": 0.2343, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.84359325125399, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 0.00019645390070921988, |
|
"loss": 0.2193, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.8663930688554492, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00019633951040951727, |
|
"loss": 0.2212, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.8891928864569083, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 0.00019622512010981468, |
|
"loss": 0.2178, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.9119927040583675, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00019611072981011213, |
|
"loss": 0.2135, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9347925216598267, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00019599633951040952, |
|
"loss": 0.2522, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.957592339261286, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00019588194921070696, |
|
"loss": 0.2138, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.9803921568627451, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00019576755891100435, |
|
"loss": 0.2043, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.0031919744642044, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 0.00019565316861130177, |
|
"loss": 0.1855, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.0259917920656634, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 0.00019553877831159918, |
|
"loss": 0.2099, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.0487916096671226, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.0001954243880118966, |
|
"loss": 0.2032, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.0715914272685818, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 0.000195309997712194, |
|
"loss": 0.1753, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.094391244870041, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 0.00019519560741249143, |
|
"loss": 0.1997, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.1171910624715002, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.00019508121711278885, |
|
"loss": 0.2101, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.1399908800729595, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00019496682681308626, |
|
"loss": 0.1734, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.1627906976744187, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00019485243651338368, |
|
"loss": 0.1716, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.1855905152758779, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.0001947380462136811, |
|
"loss": 0.1588, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.2083903328773369, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.0001946236559139785, |
|
"loss": 0.166, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.231190150478796, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.00019450926561427593, |
|
"loss": 0.1514, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.2539899680802553, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00019439487531457334, |
|
"loss": 0.2103, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.2767897856817145, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.00019428048501487076, |
|
"loss": 0.185, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.2995896032831737, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00019416609471516815, |
|
"loss": 0.1498, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.322389420884633, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.0001940517044154656, |
|
"loss": 0.1706, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.3451892384860922, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.00019393731411576298, |
|
"loss": 0.1579, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.3679890560875512, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.0001938229238160604, |
|
"loss": 0.1777, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.3907888736890106, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.00019370853351635784, |
|
"loss": 0.168, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.4135886912904696, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.00019359414321665523, |
|
"loss": 0.1325, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.4363885088919288, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00019347975291695267, |
|
"loss": 0.1483, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.459188326493388, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 0.00019336536261725006, |
|
"loss": 0.1424, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.4819881440948472, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 0.00019325097231754748, |
|
"loss": 0.1352, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.5047879616963065, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.0001931365820178449, |
|
"loss": 0.1583, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.5275877792977655, |
|
"grad_norm": 1.40625, |
|
"learning_rate": 0.0001930221917181423, |
|
"loss": 0.1496, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.550387596899225, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 0.00019290780141843972, |
|
"loss": 0.1551, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.573187414500684, |
|
"grad_norm": 0.59375, |
|
"learning_rate": 0.00019279341111873714, |
|
"loss": 0.1275, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.5959872321021433, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.00019267902081903456, |
|
"loss": 0.1685, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.6187870497036023, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00019256463051933197, |
|
"loss": 0.1271, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.6415868673050615, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.0001924502402196294, |
|
"loss": 0.1344, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.6643866849065208, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0001923358499199268, |
|
"loss": 0.1516, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.68718650250798, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.00019222145962022422, |
|
"loss": 0.1379, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.7099863201094392, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00019210706932052164, |
|
"loss": 0.1396, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.7327861377108982, |
|
"grad_norm": 1.25, |
|
"learning_rate": 0.00019199267902081902, |
|
"loss": 0.1544, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.7555859553123576, |
|
"grad_norm": 1.390625, |
|
"learning_rate": 0.00019187828872111647, |
|
"loss": 0.1639, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.7783857729138166, |
|
"grad_norm": 1.734375, |
|
"learning_rate": 0.00019176389842141386, |
|
"loss": 0.1473, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.8011855905152758, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.0001916495081217113, |
|
"loss": 0.1606, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.823985408116735, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00019153511782200872, |
|
"loss": 0.155, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.8467852257181943, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.0001914207275223061, |
|
"loss": 0.1222, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.8695850433196535, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.00019130633722260355, |
|
"loss": 0.1476, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.8923848609211125, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00019119194692290094, |
|
"loss": 0.1358, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.915184678522572, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00019107755662319838, |
|
"loss": 0.1439, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.937984496124031, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00019096316632349577, |
|
"loss": 0.1699, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.9607843137254903, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 0.00019084877602379319, |
|
"loss": 0.1364, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.9835841313269493, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.0001907343857240906, |
|
"loss": 0.1242, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 2.0063839489284088, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.00019061999542438802, |
|
"loss": 0.1146, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 2.0291837665298678, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00019050560512468543, |
|
"loss": 0.1327, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 2.0519835841313268, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 0.00019039121482498285, |
|
"loss": 0.1278, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.074783401732786, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.00019027682452528027, |
|
"loss": 0.1069, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 2.097583219334245, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00019016243422557768, |
|
"loss": 0.1351, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 2.1203830369357046, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.0001900480439258751, |
|
"loss": 0.1372, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 2.1431828545371636, |
|
"grad_norm": 1.4140625, |
|
"learning_rate": 0.0001899336536261725, |
|
"loss": 0.1093, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 2.165982672138623, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00018981926332646993, |
|
"loss": 0.1, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 2.188782489740082, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00018970487302676735, |
|
"loss": 0.1016, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 2.2115823073415415, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00018959048272706473, |
|
"loss": 0.0976, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 2.2343821249430005, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.00018947609242736218, |
|
"loss": 0.1132, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.2571819425444595, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.00018936170212765957, |
|
"loss": 0.1174, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 2.279981760145919, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.000189247311827957, |
|
"loss": 0.1191, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.302781577747378, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00018913292152825443, |
|
"loss": 0.0933, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 2.3255813953488373, |
|
"grad_norm": 1.3984375, |
|
"learning_rate": 0.00018901853122855181, |
|
"loss": 0.1065, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.3483812129502963, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00018890414092884926, |
|
"loss": 0.0993, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 2.3711810305517558, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00018878975062914665, |
|
"loss": 0.1179, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.3939808481532148, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.00018867536032944406, |
|
"loss": 0.0982, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 2.4167806657546738, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00018856097002974148, |
|
"loss": 0.0831, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.439580483356133, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 0.0001884465797300389, |
|
"loss": 0.0943, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 2.462380300957592, |
|
"grad_norm": 1.484375, |
|
"learning_rate": 0.00018833218943033634, |
|
"loss": 0.088, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.4851801185590516, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.00018821779913063373, |
|
"loss": 0.0952, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 2.5079799361605106, |
|
"grad_norm": 1.7421875, |
|
"learning_rate": 0.00018810340883093114, |
|
"loss": 0.0952, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.53077975376197, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00018798901853122856, |
|
"loss": 0.0901, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 2.553579571363429, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00018787462823152598, |
|
"loss": 0.0989, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.576379388964888, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0001877602379318234, |
|
"loss": 0.0794, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 2.5991792065663475, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 0.0001876458476321208, |
|
"loss": 0.1076, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.621979024167807, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00018753145733241822, |
|
"loss": 0.0745, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 2.644778841769266, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00018741706703271564, |
|
"loss": 0.0938, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.667578659370725, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00018730267673301306, |
|
"loss": 0.0888, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 2.6903784769721844, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00018718828643331044, |
|
"loss": 0.0836, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.7131782945736433, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.0001870738961336079, |
|
"loss": 0.0821, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 2.7359781121751023, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.0001869595058339053, |
|
"loss": 0.1043, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.7587779297765618, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00018684511553420272, |
|
"loss": 0.1005, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 2.781577747378021, |
|
"grad_norm": 2.3125, |
|
"learning_rate": 0.00018673072523450014, |
|
"loss": 0.1028, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.80437756497948, |
|
"grad_norm": 1.9453125, |
|
"learning_rate": 0.00018661633493479752, |
|
"loss": 0.1046, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 2.827177382580939, |
|
"grad_norm": 0.875, |
|
"learning_rate": 0.00018650194463509497, |
|
"loss": 0.0964, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.8499772001823986, |
|
"grad_norm": 1.421875, |
|
"learning_rate": 0.00018638755433539236, |
|
"loss": 0.0775, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 2.8727770177838576, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00018627316403568977, |
|
"loss": 0.0963, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.895576835385317, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.0001861587737359872, |
|
"loss": 0.0903, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 2.918376652986776, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 0.0001860443834362846, |
|
"loss": 0.0914, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.9411764705882355, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00018592999313658205, |
|
"loss": 0.108, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 2.9639762881896945, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00018581560283687944, |
|
"loss": 0.0876, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.9867761057911535, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00018570121253717685, |
|
"loss": 0.0834, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 3.009575923392613, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00018558682223747427, |
|
"loss": 0.0786, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 3.032375740994072, |
|
"grad_norm": 1.2109375, |
|
"learning_rate": 0.00018547243193777169, |
|
"loss": 0.0803, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 3.0551755585955314, |
|
"grad_norm": 1.59375, |
|
"learning_rate": 0.0001853580416380691, |
|
"loss": 0.0846, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 3.0779753761969904, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00018524365133836652, |
|
"loss": 0.0562, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 3.10077519379845, |
|
"grad_norm": 1.9765625, |
|
"learning_rate": 0.00018512926103866393, |
|
"loss": 0.0934, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 3.123575011399909, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.00018501487073896135, |
|
"loss": 0.0813, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 3.146374829001368, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00018490048043925877, |
|
"loss": 0.0661, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 3.169174646602827, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.00018478609013955615, |
|
"loss": 0.0589, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 3.191974464204286, |
|
"grad_norm": 1.4609375, |
|
"learning_rate": 0.0001846716998398536, |
|
"loss": 0.0679, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.2147742818057456, |
|
"grad_norm": 1.515625, |
|
"learning_rate": 0.000184557309540151, |
|
"loss": 0.0564, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 3.2375740994072046, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.0001844429192404484, |
|
"loss": 0.0763, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 3.260373917008664, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 0.00018432852894074585, |
|
"loss": 0.0768, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 3.283173734610123, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00018421413864104323, |
|
"loss": 0.0686, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 3.305973552211582, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00018409974834134068, |
|
"loss": 0.0638, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 3.3287733698130415, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00018398535804163807, |
|
"loss": 0.0643, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 3.3515731874145005, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00018387096774193548, |
|
"loss": 0.067, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 3.37437300501596, |
|
"grad_norm": 1.28125, |
|
"learning_rate": 0.00018375657744223293, |
|
"loss": 0.0756, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.397172822617419, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00018364218714253031, |
|
"loss": 0.0579, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 3.4199726402188784, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00018352779684282773, |
|
"loss": 0.0533, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.4427724578203374, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00018341340654312515, |
|
"loss": 0.0569, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 3.465572275421797, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.00018329901624342256, |
|
"loss": 0.0593, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.488372093023256, |
|
"grad_norm": 1.859375, |
|
"learning_rate": 0.00018318462594371998, |
|
"loss": 0.0664, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 3.5111719106247152, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.0001830702356440174, |
|
"loss": 0.0593, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.5339717282261742, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0001829558453443148, |
|
"loss": 0.063, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 3.556771545827633, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.00018284145504461223, |
|
"loss": 0.0553, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.5795713634290927, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00018272706474490964, |
|
"loss": 0.0513, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 3.6023711810305517, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00018261267444520706, |
|
"loss": 0.0678, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.625170998632011, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.00018249828414550448, |
|
"loss": 0.0446, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 3.64797081623347, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0001823838938458019, |
|
"loss": 0.0588, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.6707706338349295, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0001822695035460993, |
|
"loss": 0.0564, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 3.6935704514363885, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00018215511324639672, |
|
"loss": 0.0526, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.7163702690378475, |
|
"grad_norm": 1.765625, |
|
"learning_rate": 0.0001820407229466941, |
|
"loss": 0.0609, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 3.739170086639307, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 0.00018192633264699156, |
|
"loss": 0.0679, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.761969904240766, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.00018181194234728894, |
|
"loss": 0.0638, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 3.7847697218422254, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.0001816975520475864, |
|
"loss": 0.0647, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.8075695394436844, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 0.00018158316174788378, |
|
"loss": 0.0747, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 3.830369357045144, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.0001814687714481812, |
|
"loss": 0.0586, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.853169174646603, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.00018135438114847864, |
|
"loss": 0.0544, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 3.875968992248062, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00018123999084877602, |
|
"loss": 0.0644, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.8987688098495212, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00018112560054907344, |
|
"loss": 0.0559, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 3.9215686274509802, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.00018101121024937086, |
|
"loss": 0.0608, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 3.9443684450524397, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 0.00018089681994966827, |
|
"loss": 0.0688, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 3.9671682626538987, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0001807824296499657, |
|
"loss": 0.0614, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 3.989968080255358, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.0001806680393502631, |
|
"loss": 0.0515, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 4.0127678978568175, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.00018055364905056052, |
|
"loss": 0.0527, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 4.035567715458276, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 0.00018043925875085794, |
|
"loss": 0.0567, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 4.0583675330597355, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 0.00018032486845115535, |
|
"loss": 0.0501, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 4.081167350661195, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00018021047815145274, |
|
"loss": 0.0329, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 4.1039671682626535, |
|
"grad_norm": 1.453125, |
|
"learning_rate": 0.00018009608785175019, |
|
"loss": 0.0615, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.126766985864113, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001799816975520476, |
|
"loss": 0.0471, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 4.149566803465572, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00017986730725234502, |
|
"loss": 0.0467, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 4.172366621067032, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00017975291695264243, |
|
"loss": 0.0381, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 4.19516643866849, |
|
"grad_norm": 1.2890625, |
|
"learning_rate": 0.00017963852665293982, |
|
"loss": 0.0414, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 4.21796625626995, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00017952413635323727, |
|
"loss": 0.0412, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 4.240766073871409, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 0.00017940974605353465, |
|
"loss": 0.0492, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 4.263565891472869, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00017929535575383207, |
|
"loss": 0.0526, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 4.286365709074327, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.0001791809654541295, |
|
"loss": 0.0403, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 4.309165526675787, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 0.0001790665751544269, |
|
"loss": 0.0415, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 4.331965344277246, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00017895218485472435, |
|
"loss": 0.04, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.354765161878705, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00017883779455502173, |
|
"loss": 0.0461, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 4.377564979480164, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 0.00017872340425531915, |
|
"loss": 0.0499, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 4.4003647970816235, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00017860901395561657, |
|
"loss": 0.038, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 4.423164614683083, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00017849462365591398, |
|
"loss": 0.0383, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 4.4459644322845415, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 0.00017838023335621143, |
|
"loss": 0.0385, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 4.468764249886001, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00017826584305650881, |
|
"loss": 0.0361, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 4.49156406748746, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00017815145275680623, |
|
"loss": 0.0485, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 4.514363885088919, |
|
"grad_norm": 2.125, |
|
"learning_rate": 0.00017803706245710365, |
|
"loss": 0.0372, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 4.537163702690378, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00017792267215740106, |
|
"loss": 0.0433, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 4.559963520291838, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00017780828185769848, |
|
"loss": 0.0337, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.582763337893297, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.0001776938915579959, |
|
"loss": 0.0392, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 4.605563155494756, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 0.0001775795012582933, |
|
"loss": 0.0404, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 4.628362973096215, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00017746511095859073, |
|
"loss": 0.0312, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 4.651162790697675, |
|
"grad_norm": 0.609375, |
|
"learning_rate": 0.00017735072065888814, |
|
"loss": 0.0398, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 4.673962608299133, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00017723633035918553, |
|
"loss": 0.0386, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 4.696762425900593, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.00017712194005948298, |
|
"loss": 0.0343, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 4.719562243502052, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.0001770075497597804, |
|
"loss": 0.0472, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 4.7423620611035116, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00017689315946007778, |
|
"loss": 0.042, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 4.76516187870497, |
|
"grad_norm": 1.78125, |
|
"learning_rate": 0.00017677876916037522, |
|
"loss": 0.0412, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 4.7879616963064295, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001766643788606726, |
|
"loss": 0.0432, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.810761513907889, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.00017654998856097006, |
|
"loss": 0.0525, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 4.8335613315093475, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00017643559826126744, |
|
"loss": 0.0395, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 4.856361149110807, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00017632120796156486, |
|
"loss": 0.0387, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 4.879160966712266, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.00017620681766186228, |
|
"loss": 0.041, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 4.901960784313726, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.0001760924273621597, |
|
"loss": 0.038, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 4.924760601915184, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.0001759780370624571, |
|
"loss": 0.0448, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 4.947560419516644, |
|
"grad_norm": 1.3046875, |
|
"learning_rate": 0.00017586364676275452, |
|
"loss": 0.0417, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 4.970360237118103, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00017574925646305194, |
|
"loss": 0.0393, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 4.993160054719562, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00017563486616334936, |
|
"loss": 0.0353, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 5.015959872321021, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00017552047586364677, |
|
"loss": 0.0376, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.038759689922481, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.0001754060855639442, |
|
"loss": 0.0388, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 5.06155950752394, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001752916952642416, |
|
"loss": 0.0318, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 5.084359325125399, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00017517730496453902, |
|
"loss": 0.0251, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 5.107159142726858, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0001750629146648364, |
|
"loss": 0.0443, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 5.1299589603283176, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.00017494852436513385, |
|
"loss": 0.0285, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 5.152758777929777, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00017483413406543124, |
|
"loss": 0.0319, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 5.1755585955312355, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00017471974376572869, |
|
"loss": 0.0292, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 5.198358413132695, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0001746053534660261, |
|
"loss": 0.0281, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 5.221158230734154, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0001744909631663235, |
|
"loss": 0.029, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 5.243958048335613, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00017437657286662093, |
|
"loss": 0.0382, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 5.266757865937072, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.00017426218256691832, |
|
"loss": 0.0328, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 5.289557683538532, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 0.00017414779226721577, |
|
"loss": 0.03, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 5.312357501139991, |
|
"grad_norm": 1.109375, |
|
"learning_rate": 0.00017403340196751315, |
|
"loss": 0.0251, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 5.33515731874145, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00017391901166781057, |
|
"loss": 0.0267, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 5.357957136342909, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.000173804621368108, |
|
"loss": 0.0357, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 5.380756953944369, |
|
"grad_norm": 1.6015625, |
|
"learning_rate": 0.0001736902310684054, |
|
"loss": 0.035, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 5.403556771545827, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00017357584076870282, |
|
"loss": 0.0235, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 5.426356589147287, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00017346145046900023, |
|
"loss": 0.0277, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 5.449156406748746, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 0.00017334706016929765, |
|
"loss": 0.0294, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 5.471956224350206, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 0.00017323266986959507, |
|
"loss": 0.0245, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 5.494756041951664, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.00017311827956989248, |
|
"loss": 0.0343, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 5.5175558595531236, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0001730038892701899, |
|
"loss": 0.0279, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 5.540355677154583, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00017288949897048731, |
|
"loss": 0.0322, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 5.563155494756042, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00017277510867078473, |
|
"loss": 0.0233, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 5.585955312357501, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00017266071837108212, |
|
"loss": 0.0303, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 5.60875512995896, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.00017254632807137956, |
|
"loss": 0.026, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 5.63155494756042, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00017243193777167698, |
|
"loss": 0.022, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 5.654354765161878, |
|
"grad_norm": 0.5546875, |
|
"learning_rate": 0.0001723175474719744, |
|
"loss": 0.0304, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 5.677154582763338, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.0001722031571722718, |
|
"loss": 0.0294, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 5.699954400364797, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.0001720887668725692, |
|
"loss": 0.025, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 5.722754217966257, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00017197437657286664, |
|
"loss": 0.0307, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 5.745554035567715, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00017185998627316403, |
|
"loss": 0.0306, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 5.768353853169175, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00017174559597346145, |
|
"loss": 0.0329, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 5.791153670770634, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00017163120567375886, |
|
"loss": 0.0273, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 5.813953488372093, |
|
"grad_norm": 0.55078125, |
|
"learning_rate": 0.00017151681537405628, |
|
"loss": 0.0399, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 5.836753305973552, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.00017140242507435372, |
|
"loss": 0.0268, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 5.859553123575012, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001712880347746511, |
|
"loss": 0.0308, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 5.882352941176471, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.00017117364447494853, |
|
"loss": 0.0286, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 5.9051527587779296, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.00017105925417524594, |
|
"loss": 0.0288, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 5.927952576379389, |
|
"grad_norm": 2.4375, |
|
"learning_rate": 0.00017094486387554336, |
|
"loss": 0.036, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 5.950752393980848, |
|
"grad_norm": 1.953125, |
|
"learning_rate": 0.00017083047357584078, |
|
"loss": 0.0276, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 5.973552211582307, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0001707160832761382, |
|
"loss": 0.0259, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 5.996352029183766, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.0001706016929764356, |
|
"loss": 0.0262, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 6.019151846785226, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.00017048730267673302, |
|
"loss": 0.0287, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 6.041951664386685, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00017037291237703044, |
|
"loss": 0.0318, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 6.064751481988144, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00017025852207732783, |
|
"loss": 0.019, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 6.087551299589603, |
|
"grad_norm": 1.84375, |
|
"learning_rate": 0.00017014413177762527, |
|
"loss": 0.0233, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 6.110351117191063, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.0001700297414779227, |
|
"loss": 0.0283, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 6.133150934792521, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.0001699153511782201, |
|
"loss": 0.0206, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 6.155950752393981, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00016980096087851752, |
|
"loss": 0.0228, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 6.17875056999544, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 0.0001696865705788149, |
|
"loss": 0.0238, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 6.2015503875969, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 0.00016957218027911235, |
|
"loss": 0.0209, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 6.224350205198358, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00016945778997940974, |
|
"loss": 0.0216, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 6.247150022799818, |
|
"grad_norm": 1.3515625, |
|
"learning_rate": 0.00016934339967970716, |
|
"loss": 0.0306, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 6.269949840401277, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.0001692290093800046, |
|
"loss": 0.0229, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 6.292749658002736, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.000169114619080302, |
|
"loss": 0.0214, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 6.315549475604195, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.00016900022878059943, |
|
"loss": 0.0199, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 6.338349293205654, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00016888583848089682, |
|
"loss": 0.0212, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 6.361149110807114, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00016877144818119424, |
|
"loss": 0.0264, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 6.383948928408572, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00016865705788149165, |
|
"loss": 0.0236, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 6.406748746010032, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00016854266758178907, |
|
"loss": 0.0189, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 6.429548563611491, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.0001684282772820865, |
|
"loss": 0.0223, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 6.45234838121295, |
|
"grad_norm": 0.06884765625, |
|
"learning_rate": 0.0001683138869823839, |
|
"loss": 0.0196, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 6.475148198814409, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00016819949668268132, |
|
"loss": 0.0193, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 6.497948016415869, |
|
"grad_norm": 1.171875, |
|
"learning_rate": 0.00016808510638297873, |
|
"loss": 0.0266, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 6.520747834017328, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00016797071608327615, |
|
"loss": 0.0222, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 6.543547651618787, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.00016785632578357357, |
|
"loss": 0.0238, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 6.566347469220246, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.00016774193548387098, |
|
"loss": 0.0171, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 6.589147286821706, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0001676275451841684, |
|
"loss": 0.026, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 6.611947104423164, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001675131548844658, |
|
"loss": 0.0182, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 6.634746922024624, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00016739876458476323, |
|
"loss": 0.02, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 6.657546739626083, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00016728437428506062, |
|
"loss": 0.0227, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 6.6803465572275424, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00016716998398535806, |
|
"loss": 0.0216, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 6.703146374829001, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 0.00016705559368565545, |
|
"loss": 0.0196, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 6.72594619243046, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00016694120338595287, |
|
"loss": 0.0227, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 6.74874601003192, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0001668268130862503, |
|
"loss": 0.024, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 6.771545827633379, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.0001667124227865477, |
|
"loss": 0.0244, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 6.794345645234838, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.00016659803248684512, |
|
"loss": 0.025, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 6.817145462836297, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00016648364218714253, |
|
"loss": 0.0276, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 6.839945280437757, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00016636925188743995, |
|
"loss": 0.019, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 6.862745098039216, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00016625486158773736, |
|
"loss": 0.023, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 6.885544915640675, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.00016614047128803478, |
|
"loss": 0.0216, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 6.908344733242134, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001660260809883322, |
|
"loss": 0.0235, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 6.931144550843594, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0001659116906886296, |
|
"loss": 0.0281, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 6.953944368445052, |
|
"grad_norm": 0.5, |
|
"learning_rate": 0.00016579730038892703, |
|
"loss": 0.0213, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 6.976744186046512, |
|
"grad_norm": 0.431640625, |
|
"learning_rate": 0.00016568291008922444, |
|
"loss": 0.0207, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 6.999544003647971, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 0.00016556851978952186, |
|
"loss": 0.0197, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 7.02234382124943, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.00016545412948981928, |
|
"loss": 0.0218, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 7.045143638850889, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0001653397391901167, |
|
"loss": 0.0253, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 7.0679434564523484, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0001652253488904141, |
|
"loss": 0.0148, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 7.090743274053808, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 0.0001651109585907115, |
|
"loss": 0.0185, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 7.113543091655266, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00016499656829100894, |
|
"loss": 0.0203, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 7.136342909256726, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.00016488217799130633, |
|
"loss": 0.0178, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 7.159142726858185, |
|
"grad_norm": 2.453125, |
|
"learning_rate": 0.00016476778769160377, |
|
"loss": 0.0209, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 7.181942544459645, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.0001646533973919012, |
|
"loss": 0.0165, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 7.204742362061103, |
|
"grad_norm": 0.1787109375, |
|
"learning_rate": 0.00016453900709219858, |
|
"loss": 0.017, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 7.227542179662563, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00016442461679249602, |
|
"loss": 0.0156, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 7.250341997264022, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.0001643102264927934, |
|
"loss": 0.0258, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 7.273141814865481, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00016419583619309083, |
|
"loss": 0.0191, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 7.29594163246694, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00016408144589338824, |
|
"loss": 0.0158, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 7.3187414500684, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00016396705559368566, |
|
"loss": 0.0166, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 7.341541267669859, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.00016385266529398307, |
|
"loss": 0.0178, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 7.364341085271318, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0001637382749942805, |
|
"loss": 0.0216, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 7.387140902872777, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.0001636238846945779, |
|
"loss": 0.0199, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 7.4099407204742365, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.00016350949439487532, |
|
"loss": 0.0146, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 7.432740538075695, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00016339510409517274, |
|
"loss": 0.0193, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 7.4555403556771545, |
|
"grad_norm": 0.427734375, |
|
"learning_rate": 0.00016328071379547015, |
|
"loss": 0.0172, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 7.478340173278614, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00016316632349576757, |
|
"loss": 0.0154, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 7.501139990880073, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.000163051933196065, |
|
"loss": 0.0201, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 7.523939808481532, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0001629375428963624, |
|
"loss": 0.0187, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 7.546739626082991, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00016282315259665982, |
|
"loss": 0.02, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 7.569539443684451, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.0001627087622969572, |
|
"loss": 0.0152, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 7.592339261285909, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00016259437199725465, |
|
"loss": 0.02, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 7.615139078887369, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00016247998169755204, |
|
"loss": 0.0158, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 7.637938896488828, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00016236559139784946, |
|
"loss": 0.0182, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 7.660738714090288, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001622512010981469, |
|
"loss": 0.0176, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 7.683538531691746, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0001621368107984443, |
|
"loss": 0.0161, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 7.706338349293206, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00016202242049874173, |
|
"loss": 0.0172, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 7.729138166894665, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.00016190803019903912, |
|
"loss": 0.0189, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 7.751937984496124, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00016179363989933654, |
|
"loss": 0.0191, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 7.774737802097583, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00016167924959963395, |
|
"loss": 0.0193, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 7.7975376196990425, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00016156485929993137, |
|
"loss": 0.0213, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 7.820337437300502, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.0001614504690002288, |
|
"loss": 0.0204, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 7.8431372549019605, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.0001613360787005262, |
|
"loss": 0.0197, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 7.86593707250342, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00016122168840082362, |
|
"loss": 0.0164, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 7.888736890104879, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.00016110729810112103, |
|
"loss": 0.0157, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 7.911536707706338, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00016099290780141845, |
|
"loss": 0.0201, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 7.934336525307797, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.00016087851750171586, |
|
"loss": 0.0239, |
|
"step": 17400 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 87720, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 40, |
|
"save_steps": 200, |
|
"total_flos": 0.0, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|