{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.999630191191153, "eval_steps": 500, "global_step": 67600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07396176176916534, "grad_norm": 0.1740281581878662, "learning_rate": 9.92603550295858e-05, "loss": 0.1624, "step": 500 }, { "epoch": 0.1479235235383307, "grad_norm": 0.1718689501285553, "learning_rate": 9.85207100591716e-05, "loss": 0.1642, "step": 1000 }, { "epoch": 0.22188528530749602, "grad_norm": 0.2358703315258026, "learning_rate": 9.77810650887574e-05, "loss": 0.1646, "step": 1500 }, { "epoch": 0.2958470470766614, "grad_norm": 0.24712756276130676, "learning_rate": 9.70414201183432e-05, "loss": 0.1648, "step": 2000 }, { "epoch": 0.36980880884582673, "grad_norm": 0.1822899430990219, "learning_rate": 9.6301775147929e-05, "loss": 0.1648, "step": 2500 }, { "epoch": 0.44377057061499203, "grad_norm": 0.17361949384212494, "learning_rate": 9.55621301775148e-05, "loss": 0.1646, "step": 3000 }, { "epoch": 0.5177323323841574, "grad_norm": 0.23278675973415375, "learning_rate": 9.48224852071006e-05, "loss": 0.1645, "step": 3500 }, { "epoch": 0.5916940941533227, "grad_norm": 0.2091914713382721, "learning_rate": 9.408284023668639e-05, "loss": 0.1642, "step": 4000 }, { "epoch": 0.6656558559224881, "grad_norm": 0.1606203019618988, "learning_rate": 9.33431952662722e-05, "loss": 0.164, "step": 4500 }, { "epoch": 0.7396176176916535, "grad_norm": 0.20530381798744202, "learning_rate": 9.260355029585799e-05, "loss": 0.1639, "step": 5000 }, { "epoch": 0.8135793794608187, "grad_norm": 0.17460191249847412, "learning_rate": 9.186390532544379e-05, "loss": 0.164, "step": 5500 }, { "epoch": 0.8875411412299841, "grad_norm": 0.20624151825904846, "learning_rate": 9.112426035502959e-05, "loss": 0.1635, "step": 6000 }, { "epoch": 0.9615029029991494, "grad_norm": 0.1738990843296051, "learning_rate": 9.038461538461538e-05, "loss": 0.1638, "step": 6500 }, { "epoch": 0.9999630191191154, "eval_bleu": 69.5209, "eval_gen_len": 59.3135, "eval_loss": 0.15481509268283844, "eval_runtime": 92133.6666, "eval_samples_per_second": 21.467, "eval_steps_per_second": 0.084, "step": 6760 }, { "epoch": 1.0354646647683148, "grad_norm": 0.18864519894123077, "learning_rate": 8.96449704142012e-05, "loss": 0.1634, "step": 7000 }, { "epoch": 1.10942642653748, "grad_norm": 0.21531303226947784, "learning_rate": 8.8905325443787e-05, "loss": 0.1631, "step": 7500 }, { "epoch": 1.1833881883066455, "grad_norm": 0.19655725359916687, "learning_rate": 8.816568047337278e-05, "loss": 0.163, "step": 8000 }, { "epoch": 1.2573499500758107, "grad_norm": 0.14028051495552063, "learning_rate": 8.742603550295858e-05, "loss": 0.163, "step": 8500 }, { "epoch": 1.3313117118449762, "grad_norm": 0.17167729139328003, "learning_rate": 8.668639053254438e-05, "loss": 0.1629, "step": 9000 }, { "epoch": 1.4052734736141415, "grad_norm": 0.21067723631858826, "learning_rate": 8.594674556213019e-05, "loss": 0.1628, "step": 9500 }, { "epoch": 1.4792352353833067, "grad_norm": 0.16309915482997894, "learning_rate": 8.520710059171599e-05, "loss": 0.1625, "step": 10000 }, { "epoch": 1.5531969971524722, "grad_norm": 0.18199725449085236, "learning_rate": 8.446745562130178e-05, "loss": 0.1627, "step": 10500 }, { "epoch": 1.6271587589216376, "grad_norm": 0.23482349514961243, "learning_rate": 8.372781065088757e-05, "loss": 0.1624, "step": 11000 }, { "epoch": 1.7011205206908029, "grad_norm": 0.19855183362960815, "learning_rate": 8.298816568047337e-05, "loss": 0.1622, "step": 11500 }, { "epoch": 1.7750822824599681, "grad_norm": 0.2019995003938675, "learning_rate": 8.224852071005918e-05, "loss": 0.1623, "step": 12000 }, { "epoch": 1.8490440442291334, "grad_norm": 0.1451653093099594, "learning_rate": 8.150887573964498e-05, "loss": 0.1621, "step": 12500 }, { "epoch": 1.9230058059982988, "grad_norm": 0.21032656729221344, "learning_rate": 8.076923076923078e-05, "loss": 0.1619, "step": 13000 }, { "epoch": 1.9969675677674643, "grad_norm": 0.1798778474330902, "learning_rate": 8.002958579881658e-05, "loss": 0.1619, "step": 13500 }, { "epoch": 1.9999260382382307, "eval_bleu": 69.8274, "eval_gen_len": 59.3128, "eval_loss": 0.15445278584957123, "eval_runtime": 96422.3766, "eval_samples_per_second": 20.512, "eval_steps_per_second": 0.08, "step": 13520 }, { "epoch": 2.0709293295366296, "grad_norm": 0.1496940404176712, "learning_rate": 7.928994082840237e-05, "loss": 0.1612, "step": 14000 }, { "epoch": 2.144891091305795, "grad_norm": 0.1841261237859726, "learning_rate": 7.855029585798817e-05, "loss": 0.1614, "step": 14500 }, { "epoch": 2.21885285307496, "grad_norm": 0.1598207801580429, "learning_rate": 7.781065088757397e-05, "loss": 0.1612, "step": 15000 }, { "epoch": 2.2928146148441257, "grad_norm": 0.15474896132946014, "learning_rate": 7.707100591715977e-05, "loss": 0.1613, "step": 15500 }, { "epoch": 2.366776376613291, "grad_norm": 0.16358235478401184, "learning_rate": 7.633136094674557e-05, "loss": 0.1611, "step": 16000 }, { "epoch": 2.4407381383824562, "grad_norm": 0.13007846474647522, "learning_rate": 7.559171597633137e-05, "loss": 0.161, "step": 16500 }, { "epoch": 2.5146999001516215, "grad_norm": 0.22567081451416016, "learning_rate": 7.485207100591716e-05, "loss": 0.1609, "step": 17000 }, { "epoch": 2.5886616619207867, "grad_norm": 0.13961590826511383, "learning_rate": 7.411242603550296e-05, "loss": 0.1608, "step": 17500 }, { "epoch": 2.6626234236899524, "grad_norm": 0.2529994547367096, "learning_rate": 7.337278106508876e-05, "loss": 0.1609, "step": 18000 }, { "epoch": 2.7365851854591177, "grad_norm": 0.1591728776693344, "learning_rate": 7.263313609467456e-05, "loss": 0.1606, "step": 18500 }, { "epoch": 2.810546947228283, "grad_norm": 0.18489252030849457, "learning_rate": 7.189349112426036e-05, "loss": 0.1604, "step": 19000 }, { "epoch": 2.884508708997448, "grad_norm": 0.1314728558063507, "learning_rate": 7.115384615384616e-05, "loss": 0.1606, "step": 19500 }, { "epoch": 2.9584704707666134, "grad_norm": 0.20559369027614594, "learning_rate": 7.041420118343195e-05, "loss": 0.1603, "step": 20000 }, { "epoch": 2.999889057357346, "eval_bleu": 69.6744, "eval_gen_len": 59.31, "eval_loss": 0.154197096824646, "eval_runtime": 86276.4597, "eval_samples_per_second": 22.924, "eval_steps_per_second": 0.09, "step": 20280 }, { "epoch": 3.032432232535779, "grad_norm": 0.17542998492717743, "learning_rate": 6.967455621301775e-05, "loss": 0.1601, "step": 20500 }, { "epoch": 3.1063939943049443, "grad_norm": 0.1904236227273941, "learning_rate": 6.893491124260355e-05, "loss": 0.1597, "step": 21000 }, { "epoch": 3.1803557560741096, "grad_norm": 0.14015412330627441, "learning_rate": 6.819526627218935e-05, "loss": 0.1599, "step": 21500 }, { "epoch": 3.2543175178432753, "grad_norm": 0.1829299032688141, "learning_rate": 6.745562130177515e-05, "loss": 0.1598, "step": 22000 }, { "epoch": 3.3282792796124405, "grad_norm": 0.1960357278585434, "learning_rate": 6.671597633136095e-05, "loss": 0.1596, "step": 22500 }, { "epoch": 3.4022410413816058, "grad_norm": 0.11597836762666702, "learning_rate": 6.597633136094676e-05, "loss": 0.1596, "step": 23000 }, { "epoch": 3.476202803150771, "grad_norm": 0.16021938621997833, "learning_rate": 6.523668639053254e-05, "loss": 0.1594, "step": 23500 }, { "epoch": 3.5501645649199363, "grad_norm": 0.14418700337409973, "learning_rate": 6.449704142011834e-05, "loss": 0.1596, "step": 24000 }, { "epoch": 3.624126326689102, "grad_norm": 0.15580447018146515, "learning_rate": 6.375739644970414e-05, "loss": 0.1594, "step": 24500 }, { "epoch": 3.698088088458267, "grad_norm": 0.1460428386926651, "learning_rate": 6.301775147928994e-05, "loss": 0.1592, "step": 25000 }, { "epoch": 3.7720498502274324, "grad_norm": 0.16111800074577332, "learning_rate": 6.227810650887575e-05, "loss": 0.1591, "step": 25500 }, { "epoch": 3.8460116119965977, "grad_norm": 0.1610732525587082, "learning_rate": 6.153846153846155e-05, "loss": 0.159, "step": 26000 }, { "epoch": 3.919973373765763, "grad_norm": 0.13734981417655945, "learning_rate": 6.079881656804735e-05, "loss": 0.1591, "step": 26500 }, { "epoch": 3.9939351355349286, "grad_norm": 0.14462892711162567, "learning_rate": 6.005917159763313e-05, "loss": 0.159, "step": 27000 }, { "epoch": 4.0, "eval_bleu": 69.6287, "eval_gen_len": 59.3133, "eval_loss": 0.15394015610218048, "eval_runtime": 90243.9313, "eval_samples_per_second": 21.916, "eval_steps_per_second": 0.086, "step": 27041 }, { "epoch": 4.067896897304093, "grad_norm": 0.19975131750106812, "learning_rate": 5.931952662721894e-05, "loss": 0.1587, "step": 27500 }, { "epoch": 4.141858659073259, "grad_norm": 0.16285482048988342, "learning_rate": 5.8579881656804736e-05, "loss": 0.1586, "step": 28000 }, { "epoch": 4.215820420842425, "grad_norm": 0.18427428603172302, "learning_rate": 5.7840236686390534e-05, "loss": 0.1585, "step": 28500 }, { "epoch": 4.28978218261159, "grad_norm": 0.25297749042510986, "learning_rate": 5.710059171597634e-05, "loss": 0.1584, "step": 29000 }, { "epoch": 4.363743944380755, "grad_norm": 0.15325430035591125, "learning_rate": 5.636094674556214e-05, "loss": 0.1584, "step": 29500 }, { "epoch": 4.43770570614992, "grad_norm": 0.15429024398326874, "learning_rate": 5.562130177514793e-05, "loss": 0.1584, "step": 30000 }, { "epoch": 4.511667467919086, "grad_norm": 0.15342459082603455, "learning_rate": 5.488165680473373e-05, "loss": 0.1583, "step": 30500 }, { "epoch": 4.5856292296882515, "grad_norm": 0.2258235365152359, "learning_rate": 5.4142011834319526e-05, "loss": 0.1582, "step": 31000 }, { "epoch": 4.659590991457416, "grad_norm": 0.14954350888729095, "learning_rate": 5.340236686390533e-05, "loss": 0.1582, "step": 31500 }, { "epoch": 4.733552753226582, "grad_norm": 0.1259755790233612, "learning_rate": 5.266272189349113e-05, "loss": 0.158, "step": 32000 }, { "epoch": 4.807514514995747, "grad_norm": 0.17876796424388885, "learning_rate": 5.192307692307693e-05, "loss": 0.1579, "step": 32500 }, { "epoch": 4.8814762767649125, "grad_norm": 0.13326044380664825, "learning_rate": 5.118343195266272e-05, "loss": 0.1579, "step": 33000 }, { "epoch": 4.955438038534078, "grad_norm": 0.1948281228542328, "learning_rate": 5.044378698224852e-05, "loss": 0.1579, "step": 33500 }, { "epoch": 4.999963019119115, "eval_bleu": 69.763, "eval_gen_len": 59.3153, "eval_loss": 0.15361113846302032, "eval_runtime": 83880.1297, "eval_samples_per_second": 23.579, "eval_steps_per_second": 0.092, "step": 33801 }, { "epoch": 5.029399800303243, "grad_norm": 0.17324520647525787, "learning_rate": 4.970414201183432e-05, "loss": 0.1576, "step": 34000 }, { "epoch": 5.103361562072409, "grad_norm": 0.17606763541698456, "learning_rate": 4.896449704142012e-05, "loss": 0.1574, "step": 34500 }, { "epoch": 5.1773233238415735, "grad_norm": 0.12601391971111298, "learning_rate": 4.822485207100592e-05, "loss": 0.1574, "step": 35000 }, { "epoch": 5.251285085610739, "grad_norm": 0.17805936932563782, "learning_rate": 4.748520710059172e-05, "loss": 0.1574, "step": 35500 }, { "epoch": 5.325246847379905, "grad_norm": 0.11296247690916061, "learning_rate": 4.674556213017752e-05, "loss": 0.1572, "step": 36000 }, { "epoch": 5.39920860914907, "grad_norm": 0.13727295398712158, "learning_rate": 4.6005917159763315e-05, "loss": 0.1574, "step": 36500 }, { "epoch": 5.473170370918235, "grad_norm": 0.13962909579277039, "learning_rate": 4.5266272189349114e-05, "loss": 0.1572, "step": 37000 }, { "epoch": 5.5471321326874, "grad_norm": 0.15142448246479034, "learning_rate": 4.452662721893491e-05, "loss": 0.1572, "step": 37500 }, { "epoch": 5.621093894456566, "grad_norm": 0.1354900598526001, "learning_rate": 4.378698224852072e-05, "loss": 0.1573, "step": 38000 }, { "epoch": 5.6950556562257315, "grad_norm": 0.11066178232431412, "learning_rate": 4.304733727810651e-05, "loss": 0.157, "step": 38500 }, { "epoch": 5.769017417994896, "grad_norm": 0.1964336335659027, "learning_rate": 4.230769230769231e-05, "loss": 0.157, "step": 39000 }, { "epoch": 5.842979179764062, "grad_norm": 0.16445380449295044, "learning_rate": 4.156804733727811e-05, "loss": 0.157, "step": 39500 }, { "epoch": 5.916940941533228, "grad_norm": 0.15109862387180328, "learning_rate": 4.0828402366863904e-05, "loss": 0.1568, "step": 40000 }, { "epoch": 5.9909027033023925, "grad_norm": 0.12768307328224182, "learning_rate": 4.00887573964497e-05, "loss": 0.1569, "step": 40500 }, { "epoch": 5.999926038238231, "eval_bleu": 69.5799, "eval_gen_len": 59.3135, "eval_loss": 0.15333101153373718, "eval_runtime": 83929.6137, "eval_samples_per_second": 23.565, "eval_steps_per_second": 0.092, "step": 40561 }, { "epoch": 6.064864465071558, "grad_norm": 0.1407214105129242, "learning_rate": 3.934911242603551e-05, "loss": 0.1565, "step": 41000 }, { "epoch": 6.138826226840723, "grad_norm": 0.125221386551857, "learning_rate": 3.86094674556213e-05, "loss": 0.1566, "step": 41500 }, { "epoch": 6.212787988609889, "grad_norm": 0.18703357875347137, "learning_rate": 3.7869822485207104e-05, "loss": 0.1563, "step": 42000 }, { "epoch": 6.286749750379054, "grad_norm": 0.11690807342529297, "learning_rate": 3.71301775147929e-05, "loss": 0.1565, "step": 42500 }, { "epoch": 6.360711512148219, "grad_norm": 0.21256986260414124, "learning_rate": 3.63905325443787e-05, "loss": 0.1563, "step": 43000 }, { "epoch": 6.434673273917385, "grad_norm": 0.15973211824893951, "learning_rate": 3.56508875739645e-05, "loss": 0.1563, "step": 43500 }, { "epoch": 6.5086350356865506, "grad_norm": 0.12254957109689713, "learning_rate": 3.49112426035503e-05, "loss": 0.1562, "step": 44000 }, { "epoch": 6.582596797455715, "grad_norm": 0.13712991774082184, "learning_rate": 3.4171597633136096e-05, "loss": 0.1561, "step": 44500 }, { "epoch": 6.656558559224881, "grad_norm": 0.12777595221996307, "learning_rate": 3.3431952662721895e-05, "loss": 0.1562, "step": 45000 }, { "epoch": 6.730520320994046, "grad_norm": 0.11526304483413696, "learning_rate": 3.269230769230769e-05, "loss": 0.1562, "step": 45500 }, { "epoch": 6.8044820827632115, "grad_norm": 0.10418285429477692, "learning_rate": 3.195266272189349e-05, "loss": 0.1562, "step": 46000 }, { "epoch": 6.878443844532377, "grad_norm": 0.13641230762004852, "learning_rate": 3.121301775147929e-05, "loss": 0.1561, "step": 46500 }, { "epoch": 6.952405606301542, "grad_norm": 0.1278037428855896, "learning_rate": 3.047337278106509e-05, "loss": 0.1559, "step": 47000 }, { "epoch": 6.999889057357346, "eval_bleu": 69.8684, "eval_gen_len": 59.3121, "eval_loss": 0.15310530364513397, "eval_runtime": 94874.1952, "eval_samples_per_second": 20.847, "eval_steps_per_second": 0.081, "step": 47321 }, { "epoch": 7.026367368070708, "grad_norm": 0.12435588985681534, "learning_rate": 2.973372781065089e-05, "loss": 0.1558, "step": 47500 }, { "epoch": 7.1003291298398725, "grad_norm": 0.1565004140138626, "learning_rate": 2.8994082840236685e-05, "loss": 0.1556, "step": 48000 }, { "epoch": 7.174290891609038, "grad_norm": 0.16372515261173248, "learning_rate": 2.8254437869822487e-05, "loss": 0.1557, "step": 48500 }, { "epoch": 7.248252653378204, "grad_norm": 0.16113600134849548, "learning_rate": 2.751479289940829e-05, "loss": 0.1556, "step": 49000 }, { "epoch": 7.322214415147369, "grad_norm": 0.12941241264343262, "learning_rate": 2.6775147928994084e-05, "loss": 0.1555, "step": 49500 }, { "epoch": 7.396176176916534, "grad_norm": 0.14799603819847107, "learning_rate": 2.6035502958579882e-05, "loss": 0.1555, "step": 50000 }, { "epoch": 7.470137938685699, "grad_norm": 0.1722722053527832, "learning_rate": 2.5295857988165684e-05, "loss": 0.1555, "step": 50500 }, { "epoch": 7.544099700454865, "grad_norm": 0.1380268782377243, "learning_rate": 2.4556213017751482e-05, "loss": 0.1554, "step": 51000 }, { "epoch": 7.618061462224031, "grad_norm": 0.11100411415100098, "learning_rate": 2.3816568047337277e-05, "loss": 0.1554, "step": 51500 }, { "epoch": 7.692023223993195, "grad_norm": 0.1322886347770691, "learning_rate": 2.307692307692308e-05, "loss": 0.1555, "step": 52000 }, { "epoch": 7.765984985762361, "grad_norm": 0.1415729820728302, "learning_rate": 2.2337278106508877e-05, "loss": 0.1554, "step": 52500 }, { "epoch": 7.839946747531526, "grad_norm": 0.12428125739097595, "learning_rate": 2.1597633136094676e-05, "loss": 0.1553, "step": 53000 }, { "epoch": 7.913908509300692, "grad_norm": 0.11778873205184937, "learning_rate": 2.0857988165680474e-05, "loss": 0.1551, "step": 53500 }, { "epoch": 7.987870271069857, "grad_norm": 0.11119575798511505, "learning_rate": 2.0118343195266273e-05, "loss": 0.1551, "step": 54000 }, { "epoch": 8.0, "eval_bleu": 69.5408, "eval_gen_len": 59.3123, "eval_loss": 0.15284955501556396, "eval_runtime": 85251.0461, "eval_samples_per_second": 23.2, "eval_steps_per_second": 0.091, "step": 54082 }, { "epoch": 8.061832032839023, "grad_norm": 0.17124830186367035, "learning_rate": 1.937869822485207e-05, "loss": 0.155, "step": 54500 }, { "epoch": 8.135793794608187, "grad_norm": 0.11387038975954056, "learning_rate": 1.8639053254437873e-05, "loss": 0.155, "step": 55000 }, { "epoch": 8.209755556377353, "grad_norm": 0.16694188117980957, "learning_rate": 1.7899408284023668e-05, "loss": 0.155, "step": 55500 }, { "epoch": 8.283717318146518, "grad_norm": 0.11885961145162582, "learning_rate": 1.7159763313609466e-05, "loss": 0.1549, "step": 56000 }, { "epoch": 8.357679079915684, "grad_norm": 0.1217753067612648, "learning_rate": 1.6420118343195268e-05, "loss": 0.1548, "step": 56500 }, { "epoch": 8.43164084168485, "grad_norm": 0.1143990233540535, "learning_rate": 1.5680473372781066e-05, "loss": 0.1548, "step": 57000 }, { "epoch": 8.505602603454014, "grad_norm": 0.11847871541976929, "learning_rate": 1.4940828402366867e-05, "loss": 0.1547, "step": 57500 }, { "epoch": 8.57956436522318, "grad_norm": 0.1191825419664383, "learning_rate": 1.4201183431952663e-05, "loss": 0.1547, "step": 58000 }, { "epoch": 8.653526126992345, "grad_norm": 0.1388404816389084, "learning_rate": 1.3461538461538462e-05, "loss": 0.1546, "step": 58500 }, { "epoch": 8.72748788876151, "grad_norm": 0.11382226645946503, "learning_rate": 1.2721893491124262e-05, "loss": 0.1547, "step": 59000 }, { "epoch": 8.801449650530676, "grad_norm": 0.1184522733092308, "learning_rate": 1.198224852071006e-05, "loss": 0.1547, "step": 59500 }, { "epoch": 8.87541141229984, "grad_norm": 0.12202349305152893, "learning_rate": 1.1242603550295859e-05, "loss": 0.1546, "step": 60000 }, { "epoch": 8.949373174069006, "grad_norm": 0.19976742565631866, "learning_rate": 1.0502958579881657e-05, "loss": 0.1545, "step": 60500 }, { "epoch": 8.999963019119116, "eval_bleu": 69.6281, "eval_gen_len": 59.3107, "eval_loss": 0.15265466272830963, "eval_runtime": 91514.132, "eval_samples_per_second": 21.612, "eval_steps_per_second": 0.084, "step": 60842 }, { "epoch": 9.023334935838172, "grad_norm": 0.12483187019824982, "learning_rate": 9.763313609467455e-06, "loss": 0.1546, "step": 61000 }, { "epoch": 9.097296697607337, "grad_norm": 0.10932891070842743, "learning_rate": 9.023668639053255e-06, "loss": 0.1543, "step": 61500 }, { "epoch": 9.171258459376503, "grad_norm": 0.09837622195482254, "learning_rate": 8.284023668639054e-06, "loss": 0.1543, "step": 62000 }, { "epoch": 9.245220221145667, "grad_norm": 0.1304999738931656, "learning_rate": 7.544378698224852e-06, "loss": 0.1544, "step": 62500 }, { "epoch": 9.319181982914833, "grad_norm": 0.15383677184581757, "learning_rate": 6.8047337278106515e-06, "loss": 0.1543, "step": 63000 }, { "epoch": 9.393143744683998, "grad_norm": 0.10340707749128342, "learning_rate": 6.06508875739645e-06, "loss": 0.1542, "step": 63500 }, { "epoch": 9.467105506453164, "grad_norm": 0.09543921798467636, "learning_rate": 5.325443786982249e-06, "loss": 0.1542, "step": 64000 }, { "epoch": 9.54106726822233, "grad_norm": 0.10040238499641418, "learning_rate": 4.5857988165680475e-06, "loss": 0.1542, "step": 64500 }, { "epoch": 9.615029029991494, "grad_norm": 0.13185805082321167, "learning_rate": 3.846153846153847e-06, "loss": 0.1542, "step": 65000 }, { "epoch": 9.68899079176066, "grad_norm": 0.10633710771799088, "learning_rate": 3.106508875739645e-06, "loss": 0.1541, "step": 65500 }, { "epoch": 9.762952553529825, "grad_norm": 0.08551553636789322, "learning_rate": 2.366863905325444e-06, "loss": 0.1543, "step": 66000 }, { "epoch": 9.83691431529899, "grad_norm": 0.1212979108095169, "learning_rate": 1.6272189349112426e-06, "loss": 0.1541, "step": 66500 }, { "epoch": 9.910876077068156, "grad_norm": 0.12985078990459442, "learning_rate": 8.875739644970415e-07, "loss": 0.1544, "step": 67000 }, { "epoch": 9.984837838837322, "grad_norm": 0.09649886190891266, "learning_rate": 1.4792899408284025e-07, "loss": 0.1541, "step": 67500 }, { "epoch": 9.999630191191153, "eval_bleu": 69.7395, "eval_gen_len": 59.3104, "eval_loss": 0.1525367647409439, "eval_runtime": 88843.8291, "eval_samples_per_second": 22.262, "eval_steps_per_second": 0.087, "step": 67600 }, { "epoch": 9.999630191191153, "step": 67600, "total_flos": 5.451344157553459e+17, "train_loss": 0.158306239286118, "train_runtime": 1060442.5654, "train_samples_per_second": 65.278, "train_steps_per_second": 0.064 } ], "logging_steps": 500, "max_steps": 67600, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.451344157553459e+17, "train_batch_size": 256, "trial_name": null, "trial_params": null }