{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999818016378526, "eval_steps": 500, "global_step": 2747, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 1.9036518335342407, "learning_rate": 1.8181818181818182e-05, "loss": 1.975, "step": 25 }, { "epoch": 0.02, "grad_norm": 1.182765007019043, "learning_rate": 3.6363636363636364e-05, "loss": 1.7533, "step": 50 }, { "epoch": 0.03, "grad_norm": 1.3171876668930054, "learning_rate": 5.3818181818181827e-05, "loss": 1.4783, "step": 75 }, { "epoch": 0.04, "grad_norm": 1.141636610031128, "learning_rate": 7.2e-05, "loss": 1.2943, "step": 100 }, { "epoch": 0.05, "grad_norm": 1.0257848501205444, "learning_rate": 9.018181818181819e-05, "loss": 1.2977, "step": 125 }, { "epoch": 0.05, "grad_norm": 0.816966712474823, "learning_rate": 0.00010836363636363638, "loss": 1.1967, "step": 150 }, { "epoch": 0.06, "grad_norm": 0.92842036485672, "learning_rate": 0.00012654545454545454, "loss": 1.1982, "step": 175 }, { "epoch": 0.07, "grad_norm": 0.8091335892677307, "learning_rate": 0.00014472727272727274, "loss": 1.2558, "step": 200 }, { "epoch": 0.08, "grad_norm": 0.6926524043083191, "learning_rate": 0.00016290909090909092, "loss": 1.2308, "step": 225 }, { "epoch": 0.09, "grad_norm": 0.8470534086227417, "learning_rate": 0.0001810909090909091, "loss": 1.1464, "step": 250 }, { "epoch": 0.1, "grad_norm": 0.9637011289596558, "learning_rate": 0.00019927272727272727, "loss": 1.2155, "step": 275 }, { "epoch": 0.11, "grad_norm": 0.799651026725769, "learning_rate": 0.00019805825242718447, "loss": 1.1846, "step": 300 }, { "epoch": 0.12, "grad_norm": 0.7091360092163086, "learning_rate": 0.00019603559870550163, "loss": 1.1741, "step": 325 }, { "epoch": 0.13, "grad_norm": 0.7197450995445251, "learning_rate": 0.0001940129449838188, "loss": 1.1863, "step": 350 }, { "epoch": 0.14, "grad_norm": 0.6201531291007996, "learning_rate": 0.00019199029126213592, "loss": 1.1242, "step": 375 }, { "epoch": 0.15, "grad_norm": 0.7758994698524475, "learning_rate": 0.00018996763754045308, "loss": 1.1845, "step": 400 }, { "epoch": 0.15, "grad_norm": 0.676243245601654, "learning_rate": 0.00018794498381877024, "loss": 1.1862, "step": 425 }, { "epoch": 0.16, "grad_norm": 0.6000869274139404, "learning_rate": 0.0001859223300970874, "loss": 1.1684, "step": 450 }, { "epoch": 0.17, "grad_norm": 1.5969446897506714, "learning_rate": 0.00018389967637540456, "loss": 1.0784, "step": 475 }, { "epoch": 0.18, "grad_norm": 0.6639478206634521, "learning_rate": 0.0001818770226537217, "loss": 1.2557, "step": 500 }, { "epoch": 0.19, "grad_norm": 0.7261694669723511, "learning_rate": 0.00017985436893203885, "loss": 1.1934, "step": 525 }, { "epoch": 0.2, "grad_norm": 0.77945876121521, "learning_rate": 0.00017783171521035599, "loss": 1.153, "step": 550 }, { "epoch": 0.21, "grad_norm": 0.9188840985298157, "learning_rate": 0.00017580906148867315, "loss": 1.2041, "step": 575 }, { "epoch": 0.22, "grad_norm": 0.7868269681930542, "learning_rate": 0.00017378640776699028, "loss": 1.0461, "step": 600 }, { "epoch": 0.23, "grad_norm": 0.6817301511764526, "learning_rate": 0.00017176375404530744, "loss": 1.134, "step": 625 }, { "epoch": 0.24, "grad_norm": 0.8629248142242432, "learning_rate": 0.0001697411003236246, "loss": 1.1121, "step": 650 }, { "epoch": 0.25, "grad_norm": 0.8092464208602905, "learning_rate": 0.00016771844660194176, "loss": 1.0449, "step": 675 }, { "epoch": 0.25, "grad_norm": 0.8356920480728149, "learning_rate": 0.00016569579288025892, "loss": 1.1207, "step": 700 }, { "epoch": 0.26, "grad_norm": 0.5683802962303162, "learning_rate": 0.00016367313915857605, "loss": 1.1117, "step": 725 }, { "epoch": 0.27, "grad_norm": 0.6121156215667725, "learning_rate": 0.0001616504854368932, "loss": 1.0837, "step": 750 }, { "epoch": 0.28, "grad_norm": 0.5916587710380554, "learning_rate": 0.00015962783171521037, "loss": 1.1184, "step": 775 }, { "epoch": 0.29, "grad_norm": 0.798072874546051, "learning_rate": 0.00015760517799352753, "loss": 1.1487, "step": 800 }, { "epoch": 0.3, "grad_norm": 0.5768774747848511, "learning_rate": 0.0001555825242718447, "loss": 1.1142, "step": 825 }, { "epoch": 0.31, "grad_norm": 1.1297376155853271, "learning_rate": 0.00015355987055016182, "loss": 1.0542, "step": 850 }, { "epoch": 0.32, "grad_norm": 0.6882333755493164, "learning_rate": 0.00015153721682847896, "loss": 1.1733, "step": 875 }, { "epoch": 0.33, "grad_norm": 0.6317344903945923, "learning_rate": 0.00014951456310679611, "loss": 1.0417, "step": 900 }, { "epoch": 0.34, "grad_norm": 0.6320317387580872, "learning_rate": 0.00014749190938511327, "loss": 1.1673, "step": 925 }, { "epoch": 0.35, "grad_norm": 0.6424859762191772, "learning_rate": 0.00014546925566343043, "loss": 1.0318, "step": 950 }, { "epoch": 0.35, "grad_norm": 0.5994564294815063, "learning_rate": 0.00014344660194174757, "loss": 1.0861, "step": 975 }, { "epoch": 0.36, "grad_norm": 0.7013177871704102, "learning_rate": 0.00014142394822006473, "loss": 1.1187, "step": 1000 }, { "epoch": 0.37, "grad_norm": 0.7184430360794067, "learning_rate": 0.0001394012944983819, "loss": 1.0835, "step": 1025 }, { "epoch": 0.38, "grad_norm": 0.7505134344100952, "learning_rate": 0.00013737864077669905, "loss": 1.0466, "step": 1050 }, { "epoch": 0.39, "grad_norm": 0.8915770053863525, "learning_rate": 0.0001353559870550162, "loss": 1.0865, "step": 1075 }, { "epoch": 0.4, "grad_norm": 0.5262236595153809, "learning_rate": 0.00013333333333333334, "loss": 1.124, "step": 1100 }, { "epoch": 0.41, "grad_norm": 0.7065467238426208, "learning_rate": 0.0001313106796116505, "loss": 1.1761, "step": 1125 }, { "epoch": 0.42, "grad_norm": 0.634323000907898, "learning_rate": 0.00012928802588996763, "loss": 1.1285, "step": 1150 }, { "epoch": 0.43, "grad_norm": 0.6720498204231262, "learning_rate": 0.0001272653721682848, "loss": 1.0726, "step": 1175 }, { "epoch": 0.44, "grad_norm": 0.5689863562583923, "learning_rate": 0.00012524271844660195, "loss": 1.003, "step": 1200 }, { "epoch": 0.45, "grad_norm": 0.6395437717437744, "learning_rate": 0.00012322006472491908, "loss": 1.0641, "step": 1225 }, { "epoch": 0.45, "grad_norm": 0.5762475728988647, "learning_rate": 0.00012119741100323624, "loss": 1.0343, "step": 1250 }, { "epoch": 0.46, "grad_norm": 0.7222539782524109, "learning_rate": 0.0001191747572815534, "loss": 1.0858, "step": 1275 }, { "epoch": 0.47, "grad_norm": 0.7421953678131104, "learning_rate": 0.00011715210355987056, "loss": 1.0676, "step": 1300 }, { "epoch": 0.48, "grad_norm": 0.5756516456604004, "learning_rate": 0.0001151294498381877, "loss": 1.1356, "step": 1325 }, { "epoch": 0.49, "grad_norm": 0.8439106345176697, "learning_rate": 0.00011310679611650486, "loss": 1.0783, "step": 1350 }, { "epoch": 0.5, "grad_norm": 0.6689855456352234, "learning_rate": 0.00011108414239482202, "loss": 1.063, "step": 1375 }, { "epoch": 0.51, "grad_norm": 0.6357136368751526, "learning_rate": 0.00010906148867313916, "loss": 1.1426, "step": 1400 }, { "epoch": 0.52, "grad_norm": 0.5917367935180664, "learning_rate": 0.00010703883495145632, "loss": 1.0824, "step": 1425 }, { "epoch": 0.53, "grad_norm": 0.7326861619949341, "learning_rate": 0.00010501618122977346, "loss": 1.062, "step": 1450 }, { "epoch": 0.54, "grad_norm": 0.7226691246032715, "learning_rate": 0.00010299352750809062, "loss": 1.0033, "step": 1475 }, { "epoch": 0.55, "grad_norm": 1.0422271490097046, "learning_rate": 0.00010097087378640778, "loss": 1.0165, "step": 1500 }, { "epoch": 0.56, "grad_norm": 0.7214282155036926, "learning_rate": 9.894822006472492e-05, "loss": 1.1014, "step": 1525 }, { "epoch": 0.56, "grad_norm": 0.6132606267929077, "learning_rate": 9.692556634304207e-05, "loss": 1.085, "step": 1550 }, { "epoch": 0.57, "grad_norm": 0.6944576501846313, "learning_rate": 9.490291262135923e-05, "loss": 1.1325, "step": 1575 }, { "epoch": 0.58, "grad_norm": 0.5832145810127258, "learning_rate": 9.288025889967637e-05, "loss": 1.0155, "step": 1600 }, { "epoch": 0.59, "grad_norm": 0.8507530689239502, "learning_rate": 9.085760517799353e-05, "loss": 1.0128, "step": 1625 }, { "epoch": 0.6, "grad_norm": 0.6273424625396729, "learning_rate": 8.88349514563107e-05, "loss": 1.1325, "step": 1650 }, { "epoch": 0.61, "grad_norm": 0.7286422848701477, "learning_rate": 8.681229773462784e-05, "loss": 1.0364, "step": 1675 }, { "epoch": 0.62, "grad_norm": 0.7322813272476196, "learning_rate": 8.478964401294499e-05, "loss": 1.0556, "step": 1700 }, { "epoch": 0.63, "grad_norm": 0.8728362917900085, "learning_rate": 8.276699029126213e-05, "loss": 1.0334, "step": 1725 }, { "epoch": 0.64, "grad_norm": 0.621295154094696, "learning_rate": 8.074433656957929e-05, "loss": 1.0005, "step": 1750 }, { "epoch": 0.65, "grad_norm": 0.7512532472610474, "learning_rate": 7.872168284789644e-05, "loss": 1.06, "step": 1775 }, { "epoch": 0.66, "grad_norm": 0.605369508266449, "learning_rate": 7.66990291262136e-05, "loss": 0.9775, "step": 1800 }, { "epoch": 0.66, "grad_norm": 0.6124484539031982, "learning_rate": 7.467637540453075e-05, "loss": 1.0157, "step": 1825 }, { "epoch": 0.67, "grad_norm": 0.7696976661682129, "learning_rate": 7.265372168284789e-05, "loss": 1.0534, "step": 1850 }, { "epoch": 0.68, "grad_norm": 0.8271819353103638, "learning_rate": 7.063106796116505e-05, "loss": 1.0123, "step": 1875 }, { "epoch": 0.69, "grad_norm": 0.6692514419555664, "learning_rate": 6.86084142394822e-05, "loss": 1.0332, "step": 1900 }, { "epoch": 0.7, "grad_norm": 0.6449939012527466, "learning_rate": 6.658576051779936e-05, "loss": 1.1004, "step": 1925 }, { "epoch": 0.71, "grad_norm": 0.7441306114196777, "learning_rate": 6.456310679611652e-05, "loss": 1.0315, "step": 1950 }, { "epoch": 0.72, "grad_norm": 0.5857362747192383, "learning_rate": 6.254045307443366e-05, "loss": 1.1277, "step": 1975 }, { "epoch": 0.73, "grad_norm": 0.49737441539764404, "learning_rate": 6.051779935275082e-05, "loss": 1.017, "step": 2000 }, { "epoch": 0.74, "grad_norm": 0.7484114766120911, "learning_rate": 5.8495145631067963e-05, "loss": 1.0246, "step": 2025 }, { "epoch": 0.75, "grad_norm": 0.8260968923568726, "learning_rate": 5.6472491909385117e-05, "loss": 1.0697, "step": 2050 }, { "epoch": 0.76, "grad_norm": 0.697239100933075, "learning_rate": 5.4449838187702276e-05, "loss": 1.0593, "step": 2075 }, { "epoch": 0.76, "grad_norm": 0.7635684609413147, "learning_rate": 5.2427184466019416e-05, "loss": 1.0806, "step": 2100 }, { "epoch": 0.77, "grad_norm": 0.8260409832000732, "learning_rate": 5.0404530744336576e-05, "loss": 1.0663, "step": 2125 }, { "epoch": 0.78, "grad_norm": 0.6073005199432373, "learning_rate": 4.838187702265373e-05, "loss": 1.0867, "step": 2150 }, { "epoch": 0.79, "grad_norm": 0.5522834658622742, "learning_rate": 4.6359223300970875e-05, "loss": 1.0729, "step": 2175 }, { "epoch": 0.8, "grad_norm": 0.8805132508277893, "learning_rate": 4.433656957928803e-05, "loss": 1.0101, "step": 2200 }, { "epoch": 0.81, "grad_norm": 0.8581610321998596, "learning_rate": 4.231391585760518e-05, "loss": 1.0228, "step": 2225 }, { "epoch": 0.82, "grad_norm": 0.726747989654541, "learning_rate": 4.029126213592233e-05, "loss": 1.0266, "step": 2250 }, { "epoch": 0.83, "grad_norm": 0.5809868574142456, "learning_rate": 3.826860841423948e-05, "loss": 1.0265, "step": 2275 }, { "epoch": 0.84, "grad_norm": 0.6280074715614319, "learning_rate": 3.624595469255664e-05, "loss": 1.0451, "step": 2300 }, { "epoch": 0.85, "grad_norm": 0.8596406579017639, "learning_rate": 3.422330097087379e-05, "loss": 1.0799, "step": 2325 }, { "epoch": 0.86, "grad_norm": 0.7058184742927551, "learning_rate": 3.220064724919094e-05, "loss": 0.9396, "step": 2350 }, { "epoch": 0.86, "grad_norm": 0.7207942605018616, "learning_rate": 3.0177993527508093e-05, "loss": 1.054, "step": 2375 }, { "epoch": 0.87, "grad_norm": 0.91398686170578, "learning_rate": 2.8155339805825243e-05, "loss": 1.013, "step": 2400 }, { "epoch": 0.88, "grad_norm": 0.7698450684547424, "learning_rate": 2.6132686084142393e-05, "loss": 1.0448, "step": 2425 }, { "epoch": 0.89, "grad_norm": 0.7863798141479492, "learning_rate": 2.411003236245955e-05, "loss": 1.0062, "step": 2450 }, { "epoch": 0.9, "grad_norm": 0.6553971767425537, "learning_rate": 2.2087378640776702e-05, "loss": 0.9851, "step": 2475 }, { "epoch": 0.91, "grad_norm": 0.7182110548019409, "learning_rate": 2.0064724919093852e-05, "loss": 1.004, "step": 2500 }, { "epoch": 0.92, "grad_norm": 0.6455408930778503, "learning_rate": 1.8042071197411005e-05, "loss": 1.0134, "step": 2525 }, { "epoch": 0.93, "grad_norm": 0.7096872329711914, "learning_rate": 1.6019417475728158e-05, "loss": 0.9618, "step": 2550 }, { "epoch": 0.94, "grad_norm": 0.5774602890014648, "learning_rate": 1.3996763754045308e-05, "loss": 1.0738, "step": 2575 }, { "epoch": 0.95, "grad_norm": 0.6721768975257874, "learning_rate": 1.197411003236246e-05, "loss": 1.0201, "step": 2600 }, { "epoch": 0.96, "grad_norm": 0.5367263555526733, "learning_rate": 9.951456310679612e-06, "loss": 1.0792, "step": 2625 }, { "epoch": 0.96, "grad_norm": 0.7015663385391235, "learning_rate": 7.928802588996764e-06, "loss": 0.9896, "step": 2650 }, { "epoch": 0.97, "grad_norm": 0.5989572405815125, "learning_rate": 5.906148867313916e-06, "loss": 1.0487, "step": 2675 }, { "epoch": 0.98, "grad_norm": 0.9555190801620483, "learning_rate": 3.883495145631068e-06, "loss": 1.0667, "step": 2700 }, { "epoch": 0.99, "grad_norm": 0.6964536905288696, "learning_rate": 1.8608414239482202e-06, "loss": 1.0087, "step": 2725 } ], "logging_steps": 25, "max_steps": 2747, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 4.805027299118285e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }