{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2122, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1.5460187196731567, "learning_rate": 3.5211267605633804e-06, "loss": 2.7305, "step": 25 }, { "epoch": 0.05, "grad_norm": 1.3390659093856812, "learning_rate": 7.042253521126761e-06, "loss": 2.7192, "step": 50 }, { "epoch": 0.07, "grad_norm": 1.4170739650726318, "learning_rate": 1.056338028169014e-05, "loss": 2.6137, "step": 75 }, { "epoch": 0.09, "grad_norm": 0.9075584411621094, "learning_rate": 1.4084507042253522e-05, "loss": 2.3702, "step": 100 }, { "epoch": 0.12, "grad_norm": 0.8087872862815857, "learning_rate": 1.7605633802816902e-05, "loss": 2.0872, "step": 125 }, { "epoch": 0.14, "grad_norm": 0.6687940359115601, "learning_rate": 2.112676056338028e-05, "loss": 1.9849, "step": 150 }, { "epoch": 0.16, "grad_norm": 0.48965755105018616, "learning_rate": 2.464788732394366e-05, "loss": 1.8513, "step": 175 }, { "epoch": 0.19, "grad_norm": 0.44189536571502686, "learning_rate": 2.8169014084507043e-05, "loss": 1.7901, "step": 200 }, { "epoch": 0.21, "grad_norm": 0.4670378863811493, "learning_rate": 2.9827134625458355e-05, "loss": 1.7426, "step": 225 }, { "epoch": 0.24, "grad_norm": 0.5341612100601196, "learning_rate": 2.9434258774227347e-05, "loss": 1.663, "step": 250 }, { "epoch": 0.26, "grad_norm": 0.32935911417007446, "learning_rate": 2.9041382922996336e-05, "loss": 1.6262, "step": 275 }, { "epoch": 0.28, "grad_norm": 0.49150750041007996, "learning_rate": 2.864850707176532e-05, "loss": 1.5616, "step": 300 }, { "epoch": 0.31, "grad_norm": 0.4343280792236328, "learning_rate": 2.825563122053431e-05, "loss": 1.554, "step": 325 }, { "epoch": 0.33, "grad_norm": 0.46223270893096924, "learning_rate": 2.78627553693033e-05, "loss": 1.4944, "step": 350 }, { "epoch": 0.35, "grad_norm": 0.5165560245513916, "learning_rate": 2.7469879518072288e-05, "loss": 1.4857, "step": 375 }, { "epoch": 0.38, "grad_norm": 0.4836352467536926, "learning_rate": 2.707700366684128e-05, "loss": 1.4545, "step": 400 }, { "epoch": 0.4, "grad_norm": 0.5089160799980164, "learning_rate": 2.668412781561027e-05, "loss": 1.3965, "step": 425 }, { "epoch": 0.42, "grad_norm": 0.4400924742221832, "learning_rate": 2.6291251964379258e-05, "loss": 1.3703, "step": 450 }, { "epoch": 0.45, "grad_norm": 0.520064115524292, "learning_rate": 2.5898376113148247e-05, "loss": 1.3869, "step": 475 }, { "epoch": 0.47, "grad_norm": 0.4993634819984436, "learning_rate": 2.5505500261917232e-05, "loss": 1.3827, "step": 500 }, { "epoch": 0.49, "grad_norm": 0.4747565686702728, "learning_rate": 2.511262441068622e-05, "loss": 1.3494, "step": 525 }, { "epoch": 0.52, "grad_norm": 0.45222511887550354, "learning_rate": 2.4719748559455214e-05, "loss": 1.3295, "step": 550 }, { "epoch": 0.54, "grad_norm": 0.5314898490905762, "learning_rate": 2.4326872708224202e-05, "loss": 1.2971, "step": 575 }, { "epoch": 0.57, "grad_norm": 0.4976591467857361, "learning_rate": 2.393399685699319e-05, "loss": 1.2882, "step": 600 }, { "epoch": 0.59, "grad_norm": 0.5847007632255554, "learning_rate": 2.354112100576218e-05, "loss": 1.3001, "step": 625 }, { "epoch": 0.61, "grad_norm": 0.4775733947753906, "learning_rate": 2.314824515453117e-05, "loss": 1.3185, "step": 650 }, { "epoch": 0.64, "grad_norm": 0.4997495710849762, "learning_rate": 2.2755369303300158e-05, "loss": 1.31, "step": 675 }, { "epoch": 0.66, "grad_norm": 0.46225643157958984, "learning_rate": 2.2362493452069147e-05, "loss": 1.2577, "step": 700 }, { "epoch": 0.68, "grad_norm": 0.5184119939804077, "learning_rate": 2.1969617600838136e-05, "loss": 1.2846, "step": 725 }, { "epoch": 0.71, "grad_norm": 0.5487437844276428, "learning_rate": 2.1576741749607125e-05, "loss": 1.234, "step": 750 }, { "epoch": 0.73, "grad_norm": 0.44502514600753784, "learning_rate": 2.1183865898376113e-05, "loss": 1.2652, "step": 775 }, { "epoch": 0.75, "grad_norm": 0.5340321660041809, "learning_rate": 2.0790990047145102e-05, "loss": 1.2302, "step": 800 }, { "epoch": 0.78, "grad_norm": 0.46381640434265137, "learning_rate": 2.039811419591409e-05, "loss": 1.2402, "step": 825 }, { "epoch": 0.8, "grad_norm": 0.5519339442253113, "learning_rate": 2.0005238344683083e-05, "loss": 1.2924, "step": 850 }, { "epoch": 0.82, "grad_norm": 0.47584566473960876, "learning_rate": 1.9612362493452072e-05, "loss": 1.2526, "step": 875 }, { "epoch": 0.85, "grad_norm": 0.5393884778022766, "learning_rate": 1.9219486642221058e-05, "loss": 1.188, "step": 900 }, { "epoch": 0.87, "grad_norm": 0.5071331858634949, "learning_rate": 1.8826610790990047e-05, "loss": 1.228, "step": 925 }, { "epoch": 0.9, "grad_norm": 0.5538953542709351, "learning_rate": 1.8433734939759036e-05, "loss": 1.238, "step": 950 }, { "epoch": 0.92, "grad_norm": 0.4920578598976135, "learning_rate": 1.8040859088528024e-05, "loss": 1.1884, "step": 975 }, { "epoch": 0.94, "grad_norm": 0.5614542961120605, "learning_rate": 1.7647983237297017e-05, "loss": 1.1734, "step": 1000 }, { "epoch": 0.97, "grad_norm": 0.514637291431427, "learning_rate": 1.7255107386066006e-05, "loss": 1.1938, "step": 1025 }, { "epoch": 0.99, "grad_norm": 0.6818568110466003, "learning_rate": 1.6862231534834994e-05, "loss": 1.2151, "step": 1050 }, { "epoch": 1.01, "grad_norm": 0.5652730464935303, "learning_rate": 1.646935568360398e-05, "loss": 1.1932, "step": 1075 }, { "epoch": 1.04, "grad_norm": 0.46096867322921753, "learning_rate": 1.607647983237297e-05, "loss": 1.1895, "step": 1100 }, { "epoch": 1.06, "grad_norm": 0.5606017112731934, "learning_rate": 1.5683603981141958e-05, "loss": 1.1812, "step": 1125 }, { "epoch": 1.08, "grad_norm": 0.6677452325820923, "learning_rate": 1.5290728129910947e-05, "loss": 1.2344, "step": 1150 }, { "epoch": 1.11, "grad_norm": 0.4959929585456848, "learning_rate": 1.4897852278679937e-05, "loss": 1.1922, "step": 1175 }, { "epoch": 1.13, "grad_norm": 0.5288898348808289, "learning_rate": 1.4504976427448928e-05, "loss": 1.1824, "step": 1200 }, { "epoch": 1.15, "grad_norm": 0.49359196424484253, "learning_rate": 1.4112100576217915e-05, "loss": 1.2163, "step": 1225 }, { "epoch": 1.18, "grad_norm": 0.5195351839065552, "learning_rate": 1.3719224724986904e-05, "loss": 1.1688, "step": 1250 }, { "epoch": 1.2, "grad_norm": 0.5838152766227722, "learning_rate": 1.3326348873755894e-05, "loss": 1.1566, "step": 1275 }, { "epoch": 1.23, "grad_norm": 0.5872088074684143, "learning_rate": 1.2933473022524883e-05, "loss": 1.1685, "step": 1300 }, { "epoch": 1.25, "grad_norm": 0.5522224307060242, "learning_rate": 1.254059717129387e-05, "loss": 1.1745, "step": 1325 }, { "epoch": 1.27, "grad_norm": 0.56575608253479, "learning_rate": 1.2147721320062861e-05, "loss": 1.1895, "step": 1350 }, { "epoch": 1.3, "grad_norm": 0.5466055870056152, "learning_rate": 1.175484546883185e-05, "loss": 1.1445, "step": 1375 }, { "epoch": 1.32, "grad_norm": 0.5417299866676331, "learning_rate": 1.1361969617600839e-05, "loss": 1.2091, "step": 1400 }, { "epoch": 1.34, "grad_norm": 0.6165823936462402, "learning_rate": 1.0969093766369827e-05, "loss": 1.1533, "step": 1425 }, { "epoch": 1.37, "grad_norm": 0.6337828636169434, "learning_rate": 1.0576217915138816e-05, "loss": 1.1721, "step": 1450 }, { "epoch": 1.39, "grad_norm": 0.5324290990829468, "learning_rate": 1.0183342063907805e-05, "loss": 1.2054, "step": 1475 }, { "epoch": 1.41, "grad_norm": 0.5414568185806274, "learning_rate": 9.790466212676796e-06, "loss": 1.164, "step": 1500 }, { "epoch": 1.44, "grad_norm": 0.5330114364624023, "learning_rate": 9.397590361445783e-06, "loss": 1.1632, "step": 1525 }, { "epoch": 1.46, "grad_norm": 0.4752555787563324, "learning_rate": 9.004714510214772e-06, "loss": 1.123, "step": 1550 }, { "epoch": 1.48, "grad_norm": 0.5737998485565186, "learning_rate": 8.61183865898376e-06, "loss": 1.168, "step": 1575 }, { "epoch": 1.51, "grad_norm": 0.5172936320304871, "learning_rate": 8.218962807752751e-06, "loss": 1.1856, "step": 1600 }, { "epoch": 1.53, "grad_norm": 0.6017385721206665, "learning_rate": 7.826086956521738e-06, "loss": 1.1571, "step": 1625 }, { "epoch": 1.56, "grad_norm": 0.6177796125411987, "learning_rate": 7.433211105290728e-06, "loss": 1.166, "step": 1650 }, { "epoch": 1.58, "grad_norm": 0.5164004564285278, "learning_rate": 7.040335254059718e-06, "loss": 1.1979, "step": 1675 }, { "epoch": 1.6, "grad_norm": 0.6410110592842102, "learning_rate": 6.647459402828706e-06, "loss": 1.156, "step": 1700 }, { "epoch": 1.63, "grad_norm": 0.5372936129570007, "learning_rate": 6.254583551597696e-06, "loss": 1.1532, "step": 1725 }, { "epoch": 1.65, "grad_norm": 0.6036843657493591, "learning_rate": 5.8617077003666845e-06, "loss": 1.1793, "step": 1750 }, { "epoch": 1.67, "grad_norm": 0.6881660223007202, "learning_rate": 5.468831849135673e-06, "loss": 1.1709, "step": 1775 }, { "epoch": 1.7, "grad_norm": 0.5717695951461792, "learning_rate": 5.075955997904662e-06, "loss": 1.1441, "step": 1800 }, { "epoch": 1.72, "grad_norm": 0.5951740145683289, "learning_rate": 4.683080146673651e-06, "loss": 1.1855, "step": 1825 }, { "epoch": 1.74, "grad_norm": 0.628299355506897, "learning_rate": 4.29020429544264e-06, "loss": 1.1888, "step": 1850 }, { "epoch": 1.77, "grad_norm": 0.5050541758537292, "learning_rate": 3.89732844421163e-06, "loss": 1.1396, "step": 1875 }, { "epoch": 1.79, "grad_norm": 0.5191190838813782, "learning_rate": 3.504452592980618e-06, "loss": 1.1955, "step": 1900 }, { "epoch": 1.81, "grad_norm": 0.5869964957237244, "learning_rate": 3.111576741749607e-06, "loss": 1.1314, "step": 1925 }, { "epoch": 1.84, "grad_norm": 0.5007169842720032, "learning_rate": 2.718700890518596e-06, "loss": 1.1516, "step": 1950 }, { "epoch": 1.86, "grad_norm": 0.6014929413795471, "learning_rate": 2.325825039287585e-06, "loss": 1.1901, "step": 1975 }, { "epoch": 1.89, "grad_norm": 0.8301869034767151, "learning_rate": 1.932949188056574e-06, "loss": 1.1197, "step": 2000 }, { "epoch": 1.91, "grad_norm": 0.5901632905006409, "learning_rate": 1.5400733368255633e-06, "loss": 1.1843, "step": 2025 }, { "epoch": 1.93, "grad_norm": 0.580962598323822, "learning_rate": 1.1471974855945522e-06, "loss": 1.1456, "step": 2050 }, { "epoch": 1.96, "grad_norm": 0.5481582283973694, "learning_rate": 7.543216343635412e-07, "loss": 1.1644, "step": 2075 }, { "epoch": 1.98, "grad_norm": 0.5194025039672852, "learning_rate": 3.6144578313253016e-07, "loss": 1.1581, "step": 2100 } ], "logging_steps": 25, "max_steps": 2122, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 4558731463360512.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }