{ "best_metric": null, "best_model_checkpoint": null, "epoch": 18.0, "global_step": 98676, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5e-09, "loss": 10.532, "step": 1 }, { "epoch": 0.09, "learning_rate": 2.5e-06, "loss": 9.367, "step": 500 }, { "epoch": 0.18, "learning_rate": 5e-06, "loss": 7.4277, "step": 1000 }, { "epoch": 0.27, "learning_rate": 7.5e-06, "loss": 6.1401, "step": 1500 }, { "epoch": 0.36, "learning_rate": 1e-05, "loss": 5.8709, "step": 2000 }, { "epoch": 0.46, "learning_rate": 1.25e-05, "loss": 5.753, "step": 2500 }, { "epoch": 0.55, "learning_rate": 1.5e-05, "loss": 5.6777, "step": 3000 }, { "epoch": 0.64, "learning_rate": 1.75e-05, "loss": 5.6151, "step": 3500 }, { "epoch": 0.73, "learning_rate": 2e-05, "loss": 5.5717, "step": 4000 }, { "epoch": 0.82, "learning_rate": 2.25e-05, "loss": 5.5305, "step": 4500 }, { "epoch": 0.91, "learning_rate": 2.5e-05, "loss": 5.4947, "step": 5000 }, { "epoch": 1.0, "learning_rate": 2.7500000000000004e-05, "loss": 5.4688, "step": 5500 }, { "epoch": 1.09, "learning_rate": 3e-05, "loss": 5.4406, "step": 6000 }, { "epoch": 1.19, "learning_rate": 3.2500000000000004e-05, "loss": 5.4163, "step": 6500 }, { "epoch": 1.28, "learning_rate": 3.5e-05, "loss": 5.3942, "step": 7000 }, { "epoch": 1.37, "learning_rate": 3.7500000000000003e-05, "loss": 5.3762, "step": 7500 }, { "epoch": 1.46, "learning_rate": 4e-05, "loss": 5.3524, "step": 8000 }, { "epoch": 1.55, "learning_rate": 4.2495e-05, "loss": 5.338, "step": 8500 }, { "epoch": 1.64, "learning_rate": 4.4995000000000005e-05, "loss": 5.3205, "step": 9000 }, { "epoch": 1.73, "learning_rate": 4.7495e-05, "loss": 5.3096, "step": 9500 }, { "epoch": 1.82, "learning_rate": 4.9995000000000005e-05, "loss": 5.2971, "step": 10000 }, { "epoch": 1.92, "learning_rate": 4.998859263331501e-05, "loss": 5.2822, "step": 10500 }, { "epoch": 2.01, "learning_rate": 4.9977139453912406e-05, "loss": 5.2716, "step": 11000 }, { "epoch": 2.1, "learning_rate": 4.99656862745098e-05, "loss": 5.2599, "step": 11500 }, { "epoch": 2.19, "learning_rate": 4.9954233095107204e-05, "loss": 5.2516, "step": 12000 }, { "epoch": 2.28, "learning_rate": 4.99427799157046e-05, "loss": 5.2391, "step": 12500 }, { "epoch": 2.37, "learning_rate": 4.9931326736301996e-05, "loss": 5.2352, "step": 13000 }, { "epoch": 2.46, "learning_rate": 4.99198964632582e-05, "loss": 5.2247, "step": 13500 }, { "epoch": 2.55, "learning_rate": 4.99084432838556e-05, "loss": 5.2168, "step": 14000 }, { "epoch": 2.65, "learning_rate": 4.9896990104453e-05, "loss": 5.2091, "step": 14500 }, { "epoch": 2.74, "learning_rate": 4.9885536925050395e-05, "loss": 5.2037, "step": 15000 }, { "epoch": 2.83, "learning_rate": 4.98741066520066e-05, "loss": 5.196, "step": 15500 }, { "epoch": 2.92, "learning_rate": 4.9862653472603996e-05, "loss": 5.1892, "step": 16000 }, { "epoch": 3.01, "learning_rate": 4.985120029320139e-05, "loss": 5.1825, "step": 16500 }, { "epoch": 3.1, "learning_rate": 4.98397700201576e-05, "loss": 5.1753, "step": 17000 }, { "epoch": 3.19, "learning_rate": 4.9828316840755e-05, "loss": 5.1722, "step": 17500 }, { "epoch": 3.28, "learning_rate": 4.9816863661352395e-05, "loss": 5.1653, "step": 18000 }, { "epoch": 3.37, "learning_rate": 4.980541048194979e-05, "loss": 5.1603, "step": 18500 }, { "epoch": 3.47, "learning_rate": 4.9793980208905996e-05, "loss": 5.1547, "step": 19000 }, { "epoch": 3.56, "learning_rate": 4.97825270295034e-05, "loss": 5.151, "step": 19500 }, { "epoch": 3.65, "learning_rate": 4.9771073850100794e-05, "loss": 5.1468, "step": 20000 }, { "epoch": 3.74, "learning_rate": 4.975962067069819e-05, "loss": 5.1434, "step": 20500 }, { "epoch": 3.83, "learning_rate": 4.974816749129559e-05, "loss": 5.1388, "step": 21000 }, { "epoch": 3.92, "learning_rate": 4.973671431189299e-05, "loss": 5.133, "step": 21500 }, { "epoch": 4.01, "learning_rate": 4.9725284038849186e-05, "loss": 5.1264, "step": 22000 }, { "epoch": 4.1, "learning_rate": 4.971383085944659e-05, "loss": 5.1248, "step": 22500 }, { "epoch": 4.2, "learning_rate": 4.9702377680043984e-05, "loss": 5.1185, "step": 23000 }, { "epoch": 4.29, "learning_rate": 4.969092450064138e-05, "loss": 5.1139, "step": 23500 }, { "epoch": 4.38, "learning_rate": 4.967947132123878e-05, "loss": 5.1114, "step": 24000 }, { "epoch": 4.47, "learning_rate": 4.966806395455378e-05, "loss": 5.1084, "step": 24500 }, { "epoch": 4.56, "learning_rate": 4.9656610775151186e-05, "loss": 5.1041, "step": 25000 }, { "epoch": 4.65, "learning_rate": 4.964515759574858e-05, "loss": 5.102, "step": 25500 }, { "epoch": 4.74, "learning_rate": 4.963370441634598e-05, "loss": 5.1012, "step": 26000 }, { "epoch": 4.83, "learning_rate": 4.962225123694338e-05, "loss": 5.0961, "step": 26500 }, { "epoch": 4.93, "learning_rate": 4.9610798057540775e-05, "loss": 5.0918, "step": 27000 }, { "epoch": 5.02, "learning_rate": 4.959934487813817e-05, "loss": 5.0823, "step": 27500 }, { "epoch": 5.11, "learning_rate": 4.958789169873557e-05, "loss": 4.7898, "step": 28000 }, { "epoch": 5.2, "learning_rate": 4.957643851933297e-05, "loss": 4.47, "step": 28500 }, { "epoch": 5.29, "learning_rate": 4.9564985339930364e-05, "loss": 4.1839, "step": 29000 }, { "epoch": 5.38, "learning_rate": 4.9553532160527766e-05, "loss": 3.9283, "step": 29500 }, { "epoch": 5.47, "learning_rate": 4.9542101887483965e-05, "loss": 3.3536, "step": 30000 }, { "epoch": 5.56, "learning_rate": 4.953067161444017e-05, "loss": 2.7553, "step": 30500 }, { "epoch": 5.65, "learning_rate": 4.9519218435037566e-05, "loss": 2.3501, "step": 31000 }, { "epoch": 5.75, "learning_rate": 4.950776525563497e-05, "loss": 1.9139, "step": 31500 }, { "epoch": 5.84, "learning_rate": 4.9496312076232364e-05, "loss": 1.6857, "step": 32000 }, { "epoch": 5.93, "learning_rate": 4.948485889682976e-05, "loss": 1.5418, "step": 32500 }, { "epoch": 6.02, "learning_rate": 4.947340571742716e-05, "loss": 1.4448, "step": 33000 }, { "epoch": 6.11, "learning_rate": 4.946195253802456e-05, "loss": 1.3729, "step": 33500 }, { "epoch": 6.2, "learning_rate": 4.945049935862195e-05, "loss": 1.3178, "step": 34000 }, { "epoch": 6.29, "learning_rate": 4.9439046179219356e-05, "loss": 1.2557, "step": 34500 }, { "epoch": 6.38, "learning_rate": 4.942759299981675e-05, "loss": 1.1956, "step": 35000 }, { "epoch": 6.48, "learning_rate": 4.941613982041415e-05, "loss": 1.1306, "step": 35500 }, { "epoch": 6.57, "learning_rate": 4.940470954737035e-05, "loss": 1.0845, "step": 36000 }, { "epoch": 6.66, "learning_rate": 4.939325636796775e-05, "loss": 1.0483, "step": 36500 }, { "epoch": 6.75, "learning_rate": 4.9381803188565143e-05, "loss": 1.0169, "step": 37000 }, { "epoch": 6.84, "learning_rate": 4.9370350009162546e-05, "loss": 0.9886, "step": 37500 }, { "epoch": 6.93, "learning_rate": 4.9358919736118744e-05, "loss": 0.966, "step": 38000 }, { "epoch": 7.02, "learning_rate": 4.934746655671615e-05, "loss": 0.9429, "step": 38500 }, { "epoch": 7.11, "learning_rate": 4.933601337731354e-05, "loss": 0.9261, "step": 39000 }, { "epoch": 7.21, "learning_rate": 4.932456019791094e-05, "loss": 0.908, "step": 39500 }, { "epoch": 7.3, "learning_rate": 4.9313129924867143e-05, "loss": 0.8914, "step": 40000 }, { "epoch": 7.39, "learning_rate": 4.930167674546454e-05, "loss": 0.8789, "step": 40500 }, { "epoch": 7.48, "learning_rate": 4.929022356606194e-05, "loss": 0.8643, "step": 41000 }, { "epoch": 7.57, "learning_rate": 4.927877038665934e-05, "loss": 0.8531, "step": 41500 }, { "epoch": 7.66, "learning_rate": 4.926731720725673e-05, "loss": 0.8418, "step": 42000 }, { "epoch": 7.75, "learning_rate": 4.925588693421294e-05, "loss": 0.83, "step": 42500 }, { "epoch": 7.84, "learning_rate": 4.924443375481034e-05, "loss": 0.8201, "step": 43000 }, { "epoch": 7.94, "learning_rate": 4.9232980575407736e-05, "loss": 0.8115, "step": 43500 }, { "epoch": 8.03, "learning_rate": 4.922152739600513e-05, "loss": 0.803, "step": 44000 }, { "epoch": 8.12, "learning_rate": 4.9210074216602534e-05, "loss": 0.7935, "step": 44500 }, { "epoch": 8.21, "learning_rate": 4.919862103719993e-05, "loss": 0.7857, "step": 45000 }, { "epoch": 8.3, "learning_rate": 4.9187190764156135e-05, "loss": 0.7753, "step": 45500 }, { "epoch": 8.39, "learning_rate": 4.917573758475353e-05, "loss": 0.7683, "step": 46000 }, { "epoch": 8.48, "learning_rate": 4.916428440535093e-05, "loss": 0.761, "step": 46500 }, { "epoch": 8.57, "learning_rate": 4.915283122594833e-05, "loss": 0.7549, "step": 47000 }, { "epoch": 8.66, "learning_rate": 4.914140095290453e-05, "loss": 0.7478, "step": 47500 }, { "epoch": 8.76, "learning_rate": 4.912994777350193e-05, "loss": 0.7404, "step": 48000 }, { "epoch": 8.85, "learning_rate": 4.9118494594099325e-05, "loss": 0.7335, "step": 48500 }, { "epoch": 8.94, "learning_rate": 4.910704141469672e-05, "loss": 0.7283, "step": 49000 }, { "epoch": 9.03, "learning_rate": 4.9095611141652926e-05, "loss": 0.7231, "step": 49500 }, { "epoch": 9.12, "learning_rate": 4.908415796225032e-05, "loss": 0.7156, "step": 50000 }, { "epoch": 9.21, "learning_rate": 4.9072704782847724e-05, "loss": 0.7099, "step": 50500 }, { "epoch": 9.3, "learning_rate": 4.906125160344512e-05, "loss": 0.7071, "step": 51000 }, { "epoch": 9.39, "learning_rate": 4.9049798424042515e-05, "loss": 0.702, "step": 51500 }, { "epoch": 9.49, "learning_rate": 4.903836815099872e-05, "loss": 0.6982, "step": 52000 }, { "epoch": 9.58, "learning_rate": 4.9026914971596116e-05, "loss": 0.6933, "step": 52500 }, { "epoch": 9.67, "learning_rate": 4.901546179219352e-05, "loss": 0.6901, "step": 53000 }, { "epoch": 9.76, "learning_rate": 4.9004008612790914e-05, "loss": 0.6854, "step": 53500 }, { "epoch": 9.85, "learning_rate": 4.899255543338831e-05, "loss": 0.6813, "step": 54000 }, { "epoch": 9.94, "learning_rate": 4.8981125160344515e-05, "loss": 0.6762, "step": 54500 }, { "epoch": 10.03, "learning_rate": 4.896967198094191e-05, "loss": 0.6708, "step": 55000 }, { "epoch": 10.12, "learning_rate": 4.8958218801539307e-05, "loss": 0.6687, "step": 55500 }, { "epoch": 10.22, "learning_rate": 4.894676562213671e-05, "loss": 0.6641, "step": 56000 }, { "epoch": 10.31, "learning_rate": 4.8935312442734105e-05, "loss": 0.6594, "step": 56500 }, { "epoch": 10.4, "learning_rate": 4.892388216969031e-05, "loss": 0.6578, "step": 57000 }, { "epoch": 10.49, "learning_rate": 4.8912428990287706e-05, "loss": 0.6542, "step": 57500 }, { "epoch": 10.58, "learning_rate": 4.89009758108851e-05, "loss": 0.6509, "step": 58000 }, { "epoch": 10.67, "learning_rate": 4.8889522631482504e-05, "loss": 0.6471, "step": 58500 }, { "epoch": 10.76, "learning_rate": 4.88780694520799e-05, "loss": 0.6463, "step": 59000 }, { "epoch": 10.85, "learning_rate": 4.8866616272677295e-05, "loss": 0.6418, "step": 59500 }, { "epoch": 10.94, "learning_rate": 4.88551859996335e-05, "loss": 0.6391, "step": 60000 }, { "epoch": 11.04, "learning_rate": 4.8843732820230896e-05, "loss": 0.6354, "step": 60500 }, { "epoch": 11.13, "learning_rate": 4.88322796408283e-05, "loss": 0.6327, "step": 61000 }, { "epoch": 11.22, "learning_rate": 4.8820826461425694e-05, "loss": 0.6292, "step": 61500 }, { "epoch": 11.31, "learning_rate": 4.88094190947407e-05, "loss": 0.6258, "step": 62000 }, { "epoch": 11.4, "learning_rate": 4.879798882169691e-05, "loss": 0.6257, "step": 62500 }, { "epoch": 11.49, "learning_rate": 4.87865356422943e-05, "loss": 0.6221, "step": 63000 }, { "epoch": 11.58, "learning_rate": 4.8775082462891706e-05, "loss": 0.618, "step": 63500 }, { "epoch": 11.67, "learning_rate": 4.87636292834891e-05, "loss": 0.6156, "step": 64000 }, { "epoch": 11.77, "learning_rate": 4.87521761040865e-05, "loss": 0.614, "step": 64500 }, { "epoch": 11.86, "learning_rate": 4.87407229246839e-05, "loss": 0.612, "step": 65000 }, { "epoch": 11.95, "learning_rate": 4.8729269745281295e-05, "loss": 0.6096, "step": 65500 }, { "epoch": 12.04, "learning_rate": 4.871781656587869e-05, "loss": 0.6073, "step": 66000 }, { "epoch": 12.13, "learning_rate": 4.8706386292834896e-05, "loss": 0.6039, "step": 66500 }, { "epoch": 12.22, "learning_rate": 4.869493311343229e-05, "loss": 0.6033, "step": 67000 }, { "epoch": 12.31, "learning_rate": 4.8683479934029694e-05, "loss": 0.6005, "step": 67500 }, { "epoch": 12.4, "learning_rate": 4.867202675462709e-05, "loss": 0.5971, "step": 68000 }, { "epoch": 12.5, "learning_rate": 4.866059648158329e-05, "loss": 0.5933, "step": 68500 }, { "epoch": 12.59, "learning_rate": 4.864914330218069e-05, "loss": 0.5947, "step": 69000 }, { "epoch": 12.68, "learning_rate": 4.8637690122778086e-05, "loss": 0.5928, "step": 69500 }, { "epoch": 12.77, "learning_rate": 4.862623694337549e-05, "loss": 0.5897, "step": 70000 }, { "epoch": 12.86, "learning_rate": 4.8614783763972884e-05, "loss": 0.588, "step": 70500 }, { "epoch": 12.95, "learning_rate": 4.860333058457028e-05, "loss": 0.5862, "step": 71000 }, { "epoch": 13.04, "learning_rate": 4.859187740516768e-05, "loss": 0.5835, "step": 71500 }, { "epoch": 13.13, "learning_rate": 4.858042422576508e-05, "loss": 0.5827, "step": 72000 }, { "epoch": 13.23, "learning_rate": 4.8569016859080086e-05, "loss": 0.581, "step": 72500 }, { "epoch": 13.32, "learning_rate": 4.855756367967748e-05, "loss": 0.5791, "step": 73000 }, { "epoch": 13.41, "learning_rate": 4.854611050027488e-05, "loss": 0.5764, "step": 73500 }, { "epoch": 13.5, "learning_rate": 4.853465732087228e-05, "loss": 0.5749, "step": 74000 }, { "epoch": 13.59, "learning_rate": 4.8523204141469675e-05, "loss": 0.5747, "step": 74500 }, { "epoch": 13.68, "learning_rate": 4.8511773868425874e-05, "loss": 0.5717, "step": 75000 }, { "epoch": 13.77, "learning_rate": 4.8500320689023276e-05, "loss": 0.5716, "step": 75500 }, { "epoch": 13.86, "learning_rate": 4.848886750962067e-05, "loss": 0.5691, "step": 76000 }, { "epoch": 13.95, "learning_rate": 4.847741433021807e-05, "loss": 0.5661, "step": 76500 }, { "epoch": 14.05, "learning_rate": 4.846596115081547e-05, "loss": 0.565, "step": 77000 }, { "epoch": 14.14, "learning_rate": 4.845453087777167e-05, "loss": 0.5627, "step": 77500 }, { "epoch": 14.23, "learning_rate": 4.844307769836907e-05, "loss": 0.563, "step": 78000 }, { "epoch": 14.32, "learning_rate": 4.8431624518966466e-05, "loss": 0.561, "step": 78500 }, { "epoch": 14.41, "learning_rate": 4.842017133956386e-05, "loss": 0.5599, "step": 79000 }, { "epoch": 14.5, "learning_rate": 4.8408718160161264e-05, "loss": 0.5573, "step": 79500 }, { "epoch": 14.59, "learning_rate": 4.839726498075866e-05, "loss": 0.5558, "step": 80000 }, { "epoch": 14.68, "learning_rate": 4.8385834707714865e-05, "loss": 0.5559, "step": 80500 }, { "epoch": 14.78, "learning_rate": 4.837438152831226e-05, "loss": 0.5546, "step": 81000 }, { "epoch": 14.87, "learning_rate": 4.8362928348909656e-05, "loss": 0.5531, "step": 81500 }, { "epoch": 14.96, "learning_rate": 4.835147516950706e-05, "loss": 0.5493, "step": 82000 }, { "epoch": 15.05, "learning_rate": 4.8340021990104454e-05, "loss": 0.5483, "step": 82500 }, { "epoch": 15.14, "learning_rate": 4.832859171706065e-05, "loss": 0.548, "step": 83000 }, { "epoch": 15.23, "learning_rate": 4.8317138537658055e-05, "loss": 0.5462, "step": 83500 }, { "epoch": 15.32, "learning_rate": 4.830568535825545e-05, "loss": 0.5454, "step": 84000 }, { "epoch": 15.41, "learning_rate": 4.829423217885285e-05, "loss": 0.5428, "step": 84500 }, { "epoch": 15.51, "learning_rate": 4.828277899945025e-05, "loss": 0.5428, "step": 85000 }, { "epoch": 15.6, "learning_rate": 4.8271325820047645e-05, "loss": 0.5424, "step": 85500 }, { "epoch": 15.69, "learning_rate": 4.825987264064504e-05, "loss": 0.5421, "step": 86000 }, { "epoch": 15.78, "learning_rate": 4.824841946124244e-05, "loss": 0.5382, "step": 86500 }, { "epoch": 15.87, "learning_rate": 4.823698918819865e-05, "loss": 0.5373, "step": 87000 }, { "epoch": 15.96, "learning_rate": 4.8225536008796044e-05, "loss": 0.5359, "step": 87500 }, { "epoch": 16.05, "learning_rate": 4.821408282939344e-05, "loss": 0.5357, "step": 88000 }, { "epoch": 16.14, "learning_rate": 4.8202629649990835e-05, "loss": 0.5351, "step": 88500 }, { "epoch": 16.23, "learning_rate": 4.819117647058824e-05, "loss": 0.5317, "step": 89000 }, { "epoch": 16.33, "learning_rate": 4.817972329118563e-05, "loss": 0.5333, "step": 89500 }, { "epoch": 16.42, "learning_rate": 4.816829301814184e-05, "loss": 0.5309, "step": 90000 }, { "epoch": 16.51, "learning_rate": 4.815683983873924e-05, "loss": 0.5308, "step": 90500 }, { "epoch": 16.6, "learning_rate": 4.8145386659336636e-05, "loss": 0.5289, "step": 91000 }, { "epoch": 16.69, "learning_rate": 4.813393347993403e-05, "loss": 0.5275, "step": 91500 }, { "epoch": 16.78, "learning_rate": 4.812248030053143e-05, "loss": 0.5269, "step": 92000 }, { "epoch": 16.87, "learning_rate": 4.811102712112883e-05, "loss": 0.5248, "step": 92500 }, { "epoch": 16.96, "learning_rate": 4.8099573941726225e-05, "loss": 0.5242, "step": 93000 }, { "epoch": 17.06, "learning_rate": 4.808814366868243e-05, "loss": 0.5238, "step": 93500 }, { "epoch": 17.15, "learning_rate": 4.8076690489279826e-05, "loss": 0.5239, "step": 94000 }, { "epoch": 17.24, "learning_rate": 4.806523730987723e-05, "loss": 0.5221, "step": 94500 }, { "epoch": 17.33, "learning_rate": 4.8053784130474624e-05, "loss": 0.52, "step": 95000 }, { "epoch": 17.42, "learning_rate": 4.804233095107202e-05, "loss": 0.5184, "step": 95500 }, { "epoch": 17.51, "learning_rate": 4.8030900678028225e-05, "loss": 0.5186, "step": 96000 }, { "epoch": 17.6, "learning_rate": 4.801944749862562e-05, "loss": 0.5176, "step": 96500 }, { "epoch": 17.69, "learning_rate": 4.8008017225581826e-05, "loss": 0.5182, "step": 97000 }, { "epoch": 17.79, "learning_rate": 4.799656404617922e-05, "loss": 0.5148, "step": 97500 }, { "epoch": 17.88, "learning_rate": 4.798511086677662e-05, "loss": 0.5157, "step": 98000 }, { "epoch": 17.97, "learning_rate": 4.797365768737402e-05, "loss": 0.5131, "step": 98500 } ], "max_steps": 2192800, "num_train_epochs": 400, "total_flos": 2.659426878192668e+19, "trial_name": null, "trial_params": null }