{ "best_metric": null, "best_model_checkpoint": null, "epoch": 24.414529914529915, "eval_steps": 500, "global_step": 18018, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "learning_rate": 2.5527192008879026e-06, "loss": 0.8199, "step": 46 }, { "epoch": 0.39, "learning_rate": 5.105438401775805e-06, "loss": 0.8791, "step": 92 }, { "epoch": 0.59, "learning_rate": 7.658157602663706e-06, "loss": 0.7995, "step": 138 }, { "epoch": 0.79, "learning_rate": 1.021087680355161e-05, "loss": 0.7234, "step": 184 }, { "epoch": 0.98, "learning_rate": 1.2763596004439513e-05, "loss": 0.6077, "step": 230 }, { "epoch": 1.18, "learning_rate": 1.5316315205327412e-05, "loss": 0.5855, "step": 276 }, { "epoch": 1.38, "learning_rate": 1.786903440621532e-05, "loss": 0.5165, "step": 322 }, { "epoch": 1.57, "learning_rate": 2.042175360710322e-05, "loss": 0.5176, "step": 368 }, { "epoch": 1.77, "learning_rate": 2.297447280799112e-05, "loss": 0.4479, "step": 414 }, { "epoch": 1.97, "learning_rate": 2.5527192008879026e-05, "loss": 0.4451, "step": 460 }, { "epoch": 2.16, "learning_rate": 2.807991120976693e-05, "loss": 0.4133, "step": 506 }, { "epoch": 2.36, "learning_rate": 3.0632630410654825e-05, "loss": 0.3851, "step": 552 }, { "epoch": 2.56, "learning_rate": 3.3185349611542734e-05, "loss": 0.3973, "step": 598 }, { "epoch": 2.75, "learning_rate": 3.573806881243064e-05, "loss": 0.3546, "step": 644 }, { "epoch": 2.95, "learning_rate": 3.829078801331853e-05, "loss": 0.329, "step": 690 }, { "epoch": 3.15, "learning_rate": 4.084350721420644e-05, "loss": 0.3476, "step": 736 }, { "epoch": 1.06, "learning_rate": 4.3396226415094345e-05, "loss": 0.3039, "step": 782 }, { "epoch": 1.25, "learning_rate": 4.594894561598224e-05, "loss": 0.3156, "step": 828 }, { "epoch": 1.45, "learning_rate": 4.850166481687014e-05, "loss": 0.3129, "step": 874 }, { "epoch": 1.65, "learning_rate": 5.105438401775805e-05, "loss": 0.2968, "step": 920 }, { "epoch": 1.84, "learning_rate": 5.360710321864595e-05, "loss": 0.3168, "step": 966 }, { "epoch": 2.04, "learning_rate": 5.615982241953386e-05, "loss": 0.3347, "step": 1012 }, { "epoch": 2.24, "learning_rate": 5.871254162042176e-05, "loss": 0.3011, "step": 1058 }, { "epoch": 2.43, "learning_rate": 6.126526082130965e-05, "loss": 0.2856, "step": 1104 }, { "epoch": 2.63, "learning_rate": 6.381798002219757e-05, "loss": 0.2626, "step": 1150 }, { "epoch": 2.82, "learning_rate": 6.637069922308547e-05, "loss": 0.305, "step": 1196 }, { "epoch": 3.02, "learning_rate": 6.892341842397336e-05, "loss": 0.2895, "step": 1242 }, { "epoch": 3.22, "learning_rate": 7.147613762486127e-05, "loss": 0.2791, "step": 1288 }, { "epoch": 3.41, "learning_rate": 7.402885682574918e-05, "loss": 0.2542, "step": 1334 }, { "epoch": 3.61, "learning_rate": 7.658157602663707e-05, "loss": 0.2776, "step": 1380 }, { "epoch": 3.81, "learning_rate": 7.913429522752498e-05, "loss": 0.2573, "step": 1426 }, { "epoch": 4.0, "learning_rate": 8.168701442841288e-05, "loss": 0.2282, "step": 1472 }, { "epoch": 4.2, "learning_rate": 8.423973362930077e-05, "loss": 0.2645, "step": 1518 }, { "epoch": 2.11, "learning_rate": 8.679245283018869e-05, "loss": 0.2183, "step": 1564 }, { "epoch": 2.31, "learning_rate": 8.934517203107659e-05, "loss": 0.2505, "step": 1610 }, { "epoch": 2.5, "learning_rate": 9.189789123196448e-05, "loss": 0.2229, "step": 1656 }, { "epoch": 2.7, "learning_rate": 9.44506104328524e-05, "loss": 0.2404, "step": 1702 }, { "epoch": 2.9, "learning_rate": 9.700332963374029e-05, "loss": 0.2336, "step": 1748 }, { "epoch": 3.09, "learning_rate": 9.955604883462819e-05, "loss": 0.2178, "step": 1794 }, { "epoch": 3.29, "learning_rate": 9.976566354218057e-05, "loss": 0.1597, "step": 1840 }, { "epoch": 3.49, "learning_rate": 9.948199309324125e-05, "loss": 0.1837, "step": 1886 }, { "epoch": 3.68, "learning_rate": 9.919832264430194e-05, "loss": 0.2176, "step": 1932 }, { "epoch": 3.88, "learning_rate": 9.891465219536261e-05, "loss": 0.1992, "step": 1978 }, { "epoch": 4.08, "learning_rate": 9.863098174642329e-05, "loss": 0.1929, "step": 2024 }, { "epoch": 4.27, "learning_rate": 9.834731129748396e-05, "loss": 0.2017, "step": 2070 }, { "epoch": 4.47, "learning_rate": 9.806364084854464e-05, "loss": 0.1916, "step": 2116 }, { "epoch": 4.67, "learning_rate": 9.777997039960533e-05, "loss": 0.205, "step": 2162 }, { "epoch": 4.86, "learning_rate": 9.749629995066602e-05, "loss": 0.2272, "step": 2208 }, { "epoch": 5.06, "learning_rate": 9.72126295017267e-05, "loss": 0.2049, "step": 2254 }, { "epoch": 5.26, "learning_rate": 9.692895905278737e-05, "loss": 0.172, "step": 2300 }, { "epoch": 3.17, "learning_rate": 9.664528860384806e-05, "loss": 0.1737, "step": 2346 }, { "epoch": 3.36, "learning_rate": 9.636161815490874e-05, "loss": 0.1516, "step": 2392 }, { "epoch": 3.56, "learning_rate": 9.607794770596941e-05, "loss": 0.1686, "step": 2438 }, { "epoch": 3.76, "learning_rate": 9.579427725703009e-05, "loss": 0.1555, "step": 2484 }, { "epoch": 3.95, "learning_rate": 9.551060680809078e-05, "loss": 0.1519, "step": 2530 }, { "epoch": 4.15, "learning_rate": 9.522693635915147e-05, "loss": 0.1844, "step": 2576 }, { "epoch": 4.35, "learning_rate": 9.494326591021215e-05, "loss": 0.1493, "step": 2622 }, { "epoch": 4.54, "learning_rate": 9.465959546127282e-05, "loss": 0.155, "step": 2668 }, { "epoch": 4.74, "learning_rate": 9.43759250123335e-05, "loss": 0.153, "step": 2714 }, { "epoch": 4.94, "learning_rate": 9.409225456339417e-05, "loss": 0.157, "step": 2760 }, { "epoch": 5.13, "learning_rate": 9.380858411445486e-05, "loss": 0.1439, "step": 2806 }, { "epoch": 5.33, "learning_rate": 9.352491366551554e-05, "loss": 0.1429, "step": 2852 }, { "epoch": 5.53, "learning_rate": 9.324124321657623e-05, "loss": 0.1609, "step": 2898 }, { "epoch": 5.72, "learning_rate": 9.29575727676369e-05, "loss": 0.1664, "step": 2944 }, { "epoch": 5.92, "learning_rate": 9.26739023186976e-05, "loss": 0.1534, "step": 2990 }, { "epoch": 6.12, "learning_rate": 9.239023186975827e-05, "loss": 0.1518, "step": 3036 }, { "epoch": 4.03, "learning_rate": 9.210656142081895e-05, "loss": 0.1435, "step": 3082 }, { "epoch": 4.22, "learning_rate": 9.182289097187962e-05, "loss": 0.1335, "step": 3128 }, { "epoch": 4.42, "learning_rate": 9.15392205229403e-05, "loss": 0.1231, "step": 3174 }, { "epoch": 4.62, "learning_rate": 9.125555007400099e-05, "loss": 0.1237, "step": 3220 }, { "epoch": 4.81, "learning_rate": 9.097187962506168e-05, "loss": 0.1312, "step": 3266 }, { "epoch": 5.01, "learning_rate": 9.068820917612236e-05, "loss": 0.133, "step": 3312 }, { "epoch": 5.21, "learning_rate": 9.040453872718303e-05, "loss": 0.1324, "step": 3358 }, { "epoch": 5.4, "learning_rate": 9.012086827824372e-05, "loss": 0.1232, "step": 3404 }, { "epoch": 5.6, "learning_rate": 8.98371978293044e-05, "loss": 0.1104, "step": 3450 }, { "epoch": 5.79, "learning_rate": 8.955352738036507e-05, "loss": 0.1166, "step": 3496 }, { "epoch": 5.99, "learning_rate": 8.926985693142575e-05, "loss": 0.1114, "step": 3542 }, { "epoch": 6.19, "learning_rate": 8.898618648248643e-05, "loss": 0.1276, "step": 3588 }, { "epoch": 6.38, "learning_rate": 8.870251603354712e-05, "loss": 0.127, "step": 3634 }, { "epoch": 6.58, "learning_rate": 8.84188455846078e-05, "loss": 0.1255, "step": 3680 }, { "epoch": 6.78, "learning_rate": 8.813517513566848e-05, "loss": 0.1269, "step": 3726 }, { "epoch": 6.97, "learning_rate": 8.785150468672916e-05, "loss": 0.1339, "step": 3772 }, { "epoch": 7.17, "learning_rate": 8.756783423778983e-05, "loss": 0.1326, "step": 3818 }, { "epoch": 5.08, "learning_rate": 8.728416378885052e-05, "loss": 0.1203, "step": 3864 }, { "epoch": 5.28, "learning_rate": 8.70004933399112e-05, "loss": 0.1026, "step": 3910 }, { "epoch": 5.47, "learning_rate": 8.671682289097188e-05, "loss": 0.1252, "step": 3956 }, { "epoch": 5.67, "learning_rate": 8.643315244203257e-05, "loss": 0.1052, "step": 4002 }, { "epoch": 5.87, "learning_rate": 8.614948199309326e-05, "loss": 0.1188, "step": 4048 }, { "epoch": 6.06, "learning_rate": 8.586581154415393e-05, "loss": 0.1113, "step": 4094 }, { "epoch": 6.26, "learning_rate": 8.558214109521461e-05, "loss": 0.0934, "step": 4140 }, { "epoch": 6.46, "learning_rate": 8.529847064627528e-05, "loss": 0.1168, "step": 4186 }, { "epoch": 6.65, "learning_rate": 8.501480019733596e-05, "loss": 0.1113, "step": 4232 }, { "epoch": 6.85, "learning_rate": 8.473112974839665e-05, "loss": 0.0944, "step": 4278 }, { "epoch": 7.05, "learning_rate": 8.444745929945733e-05, "loss": 0.0956, "step": 4324 }, { "epoch": 7.24, "learning_rate": 8.416378885051802e-05, "loss": 0.092, "step": 4370 }, { "epoch": 7.44, "learning_rate": 8.388011840157869e-05, "loss": 0.0899, "step": 4416 }, { "epoch": 7.64, "learning_rate": 8.359644795263937e-05, "loss": 0.111, "step": 4462 }, { "epoch": 7.83, "learning_rate": 8.331277750370006e-05, "loss": 0.114, "step": 4508 }, { "epoch": 8.03, "learning_rate": 8.302910705476073e-05, "loss": 0.0906, "step": 4554 }, { "epoch": 8.23, "learning_rate": 8.274543660582141e-05, "loss": 0.1053, "step": 4600 }, { "epoch": 6.14, "learning_rate": 8.246176615688209e-05, "loss": 0.0983, "step": 4646 }, { "epoch": 6.33, "learning_rate": 8.217809570794278e-05, "loss": 0.0864, "step": 4692 }, { "epoch": 6.53, "learning_rate": 8.189442525900347e-05, "loss": 0.0857, "step": 4738 }, { "epoch": 6.73, "learning_rate": 8.161075481006414e-05, "loss": 0.0979, "step": 4784 }, { "epoch": 6.92, "learning_rate": 8.132708436112482e-05, "loss": 0.0919, "step": 4830 }, { "epoch": 7.12, "learning_rate": 8.10434139121855e-05, "loss": 0.0758, "step": 4876 }, { "epoch": 7.32, "learning_rate": 8.075974346324618e-05, "loss": 0.0948, "step": 4922 }, { "epoch": 7.51, "learning_rate": 8.047607301430686e-05, "loss": 0.0856, "step": 4968 }, { "epoch": 7.71, "learning_rate": 8.019240256536754e-05, "loss": 0.0677, "step": 5014 }, { "epoch": 7.91, "learning_rate": 7.990873211642823e-05, "loss": 0.0855, "step": 5060 }, { "epoch": 8.1, "learning_rate": 7.96250616674889e-05, "loss": 0.0896, "step": 5106 }, { "epoch": 8.3, "learning_rate": 7.934139121854959e-05, "loss": 0.0999, "step": 5152 }, { "epoch": 8.5, "learning_rate": 7.905772076961027e-05, "loss": 0.0866, "step": 5198 }, { "epoch": 8.69, "learning_rate": 7.877405032067094e-05, "loss": 0.0821, "step": 5244 }, { "epoch": 8.89, "learning_rate": 7.849037987173162e-05, "loss": 0.0826, "step": 5290 }, { "epoch": 9.09, "learning_rate": 7.820670942279231e-05, "loss": 0.0872, "step": 5336 }, { "epoch": 9.28, "learning_rate": 7.792303897385299e-05, "loss": 0.1113, "step": 5382 }, { "epoch": 7.19, "learning_rate": 7.763936852491366e-05, "loss": 0.0628, "step": 5428 }, { "epoch": 7.39, "learning_rate": 7.735569807597435e-05, "loss": 0.0747, "step": 5474 }, { "epoch": 7.59, "learning_rate": 7.707202762703503e-05, "loss": 0.0653, "step": 5520 }, { "epoch": 7.78, "learning_rate": 7.678835717809572e-05, "loss": 0.0852, "step": 5566 }, { "epoch": 7.98, "learning_rate": 7.65046867291564e-05, "loss": 0.0661, "step": 5612 }, { "epoch": 8.18, "learning_rate": 7.622101628021707e-05, "loss": 0.0683, "step": 5658 }, { "epoch": 8.37, "learning_rate": 7.593734583127775e-05, "loss": 0.0865, "step": 5704 }, { "epoch": 8.57, "learning_rate": 7.565367538233844e-05, "loss": 0.0722, "step": 5750 }, { "epoch": 8.76, "learning_rate": 7.537000493339911e-05, "loss": 0.0802, "step": 5796 }, { "epoch": 8.96, "learning_rate": 7.50863344844598e-05, "loss": 0.0693, "step": 5842 }, { "epoch": 9.16, "learning_rate": 7.480266403552048e-05, "loss": 0.0817, "step": 5888 }, { "epoch": 9.35, "learning_rate": 7.451899358658115e-05, "loss": 0.0844, "step": 5934 }, { "epoch": 9.55, "learning_rate": 7.423532313764184e-05, "loss": 0.0803, "step": 5980 }, { "epoch": 9.75, "learning_rate": 7.395165268870252e-05, "loss": 0.0815, "step": 6026 }, { "epoch": 9.94, "learning_rate": 7.36679822397632e-05, "loss": 0.0871, "step": 6072 }, { "epoch": 10.14, "learning_rate": 7.338431179082387e-05, "loss": 0.0763, "step": 6118 }, { "epoch": 8.05, "learning_rate": 7.310064134188456e-05, "loss": 0.065, "step": 6164 }, { "epoch": 8.25, "learning_rate": 7.281697089294525e-05, "loss": 0.0629, "step": 6210 }, { "epoch": 8.44, "learning_rate": 7.253330044400593e-05, "loss": 0.0712, "step": 6256 }, { "epoch": 8.64, "learning_rate": 7.22496299950666e-05, "loss": 0.0645, "step": 6302 }, { "epoch": 8.84, "learning_rate": 7.196595954612728e-05, "loss": 0.0725, "step": 6348 }, { "epoch": 9.03, "learning_rate": 7.168228909718797e-05, "loss": 0.0573, "step": 6394 }, { "epoch": 9.23, "learning_rate": 7.139861864824865e-05, "loss": 0.0621, "step": 6440 }, { "epoch": 9.43, "learning_rate": 7.111494819930932e-05, "loss": 0.06, "step": 6486 }, { "epoch": 9.62, "learning_rate": 7.083127775037001e-05, "loss": 0.0646, "step": 6532 }, { "epoch": 9.82, "learning_rate": 7.054760730143069e-05, "loss": 0.0575, "step": 6578 }, { "epoch": 10.02, "learning_rate": 7.026393685249138e-05, "loss": 0.0634, "step": 6624 }, { "epoch": 10.21, "learning_rate": 6.998026640355205e-05, "loss": 0.0568, "step": 6670 }, { "epoch": 10.41, "learning_rate": 6.969659595461273e-05, "loss": 0.0652, "step": 6716 }, { "epoch": 10.61, "learning_rate": 6.941292550567341e-05, "loss": 0.0795, "step": 6762 }, { "epoch": 10.8, "learning_rate": 6.912925505673408e-05, "loss": 0.064, "step": 6808 }, { "epoch": 11.0, "learning_rate": 6.884558460779477e-05, "loss": 0.0846, "step": 6854 }, { "epoch": 11.2, "learning_rate": 6.856191415885546e-05, "loss": 0.0587, "step": 6900 }, { "epoch": 9.11, "learning_rate": 6.827824370991614e-05, "loss": 0.0663, "step": 6946 }, { "epoch": 9.3, "learning_rate": 6.799457326097681e-05, "loss": 0.0534, "step": 6992 }, { "epoch": 9.5, "learning_rate": 6.77109028120375e-05, "loss": 0.0466, "step": 7038 }, { "epoch": 9.7, "learning_rate": 6.742723236309818e-05, "loss": 0.0529, "step": 7084 }, { "epoch": 9.89, "learning_rate": 6.714356191415886e-05, "loss": 0.0595, "step": 7130 }, { "epoch": 10.09, "learning_rate": 6.685989146521953e-05, "loss": 0.0547, "step": 7176 }, { "epoch": 10.29, "learning_rate": 6.657622101628021e-05, "loss": 0.0571, "step": 7222 }, { "epoch": 10.48, "learning_rate": 6.62925505673409e-05, "loss": 0.0601, "step": 7268 }, { "epoch": 10.68, "learning_rate": 6.600888011840159e-05, "loss": 0.0508, "step": 7314 }, { "epoch": 10.88, "learning_rate": 6.572520966946226e-05, "loss": 0.0572, "step": 7360 }, { "epoch": 11.07, "learning_rate": 6.544153922052294e-05, "loss": 0.0591, "step": 7406 }, { "epoch": 11.27, "learning_rate": 6.515786877158362e-05, "loss": 0.0708, "step": 7452 }, { "epoch": 11.47, "learning_rate": 6.487419832264431e-05, "loss": 0.0711, "step": 7498 }, { "epoch": 11.66, "learning_rate": 6.459052787370498e-05, "loss": 0.0514, "step": 7544 }, { "epoch": 11.86, "learning_rate": 6.430685742476566e-05, "loss": 0.0607, "step": 7590 }, { "epoch": 12.06, "learning_rate": 6.402318697582635e-05, "loss": 0.0491, "step": 7636 }, { "epoch": 12.25, "learning_rate": 6.373951652688704e-05, "loss": 0.0577, "step": 7682 }, { "epoch": 10.16, "learning_rate": 6.345584607794771e-05, "loss": 0.0416, "step": 7728 }, { "epoch": 10.36, "learning_rate": 6.317217562900839e-05, "loss": 0.0572, "step": 7774 }, { "epoch": 10.56, "learning_rate": 6.288850518006907e-05, "loss": 0.0554, "step": 7820 }, { "epoch": 10.75, "learning_rate": 6.260483473112974e-05, "loss": 0.0457, "step": 7866 }, { "epoch": 10.95, "learning_rate": 6.232116428219043e-05, "loss": 0.0528, "step": 7912 }, { "epoch": 11.15, "learning_rate": 6.203749383325111e-05, "loss": 0.0524, "step": 7958 }, { "epoch": 11.34, "learning_rate": 6.17538233843118e-05, "loss": 0.0631, "step": 8004 }, { "epoch": 11.54, "learning_rate": 6.147015293537247e-05, "loss": 0.042, "step": 8050 }, { "epoch": 11.74, "learning_rate": 6.118648248643315e-05, "loss": 0.0422, "step": 8096 }, { "epoch": 11.93, "learning_rate": 6.090281203749384e-05, "loss": 0.0511, "step": 8142 }, { "epoch": 12.13, "learning_rate": 6.061914158855452e-05, "loss": 0.0526, "step": 8188 }, { "epoch": 12.32, "learning_rate": 6.033547113961519e-05, "loss": 0.0454, "step": 8234 }, { "epoch": 12.52, "learning_rate": 6.0051800690675876e-05, "loss": 0.0462, "step": 8280 }, { "epoch": 12.72, "learning_rate": 5.9768130241736566e-05, "loss": 0.0499, "step": 8326 }, { "epoch": 12.91, "learning_rate": 5.948445979279724e-05, "loss": 0.0444, "step": 8372 }, { "epoch": 13.11, "learning_rate": 5.920078934385792e-05, "loss": 0.0462, "step": 8418 }, { "epoch": 11.02, "learning_rate": 5.89171188949186e-05, "loss": 0.0443, "step": 8464 }, { "epoch": 11.22, "learning_rate": 5.863344844597928e-05, "loss": 0.0394, "step": 8510 }, { "epoch": 11.41, "learning_rate": 5.834977799703997e-05, "loss": 0.0463, "step": 8556 }, { "epoch": 11.61, "learning_rate": 5.806610754810064e-05, "loss": 0.0396, "step": 8602 }, { "epoch": 11.81, "learning_rate": 5.7782437099161326e-05, "loss": 0.0409, "step": 8648 }, { "epoch": 12.0, "learning_rate": 5.7498766650222e-05, "loss": 0.0407, "step": 8694 }, { "epoch": 12.2, "learning_rate": 5.721509620128269e-05, "loss": 0.0442, "step": 8740 }, { "epoch": 12.4, "learning_rate": 5.693142575234337e-05, "loss": 0.0437, "step": 8786 }, { "epoch": 12.59, "learning_rate": 5.664775530340405e-05, "loss": 0.0368, "step": 8832 }, { "epoch": 12.79, "learning_rate": 5.636408485446473e-05, "loss": 0.0418, "step": 8878 }, { "epoch": 12.99, "learning_rate": 5.60804144055254e-05, "loss": 0.0412, "step": 8924 }, { "epoch": 13.18, "learning_rate": 5.579674395658609e-05, "loss": 0.0385, "step": 8970 }, { "epoch": 13.38, "learning_rate": 5.5513073507646776e-05, "loss": 0.0469, "step": 9016 }, { "epoch": 13.58, "learning_rate": 5.522940305870745e-05, "loss": 0.0402, "step": 9062 }, { "epoch": 13.77, "learning_rate": 5.494573260976813e-05, "loss": 0.0514, "step": 9108 }, { "epoch": 13.97, "learning_rate": 5.466206216082881e-05, "loss": 0.045, "step": 9154 }, { "epoch": 14.17, "learning_rate": 5.43783917118895e-05, "loss": 0.0523, "step": 9200 }, { "epoch": 12.08, "learning_rate": 5.409472126295018e-05, "loss": 0.0338, "step": 9246 }, { "epoch": 12.27, "learning_rate": 5.381105081401085e-05, "loss": 0.0358, "step": 9292 }, { "epoch": 12.47, "learning_rate": 5.3527380365071536e-05, "loss": 0.0387, "step": 9338 }, { "epoch": 12.67, "learning_rate": 5.3243709916132226e-05, "loss": 0.0289, "step": 9384 }, { "epoch": 12.86, "learning_rate": 5.29600394671929e-05, "loss": 0.0396, "step": 9430 }, { "epoch": 13.06, "learning_rate": 5.267636901825358e-05, "loss": 0.0426, "step": 9476 }, { "epoch": 13.26, "learning_rate": 5.239269856931426e-05, "loss": 0.0345, "step": 9522 }, { "epoch": 13.45, "learning_rate": 5.210902812037494e-05, "loss": 0.0394, "step": 9568 }, { "epoch": 13.65, "learning_rate": 5.182535767143563e-05, "loss": 0.0349, "step": 9614 }, { "epoch": 13.85, "learning_rate": 5.15416872224963e-05, "loss": 0.0322, "step": 9660 }, { "epoch": 14.04, "learning_rate": 5.1258016773556986e-05, "loss": 0.0372, "step": 9706 }, { "epoch": 14.24, "learning_rate": 5.097434632461766e-05, "loss": 0.0334, "step": 9752 }, { "epoch": 14.44, "learning_rate": 5.069067587567834e-05, "loss": 0.0375, "step": 9798 }, { "epoch": 14.63, "learning_rate": 5.040700542673903e-05, "loss": 0.0396, "step": 9844 }, { "epoch": 14.83, "learning_rate": 5.0123334977799704e-05, "loss": 0.0356, "step": 9890 }, { "epoch": 15.03, "learning_rate": 4.983966452886039e-05, "loss": 0.0458, "step": 9936 }, { "epoch": 15.22, "learning_rate": 4.955599407992107e-05, "loss": 0.036, "step": 9982 }, { "epoch": 13.13, "learning_rate": 4.9272323630981746e-05, "loss": 0.028, "step": 10028 }, { "epoch": 13.33, "learning_rate": 4.898865318204243e-05, "loss": 0.0229, "step": 10074 }, { "epoch": 13.53, "learning_rate": 4.870498273310311e-05, "loss": 0.0353, "step": 10120 }, { "epoch": 13.72, "learning_rate": 4.842131228416379e-05, "loss": 0.0278, "step": 10166 }, { "epoch": 13.92, "learning_rate": 4.813764183522447e-05, "loss": 0.0296, "step": 10212 }, { "epoch": 14.12, "learning_rate": 4.7853971386285154e-05, "loss": 0.0284, "step": 10258 }, { "epoch": 14.31, "learning_rate": 4.757030093734584e-05, "loss": 0.0299, "step": 10304 }, { "epoch": 14.51, "learning_rate": 4.728663048840651e-05, "loss": 0.0309, "step": 10350 }, { "epoch": 14.71, "learning_rate": 4.7002960039467196e-05, "loss": 0.0283, "step": 10396 }, { "epoch": 14.9, "learning_rate": 4.671928959052788e-05, "loss": 0.0349, "step": 10442 }, { "epoch": 15.1, "learning_rate": 4.6435619141588555e-05, "loss": 0.0343, "step": 10488 }, { "epoch": 15.29, "learning_rate": 4.615194869264924e-05, "loss": 0.0347, "step": 10534 }, { "epoch": 15.49, "learning_rate": 4.5868278243709914e-05, "loss": 0.035, "step": 10580 }, { "epoch": 15.69, "learning_rate": 4.55846077947706e-05, "loss": 0.0284, "step": 10626 }, { "epoch": 15.88, "learning_rate": 4.530093734583128e-05, "loss": 0.0408, "step": 10672 }, { "epoch": 16.08, "learning_rate": 4.501726689689196e-05, "loss": 0.0386, "step": 10718 }, { "epoch": 16.28, "learning_rate": 4.473359644795264e-05, "loss": 0.0288, "step": 10764 }, { "epoch": 14.19, "learning_rate": 4.444992599901332e-05, "loss": 0.0274, "step": 10810 }, { "epoch": 14.38, "learning_rate": 4.4166255550074005e-05, "loss": 0.0268, "step": 10856 }, { "epoch": 14.58, "learning_rate": 4.388258510113468e-05, "loss": 0.0206, "step": 10902 }, { "epoch": 14.78, "learning_rate": 4.3598914652195364e-05, "loss": 0.0281, "step": 10948 }, { "epoch": 14.97, "learning_rate": 4.331524420325605e-05, "loss": 0.0242, "step": 10994 }, { "epoch": 15.17, "learning_rate": 4.303157375431673e-05, "loss": 0.0246, "step": 11040 }, { "epoch": 15.37, "learning_rate": 4.2747903305377406e-05, "loss": 0.0275, "step": 11086 }, { "epoch": 15.56, "learning_rate": 4.246423285643808e-05, "loss": 0.0259, "step": 11132 }, { "epoch": 15.76, "learning_rate": 4.218056240749877e-05, "loss": 0.0251, "step": 11178 }, { "epoch": 15.96, "learning_rate": 4.189689195855945e-05, "loss": 0.0229, "step": 11224 }, { "epoch": 16.15, "learning_rate": 4.161322150962013e-05, "loss": 0.0257, "step": 11270 }, { "epoch": 16.35, "learning_rate": 4.132955106068081e-05, "loss": 0.0269, "step": 11316 }, { "epoch": 16.55, "learning_rate": 4.10458806117415e-05, "loss": 0.0344, "step": 11362 }, { "epoch": 16.74, "learning_rate": 4.0762210162802173e-05, "loss": 0.0288, "step": 11408 }, { "epoch": 16.94, "learning_rate": 4.047853971386285e-05, "loss": 0.0287, "step": 11454 }, { "epoch": 17.14, "learning_rate": 4.019486926492353e-05, "loss": 0.0315, "step": 11500 }, { "epoch": 15.05, "learning_rate": 3.9911198815984215e-05, "loss": 0.0317, "step": 11546 }, { "epoch": 15.24, "learning_rate": 3.96275283670449e-05, "loss": 0.02, "step": 11592 }, { "epoch": 15.44, "learning_rate": 3.9343857918105575e-05, "loss": 0.0215, "step": 11638 }, { "epoch": 15.64, "learning_rate": 3.906018746916626e-05, "loss": 0.0276, "step": 11684 }, { "epoch": 15.83, "learning_rate": 3.877651702022694e-05, "loss": 0.0207, "step": 11730 }, { "epoch": 16.03, "learning_rate": 3.8492846571287617e-05, "loss": 0.0217, "step": 11776 }, { "epoch": 16.23, "learning_rate": 3.82091761223483e-05, "loss": 0.0254, "step": 11822 }, { "epoch": 16.42, "learning_rate": 3.7925505673408976e-05, "loss": 0.0245, "step": 11868 }, { "epoch": 16.62, "learning_rate": 3.7641835224469665e-05, "loss": 0.0275, "step": 11914 }, { "epoch": 16.82, "learning_rate": 3.735816477553034e-05, "loss": 0.0261, "step": 11960 }, { "epoch": 17.01, "learning_rate": 3.7074494326591025e-05, "loss": 0.0186, "step": 12006 }, { "epoch": 17.21, "learning_rate": 3.67908238776517e-05, "loss": 0.0186, "step": 12052 }, { "epoch": 17.41, "learning_rate": 3.650715342871239e-05, "loss": 0.0221, "step": 12098 }, { "epoch": 17.6, "learning_rate": 3.6223482979773067e-05, "loss": 0.0199, "step": 12144 }, { "epoch": 17.8, "learning_rate": 3.593981253083374e-05, "loss": 0.0241, "step": 12190 }, { "epoch": 18.0, "learning_rate": 3.5656142081894426e-05, "loss": 0.0224, "step": 12236 }, { "epoch": 18.19, "learning_rate": 3.537247163295511e-05, "loss": 0.0236, "step": 12282 }, { "epoch": 16.1, "learning_rate": 3.508880118401579e-05, "loss": 0.0206, "step": 12328 }, { "epoch": 16.3, "learning_rate": 3.480513073507647e-05, "loss": 0.0167, "step": 12374 }, { "epoch": 16.5, "learning_rate": 3.452146028613715e-05, "loss": 0.0209, "step": 12420 }, { "epoch": 16.69, "learning_rate": 3.4237789837197834e-05, "loss": 0.0188, "step": 12466 }, { "epoch": 16.89, "learning_rate": 3.395411938825851e-05, "loss": 0.0189, "step": 12512 }, { "epoch": 17.09, "learning_rate": 3.367044893931919e-05, "loss": 0.0229, "step": 12558 }, { "epoch": 17.28, "learning_rate": 3.338677849037987e-05, "loss": 0.0186, "step": 12604 }, { "epoch": 17.48, "learning_rate": 3.310310804144056e-05, "loss": 0.0218, "step": 12650 }, { "epoch": 17.68, "learning_rate": 3.2819437592501235e-05, "loss": 0.0165, "step": 12696 }, { "epoch": 17.87, "learning_rate": 3.253576714356192e-05, "loss": 0.0175, "step": 12742 }, { "epoch": 18.07, "learning_rate": 3.2252096694622594e-05, "loss": 0.0159, "step": 12788 }, { "epoch": 18.26, "learning_rate": 3.196842624568328e-05, "loss": 0.0174, "step": 12834 }, { "epoch": 18.46, "learning_rate": 3.168475579674396e-05, "loss": 0.0195, "step": 12880 }, { "epoch": 18.66, "learning_rate": 3.1401085347804636e-05, "loss": 0.0203, "step": 12926 }, { "epoch": 18.85, "learning_rate": 3.111741489886532e-05, "loss": 0.019, "step": 12972 }, { "epoch": 19.05, "learning_rate": 3.0833744449926e-05, "loss": 0.0197, "step": 13018 }, { "epoch": 19.25, "learning_rate": 3.0550074000986685e-05, "loss": 0.0221, "step": 13064 }, { "epoch": 17.16, "learning_rate": 3.026640355204736e-05, "loss": 0.0191, "step": 13110 }, { "epoch": 17.35, "learning_rate": 2.998273310310804e-05, "loss": 0.0138, "step": 13156 }, { "epoch": 17.55, "learning_rate": 2.9699062654168723e-05, "loss": 0.0149, "step": 13202 }, { "epoch": 17.75, "learning_rate": 2.9415392205229403e-05, "loss": 0.018, "step": 13248 }, { "epoch": 17.94, "learning_rate": 2.9131721756290086e-05, "loss": 0.0162, "step": 13294 }, { "epoch": 18.14, "learning_rate": 2.8848051307350765e-05, "loss": 0.0153, "step": 13340 }, { "epoch": 18.34, "learning_rate": 2.8564380858411448e-05, "loss": 0.0141, "step": 13386 }, { "epoch": 18.53, "learning_rate": 2.8280710409472128e-05, "loss": 0.013, "step": 13432 }, { "epoch": 18.73, "learning_rate": 2.7997039960532807e-05, "loss": 0.0184, "step": 13478 }, { "epoch": 18.93, "learning_rate": 2.771336951159349e-05, "loss": 0.0149, "step": 13524 }, { "epoch": 19.12, "learning_rate": 2.742969906265417e-05, "loss": 0.0159, "step": 13570 }, { "epoch": 19.32, "learning_rate": 2.7146028613714853e-05, "loss": 0.0171, "step": 13616 }, { "epoch": 19.52, "learning_rate": 2.6862358164775532e-05, "loss": 0.0174, "step": 13662 }, { "epoch": 19.71, "learning_rate": 2.6578687715836215e-05, "loss": 0.0159, "step": 13708 }, { "epoch": 19.91, "learning_rate": 2.629501726689689e-05, "loss": 0.0202, "step": 13754 }, { "epoch": 20.11, "learning_rate": 2.601134681795757e-05, "loss": 0.0182, "step": 13800 }, { "epoch": 18.02, "learning_rate": 2.5727676369018254e-05, "loss": 0.0138, "step": 13846 }, { "epoch": 18.21, "learning_rate": 2.5444005920078933e-05, "loss": 0.0136, "step": 13892 }, { "epoch": 18.41, "learning_rate": 2.5160335471139616e-05, "loss": 0.0135, "step": 13938 }, { "epoch": 18.61, "learning_rate": 2.4876665022200296e-05, "loss": 0.0125, "step": 13984 }, { "epoch": 18.8, "learning_rate": 2.459299457326098e-05, "loss": 0.014, "step": 14030 }, { "epoch": 19.0, "learning_rate": 2.430932412432166e-05, "loss": 0.0137, "step": 14076 }, { "epoch": 19.2, "learning_rate": 2.402565367538234e-05, "loss": 0.0162, "step": 14122 }, { "epoch": 19.39, "learning_rate": 2.374198322644302e-05, "loss": 0.0134, "step": 14168 }, { "epoch": 19.59, "learning_rate": 2.34583127775037e-05, "loss": 0.0113, "step": 14214 }, { "epoch": 19.79, "learning_rate": 2.3174642328564383e-05, "loss": 0.0134, "step": 14260 }, { "epoch": 19.98, "learning_rate": 2.2890971879625063e-05, "loss": 0.0135, "step": 14306 }, { "epoch": 20.18, "learning_rate": 2.2607301430685742e-05, "loss": 0.0132, "step": 14352 }, { "epoch": 20.38, "learning_rate": 2.2323630981746425e-05, "loss": 0.015, "step": 14398 }, { "epoch": 20.57, "learning_rate": 2.2039960532807105e-05, "loss": 0.0117, "step": 14444 }, { "epoch": 20.77, "learning_rate": 2.1756290083867788e-05, "loss": 0.0148, "step": 14490 }, { "epoch": 20.97, "learning_rate": 2.1472619634928467e-05, "loss": 0.0168, "step": 14536 }, { "epoch": 21.16, "learning_rate": 2.118894918598915e-05, "loss": 0.0139, "step": 14582 }, { "epoch": 19.07, "learning_rate": 2.0905278737049827e-05, "loss": 0.0112, "step": 14628 }, { "epoch": 19.27, "learning_rate": 2.062160828811051e-05, "loss": 0.0094, "step": 14674 }, { "epoch": 19.47, "learning_rate": 2.033793783917119e-05, "loss": 0.0092, "step": 14720 }, { "epoch": 19.66, "learning_rate": 2.0054267390231872e-05, "loss": 0.0111, "step": 14766 }, { "epoch": 19.86, "learning_rate": 1.977059694129255e-05, "loss": 0.0112, "step": 14812 }, { "epoch": 20.06, "learning_rate": 1.9486926492353234e-05, "loss": 0.0127, "step": 14858 }, { "epoch": 20.25, "learning_rate": 1.9203256043413914e-05, "loss": 0.0105, "step": 14904 }, { "epoch": 20.45, "learning_rate": 1.8919585594474594e-05, "loss": 0.0142, "step": 14950 }, { "epoch": 20.65, "learning_rate": 1.8635915145535273e-05, "loss": 0.0106, "step": 14996 }, { "epoch": 20.84, "learning_rate": 1.8352244696595956e-05, "loss": 0.0115, "step": 15042 }, { "epoch": 21.04, "learning_rate": 1.8068574247656636e-05, "loss": 0.0114, "step": 15088 }, { "epoch": 21.24, "learning_rate": 1.778490379871732e-05, "loss": 0.012, "step": 15134 }, { "epoch": 21.43, "learning_rate": 1.7501233349777998e-05, "loss": 0.0111, "step": 15180 }, { "epoch": 21.63, "learning_rate": 1.721756290083868e-05, "loss": 0.0127, "step": 15226 }, { "epoch": 21.82, "learning_rate": 1.693389245189936e-05, "loss": 0.0108, "step": 15272 }, { "epoch": 22.02, "learning_rate": 1.665022200296004e-05, "loss": 0.0123, "step": 15318 }, { "epoch": 22.22, "learning_rate": 1.636655155402072e-05, "loss": 0.0116, "step": 15364 }, { "epoch": 20.13, "learning_rate": 1.6082881105081403e-05, "loss": 0.0115, "step": 15410 }, { "epoch": 20.32, "learning_rate": 1.5799210656142082e-05, "loss": 0.0083, "step": 15456 }, { "epoch": 20.52, "learning_rate": 1.5515540207202765e-05, "loss": 0.0094, "step": 15502 }, { "epoch": 20.72, "learning_rate": 1.5231869758263445e-05, "loss": 0.0087, "step": 15548 }, { "epoch": 20.91, "learning_rate": 1.4948199309324126e-05, "loss": 0.0119, "step": 15594 }, { "epoch": 21.11, "learning_rate": 1.4664528860384805e-05, "loss": 0.009, "step": 15640 }, { "epoch": 21.31, "learning_rate": 1.4380858411445487e-05, "loss": 0.0113, "step": 15686 }, { "epoch": 21.5, "learning_rate": 1.4097187962506166e-05, "loss": 0.0096, "step": 15732 }, { "epoch": 21.7, "learning_rate": 1.3813517513566847e-05, "loss": 0.0103, "step": 15778 }, { "epoch": 21.9, "learning_rate": 1.3529847064627529e-05, "loss": 0.0089, "step": 15824 }, { "epoch": 22.09, "learning_rate": 1.324617661568821e-05, "loss": 0.0082, "step": 15870 }, { "epoch": 22.29, "learning_rate": 1.2962506166748891e-05, "loss": 0.01, "step": 15916 }, { "epoch": 22.49, "learning_rate": 1.267883571780957e-05, "loss": 0.0084, "step": 15962 }, { "epoch": 22.68, "learning_rate": 1.2395165268870252e-05, "loss": 0.0093, "step": 16008 }, { "epoch": 22.88, "learning_rate": 1.2111494819930933e-05, "loss": 0.0104, "step": 16054 }, { "epoch": 23.08, "learning_rate": 1.1827824370991614e-05, "loss": 0.0117, "step": 16100 }, { "epoch": 23.27, "learning_rate": 1.1544153922052296e-05, "loss": 0.0095, "step": 16146 }, { "epoch": 21.18, "learning_rate": 1.1260483473112975e-05, "loss": 0.0085, "step": 16192 }, { "epoch": 21.38, "learning_rate": 1.0976813024173656e-05, "loss": 0.0083, "step": 16238 }, { "epoch": 21.58, "learning_rate": 1.0693142575234338e-05, "loss": 0.0081, "step": 16284 }, { "epoch": 21.77, "learning_rate": 1.0409472126295019e-05, "loss": 0.0068, "step": 16330 }, { "epoch": 21.97, "learning_rate": 1.0125801677355699e-05, "loss": 0.0098, "step": 16376 }, { "epoch": 22.17, "learning_rate": 9.84213122841638e-06, "loss": 0.0087, "step": 16422 }, { "epoch": 22.36, "learning_rate": 9.558460779477061e-06, "loss": 0.007, "step": 16468 }, { "epoch": 22.56, "learning_rate": 9.274790330537742e-06, "loss": 0.0085, "step": 16514 }, { "epoch": 22.76, "learning_rate": 8.991119881598422e-06, "loss": 0.0094, "step": 16560 }, { "epoch": 22.95, "learning_rate": 8.707449432659103e-06, "loss": 0.0077, "step": 16606 }, { "epoch": 23.15, "learning_rate": 8.423778983719784e-06, "loss": 0.0077, "step": 16652 }, { "epoch": 23.35, "learning_rate": 8.140108534780466e-06, "loss": 0.008, "step": 16698 }, { "epoch": 23.54, "learning_rate": 7.856438085841145e-06, "loss": 0.0088, "step": 16744 }, { "epoch": 23.74, "learning_rate": 7.5727676369018255e-06, "loss": 0.0078, "step": 16790 }, { "epoch": 23.94, "learning_rate": 7.289097187962507e-06, "loss": 0.0095, "step": 16836 }, { "epoch": 24.13, "learning_rate": 7.005426739023187e-06, "loss": 0.0079, "step": 16882 }, { "epoch": 22.04, "learning_rate": 6.721756290083868e-06, "loss": 0.0068, "step": 16928 }, { "epoch": 22.24, "learning_rate": 6.438085841144549e-06, "loss": 0.0074, "step": 16974 }, { "epoch": 22.44, "learning_rate": 6.15441539220523e-06, "loss": 0.0095, "step": 17020 }, { "epoch": 22.63, "learning_rate": 5.87074494326591e-06, "loss": 0.007, "step": 17066 }, { "epoch": 22.83, "learning_rate": 5.587074494326592e-06, "loss": 0.0059, "step": 17112 }, { "epoch": 23.03, "learning_rate": 5.303404045387272e-06, "loss": 0.0075, "step": 17158 }, { "epoch": 23.22, "learning_rate": 5.019733596447953e-06, "loss": 0.0069, "step": 17204 }, { "epoch": 23.42, "learning_rate": 4.736063147508634e-06, "loss": 0.0068, "step": 17250 }, { "epoch": 23.62, "learning_rate": 4.452392698569315e-06, "loss": 0.0079, "step": 17296 }, { "epoch": 23.81, "learning_rate": 4.168722249629995e-06, "loss": 0.0068, "step": 17342 }, { "epoch": 24.01, "learning_rate": 3.8850518006906765e-06, "loss": 0.0074, "step": 17388 }, { "epoch": 24.21, "learning_rate": 3.6013813517513565e-06, "loss": 0.0068, "step": 17434 }, { "epoch": 24.4, "learning_rate": 3.3177109028120377e-06, "loss": 0.0076, "step": 17480 }, { "epoch": 24.6, "learning_rate": 3.0340404538727186e-06, "loss": 0.0063, "step": 17526 }, { "epoch": 24.79, "learning_rate": 2.7503700049333994e-06, "loss": 0.0066, "step": 17572 }, { "epoch": 24.99, "learning_rate": 2.46669955599408e-06, "loss": 0.0077, "step": 17618 }, { "epoch": 25.19, "learning_rate": 2.183029107054761e-06, "loss": 0.0077, "step": 17664 }, { "epoch": 23.1, "learning_rate": 1.8993586581154416e-06, "loss": 0.0065, "step": 17710 }, { "epoch": 23.29, "learning_rate": 1.6156882091761224e-06, "loss": 0.007, "step": 17756 }, { "epoch": 23.49, "learning_rate": 1.3320177602368033e-06, "loss": 0.0064, "step": 17802 }, { "epoch": 23.69, "learning_rate": 1.048347311297484e-06, "loss": 0.0066, "step": 17848 }, { "epoch": 23.88, "learning_rate": 7.646768623581648e-07, "loss": 0.0057, "step": 17894 }, { "epoch": 24.08, "learning_rate": 4.810064134188456e-07, "loss": 0.0063, "step": 17940 }, { "epoch": 24.28, "learning_rate": 1.973359644795264e-07, "loss": 0.0057, "step": 17986 } ], "logging_steps": 46, "max_steps": 18018, "num_train_epochs": 77, "save_steps": 500, "total_flos": 7.874587036609413e+17, "trial_name": null, "trial_params": null }