{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2167, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00046146746654360867, "grad_norm": 9.51411938145824, "learning_rate": 9.216589861751152e-08, "loss": 1.1509, "step": 1 }, { "epoch": 0.0023073373327180432, "grad_norm": 9.594129890510615, "learning_rate": 4.608294930875577e-07, "loss": 1.1369, "step": 5 }, { "epoch": 0.0046146746654360865, "grad_norm": 5.144220521250804, "learning_rate": 9.216589861751154e-07, "loss": 1.0949, "step": 10 }, { "epoch": 0.00692201199815413, "grad_norm": 3.40382562328973, "learning_rate": 1.382488479262673e-06, "loss": 1.0191, "step": 15 }, { "epoch": 0.009229349330872173, "grad_norm": 3.215915212337555, "learning_rate": 1.8433179723502307e-06, "loss": 1.0296, "step": 20 }, { "epoch": 0.011536686663590217, "grad_norm": 2.650742433924389, "learning_rate": 2.3041474654377884e-06, "loss": 0.976, "step": 25 }, { "epoch": 0.01384402399630826, "grad_norm": 2.7902393320247687, "learning_rate": 2.764976958525346e-06, "loss": 1.0089, "step": 30 }, { "epoch": 0.016151361329026302, "grad_norm": 2.4654209985126148, "learning_rate": 3.225806451612903e-06, "loss": 0.9951, "step": 35 }, { "epoch": 0.018458698661744346, "grad_norm": 2.527650306992279, "learning_rate": 3.6866359447004615e-06, "loss": 0.9988, "step": 40 }, { "epoch": 0.02076603599446239, "grad_norm": 2.655474988385568, "learning_rate": 4.147465437788019e-06, "loss": 1.004, "step": 45 }, { "epoch": 0.023073373327180433, "grad_norm": 2.537803872777302, "learning_rate": 4.608294930875577e-06, "loss": 0.9863, "step": 50 }, { "epoch": 0.025380710659898477, "grad_norm": 2.6464740488754366, "learning_rate": 5.0691244239631346e-06, "loss": 0.9694, "step": 55 }, { "epoch": 0.02768804799261652, "grad_norm": 2.7507142522196566, "learning_rate": 5.529953917050692e-06, "loss": 0.9688, "step": 60 }, { "epoch": 0.029995385325334564, "grad_norm": 2.836749382191462, "learning_rate": 5.9907834101382485e-06, "loss": 0.9968, "step": 65 }, { "epoch": 0.032302722658052604, "grad_norm": 3.048439725741993, "learning_rate": 6.451612903225806e-06, "loss": 1.0023, "step": 70 }, { "epoch": 0.03461005999077065, "grad_norm": 3.072103382083384, "learning_rate": 6.912442396313365e-06, "loss": 0.9909, "step": 75 }, { "epoch": 0.03691739732348869, "grad_norm": 2.4206649546182386, "learning_rate": 7.373271889400923e-06, "loss": 1.0277, "step": 80 }, { "epoch": 0.03922473465620674, "grad_norm": 2.6546178755277254, "learning_rate": 7.83410138248848e-06, "loss": 1.0123, "step": 85 }, { "epoch": 0.04153207198892478, "grad_norm": 2.3888921486796058, "learning_rate": 8.294930875576038e-06, "loss": 0.9688, "step": 90 }, { "epoch": 0.043839409321642826, "grad_norm": 2.32499977135365, "learning_rate": 8.755760368663595e-06, "loss": 0.9903, "step": 95 }, { "epoch": 0.046146746654360866, "grad_norm": 2.392443248966377, "learning_rate": 9.216589861751153e-06, "loss": 0.9893, "step": 100 }, { "epoch": 0.048454083987078914, "grad_norm": 2.680320857358668, "learning_rate": 9.67741935483871e-06, "loss": 0.9846, "step": 105 }, { "epoch": 0.050761421319796954, "grad_norm": 2.532469504703905, "learning_rate": 1.0138248847926269e-05, "loss": 1.0089, "step": 110 }, { "epoch": 0.053068758652515, "grad_norm": 3.1046898569172945, "learning_rate": 1.0599078341013826e-05, "loss": 1.0266, "step": 115 }, { "epoch": 0.05537609598523304, "grad_norm": 2.5574603903328743, "learning_rate": 1.1059907834101385e-05, "loss": 1.002, "step": 120 }, { "epoch": 0.05768343331795108, "grad_norm": 2.452071743693235, "learning_rate": 1.152073732718894e-05, "loss": 1.03, "step": 125 }, { "epoch": 0.05999077065066913, "grad_norm": 2.418236004711402, "learning_rate": 1.1981566820276497e-05, "loss": 1.006, "step": 130 }, { "epoch": 0.06229810798338717, "grad_norm": 2.378674843033103, "learning_rate": 1.2442396313364056e-05, "loss": 0.9717, "step": 135 }, { "epoch": 0.06460544531610521, "grad_norm": 2.288433336347559, "learning_rate": 1.2903225806451613e-05, "loss": 1.0247, "step": 140 }, { "epoch": 0.06691278264882326, "grad_norm": 2.7611308296401282, "learning_rate": 1.3364055299539171e-05, "loss": 1.003, "step": 145 }, { "epoch": 0.0692201199815413, "grad_norm": 2.421569626628109, "learning_rate": 1.382488479262673e-05, "loss": 1.0197, "step": 150 }, { "epoch": 0.07152745731425934, "grad_norm": 2.3978303307399247, "learning_rate": 1.4285714285714287e-05, "loss": 0.9898, "step": 155 }, { "epoch": 0.07383479464697738, "grad_norm": 2.980112762291027, "learning_rate": 1.4746543778801846e-05, "loss": 1.0275, "step": 160 }, { "epoch": 0.07614213197969544, "grad_norm": 2.6176775912790404, "learning_rate": 1.5207373271889403e-05, "loss": 1.0382, "step": 165 }, { "epoch": 0.07844946931241348, "grad_norm": 2.44164739077761, "learning_rate": 1.566820276497696e-05, "loss": 1.0236, "step": 170 }, { "epoch": 0.08075680664513152, "grad_norm": 2.404929271261824, "learning_rate": 1.6129032258064517e-05, "loss": 1.0304, "step": 175 }, { "epoch": 0.08306414397784956, "grad_norm": 2.3566906067196105, "learning_rate": 1.6589861751152075e-05, "loss": 1.0355, "step": 180 }, { "epoch": 0.0853714813105676, "grad_norm": 2.6391147388298246, "learning_rate": 1.705069124423963e-05, "loss": 1.0417, "step": 185 }, { "epoch": 0.08767881864328565, "grad_norm": 2.684091771591401, "learning_rate": 1.751152073732719e-05, "loss": 1.0434, "step": 190 }, { "epoch": 0.08998615597600369, "grad_norm": 2.482023036660176, "learning_rate": 1.7972350230414748e-05, "loss": 1.0638, "step": 195 }, { "epoch": 0.09229349330872173, "grad_norm": 2.4167958259613944, "learning_rate": 1.8433179723502307e-05, "loss": 1.0422, "step": 200 }, { "epoch": 0.09460083064143977, "grad_norm": 2.5336845010658586, "learning_rate": 1.8894009216589862e-05, "loss": 1.0711, "step": 205 }, { "epoch": 0.09690816797415783, "grad_norm": 2.51621301326488, "learning_rate": 1.935483870967742e-05, "loss": 1.0881, "step": 210 }, { "epoch": 0.09921550530687587, "grad_norm": 2.4344384988965175, "learning_rate": 1.981566820276498e-05, "loss": 1.0735, "step": 215 }, { "epoch": 0.10152284263959391, "grad_norm": 2.5052346642024728, "learning_rate": 1.9999883200175286e-05, "loss": 1.0593, "step": 220 }, { "epoch": 0.10383017997231195, "grad_norm": 2.3275673568454986, "learning_rate": 1.9999169433349454e-05, "loss": 1.0766, "step": 225 }, { "epoch": 0.10613751730503, "grad_norm": 2.444969349213072, "learning_rate": 1.9997806834748455e-05, "loss": 1.0805, "step": 230 }, { "epoch": 0.10844485463774804, "grad_norm": 2.5019688689621455, "learning_rate": 1.9995795492789368e-05, "loss": 1.0795, "step": 235 }, { "epoch": 0.11075219197046608, "grad_norm": 2.41754553967893, "learning_rate": 1.9993135537985285e-05, "loss": 1.0419, "step": 240 }, { "epoch": 0.11305952930318412, "grad_norm": 2.4632725320939297, "learning_rate": 1.9989827142936864e-05, "loss": 1.1022, "step": 245 }, { "epoch": 0.11536686663590216, "grad_norm": 2.1283515103999004, "learning_rate": 1.9985870522321118e-05, "loss": 1.0727, "step": 250 }, { "epoch": 0.11767420396862022, "grad_norm": 2.5373992715116316, "learning_rate": 1.9981265932877486e-05, "loss": 1.0595, "step": 255 }, { "epoch": 0.11998154130133826, "grad_norm": 2.382000156883209, "learning_rate": 1.9976013673391185e-05, "loss": 1.0585, "step": 260 }, { "epoch": 0.1222888786340563, "grad_norm": 2.4022206882693857, "learning_rate": 1.9970114084673796e-05, "loss": 1.089, "step": 265 }, { "epoch": 0.12459621596677434, "grad_norm": 2.4577010071803707, "learning_rate": 1.996356754954119e-05, "loss": 1.0971, "step": 270 }, { "epoch": 0.12690355329949238, "grad_norm": 2.973750019978815, "learning_rate": 1.995637449278864e-05, "loss": 1.083, "step": 275 }, { "epoch": 0.12921089063221042, "grad_norm": 2.771443155668778, "learning_rate": 1.994853538116329e-05, "loss": 1.0948, "step": 280 }, { "epoch": 0.13151822796492849, "grad_norm": 2.2945686154224902, "learning_rate": 1.9940050723333867e-05, "loss": 1.0684, "step": 285 }, { "epoch": 0.13382556529764653, "grad_norm": 2.2402037197255864, "learning_rate": 1.9930921069857653e-05, "loss": 1.0605, "step": 290 }, { "epoch": 0.13613290263036457, "grad_norm": 3.0321973969800955, "learning_rate": 1.9921147013144782e-05, "loss": 1.0629, "step": 295 }, { "epoch": 0.1384402399630826, "grad_norm": 2.4295781427476215, "learning_rate": 1.991072918741978e-05, "loss": 1.0353, "step": 300 }, { "epoch": 0.14074757729580065, "grad_norm": 2.5121504192318698, "learning_rate": 1.9899668268680438e-05, "loss": 1.1156, "step": 305 }, { "epoch": 0.1430549146285187, "grad_norm": 2.454863605026453, "learning_rate": 1.988796497465392e-05, "loss": 1.0921, "step": 310 }, { "epoch": 0.14536225196123673, "grad_norm": 2.2532325296884532, "learning_rate": 1.98756200647502e-05, "loss": 1.0683, "step": 315 }, { "epoch": 0.14766958929395477, "grad_norm": 2.3557373973476334, "learning_rate": 1.9862634340012796e-05, "loss": 1.0559, "step": 320 }, { "epoch": 0.1499769266266728, "grad_norm": 2.493215513816939, "learning_rate": 1.9849008643066774e-05, "loss": 1.0725, "step": 325 }, { "epoch": 0.15228426395939088, "grad_norm": 3.2539520651857594, "learning_rate": 1.983474385806408e-05, "loss": 1.0674, "step": 330 }, { "epoch": 0.15459160129210892, "grad_norm": 2.2486130234020165, "learning_rate": 1.9819840910626174e-05, "loss": 1.0705, "step": 335 }, { "epoch": 0.15689893862482696, "grad_norm": 2.237162636902425, "learning_rate": 1.9804300767783958e-05, "loss": 1.0772, "step": 340 }, { "epoch": 0.159206275957545, "grad_norm": 6.51182752032121, "learning_rate": 1.9788124437915034e-05, "loss": 1.0837, "step": 345 }, { "epoch": 0.16151361329026304, "grad_norm": 2.322519163352806, "learning_rate": 1.9771312970678258e-05, "loss": 1.0405, "step": 350 }, { "epoch": 0.16382095062298108, "grad_norm": 2.2704370082078773, "learning_rate": 1.9753867456945653e-05, "loss": 1.0632, "step": 355 }, { "epoch": 0.16612828795569912, "grad_norm": 2.1581678351113602, "learning_rate": 1.9735789028731603e-05, "loss": 1.0818, "step": 360 }, { "epoch": 0.16843562528841716, "grad_norm": 2.0342475926569583, "learning_rate": 1.971707885911941e-05, "loss": 1.0679, "step": 365 }, { "epoch": 0.1707429626211352, "grad_norm": 2.2080262528668957, "learning_rate": 1.9697738162185163e-05, "loss": 1.0813, "step": 370 }, { "epoch": 0.17305029995385326, "grad_norm": 2.627310254608405, "learning_rate": 1.9677768192918973e-05, "loss": 1.0682, "step": 375 }, { "epoch": 0.1753576372865713, "grad_norm": 31.427071093107436, "learning_rate": 1.9657170247143526e-05, "loss": 1.06, "step": 380 }, { "epoch": 0.17766497461928935, "grad_norm": 2.4322378895965016, "learning_rate": 1.9635945661430006e-05, "loss": 1.0648, "step": 385 }, { "epoch": 0.17997231195200739, "grad_norm": 2.446435285466414, "learning_rate": 1.9614095813011366e-05, "loss": 1.0795, "step": 390 }, { "epoch": 0.18227964928472543, "grad_norm": 2.6478307008342745, "learning_rate": 1.9591622119692953e-05, "loss": 1.0655, "step": 395 }, { "epoch": 0.18458698661744347, "grad_norm": 3.5462973637007433, "learning_rate": 1.956852603976052e-05, "loss": 1.074, "step": 400 }, { "epoch": 0.1868943239501615, "grad_norm": 2.1952483345123834, "learning_rate": 1.9544809071885603e-05, "loss": 1.066, "step": 405 }, { "epoch": 0.18920166128287955, "grad_norm": 2.2118543537817397, "learning_rate": 1.9520472755028256e-05, "loss": 1.0567, "step": 410 }, { "epoch": 0.1915089986155976, "grad_norm": 2.1220225112366085, "learning_rate": 1.9495518668337204e-05, "loss": 1.0485, "step": 415 }, { "epoch": 0.19381633594831565, "grad_norm": 2.0821093984759433, "learning_rate": 1.946994843104737e-05, "loss": 1.0374, "step": 420 }, { "epoch": 0.1961236732810337, "grad_norm": 2.0190113966629433, "learning_rate": 1.944376370237481e-05, "loss": 1.064, "step": 425 }, { "epoch": 0.19843101061375173, "grad_norm": 2.056684419876963, "learning_rate": 1.9416966181409047e-05, "loss": 1.0524, "step": 430 }, { "epoch": 0.20073834794646978, "grad_norm": 2.366243541981115, "learning_rate": 1.9389557607002808e-05, "loss": 1.0587, "step": 435 }, { "epoch": 0.20304568527918782, "grad_norm": 2.3366923572205356, "learning_rate": 1.9361539757659212e-05, "loss": 1.104, "step": 440 }, { "epoch": 0.20535302261190586, "grad_norm": 2.135135120153321, "learning_rate": 1.933291445141635e-05, "loss": 1.0838, "step": 445 }, { "epoch": 0.2076603599446239, "grad_norm": 2.1686749928093367, "learning_rate": 1.930368354572932e-05, "loss": 1.0889, "step": 450 }, { "epoch": 0.20996769727734194, "grad_norm": 2.18295591084296, "learning_rate": 1.9273848937349712e-05, "loss": 1.0517, "step": 455 }, { "epoch": 0.21227503461006, "grad_norm": 2.015301818923552, "learning_rate": 1.92434125622025e-05, "loss": 1.0766, "step": 460 }, { "epoch": 0.21458237194277804, "grad_norm": 2.0741949328544984, "learning_rate": 1.9212376395260447e-05, "loss": 1.0896, "step": 465 }, { "epoch": 0.21688970927549608, "grad_norm": 2.15451982178122, "learning_rate": 1.9180742450415962e-05, "loss": 1.0763, "step": 470 }, { "epoch": 0.21919704660821412, "grad_norm": 2.0906260465476967, "learning_rate": 1.9148512780350384e-05, "loss": 1.086, "step": 475 }, { "epoch": 0.22150438394093216, "grad_norm": 2.031559261836197, "learning_rate": 1.9115689476400817e-05, "loss": 1.059, "step": 480 }, { "epoch": 0.2238117212736502, "grad_norm": 2.07650174311531, "learning_rate": 1.9082274668424423e-05, "loss": 1.0679, "step": 485 }, { "epoch": 0.22611905860636825, "grad_norm": 2.126208239890011, "learning_rate": 1.9048270524660197e-05, "loss": 1.0809, "step": 490 }, { "epoch": 0.22842639593908629, "grad_norm": 1.9929349716624978, "learning_rate": 1.9013679251588304e-05, "loss": 1.085, "step": 495 }, { "epoch": 0.23073373327180433, "grad_norm": 3.001431077273745, "learning_rate": 1.8978503093786882e-05, "loss": 1.0558, "step": 500 }, { "epoch": 0.2330410706045224, "grad_norm": 1.9284000626000521, "learning_rate": 1.89427443337864e-05, "loss": 1.0685, "step": 505 }, { "epoch": 0.23534840793724043, "grad_norm": 2.156814659249471, "learning_rate": 1.890640529192155e-05, "loss": 1.0857, "step": 510 }, { "epoch": 0.23765574526995847, "grad_norm": 2.2063349330204174, "learning_rate": 1.8869488326180682e-05, "loss": 1.092, "step": 515 }, { "epoch": 0.23996308260267651, "grad_norm": 1.8963715836357997, "learning_rate": 1.8831995832052802e-05, "loss": 1.0694, "step": 520 }, { "epoch": 0.24227041993539455, "grad_norm": 2.0285632136378613, "learning_rate": 1.8793930242372117e-05, "loss": 1.0795, "step": 525 }, { "epoch": 0.2445777572681126, "grad_norm": 2.099474069037447, "learning_rate": 1.8755294027160203e-05, "loss": 1.0893, "step": 530 }, { "epoch": 0.24688509460083063, "grad_norm": 2.0358502445768165, "learning_rate": 1.8716089693465696e-05, "loss": 1.086, "step": 535 }, { "epoch": 0.24919243193354867, "grad_norm": 2.1218454361521633, "learning_rate": 1.8676319785201617e-05, "loss": 1.0842, "step": 540 }, { "epoch": 0.2514997692662667, "grad_norm": 2.0341225955626583, "learning_rate": 1.8635986882980325e-05, "loss": 1.0625, "step": 545 }, { "epoch": 0.25380710659898476, "grad_norm": 2.3910625184538747, "learning_rate": 1.8595093603946053e-05, "loss": 1.0727, "step": 550 }, { "epoch": 0.2561144439317028, "grad_norm": 1.98644765469211, "learning_rate": 1.855364260160507e-05, "loss": 1.0595, "step": 555 }, { "epoch": 0.25842178126442084, "grad_norm": 2.208738179396901, "learning_rate": 1.851163656565351e-05, "loss": 1.0936, "step": 560 }, { "epoch": 0.2607291185971389, "grad_norm": 2.0209364206754645, "learning_rate": 1.846907822180286e-05, "loss": 1.0684, "step": 565 }, { "epoch": 0.26303645592985697, "grad_norm": 1.925369099665116, "learning_rate": 1.842597033160306e-05, "loss": 1.0669, "step": 570 }, { "epoch": 0.265343793262575, "grad_norm": 2.169504176441067, "learning_rate": 1.8382315692263324e-05, "loss": 1.0914, "step": 575 }, { "epoch": 0.26765113059529305, "grad_norm": 2.0095996014073503, "learning_rate": 1.8338117136470648e-05, "loss": 1.0679, "step": 580 }, { "epoch": 0.2699584679280111, "grad_norm": 2.0780448467433468, "learning_rate": 1.829337753220597e-05, "loss": 1.0823, "step": 585 }, { "epoch": 0.27226580526072913, "grad_norm": 1.9092130149771946, "learning_rate": 1.8248099782558103e-05, "loss": 1.0485, "step": 590 }, { "epoch": 0.2745731425934472, "grad_norm": 2.2904699258286914, "learning_rate": 1.820228682553533e-05, "loss": 1.0676, "step": 595 }, { "epoch": 0.2768804799261652, "grad_norm": 2.1592942059891884, "learning_rate": 1.8155941633874787e-05, "loss": 1.0862, "step": 600 }, { "epoch": 0.27918781725888325, "grad_norm": 1.9056960337173154, "learning_rate": 1.810906721484954e-05, "loss": 1.027, "step": 605 }, { "epoch": 0.2814951545916013, "grad_norm": 2.029717811241469, "learning_rate": 1.8061666610073465e-05, "loss": 1.0638, "step": 610 }, { "epoch": 0.28380249192431933, "grad_norm": 2.0411421295106873, "learning_rate": 1.8013742895303883e-05, "loss": 1.0667, "step": 615 }, { "epoch": 0.2861098292570374, "grad_norm": 1.924799713813513, "learning_rate": 1.7965299180241963e-05, "loss": 1.0685, "step": 620 }, { "epoch": 0.2884171665897554, "grad_norm": 2.171875799314523, "learning_rate": 1.791633860833096e-05, "loss": 1.0463, "step": 625 }, { "epoch": 0.29072450392247345, "grad_norm": 2.068372996208825, "learning_rate": 1.7866864356552215e-05, "loss": 1.0715, "step": 630 }, { "epoch": 0.2930318412551915, "grad_norm": 1.8601211490681129, "learning_rate": 1.7816879635219028e-05, "loss": 1.0576, "step": 635 }, { "epoch": 0.29533917858790953, "grad_norm": 1.9725316785259686, "learning_rate": 1.7766387687768338e-05, "loss": 1.0648, "step": 640 }, { "epoch": 0.2976465159206276, "grad_norm": 2.1844471268704515, "learning_rate": 1.7715391790550255e-05, "loss": 1.0637, "step": 645 }, { "epoch": 0.2999538532533456, "grad_norm": 1.933021204525043, "learning_rate": 1.766389525261547e-05, "loss": 1.0803, "step": 650 }, { "epoch": 0.30226119058606365, "grad_norm": 2.1351960039602695, "learning_rate": 1.7611901415500536e-05, "loss": 1.0979, "step": 655 }, { "epoch": 0.30456852791878175, "grad_norm": 1.9611319269471612, "learning_rate": 1.7559413653011027e-05, "loss": 1.0652, "step": 660 }, { "epoch": 0.3068758652514998, "grad_norm": 2.033679368734863, "learning_rate": 1.7506435371002635e-05, "loss": 1.0749, "step": 665 }, { "epoch": 0.30918320258421783, "grad_norm": 2.0279720872015354, "learning_rate": 1.745297000716016e-05, "loss": 1.078, "step": 670 }, { "epoch": 0.31149053991693587, "grad_norm": 1.989733876253561, "learning_rate": 1.7399021030774443e-05, "loss": 1.0639, "step": 675 }, { "epoch": 0.3137978772496539, "grad_norm": 1.9037569950190747, "learning_rate": 1.734459194251725e-05, "loss": 1.0721, "step": 680 }, { "epoch": 0.31610521458237195, "grad_norm": 2.183774346551292, "learning_rate": 1.7289686274214116e-05, "loss": 1.0755, "step": 685 }, { "epoch": 0.31841255191509, "grad_norm": 1.992668508208317, "learning_rate": 1.7234307588615177e-05, "loss": 1.0761, "step": 690 }, { "epoch": 0.32071988924780803, "grad_norm": 1.9985850630928745, "learning_rate": 1.717845947916398e-05, "loss": 1.0575, "step": 695 }, { "epoch": 0.3230272265805261, "grad_norm": 2.1480400724448883, "learning_rate": 1.712214556976431e-05, "loss": 1.0404, "step": 700 }, { "epoch": 0.3253345639132441, "grad_norm": 1.9503339704430334, "learning_rate": 1.7065369514545054e-05, "loss": 1.0579, "step": 705 }, { "epoch": 0.32764190124596215, "grad_norm": 1.971699140050545, "learning_rate": 1.7008134997623066e-05, "loss": 1.0629, "step": 710 }, { "epoch": 0.3299492385786802, "grad_norm": 1.97358622805482, "learning_rate": 1.695044573286413e-05, "loss": 1.039, "step": 715 }, { "epoch": 0.33225657591139823, "grad_norm": 1.8903289514072814, "learning_rate": 1.6892305463641967e-05, "loss": 1.0996, "step": 720 }, { "epoch": 0.3345639132441163, "grad_norm": 1.9171530806208752, "learning_rate": 1.6833717962595327e-05, "loss": 1.0587, "step": 725 }, { "epoch": 0.3368712505768343, "grad_norm": 2.0292768253738855, "learning_rate": 1.677468703138319e-05, "loss": 1.0534, "step": 730 }, { "epoch": 0.33917858790955235, "grad_norm": 1.9857495035997068, "learning_rate": 1.6715216500438093e-05, "loss": 1.0805, "step": 735 }, { "epoch": 0.3414859252422704, "grad_norm": 5.464414796561983, "learning_rate": 1.6655310228717565e-05, "loss": 1.0802, "step": 740 }, { "epoch": 0.3437932625749885, "grad_norm": 1.909425909480839, "learning_rate": 1.6594972103453727e-05, "loss": 1.0813, "step": 745 }, { "epoch": 0.34610059990770653, "grad_norm": 1.9164783421961078, "learning_rate": 1.6534206039901057e-05, "loss": 1.0466, "step": 750 }, { "epoch": 0.34840793724042457, "grad_norm": 4.300395520109931, "learning_rate": 1.647301598108234e-05, "loss": 1.0326, "step": 755 }, { "epoch": 0.3507152745731426, "grad_norm": 2.2226539635666827, "learning_rate": 1.64114058975328e-05, "loss": 1.0824, "step": 760 }, { "epoch": 0.35302261190586065, "grad_norm": 2.08738140836867, "learning_rate": 1.6349379787042478e-05, "loss": 1.0445, "step": 765 }, { "epoch": 0.3553299492385787, "grad_norm": 1.7956395961308758, "learning_rate": 1.6286941674396788e-05, "loss": 1.0283, "step": 770 }, { "epoch": 0.35763728657129673, "grad_norm": 1.9001406773036147, "learning_rate": 1.6224095611115385e-05, "loss": 1.0558, "step": 775 }, { "epoch": 0.35994462390401477, "grad_norm": 1.8584506601925908, "learning_rate": 1.6160845675189254e-05, "loss": 1.0315, "step": 780 }, { "epoch": 0.3622519612367328, "grad_norm": 1.8994200106765273, "learning_rate": 1.6097195970816094e-05, "loss": 1.0736, "step": 785 }, { "epoch": 0.36455929856945085, "grad_norm": 2.396252821544053, "learning_rate": 1.603315062813401e-05, "loss": 1.0605, "step": 790 }, { "epoch": 0.3668666359021689, "grad_norm": 1.798952489279231, "learning_rate": 1.596871380295351e-05, "loss": 1.0439, "step": 795 }, { "epoch": 0.36917397323488693, "grad_norm": 1.8907451459454219, "learning_rate": 1.5903889676487832e-05, "loss": 1.047, "step": 800 }, { "epoch": 0.37148131056760497, "grad_norm": 1.9272449578154556, "learning_rate": 1.5838682455081657e-05, "loss": 1.0557, "step": 805 }, { "epoch": 0.373788647900323, "grad_norm": 1.8500068465129675, "learning_rate": 1.5773096369938125e-05, "loss": 1.0448, "step": 810 }, { "epoch": 0.37609598523304105, "grad_norm": 2.024989082401722, "learning_rate": 1.570713567684432e-05, "loss": 1.0444, "step": 815 }, { "epoch": 0.3784033225657591, "grad_norm": 1.864545609223796, "learning_rate": 1.5640804655895086e-05, "loss": 1.0316, "step": 820 }, { "epoch": 0.38071065989847713, "grad_norm": 1.9810163039010853, "learning_rate": 1.557410761121532e-05, "loss": 1.0476, "step": 825 }, { "epoch": 0.3830179972311952, "grad_norm": 1.9715732068507474, "learning_rate": 1.5507048870680668e-05, "loss": 1.0092, "step": 830 }, { "epoch": 0.38532533456391327, "grad_norm": 1.9346233566232378, "learning_rate": 1.5439632785636707e-05, "loss": 1.0834, "step": 835 }, { "epoch": 0.3876326718966313, "grad_norm": 2.2247085368619164, "learning_rate": 1.5371863730616586e-05, "loss": 1.0608, "step": 840 }, { "epoch": 0.38994000922934935, "grad_norm": 1.855445999462738, "learning_rate": 1.5303746103057163e-05, "loss": 1.0311, "step": 845 }, { "epoch": 0.3922473465620674, "grad_norm": 1.8433500481185805, "learning_rate": 1.5235284323013674e-05, "loss": 1.0513, "step": 850 }, { "epoch": 0.39455468389478543, "grad_norm": 1.9238020550812749, "learning_rate": 1.5166482832872923e-05, "loss": 1.0611, "step": 855 }, { "epoch": 0.39686202122750347, "grad_norm": 2.034539491931288, "learning_rate": 1.5097346097065008e-05, "loss": 1.0369, "step": 860 }, { "epoch": 0.3991693585602215, "grad_norm": 1.8719773240320596, "learning_rate": 1.5027878601773633e-05, "loss": 1.031, "step": 865 }, { "epoch": 0.40147669589293955, "grad_norm": 1.8218549093076317, "learning_rate": 1.4958084854645018e-05, "loss": 1.027, "step": 870 }, { "epoch": 0.4037840332256576, "grad_norm": 1.908377236915306, "learning_rate": 1.4887969384495403e-05, "loss": 1.0505, "step": 875 }, { "epoch": 0.40609137055837563, "grad_norm": 1.882874108335332, "learning_rate": 1.4817536741017153e-05, "loss": 1.0421, "step": 880 }, { "epoch": 0.40839870789109367, "grad_norm": 2.0217065440618622, "learning_rate": 1.4746791494483584e-05, "loss": 1.0533, "step": 885 }, { "epoch": 0.4107060452238117, "grad_norm": 1.8717323475177303, "learning_rate": 1.4675738235452352e-05, "loss": 1.0279, "step": 890 }, { "epoch": 0.41301338255652975, "grad_norm": 1.9788825364185045, "learning_rate": 1.4604381574467616e-05, "loss": 1.042, "step": 895 }, { "epoch": 0.4153207198892478, "grad_norm": 1.9327030504589935, "learning_rate": 1.4532726141760849e-05, "loss": 1.06, "step": 900 }, { "epoch": 0.41762805722196583, "grad_norm": 1.8050202525178007, "learning_rate": 1.4460776586950393e-05, "loss": 1.0176, "step": 905 }, { "epoch": 0.41993539455468387, "grad_norm": 1.7140772518888605, "learning_rate": 1.438853757873975e-05, "loss": 1.0336, "step": 910 }, { "epoch": 0.4222427318874019, "grad_norm": 1.9381284110778458, "learning_rate": 1.4316013804614644e-05, "loss": 1.0283, "step": 915 }, { "epoch": 0.42455006922012, "grad_norm": 1.8569863683755345, "learning_rate": 1.4243209970538846e-05, "loss": 1.0295, "step": 920 }, { "epoch": 0.42685740655283805, "grad_norm": 1.7584125894267681, "learning_rate": 1.4170130800648814e-05, "loss": 1.0451, "step": 925 }, { "epoch": 0.4291647438855561, "grad_norm": 1.8468563006595364, "learning_rate": 1.4096781036947159e-05, "loss": 1.0329, "step": 930 }, { "epoch": 0.43147208121827413, "grad_norm": 1.8005410726866136, "learning_rate": 1.4023165438994933e-05, "loss": 1.0523, "step": 935 }, { "epoch": 0.43377941855099217, "grad_norm": 1.7881203203680747, "learning_rate": 1.394928878360279e-05, "loss": 1.052, "step": 940 }, { "epoch": 0.4360867558837102, "grad_norm": 1.9402582404112974, "learning_rate": 1.3875155864521031e-05, "loss": 1.0418, "step": 945 }, { "epoch": 0.43839409321642825, "grad_norm": 1.926002050119894, "learning_rate": 1.3800771492128537e-05, "loss": 1.0491, "step": 950 }, { "epoch": 0.4407014305491463, "grad_norm": 1.8807563751664647, "learning_rate": 1.3726140493120639e-05, "loss": 1.032, "step": 955 }, { "epoch": 0.44300876788186433, "grad_norm": 1.9189485256851713, "learning_rate": 1.3651267710195909e-05, "loss": 1.0355, "step": 960 }, { "epoch": 0.44531610521458237, "grad_norm": 1.9803461150155048, "learning_rate": 1.3576158001741932e-05, "loss": 1.0569, "step": 965 }, { "epoch": 0.4476234425473004, "grad_norm": 2.018305042251882, "learning_rate": 1.3500816241520059e-05, "loss": 1.04, "step": 970 }, { "epoch": 0.44993077988001845, "grad_norm": 1.7580864552202506, "learning_rate": 1.3425247318349137e-05, "loss": 1.0075, "step": 975 }, { "epoch": 0.4522381172127365, "grad_norm": 1.8967387580024155, "learning_rate": 1.3349456135788298e-05, "loss": 1.0429, "step": 980 }, { "epoch": 0.45454545454545453, "grad_norm": 2.2301409193225985, "learning_rate": 1.3273447611818768e-05, "loss": 1.0244, "step": 985 }, { "epoch": 0.45685279187817257, "grad_norm": 1.8218900361937265, "learning_rate": 1.3197226678524739e-05, "loss": 1.0006, "step": 990 }, { "epoch": 0.4591601292108906, "grad_norm": 1.8705871989801575, "learning_rate": 1.3120798281773346e-05, "loss": 1.0382, "step": 995 }, { "epoch": 0.46146746654360865, "grad_norm": 1.8993921065361903, "learning_rate": 1.3044167380893726e-05, "loss": 1.0543, "step": 1000 }, { "epoch": 0.46377480387632675, "grad_norm": 1.762141098208751, "learning_rate": 1.2967338948355217e-05, "loss": 1.031, "step": 1005 }, { "epoch": 0.4660821412090448, "grad_norm": 1.8349620217027005, "learning_rate": 1.2890317969444716e-05, "loss": 1.0104, "step": 1010 }, { "epoch": 0.4683894785417628, "grad_norm": 1.9527169882770812, "learning_rate": 1.2813109441943166e-05, "loss": 1.0325, "step": 1015 }, { "epoch": 0.47069681587448087, "grad_norm": 1.7594153130967782, "learning_rate": 1.273571837580127e-05, "loss": 1.0476, "step": 1020 }, { "epoch": 0.4730041532071989, "grad_norm": 1.7823802580407797, "learning_rate": 1.2658149792814405e-05, "loss": 1.0397, "step": 1025 }, { "epoch": 0.47531149053991695, "grad_norm": 1.7288773807653248, "learning_rate": 1.258040872629676e-05, "loss": 1.0419, "step": 1030 }, { "epoch": 0.477618827872635, "grad_norm": 1.8969879276116197, "learning_rate": 1.2502500220754736e-05, "loss": 1.0538, "step": 1035 }, { "epoch": 0.47992616520535303, "grad_norm": 1.9748280209096565, "learning_rate": 1.242442933155961e-05, "loss": 1.0088, "step": 1040 }, { "epoch": 0.48223350253807107, "grad_norm": 1.8630834023430007, "learning_rate": 1.2346201124619502e-05, "loss": 1.0041, "step": 1045 }, { "epoch": 0.4845408398707891, "grad_norm": 1.93282001404706, "learning_rate": 1.2267820676050657e-05, "loss": 1.0117, "step": 1050 }, { "epoch": 0.48684817720350715, "grad_norm": 1.7732915883551568, "learning_rate": 1.2189293071848051e-05, "loss": 1.0395, "step": 1055 }, { "epoch": 0.4891555145362252, "grad_norm": 1.7668917225682153, "learning_rate": 1.2110623407555398e-05, "loss": 1.0055, "step": 1060 }, { "epoch": 0.49146285186894323, "grad_norm": 1.770548333794891, "learning_rate": 1.2031816787934465e-05, "loss": 1.0198, "step": 1065 }, { "epoch": 0.49377018920166127, "grad_norm": 1.804504292487286, "learning_rate": 1.1952878326633872e-05, "loss": 0.9925, "step": 1070 }, { "epoch": 0.4960775265343793, "grad_norm": 1.8384240011850799, "learning_rate": 1.187381314585725e-05, "loss": 1.0167, "step": 1075 }, { "epoch": 0.49838486386709735, "grad_norm": 1.8738489698458378, "learning_rate": 1.1794626376030866e-05, "loss": 1.0266, "step": 1080 }, { "epoch": 0.5006922011998154, "grad_norm": 1.7963716989600227, "learning_rate": 1.1715323155470745e-05, "loss": 1.0203, "step": 1085 }, { "epoch": 0.5029995385325334, "grad_norm": 1.7950924324700734, "learning_rate": 1.163590863004922e-05, "loss": 1.0014, "step": 1090 }, { "epoch": 0.5053068758652515, "grad_norm": 1.7996992785566162, "learning_rate": 1.1556387952861036e-05, "loss": 1.0147, "step": 1095 }, { "epoch": 0.5076142131979695, "grad_norm": 1.9262189643769105, "learning_rate": 1.1476766283888986e-05, "loss": 1.0176, "step": 1100 }, { "epoch": 0.5099215505306876, "grad_norm": 1.738673259571015, "learning_rate": 1.1397048789669061e-05, "loss": 1.0221, "step": 1105 }, { "epoch": 0.5122288878634056, "grad_norm": 1.7993896869653003, "learning_rate": 1.1317240642955226e-05, "loss": 1.0232, "step": 1110 }, { "epoch": 0.5145362251961236, "grad_norm": 1.8492729168966688, "learning_rate": 1.1237347022383747e-05, "loss": 1.0138, "step": 1115 }, { "epoch": 0.5168435625288417, "grad_norm": 1.792127722956897, "learning_rate": 1.1157373112137171e-05, "loss": 1.011, "step": 1120 }, { "epoch": 0.5191508998615597, "grad_norm": 1.767761412954839, "learning_rate": 1.107732410160793e-05, "loss": 0.9917, "step": 1125 }, { "epoch": 0.5214582371942778, "grad_norm": 1.78861016845621, "learning_rate": 1.0997205185061599e-05, "loss": 1.024, "step": 1130 }, { "epoch": 0.5237655745269958, "grad_norm": 1.796111964851059, "learning_rate": 1.0917021561299864e-05, "loss": 1.0094, "step": 1135 }, { "epoch": 0.5260729118597139, "grad_norm": 1.6839699201837544, "learning_rate": 1.083677843332316e-05, "loss": 1.0019, "step": 1140 }, { "epoch": 0.528380249192432, "grad_norm": 1.735381208836221, "learning_rate": 1.0756481007993063e-05, "loss": 0.9979, "step": 1145 }, { "epoch": 0.53068758652515, "grad_norm": 1.7283238773850635, "learning_rate": 1.0676134495694439e-05, "loss": 1.0127, "step": 1150 }, { "epoch": 0.5329949238578681, "grad_norm": 1.8700927823490678, "learning_rate": 1.0595744109997326e-05, "loss": 0.9897, "step": 1155 }, { "epoch": 0.5353022611905861, "grad_norm": 1.7850898978429104, "learning_rate": 1.0515315067318652e-05, "loss": 1.0155, "step": 1160 }, { "epoch": 0.5376095985233041, "grad_norm": 1.8867979217718087, "learning_rate": 1.0434852586583737e-05, "loss": 0.9996, "step": 1165 }, { "epoch": 0.5399169358560222, "grad_norm": 1.8409069763382047, "learning_rate": 1.0354361888887642e-05, "loss": 1.0038, "step": 1170 }, { "epoch": 0.5422242731887402, "grad_norm": 1.9318197730416369, "learning_rate": 1.0273848197156401e-05, "loss": 0.9893, "step": 1175 }, { "epoch": 0.5445316105214583, "grad_norm": 1.7534602196952722, "learning_rate": 1.0193316735808085e-05, "loss": 0.993, "step": 1180 }, { "epoch": 0.5468389478541763, "grad_norm": 1.7436059512387687, "learning_rate": 1.0112772730413816e-05, "loss": 1.0079, "step": 1185 }, { "epoch": 0.5491462851868943, "grad_norm": 1.8076118426423142, "learning_rate": 1.0032221407358683e-05, "loss": 1.0336, "step": 1190 }, { "epoch": 0.5514536225196124, "grad_norm": 1.9044420451434694, "learning_rate": 9.951667993502599e-06, "loss": 1.0152, "step": 1195 }, { "epoch": 0.5537609598523304, "grad_norm": 1.842329102136153, "learning_rate": 9.871117715841151e-06, "loss": 0.9783, "step": 1200 }, { "epoch": 0.5560682971850485, "grad_norm": 1.8583622986365993, "learning_rate": 9.790575801166432e-06, "loss": 1.0054, "step": 1205 }, { "epoch": 0.5583756345177665, "grad_norm": 1.7004919095912332, "learning_rate": 9.710047475727854e-06, "loss": 1.0011, "step": 1210 }, { "epoch": 0.5606829718504845, "grad_norm": 1.7280460978374188, "learning_rate": 9.629537964893063e-06, "loss": 1.0299, "step": 1215 }, { "epoch": 0.5629903091832026, "grad_norm": 1.7174011953937558, "learning_rate": 9.549052492808834e-06, "loss": 0.9946, "step": 1220 }, { "epoch": 0.5652976465159206, "grad_norm": 1.8215498597720168, "learning_rate": 9.468596282062114e-06, "loss": 1.0113, "step": 1225 }, { "epoch": 0.5676049838486387, "grad_norm": 1.6911500192296895, "learning_rate": 9.38817455334112e-06, "loss": 0.9855, "step": 1230 }, { "epoch": 0.5699123211813567, "grad_norm": 1.8405046704539174, "learning_rate": 9.307792525096582e-06, "loss": 1.0113, "step": 1235 }, { "epoch": 0.5722196585140747, "grad_norm": 1.838204327540361, "learning_rate": 9.227455413203115e-06, "loss": 0.9947, "step": 1240 }, { "epoch": 0.5745269958467928, "grad_norm": 1.656688699609939, "learning_rate": 9.147168430620788e-06, "loss": 0.9892, "step": 1245 }, { "epoch": 0.5768343331795108, "grad_norm": 1.7231036061816765, "learning_rate": 9.066936787056843e-06, "loss": 0.9944, "step": 1250 }, { "epoch": 0.5791416705122289, "grad_norm": 1.645605940940624, "learning_rate": 8.986765688627652e-06, "loss": 0.9936, "step": 1255 }, { "epoch": 0.5814490078449469, "grad_norm": 1.8141527360329759, "learning_rate": 8.906660337520903e-06, "loss": 1.0096, "step": 1260 }, { "epoch": 0.583756345177665, "grad_norm": 1.856808726362016, "learning_rate": 8.82662593165804e-06, "loss": 1.0032, "step": 1265 }, { "epoch": 0.586063682510383, "grad_norm": 1.8593818092553211, "learning_rate": 8.746667664356957e-06, "loss": 1.0177, "step": 1270 }, { "epoch": 0.588371019843101, "grad_norm": 1.7318701186944272, "learning_rate": 8.666790723995043e-06, "loss": 0.9933, "step": 1275 }, { "epoch": 0.5906783571758191, "grad_norm": 1.8632249625406112, "learning_rate": 8.587000293672482e-06, "loss": 1.0278, "step": 1280 }, { "epoch": 0.5929856945085371, "grad_norm": 1.8482080793994375, "learning_rate": 8.50730155087596e-06, "loss": 0.9753, "step": 1285 }, { "epoch": 0.5952930318412551, "grad_norm": 1.6654816438940703, "learning_rate": 8.427699667142681e-06, "loss": 0.9923, "step": 1290 }, { "epoch": 0.5976003691739732, "grad_norm": 1.816789112483473, "learning_rate": 8.348199807724806e-06, "loss": 0.9951, "step": 1295 }, { "epoch": 0.5999077065066912, "grad_norm": 1.8671938825009406, "learning_rate": 8.268807131254288e-06, "loss": 1.0063, "step": 1300 }, { "epoch": 0.6022150438394093, "grad_norm": 1.736173419625791, "learning_rate": 8.189526789408123e-06, "loss": 0.9942, "step": 1305 }, { "epoch": 0.6045223811721273, "grad_norm": 1.7397594354717327, "learning_rate": 8.110363926574088e-06, "loss": 0.9899, "step": 1310 }, { "epoch": 0.6068297185048455, "grad_norm": 1.7112354026341845, "learning_rate": 8.0313236795169e-06, "loss": 0.9981, "step": 1315 }, { "epoch": 0.6091370558375635, "grad_norm": 1.7633777819452738, "learning_rate": 7.952411177044923e-06, "loss": 0.9667, "step": 1320 }, { "epoch": 0.6114443931702815, "grad_norm": 1.7477692209080626, "learning_rate": 7.873631539677364e-06, "loss": 0.9979, "step": 1325 }, { "epoch": 0.6137517305029996, "grad_norm": 1.7532055508610305, "learning_rate": 7.794989879311991e-06, "loss": 0.9869, "step": 1330 }, { "epoch": 0.6160590678357176, "grad_norm": 1.8525858143415055, "learning_rate": 7.716491298893443e-06, "loss": 0.9834, "step": 1335 }, { "epoch": 0.6183664051684357, "grad_norm": 1.749585519245075, "learning_rate": 7.638140892082118e-06, "loss": 1.0092, "step": 1340 }, { "epoch": 0.6206737425011537, "grad_norm": 1.8420135288729067, "learning_rate": 7.559943742923626e-06, "loss": 0.9797, "step": 1345 }, { "epoch": 0.6229810798338717, "grad_norm": 1.7361527256574634, "learning_rate": 7.4819049255189215e-06, "loss": 1.0084, "step": 1350 }, { "epoch": 0.6252884171665898, "grad_norm": 1.8341519418326866, "learning_rate": 7.404029503695028e-06, "loss": 0.978, "step": 1355 }, { "epoch": 0.6275957544993078, "grad_norm": 1.8293945335237427, "learning_rate": 7.326322530676471e-06, "loss": 0.9949, "step": 1360 }, { "epoch": 0.6299030918320259, "grad_norm": 1.8042095660293147, "learning_rate": 7.248789048757368e-06, "loss": 0.9708, "step": 1365 }, { "epoch": 0.6322104291647439, "grad_norm": 1.845467719423503, "learning_rate": 7.171434088974252e-06, "loss": 0.9965, "step": 1370 }, { "epoch": 0.6345177664974619, "grad_norm": 1.6543843384272663, "learning_rate": 7.094262670779611e-06, "loss": 0.9745, "step": 1375 }, { "epoch": 0.63682510383018, "grad_norm": 1.78812671106571, "learning_rate": 7.017279801716177e-06, "loss": 0.9913, "step": 1380 }, { "epoch": 0.639132441162898, "grad_norm": 1.6947334759904245, "learning_rate": 6.940490477092004e-06, "loss": 0.9852, "step": 1385 }, { "epoch": 0.6414397784956161, "grad_norm": 1.8535301270043634, "learning_rate": 6.8638996796563275e-06, "loss": 1.007, "step": 1390 }, { "epoch": 0.6437471158283341, "grad_norm": 1.7676355127694694, "learning_rate": 6.78751237927623e-06, "loss": 0.9514, "step": 1395 }, { "epoch": 0.6460544531610521, "grad_norm": 1.6769380120076558, "learning_rate": 6.711333532614168e-06, "loss": 0.9698, "step": 1400 }, { "epoch": 0.6483617904937702, "grad_norm": 1.7272039849376555, "learning_rate": 6.6353680828063306e-06, "loss": 0.948, "step": 1405 }, { "epoch": 0.6506691278264882, "grad_norm": 1.7909691104530978, "learning_rate": 6.559620959141897e-06, "loss": 0.9741, "step": 1410 }, { "epoch": 0.6529764651592063, "grad_norm": 1.7584119603336634, "learning_rate": 6.48409707674317e-06, "loss": 0.9825, "step": 1415 }, { "epoch": 0.6552838024919243, "grad_norm": 1.704146715339984, "learning_rate": 6.408801336246645e-06, "loss": 0.9473, "step": 1420 }, { "epoch": 0.6575911398246423, "grad_norm": 1.675928516675119, "learning_rate": 6.3337386234850255e-06, "loss": 0.9726, "step": 1425 }, { "epoch": 0.6598984771573604, "grad_norm": 1.8640939079623915, "learning_rate": 6.258913809170169e-06, "loss": 0.9899, "step": 1430 }, { "epoch": 0.6622058144900784, "grad_norm": 1.7077639495220778, "learning_rate": 6.18433174857705e-06, "loss": 0.9856, "step": 1435 }, { "epoch": 0.6645131518227965, "grad_norm": 1.7756594990657744, "learning_rate": 6.1099972812287e-06, "loss": 0.9766, "step": 1440 }, { "epoch": 0.6668204891555145, "grad_norm": 1.9246917842171538, "learning_rate": 6.035915230582176e-06, "loss": 0.9802, "step": 1445 }, { "epoch": 0.6691278264882325, "grad_norm": 1.7061051671690723, "learning_rate": 5.962090403715592e-06, "loss": 0.9589, "step": 1450 }, { "epoch": 0.6714351638209506, "grad_norm": 1.9209301423646885, "learning_rate": 5.8885275910161574e-06, "loss": 0.9661, "step": 1455 }, { "epoch": 0.6737425011536686, "grad_norm": 1.7522165825796936, "learning_rate": 5.815231565869377e-06, "loss": 0.9683, "step": 1460 }, { "epoch": 0.6760498384863867, "grad_norm": 1.7093168808099815, "learning_rate": 5.742207084349274e-06, "loss": 0.9787, "step": 1465 }, { "epoch": 0.6783571758191047, "grad_norm": 1.7331687059615726, "learning_rate": 5.669458884909815e-06, "loss": 0.962, "step": 1470 }, { "epoch": 0.6806645131518227, "grad_norm": 1.7125984142423774, "learning_rate": 5.596991688077409e-06, "loss": 0.9749, "step": 1475 }, { "epoch": 0.6829718504845408, "grad_norm": 1.7116106483633, "learning_rate": 5.5248101961446065e-06, "loss": 0.9646, "step": 1480 }, { "epoch": 0.6852791878172588, "grad_norm": 1.7871177917200074, "learning_rate": 5.452919092864976e-06, "loss": 0.9869, "step": 1485 }, { "epoch": 0.687586525149977, "grad_norm": 1.7422263712914812, "learning_rate": 5.381323043149192e-06, "loss": 0.9598, "step": 1490 }, { "epoch": 0.689893862482695, "grad_norm": 1.827833481936086, "learning_rate": 5.310026692762316e-06, "loss": 0.9674, "step": 1495 }, { "epoch": 0.6922011998154131, "grad_norm": 1.8013595644003924, "learning_rate": 5.239034668022353e-06, "loss": 0.9573, "step": 1500 }, { "epoch": 0.6945085371481311, "grad_norm": 1.8007495209856474, "learning_rate": 5.168351575500049e-06, "loss": 0.9719, "step": 1505 }, { "epoch": 0.6968158744808491, "grad_norm": 1.7404494760342795, "learning_rate": 5.097982001719994e-06, "loss": 0.9724, "step": 1510 }, { "epoch": 0.6991232118135672, "grad_norm": 1.8207400554800481, "learning_rate": 5.027930512862976e-06, "loss": 0.9643, "step": 1515 }, { "epoch": 0.7014305491462852, "grad_norm": 1.6503774911907483, "learning_rate": 4.958201654469731e-06, "loss": 0.9718, "step": 1520 }, { "epoch": 0.7037378864790033, "grad_norm": 1.7042499652030019, "learning_rate": 4.888799951145948e-06, "loss": 0.9786, "step": 1525 }, { "epoch": 0.7060452238117213, "grad_norm": 1.691429781718496, "learning_rate": 4.8197299062687e-06, "loss": 0.9584, "step": 1530 }, { "epoch": 0.7083525611444393, "grad_norm": 1.789471718965235, "learning_rate": 4.750996001694215e-06, "loss": 0.978, "step": 1535 }, { "epoch": 0.7106598984771574, "grad_norm": 1.762516310353455, "learning_rate": 4.6826026974670665e-06, "loss": 0.9536, "step": 1540 }, { "epoch": 0.7129672358098754, "grad_norm": 1.6808685507807348, "learning_rate": 4.614554431530754e-06, "loss": 0.9453, "step": 1545 }, { "epoch": 0.7152745731425935, "grad_norm": 1.695661878440997, "learning_rate": 4.546855619439734e-06, "loss": 0.9674, "step": 1550 }, { "epoch": 0.7175819104753115, "grad_norm": 1.780885617378124, "learning_rate": 4.479510654072909e-06, "loss": 0.9724, "step": 1555 }, { "epoch": 0.7198892478080295, "grad_norm": 1.7441807694903777, "learning_rate": 4.412523905348568e-06, "loss": 0.9422, "step": 1560 }, { "epoch": 0.7221965851407476, "grad_norm": 1.7666139001524914, "learning_rate": 4.345899719940844e-06, "loss": 0.9496, "step": 1565 }, { "epoch": 0.7245039224734656, "grad_norm": 1.6463010573052135, "learning_rate": 4.279642420997655e-06, "loss": 0.9635, "step": 1570 }, { "epoch": 0.7268112598061837, "grad_norm": 1.7790169033851828, "learning_rate": 4.213756307860175e-06, "loss": 0.9795, "step": 1575 }, { "epoch": 0.7291185971389017, "grad_norm": 1.709252193673288, "learning_rate": 4.148245655783869e-06, "loss": 0.9542, "step": 1580 }, { "epoch": 0.7314259344716197, "grad_norm": 1.7064646780964507, "learning_rate": 4.083114715661069e-06, "loss": 0.9494, "step": 1585 }, { "epoch": 0.7337332718043378, "grad_norm": 1.7203471522785316, "learning_rate": 4.018367713745137e-06, "loss": 0.9513, "step": 1590 }, { "epoch": 0.7360406091370558, "grad_norm": 1.7329896835019194, "learning_rate": 3.954008851376252e-06, "loss": 0.9464, "step": 1595 }, { "epoch": 0.7383479464697739, "grad_norm": 1.6668720129339225, "learning_rate": 3.890042304708758e-06, "loss": 0.9349, "step": 1600 }, { "epoch": 0.7406552838024919, "grad_norm": 1.6612958616670062, "learning_rate": 3.826472224440202e-06, "loss": 0.9753, "step": 1605 }, { "epoch": 0.7429626211352099, "grad_norm": 1.689937062434287, "learning_rate": 3.763302735541987e-06, "loss": 0.9755, "step": 1610 }, { "epoch": 0.745269958467928, "grad_norm": 1.8524303075498816, "learning_rate": 3.700537936991733e-06, "loss": 0.9919, "step": 1615 }, { "epoch": 0.747577295800646, "grad_norm": 1.7330330880413027, "learning_rate": 3.6381819015072652e-06, "loss": 0.9968, "step": 1620 }, { "epoch": 0.7498846331333641, "grad_norm": 1.732375990079818, "learning_rate": 3.5762386752823643e-06, "loss": 0.9598, "step": 1625 }, { "epoch": 0.7521919704660821, "grad_norm": 1.6723382538398348, "learning_rate": 3.5147122777242203e-06, "loss": 0.9826, "step": 1630 }, { "epoch": 0.7544993077988001, "grad_norm": 1.698445076932435, "learning_rate": 3.4536067011925945e-06, "loss": 0.975, "step": 1635 }, { "epoch": 0.7568066451315182, "grad_norm": 1.6756544799204833, "learning_rate": 3.3929259107407785e-06, "loss": 0.9596, "step": 1640 }, { "epoch": 0.7591139824642362, "grad_norm": 1.7323687941815844, "learning_rate": 3.3326738438583116e-06, "loss": 0.9471, "step": 1645 }, { "epoch": 0.7614213197969543, "grad_norm": 1.6841658818773522, "learning_rate": 3.272854410215467e-06, "loss": 0.9478, "step": 1650 }, { "epoch": 0.7637286571296723, "grad_norm": 1.7258401397718819, "learning_rate": 3.213471491409568e-06, "loss": 0.9545, "step": 1655 }, { "epoch": 0.7660359944623903, "grad_norm": 1.754250495342998, "learning_rate": 3.1545289407131128e-06, "loss": 0.9557, "step": 1660 }, { "epoch": 0.7683433317951085, "grad_norm": 1.7109892895946872, "learning_rate": 3.0960305828237568e-06, "loss": 0.9649, "step": 1665 }, { "epoch": 0.7706506691278265, "grad_norm": 1.8321981237158624, "learning_rate": 3.0379802136161073e-06, "loss": 0.9612, "step": 1670 }, { "epoch": 0.7729580064605446, "grad_norm": 1.6838996787097582, "learning_rate": 2.9803815998954334e-06, "loss": 0.9701, "step": 1675 }, { "epoch": 0.7752653437932626, "grad_norm": 1.7069776476837635, "learning_rate": 2.9232384791532377e-06, "loss": 0.9724, "step": 1680 }, { "epoch": 0.7775726811259807, "grad_norm": 1.6302606016182208, "learning_rate": 2.866554559324731e-06, "loss": 0.9441, "step": 1685 }, { "epoch": 0.7798800184586987, "grad_norm": 1.6451304542174006, "learning_rate": 2.810333518548246e-06, "loss": 0.9337, "step": 1690 }, { "epoch": 0.7821873557914167, "grad_norm": 1.763422741436097, "learning_rate": 2.7545790049265506e-06, "loss": 0.9542, "step": 1695 }, { "epoch": 0.7844946931241348, "grad_norm": 1.6780739225283752, "learning_rate": 2.699294636290134e-06, "loss": 0.9468, "step": 1700 }, { "epoch": 0.7868020304568528, "grad_norm": 1.7170452157015115, "learning_rate": 2.6444839999624496e-06, "loss": 0.9333, "step": 1705 }, { "epoch": 0.7891093677895709, "grad_norm": 1.6339639009427172, "learning_rate": 2.5901506525271424e-06, "loss": 0.9656, "step": 1710 }, { "epoch": 0.7914167051222889, "grad_norm": 1.7076055984466658, "learning_rate": 2.5362981195972627e-06, "loss": 0.9292, "step": 1715 }, { "epoch": 0.7937240424550069, "grad_norm": 1.694775381636099, "learning_rate": 2.4829298955865022e-06, "loss": 0.9621, "step": 1720 }, { "epoch": 0.796031379787725, "grad_norm": 1.6448376757312444, "learning_rate": 2.4300494434824373e-06, "loss": 0.9323, "step": 1725 }, { "epoch": 0.798338717120443, "grad_norm": 1.7190721711044321, "learning_rate": 2.3776601946218225e-06, "loss": 0.9536, "step": 1730 }, { "epoch": 0.8006460544531611, "grad_norm": 1.7369056682520372, "learning_rate": 2.3257655484679376e-06, "loss": 0.9474, "step": 1735 }, { "epoch": 0.8029533917858791, "grad_norm": 1.9277048905233987, "learning_rate": 2.274368872390009e-06, "loss": 0.953, "step": 1740 }, { "epoch": 0.8052607291185971, "grad_norm": 1.6988058479966548, "learning_rate": 2.2234735014446905e-06, "loss": 0.9546, "step": 1745 }, { "epoch": 0.8075680664513152, "grad_norm": 1.708926311661711, "learning_rate": 2.1730827381596643e-06, "loss": 0.9442, "step": 1750 }, { "epoch": 0.8098754037840332, "grad_norm": 1.7384137551353784, "learning_rate": 2.123199852319352e-06, "loss": 0.9415, "step": 1755 }, { "epoch": 0.8121827411167513, "grad_norm": 1.706865871432203, "learning_rate": 2.073828080752728e-06, "loss": 0.9514, "step": 1760 }, { "epoch": 0.8144900784494693, "grad_norm": 1.631591345911517, "learning_rate": 2.024970627123295e-06, "loss": 0.9593, "step": 1765 }, { "epoch": 0.8167974157821873, "grad_norm": 1.7250014472303201, "learning_rate": 1.976630661721207e-06, "loss": 0.9312, "step": 1770 }, { "epoch": 0.8191047531149054, "grad_norm": 1.7176929983275837, "learning_rate": 1.9288113212575454e-06, "loss": 0.9392, "step": 1775 }, { "epoch": 0.8214120904476234, "grad_norm": 1.7515517183747666, "learning_rate": 1.8815157086607826e-06, "loss": 0.9461, "step": 1780 }, { "epoch": 0.8237194277803415, "grad_norm": 1.7256573401014044, "learning_rate": 1.8347468928754408e-06, "loss": 0.9625, "step": 1785 }, { "epoch": 0.8260267651130595, "grad_norm": 1.6925351828448565, "learning_rate": 1.7885079086629598e-06, "loss": 0.9618, "step": 1790 }, { "epoch": 0.8283341024457775, "grad_norm": 1.6848423059711715, "learning_rate": 1.7428017564047594e-06, "loss": 0.957, "step": 1795 }, { "epoch": 0.8306414397784956, "grad_norm": 1.685378392680059, "learning_rate": 1.697631401907559e-06, "loss": 0.9405, "step": 1800 }, { "epoch": 0.8329487771112136, "grad_norm": 1.779147836504438, "learning_rate": 1.6529997762109319e-06, "loss": 0.9475, "step": 1805 }, { "epoch": 0.8352561144439317, "grad_norm": 1.6862245640499274, "learning_rate": 1.6089097753971061e-06, "loss": 0.9433, "step": 1810 }, { "epoch": 0.8375634517766497, "grad_norm": 1.6387699919494911, "learning_rate": 1.565364260403055e-06, "loss": 0.9393, "step": 1815 }, { "epoch": 0.8398707891093677, "grad_norm": 1.7088328181524817, "learning_rate": 1.522366056834844e-06, "loss": 0.9322, "step": 1820 }, { "epoch": 0.8421781264420858, "grad_norm": 1.717214554275275, "learning_rate": 1.4799179547842823e-06, "loss": 0.9393, "step": 1825 }, { "epoch": 0.8444854637748038, "grad_norm": 1.717143943169584, "learning_rate": 1.4380227086478816e-06, "loss": 0.96, "step": 1830 }, { "epoch": 0.846792801107522, "grad_norm": 1.7390583641873172, "learning_rate": 1.3966830369481231e-06, "loss": 0.9487, "step": 1835 }, { "epoch": 0.84910013844024, "grad_norm": 1.6483510037137357, "learning_rate": 1.3559016221570663e-06, "loss": 0.9315, "step": 1840 }, { "epoch": 0.8514074757729581, "grad_norm": 1.7716145374153562, "learning_rate": 1.3156811105222723e-06, "loss": 0.9375, "step": 1845 }, { "epoch": 0.8537148131056761, "grad_norm": 1.7136369284376767, "learning_rate": 1.276024111895101e-06, "loss": 0.9592, "step": 1850 }, { "epoch": 0.8560221504383941, "grad_norm": 1.659445261277345, "learning_rate": 1.2369331995613664e-06, "loss": 0.9466, "step": 1855 }, { "epoch": 0.8583294877711122, "grad_norm": 1.7022849150801465, "learning_rate": 1.1984109100743445e-06, "loss": 0.934, "step": 1860 }, { "epoch": 0.8606368251038302, "grad_norm": 1.7296616243070897, "learning_rate": 1.1604597430902032e-06, "loss": 0.9413, "step": 1865 }, { "epoch": 0.8629441624365483, "grad_norm": 1.70400411645417, "learning_rate": 1.123082161205775e-06, "loss": 0.9192, "step": 1870 }, { "epoch": 0.8652514997692663, "grad_norm": 1.7716704719549248, "learning_rate": 1.0862805897987894e-06, "loss": 0.9313, "step": 1875 }, { "epoch": 0.8675588371019843, "grad_norm": 1.6028791905428008, "learning_rate": 1.0500574168704746e-06, "loss": 0.9222, "step": 1880 }, { "epoch": 0.8698661744347024, "grad_norm": 1.6811903757943443, "learning_rate": 1.014414992890611e-06, "loss": 0.9613, "step": 1885 }, { "epoch": 0.8721735117674204, "grad_norm": 1.6582769629121996, "learning_rate": 9.793556306450125e-07, "loss": 0.9397, "step": 1890 }, { "epoch": 0.8744808491001385, "grad_norm": 1.7329477389715338, "learning_rate": 9.448816050854559e-07, "loss": 0.9456, "step": 1895 }, { "epoch": 0.8767881864328565, "grad_norm": 1.6579278218600528, "learning_rate": 9.10995153182056e-07, "loss": 0.9622, "step": 1900 }, { "epoch": 0.8790955237655745, "grad_norm": 1.7059466024322731, "learning_rate": 8.776984737781135e-07, "loss": 0.9247, "step": 1905 }, { "epoch": 0.8814028610982926, "grad_norm": 1.6587934531932846, "learning_rate": 8.449937274474396e-07, "loss": 0.9287, "step": 1910 }, { "epoch": 0.8837101984310106, "grad_norm": 1.7000421816452764, "learning_rate": 8.128830363541574e-07, "loss": 0.9579, "step": 1915 }, { "epoch": 0.8860175357637287, "grad_norm": 1.5982289727068608, "learning_rate": 7.81368484114996e-07, "loss": 0.9252, "step": 1920 }, { "epoch": 0.8883248730964467, "grad_norm": 1.6595039047537794, "learning_rate": 7.504521156640854e-07, "loss": 0.9535, "step": 1925 }, { "epoch": 0.8906322104291647, "grad_norm": 1.67142320517366, "learning_rate": 7.201359371202698e-07, "loss": 0.9342, "step": 1930 }, { "epoch": 0.8929395477618828, "grad_norm": 1.724210373217245, "learning_rate": 6.904219156569325e-07, "loss": 0.9537, "step": 1935 }, { "epoch": 0.8952468850946008, "grad_norm": 1.775137829218875, "learning_rate": 6.613119793743428e-07, "loss": 0.9407, "step": 1940 }, { "epoch": 0.8975542224273189, "grad_norm": 1.6829966494434596, "learning_rate": 6.32808017174551e-07, "loss": 0.9271, "step": 1945 }, { "epoch": 0.8998615597600369, "grad_norm": 1.7666732440040138, "learning_rate": 6.049118786388153e-07, "loss": 0.9299, "step": 1950 }, { "epoch": 0.9021688970927549, "grad_norm": 1.7328446243608238, "learning_rate": 5.776253739075887e-07, "loss": 0.9368, "step": 1955 }, { "epoch": 0.904476234425473, "grad_norm": 1.794453478228994, "learning_rate": 5.509502735630601e-07, "loss": 0.9584, "step": 1960 }, { "epoch": 0.906783571758191, "grad_norm": 1.8595416734645607, "learning_rate": 5.248883085142653e-07, "loss": 0.9278, "step": 1965 }, { "epoch": 0.9090909090909091, "grad_norm": 1.8064599818138745, "learning_rate": 4.994411698847668e-07, "loss": 0.9521, "step": 1970 }, { "epoch": 0.9113982464236271, "grad_norm": 11.97761580494739, "learning_rate": 4.746105089029229e-07, "loss": 0.9353, "step": 1975 }, { "epoch": 0.9137055837563451, "grad_norm": 1.91129591540662, "learning_rate": 4.50397936794742e-07, "loss": 0.9518, "step": 1980 }, { "epoch": 0.9160129210890632, "grad_norm": 1.68892958160335, "learning_rate": 4.268050246793276e-07, "loss": 0.9417, "step": 1985 }, { "epoch": 0.9183202584217812, "grad_norm": 1.7682216031642473, "learning_rate": 4.038333034669406e-07, "loss": 0.9575, "step": 1990 }, { "epoch": 0.9206275957544993, "grad_norm": 1.6202959145852942, "learning_rate": 3.814842637596483e-07, "loss": 0.9202, "step": 1995 }, { "epoch": 0.9229349330872173, "grad_norm": 1.7355964322786217, "learning_rate": 3.5975935575461083e-07, "loss": 0.9408, "step": 2000 }, { "epoch": 0.9252422704199353, "grad_norm": 1.784833888232325, "learning_rate": 3.3865998914997645e-07, "loss": 0.9451, "step": 2005 }, { "epoch": 0.9275496077526535, "grad_norm": 1.7347540917405002, "learning_rate": 3.1818753305340566e-07, "loss": 0.9503, "step": 2010 }, { "epoch": 0.9298569450853715, "grad_norm": 1.7098932522181238, "learning_rate": 2.9834331589323697e-07, "loss": 0.9648, "step": 2015 }, { "epoch": 0.9321642824180896, "grad_norm": 1.7375733049730988, "learning_rate": 2.791286253322856e-07, "loss": 0.9325, "step": 2020 }, { "epoch": 0.9344716197508076, "grad_norm": 1.7376811511996035, "learning_rate": 2.605447081842838e-07, "loss": 0.9236, "step": 2025 }, { "epoch": 0.9367789570835257, "grad_norm": 1.6285382014408727, "learning_rate": 2.425927703329856e-07, "loss": 0.9374, "step": 2030 }, { "epoch": 0.9390862944162437, "grad_norm": 1.7187090366001978, "learning_rate": 2.2527397665391026e-07, "loss": 0.9408, "step": 2035 }, { "epoch": 0.9413936317489617, "grad_norm": 1.6018424242699771, "learning_rate": 2.0858945093876315e-07, "loss": 0.9255, "step": 2040 }, { "epoch": 0.9437009690816798, "grad_norm": 1.7403672056926338, "learning_rate": 1.9254027582250588e-07, "loss": 0.9386, "step": 2045 }, { "epoch": 0.9460083064143978, "grad_norm": 1.6413885110477684, "learning_rate": 1.7712749271311392e-07, "loss": 0.9463, "step": 2050 }, { "epoch": 0.9483156437471159, "grad_norm": 1.6584652712298515, "learning_rate": 1.6235210172399373e-07, "loss": 0.9197, "step": 2055 }, { "epoch": 0.9506229810798339, "grad_norm": 1.7146073811212112, "learning_rate": 1.4821506160909492e-07, "loss": 0.9325, "step": 2060 }, { "epoch": 0.9529303184125519, "grad_norm": 1.7862356926674563, "learning_rate": 1.3471728970068986e-07, "loss": 0.9415, "step": 2065 }, { "epoch": 0.95523765574527, "grad_norm": 1.7900835474706864, "learning_rate": 1.2185966184985687e-07, "loss": 0.9516, "step": 2070 }, { "epoch": 0.957544993077988, "grad_norm": 1.61965861822062, "learning_rate": 1.0964301236963904e-07, "loss": 0.9272, "step": 2075 }, { "epoch": 0.9598523304107061, "grad_norm": 1.752477339334271, "learning_rate": 9.806813398091419e-08, "loss": 0.9231, "step": 2080 }, { "epoch": 0.9621596677434241, "grad_norm": 1.6566621651012274, "learning_rate": 8.713577776095494e-08, "loss": 0.9293, "step": 2085 }, { "epoch": 0.9644670050761421, "grad_norm": 1.637994937785989, "learning_rate": 7.684665309468875e-08, "loss": 0.9539, "step": 2090 }, { "epoch": 0.9667743424088602, "grad_norm": 1.6112435656292767, "learning_rate": 6.720142762867032e-08, "loss": 0.9558, "step": 2095 }, { "epoch": 0.9690816797415782, "grad_norm": 1.7629733611663445, "learning_rate": 5.820072722775849e-08, "loss": 0.9441, "step": 2100 }, { "epoch": 0.9713890170742963, "grad_norm": 1.6502417380976349, "learning_rate": 4.984513593450424e-08, "loss": 0.9527, "step": 2105 }, { "epoch": 0.9736963544070143, "grad_norm": 1.697689686578805, "learning_rate": 4.2135195931249925e-08, "loss": 0.9468, "step": 2110 }, { "epoch": 0.9760036917397323, "grad_norm": 1.7391293446601994, "learning_rate": 3.50714075049563e-08, "loss": 0.932, "step": 2115 }, { "epoch": 0.9783110290724504, "grad_norm": 1.7092970910189305, "learning_rate": 2.8654229014730694e-08, "loss": 0.9377, "step": 2120 }, { "epoch": 0.9806183664051684, "grad_norm": 1.6908979053958657, "learning_rate": 2.2884076862089712e-08, "loss": 0.9238, "step": 2125 }, { "epoch": 0.9829257037378865, "grad_norm": 1.634296694613385, "learning_rate": 1.7761325463937495e-08, "loss": 0.9473, "step": 2130 }, { "epoch": 0.9852330410706045, "grad_norm": 1.700195091519778, "learning_rate": 1.3286307228269623e-08, "loss": 0.9491, "step": 2135 }, { "epoch": 0.9875403784033225, "grad_norm": 1.6899977441806657, "learning_rate": 9.459312532608122e-09, "loss": 0.9393, "step": 2140 }, { "epoch": 0.9898477157360406, "grad_norm": 1.6459431767651571, "learning_rate": 6.280589705153217e-09, "loss": 0.9316, "step": 2145 }, { "epoch": 0.9921550530687586, "grad_norm": 1.6824622254831971, "learning_rate": 3.750345008675105e-09, "loss": 0.9455, "step": 2150 }, { "epoch": 0.9944623904014767, "grad_norm": 1.6711402676574085, "learning_rate": 1.8687426271246646e-09, "loss": 0.9299, "step": 2155 }, { "epoch": 0.9967697277341947, "grad_norm": 1.7394983815714609, "learning_rate": 6.359046549864189e-10, "loss": 0.9262, "step": 2160 }, { "epoch": 0.9990770650669127, "grad_norm": 1.9029748936658466, "learning_rate": 5.1911089347100876e-11, "loss": 0.973, "step": 2165 }, { "epoch": 1.0, "eval_loss": 0.9539673924446106, "eval_runtime": 317.6397, "eval_samples_per_second": 48.325, "eval_steps_per_second": 0.756, "step": 2167 }, { "epoch": 1.0, "step": 2167, "total_flos": 453725713858560.0, "train_loss": 1.0076359760722753, "train_runtime": 13620.7718, "train_samples_per_second": 10.182, "train_steps_per_second": 0.159 } ], "logging_steps": 5, "max_steps": 2167, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 453725713858560.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }