diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.29088911064694545, + "epoch": 0.44441391904394445, "eval_steps": 500, - "global_step": 72000, + "global_step": 110000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -50407,6 +50407,26606 @@ "learning_rate": 4.480724108387977e-05, "loss": 94.2316, "step": 72000 + }, + { + "epoch": 0.2909295119123131, + "grad_norm": 495.1227722167969, + "learning_rate": 4.480511108240547e-05, + "loss": 57.7428, + "step": 72010 + }, + { + "epoch": 0.29096991317768073, + "grad_norm": 516.1085205078125, + "learning_rate": 4.480298069482033e-05, + "loss": 65.6927, + "step": 72020 + }, + { + "epoch": 0.29101031444304837, + "grad_norm": 592.4568481445312, + "learning_rate": 4.480084992116589e-05, + "loss": 54.2324, + "step": 72030 + }, + { + "epoch": 0.291050715708416, + "grad_norm": 661.2754516601562, + "learning_rate": 4.479871876148368e-05, + "loss": 101.6369, + "step": 72040 + }, + { + "epoch": 0.2910911169737836, + "grad_norm": 1018.503662109375, + "learning_rate": 4.479658721581527e-05, + "loss": 87.5494, + "step": 72050 + }, + { + "epoch": 0.29113151823915123, + "grad_norm": 1803.326416015625, + "learning_rate": 4.479445528420218e-05, + "loss": 121.1224, + "step": 72060 + }, + { + "epoch": 0.2911719195045189, + "grad_norm": 773.8688354492188, + "learning_rate": 4.479232296668601e-05, + "loss": 40.3872, + "step": 72070 + }, + { + "epoch": 0.2912123207698865, + "grad_norm": 598.6536865234375, + "learning_rate": 4.4790190263308306e-05, + "loss": 61.5503, + "step": 72080 + }, + { + "epoch": 0.29125272203525415, + "grad_norm": 1390.3765869140625, + "learning_rate": 4.478805717411066e-05, + "loss": 75.8073, + "step": 72090 + }, + { + "epoch": 0.2912931233006218, + "grad_norm": 1810.283447265625, + "learning_rate": 4.478592369913465e-05, + "loss": 66.2582, + "step": 72100 + }, + { + "epoch": 0.29133352456598943, + "grad_norm": 585.9959106445312, + "learning_rate": 4.478378983842186e-05, + "loss": 60.1694, + "step": 72110 + }, + { + "epoch": 0.291373925831357, + "grad_norm": 589.87109375, + "learning_rate": 4.4781655592013914e-05, + "loss": 69.5412, + "step": 72120 + }, + { + "epoch": 0.29141432709672466, + "grad_norm": 829.7304077148438, + "learning_rate": 4.47795209599524e-05, + "loss": 57.3538, + "step": 72130 + }, + { + "epoch": 0.2914547283620923, + "grad_norm": 331.1748962402344, + "learning_rate": 4.477738594227895e-05, + "loss": 54.7226, + "step": 72140 + }, + { + "epoch": 0.29149512962745994, + "grad_norm": 1150.1204833984375, + "learning_rate": 4.4775250539035174e-05, + "loss": 101.6825, + "step": 72150 + }, + { + "epoch": 0.2915355308928276, + "grad_norm": 1445.37255859375, + "learning_rate": 4.477311475026271e-05, + "loss": 73.3006, + "step": 72160 + }, + { + "epoch": 0.2915759321581952, + "grad_norm": 1057.3179931640625, + "learning_rate": 4.4770978576003196e-05, + "loss": 87.7836, + "step": 72170 + }, + { + "epoch": 0.2916163334235628, + "grad_norm": 258.7021484375, + "learning_rate": 4.4768842016298275e-05, + "loss": 47.9227, + "step": 72180 + }, + { + "epoch": 0.29165673468893044, + "grad_norm": 572.38623046875, + "learning_rate": 4.4766705071189595e-05, + "loss": 94.038, + "step": 72190 + }, + { + "epoch": 0.2916971359542981, + "grad_norm": 1339.6978759765625, + "learning_rate": 4.4764567740718825e-05, + "loss": 55.9968, + "step": 72200 + }, + { + "epoch": 0.2917375372196657, + "grad_norm": 671.230712890625, + "learning_rate": 4.4762430024927636e-05, + "loss": 83.76, + "step": 72210 + }, + { + "epoch": 0.29177793848503336, + "grad_norm": 459.9236145019531, + "learning_rate": 4.476029192385769e-05, + "loss": 52.4123, + "step": 72220 + }, + { + "epoch": 0.291818339750401, + "grad_norm": 1072.2008056640625, + "learning_rate": 4.4758153437550684e-05, + "loss": 61.8481, + "step": 72230 + }, + { + "epoch": 0.29185874101576864, + "grad_norm": 1238.638671875, + "learning_rate": 4.475601456604831e-05, + "loss": 101.6763, + "step": 72240 + }, + { + "epoch": 0.2918991422811362, + "grad_norm": 1013.4456787109375, + "learning_rate": 4.4753875309392266e-05, + "loss": 83.2611, + "step": 72250 + }, + { + "epoch": 0.29193954354650387, + "grad_norm": 609.5685424804688, + "learning_rate": 4.4751735667624237e-05, + "loss": 83.8193, + "step": 72260 + }, + { + "epoch": 0.2919799448118715, + "grad_norm": 683.1033325195312, + "learning_rate": 4.474959564078596e-05, + "loss": 51.5649, + "step": 72270 + }, + { + "epoch": 0.29202034607723915, + "grad_norm": 799.1218872070312, + "learning_rate": 4.4747455228919146e-05, + "loss": 48.9159, + "step": 72280 + }, + { + "epoch": 0.2920607473426068, + "grad_norm": 316.8408508300781, + "learning_rate": 4.4745314432065535e-05, + "loss": 80.174, + "step": 72290 + }, + { + "epoch": 0.2921011486079744, + "grad_norm": 549.0940551757812, + "learning_rate": 4.474317325026684e-05, + "loss": 56.1946, + "step": 72300 + }, + { + "epoch": 0.292141549873342, + "grad_norm": 591.0458984375, + "learning_rate": 4.474103168356483e-05, + "loss": 101.0572, + "step": 72310 + }, + { + "epoch": 0.29218195113870965, + "grad_norm": 3971.22802734375, + "learning_rate": 4.4738889732001234e-05, + "loss": 65.9609, + "step": 72320 + }, + { + "epoch": 0.2922223524040773, + "grad_norm": 947.2832641601562, + "learning_rate": 4.473674739561783e-05, + "loss": 108.9696, + "step": 72330 + }, + { + "epoch": 0.29226275366944493, + "grad_norm": 745.0115356445312, + "learning_rate": 4.473460467445637e-05, + "loss": 53.7305, + "step": 72340 + }, + { + "epoch": 0.29230315493481257, + "grad_norm": 1212.1033935546875, + "learning_rate": 4.473246156855863e-05, + "loss": 85.5423, + "step": 72350 + }, + { + "epoch": 0.2923435562001802, + "grad_norm": 1858.6473388671875, + "learning_rate": 4.473031807796639e-05, + "loss": 84.1671, + "step": 72360 + }, + { + "epoch": 0.2923839574655478, + "grad_norm": 1183.3797607421875, + "learning_rate": 4.4728174202721444e-05, + "loss": 85.8308, + "step": 72370 + }, + { + "epoch": 0.29242435873091543, + "grad_norm": 564.0057373046875, + "learning_rate": 4.472602994286559e-05, + "loss": 62.2768, + "step": 72380 + }, + { + "epoch": 0.2924647599962831, + "grad_norm": 740.6309204101562, + "learning_rate": 4.472388529844062e-05, + "loss": 46.9198, + "step": 72390 + }, + { + "epoch": 0.2925051612616507, + "grad_norm": 1595.9669189453125, + "learning_rate": 4.4721740269488355e-05, + "loss": 61.2028, + "step": 72400 + }, + { + "epoch": 0.29254556252701835, + "grad_norm": 1151.5087890625, + "learning_rate": 4.4719594856050604e-05, + "loss": 62.4049, + "step": 72410 + }, + { + "epoch": 0.292585963792386, + "grad_norm": 475.585205078125, + "learning_rate": 4.4717449058169216e-05, + "loss": 50.4637, + "step": 72420 + }, + { + "epoch": 0.29262636505775363, + "grad_norm": 519.3822631835938, + "learning_rate": 4.471530287588599e-05, + "loss": 77.0086, + "step": 72430 + }, + { + "epoch": 0.2926667663231212, + "grad_norm": 993.6683959960938, + "learning_rate": 4.471315630924279e-05, + "loss": 76.9779, + "step": 72440 + }, + { + "epoch": 0.29270716758848886, + "grad_norm": 1110.651611328125, + "learning_rate": 4.4711009358281456e-05, + "loss": 73.6586, + "step": 72450 + }, + { + "epoch": 0.2927475688538565, + "grad_norm": 703.5302124023438, + "learning_rate": 4.4708862023043854e-05, + "loss": 68.1201, + "step": 72460 + }, + { + "epoch": 0.29278797011922414, + "grad_norm": 773.6651611328125, + "learning_rate": 4.470671430357183e-05, + "loss": 62.8462, + "step": 72470 + }, + { + "epoch": 0.2928283713845918, + "grad_norm": 806.9835815429688, + "learning_rate": 4.470456619990727e-05, + "loss": 71.4623, + "step": 72480 + }, + { + "epoch": 0.2928687726499594, + "grad_norm": 805.1903686523438, + "learning_rate": 4.470241771209205e-05, + "loss": 75.0459, + "step": 72490 + }, + { + "epoch": 0.292909173915327, + "grad_norm": 408.85089111328125, + "learning_rate": 4.4700268840168045e-05, + "loss": 58.6624, + "step": 72500 + }, + { + "epoch": 0.29294957518069464, + "grad_norm": 701.9678344726562, + "learning_rate": 4.469811958417717e-05, + "loss": 55.822, + "step": 72510 + }, + { + "epoch": 0.2929899764460623, + "grad_norm": 563.7380981445312, + "learning_rate": 4.46959699441613e-05, + "loss": 49.1962, + "step": 72520 + }, + { + "epoch": 0.2930303777114299, + "grad_norm": 736.0975952148438, + "learning_rate": 4.469381992016236e-05, + "loss": 42.205, + "step": 72530 + }, + { + "epoch": 0.29307077897679756, + "grad_norm": 375.850830078125, + "learning_rate": 4.469166951222227e-05, + "loss": 58.5806, + "step": 72540 + }, + { + "epoch": 0.2931111802421652, + "grad_norm": 828.135498046875, + "learning_rate": 4.4689518720382937e-05, + "loss": 69.4176, + "step": 72550 + }, + { + "epoch": 0.29315158150753284, + "grad_norm": 955.2371215820312, + "learning_rate": 4.46873675446863e-05, + "loss": 100.2446, + "step": 72560 + }, + { + "epoch": 0.29319198277290043, + "grad_norm": 808.4039916992188, + "learning_rate": 4.468521598517429e-05, + "loss": 79.4082, + "step": 72570 + }, + { + "epoch": 0.29323238403826807, + "grad_norm": 563.0283813476562, + "learning_rate": 4.468306404188887e-05, + "loss": 33.0342, + "step": 72580 + }, + { + "epoch": 0.2932727853036357, + "grad_norm": 2005.7269287109375, + "learning_rate": 4.468091171487197e-05, + "loss": 87.8131, + "step": 72590 + }, + { + "epoch": 0.29331318656900335, + "grad_norm": 1091.245849609375, + "learning_rate": 4.4678759004165584e-05, + "loss": 60.274, + "step": 72600 + }, + { + "epoch": 0.293353587834371, + "grad_norm": 773.4925537109375, + "learning_rate": 4.467660590981165e-05, + "loss": 43.7714, + "step": 72610 + }, + { + "epoch": 0.29339398909973863, + "grad_norm": 414.9910583496094, + "learning_rate": 4.4674452431852155e-05, + "loss": 43.9692, + "step": 72620 + }, + { + "epoch": 0.2934343903651062, + "grad_norm": 974.5853881835938, + "learning_rate": 4.467229857032907e-05, + "loss": 73.6409, + "step": 72630 + }, + { + "epoch": 0.29347479163047385, + "grad_norm": 879.56494140625, + "learning_rate": 4.4670144325284414e-05, + "loss": 103.894, + "step": 72640 + }, + { + "epoch": 0.2935151928958415, + "grad_norm": 522.8005981445312, + "learning_rate": 4.466798969676015e-05, + "loss": 108.3693, + "step": 72650 + }, + { + "epoch": 0.29355559416120913, + "grad_norm": 680.3546142578125, + "learning_rate": 4.4665834684798316e-05, + "loss": 58.7339, + "step": 72660 + }, + { + "epoch": 0.2935959954265768, + "grad_norm": 639.8063354492188, + "learning_rate": 4.4663679289440895e-05, + "loss": 69.8376, + "step": 72670 + }, + { + "epoch": 0.2936363966919444, + "grad_norm": 279.01385498046875, + "learning_rate": 4.466152351072994e-05, + "loss": 60.8839, + "step": 72680 + }, + { + "epoch": 0.293676797957312, + "grad_norm": 744.683349609375, + "learning_rate": 4.465936734870745e-05, + "loss": 59.2348, + "step": 72690 + }, + { + "epoch": 0.29371719922267964, + "grad_norm": 884.4476928710938, + "learning_rate": 4.465721080341547e-05, + "loss": 55.9587, + "step": 72700 + }, + { + "epoch": 0.2937576004880473, + "grad_norm": 736.0975341796875, + "learning_rate": 4.465505387489606e-05, + "loss": 85.0535, + "step": 72710 + }, + { + "epoch": 0.2937980017534149, + "grad_norm": 1129.6541748046875, + "learning_rate": 4.465289656319124e-05, + "loss": 166.6917, + "step": 72720 + }, + { + "epoch": 0.29383840301878256, + "grad_norm": 918.02587890625, + "learning_rate": 4.465073886834309e-05, + "loss": 80.1093, + "step": 72730 + }, + { + "epoch": 0.2938788042841502, + "grad_norm": 741.1550903320312, + "learning_rate": 4.464858079039367e-05, + "loss": 62.3389, + "step": 72740 + }, + { + "epoch": 0.29391920554951784, + "grad_norm": 1366.1309814453125, + "learning_rate": 4.464642232938505e-05, + "loss": 127.8192, + "step": 72750 + }, + { + "epoch": 0.2939596068148854, + "grad_norm": 691.9977416992188, + "learning_rate": 4.464426348535931e-05, + "loss": 71.4388, + "step": 72760 + }, + { + "epoch": 0.29400000808025306, + "grad_norm": 743.5264282226562, + "learning_rate": 4.464210425835854e-05, + "loss": 77.9983, + "step": 72770 + }, + { + "epoch": 0.2940404093456207, + "grad_norm": 570.9989624023438, + "learning_rate": 4.463994464842484e-05, + "loss": 66.5061, + "step": 72780 + }, + { + "epoch": 0.29408081061098834, + "grad_norm": 419.94793701171875, + "learning_rate": 4.46377846556003e-05, + "loss": 80.6933, + "step": 72790 + }, + { + "epoch": 0.294121211876356, + "grad_norm": 399.4372253417969, + "learning_rate": 4.4635624279927044e-05, + "loss": 82.7092, + "step": 72800 + }, + { + "epoch": 0.2941616131417236, + "grad_norm": 1092.372802734375, + "learning_rate": 4.463346352144718e-05, + "loss": 61.0069, + "step": 72810 + }, + { + "epoch": 0.2942020144070912, + "grad_norm": 1244.2633056640625, + "learning_rate": 4.463130238020285e-05, + "loss": 69.0925, + "step": 72820 + }, + { + "epoch": 0.29424241567245885, + "grad_norm": 3014.16796875, + "learning_rate": 4.4629140856236155e-05, + "loss": 95.4817, + "step": 72830 + }, + { + "epoch": 0.2942828169378265, + "grad_norm": 855.9567260742188, + "learning_rate": 4.462697894958926e-05, + "loss": 79.4187, + "step": 72840 + }, + { + "epoch": 0.2943232182031941, + "grad_norm": 588.0103149414062, + "learning_rate": 4.4624816660304314e-05, + "loss": 46.0367, + "step": 72850 + }, + { + "epoch": 0.29436361946856177, + "grad_norm": 2154.02734375, + "learning_rate": 4.4622653988423455e-05, + "loss": 116.9557, + "step": 72860 + }, + { + "epoch": 0.2944040207339294, + "grad_norm": 743.9976196289062, + "learning_rate": 4.462049093398885e-05, + "loss": 52.3771, + "step": 72870 + }, + { + "epoch": 0.29444442199929705, + "grad_norm": 1056.8822021484375, + "learning_rate": 4.461832749704268e-05, + "loss": 70.0739, + "step": 72880 + }, + { + "epoch": 0.29448482326466463, + "grad_norm": 860.94580078125, + "learning_rate": 4.461616367762711e-05, + "loss": 53.0582, + "step": 72890 + }, + { + "epoch": 0.29452522453003227, + "grad_norm": 319.7754821777344, + "learning_rate": 4.4613999475784336e-05, + "loss": 50.5095, + "step": 72900 + }, + { + "epoch": 0.2945656257953999, + "grad_norm": 1478.31982421875, + "learning_rate": 4.4611834891556534e-05, + "loss": 94.7775, + "step": 72910 + }, + { + "epoch": 0.29460602706076755, + "grad_norm": 548.7947998046875, + "learning_rate": 4.460966992498593e-05, + "loss": 96.3625, + "step": 72920 + }, + { + "epoch": 0.2946464283261352, + "grad_norm": 567.1472778320312, + "learning_rate": 4.46075045761147e-05, + "loss": 66.8417, + "step": 72930 + }, + { + "epoch": 0.29468682959150283, + "grad_norm": 510.6759033203125, + "learning_rate": 4.460533884498509e-05, + "loss": 43.7635, + "step": 72940 + }, + { + "epoch": 0.2947272308568704, + "grad_norm": 1200.787353515625, + "learning_rate": 4.460317273163929e-05, + "loss": 79.7278, + "step": 72950 + }, + { + "epoch": 0.29476763212223805, + "grad_norm": 1466.562255859375, + "learning_rate": 4.460100623611955e-05, + "loss": 80.256, + "step": 72960 + }, + { + "epoch": 0.2948080333876057, + "grad_norm": 1230.124267578125, + "learning_rate": 4.45988393584681e-05, + "loss": 94.8147, + "step": 72970 + }, + { + "epoch": 0.29484843465297333, + "grad_norm": 1354.7569580078125, + "learning_rate": 4.4596672098727195e-05, + "loss": 79.9409, + "step": 72980 + }, + { + "epoch": 0.294888835918341, + "grad_norm": 486.84246826171875, + "learning_rate": 4.459450445693907e-05, + "loss": 48.9059, + "step": 72990 + }, + { + "epoch": 0.2949292371837086, + "grad_norm": 606.8703002929688, + "learning_rate": 4.4592336433146e-05, + "loss": 56.8745, + "step": 73000 + }, + { + "epoch": 0.2949696384490762, + "grad_norm": 1210.473388671875, + "learning_rate": 4.459016802739023e-05, + "loss": 81.2079, + "step": 73010 + }, + { + "epoch": 0.29501003971444384, + "grad_norm": 467.654296875, + "learning_rate": 4.458799923971406e-05, + "loss": 71.5637, + "step": 73020 + }, + { + "epoch": 0.2950504409798115, + "grad_norm": 1085.084228515625, + "learning_rate": 4.4585830070159764e-05, + "loss": 59.4551, + "step": 73030 + }, + { + "epoch": 0.2950908422451791, + "grad_norm": 1020.708984375, + "learning_rate": 4.458366051876962e-05, + "loss": 67.9901, + "step": 73040 + }, + { + "epoch": 0.29513124351054676, + "grad_norm": 413.11859130859375, + "learning_rate": 4.458149058558594e-05, + "loss": 94.3448, + "step": 73050 + }, + { + "epoch": 0.2951716447759144, + "grad_norm": 344.1728210449219, + "learning_rate": 4.457932027065102e-05, + "loss": 42.2916, + "step": 73060 + }, + { + "epoch": 0.29521204604128204, + "grad_norm": 0.0, + "learning_rate": 4.457714957400716e-05, + "loss": 72.4169, + "step": 73070 + }, + { + "epoch": 0.2952524473066496, + "grad_norm": 1693.0472412109375, + "learning_rate": 4.45749784956967e-05, + "loss": 81.7885, + "step": 73080 + }, + { + "epoch": 0.29529284857201726, + "grad_norm": 676.3005981445312, + "learning_rate": 4.457280703576194e-05, + "loss": 69.0328, + "step": 73090 + }, + { + "epoch": 0.2953332498373849, + "grad_norm": 723.7576293945312, + "learning_rate": 4.457063519424525e-05, + "loss": 74.3711, + "step": 73100 + }, + { + "epoch": 0.29537365110275254, + "grad_norm": 397.1194152832031, + "learning_rate": 4.456846297118894e-05, + "loss": 65.241, + "step": 73110 + }, + { + "epoch": 0.2954140523681202, + "grad_norm": 686.8461303710938, + "learning_rate": 4.456629036663537e-05, + "loss": 66.4102, + "step": 73120 + }, + { + "epoch": 0.2954544536334878, + "grad_norm": 991.1904296875, + "learning_rate": 4.45641173806269e-05, + "loss": 79.4375, + "step": 73130 + }, + { + "epoch": 0.2954948548988554, + "grad_norm": 1777.7161865234375, + "learning_rate": 4.4561944013205885e-05, + "loss": 74.0132, + "step": 73140 + }, + { + "epoch": 0.29553525616422305, + "grad_norm": 1228.3016357421875, + "learning_rate": 4.45597702644147e-05, + "loss": 55.363, + "step": 73150 + }, + { + "epoch": 0.2955756574295907, + "grad_norm": 846.4840087890625, + "learning_rate": 4.455759613429573e-05, + "loss": 76.3519, + "step": 73160 + }, + { + "epoch": 0.29561605869495833, + "grad_norm": 639.7553100585938, + "learning_rate": 4.455542162289136e-05, + "loss": 83.1235, + "step": 73170 + }, + { + "epoch": 0.29565645996032597, + "grad_norm": 485.4346618652344, + "learning_rate": 4.455324673024396e-05, + "loss": 106.5092, + "step": 73180 + }, + { + "epoch": 0.2956968612256936, + "grad_norm": 1039.0308837890625, + "learning_rate": 4.4551071456395957e-05, + "loss": 55.8722, + "step": 73190 + }, + { + "epoch": 0.2957372624910612, + "grad_norm": 4481.669921875, + "learning_rate": 4.454889580138975e-05, + "loss": 92.0912, + "step": 73200 + }, + { + "epoch": 0.29577766375642883, + "grad_norm": 1369.180419921875, + "learning_rate": 4.454671976526776e-05, + "loss": 81.6197, + "step": 73210 + }, + { + "epoch": 0.2958180650217965, + "grad_norm": 1476.4033203125, + "learning_rate": 4.45445433480724e-05, + "loss": 60.5057, + "step": 73220 + }, + { + "epoch": 0.2958584662871641, + "grad_norm": 817.6678466796875, + "learning_rate": 4.45423665498461e-05, + "loss": 107.7844, + "step": 73230 + }, + { + "epoch": 0.29589886755253175, + "grad_norm": 1333.8826904296875, + "learning_rate": 4.4540189370631315e-05, + "loss": 100.9942, + "step": 73240 + }, + { + "epoch": 0.2959392688178994, + "grad_norm": 465.54510498046875, + "learning_rate": 4.453801181047047e-05, + "loss": 77.737, + "step": 73250 + }, + { + "epoch": 0.29597967008326703, + "grad_norm": 1762.44287109375, + "learning_rate": 4.4535833869406027e-05, + "loss": 58.7377, + "step": 73260 + }, + { + "epoch": 0.2960200713486346, + "grad_norm": 568.57958984375, + "learning_rate": 4.4533655547480444e-05, + "loss": 56.8368, + "step": 73270 + }, + { + "epoch": 0.29606047261400226, + "grad_norm": 1187.254638671875, + "learning_rate": 4.45314768447362e-05, + "loss": 88.9359, + "step": 73280 + }, + { + "epoch": 0.2961008738793699, + "grad_norm": 902.0029296875, + "learning_rate": 4.452929776121575e-05, + "loss": 69.4659, + "step": 73290 + }, + { + "epoch": 0.29614127514473754, + "grad_norm": 0.0, + "learning_rate": 4.452711829696158e-05, + "loss": 47.5744, + "step": 73300 + }, + { + "epoch": 0.2961816764101052, + "grad_norm": 407.5782775878906, + "learning_rate": 4.452493845201619e-05, + "loss": 63.7359, + "step": 73310 + }, + { + "epoch": 0.2962220776754728, + "grad_norm": 674.528076171875, + "learning_rate": 4.4522758226422076e-05, + "loss": 43.8548, + "step": 73320 + }, + { + "epoch": 0.2962624789408404, + "grad_norm": 679.5468139648438, + "learning_rate": 4.452057762022174e-05, + "loss": 56.5893, + "step": 73330 + }, + { + "epoch": 0.29630288020620804, + "grad_norm": 982.80517578125, + "learning_rate": 4.4518396633457696e-05, + "loss": 103.2029, + "step": 73340 + }, + { + "epoch": 0.2963432814715757, + "grad_norm": 529.539794921875, + "learning_rate": 4.4516215266172453e-05, + "loss": 36.5715, + "step": 73350 + }, + { + "epoch": 0.2963836827369433, + "grad_norm": 703.641357421875, + "learning_rate": 4.451403351840855e-05, + "loss": 68.3904, + "step": 73360 + }, + { + "epoch": 0.29642408400231096, + "grad_norm": 865.1505126953125, + "learning_rate": 4.451185139020852e-05, + "loss": 64.2055, + "step": 73370 + }, + { + "epoch": 0.2964644852676786, + "grad_norm": 718.0322265625, + "learning_rate": 4.4509668881614894e-05, + "loss": 102.1239, + "step": 73380 + }, + { + "epoch": 0.29650488653304624, + "grad_norm": 1524.7041015625, + "learning_rate": 4.450748599267024e-05, + "loss": 81.8768, + "step": 73390 + }, + { + "epoch": 0.2965452877984138, + "grad_norm": 378.5061950683594, + "learning_rate": 4.450530272341709e-05, + "loss": 56.3704, + "step": 73400 + }, + { + "epoch": 0.29658568906378147, + "grad_norm": 1193.3980712890625, + "learning_rate": 4.4503119073898024e-05, + "loss": 62.8788, + "step": 73410 + }, + { + "epoch": 0.2966260903291491, + "grad_norm": 972.7481079101562, + "learning_rate": 4.4500935044155626e-05, + "loss": 55.7444, + "step": 73420 + }, + { + "epoch": 0.29666649159451675, + "grad_norm": 496.94207763671875, + "learning_rate": 4.4498750634232445e-05, + "loss": 55.0117, + "step": 73430 + }, + { + "epoch": 0.2967068928598844, + "grad_norm": 690.003173828125, + "learning_rate": 4.449656584417108e-05, + "loss": 63.6232, + "step": 73440 + }, + { + "epoch": 0.296747294125252, + "grad_norm": 1239.76123046875, + "learning_rate": 4.449438067401413e-05, + "loss": 56.0802, + "step": 73450 + }, + { + "epoch": 0.2967876953906196, + "grad_norm": 1073.9417724609375, + "learning_rate": 4.44921951238042e-05, + "loss": 74.7659, + "step": 73460 + }, + { + "epoch": 0.29682809665598725, + "grad_norm": 612.8600463867188, + "learning_rate": 4.449000919358388e-05, + "loss": 49.4253, + "step": 73470 + }, + { + "epoch": 0.2968684979213549, + "grad_norm": 803.32763671875, + "learning_rate": 4.4487822883395805e-05, + "loss": 70.9369, + "step": 73480 + }, + { + "epoch": 0.29690889918672253, + "grad_norm": 999.6846313476562, + "learning_rate": 4.448563619328259e-05, + "loss": 77.0513, + "step": 73490 + }, + { + "epoch": 0.29694930045209017, + "grad_norm": 260.62261962890625, + "learning_rate": 4.448344912328686e-05, + "loss": 40.874, + "step": 73500 + }, + { + "epoch": 0.2969897017174578, + "grad_norm": 1337.423828125, + "learning_rate": 4.4481261673451255e-05, + "loss": 63.2176, + "step": 73510 + }, + { + "epoch": 0.2970301029828254, + "grad_norm": 328.9664306640625, + "learning_rate": 4.447907384381843e-05, + "loss": 58.7723, + "step": 73520 + }, + { + "epoch": 0.29707050424819303, + "grad_norm": 767.3320922851562, + "learning_rate": 4.447688563443103e-05, + "loss": 100.0262, + "step": 73530 + }, + { + "epoch": 0.2971109055135607, + "grad_norm": 251.83306884765625, + "learning_rate": 4.447469704533172e-05, + "loss": 58.1344, + "step": 73540 + }, + { + "epoch": 0.2971513067789283, + "grad_norm": 696.3643798828125, + "learning_rate": 4.4472508076563166e-05, + "loss": 80.2865, + "step": 73550 + }, + { + "epoch": 0.29719170804429595, + "grad_norm": 578.1966552734375, + "learning_rate": 4.447031872816804e-05, + "loss": 63.5288, + "step": 73560 + }, + { + "epoch": 0.2972321093096636, + "grad_norm": 1685.81982421875, + "learning_rate": 4.446812900018902e-05, + "loss": 98.5778, + "step": 73570 + }, + { + "epoch": 0.29727251057503123, + "grad_norm": 605.77978515625, + "learning_rate": 4.4465938892668814e-05, + "loss": 108.6232, + "step": 73580 + }, + { + "epoch": 0.2973129118403988, + "grad_norm": 425.3101501464844, + "learning_rate": 4.44637484056501e-05, + "loss": 81.9932, + "step": 73590 + }, + { + "epoch": 0.29735331310576646, + "grad_norm": 664.3129272460938, + "learning_rate": 4.4461557539175594e-05, + "loss": 80.7212, + "step": 73600 + }, + { + "epoch": 0.2973937143711341, + "grad_norm": 932.316650390625, + "learning_rate": 4.4459366293287994e-05, + "loss": 83.0597, + "step": 73610 + }, + { + "epoch": 0.29743411563650174, + "grad_norm": 1423.0401611328125, + "learning_rate": 4.445717466803004e-05, + "loss": 64.9128, + "step": 73620 + }, + { + "epoch": 0.2974745169018694, + "grad_norm": 572.2749633789062, + "learning_rate": 4.445498266344444e-05, + "loss": 49.8087, + "step": 73630 + }, + { + "epoch": 0.297514918167237, + "grad_norm": 1016.2572021484375, + "learning_rate": 4.445279027957395e-05, + "loss": 64.3123, + "step": 73640 + }, + { + "epoch": 0.2975553194326046, + "grad_norm": 1676.9608154296875, + "learning_rate": 4.4450597516461287e-05, + "loss": 79.8256, + "step": 73650 + }, + { + "epoch": 0.29759572069797224, + "grad_norm": 501.0017395019531, + "learning_rate": 4.444840437414922e-05, + "loss": 63.8892, + "step": 73660 + }, + { + "epoch": 0.2976361219633399, + "grad_norm": 1168.860107421875, + "learning_rate": 4.444621085268049e-05, + "loss": 59.1573, + "step": 73670 + }, + { + "epoch": 0.2976765232287075, + "grad_norm": 664.9098510742188, + "learning_rate": 4.444401695209788e-05, + "loss": 57.3835, + "step": 73680 + }, + { + "epoch": 0.29771692449407516, + "grad_norm": 1014.5247192382812, + "learning_rate": 4.4441822672444134e-05, + "loss": 89.5824, + "step": 73690 + }, + { + "epoch": 0.2977573257594428, + "grad_norm": 765.1936645507812, + "learning_rate": 4.443962801376205e-05, + "loss": 38.8063, + "step": 73700 + }, + { + "epoch": 0.29779772702481044, + "grad_norm": 928.1898193359375, + "learning_rate": 4.443743297609442e-05, + "loss": 62.8045, + "step": 73710 + }, + { + "epoch": 0.29783812829017803, + "grad_norm": 685.7420654296875, + "learning_rate": 4.443523755948401e-05, + "loss": 66.0851, + "step": 73720 + }, + { + "epoch": 0.29787852955554567, + "grad_norm": 881.26416015625, + "learning_rate": 4.443304176397365e-05, + "loss": 71.4241, + "step": 73730 + }, + { + "epoch": 0.2979189308209133, + "grad_norm": 1430.6566162109375, + "learning_rate": 4.443084558960613e-05, + "loss": 110.0562, + "step": 73740 + }, + { + "epoch": 0.29795933208628095, + "grad_norm": 476.3144226074219, + "learning_rate": 4.442864903642428e-05, + "loss": 63.0536, + "step": 73750 + }, + { + "epoch": 0.2979997333516486, + "grad_norm": 597.589111328125, + "learning_rate": 4.4426452104470903e-05, + "loss": 49.1393, + "step": 73760 + }, + { + "epoch": 0.29804013461701623, + "grad_norm": 673.9664306640625, + "learning_rate": 4.4424254793788844e-05, + "loss": 49.0798, + "step": 73770 + }, + { + "epoch": 0.2980805358823838, + "grad_norm": 1329.530029296875, + "learning_rate": 4.4422057104420946e-05, + "loss": 71.3935, + "step": 73780 + }, + { + "epoch": 0.29812093714775145, + "grad_norm": 690.584228515625, + "learning_rate": 4.4419859036410036e-05, + "loss": 73.6381, + "step": 73790 + }, + { + "epoch": 0.2981613384131191, + "grad_norm": 627.472900390625, + "learning_rate": 4.441766058979898e-05, + "loss": 82.9257, + "step": 73800 + }, + { + "epoch": 0.29820173967848673, + "grad_norm": 638.86279296875, + "learning_rate": 4.441546176463063e-05, + "loss": 57.4086, + "step": 73810 + }, + { + "epoch": 0.2982421409438544, + "grad_norm": 844.6746215820312, + "learning_rate": 4.441326256094787e-05, + "loss": 74.4755, + "step": 73820 + }, + { + "epoch": 0.298282542209222, + "grad_norm": 1176.2183837890625, + "learning_rate": 4.4411062978793545e-05, + "loss": 48.4431, + "step": 73830 + }, + { + "epoch": 0.2983229434745896, + "grad_norm": 762.2763671875, + "learning_rate": 4.4408863018210564e-05, + "loss": 54.5466, + "step": 73840 + }, + { + "epoch": 0.29836334473995724, + "grad_norm": 2829.393310546875, + "learning_rate": 4.44066626792418e-05, + "loss": 88.412, + "step": 73850 + }, + { + "epoch": 0.2984037460053249, + "grad_norm": 2878.249267578125, + "learning_rate": 4.440446196193016e-05, + "loss": 129.003, + "step": 73860 + }, + { + "epoch": 0.2984441472706925, + "grad_norm": 631.333984375, + "learning_rate": 4.440226086631854e-05, + "loss": 59.6416, + "step": 73870 + }, + { + "epoch": 0.29848454853606016, + "grad_norm": 858.2770385742188, + "learning_rate": 4.440005939244986e-05, + "loss": 54.7916, + "step": 73880 + }, + { + "epoch": 0.2985249498014278, + "grad_norm": 945.3146362304688, + "learning_rate": 4.439785754036703e-05, + "loss": 70.1292, + "step": 73890 + }, + { + "epoch": 0.29856535106679544, + "grad_norm": 915.803955078125, + "learning_rate": 4.439565531011299e-05, + "loss": 66.0254, + "step": 73900 + }, + { + "epoch": 0.298605752332163, + "grad_norm": 683.1613159179688, + "learning_rate": 4.4393452701730655e-05, + "loss": 55.0487, + "step": 73910 + }, + { + "epoch": 0.29864615359753066, + "grad_norm": 1310.735107421875, + "learning_rate": 4.439124971526297e-05, + "loss": 99.916, + "step": 73920 + }, + { + "epoch": 0.2986865548628983, + "grad_norm": 276.0152893066406, + "learning_rate": 4.4389046350752905e-05, + "loss": 46.5066, + "step": 73930 + }, + { + "epoch": 0.29872695612826594, + "grad_norm": 1337.242919921875, + "learning_rate": 4.438684260824339e-05, + "loss": 112.974, + "step": 73940 + }, + { + "epoch": 0.2987673573936336, + "grad_norm": 583.646728515625, + "learning_rate": 4.43846384877774e-05, + "loss": 49.2991, + "step": 73950 + }, + { + "epoch": 0.2988077586590012, + "grad_norm": 1229.034423828125, + "learning_rate": 4.4382433989397895e-05, + "loss": 56.1411, + "step": 73960 + }, + { + "epoch": 0.2988481599243688, + "grad_norm": 556.5376586914062, + "learning_rate": 4.4380229113147866e-05, + "loss": 69.2976, + "step": 73970 + }, + { + "epoch": 0.29888856118973645, + "grad_norm": 448.5850524902344, + "learning_rate": 4.437802385907029e-05, + "loss": 65.7849, + "step": 73980 + }, + { + "epoch": 0.2989289624551041, + "grad_norm": 412.0335388183594, + "learning_rate": 4.4375818227208164e-05, + "loss": 47.8312, + "step": 73990 + }, + { + "epoch": 0.2989693637204717, + "grad_norm": 549.283935546875, + "learning_rate": 4.4373612217604496e-05, + "loss": 60.2066, + "step": 74000 + }, + { + "epoch": 0.29900976498583937, + "grad_norm": 641.086181640625, + "learning_rate": 4.437140583030227e-05, + "loss": 81.0014, + "step": 74010 + }, + { + "epoch": 0.299050166251207, + "grad_norm": 470.0632629394531, + "learning_rate": 4.4369199065344525e-05, + "loss": 48.1158, + "step": 74020 + }, + { + "epoch": 0.29909056751657465, + "grad_norm": 479.7843933105469, + "learning_rate": 4.436699192277426e-05, + "loss": 62.3095, + "step": 74030 + }, + { + "epoch": 0.29913096878194223, + "grad_norm": 953.147705078125, + "learning_rate": 4.436478440263453e-05, + "loss": 69.0217, + "step": 74040 + }, + { + "epoch": 0.29917137004730987, + "grad_norm": 609.3707885742188, + "learning_rate": 4.436257650496834e-05, + "loss": 71.6211, + "step": 74050 + }, + { + "epoch": 0.2992117713126775, + "grad_norm": 1487.8802490234375, + "learning_rate": 4.436036822981877e-05, + "loss": 84.6675, + "step": 74060 + }, + { + "epoch": 0.29925217257804515, + "grad_norm": 894.4046630859375, + "learning_rate": 4.435815957722885e-05, + "loss": 51.4052, + "step": 74070 + }, + { + "epoch": 0.2992925738434128, + "grad_norm": 1185.888671875, + "learning_rate": 4.4355950547241645e-05, + "loss": 79.9474, + "step": 74080 + }, + { + "epoch": 0.29933297510878043, + "grad_norm": 2298.0419921875, + "learning_rate": 4.435374113990021e-05, + "loss": 62.4108, + "step": 74090 + }, + { + "epoch": 0.299373376374148, + "grad_norm": 1640.901611328125, + "learning_rate": 4.435153135524763e-05, + "loss": 108.3077, + "step": 74100 + }, + { + "epoch": 0.29941377763951565, + "grad_norm": 509.5463562011719, + "learning_rate": 4.434932119332699e-05, + "loss": 74.0899, + "step": 74110 + }, + { + "epoch": 0.2994541789048833, + "grad_norm": 1251.0087890625, + "learning_rate": 4.434711065418137e-05, + "loss": 117.1789, + "step": 74120 + }, + { + "epoch": 0.29949458017025093, + "grad_norm": 838.3406982421875, + "learning_rate": 4.434489973785386e-05, + "loss": 99.4823, + "step": 74130 + }, + { + "epoch": 0.2995349814356186, + "grad_norm": 924.2326049804688, + "learning_rate": 4.434268844438758e-05, + "loss": 60.6183, + "step": 74140 + }, + { + "epoch": 0.2995753827009862, + "grad_norm": 602.224365234375, + "learning_rate": 4.4340476773825625e-05, + "loss": 65.4386, + "step": 74150 + }, + { + "epoch": 0.2996157839663538, + "grad_norm": 1039.2386474609375, + "learning_rate": 4.433826472621112e-05, + "loss": 72.3676, + "step": 74160 + }, + { + "epoch": 0.29965618523172144, + "grad_norm": 1209.8310546875, + "learning_rate": 4.4336052301587185e-05, + "loss": 86.5389, + "step": 74170 + }, + { + "epoch": 0.2996965864970891, + "grad_norm": 664.8455200195312, + "learning_rate": 4.4333839499996954e-05, + "loss": 77.3693, + "step": 74180 + }, + { + "epoch": 0.2997369877624567, + "grad_norm": 373.7446594238281, + "learning_rate": 4.4331626321483575e-05, + "loss": 37.4625, + "step": 74190 + }, + { + "epoch": 0.29977738902782436, + "grad_norm": 587.56396484375, + "learning_rate": 4.432941276609018e-05, + "loss": 53.5302, + "step": 74200 + }, + { + "epoch": 0.299817790293192, + "grad_norm": 399.6770935058594, + "learning_rate": 4.432719883385994e-05, + "loss": 43.0968, + "step": 74210 + }, + { + "epoch": 0.29985819155855964, + "grad_norm": 455.2350158691406, + "learning_rate": 4.4324984524836e-05, + "loss": 65.6205, + "step": 74220 + }, + { + "epoch": 0.2998985928239272, + "grad_norm": 1229.9195556640625, + "learning_rate": 4.432276983906155e-05, + "loss": 59.1207, + "step": 74230 + }, + { + "epoch": 0.29993899408929486, + "grad_norm": 1179.552734375, + "learning_rate": 4.4320554776579747e-05, + "loss": 68.4554, + "step": 74240 + }, + { + "epoch": 0.2999793953546625, + "grad_norm": 563.2706298828125, + "learning_rate": 4.431833933743378e-05, + "loss": 51.8571, + "step": 74250 + }, + { + "epoch": 0.30001979662003014, + "grad_norm": 779.345947265625, + "learning_rate": 4.431612352166684e-05, + "loss": 81.9162, + "step": 74260 + }, + { + "epoch": 0.3000601978853978, + "grad_norm": 363.64324951171875, + "learning_rate": 4.431390732932213e-05, + "loss": 55.0468, + "step": 74270 + }, + { + "epoch": 0.3001005991507654, + "grad_norm": 1637.340087890625, + "learning_rate": 4.431169076044286e-05, + "loss": 81.3846, + "step": 74280 + }, + { + "epoch": 0.300141000416133, + "grad_norm": 563.9585571289062, + "learning_rate": 4.4309473815072225e-05, + "loss": 60.6945, + "step": 74290 + }, + { + "epoch": 0.30018140168150065, + "grad_norm": 749.6682739257812, + "learning_rate": 4.4307256493253457e-05, + "loss": 68.1788, + "step": 74300 + }, + { + "epoch": 0.3002218029468683, + "grad_norm": 5840.30810546875, + "learning_rate": 4.4305038795029794e-05, + "loss": 86.4264, + "step": 74310 + }, + { + "epoch": 0.30026220421223593, + "grad_norm": 910.1226806640625, + "learning_rate": 4.4302820720444456e-05, + "loss": 77.6066, + "step": 74320 + }, + { + "epoch": 0.30030260547760357, + "grad_norm": 709.5213012695312, + "learning_rate": 4.430060226954069e-05, + "loss": 120.0873, + "step": 74330 + }, + { + "epoch": 0.3003430067429712, + "grad_norm": 907.42431640625, + "learning_rate": 4.429838344236174e-05, + "loss": 82.4217, + "step": 74340 + }, + { + "epoch": 0.30038340800833885, + "grad_norm": 595.138671875, + "learning_rate": 4.4296164238950874e-05, + "loss": 77.8745, + "step": 74350 + }, + { + "epoch": 0.30042380927370643, + "grad_norm": 1740.2891845703125, + "learning_rate": 4.429394465935136e-05, + "loss": 70.9948, + "step": 74360 + }, + { + "epoch": 0.3004642105390741, + "grad_norm": 624.7880249023438, + "learning_rate": 4.429172470360645e-05, + "loss": 84.4048, + "step": 74370 + }, + { + "epoch": 0.3005046118044417, + "grad_norm": 632.6150512695312, + "learning_rate": 4.428950437175944e-05, + "loss": 65.9942, + "step": 74380 + }, + { + "epoch": 0.30054501306980935, + "grad_norm": 217.38211059570312, + "learning_rate": 4.428728366385361e-05, + "loss": 73.9153, + "step": 74390 + }, + { + "epoch": 0.300585414335177, + "grad_norm": 1739.6029052734375, + "learning_rate": 4.428506257993226e-05, + "loss": 78.2645, + "step": 74400 + }, + { + "epoch": 0.30062581560054463, + "grad_norm": 1436.2427978515625, + "learning_rate": 4.428284112003868e-05, + "loss": 71.5589, + "step": 74410 + }, + { + "epoch": 0.3006662168659122, + "grad_norm": 316.6112060546875, + "learning_rate": 4.428061928421618e-05, + "loss": 77.6914, + "step": 74420 + }, + { + "epoch": 0.30070661813127986, + "grad_norm": 711.2620849609375, + "learning_rate": 4.427839707250809e-05, + "loss": 77.8412, + "step": 74430 + }, + { + "epoch": 0.3007470193966475, + "grad_norm": 1620.897705078125, + "learning_rate": 4.427617448495772e-05, + "loss": 78.4815, + "step": 74440 + }, + { + "epoch": 0.30078742066201514, + "grad_norm": 519.2609252929688, + "learning_rate": 4.427395152160841e-05, + "loss": 41.6989, + "step": 74450 + }, + { + "epoch": 0.3008278219273828, + "grad_norm": 666.103515625, + "learning_rate": 4.427172818250349e-05, + "loss": 61.7915, + "step": 74460 + }, + { + "epoch": 0.3008682231927504, + "grad_norm": 611.0293579101562, + "learning_rate": 4.42695044676863e-05, + "loss": 63.6666, + "step": 74470 + }, + { + "epoch": 0.300908624458118, + "grad_norm": 789.2578125, + "learning_rate": 4.4267280377200205e-05, + "loss": 82.4072, + "step": 74480 + }, + { + "epoch": 0.30094902572348564, + "grad_norm": 1049.16650390625, + "learning_rate": 4.426505591108856e-05, + "loss": 67.3771, + "step": 74490 + }, + { + "epoch": 0.3009894269888533, + "grad_norm": 565.5245361328125, + "learning_rate": 4.426283106939474e-05, + "loss": 52.4517, + "step": 74500 + }, + { + "epoch": 0.3010298282542209, + "grad_norm": 1094.810546875, + "learning_rate": 4.42606058521621e-05, + "loss": 89.7887, + "step": 74510 + }, + { + "epoch": 0.30107022951958856, + "grad_norm": 938.249267578125, + "learning_rate": 4.425838025943403e-05, + "loss": 67.3641, + "step": 74520 + }, + { + "epoch": 0.3011106307849562, + "grad_norm": 857.2614135742188, + "learning_rate": 4.4256154291253925e-05, + "loss": 102.7043, + "step": 74530 + }, + { + "epoch": 0.30115103205032384, + "grad_norm": 520.708251953125, + "learning_rate": 4.4253927947665185e-05, + "loss": 66.7792, + "step": 74540 + }, + { + "epoch": 0.3011914333156914, + "grad_norm": 764.5960693359375, + "learning_rate": 4.42517012287112e-05, + "loss": 69.1683, + "step": 74550 + }, + { + "epoch": 0.30123183458105907, + "grad_norm": 1161.567138671875, + "learning_rate": 4.424947413443539e-05, + "loss": 75.7145, + "step": 74560 + }, + { + "epoch": 0.3012722358464267, + "grad_norm": 810.30126953125, + "learning_rate": 4.424724666488117e-05, + "loss": 60.572, + "step": 74570 + }, + { + "epoch": 0.30131263711179435, + "grad_norm": 1023.4842529296875, + "learning_rate": 4.424501882009198e-05, + "loss": 60.1247, + "step": 74580 + }, + { + "epoch": 0.301353038377162, + "grad_norm": 572.3932495117188, + "learning_rate": 4.424279060011123e-05, + "loss": 58.7828, + "step": 74590 + }, + { + "epoch": 0.3013934396425296, + "grad_norm": 1139.3538818359375, + "learning_rate": 4.4240562004982364e-05, + "loss": 89.5676, + "step": 74600 + }, + { + "epoch": 0.3014338409078972, + "grad_norm": 685.8850708007812, + "learning_rate": 4.423833303474884e-05, + "loss": 74.0729, + "step": 74610 + }, + { + "epoch": 0.30147424217326485, + "grad_norm": 358.7518005371094, + "learning_rate": 4.423610368945411e-05, + "loss": 64.4605, + "step": 74620 + }, + { + "epoch": 0.3015146434386325, + "grad_norm": 2760.231201171875, + "learning_rate": 4.423387396914164e-05, + "loss": 63.1559, + "step": 74630 + }, + { + "epoch": 0.30155504470400013, + "grad_norm": 1156.1490478515625, + "learning_rate": 4.423164387385489e-05, + "loss": 68.2249, + "step": 74640 + }, + { + "epoch": 0.30159544596936777, + "grad_norm": 397.64569091796875, + "learning_rate": 4.4229413403637345e-05, + "loss": 70.5398, + "step": 74650 + }, + { + "epoch": 0.3016358472347354, + "grad_norm": 488.1241760253906, + "learning_rate": 4.422718255853248e-05, + "loss": 63.8304, + "step": 74660 + }, + { + "epoch": 0.30167624850010305, + "grad_norm": 623.3941650390625, + "learning_rate": 4.42249513385838e-05, + "loss": 93.5955, + "step": 74670 + }, + { + "epoch": 0.30171664976547063, + "grad_norm": 584.9109497070312, + "learning_rate": 4.422271974383479e-05, + "loss": 63.7987, + "step": 74680 + }, + { + "epoch": 0.3017570510308383, + "grad_norm": 1291.9105224609375, + "learning_rate": 4.4220487774328964e-05, + "loss": 80.7659, + "step": 74690 + }, + { + "epoch": 0.3017974522962059, + "grad_norm": 921.5263671875, + "learning_rate": 4.421825543010983e-05, + "loss": 57.1126, + "step": 74700 + }, + { + "epoch": 0.30183785356157355, + "grad_norm": 840.3859252929688, + "learning_rate": 4.4216022711220916e-05, + "loss": 47.0774, + "step": 74710 + }, + { + "epoch": 0.3018782548269412, + "grad_norm": 1399.228759765625, + "learning_rate": 4.4213789617705746e-05, + "loss": 60.6638, + "step": 74720 + }, + { + "epoch": 0.30191865609230883, + "grad_norm": 574.983642578125, + "learning_rate": 4.421155614960785e-05, + "loss": 64.1311, + "step": 74730 + }, + { + "epoch": 0.3019590573576764, + "grad_norm": 1133.6746826171875, + "learning_rate": 4.420932230697079e-05, + "loss": 66.0676, + "step": 74740 + }, + { + "epoch": 0.30199945862304406, + "grad_norm": 2249.599853515625, + "learning_rate": 4.420708808983809e-05, + "loss": 72.025, + "step": 74750 + }, + { + "epoch": 0.3020398598884117, + "grad_norm": 837.6932983398438, + "learning_rate": 4.420485349825332e-05, + "loss": 68.1134, + "step": 74760 + }, + { + "epoch": 0.30208026115377934, + "grad_norm": 984.0842895507812, + "learning_rate": 4.4202618532260046e-05, + "loss": 91.3187, + "step": 74770 + }, + { + "epoch": 0.302120662419147, + "grad_norm": 727.6422729492188, + "learning_rate": 4.420038319190184e-05, + "loss": 63.3439, + "step": 74780 + }, + { + "epoch": 0.3021610636845146, + "grad_norm": 412.24908447265625, + "learning_rate": 4.4198147477222274e-05, + "loss": 60.6722, + "step": 74790 + }, + { + "epoch": 0.3022014649498822, + "grad_norm": 1043.6190185546875, + "learning_rate": 4.4195911388264946e-05, + "loss": 50.6668, + "step": 74800 + }, + { + "epoch": 0.30224186621524984, + "grad_norm": 432.61956787109375, + "learning_rate": 4.419367492507343e-05, + "loss": 53.3677, + "step": 74810 + }, + { + "epoch": 0.3022822674806175, + "grad_norm": 554.3751831054688, + "learning_rate": 4.419143808769135e-05, + "loss": 45.9557, + "step": 74820 + }, + { + "epoch": 0.3023226687459851, + "grad_norm": 880.563720703125, + "learning_rate": 4.4189200876162295e-05, + "loss": 79.8755, + "step": 74830 + }, + { + "epoch": 0.30236307001135276, + "grad_norm": 766.0863647460938, + "learning_rate": 4.41869632905299e-05, + "loss": 81.7201, + "step": 74840 + }, + { + "epoch": 0.3024034712767204, + "grad_norm": 512.21240234375, + "learning_rate": 4.418472533083777e-05, + "loss": 64.7566, + "step": 74850 + }, + { + "epoch": 0.30244387254208804, + "grad_norm": 1144.3148193359375, + "learning_rate": 4.418248699712955e-05, + "loss": 119.2652, + "step": 74860 + }, + { + "epoch": 0.30248427380745563, + "grad_norm": 535.4666137695312, + "learning_rate": 4.418024828944886e-05, + "loss": 60.0249, + "step": 74870 + }, + { + "epoch": 0.30252467507282327, + "grad_norm": 3031.205322265625, + "learning_rate": 4.417800920783937e-05, + "loss": 56.3927, + "step": 74880 + }, + { + "epoch": 0.3025650763381909, + "grad_norm": 1405.0643310546875, + "learning_rate": 4.4175769752344706e-05, + "loss": 78.4172, + "step": 74890 + }, + { + "epoch": 0.30260547760355855, + "grad_norm": 779.4258422851562, + "learning_rate": 4.417352992300854e-05, + "loss": 59.0052, + "step": 74900 + }, + { + "epoch": 0.3026458788689262, + "grad_norm": 1063.1361083984375, + "learning_rate": 4.4171289719874543e-05, + "loss": 66.4332, + "step": 74910 + }, + { + "epoch": 0.30268628013429383, + "grad_norm": 479.256103515625, + "learning_rate": 4.4169049142986376e-05, + "loss": 51.7122, + "step": 74920 + }, + { + "epoch": 0.3027266813996614, + "grad_norm": 482.245849609375, + "learning_rate": 4.416680819238773e-05, + "loss": 71.5557, + "step": 74930 + }, + { + "epoch": 0.30276708266502905, + "grad_norm": 1661.60791015625, + "learning_rate": 4.4164566868122286e-05, + "loss": 110.3273, + "step": 74940 + }, + { + "epoch": 0.3028074839303967, + "grad_norm": 763.4310913085938, + "learning_rate": 4.4162325170233745e-05, + "loss": 63.9054, + "step": 74950 + }, + { + "epoch": 0.30284788519576433, + "grad_norm": 581.7908325195312, + "learning_rate": 4.4160083098765815e-05, + "loss": 73.6799, + "step": 74960 + }, + { + "epoch": 0.302888286461132, + "grad_norm": 522.9912109375, + "learning_rate": 4.4157840653762196e-05, + "loss": 63.8081, + "step": 74970 + }, + { + "epoch": 0.3029286877264996, + "grad_norm": 671.7311401367188, + "learning_rate": 4.4155597835266616e-05, + "loss": 43.7649, + "step": 74980 + }, + { + "epoch": 0.30296908899186725, + "grad_norm": 655.3804931640625, + "learning_rate": 4.415335464332279e-05, + "loss": 68.5104, + "step": 74990 + }, + { + "epoch": 0.30300949025723484, + "grad_norm": 705.6524047851562, + "learning_rate": 4.415111107797445e-05, + "loss": 55.7934, + "step": 75000 + }, + { + "epoch": 0.3030498915226025, + "grad_norm": 2905.262939453125, + "learning_rate": 4.4148867139265345e-05, + "loss": 111.0617, + "step": 75010 + }, + { + "epoch": 0.3030902927879701, + "grad_norm": 972.8043823242188, + "learning_rate": 4.414662282723922e-05, + "loss": 63.5983, + "step": 75020 + }, + { + "epoch": 0.30313069405333776, + "grad_norm": 1290.5452880859375, + "learning_rate": 4.414437814193982e-05, + "loss": 58.9312, + "step": 75030 + }, + { + "epoch": 0.3031710953187054, + "grad_norm": 1295.46044921875, + "learning_rate": 4.414213308341092e-05, + "loss": 60.9342, + "step": 75040 + }, + { + "epoch": 0.30321149658407304, + "grad_norm": 778.5823364257812, + "learning_rate": 4.4139887651696265e-05, + "loss": 65.9136, + "step": 75050 + }, + { + "epoch": 0.3032518978494406, + "grad_norm": 515.2728271484375, + "learning_rate": 4.413764184683966e-05, + "loss": 73.2117, + "step": 75060 + }, + { + "epoch": 0.30329229911480826, + "grad_norm": 1737.145751953125, + "learning_rate": 4.413539566888487e-05, + "loss": 122.1578, + "step": 75070 + }, + { + "epoch": 0.3033327003801759, + "grad_norm": 606.1090698242188, + "learning_rate": 4.413314911787569e-05, + "loss": 62.4606, + "step": 75080 + }, + { + "epoch": 0.30337310164554354, + "grad_norm": 760.4569702148438, + "learning_rate": 4.413090219385592e-05, + "loss": 42.6394, + "step": 75090 + }, + { + "epoch": 0.3034135029109112, + "grad_norm": 809.2393188476562, + "learning_rate": 4.412865489686936e-05, + "loss": 61.619, + "step": 75100 + }, + { + "epoch": 0.3034539041762788, + "grad_norm": 555.9096069335938, + "learning_rate": 4.412640722695982e-05, + "loss": 79.0981, + "step": 75110 + }, + { + "epoch": 0.3034943054416464, + "grad_norm": 745.3788452148438, + "learning_rate": 4.4124159184171134e-05, + "loss": 80.1419, + "step": 75120 + }, + { + "epoch": 0.30353470670701405, + "grad_norm": 915.6812744140625, + "learning_rate": 4.412191076854711e-05, + "loss": 80.7349, + "step": 75130 + }, + { + "epoch": 0.3035751079723817, + "grad_norm": 857.411865234375, + "learning_rate": 4.41196619801316e-05, + "loss": 43.2552, + "step": 75140 + }, + { + "epoch": 0.3036155092377493, + "grad_norm": 1478.0841064453125, + "learning_rate": 4.4117412818968426e-05, + "loss": 71.5397, + "step": 75150 + }, + { + "epoch": 0.30365591050311697, + "grad_norm": 818.4799194335938, + "learning_rate": 4.411516328510145e-05, + "loss": 42.5693, + "step": 75160 + }, + { + "epoch": 0.3036963117684846, + "grad_norm": 1053.192626953125, + "learning_rate": 4.411291337857453e-05, + "loss": 67.0812, + "step": 75170 + }, + { + "epoch": 0.30373671303385225, + "grad_norm": 399.8765563964844, + "learning_rate": 4.4110663099431514e-05, + "loss": 65.8789, + "step": 75180 + }, + { + "epoch": 0.30377711429921983, + "grad_norm": 869.1029052734375, + "learning_rate": 4.41084124477163e-05, + "loss": 86.4271, + "step": 75190 + }, + { + "epoch": 0.30381751556458747, + "grad_norm": 561.1309814453125, + "learning_rate": 4.410616142347273e-05, + "loss": 65.8079, + "step": 75200 + }, + { + "epoch": 0.3038579168299551, + "grad_norm": 1289.060302734375, + "learning_rate": 4.410391002674471e-05, + "loss": 74.7418, + "step": 75210 + }, + { + "epoch": 0.30389831809532275, + "grad_norm": 566.0087280273438, + "learning_rate": 4.410165825757613e-05, + "loss": 74.0141, + "step": 75220 + }, + { + "epoch": 0.3039387193606904, + "grad_norm": 0.0, + "learning_rate": 4.409940611601089e-05, + "loss": 52.371, + "step": 75230 + }, + { + "epoch": 0.30397912062605803, + "grad_norm": 628.7798461914062, + "learning_rate": 4.409715360209289e-05, + "loss": 70.8489, + "step": 75240 + }, + { + "epoch": 0.3040195218914256, + "grad_norm": 644.141845703125, + "learning_rate": 4.4094900715866064e-05, + "loss": 41.9273, + "step": 75250 + }, + { + "epoch": 0.30405992315679325, + "grad_norm": 727.8847045898438, + "learning_rate": 4.40926474573743e-05, + "loss": 67.9712, + "step": 75260 + }, + { + "epoch": 0.3041003244221609, + "grad_norm": 0.0, + "learning_rate": 4.409039382666155e-05, + "loss": 52.6979, + "step": 75270 + }, + { + "epoch": 0.30414072568752853, + "grad_norm": 521.0494384765625, + "learning_rate": 4.4088139823771744e-05, + "loss": 48.4199, + "step": 75280 + }, + { + "epoch": 0.3041811269528962, + "grad_norm": 494.1226501464844, + "learning_rate": 4.408588544874882e-05, + "loss": 65.0988, + "step": 75290 + }, + { + "epoch": 0.3042215282182638, + "grad_norm": 513.5343627929688, + "learning_rate": 4.408363070163675e-05, + "loss": 42.8742, + "step": 75300 + }, + { + "epoch": 0.30426192948363145, + "grad_norm": 654.8699951171875, + "learning_rate": 4.408137558247946e-05, + "loss": 64.6675, + "step": 75310 + }, + { + "epoch": 0.30430233074899904, + "grad_norm": 1542.4345703125, + "learning_rate": 4.407912009132093e-05, + "loss": 58.0917, + "step": 75320 + }, + { + "epoch": 0.3043427320143667, + "grad_norm": 1139.7677001953125, + "learning_rate": 4.4076864228205136e-05, + "loss": 62.5426, + "step": 75330 + }, + { + "epoch": 0.3043831332797343, + "grad_norm": 480.1134033203125, + "learning_rate": 4.407460799317604e-05, + "loss": 64.8383, + "step": 75340 + }, + { + "epoch": 0.30442353454510196, + "grad_norm": 1814.2696533203125, + "learning_rate": 4.4072351386277654e-05, + "loss": 78.7744, + "step": 75350 + }, + { + "epoch": 0.3044639358104696, + "grad_norm": 1329.923095703125, + "learning_rate": 4.407009440755396e-05, + "loss": 72.4044, + "step": 75360 + }, + { + "epoch": 0.30450433707583724, + "grad_norm": 3605.839599609375, + "learning_rate": 4.4067837057048956e-05, + "loss": 79.2061, + "step": 75370 + }, + { + "epoch": 0.3045447383412048, + "grad_norm": 1073.45166015625, + "learning_rate": 4.406557933480664e-05, + "loss": 81.1469, + "step": 75380 + }, + { + "epoch": 0.30458513960657246, + "grad_norm": 751.8543090820312, + "learning_rate": 4.406332124087105e-05, + "loss": 77.547, + "step": 75390 + }, + { + "epoch": 0.3046255408719401, + "grad_norm": 531.0945434570312, + "learning_rate": 4.40610627752862e-05, + "loss": 66.1161, + "step": 75400 + }, + { + "epoch": 0.30466594213730774, + "grad_norm": 738.7662963867188, + "learning_rate": 4.405880393809612e-05, + "loss": 64.749, + "step": 75410 + }, + { + "epoch": 0.3047063434026754, + "grad_norm": 877.9468383789062, + "learning_rate": 4.405654472934483e-05, + "loss": 51.235, + "step": 75420 + }, + { + "epoch": 0.304746744668043, + "grad_norm": 653.334716796875, + "learning_rate": 4.4054285149076404e-05, + "loss": 55.1535, + "step": 75430 + }, + { + "epoch": 0.3047871459334106, + "grad_norm": 1065.5577392578125, + "learning_rate": 4.4052025197334864e-05, + "loss": 75.8509, + "step": 75440 + }, + { + "epoch": 0.30482754719877825, + "grad_norm": 1329.488525390625, + "learning_rate": 4.40497648741643e-05, + "loss": 78.9796, + "step": 75450 + }, + { + "epoch": 0.3048679484641459, + "grad_norm": 1260.999755859375, + "learning_rate": 4.4047504179608755e-05, + "loss": 91.8325, + "step": 75460 + }, + { + "epoch": 0.30490834972951353, + "grad_norm": 656.1160888671875, + "learning_rate": 4.404524311371231e-05, + "loss": 35.623, + "step": 75470 + }, + { + "epoch": 0.30494875099488117, + "grad_norm": 882.13623046875, + "learning_rate": 4.404298167651905e-05, + "loss": 58.8389, + "step": 75480 + }, + { + "epoch": 0.3049891522602488, + "grad_norm": 1063.432861328125, + "learning_rate": 4.4040719868073055e-05, + "loss": 55.5148, + "step": 75490 + }, + { + "epoch": 0.30502955352561645, + "grad_norm": 1446.3270263671875, + "learning_rate": 4.403845768841842e-05, + "loss": 93.3359, + "step": 75500 + }, + { + "epoch": 0.30506995479098403, + "grad_norm": 652.8603515625, + "learning_rate": 4.403619513759926e-05, + "loss": 88.8308, + "step": 75510 + }, + { + "epoch": 0.3051103560563517, + "grad_norm": 848.1201782226562, + "learning_rate": 4.403393221565966e-05, + "loss": 51.726, + "step": 75520 + }, + { + "epoch": 0.3051507573217193, + "grad_norm": 823.25390625, + "learning_rate": 4.403166892264376e-05, + "loss": 60.7063, + "step": 75530 + }, + { + "epoch": 0.30519115858708695, + "grad_norm": 837.02978515625, + "learning_rate": 4.402940525859568e-05, + "loss": 62.3497, + "step": 75540 + }, + { + "epoch": 0.3052315598524546, + "grad_norm": 1130.3653564453125, + "learning_rate": 4.402714122355955e-05, + "loss": 86.2569, + "step": 75550 + }, + { + "epoch": 0.30527196111782223, + "grad_norm": 313.84063720703125, + "learning_rate": 4.40248768175795e-05, + "loss": 85.372, + "step": 75560 + }, + { + "epoch": 0.3053123623831898, + "grad_norm": 1272.7034912109375, + "learning_rate": 4.4022612040699676e-05, + "loss": 59.0401, + "step": 75570 + }, + { + "epoch": 0.30535276364855746, + "grad_norm": 259.1897888183594, + "learning_rate": 4.4020346892964246e-05, + "loss": 42.5053, + "step": 75580 + }, + { + "epoch": 0.3053931649139251, + "grad_norm": 541.4120483398438, + "learning_rate": 4.401808137441736e-05, + "loss": 59.4345, + "step": 75590 + }, + { + "epoch": 0.30543356617929274, + "grad_norm": 923.3859252929688, + "learning_rate": 4.401581548510318e-05, + "loss": 47.9736, + "step": 75600 + }, + { + "epoch": 0.3054739674446604, + "grad_norm": 1377.25, + "learning_rate": 4.40135492250659e-05, + "loss": 78.1078, + "step": 75610 + }, + { + "epoch": 0.305514368710028, + "grad_norm": 1074.262939453125, + "learning_rate": 4.401128259434968e-05, + "loss": 57.5512, + "step": 75620 + }, + { + "epoch": 0.30555476997539566, + "grad_norm": 616.6300048828125, + "learning_rate": 4.400901559299871e-05, + "loss": 38.326, + "step": 75630 + }, + { + "epoch": 0.30559517124076324, + "grad_norm": 1104.368408203125, + "learning_rate": 4.4006748221057206e-05, + "loss": 94.1468, + "step": 75640 + }, + { + "epoch": 0.3056355725061309, + "grad_norm": 581.0299072265625, + "learning_rate": 4.4004480478569353e-05, + "loss": 87.8687, + "step": 75650 + }, + { + "epoch": 0.3056759737714985, + "grad_norm": 1138.6673583984375, + "learning_rate": 4.400221236557938e-05, + "loss": 66.9354, + "step": 75660 + }, + { + "epoch": 0.30571637503686616, + "grad_norm": 568.2470703125, + "learning_rate": 4.399994388213149e-05, + "loss": 88.6248, + "step": 75670 + }, + { + "epoch": 0.3057567763022338, + "grad_norm": 598.5287475585938, + "learning_rate": 4.3997675028269906e-05, + "loss": 55.7092, + "step": 75680 + }, + { + "epoch": 0.30579717756760144, + "grad_norm": 971.5711669921875, + "learning_rate": 4.399540580403887e-05, + "loss": 67.1653, + "step": 75690 + }, + { + "epoch": 0.305837578832969, + "grad_norm": 1012.9927368164062, + "learning_rate": 4.399313620948262e-05, + "loss": 51.6907, + "step": 75700 + }, + { + "epoch": 0.30587798009833667, + "grad_norm": 1266.41748046875, + "learning_rate": 4.39908662446454e-05, + "loss": 59.8522, + "step": 75710 + }, + { + "epoch": 0.3059183813637043, + "grad_norm": 580.0430297851562, + "learning_rate": 4.3988595909571464e-05, + "loss": 51.2898, + "step": 75720 + }, + { + "epoch": 0.30595878262907195, + "grad_norm": 503.0343322753906, + "learning_rate": 4.3986325204305076e-05, + "loss": 69.8676, + "step": 75730 + }, + { + "epoch": 0.3059991838944396, + "grad_norm": 246.7170867919922, + "learning_rate": 4.398405412889051e-05, + "loss": 60.3378, + "step": 75740 + }, + { + "epoch": 0.3060395851598072, + "grad_norm": 440.064208984375, + "learning_rate": 4.3981782683372016e-05, + "loss": 52.1341, + "step": 75750 + }, + { + "epoch": 0.3060799864251748, + "grad_norm": 684.9658203125, + "learning_rate": 4.3979510867793917e-05, + "loss": 83.72, + "step": 75760 + }, + { + "epoch": 0.30612038769054245, + "grad_norm": 0.0, + "learning_rate": 4.397723868220047e-05, + "loss": 64.8489, + "step": 75770 + }, + { + "epoch": 0.3061607889559101, + "grad_norm": 971.6740112304688, + "learning_rate": 4.397496612663599e-05, + "loss": 59.74, + "step": 75780 + }, + { + "epoch": 0.30620119022127773, + "grad_norm": 1435.0618896484375, + "learning_rate": 4.397269320114478e-05, + "loss": 92.3261, + "step": 75790 + }, + { + "epoch": 0.30624159148664537, + "grad_norm": 1401.4566650390625, + "learning_rate": 4.3970419905771145e-05, + "loss": 86.528, + "step": 75800 + }, + { + "epoch": 0.306281992752013, + "grad_norm": 423.3729553222656, + "learning_rate": 4.39681462405594e-05, + "loss": 106.0067, + "step": 75810 + }, + { + "epoch": 0.30632239401738065, + "grad_norm": 0.0, + "learning_rate": 4.3965872205553885e-05, + "loss": 70.0686, + "step": 75820 + }, + { + "epoch": 0.30636279528274823, + "grad_norm": 1514.8172607421875, + "learning_rate": 4.3963597800798927e-05, + "loss": 85.976, + "step": 75830 + }, + { + "epoch": 0.3064031965481159, + "grad_norm": 286.7713928222656, + "learning_rate": 4.396132302633886e-05, + "loss": 47.0414, + "step": 75840 + }, + { + "epoch": 0.3064435978134835, + "grad_norm": 1182.9954833984375, + "learning_rate": 4.395904788221805e-05, + "loss": 59.4661, + "step": 75850 + }, + { + "epoch": 0.30648399907885115, + "grad_norm": 1122.1630859375, + "learning_rate": 4.3956772368480836e-05, + "loss": 94.7752, + "step": 75860 + }, + { + "epoch": 0.3065244003442188, + "grad_norm": 555.4927368164062, + "learning_rate": 4.395449648517158e-05, + "loss": 52.1087, + "step": 75870 + }, + { + "epoch": 0.30656480160958643, + "grad_norm": 698.0592041015625, + "learning_rate": 4.395222023233466e-05, + "loss": 52.4741, + "step": 75880 + }, + { + "epoch": 0.306605202874954, + "grad_norm": 628.822509765625, + "learning_rate": 4.3949943610014455e-05, + "loss": 86.0263, + "step": 75890 + }, + { + "epoch": 0.30664560414032166, + "grad_norm": 1240.1202392578125, + "learning_rate": 4.394766661825533e-05, + "loss": 46.8542, + "step": 75900 + }, + { + "epoch": 0.3066860054056893, + "grad_norm": 816.1203002929688, + "learning_rate": 4.3945389257101704e-05, + "loss": 38.3735, + "step": 75910 + }, + { + "epoch": 0.30672640667105694, + "grad_norm": 889.33349609375, + "learning_rate": 4.394311152659796e-05, + "loss": 73.3283, + "step": 75920 + }, + { + "epoch": 0.3067668079364246, + "grad_norm": 1453.927978515625, + "learning_rate": 4.3940833426788496e-05, + "loss": 62.5195, + "step": 75930 + }, + { + "epoch": 0.3068072092017922, + "grad_norm": 987.9056396484375, + "learning_rate": 4.393855495771774e-05, + "loss": 77.4361, + "step": 75940 + }, + { + "epoch": 0.3068476104671598, + "grad_norm": 1086.7833251953125, + "learning_rate": 4.3936276119430096e-05, + "loss": 87.7577, + "step": 75950 + }, + { + "epoch": 0.30688801173252744, + "grad_norm": 491.1762390136719, + "learning_rate": 4.393399691197e-05, + "loss": 71.9018, + "step": 75960 + }, + { + "epoch": 0.3069284129978951, + "grad_norm": 834.448974609375, + "learning_rate": 4.3931717335381894e-05, + "loss": 56.691, + "step": 75970 + }, + { + "epoch": 0.3069688142632627, + "grad_norm": 1463.418212890625, + "learning_rate": 4.392943738971021e-05, + "loss": 66.625, + "step": 75980 + }, + { + "epoch": 0.30700921552863036, + "grad_norm": 709.5184936523438, + "learning_rate": 4.39271570749994e-05, + "loss": 72.5582, + "step": 75990 + }, + { + "epoch": 0.307049616793998, + "grad_norm": 802.2734985351562, + "learning_rate": 4.3924876391293915e-05, + "loss": 80.8964, + "step": 76000 + }, + { + "epoch": 0.30709001805936564, + "grad_norm": 1081.5029296875, + "learning_rate": 4.3922595338638214e-05, + "loss": 64.753, + "step": 76010 + }, + { + "epoch": 0.30713041932473323, + "grad_norm": 747.075439453125, + "learning_rate": 4.3920313917076794e-05, + "loss": 48.6337, + "step": 76020 + }, + { + "epoch": 0.30717082059010087, + "grad_norm": 371.2689208984375, + "learning_rate": 4.3918032126654095e-05, + "loss": 56.0596, + "step": 76030 + }, + { + "epoch": 0.3072112218554685, + "grad_norm": 852.907958984375, + "learning_rate": 4.391574996741463e-05, + "loss": 66.6618, + "step": 76040 + }, + { + "epoch": 0.30725162312083615, + "grad_norm": 1084.6485595703125, + "learning_rate": 4.391346743940288e-05, + "loss": 66.4222, + "step": 76050 + }, + { + "epoch": 0.3072920243862038, + "grad_norm": 1998.0355224609375, + "learning_rate": 4.3911184542663344e-05, + "loss": 77.1086, + "step": 76060 + }, + { + "epoch": 0.30733242565157143, + "grad_norm": 789.2811889648438, + "learning_rate": 4.390890127724053e-05, + "loss": 68.8511, + "step": 76070 + }, + { + "epoch": 0.307372826916939, + "grad_norm": 666.281494140625, + "learning_rate": 4.390661764317895e-05, + "loss": 68.8774, + "step": 76080 + }, + { + "epoch": 0.30741322818230665, + "grad_norm": 593.8773803710938, + "learning_rate": 4.390433364052312e-05, + "loss": 45.1297, + "step": 76090 + }, + { + "epoch": 0.3074536294476743, + "grad_norm": 1460.0093994140625, + "learning_rate": 4.390204926931758e-05, + "loss": 79.1225, + "step": 76100 + }, + { + "epoch": 0.30749403071304193, + "grad_norm": 653.6718139648438, + "learning_rate": 4.389976452960686e-05, + "loss": 72.7947, + "step": 76110 + }, + { + "epoch": 0.3075344319784096, + "grad_norm": 906.63134765625, + "learning_rate": 4.38974794214355e-05, + "loss": 72.8858, + "step": 76120 + }, + { + "epoch": 0.3075748332437772, + "grad_norm": 1931.9991455078125, + "learning_rate": 4.3895193944848034e-05, + "loss": 64.3772, + "step": 76130 + }, + { + "epoch": 0.30761523450914485, + "grad_norm": 798.5737915039062, + "learning_rate": 4.3892908099889054e-05, + "loss": 55.5147, + "step": 76140 + }, + { + "epoch": 0.30765563577451244, + "grad_norm": 696.617919921875, + "learning_rate": 4.389062188660309e-05, + "loss": 62.2774, + "step": 76150 + }, + { + "epoch": 0.3076960370398801, + "grad_norm": 840.3499145507812, + "learning_rate": 4.388833530503473e-05, + "loss": 98.8143, + "step": 76160 + }, + { + "epoch": 0.3077364383052477, + "grad_norm": 764.2418212890625, + "learning_rate": 4.388604835522855e-05, + "loss": 69.4767, + "step": 76170 + }, + { + "epoch": 0.30777683957061536, + "grad_norm": 663.6424560546875, + "learning_rate": 4.3883761037229146e-05, + "loss": 69.4592, + "step": 76180 + }, + { + "epoch": 0.307817240835983, + "grad_norm": 624.4500732421875, + "learning_rate": 4.388147335108108e-05, + "loss": 53.6257, + "step": 76190 + }, + { + "epoch": 0.30785764210135064, + "grad_norm": 1174.481201171875, + "learning_rate": 4.387918529682898e-05, + "loss": 70.0419, + "step": 76200 + }, + { + "epoch": 0.3078980433667182, + "grad_norm": 466.0067443847656, + "learning_rate": 4.3876896874517434e-05, + "loss": 112.7296, + "step": 76210 + }, + { + "epoch": 0.30793844463208586, + "grad_norm": 576.7889404296875, + "learning_rate": 4.387460808419108e-05, + "loss": 56.6612, + "step": 76220 + }, + { + "epoch": 0.3079788458974535, + "grad_norm": 809.4537353515625, + "learning_rate": 4.387231892589452e-05, + "loss": 84.8902, + "step": 76230 + }, + { + "epoch": 0.30801924716282114, + "grad_norm": 1000.173095703125, + "learning_rate": 4.387002939967237e-05, + "loss": 69.7403, + "step": 76240 + }, + { + "epoch": 0.3080596484281888, + "grad_norm": 756.7673950195312, + "learning_rate": 4.386773950556931e-05, + "loss": 51.9468, + "step": 76250 + }, + { + "epoch": 0.3081000496935564, + "grad_norm": 922.7078247070312, + "learning_rate": 4.386544924362993e-05, + "loss": 80.4618, + "step": 76260 + }, + { + "epoch": 0.308140450958924, + "grad_norm": 928.8509521484375, + "learning_rate": 4.3863158613898915e-05, + "loss": 90.6427, + "step": 76270 + }, + { + "epoch": 0.30818085222429165, + "grad_norm": 990.8674926757812, + "learning_rate": 4.386086761642091e-05, + "loss": 54.2147, + "step": 76280 + }, + { + "epoch": 0.3082212534896593, + "grad_norm": 973.0326538085938, + "learning_rate": 4.385857625124058e-05, + "loss": 86.8008, + "step": 76290 + }, + { + "epoch": 0.3082616547550269, + "grad_norm": 831.694580078125, + "learning_rate": 4.3856284518402594e-05, + "loss": 77.8919, + "step": 76300 + }, + { + "epoch": 0.30830205602039457, + "grad_norm": 1294.062255859375, + "learning_rate": 4.385399241795164e-05, + "loss": 89.092, + "step": 76310 + }, + { + "epoch": 0.3083424572857622, + "grad_norm": 899.0340576171875, + "learning_rate": 4.3851699949932396e-05, + "loss": 66.3644, + "step": 76320 + }, + { + "epoch": 0.30838285855112985, + "grad_norm": 2219.54296875, + "learning_rate": 4.384940711438955e-05, + "loss": 43.2072, + "step": 76330 + }, + { + "epoch": 0.30842325981649743, + "grad_norm": 1805.721923828125, + "learning_rate": 4.384711391136781e-05, + "loss": 68.598, + "step": 76340 + }, + { + "epoch": 0.30846366108186507, + "grad_norm": 701.5916137695312, + "learning_rate": 4.384482034091189e-05, + "loss": 49.2661, + "step": 76350 + }, + { + "epoch": 0.3085040623472327, + "grad_norm": 435.8236389160156, + "learning_rate": 4.3842526403066486e-05, + "loss": 52.0933, + "step": 76360 + }, + { + "epoch": 0.30854446361260035, + "grad_norm": 2196.045166015625, + "learning_rate": 4.384023209787633e-05, + "loss": 93.9429, + "step": 76370 + }, + { + "epoch": 0.308584864877968, + "grad_norm": 780.8671264648438, + "learning_rate": 4.383793742538616e-05, + "loss": 60.4108, + "step": 76380 + }, + { + "epoch": 0.30862526614333563, + "grad_norm": 1023.3828125, + "learning_rate": 4.383564238564068e-05, + "loss": 65.5329, + "step": 76390 + }, + { + "epoch": 0.3086656674087032, + "grad_norm": 677.0743408203125, + "learning_rate": 4.3833346978684675e-05, + "loss": 89.5438, + "step": 76400 + }, + { + "epoch": 0.30870606867407085, + "grad_norm": 650.9645385742188, + "learning_rate": 4.383105120456287e-05, + "loss": 50.4317, + "step": 76410 + }, + { + "epoch": 0.3087464699394385, + "grad_norm": 543.072021484375, + "learning_rate": 4.3828755063320016e-05, + "loss": 84.4733, + "step": 76420 + }, + { + "epoch": 0.30878687120480613, + "grad_norm": 773.248046875, + "learning_rate": 4.38264585550009e-05, + "loss": 64.6917, + "step": 76430 + }, + { + "epoch": 0.3088272724701738, + "grad_norm": 365.4747009277344, + "learning_rate": 4.382416167965028e-05, + "loss": 62.2631, + "step": 76440 + }, + { + "epoch": 0.3088676737355414, + "grad_norm": 674.434326171875, + "learning_rate": 4.382186443731293e-05, + "loss": 54.153, + "step": 76450 + }, + { + "epoch": 0.30890807500090905, + "grad_norm": 1216.2164306640625, + "learning_rate": 4.381956682803365e-05, + "loss": 77.9781, + "step": 76460 + }, + { + "epoch": 0.30894847626627664, + "grad_norm": 615.4111938476562, + "learning_rate": 4.381726885185722e-05, + "loss": 66.789, + "step": 76470 + }, + { + "epoch": 0.3089888775316443, + "grad_norm": 1128.7919921875, + "learning_rate": 4.381497050882845e-05, + "loss": 61.4229, + "step": 76480 + }, + { + "epoch": 0.3090292787970119, + "grad_norm": 645.344482421875, + "learning_rate": 4.381267179899214e-05, + "loss": 47.0934, + "step": 76490 + }, + { + "epoch": 0.30906968006237956, + "grad_norm": 397.85980224609375, + "learning_rate": 4.381037272239311e-05, + "loss": 63.7962, + "step": 76500 + }, + { + "epoch": 0.3091100813277472, + "grad_norm": 463.2906188964844, + "learning_rate": 4.380807327907618e-05, + "loss": 60.8565, + "step": 76510 + }, + { + "epoch": 0.30915048259311484, + "grad_norm": 470.76104736328125, + "learning_rate": 4.380577346908618e-05, + "loss": 71.5916, + "step": 76520 + }, + { + "epoch": 0.3091908838584824, + "grad_norm": 1406.014404296875, + "learning_rate": 4.380347329246794e-05, + "loss": 76.5488, + "step": 76530 + }, + { + "epoch": 0.30923128512385006, + "grad_norm": 898.1220703125, + "learning_rate": 4.380117274926631e-05, + "loss": 75.5578, + "step": 76540 + }, + { + "epoch": 0.3092716863892177, + "grad_norm": 1027.6002197265625, + "learning_rate": 4.379887183952614e-05, + "loss": 73.7126, + "step": 76550 + }, + { + "epoch": 0.30931208765458534, + "grad_norm": 2206.342529296875, + "learning_rate": 4.379657056329228e-05, + "loss": 121.1146, + "step": 76560 + }, + { + "epoch": 0.309352488919953, + "grad_norm": 986.2785034179688, + "learning_rate": 4.3794268920609605e-05, + "loss": 59.564, + "step": 76570 + }, + { + "epoch": 0.3093928901853206, + "grad_norm": 847.2274169921875, + "learning_rate": 4.379196691152298e-05, + "loss": 69.7664, + "step": 76580 + }, + { + "epoch": 0.3094332914506882, + "grad_norm": 938.61767578125, + "learning_rate": 4.3789664536077286e-05, + "loss": 54.9465, + "step": 76590 + }, + { + "epoch": 0.30947369271605585, + "grad_norm": 879.3148193359375, + "learning_rate": 4.3787361794317405e-05, + "loss": 82.9992, + "step": 76600 + }, + { + "epoch": 0.3095140939814235, + "grad_norm": 394.82000732421875, + "learning_rate": 4.378505868628823e-05, + "loss": 79.9965, + "step": 76610 + }, + { + "epoch": 0.30955449524679113, + "grad_norm": 458.2685241699219, + "learning_rate": 4.3782755212034675e-05, + "loss": 90.1063, + "step": 76620 + }, + { + "epoch": 0.30959489651215877, + "grad_norm": 1523.7799072265625, + "learning_rate": 4.3780451371601626e-05, + "loss": 63.1249, + "step": 76630 + }, + { + "epoch": 0.3096352977775264, + "grad_norm": 656.118896484375, + "learning_rate": 4.3778147165034025e-05, + "loss": 75.1157, + "step": 76640 + }, + { + "epoch": 0.30967569904289405, + "grad_norm": 1332.3353271484375, + "learning_rate": 4.377584259237676e-05, + "loss": 98.9753, + "step": 76650 + }, + { + "epoch": 0.30971610030826163, + "grad_norm": 1163.6890869140625, + "learning_rate": 4.377353765367479e-05, + "loss": 82.146, + "step": 76660 + }, + { + "epoch": 0.3097565015736293, + "grad_norm": 994.2089233398438, + "learning_rate": 4.377123234897303e-05, + "loss": 47.6127, + "step": 76670 + }, + { + "epoch": 0.3097969028389969, + "grad_norm": 427.42059326171875, + "learning_rate": 4.376892667831644e-05, + "loss": 61.8331, + "step": 76680 + }, + { + "epoch": 0.30983730410436455, + "grad_norm": 1147.4427490234375, + "learning_rate": 4.376662064174994e-05, + "loss": 65.2715, + "step": 76690 + }, + { + "epoch": 0.3098777053697322, + "grad_norm": 1077.2244873046875, + "learning_rate": 4.376431423931853e-05, + "loss": 111.3696, + "step": 76700 + }, + { + "epoch": 0.30991810663509983, + "grad_norm": 384.9374084472656, + "learning_rate": 4.3762007471067146e-05, + "loss": 72.2117, + "step": 76710 + }, + { + "epoch": 0.3099585079004674, + "grad_norm": 0.0, + "learning_rate": 4.375970033704077e-05, + "loss": 53.84, + "step": 76720 + }, + { + "epoch": 0.30999890916583506, + "grad_norm": 514.8012084960938, + "learning_rate": 4.375739283728437e-05, + "loss": 43.7522, + "step": 76730 + }, + { + "epoch": 0.3100393104312027, + "grad_norm": 1148.413818359375, + "learning_rate": 4.3755084971842954e-05, + "loss": 73.6758, + "step": 76740 + }, + { + "epoch": 0.31007971169657034, + "grad_norm": 1197.4542236328125, + "learning_rate": 4.375277674076149e-05, + "loss": 51.1232, + "step": 76750 + }, + { + "epoch": 0.310120112961938, + "grad_norm": 1706.7191162109375, + "learning_rate": 4.375046814408499e-05, + "loss": 102.9833, + "step": 76760 + }, + { + "epoch": 0.3101605142273056, + "grad_norm": 696.5614624023438, + "learning_rate": 4.374815918185846e-05, + "loss": 58.2234, + "step": 76770 + }, + { + "epoch": 0.31020091549267326, + "grad_norm": 1848.7935791015625, + "learning_rate": 4.374584985412692e-05, + "loss": 67.9964, + "step": 76780 + }, + { + "epoch": 0.31024131675804084, + "grad_norm": 469.37255859375, + "learning_rate": 4.374354016093538e-05, + "loss": 72.5402, + "step": 76790 + }, + { + "epoch": 0.3102817180234085, + "grad_norm": 1170.890380859375, + "learning_rate": 4.374123010232888e-05, + "loss": 67.4718, + "step": 76800 + }, + { + "epoch": 0.3103221192887761, + "grad_norm": 468.8921813964844, + "learning_rate": 4.373891967835245e-05, + "loss": 50.772, + "step": 76810 + }, + { + "epoch": 0.31036252055414376, + "grad_norm": 1059.63623046875, + "learning_rate": 4.373660888905113e-05, + "loss": 74.093, + "step": 76820 + }, + { + "epoch": 0.3104029218195114, + "grad_norm": 1103.974609375, + "learning_rate": 4.373429773446998e-05, + "loss": 65.5093, + "step": 76830 + }, + { + "epoch": 0.31044332308487904, + "grad_norm": 885.3768920898438, + "learning_rate": 4.373198621465404e-05, + "loss": 77.8017, + "step": 76840 + }, + { + "epoch": 0.3104837243502466, + "grad_norm": 968.3536376953125, + "learning_rate": 4.372967432964838e-05, + "loss": 53.5939, + "step": 76850 + }, + { + "epoch": 0.31052412561561427, + "grad_norm": 365.57904052734375, + "learning_rate": 4.372736207949809e-05, + "loss": 94.1086, + "step": 76860 + }, + { + "epoch": 0.3105645268809819, + "grad_norm": 974.5148315429688, + "learning_rate": 4.3725049464248235e-05, + "loss": 62.9441, + "step": 76870 + }, + { + "epoch": 0.31060492814634955, + "grad_norm": 191.68748474121094, + "learning_rate": 4.372273648394389e-05, + "loss": 61.9872, + "step": 76880 + }, + { + "epoch": 0.3106453294117172, + "grad_norm": 661.6585693359375, + "learning_rate": 4.372042313863017e-05, + "loss": 50.0335, + "step": 76890 + }, + { + "epoch": 0.3106857306770848, + "grad_norm": 774.6177978515625, + "learning_rate": 4.371810942835215e-05, + "loss": 48.9739, + "step": 76900 + }, + { + "epoch": 0.3107261319424524, + "grad_norm": 813.4871826171875, + "learning_rate": 4.371579535315496e-05, + "loss": 69.586, + "step": 76910 + }, + { + "epoch": 0.31076653320782005, + "grad_norm": 1050.9583740234375, + "learning_rate": 4.37134809130837e-05, + "loss": 67.1586, + "step": 76920 + }, + { + "epoch": 0.3108069344731877, + "grad_norm": 747.37939453125, + "learning_rate": 4.37111661081835e-05, + "loss": 56.411, + "step": 76930 + }, + { + "epoch": 0.31084733573855533, + "grad_norm": 499.1912536621094, + "learning_rate": 4.370885093849948e-05, + "loss": 36.8895, + "step": 76940 + }, + { + "epoch": 0.31088773700392297, + "grad_norm": 1127.0404052734375, + "learning_rate": 4.3706535404076784e-05, + "loss": 100.2089, + "step": 76950 + }, + { + "epoch": 0.3109281382692906, + "grad_norm": 696.9407958984375, + "learning_rate": 4.370421950496054e-05, + "loss": 55.8497, + "step": 76960 + }, + { + "epoch": 0.31096853953465825, + "grad_norm": 867.03173828125, + "learning_rate": 4.3701903241195916e-05, + "loss": 67.1302, + "step": 76970 + }, + { + "epoch": 0.31100894080002583, + "grad_norm": 972.6212768554688, + "learning_rate": 4.369958661282805e-05, + "loss": 54.7901, + "step": 76980 + }, + { + "epoch": 0.3110493420653935, + "grad_norm": 979.4427490234375, + "learning_rate": 4.369726961990213e-05, + "loss": 63.9195, + "step": 76990 + }, + { + "epoch": 0.3110897433307611, + "grad_norm": 1254.9720458984375, + "learning_rate": 4.36949522624633e-05, + "loss": 110.9768, + "step": 77000 + }, + { + "epoch": 0.31113014459612875, + "grad_norm": 1459.2310791015625, + "learning_rate": 4.369263454055675e-05, + "loss": 81.1262, + "step": 77010 + }, + { + "epoch": 0.3111705458614964, + "grad_norm": 968.3046875, + "learning_rate": 4.3690316454227674e-05, + "loss": 64.1656, + "step": 77020 + }, + { + "epoch": 0.31121094712686403, + "grad_norm": 818.0725708007812, + "learning_rate": 4.368799800352126e-05, + "loss": 93.2603, + "step": 77030 + }, + { + "epoch": 0.3112513483922316, + "grad_norm": 521.28125, + "learning_rate": 4.368567918848269e-05, + "loss": 45.2758, + "step": 77040 + }, + { + "epoch": 0.31129174965759926, + "grad_norm": 273.4820556640625, + "learning_rate": 4.368336000915719e-05, + "loss": 69.712, + "step": 77050 + }, + { + "epoch": 0.3113321509229669, + "grad_norm": 905.5136108398438, + "learning_rate": 4.3681040465589976e-05, + "loss": 88.944, + "step": 77060 + }, + { + "epoch": 0.31137255218833454, + "grad_norm": 809.4381713867188, + "learning_rate": 4.3678720557826247e-05, + "loss": 72.0091, + "step": 77070 + }, + { + "epoch": 0.3114129534537022, + "grad_norm": 439.21533203125, + "learning_rate": 4.3676400285911256e-05, + "loss": 68.7604, + "step": 77080 + }, + { + "epoch": 0.3114533547190698, + "grad_norm": 4371.494140625, + "learning_rate": 4.367407964989022e-05, + "loss": 73.8835, + "step": 77090 + }, + { + "epoch": 0.31149375598443746, + "grad_norm": 1301.020751953125, + "learning_rate": 4.367175864980839e-05, + "loss": 71.925, + "step": 77100 + }, + { + "epoch": 0.31153415724980504, + "grad_norm": 608.1272583007812, + "learning_rate": 4.366943728571101e-05, + "loss": 61.2593, + "step": 77110 + }, + { + "epoch": 0.3115745585151727, + "grad_norm": 529.8942260742188, + "learning_rate": 4.3667115557643336e-05, + "loss": 68.4521, + "step": 77120 + }, + { + "epoch": 0.3116149597805403, + "grad_norm": 499.2430725097656, + "learning_rate": 4.366479346565064e-05, + "loss": 47.7916, + "step": 77130 + }, + { + "epoch": 0.31165536104590796, + "grad_norm": 1138.603759765625, + "learning_rate": 4.366247100977818e-05, + "loss": 58.4289, + "step": 77140 + }, + { + "epoch": 0.3116957623112756, + "grad_norm": 558.7338256835938, + "learning_rate": 4.366014819007124e-05, + "loss": 40.2173, + "step": 77150 + }, + { + "epoch": 0.31173616357664324, + "grad_norm": 0.0, + "learning_rate": 4.3657825006575106e-05, + "loss": 76.302, + "step": 77160 + }, + { + "epoch": 0.31177656484201083, + "grad_norm": 1142.35986328125, + "learning_rate": 4.365550145933507e-05, + "loss": 59.8843, + "step": 77170 + }, + { + "epoch": 0.31181696610737847, + "grad_norm": 437.07196044921875, + "learning_rate": 4.3653177548396426e-05, + "loss": 84.331, + "step": 77180 + }, + { + "epoch": 0.3118573673727461, + "grad_norm": 1512.7645263671875, + "learning_rate": 4.365085327380448e-05, + "loss": 68.1001, + "step": 77190 + }, + { + "epoch": 0.31189776863811375, + "grad_norm": 595.816650390625, + "learning_rate": 4.3648528635604556e-05, + "loss": 61.2616, + "step": 77200 + }, + { + "epoch": 0.3119381699034814, + "grad_norm": 307.5999450683594, + "learning_rate": 4.364620363384196e-05, + "loss": 66.6272, + "step": 77210 + }, + { + "epoch": 0.31197857116884903, + "grad_norm": 1286.98095703125, + "learning_rate": 4.364387826856202e-05, + "loss": 78.2909, + "step": 77220 + }, + { + "epoch": 0.3120189724342166, + "grad_norm": 1373.5860595703125, + "learning_rate": 4.364155253981008e-05, + "loss": 72.0954, + "step": 77230 + }, + { + "epoch": 0.31205937369958425, + "grad_norm": 392.8745422363281, + "learning_rate": 4.363922644763147e-05, + "loss": 71.9362, + "step": 77240 + }, + { + "epoch": 0.3120997749649519, + "grad_norm": 891.7858276367188, + "learning_rate": 4.363689999207156e-05, + "loss": 62.0088, + "step": 77250 + }, + { + "epoch": 0.31214017623031953, + "grad_norm": 861.945556640625, + "learning_rate": 4.363457317317567e-05, + "loss": 105.5122, + "step": 77260 + }, + { + "epoch": 0.3121805774956872, + "grad_norm": 843.31689453125, + "learning_rate": 4.3632245990989194e-05, + "loss": 52.6803, + "step": 77270 + }, + { + "epoch": 0.3122209787610548, + "grad_norm": 649.4727172851562, + "learning_rate": 4.362991844555749e-05, + "loss": 44.0785, + "step": 77280 + }, + { + "epoch": 0.31226138002642245, + "grad_norm": 961.2431640625, + "learning_rate": 4.362759053692593e-05, + "loss": 61.1215, + "step": 77290 + }, + { + "epoch": 0.31230178129179004, + "grad_norm": 832.6133422851562, + "learning_rate": 4.3625262265139906e-05, + "loss": 71.7721, + "step": 77300 + }, + { + "epoch": 0.3123421825571577, + "grad_norm": 1031.405517578125, + "learning_rate": 4.36229336302448e-05, + "loss": 61.8297, + "step": 77310 + }, + { + "epoch": 0.3123825838225253, + "grad_norm": 821.444580078125, + "learning_rate": 4.3620604632286024e-05, + "loss": 92.3522, + "step": 77320 + }, + { + "epoch": 0.31242298508789296, + "grad_norm": 809.0938720703125, + "learning_rate": 4.361827527130896e-05, + "loss": 75.1257, + "step": 77330 + }, + { + "epoch": 0.3124633863532606, + "grad_norm": 725.1190795898438, + "learning_rate": 4.361594554735905e-05, + "loss": 77.7145, + "step": 77340 + }, + { + "epoch": 0.31250378761862824, + "grad_norm": 888.7479858398438, + "learning_rate": 4.361361546048169e-05, + "loss": 95.2185, + "step": 77350 + }, + { + "epoch": 0.3125441888839958, + "grad_norm": 2857.910400390625, + "learning_rate": 4.361128501072231e-05, + "loss": 68.9192, + "step": 77360 + }, + { + "epoch": 0.31258459014936346, + "grad_norm": 506.87982177734375, + "learning_rate": 4.360895419812635e-05, + "loss": 58.4488, + "step": 77370 + }, + { + "epoch": 0.3126249914147311, + "grad_norm": 749.842041015625, + "learning_rate": 4.360662302273925e-05, + "loss": 71.4121, + "step": 77380 + }, + { + "epoch": 0.31266539268009874, + "grad_norm": 1351.6341552734375, + "learning_rate": 4.360429148460645e-05, + "loss": 75.3694, + "step": 77390 + }, + { + "epoch": 0.3127057939454664, + "grad_norm": 460.1432189941406, + "learning_rate": 4.3601959583773415e-05, + "loss": 89.7496, + "step": 77400 + }, + { + "epoch": 0.312746195210834, + "grad_norm": 1023.0693969726562, + "learning_rate": 4.3599627320285596e-05, + "loss": 110.523, + "step": 77410 + }, + { + "epoch": 0.31278659647620166, + "grad_norm": 385.4729919433594, + "learning_rate": 4.3597294694188475e-05, + "loss": 37.7279, + "step": 77420 + }, + { + "epoch": 0.31282699774156925, + "grad_norm": 1083.031005859375, + "learning_rate": 4.359496170552751e-05, + "loss": 65.8572, + "step": 77430 + }, + { + "epoch": 0.3128673990069369, + "grad_norm": 571.319580078125, + "learning_rate": 4.35926283543482e-05, + "loss": 54.1978, + "step": 77440 + }, + { + "epoch": 0.3129078002723045, + "grad_norm": 535.0450439453125, + "learning_rate": 4.3590294640696025e-05, + "loss": 88.7871, + "step": 77450 + }, + { + "epoch": 0.31294820153767217, + "grad_norm": 625.9801025390625, + "learning_rate": 4.358796056461648e-05, + "loss": 73.7608, + "step": 77460 + }, + { + "epoch": 0.3129886028030398, + "grad_norm": 569.495361328125, + "learning_rate": 4.3585626126155084e-05, + "loss": 78.8239, + "step": 77470 + }, + { + "epoch": 0.31302900406840745, + "grad_norm": 632.259521484375, + "learning_rate": 4.358329132535733e-05, + "loss": 119.5843, + "step": 77480 + }, + { + "epoch": 0.31306940533377503, + "grad_norm": 752.5601806640625, + "learning_rate": 4.3580956162268746e-05, + "loss": 41.869, + "step": 77490 + }, + { + "epoch": 0.31310980659914267, + "grad_norm": 1639.3475341796875, + "learning_rate": 4.357862063693486e-05, + "loss": 104.7077, + "step": 77500 + }, + { + "epoch": 0.3131502078645103, + "grad_norm": 1603.3714599609375, + "learning_rate": 4.35762847494012e-05, + "loss": 133.0148, + "step": 77510 + }, + { + "epoch": 0.31319060912987795, + "grad_norm": 921.978515625, + "learning_rate": 4.35739484997133e-05, + "loss": 54.5854, + "step": 77520 + }, + { + "epoch": 0.3132310103952456, + "grad_norm": 567.6099853515625, + "learning_rate": 4.3571611887916705e-05, + "loss": 45.054, + "step": 77530 + }, + { + "epoch": 0.31327141166061323, + "grad_norm": 1245.0203857421875, + "learning_rate": 4.356927491405699e-05, + "loss": 54.2814, + "step": 77540 + }, + { + "epoch": 0.3133118129259808, + "grad_norm": 798.68359375, + "learning_rate": 4.356693757817969e-05, + "loss": 68.1666, + "step": 77550 + }, + { + "epoch": 0.31335221419134845, + "grad_norm": 838.5280151367188, + "learning_rate": 4.356459988033039e-05, + "loss": 55.8854, + "step": 77560 + }, + { + "epoch": 0.3133926154567161, + "grad_norm": 497.4013366699219, + "learning_rate": 4.356226182055465e-05, + "loss": 43.9287, + "step": 77570 + }, + { + "epoch": 0.31343301672208373, + "grad_norm": 2377.817626953125, + "learning_rate": 4.355992339889806e-05, + "loss": 89.1516, + "step": 77580 + }, + { + "epoch": 0.3134734179874514, + "grad_norm": 1571.2337646484375, + "learning_rate": 4.355758461540622e-05, + "loss": 64.8556, + "step": 77590 + }, + { + "epoch": 0.313513819252819, + "grad_norm": 2129.327880859375, + "learning_rate": 4.355524547012471e-05, + "loss": 88.4741, + "step": 77600 + }, + { + "epoch": 0.31355422051818665, + "grad_norm": 445.593505859375, + "learning_rate": 4.355290596309912e-05, + "loss": 59.0199, + "step": 77610 + }, + { + "epoch": 0.31359462178355424, + "grad_norm": 660.9097290039062, + "learning_rate": 4.3550566094375086e-05, + "loss": 65.1326, + "step": 77620 + }, + { + "epoch": 0.3136350230489219, + "grad_norm": 670.1815185546875, + "learning_rate": 4.3548225863998224e-05, + "loss": 76.792, + "step": 77630 + }, + { + "epoch": 0.3136754243142895, + "grad_norm": 882.048828125, + "learning_rate": 4.354588527201414e-05, + "loss": 70.3437, + "step": 77640 + }, + { + "epoch": 0.31371582557965716, + "grad_norm": 831.5132446289062, + "learning_rate": 4.3543544318468485e-05, + "loss": 60.2682, + "step": 77650 + }, + { + "epoch": 0.3137562268450248, + "grad_norm": 1422.9718017578125, + "learning_rate": 4.354120300340688e-05, + "loss": 69.6125, + "step": 77660 + }, + { + "epoch": 0.31379662811039244, + "grad_norm": 703.7569580078125, + "learning_rate": 4.353886132687497e-05, + "loss": 46.6462, + "step": 77670 + }, + { + "epoch": 0.31383702937576, + "grad_norm": 620.9691772460938, + "learning_rate": 4.353651928891842e-05, + "loss": 86.715, + "step": 77680 + }, + { + "epoch": 0.31387743064112766, + "grad_norm": 586.15625, + "learning_rate": 4.353417688958289e-05, + "loss": 58.7201, + "step": 77690 + }, + { + "epoch": 0.3139178319064953, + "grad_norm": 1025.6397705078125, + "learning_rate": 4.3531834128914025e-05, + "loss": 46.0605, + "step": 77700 + }, + { + "epoch": 0.31395823317186294, + "grad_norm": 1030.073974609375, + "learning_rate": 4.352949100695752e-05, + "loss": 79.4799, + "step": 77710 + }, + { + "epoch": 0.3139986344372306, + "grad_norm": 862.0059204101562, + "learning_rate": 4.352714752375906e-05, + "loss": 94.2031, + "step": 77720 + }, + { + "epoch": 0.3140390357025982, + "grad_norm": 968.3876342773438, + "learning_rate": 4.352480367936431e-05, + "loss": 63.7685, + "step": 77730 + }, + { + "epoch": 0.31407943696796586, + "grad_norm": 819.8076782226562, + "learning_rate": 4.352245947381897e-05, + "loss": 77.5755, + "step": 77740 + }, + { + "epoch": 0.31411983823333345, + "grad_norm": 1108.0177001953125, + "learning_rate": 4.352011490716875e-05, + "loss": 70.4065, + "step": 77750 + }, + { + "epoch": 0.3141602394987011, + "grad_norm": 698.9972534179688, + "learning_rate": 4.351776997945936e-05, + "loss": 72.4767, + "step": 77760 + }, + { + "epoch": 0.31420064076406873, + "grad_norm": 1234.1883544921875, + "learning_rate": 4.351542469073651e-05, + "loss": 68.6958, + "step": 77770 + }, + { + "epoch": 0.31424104202943637, + "grad_norm": 493.3515625, + "learning_rate": 4.351307904104592e-05, + "loss": 51.1366, + "step": 77780 + }, + { + "epoch": 0.314281443294804, + "grad_norm": 1404.8785400390625, + "learning_rate": 4.351073303043332e-05, + "loss": 85.1898, + "step": 77790 + }, + { + "epoch": 0.31432184456017165, + "grad_norm": 2445.363037109375, + "learning_rate": 4.350838665894446e-05, + "loss": 67.7399, + "step": 77800 + }, + { + "epoch": 0.31436224582553923, + "grad_norm": 1352.8099365234375, + "learning_rate": 4.350603992662506e-05, + "loss": 52.9771, + "step": 77810 + }, + { + "epoch": 0.3144026470909069, + "grad_norm": 969.1926879882812, + "learning_rate": 4.3503692833520894e-05, + "loss": 62.1888, + "step": 77820 + }, + { + "epoch": 0.3144430483562745, + "grad_norm": 486.71185302734375, + "learning_rate": 4.350134537967771e-05, + "loss": 54.4507, + "step": 77830 + }, + { + "epoch": 0.31448344962164215, + "grad_norm": 817.6029663085938, + "learning_rate": 4.3498997565141267e-05, + "loss": 42.0353, + "step": 77840 + }, + { + "epoch": 0.3145238508870098, + "grad_norm": 1096.86669921875, + "learning_rate": 4.349664938995734e-05, + "loss": 79.652, + "step": 77850 + }, + { + "epoch": 0.31456425215237743, + "grad_norm": 1449.87744140625, + "learning_rate": 4.3494300854171715e-05, + "loss": 93.2444, + "step": 77860 + }, + { + "epoch": 0.314604653417745, + "grad_norm": 821.5718994140625, + "learning_rate": 4.349195195783017e-05, + "loss": 53.875, + "step": 77870 + }, + { + "epoch": 0.31464505468311266, + "grad_norm": 1154.4615478515625, + "learning_rate": 4.348960270097851e-05, + "loss": 62.7061, + "step": 77880 + }, + { + "epoch": 0.3146854559484803, + "grad_norm": 792.2052001953125, + "learning_rate": 4.348725308366252e-05, + "loss": 83.3605, + "step": 77890 + }, + { + "epoch": 0.31472585721384794, + "grad_norm": 3039.34228515625, + "learning_rate": 4.348490310592801e-05, + "loss": 87.3251, + "step": 77900 + }, + { + "epoch": 0.3147662584792156, + "grad_norm": 886.7009887695312, + "learning_rate": 4.34825527678208e-05, + "loss": 57.0282, + "step": 77910 + }, + { + "epoch": 0.3148066597445832, + "grad_norm": 786.5516967773438, + "learning_rate": 4.348020206938672e-05, + "loss": 87.8153, + "step": 77920 + }, + { + "epoch": 0.31484706100995086, + "grad_norm": 739.66015625, + "learning_rate": 4.347785101067157e-05, + "loss": 58.632, + "step": 77930 + }, + { + "epoch": 0.31488746227531844, + "grad_norm": 578.0338134765625, + "learning_rate": 4.347549959172121e-05, + "loss": 80.8651, + "step": 77940 + }, + { + "epoch": 0.3149278635406861, + "grad_norm": 1119.5306396484375, + "learning_rate": 4.347314781258147e-05, + "loss": 68.4437, + "step": 77950 + }, + { + "epoch": 0.3149682648060537, + "grad_norm": 0.0, + "learning_rate": 4.3470795673298206e-05, + "loss": 57.3587, + "step": 77960 + }, + { + "epoch": 0.31500866607142136, + "grad_norm": 681.5150146484375, + "learning_rate": 4.3468443173917267e-05, + "loss": 52.9524, + "step": 77970 + }, + { + "epoch": 0.315049067336789, + "grad_norm": 412.8719787597656, + "learning_rate": 4.346609031448452e-05, + "loss": 36.4723, + "step": 77980 + }, + { + "epoch": 0.31508946860215664, + "grad_norm": 945.4547119140625, + "learning_rate": 4.346373709504584e-05, + "loss": 67.0522, + "step": 77990 + }, + { + "epoch": 0.3151298698675242, + "grad_norm": 992.1616821289062, + "learning_rate": 4.3461383515647106e-05, + "loss": 78.1435, + "step": 78000 + }, + { + "epoch": 0.31517027113289187, + "grad_norm": 503.8204650878906, + "learning_rate": 4.345902957633418e-05, + "loss": 48.7498, + "step": 78010 + }, + { + "epoch": 0.3152106723982595, + "grad_norm": 780.5662841796875, + "learning_rate": 4.3456675277152973e-05, + "loss": 56.2306, + "step": 78020 + }, + { + "epoch": 0.31525107366362715, + "grad_norm": 454.04132080078125, + "learning_rate": 4.345432061814938e-05, + "loss": 85.0091, + "step": 78030 + }, + { + "epoch": 0.3152914749289948, + "grad_norm": 1036.3134765625, + "learning_rate": 4.345196559936932e-05, + "loss": 53.9506, + "step": 78040 + }, + { + "epoch": 0.3153318761943624, + "grad_norm": 929.426513671875, + "learning_rate": 4.344961022085867e-05, + "loss": 70.4333, + "step": 78050 + }, + { + "epoch": 0.31537227745973007, + "grad_norm": 597.806640625, + "learning_rate": 4.344725448266338e-05, + "loss": 41.9415, + "step": 78060 + }, + { + "epoch": 0.31541267872509765, + "grad_norm": 391.077392578125, + "learning_rate": 4.3444898384829364e-05, + "loss": 54.7683, + "step": 78070 + }, + { + "epoch": 0.3154530799904653, + "grad_norm": 873.1908569335938, + "learning_rate": 4.3442541927402566e-05, + "loss": 60.2897, + "step": 78080 + }, + { + "epoch": 0.31549348125583293, + "grad_norm": 464.908447265625, + "learning_rate": 4.344018511042891e-05, + "loss": 57.7049, + "step": 78090 + }, + { + "epoch": 0.31553388252120057, + "grad_norm": 481.4813537597656, + "learning_rate": 4.343782793395435e-05, + "loss": 67.3648, + "step": 78100 + }, + { + "epoch": 0.3155742837865682, + "grad_norm": 1816.134521484375, + "learning_rate": 4.343547039802485e-05, + "loss": 79.2945, + "step": 78110 + }, + { + "epoch": 0.31561468505193585, + "grad_norm": 561.038330078125, + "learning_rate": 4.3433112502686355e-05, + "loss": 50.8454, + "step": 78120 + }, + { + "epoch": 0.31565508631730343, + "grad_norm": 685.545654296875, + "learning_rate": 4.3430754247984845e-05, + "loss": 54.5997, + "step": 78130 + }, + { + "epoch": 0.3156954875826711, + "grad_norm": 618.0608520507812, + "learning_rate": 4.342839563396629e-05, + "loss": 76.7737, + "step": 78140 + }, + { + "epoch": 0.3157358888480387, + "grad_norm": 1280.976806640625, + "learning_rate": 4.3426036660676686e-05, + "loss": 54.933, + "step": 78150 + }, + { + "epoch": 0.31577629011340635, + "grad_norm": 650.5822143554688, + "learning_rate": 4.3423677328161996e-05, + "loss": 71.7501, + "step": 78160 + }, + { + "epoch": 0.315816691378774, + "grad_norm": 597.2694091796875, + "learning_rate": 4.342131763646824e-05, + "loss": 68.88, + "step": 78170 + }, + { + "epoch": 0.31585709264414163, + "grad_norm": 1208.90576171875, + "learning_rate": 4.341895758564141e-05, + "loss": 67.8843, + "step": 78180 + }, + { + "epoch": 0.3158974939095092, + "grad_norm": 830.9584350585938, + "learning_rate": 4.3416597175727514e-05, + "loss": 89.2304, + "step": 78190 + }, + { + "epoch": 0.31593789517487686, + "grad_norm": 1917.3353271484375, + "learning_rate": 4.3414236406772584e-05, + "loss": 111.667, + "step": 78200 + }, + { + "epoch": 0.3159782964402445, + "grad_norm": 924.7677612304688, + "learning_rate": 4.3411875278822635e-05, + "loss": 49.8145, + "step": 78210 + }, + { + "epoch": 0.31601869770561214, + "grad_norm": 469.1351318359375, + "learning_rate": 4.340951379192369e-05, + "loss": 116.5447, + "step": 78220 + }, + { + "epoch": 0.3160590989709798, + "grad_norm": 495.249755859375, + "learning_rate": 4.34071519461218e-05, + "loss": 63.0928, + "step": 78230 + }, + { + "epoch": 0.3160995002363474, + "grad_norm": 1259.7349853515625, + "learning_rate": 4.3404789741463e-05, + "loss": 54.0104, + "step": 78240 + }, + { + "epoch": 0.31613990150171506, + "grad_norm": 512.0302124023438, + "learning_rate": 4.3402427177993366e-05, + "loss": 48.6828, + "step": 78250 + }, + { + "epoch": 0.31618030276708264, + "grad_norm": 1092.3133544921875, + "learning_rate": 4.340006425575892e-05, + "loss": 51.555, + "step": 78260 + }, + { + "epoch": 0.3162207040324503, + "grad_norm": 668.2493896484375, + "learning_rate": 4.339770097480576e-05, + "loss": 67.2536, + "step": 78270 + }, + { + "epoch": 0.3162611052978179, + "grad_norm": 832.0966796875, + "learning_rate": 4.3395337335179945e-05, + "loss": 78.7102, + "step": 78280 + }, + { + "epoch": 0.31630150656318556, + "grad_norm": 931.7671508789062, + "learning_rate": 4.339297333692756e-05, + "loss": 57.0442, + "step": 78290 + }, + { + "epoch": 0.3163419078285532, + "grad_norm": 1212.2379150390625, + "learning_rate": 4.339060898009469e-05, + "loss": 83.3376, + "step": 78300 + }, + { + "epoch": 0.31638230909392084, + "grad_norm": 393.2463073730469, + "learning_rate": 4.338824426472743e-05, + "loss": 48.1462, + "step": 78310 + }, + { + "epoch": 0.31642271035928843, + "grad_norm": 422.79998779296875, + "learning_rate": 4.338587919087187e-05, + "loss": 57.0146, + "step": 78320 + }, + { + "epoch": 0.31646311162465607, + "grad_norm": 1542.5203857421875, + "learning_rate": 4.3383513758574143e-05, + "loss": 69.819, + "step": 78330 + }, + { + "epoch": 0.3165035128900237, + "grad_norm": 858.1940307617188, + "learning_rate": 4.338114796788035e-05, + "loss": 95.8173, + "step": 78340 + }, + { + "epoch": 0.31654391415539135, + "grad_norm": 393.74505615234375, + "learning_rate": 4.337878181883661e-05, + "loss": 46.5559, + "step": 78350 + }, + { + "epoch": 0.316584315420759, + "grad_norm": 786.9148559570312, + "learning_rate": 4.3376415311489056e-05, + "loss": 68.8837, + "step": 78360 + }, + { + "epoch": 0.31662471668612663, + "grad_norm": 1402.4737548828125, + "learning_rate": 4.337404844588382e-05, + "loss": 89.8514, + "step": 78370 + }, + { + "epoch": 0.31666511795149427, + "grad_norm": 773.1746826171875, + "learning_rate": 4.337168122206706e-05, + "loss": 74.1317, + "step": 78380 + }, + { + "epoch": 0.31670551921686185, + "grad_norm": 653.9239501953125, + "learning_rate": 4.3369313640084916e-05, + "loss": 51.6219, + "step": 78390 + }, + { + "epoch": 0.3167459204822295, + "grad_norm": 664.2848510742188, + "learning_rate": 4.336694569998354e-05, + "loss": 74.1221, + "step": 78400 + }, + { + "epoch": 0.31678632174759713, + "grad_norm": 609.3822021484375, + "learning_rate": 4.3364577401809105e-05, + "loss": 71.547, + "step": 78410 + }, + { + "epoch": 0.3168267230129648, + "grad_norm": 1231.6363525390625, + "learning_rate": 4.336220874560778e-05, + "loss": 77.0097, + "step": 78420 + }, + { + "epoch": 0.3168671242783324, + "grad_norm": 205.0146484375, + "learning_rate": 4.3359839731425735e-05, + "loss": 50.0911, + "step": 78430 + }, + { + "epoch": 0.31690752554370005, + "grad_norm": 389.177978515625, + "learning_rate": 4.335747035930916e-05, + "loss": 59.2829, + "step": 78440 + }, + { + "epoch": 0.31694792680906764, + "grad_norm": 600.1491088867188, + "learning_rate": 4.3355100629304254e-05, + "loss": 79.876, + "step": 78450 + }, + { + "epoch": 0.3169883280744353, + "grad_norm": 544.903564453125, + "learning_rate": 4.335273054145722e-05, + "loss": 82.2558, + "step": 78460 + }, + { + "epoch": 0.3170287293398029, + "grad_norm": 909.1273803710938, + "learning_rate": 4.335036009581425e-05, + "loss": 70.1611, + "step": 78470 + }, + { + "epoch": 0.31706913060517056, + "grad_norm": 639.6483764648438, + "learning_rate": 4.334798929242155e-05, + "loss": 69.5685, + "step": 78480 + }, + { + "epoch": 0.3171095318705382, + "grad_norm": 615.6561889648438, + "learning_rate": 4.3345618131325374e-05, + "loss": 76.2855, + "step": 78490 + }, + { + "epoch": 0.31714993313590584, + "grad_norm": 3859.427490234375, + "learning_rate": 4.334324661257191e-05, + "loss": 76.5629, + "step": 78500 + }, + { + "epoch": 0.3171903344012734, + "grad_norm": 1610.2098388671875, + "learning_rate": 4.334087473620742e-05, + "loss": 103.2507, + "step": 78510 + }, + { + "epoch": 0.31723073566664106, + "grad_norm": 588.2677612304688, + "learning_rate": 4.3338502502278134e-05, + "loss": 51.7838, + "step": 78520 + }, + { + "epoch": 0.3172711369320087, + "grad_norm": 192.1394500732422, + "learning_rate": 4.333612991083029e-05, + "loss": 68.0169, + "step": 78530 + }, + { + "epoch": 0.31731153819737634, + "grad_norm": 960.3283081054688, + "learning_rate": 4.3333756961910166e-05, + "loss": 47.2101, + "step": 78540 + }, + { + "epoch": 0.317351939462744, + "grad_norm": 1771.0201416015625, + "learning_rate": 4.3331383655564006e-05, + "loss": 60.5402, + "step": 78550 + }, + { + "epoch": 0.3173923407281116, + "grad_norm": 1430.1416015625, + "learning_rate": 4.3329009991838084e-05, + "loss": 53.3842, + "step": 78560 + }, + { + "epoch": 0.31743274199347926, + "grad_norm": 429.119873046875, + "learning_rate": 4.3326635970778676e-05, + "loss": 48.8891, + "step": 78570 + }, + { + "epoch": 0.31747314325884685, + "grad_norm": 798.0321655273438, + "learning_rate": 4.3324261592432056e-05, + "loss": 68.9384, + "step": 78580 + }, + { + "epoch": 0.3175135445242145, + "grad_norm": 999.2288208007812, + "learning_rate": 4.3321886856844534e-05, + "loss": 57.1692, + "step": 78590 + }, + { + "epoch": 0.3175539457895821, + "grad_norm": 923.92724609375, + "learning_rate": 4.331951176406239e-05, + "loss": 43.5955, + "step": 78600 + }, + { + "epoch": 0.31759434705494977, + "grad_norm": 967.3873901367188, + "learning_rate": 4.331713631413194e-05, + "loss": 47.511, + "step": 78610 + }, + { + "epoch": 0.3176347483203174, + "grad_norm": 884.6947021484375, + "learning_rate": 4.331476050709948e-05, + "loss": 57.999, + "step": 78620 + }, + { + "epoch": 0.31767514958568505, + "grad_norm": 611.1873168945312, + "learning_rate": 4.331238434301134e-05, + "loss": 77.5249, + "step": 78630 + }, + { + "epoch": 0.31771555085105263, + "grad_norm": 913.7862548828125, + "learning_rate": 4.3310007821913836e-05, + "loss": 54.6169, + "step": 78640 + }, + { + "epoch": 0.31775595211642027, + "grad_norm": 1171.2972412109375, + "learning_rate": 4.330763094385329e-05, + "loss": 80.6045, + "step": 78650 + }, + { + "epoch": 0.3177963533817879, + "grad_norm": 507.2369384765625, + "learning_rate": 4.330525370887607e-05, + "loss": 46.0367, + "step": 78660 + }, + { + "epoch": 0.31783675464715555, + "grad_norm": 564.16259765625, + "learning_rate": 4.33028761170285e-05, + "loss": 73.1167, + "step": 78670 + }, + { + "epoch": 0.3178771559125232, + "grad_norm": 897.42919921875, + "learning_rate": 4.330049816835694e-05, + "loss": 92.9207, + "step": 78680 + }, + { + "epoch": 0.31791755717789083, + "grad_norm": 3515.646240234375, + "learning_rate": 4.3298119862907744e-05, + "loss": 119.0089, + "step": 78690 + }, + { + "epoch": 0.31795795844325847, + "grad_norm": 1024.9483642578125, + "learning_rate": 4.329574120072728e-05, + "loss": 58.4423, + "step": 78700 + }, + { + "epoch": 0.31799835970862605, + "grad_norm": 969.68115234375, + "learning_rate": 4.329336218186192e-05, + "loss": 55.7534, + "step": 78710 + }, + { + "epoch": 0.3180387609739937, + "grad_norm": 602.466552734375, + "learning_rate": 4.3290982806358046e-05, + "loss": 75.1919, + "step": 78720 + }, + { + "epoch": 0.31807916223936133, + "grad_norm": 599.2578735351562, + "learning_rate": 4.3288603074262054e-05, + "loss": 105.2766, + "step": 78730 + }, + { + "epoch": 0.318119563504729, + "grad_norm": 790.10693359375, + "learning_rate": 4.328622298562033e-05, + "loss": 53.7875, + "step": 78740 + }, + { + "epoch": 0.3181599647700966, + "grad_norm": 551.5433349609375, + "learning_rate": 4.3283842540479264e-05, + "loss": 63.0458, + "step": 78750 + }, + { + "epoch": 0.31820036603546425, + "grad_norm": 0.0, + "learning_rate": 4.3281461738885274e-05, + "loss": 61.5794, + "step": 78760 + }, + { + "epoch": 0.31824076730083184, + "grad_norm": 681.5363159179688, + "learning_rate": 4.327908058088479e-05, + "loss": 67.191, + "step": 78770 + }, + { + "epoch": 0.3182811685661995, + "grad_norm": 1063.3834228515625, + "learning_rate": 4.327669906652421e-05, + "loss": 82.7017, + "step": 78780 + }, + { + "epoch": 0.3183215698315671, + "grad_norm": 500.39361572265625, + "learning_rate": 4.327431719584997e-05, + "loss": 85.2169, + "step": 78790 + }, + { + "epoch": 0.31836197109693476, + "grad_norm": 1306.898681640625, + "learning_rate": 4.3271934968908514e-05, + "loss": 72.2316, + "step": 78800 + }, + { + "epoch": 0.3184023723623024, + "grad_norm": 815.1703491210938, + "learning_rate": 4.326955238574627e-05, + "loss": 50.3086, + "step": 78810 + }, + { + "epoch": 0.31844277362767004, + "grad_norm": 460.31005859375, + "learning_rate": 4.32671694464097e-05, + "loss": 64.6161, + "step": 78820 + }, + { + "epoch": 0.3184831748930376, + "grad_norm": 2179.723388671875, + "learning_rate": 4.326478615094526e-05, + "loss": 49.6086, + "step": 78830 + }, + { + "epoch": 0.31852357615840526, + "grad_norm": 1665.5799560546875, + "learning_rate": 4.3262402499399404e-05, + "loss": 97.2382, + "step": 78840 + }, + { + "epoch": 0.3185639774237729, + "grad_norm": 1162.87841796875, + "learning_rate": 4.326001849181862e-05, + "loss": 57.7263, + "step": 78850 + }, + { + "epoch": 0.31860437868914054, + "grad_norm": 333.8135681152344, + "learning_rate": 4.325763412824937e-05, + "loss": 49.9047, + "step": 78860 + }, + { + "epoch": 0.3186447799545082, + "grad_norm": 510.8147888183594, + "learning_rate": 4.325524940873814e-05, + "loss": 82.3427, + "step": 78870 + }, + { + "epoch": 0.3186851812198758, + "grad_norm": 434.4716491699219, + "learning_rate": 4.325286433333142e-05, + "loss": 71.3787, + "step": 78880 + }, + { + "epoch": 0.31872558248524346, + "grad_norm": 725.61279296875, + "learning_rate": 4.325047890207572e-05, + "loss": 63.1463, + "step": 78890 + }, + { + "epoch": 0.31876598375061105, + "grad_norm": 366.7152404785156, + "learning_rate": 4.324809311501754e-05, + "loss": 66.2424, + "step": 78900 + }, + { + "epoch": 0.3188063850159787, + "grad_norm": 1318.6060791015625, + "learning_rate": 4.3245706972203385e-05, + "loss": 67.3569, + "step": 78910 + }, + { + "epoch": 0.31884678628134633, + "grad_norm": 727.7313842773438, + "learning_rate": 4.3243320473679785e-05, + "loss": 63.7311, + "step": 78920 + }, + { + "epoch": 0.31888718754671397, + "grad_norm": 1180.8690185546875, + "learning_rate": 4.324093361949325e-05, + "loss": 54.1359, + "step": 78930 + }, + { + "epoch": 0.3189275888120816, + "grad_norm": 652.950439453125, + "learning_rate": 4.323854640969033e-05, + "loss": 66.1783, + "step": 78940 + }, + { + "epoch": 0.31896799007744925, + "grad_norm": 554.9948120117188, + "learning_rate": 4.323615884431756e-05, + "loss": 77.2896, + "step": 78950 + }, + { + "epoch": 0.31900839134281683, + "grad_norm": 657.7294921875, + "learning_rate": 4.323377092342148e-05, + "loss": 56.7781, + "step": 78960 + }, + { + "epoch": 0.3190487926081845, + "grad_norm": 1056.711181640625, + "learning_rate": 4.323138264704864e-05, + "loss": 51.1476, + "step": 78970 + }, + { + "epoch": 0.3190891938735521, + "grad_norm": 1820.9912109375, + "learning_rate": 4.322899401524563e-05, + "loss": 90.834, + "step": 78980 + }, + { + "epoch": 0.31912959513891975, + "grad_norm": 1768.5189208984375, + "learning_rate": 4.322660502805899e-05, + "loss": 79.8078, + "step": 78990 + }, + { + "epoch": 0.3191699964042874, + "grad_norm": 601.8983154296875, + "learning_rate": 4.3224215685535294e-05, + "loss": 58.5694, + "step": 79000 + }, + { + "epoch": 0.31921039766965503, + "grad_norm": 1303.4908447265625, + "learning_rate": 4.322182598772113e-05, + "loss": 45.729, + "step": 79010 + }, + { + "epoch": 0.3192507989350226, + "grad_norm": 480.8334655761719, + "learning_rate": 4.321943593466309e-05, + "loss": 84.9618, + "step": 79020 + }, + { + "epoch": 0.31929120020039026, + "grad_norm": 932.1970825195312, + "learning_rate": 4.321704552640777e-05, + "loss": 67.6896, + "step": 79030 + }, + { + "epoch": 0.3193316014657579, + "grad_norm": 1057.0179443359375, + "learning_rate": 4.321465476300177e-05, + "loss": 100.6307, + "step": 79040 + }, + { + "epoch": 0.31937200273112554, + "grad_norm": 996.9810791015625, + "learning_rate": 4.321226364449169e-05, + "loss": 71.2474, + "step": 79050 + }, + { + "epoch": 0.3194124039964932, + "grad_norm": 568.8705444335938, + "learning_rate": 4.320987217092416e-05, + "loss": 43.2181, + "step": 79060 + }, + { + "epoch": 0.3194528052618608, + "grad_norm": 420.3672180175781, + "learning_rate": 4.320748034234579e-05, + "loss": 57.1254, + "step": 79070 + }, + { + "epoch": 0.31949320652722846, + "grad_norm": 1757.289306640625, + "learning_rate": 4.3205088158803226e-05, + "loss": 68.5235, + "step": 79080 + }, + { + "epoch": 0.31953360779259604, + "grad_norm": 660.7200927734375, + "learning_rate": 4.3202695620343083e-05, + "loss": 50.2219, + "step": 79090 + }, + { + "epoch": 0.3195740090579637, + "grad_norm": 1262.357421875, + "learning_rate": 4.320030272701203e-05, + "loss": 58.4075, + "step": 79100 + }, + { + "epoch": 0.3196144103233313, + "grad_norm": 898.780029296875, + "learning_rate": 4.31979094788567e-05, + "loss": 88.9766, + "step": 79110 + }, + { + "epoch": 0.31965481158869896, + "grad_norm": 535.6188354492188, + "learning_rate": 4.319551587592376e-05, + "loss": 30.2677, + "step": 79120 + }, + { + "epoch": 0.3196952128540666, + "grad_norm": 559.2113647460938, + "learning_rate": 4.319312191825987e-05, + "loss": 78.3262, + "step": 79130 + }, + { + "epoch": 0.31973561411943424, + "grad_norm": 952.9019775390625, + "learning_rate": 4.31907276059117e-05, + "loss": 67.9382, + "step": 79140 + }, + { + "epoch": 0.3197760153848018, + "grad_norm": 959.8912353515625, + "learning_rate": 4.318833293892592e-05, + "loss": 68.8999, + "step": 79150 + }, + { + "epoch": 0.31981641665016947, + "grad_norm": 985.2177124023438, + "learning_rate": 4.318593791734924e-05, + "loss": 73.2616, + "step": 79160 + }, + { + "epoch": 0.3198568179155371, + "grad_norm": 1765.3321533203125, + "learning_rate": 4.318354254122833e-05, + "loss": 101.728, + "step": 79170 + }, + { + "epoch": 0.31989721918090475, + "grad_norm": 2424.275390625, + "learning_rate": 4.31811468106099e-05, + "loss": 40.6484, + "step": 79180 + }, + { + "epoch": 0.3199376204462724, + "grad_norm": 529.9296264648438, + "learning_rate": 4.317875072554065e-05, + "loss": 56.8446, + "step": 79190 + }, + { + "epoch": 0.31997802171164, + "grad_norm": 988.1986083984375, + "learning_rate": 4.31763542860673e-05, + "loss": 89.1671, + "step": 79200 + }, + { + "epoch": 0.32001842297700767, + "grad_norm": 954.2215576171875, + "learning_rate": 4.317395749223656e-05, + "loss": 62.5503, + "step": 79210 + }, + { + "epoch": 0.32005882424237525, + "grad_norm": 1343.7386474609375, + "learning_rate": 4.3171560344095164e-05, + "loss": 85.5978, + "step": 79220 + }, + { + "epoch": 0.3200992255077429, + "grad_norm": 992.0386962890625, + "learning_rate": 4.3169162841689846e-05, + "loss": 77.3141, + "step": 79230 + }, + { + "epoch": 0.32013962677311053, + "grad_norm": 684.0596313476562, + "learning_rate": 4.3166764985067343e-05, + "loss": 68.0206, + "step": 79240 + }, + { + "epoch": 0.32018002803847817, + "grad_norm": 544.3250732421875, + "learning_rate": 4.31643667742744e-05, + "loss": 64.4232, + "step": 79250 + }, + { + "epoch": 0.3202204293038458, + "grad_norm": 485.05413818359375, + "learning_rate": 4.3161968209357776e-05, + "loss": 72.3071, + "step": 79260 + }, + { + "epoch": 0.32026083056921345, + "grad_norm": 1109.7979736328125, + "learning_rate": 4.315956929036423e-05, + "loss": 87.8763, + "step": 79270 + }, + { + "epoch": 0.32030123183458103, + "grad_norm": 735.7939453125, + "learning_rate": 4.3157170017340545e-05, + "loss": 60.9775, + "step": 79280 + }, + { + "epoch": 0.3203416330999487, + "grad_norm": 515.8373413085938, + "learning_rate": 4.3154770390333463e-05, + "loss": 57.4723, + "step": 79290 + }, + { + "epoch": 0.3203820343653163, + "grad_norm": 11893.99609375, + "learning_rate": 4.3152370409389795e-05, + "loss": 97.6898, + "step": 79300 + }, + { + "epoch": 0.32042243563068395, + "grad_norm": 583.2001953125, + "learning_rate": 4.3149970074556324e-05, + "loss": 104.1178, + "step": 79310 + }, + { + "epoch": 0.3204628368960516, + "grad_norm": 1154.815185546875, + "learning_rate": 4.314756938587984e-05, + "loss": 47.7879, + "step": 79320 + }, + { + "epoch": 0.32050323816141923, + "grad_norm": 1132.3023681640625, + "learning_rate": 4.314516834340715e-05, + "loss": 59.4125, + "step": 79330 + }, + { + "epoch": 0.3205436394267868, + "grad_norm": 530.7276611328125, + "learning_rate": 4.3142766947185056e-05, + "loss": 76.4587, + "step": 79340 + }, + { + "epoch": 0.32058404069215446, + "grad_norm": 943.7814331054688, + "learning_rate": 4.314036519726038e-05, + "loss": 83.0291, + "step": 79350 + }, + { + "epoch": 0.3206244419575221, + "grad_norm": 1113.9444580078125, + "learning_rate": 4.3137963093679945e-05, + "loss": 62.54, + "step": 79360 + }, + { + "epoch": 0.32066484322288974, + "grad_norm": 839.0965576171875, + "learning_rate": 4.313556063649059e-05, + "loss": 65.0847, + "step": 79370 + }, + { + "epoch": 0.3207052444882574, + "grad_norm": 645.9111938476562, + "learning_rate": 4.313315782573913e-05, + "loss": 66.1717, + "step": 79380 + }, + { + "epoch": 0.320745645753625, + "grad_norm": 1160.1456298828125, + "learning_rate": 4.3130754661472435e-05, + "loss": 72.6717, + "step": 79390 + }, + { + "epoch": 0.32078604701899266, + "grad_norm": 861.6610107421875, + "learning_rate": 4.3128351143737335e-05, + "loss": 94.8734, + "step": 79400 + }, + { + "epoch": 0.32082644828436024, + "grad_norm": 1239.6209716796875, + "learning_rate": 4.31259472725807e-05, + "loss": 65.5956, + "step": 79410 + }, + { + "epoch": 0.3208668495497279, + "grad_norm": 1918.2100830078125, + "learning_rate": 4.312354304804939e-05, + "loss": 98.4281, + "step": 79420 + }, + { + "epoch": 0.3209072508150955, + "grad_norm": 496.2858581542969, + "learning_rate": 4.312113847019028e-05, + "loss": 64.9097, + "step": 79430 + }, + { + "epoch": 0.32094765208046316, + "grad_norm": 568.856689453125, + "learning_rate": 4.3118733539050244e-05, + "loss": 80.5664, + "step": 79440 + }, + { + "epoch": 0.3209880533458308, + "grad_norm": 647.578369140625, + "learning_rate": 4.311632825467617e-05, + "loss": 81.0642, + "step": 79450 + }, + { + "epoch": 0.32102845461119844, + "grad_norm": 606.3375854492188, + "learning_rate": 4.311392261711495e-05, + "loss": 87.3034, + "step": 79460 + }, + { + "epoch": 0.32106885587656603, + "grad_norm": 486.86273193359375, + "learning_rate": 4.3111516626413485e-05, + "loss": 36.4118, + "step": 79470 + }, + { + "epoch": 0.32110925714193367, + "grad_norm": 527.7968139648438, + "learning_rate": 4.310911028261867e-05, + "loss": 75.8701, + "step": 79480 + }, + { + "epoch": 0.3211496584073013, + "grad_norm": 899.1015014648438, + "learning_rate": 4.310670358577744e-05, + "loss": 64.9072, + "step": 79490 + }, + { + "epoch": 0.32119005967266895, + "grad_norm": 491.0596923828125, + "learning_rate": 4.3104296535936695e-05, + "loss": 62.814, + "step": 79500 + }, + { + "epoch": 0.3212304609380366, + "grad_norm": 643.3750610351562, + "learning_rate": 4.3101889133143365e-05, + "loss": 62.7745, + "step": 79510 + }, + { + "epoch": 0.32127086220340423, + "grad_norm": 584.9881591796875, + "learning_rate": 4.3099481377444384e-05, + "loss": 70.7787, + "step": 79520 + }, + { + "epoch": 0.32131126346877187, + "grad_norm": 0.0, + "learning_rate": 4.30970732688867e-05, + "loss": 72.2548, + "step": 79530 + }, + { + "epoch": 0.32135166473413945, + "grad_norm": 551.8460693359375, + "learning_rate": 4.309466480751726e-05, + "loss": 57.843, + "step": 79540 + }, + { + "epoch": 0.3213920659995071, + "grad_norm": 1137.9078369140625, + "learning_rate": 4.309225599338301e-05, + "loss": 67.0156, + "step": 79550 + }, + { + "epoch": 0.32143246726487473, + "grad_norm": 1040.0809326171875, + "learning_rate": 4.308984682653092e-05, + "loss": 51.4768, + "step": 79560 + }, + { + "epoch": 0.3214728685302424, + "grad_norm": 496.1695556640625, + "learning_rate": 4.308743730700795e-05, + "loss": 49.8229, + "step": 79570 + }, + { + "epoch": 0.32151326979561, + "grad_norm": 928.9419555664062, + "learning_rate": 4.308502743486107e-05, + "loss": 68.5493, + "step": 79580 + }, + { + "epoch": 0.32155367106097765, + "grad_norm": 554.7418212890625, + "learning_rate": 4.308261721013728e-05, + "loss": 58.4424, + "step": 79590 + }, + { + "epoch": 0.32159407232634524, + "grad_norm": 530.4016723632812, + "learning_rate": 4.3080206632883554e-05, + "loss": 71.472, + "step": 79600 + }, + { + "epoch": 0.3216344735917129, + "grad_norm": 823.0756225585938, + "learning_rate": 4.307779570314689e-05, + "loss": 54.4288, + "step": 79610 + }, + { + "epoch": 0.3216748748570805, + "grad_norm": 523.12744140625, + "learning_rate": 4.307538442097429e-05, + "loss": 52.3814, + "step": 79620 + }, + { + "epoch": 0.32171527612244816, + "grad_norm": 1036.486083984375, + "learning_rate": 4.307297278641277e-05, + "loss": 56.4284, + "step": 79630 + }, + { + "epoch": 0.3217556773878158, + "grad_norm": 886.7937622070312, + "learning_rate": 4.307056079950934e-05, + "loss": 61.4247, + "step": 79640 + }, + { + "epoch": 0.32179607865318344, + "grad_norm": 1947.759033203125, + "learning_rate": 4.306814846031102e-05, + "loss": 69.1207, + "step": 79650 + }, + { + "epoch": 0.321836479918551, + "grad_norm": 1469.478759765625, + "learning_rate": 4.306573576886484e-05, + "loss": 76.3007, + "step": 79660 + }, + { + "epoch": 0.32187688118391866, + "grad_norm": 883.7667846679688, + "learning_rate": 4.306332272521785e-05, + "loss": 45.0061, + "step": 79670 + }, + { + "epoch": 0.3219172824492863, + "grad_norm": 418.3061828613281, + "learning_rate": 4.306090932941708e-05, + "loss": 45.161, + "step": 79680 + }, + { + "epoch": 0.32195768371465394, + "grad_norm": 504.8469543457031, + "learning_rate": 4.3058495581509586e-05, + "loss": 89.2983, + "step": 79690 + }, + { + "epoch": 0.3219980849800216, + "grad_norm": 510.06805419921875, + "learning_rate": 4.305608148154242e-05, + "loss": 57.7559, + "step": 79700 + }, + { + "epoch": 0.3220384862453892, + "grad_norm": 1332.9613037109375, + "learning_rate": 4.305366702956265e-05, + "loss": 54.3312, + "step": 79710 + }, + { + "epoch": 0.32207888751075686, + "grad_norm": 608.1598510742188, + "learning_rate": 4.305125222561736e-05, + "loss": 63.2103, + "step": 79720 + }, + { + "epoch": 0.32211928877612445, + "grad_norm": 504.0339050292969, + "learning_rate": 4.304883706975359e-05, + "loss": 45.3614, + "step": 79730 + }, + { + "epoch": 0.3221596900414921, + "grad_norm": 865.779052734375, + "learning_rate": 4.304642156201847e-05, + "loss": 77.652, + "step": 79740 + }, + { + "epoch": 0.3222000913068597, + "grad_norm": 1158.892333984375, + "learning_rate": 4.304400570245906e-05, + "loss": 46.2974, + "step": 79750 + }, + { + "epoch": 0.32224049257222737, + "grad_norm": 1002.193603515625, + "learning_rate": 4.304158949112247e-05, + "loss": 78.2484, + "step": 79760 + }, + { + "epoch": 0.322280893837595, + "grad_norm": 1321.477294921875, + "learning_rate": 4.3039172928055805e-05, + "loss": 72.8844, + "step": 79770 + }, + { + "epoch": 0.32232129510296265, + "grad_norm": 188.91424560546875, + "learning_rate": 4.303675601330618e-05, + "loss": 47.1185, + "step": 79780 + }, + { + "epoch": 0.32236169636833023, + "grad_norm": 535.0440673828125, + "learning_rate": 4.3034338746920707e-05, + "loss": 63.9909, + "step": 79790 + }, + { + "epoch": 0.32240209763369787, + "grad_norm": 474.7372741699219, + "learning_rate": 4.303192112894652e-05, + "loss": 73.8586, + "step": 79800 + }, + { + "epoch": 0.3224424988990655, + "grad_norm": 813.8701782226562, + "learning_rate": 4.302950315943074e-05, + "loss": 90.6143, + "step": 79810 + }, + { + "epoch": 0.32248290016443315, + "grad_norm": 344.40325927734375, + "learning_rate": 4.3027084838420516e-05, + "loss": 36.8543, + "step": 79820 + }, + { + "epoch": 0.3225233014298008, + "grad_norm": 300.6861267089844, + "learning_rate": 4.302466616596299e-05, + "loss": 48.8195, + "step": 79830 + }, + { + "epoch": 0.32256370269516843, + "grad_norm": 782.69970703125, + "learning_rate": 4.302224714210532e-05, + "loss": 59.1578, + "step": 79840 + }, + { + "epoch": 0.32260410396053607, + "grad_norm": 801.529541015625, + "learning_rate": 4.301982776689467e-05, + "loss": 44.7871, + "step": 79850 + }, + { + "epoch": 0.32264450522590365, + "grad_norm": 926.8488159179688, + "learning_rate": 4.301740804037819e-05, + "loss": 65.3786, + "step": 79860 + }, + { + "epoch": 0.3226849064912713, + "grad_norm": 917.0177001953125, + "learning_rate": 4.301498796260307e-05, + "loss": 55.9066, + "step": 79870 + }, + { + "epoch": 0.32272530775663893, + "grad_norm": 0.0, + "learning_rate": 4.301256753361649e-05, + "loss": 50.5934, + "step": 79880 + }, + { + "epoch": 0.3227657090220066, + "grad_norm": 771.5294799804688, + "learning_rate": 4.301014675346562e-05, + "loss": 109.5298, + "step": 79890 + }, + { + "epoch": 0.3228061102873742, + "grad_norm": 562.3530883789062, + "learning_rate": 4.3007725622197674e-05, + "loss": 43.964, + "step": 79900 + }, + { + "epoch": 0.32284651155274185, + "grad_norm": 468.8666076660156, + "learning_rate": 4.300530413985985e-05, + "loss": 49.0986, + "step": 79910 + }, + { + "epoch": 0.32288691281810944, + "grad_norm": 678.0904541015625, + "learning_rate": 4.3002882306499345e-05, + "loss": 91.667, + "step": 79920 + }, + { + "epoch": 0.3229273140834771, + "grad_norm": 601.5751342773438, + "learning_rate": 4.300046012216338e-05, + "loss": 52.2733, + "step": 79930 + }, + { + "epoch": 0.3229677153488447, + "grad_norm": 1330.5103759765625, + "learning_rate": 4.299803758689919e-05, + "loss": 82.389, + "step": 79940 + }, + { + "epoch": 0.32300811661421236, + "grad_norm": 509.53277587890625, + "learning_rate": 4.299561470075397e-05, + "loss": 78.0836, + "step": 79950 + }, + { + "epoch": 0.32304851787958, + "grad_norm": 589.202392578125, + "learning_rate": 4.2993191463774997e-05, + "loss": 60.7983, + "step": 79960 + }, + { + "epoch": 0.32308891914494764, + "grad_norm": 738.194580078125, + "learning_rate": 4.299076787600948e-05, + "loss": 77.6573, + "step": 79970 + }, + { + "epoch": 0.3231293204103152, + "grad_norm": 1307.095947265625, + "learning_rate": 4.2988343937504686e-05, + "loss": 61.8344, + "step": 79980 + }, + { + "epoch": 0.32316972167568286, + "grad_norm": 656.2578125, + "learning_rate": 4.298591964830787e-05, + "loss": 95.2794, + "step": 79990 + }, + { + "epoch": 0.3232101229410505, + "grad_norm": 1574.1217041015625, + "learning_rate": 4.2983495008466276e-05, + "loss": 46.9904, + "step": 80000 + }, + { + "epoch": 0.32325052420641814, + "grad_norm": 678.7640380859375, + "learning_rate": 4.2981070018027204e-05, + "loss": 83.523, + "step": 80010 + }, + { + "epoch": 0.3232909254717858, + "grad_norm": 802.5941772460938, + "learning_rate": 4.29786446770379e-05, + "loss": 47.3945, + "step": 80020 + }, + { + "epoch": 0.3233313267371534, + "grad_norm": 880.6734008789062, + "learning_rate": 4.297621898554568e-05, + "loss": 69.5324, + "step": 80030 + }, + { + "epoch": 0.32337172800252106, + "grad_norm": 1087.5975341796875, + "learning_rate": 4.297379294359781e-05, + "loss": 68.7424, + "step": 80040 + }, + { + "epoch": 0.32341212926788865, + "grad_norm": 1480.21826171875, + "learning_rate": 4.297136655124159e-05, + "loss": 68.1467, + "step": 80050 + }, + { + "epoch": 0.3234525305332563, + "grad_norm": 375.3006286621094, + "learning_rate": 4.2968939808524323e-05, + "loss": 58.7787, + "step": 80060 + }, + { + "epoch": 0.32349293179862393, + "grad_norm": 811.9970092773438, + "learning_rate": 4.296651271549333e-05, + "loss": 72.5431, + "step": 80070 + }, + { + "epoch": 0.32353333306399157, + "grad_norm": 533.86474609375, + "learning_rate": 4.296408527219592e-05, + "loss": 87.0758, + "step": 80080 + }, + { + "epoch": 0.3235737343293592, + "grad_norm": 850.876220703125, + "learning_rate": 4.296165747867942e-05, + "loss": 41.6743, + "step": 80090 + }, + { + "epoch": 0.32361413559472685, + "grad_norm": 924.040283203125, + "learning_rate": 4.2959229334991156e-05, + "loss": 69.6225, + "step": 80100 + }, + { + "epoch": 0.32365453686009443, + "grad_norm": 351.706298828125, + "learning_rate": 4.295680084117847e-05, + "loss": 60.5032, + "step": 80110 + }, + { + "epoch": 0.3236949381254621, + "grad_norm": 893.626708984375, + "learning_rate": 4.295437199728871e-05, + "loss": 77.7679, + "step": 80120 + }, + { + "epoch": 0.3237353393908297, + "grad_norm": 652.7449951171875, + "learning_rate": 4.2951942803369225e-05, + "loss": 54.3826, + "step": 80130 + }, + { + "epoch": 0.32377574065619735, + "grad_norm": 1125.824462890625, + "learning_rate": 4.294951325946737e-05, + "loss": 93.5955, + "step": 80140 + }, + { + "epoch": 0.323816141921565, + "grad_norm": 1012.917236328125, + "learning_rate": 4.2947083365630514e-05, + "loss": 61.9179, + "step": 80150 + }, + { + "epoch": 0.32385654318693263, + "grad_norm": 807.2591552734375, + "learning_rate": 4.294465312190603e-05, + "loss": 46.5419, + "step": 80160 + }, + { + "epoch": 0.3238969444523003, + "grad_norm": 317.6124572753906, + "learning_rate": 4.294222252834129e-05, + "loss": 68.4578, + "step": 80170 + }, + { + "epoch": 0.32393734571766786, + "grad_norm": 2099.4423828125, + "learning_rate": 4.293979158498369e-05, + "loss": 87.771, + "step": 80180 + }, + { + "epoch": 0.3239777469830355, + "grad_norm": 482.209228515625, + "learning_rate": 4.293736029188061e-05, + "loss": 73.7765, + "step": 80190 + }, + { + "epoch": 0.32401814824840314, + "grad_norm": 1821.595947265625, + "learning_rate": 4.293492864907947e-05, + "loss": 84.6861, + "step": 80200 + }, + { + "epoch": 0.3240585495137708, + "grad_norm": 640.3442993164062, + "learning_rate": 4.293249665662765e-05, + "loss": 55.7724, + "step": 80210 + }, + { + "epoch": 0.3240989507791384, + "grad_norm": 974.443115234375, + "learning_rate": 4.293006431457258e-05, + "loss": 89.1751, + "step": 80220 + }, + { + "epoch": 0.32413935204450606, + "grad_norm": 625.10888671875, + "learning_rate": 4.2927631622961674e-05, + "loss": 77.496, + "step": 80230 + }, + { + "epoch": 0.32417975330987364, + "grad_norm": 267.2085266113281, + "learning_rate": 4.292519858184236e-05, + "loss": 54.031, + "step": 80240 + }, + { + "epoch": 0.3242201545752413, + "grad_norm": 543.6681518554688, + "learning_rate": 4.292276519126207e-05, + "loss": 61.7213, + "step": 80250 + }, + { + "epoch": 0.3242605558406089, + "grad_norm": 1130.407958984375, + "learning_rate": 4.292033145126825e-05, + "loss": 61.5954, + "step": 80260 + }, + { + "epoch": 0.32430095710597656, + "grad_norm": 590.1688842773438, + "learning_rate": 4.2917897361908335e-05, + "loss": 54.5805, + "step": 80270 + }, + { + "epoch": 0.3243413583713442, + "grad_norm": 1286.2869873046875, + "learning_rate": 4.291546292322979e-05, + "loss": 94.4061, + "step": 80280 + }, + { + "epoch": 0.32438175963671184, + "grad_norm": 520.0972290039062, + "learning_rate": 4.2913028135280076e-05, + "loss": 49.1347, + "step": 80290 + }, + { + "epoch": 0.3244221609020794, + "grad_norm": 775.9698486328125, + "learning_rate": 4.291059299810665e-05, + "loss": 58.7011, + "step": 80300 + }, + { + "epoch": 0.32446256216744707, + "grad_norm": 276.6414794921875, + "learning_rate": 4.2908157511757e-05, + "loss": 70.6107, + "step": 80310 + }, + { + "epoch": 0.3245029634328147, + "grad_norm": 446.62158203125, + "learning_rate": 4.290572167627859e-05, + "loss": 59.438, + "step": 80320 + }, + { + "epoch": 0.32454336469818235, + "grad_norm": 692.0556030273438, + "learning_rate": 4.290328549171893e-05, + "loss": 62.021, + "step": 80330 + }, + { + "epoch": 0.32458376596355, + "grad_norm": 446.0535583496094, + "learning_rate": 4.2900848958125485e-05, + "loss": 56.7572, + "step": 80340 + }, + { + "epoch": 0.3246241672289176, + "grad_norm": 477.32568359375, + "learning_rate": 4.289841207554578e-05, + "loss": 110.483, + "step": 80350 + }, + { + "epoch": 0.32466456849428527, + "grad_norm": 700.8925170898438, + "learning_rate": 4.289597484402732e-05, + "loss": 65.4878, + "step": 80360 + }, + { + "epoch": 0.32470496975965285, + "grad_norm": 1176.870849609375, + "learning_rate": 4.289353726361762e-05, + "loss": 61.7405, + "step": 80370 + }, + { + "epoch": 0.3247453710250205, + "grad_norm": 574.3339233398438, + "learning_rate": 4.289109933436419e-05, + "loss": 58.5538, + "step": 80380 + }, + { + "epoch": 0.32478577229038813, + "grad_norm": 928.0294799804688, + "learning_rate": 4.2888661056314574e-05, + "loss": 62.1923, + "step": 80390 + }, + { + "epoch": 0.32482617355575577, + "grad_norm": 1341.2864990234375, + "learning_rate": 4.2886222429516296e-05, + "loss": 67.9961, + "step": 80400 + }, + { + "epoch": 0.3248665748211234, + "grad_norm": 575.213134765625, + "learning_rate": 4.2883783454016915e-05, + "loss": 43.7556, + "step": 80410 + }, + { + "epoch": 0.32490697608649105, + "grad_norm": 637.3678588867188, + "learning_rate": 4.288134412986395e-05, + "loss": 72.743, + "step": 80420 + }, + { + "epoch": 0.32494737735185864, + "grad_norm": 936.275146484375, + "learning_rate": 4.287890445710499e-05, + "loss": 89.083, + "step": 80430 + }, + { + "epoch": 0.3249877786172263, + "grad_norm": 587.6963500976562, + "learning_rate": 4.287646443578758e-05, + "loss": 40.0728, + "step": 80440 + }, + { + "epoch": 0.3250281798825939, + "grad_norm": 465.3719177246094, + "learning_rate": 4.287402406595929e-05, + "loss": 53.9417, + "step": 80450 + }, + { + "epoch": 0.32506858114796156, + "grad_norm": 862.9938354492188, + "learning_rate": 4.28715833476677e-05, + "loss": 59.9648, + "step": 80460 + }, + { + "epoch": 0.3251089824133292, + "grad_norm": 902.8772583007812, + "learning_rate": 4.2869142280960396e-05, + "loss": 53.2347, + "step": 80470 + }, + { + "epoch": 0.32514938367869683, + "grad_norm": 827.6386108398438, + "learning_rate": 4.2866700865884954e-05, + "loss": 64.4303, + "step": 80480 + }, + { + "epoch": 0.3251897849440645, + "grad_norm": 1176.3564453125, + "learning_rate": 4.2864259102488984e-05, + "loss": 70.9066, + "step": 80490 + }, + { + "epoch": 0.32523018620943206, + "grad_norm": 2174.75244140625, + "learning_rate": 4.2861816990820084e-05, + "loss": 82.248, + "step": 80500 + }, + { + "epoch": 0.3252705874747997, + "grad_norm": 1319.863037109375, + "learning_rate": 4.285937453092587e-05, + "loss": 73.5724, + "step": 80510 + }, + { + "epoch": 0.32531098874016734, + "grad_norm": 802.91845703125, + "learning_rate": 4.285693172285396e-05, + "loss": 62.4872, + "step": 80520 + }, + { + "epoch": 0.325351390005535, + "grad_norm": 735.5004272460938, + "learning_rate": 4.2854488566651965e-05, + "loss": 80.5372, + "step": 80530 + }, + { + "epoch": 0.3253917912709026, + "grad_norm": 813.9871215820312, + "learning_rate": 4.2852045062367516e-05, + "loss": 59.1823, + "step": 80540 + }, + { + "epoch": 0.32543219253627026, + "grad_norm": 911.4254150390625, + "learning_rate": 4.2849601210048274e-05, + "loss": 73.9285, + "step": 80550 + }, + { + "epoch": 0.32547259380163784, + "grad_norm": 1268.9906005859375, + "learning_rate": 4.2847157009741856e-05, + "loss": 63.6592, + "step": 80560 + }, + { + "epoch": 0.3255129950670055, + "grad_norm": 514.0250244140625, + "learning_rate": 4.2844712461495926e-05, + "loss": 66.8111, + "step": 80570 + }, + { + "epoch": 0.3255533963323731, + "grad_norm": 1467.4144287109375, + "learning_rate": 4.284226756535814e-05, + "loss": 66.5214, + "step": 80580 + }, + { + "epoch": 0.32559379759774076, + "grad_norm": 416.23828125, + "learning_rate": 4.283982232137617e-05, + "loss": 47.4183, + "step": 80590 + }, + { + "epoch": 0.3256341988631084, + "grad_norm": 1595.3319091796875, + "learning_rate": 4.283737672959766e-05, + "loss": 72.357, + "step": 80600 + }, + { + "epoch": 0.32567460012847604, + "grad_norm": 1119.335693359375, + "learning_rate": 4.283493079007032e-05, + "loss": 51.9857, + "step": 80610 + }, + { + "epoch": 0.32571500139384363, + "grad_norm": 838.667236328125, + "learning_rate": 4.283248450284182e-05, + "loss": 55.8164, + "step": 80620 + }, + { + "epoch": 0.32575540265921127, + "grad_norm": 394.1136779785156, + "learning_rate": 4.283003786795986e-05, + "loss": 49.545, + "step": 80630 + }, + { + "epoch": 0.3257958039245789, + "grad_norm": 874.3743286132812, + "learning_rate": 4.2827590885472125e-05, + "loss": 64.6963, + "step": 80640 + }, + { + "epoch": 0.32583620518994655, + "grad_norm": 1160.2918701171875, + "learning_rate": 4.2825143555426326e-05, + "loss": 61.9271, + "step": 80650 + }, + { + "epoch": 0.3258766064553142, + "grad_norm": 1602.0166015625, + "learning_rate": 4.2822695877870177e-05, + "loss": 51.2635, + "step": 80660 + }, + { + "epoch": 0.32591700772068183, + "grad_norm": 382.5502014160156, + "learning_rate": 4.28202478528514e-05, + "loss": 54.0187, + "step": 80670 + }, + { + "epoch": 0.32595740898604947, + "grad_norm": 1229.9798583984375, + "learning_rate": 4.281779948041772e-05, + "loss": 60.0021, + "step": 80680 + }, + { + "epoch": 0.32599781025141705, + "grad_norm": 565.9646606445312, + "learning_rate": 4.2815350760616864e-05, + "loss": 61.7397, + "step": 80690 + }, + { + "epoch": 0.3260382115167847, + "grad_norm": 769.6617431640625, + "learning_rate": 4.2812901693496564e-05, + "loss": 60.3849, + "step": 80700 + }, + { + "epoch": 0.32607861278215233, + "grad_norm": 648.3060913085938, + "learning_rate": 4.281045227910459e-05, + "loss": 75.0846, + "step": 80710 + }, + { + "epoch": 0.32611901404752, + "grad_norm": 3067.703857421875, + "learning_rate": 4.2808002517488667e-05, + "loss": 90.7922, + "step": 80720 + }, + { + "epoch": 0.3261594153128876, + "grad_norm": 109.4122085571289, + "learning_rate": 4.280555240869657e-05, + "loss": 69.8678, + "step": 80730 + }, + { + "epoch": 0.32619981657825525, + "grad_norm": 733.5319213867188, + "learning_rate": 4.280310195277606e-05, + "loss": 81.5929, + "step": 80740 + }, + { + "epoch": 0.32624021784362284, + "grad_norm": 610.775634765625, + "learning_rate": 4.280065114977492e-05, + "loss": 57.4617, + "step": 80750 + }, + { + "epoch": 0.3262806191089905, + "grad_norm": 551.3561401367188, + "learning_rate": 4.279819999974091e-05, + "loss": 61.8333, + "step": 80760 + }, + { + "epoch": 0.3263210203743581, + "grad_norm": 685.814697265625, + "learning_rate": 4.279574850272183e-05, + "loss": 67.4173, + "step": 80770 + }, + { + "epoch": 0.32636142163972576, + "grad_norm": 1033.493896484375, + "learning_rate": 4.279329665876548e-05, + "loss": 53.4654, + "step": 80780 + }, + { + "epoch": 0.3264018229050934, + "grad_norm": 485.32745361328125, + "learning_rate": 4.2790844467919646e-05, + "loss": 54.8732, + "step": 80790 + }, + { + "epoch": 0.32644222417046104, + "grad_norm": 0.0, + "learning_rate": 4.278839193023214e-05, + "loss": 75.1927, + "step": 80800 + }, + { + "epoch": 0.3264826254358287, + "grad_norm": 743.0775756835938, + "learning_rate": 4.278593904575077e-05, + "loss": 44.6109, + "step": 80810 + }, + { + "epoch": 0.32652302670119626, + "grad_norm": 1801.4681396484375, + "learning_rate": 4.278348581452337e-05, + "loss": 76.2488, + "step": 80820 + }, + { + "epoch": 0.3265634279665639, + "grad_norm": 1143.9945068359375, + "learning_rate": 4.278103223659775e-05, + "loss": 68.8127, + "step": 80830 + }, + { + "epoch": 0.32660382923193154, + "grad_norm": 568.1923217773438, + "learning_rate": 4.2778578312021754e-05, + "loss": 46.9349, + "step": 80840 + }, + { + "epoch": 0.3266442304972992, + "grad_norm": 758.5953979492188, + "learning_rate": 4.277612404084322e-05, + "loss": 67.1757, + "step": 80850 + }, + { + "epoch": 0.3266846317626668, + "grad_norm": 696.4761352539062, + "learning_rate": 4.277366942311001e-05, + "loss": 62.2708, + "step": 80860 + }, + { + "epoch": 0.32672503302803446, + "grad_norm": 664.263671875, + "learning_rate": 4.277121445886995e-05, + "loss": 70.6147, + "step": 80870 + }, + { + "epoch": 0.32676543429340205, + "grad_norm": 184.5323944091797, + "learning_rate": 4.2768759148170915e-05, + "loss": 42.6514, + "step": 80880 + }, + { + "epoch": 0.3268058355587697, + "grad_norm": 586.1389770507812, + "learning_rate": 4.276630349106078e-05, + "loss": 38.8002, + "step": 80890 + }, + { + "epoch": 0.3268462368241373, + "grad_norm": 1424.904541015625, + "learning_rate": 4.276384748758741e-05, + "loss": 77.3322, + "step": 80900 + }, + { + "epoch": 0.32688663808950497, + "grad_norm": 859.6981811523438, + "learning_rate": 4.2761391137798676e-05, + "loss": 86.453, + "step": 80910 + }, + { + "epoch": 0.3269270393548726, + "grad_norm": 1004.2837524414062, + "learning_rate": 4.2758934441742496e-05, + "loss": 56.4649, + "step": 80920 + }, + { + "epoch": 0.32696744062024025, + "grad_norm": 1162.806396484375, + "learning_rate": 4.2756477399466735e-05, + "loss": 97.9132, + "step": 80930 + }, + { + "epoch": 0.32700784188560783, + "grad_norm": 425.3879699707031, + "learning_rate": 4.275402001101931e-05, + "loss": 68.8998, + "step": 80940 + }, + { + "epoch": 0.32704824315097547, + "grad_norm": 645.8928833007812, + "learning_rate": 4.2751562276448124e-05, + "loss": 51.0803, + "step": 80950 + }, + { + "epoch": 0.3270886444163431, + "grad_norm": 2057.58837890625, + "learning_rate": 4.274910419580108e-05, + "loss": 60.8407, + "step": 80960 + }, + { + "epoch": 0.32712904568171075, + "grad_norm": 1685.50048828125, + "learning_rate": 4.274664576912613e-05, + "loss": 66.7414, + "step": 80970 + }, + { + "epoch": 0.3271694469470784, + "grad_norm": 1081.0294189453125, + "learning_rate": 4.2744186996471174e-05, + "loss": 87.3173, + "step": 80980 + }, + { + "epoch": 0.32720984821244603, + "grad_norm": 1577.8040771484375, + "learning_rate": 4.2741727877884155e-05, + "loss": 85.7177, + "step": 80990 + }, + { + "epoch": 0.32725024947781367, + "grad_norm": 547.41259765625, + "learning_rate": 4.273926841341302e-05, + "loss": 47.8796, + "step": 81000 + }, + { + "epoch": 0.32729065074318126, + "grad_norm": 990.9943237304688, + "learning_rate": 4.273680860310572e-05, + "loss": 71.2299, + "step": 81010 + }, + { + "epoch": 0.3273310520085489, + "grad_norm": 379.1767883300781, + "learning_rate": 4.2734348447010206e-05, + "loss": 53.3839, + "step": 81020 + }, + { + "epoch": 0.32737145327391654, + "grad_norm": 280.44287109375, + "learning_rate": 4.2731887945174434e-05, + "loss": 66.233, + "step": 81030 + }, + { + "epoch": 0.3274118545392842, + "grad_norm": 659.9508056640625, + "learning_rate": 4.272942709764638e-05, + "loss": 80.5297, + "step": 81040 + }, + { + "epoch": 0.3274522558046518, + "grad_norm": 688.0112915039062, + "learning_rate": 4.2726965904474e-05, + "loss": 58.7139, + "step": 81050 + }, + { + "epoch": 0.32749265707001946, + "grad_norm": 742.99169921875, + "learning_rate": 4.2724504365705314e-05, + "loss": 46.189, + "step": 81060 + }, + { + "epoch": 0.32753305833538704, + "grad_norm": 1215.4967041015625, + "learning_rate": 4.272204248138828e-05, + "loss": 48.7161, + "step": 81070 + }, + { + "epoch": 0.3275734596007547, + "grad_norm": 575.4824829101562, + "learning_rate": 4.2719580251570915e-05, + "loss": 78.7691, + "step": 81080 + }, + { + "epoch": 0.3276138608661223, + "grad_norm": 547.03759765625, + "learning_rate": 4.2717117676301196e-05, + "loss": 42.8608, + "step": 81090 + }, + { + "epoch": 0.32765426213148996, + "grad_norm": 605.3917846679688, + "learning_rate": 4.271465475562716e-05, + "loss": 52.3641, + "step": 81100 + }, + { + "epoch": 0.3276946633968576, + "grad_norm": 402.33050537109375, + "learning_rate": 4.2712191489596796e-05, + "loss": 81.6275, + "step": 81110 + }, + { + "epoch": 0.32773506466222524, + "grad_norm": 944.0283813476562, + "learning_rate": 4.270972787825815e-05, + "loss": 44.0819, + "step": 81120 + }, + { + "epoch": 0.3277754659275929, + "grad_norm": 2794.85986328125, + "learning_rate": 4.2707263921659236e-05, + "loss": 77.4065, + "step": 81130 + }, + { + "epoch": 0.32781586719296046, + "grad_norm": 706.5042724609375, + "learning_rate": 4.27047996198481e-05, + "loss": 84.9347, + "step": 81140 + }, + { + "epoch": 0.3278562684583281, + "grad_norm": 1184.057861328125, + "learning_rate": 4.2702334972872776e-05, + "loss": 70.5365, + "step": 81150 + }, + { + "epoch": 0.32789666972369574, + "grad_norm": 548.59716796875, + "learning_rate": 4.269986998078132e-05, + "loss": 57.2449, + "step": 81160 + }, + { + "epoch": 0.3279370709890634, + "grad_norm": 712.474609375, + "learning_rate": 4.2697404643621786e-05, + "loss": 68.5629, + "step": 81170 + }, + { + "epoch": 0.327977472254431, + "grad_norm": 727.8631591796875, + "learning_rate": 4.269493896144224e-05, + "loss": 63.5557, + "step": 81180 + }, + { + "epoch": 0.32801787351979866, + "grad_norm": 1237.4130859375, + "learning_rate": 4.2692472934290746e-05, + "loss": 70.6992, + "step": 81190 + }, + { + "epoch": 0.32805827478516625, + "grad_norm": 625.5675048828125, + "learning_rate": 4.2690006562215384e-05, + "loss": 80.8208, + "step": 81200 + }, + { + "epoch": 0.3280986760505339, + "grad_norm": 561.9657592773438, + "learning_rate": 4.2687539845264235e-05, + "loss": 62.1039, + "step": 81210 + }, + { + "epoch": 0.32813907731590153, + "grad_norm": 1042.281494140625, + "learning_rate": 4.268507278348539e-05, + "loss": 39.331, + "step": 81220 + }, + { + "epoch": 0.32817947858126917, + "grad_norm": 401.18792724609375, + "learning_rate": 4.2682605376926955e-05, + "loss": 61.7103, + "step": 81230 + }, + { + "epoch": 0.3282198798466368, + "grad_norm": 1227.1231689453125, + "learning_rate": 4.268013762563702e-05, + "loss": 72.3686, + "step": 81240 + }, + { + "epoch": 0.32826028111200445, + "grad_norm": 2529.718994140625, + "learning_rate": 4.267766952966369e-05, + "loss": 111.4047, + "step": 81250 + }, + { + "epoch": 0.32830068237737203, + "grad_norm": 762.4780883789062, + "learning_rate": 4.2675201089055096e-05, + "loss": 44.7507, + "step": 81260 + }, + { + "epoch": 0.3283410836427397, + "grad_norm": 917.88720703125, + "learning_rate": 4.2672732303859365e-05, + "loss": 74.2715, + "step": 81270 + }, + { + "epoch": 0.3283814849081073, + "grad_norm": 520.3221435546875, + "learning_rate": 4.267026317412461e-05, + "loss": 54.3115, + "step": 81280 + }, + { + "epoch": 0.32842188617347495, + "grad_norm": 185.31436157226562, + "learning_rate": 4.266779369989899e-05, + "loss": 50.5796, + "step": 81290 + }, + { + "epoch": 0.3284622874388426, + "grad_norm": 1354.3519287109375, + "learning_rate": 4.2665323881230624e-05, + "loss": 101.209, + "step": 81300 + }, + { + "epoch": 0.32850268870421023, + "grad_norm": 387.4610900878906, + "learning_rate": 4.266285371816767e-05, + "loss": 80.6324, + "step": 81310 + }, + { + "epoch": 0.3285430899695779, + "grad_norm": 1534.5985107421875, + "learning_rate": 4.266038321075831e-05, + "loss": 55.0623, + "step": 81320 + }, + { + "epoch": 0.32858349123494546, + "grad_norm": 750.3333129882812, + "learning_rate": 4.265791235905067e-05, + "loss": 65.2265, + "step": 81330 + }, + { + "epoch": 0.3286238925003131, + "grad_norm": 438.5273742675781, + "learning_rate": 4.265544116309294e-05, + "loss": 82.4092, + "step": 81340 + }, + { + "epoch": 0.32866429376568074, + "grad_norm": 1133.52099609375, + "learning_rate": 4.265296962293329e-05, + "loss": 58.1583, + "step": 81350 + }, + { + "epoch": 0.3287046950310484, + "grad_norm": 632.3345336914062, + "learning_rate": 4.265049773861991e-05, + "loss": 79.6419, + "step": 81360 + }, + { + "epoch": 0.328745096296416, + "grad_norm": 778.0379028320312, + "learning_rate": 4.2648025510201e-05, + "loss": 49.8477, + "step": 81370 + }, + { + "epoch": 0.32878549756178366, + "grad_norm": 662.5733642578125, + "learning_rate": 4.2645552937724744e-05, + "loss": 58.1828, + "step": 81380 + }, + { + "epoch": 0.32882589882715124, + "grad_norm": 1248.02001953125, + "learning_rate": 4.264308002123935e-05, + "loss": 57.9085, + "step": 81390 + }, + { + "epoch": 0.3288663000925189, + "grad_norm": 645.2049560546875, + "learning_rate": 4.264060676079302e-05, + "loss": 56.5214, + "step": 81400 + }, + { + "epoch": 0.3289067013578865, + "grad_norm": 2653.940673828125, + "learning_rate": 4.2638133156433986e-05, + "loss": 68.7137, + "step": 81410 + }, + { + "epoch": 0.32894710262325416, + "grad_norm": 756.0985717773438, + "learning_rate": 4.263565920821046e-05, + "loss": 53.9748, + "step": 81420 + }, + { + "epoch": 0.3289875038886218, + "grad_norm": 449.9473571777344, + "learning_rate": 4.2633184916170677e-05, + "loss": 69.4224, + "step": 81430 + }, + { + "epoch": 0.32902790515398944, + "grad_norm": 1535.21923828125, + "learning_rate": 4.263071028036288e-05, + "loss": 66.2684, + "step": 81440 + }, + { + "epoch": 0.3290683064193571, + "grad_norm": 899.3699951171875, + "learning_rate": 4.2628235300835314e-05, + "loss": 65.152, + "step": 81450 + }, + { + "epoch": 0.32910870768472467, + "grad_norm": 956.332275390625, + "learning_rate": 4.2625759977636214e-05, + "loss": 68.6298, + "step": 81460 + }, + { + "epoch": 0.3291491089500923, + "grad_norm": 815.9563598632812, + "learning_rate": 4.262328431081386e-05, + "loss": 80.0543, + "step": 81470 + }, + { + "epoch": 0.32918951021545995, + "grad_norm": 536.765625, + "learning_rate": 4.26208083004165e-05, + "loss": 47.5851, + "step": 81480 + }, + { + "epoch": 0.3292299114808276, + "grad_norm": 564.234375, + "learning_rate": 4.261833194649241e-05, + "loss": 73.5911, + "step": 81490 + }, + { + "epoch": 0.3292703127461952, + "grad_norm": 999.4091186523438, + "learning_rate": 4.261585524908987e-05, + "loss": 68.8592, + "step": 81500 + }, + { + "epoch": 0.32931071401156287, + "grad_norm": 1350.5775146484375, + "learning_rate": 4.261337820825716e-05, + "loss": 70.5121, + "step": 81510 + }, + { + "epoch": 0.32935111527693045, + "grad_norm": 735.5888061523438, + "learning_rate": 4.261090082404258e-05, + "loss": 75.4246, + "step": 81520 + }, + { + "epoch": 0.3293915165422981, + "grad_norm": 365.2147521972656, + "learning_rate": 4.2608423096494406e-05, + "loss": 61.5511, + "step": 81530 + }, + { + "epoch": 0.32943191780766573, + "grad_norm": 1004.5228881835938, + "learning_rate": 4.260594502566097e-05, + "loss": 66.0472, + "step": 81540 + }, + { + "epoch": 0.32947231907303337, + "grad_norm": 725.521484375, + "learning_rate": 4.260346661159058e-05, + "loss": 61.7147, + "step": 81550 + }, + { + "epoch": 0.329512720338401, + "grad_norm": 1222.5792236328125, + "learning_rate": 4.260098785433154e-05, + "loss": 60.5714, + "step": 81560 + }, + { + "epoch": 0.32955312160376865, + "grad_norm": 1317.46728515625, + "learning_rate": 4.259850875393217e-05, + "loss": 75.1795, + "step": 81570 + }, + { + "epoch": 0.32959352286913624, + "grad_norm": 1155.3607177734375, + "learning_rate": 4.2596029310440824e-05, + "loss": 65.329, + "step": 81580 + }, + { + "epoch": 0.3296339241345039, + "grad_norm": 549.3975219726562, + "learning_rate": 4.259354952390582e-05, + "loss": 65.329, + "step": 81590 + }, + { + "epoch": 0.3296743253998715, + "grad_norm": 1814.1624755859375, + "learning_rate": 4.259106939437551e-05, + "loss": 66.1722, + "step": 81600 + }, + { + "epoch": 0.32971472666523916, + "grad_norm": 355.4609680175781, + "learning_rate": 4.258858892189825e-05, + "loss": 80.2931, + "step": 81610 + }, + { + "epoch": 0.3297551279306068, + "grad_norm": 570.4844360351562, + "learning_rate": 4.258610810652239e-05, + "loss": 44.5537, + "step": 81620 + }, + { + "epoch": 0.32979552919597444, + "grad_norm": 719.2767333984375, + "learning_rate": 4.258362694829629e-05, + "loss": 63.4508, + "step": 81630 + }, + { + "epoch": 0.3298359304613421, + "grad_norm": 494.90911865234375, + "learning_rate": 4.258114544726835e-05, + "loss": 67.4852, + "step": 81640 + }, + { + "epoch": 0.32987633172670966, + "grad_norm": 730.2485961914062, + "learning_rate": 4.257866360348692e-05, + "loss": 58.3942, + "step": 81650 + }, + { + "epoch": 0.3299167329920773, + "grad_norm": 528.159423828125, + "learning_rate": 4.257618141700039e-05, + "loss": 55.3444, + "step": 81660 + }, + { + "epoch": 0.32995713425744494, + "grad_norm": 911.97998046875, + "learning_rate": 4.257369888785715e-05, + "loss": 69.4855, + "step": 81670 + }, + { + "epoch": 0.3299975355228126, + "grad_norm": 1036.72021484375, + "learning_rate": 4.2571216016105614e-05, + "loss": 90.7197, + "step": 81680 + }, + { + "epoch": 0.3300379367881802, + "grad_norm": 1021.642333984375, + "learning_rate": 4.256873280179416e-05, + "loss": 96.8164, + "step": 81690 + }, + { + "epoch": 0.33007833805354786, + "grad_norm": 1377.108642578125, + "learning_rate": 4.256624924497123e-05, + "loss": 73.541, + "step": 81700 + }, + { + "epoch": 0.33011873931891544, + "grad_norm": 702.5912475585938, + "learning_rate": 4.256376534568522e-05, + "loss": 98.5258, + "step": 81710 + }, + { + "epoch": 0.3301591405842831, + "grad_norm": 3800.063720703125, + "learning_rate": 4.256128110398457e-05, + "loss": 64.3711, + "step": 81720 + }, + { + "epoch": 0.3301995418496507, + "grad_norm": 806.7174072265625, + "learning_rate": 4.25587965199177e-05, + "loss": 77.5143, + "step": 81730 + }, + { + "epoch": 0.33023994311501836, + "grad_norm": 753.8306274414062, + "learning_rate": 4.255631159353305e-05, + "loss": 85.3019, + "step": 81740 + }, + { + "epoch": 0.330280344380386, + "grad_norm": 1084.2166748046875, + "learning_rate": 4.2553826324879064e-05, + "loss": 70.9181, + "step": 81750 + }, + { + "epoch": 0.33032074564575364, + "grad_norm": 1052.1856689453125, + "learning_rate": 4.2551340714004203e-05, + "loss": 71.8444, + "step": 81760 + }, + { + "epoch": 0.3303611469111213, + "grad_norm": 1021.4315185546875, + "learning_rate": 4.254885476095691e-05, + "loss": 72.4072, + "step": 81770 + }, + { + "epoch": 0.33040154817648887, + "grad_norm": 433.3638000488281, + "learning_rate": 4.254636846578566e-05, + "loss": 95.435, + "step": 81780 + }, + { + "epoch": 0.3304419494418565, + "grad_norm": 778.073974609375, + "learning_rate": 4.254388182853894e-05, + "loss": 72.969, + "step": 81790 + }, + { + "epoch": 0.33048235070722415, + "grad_norm": 1745.2950439453125, + "learning_rate": 4.254139484926519e-05, + "loss": 81.5817, + "step": 81800 + }, + { + "epoch": 0.3305227519725918, + "grad_norm": 273.6972961425781, + "learning_rate": 4.253890752801293e-05, + "loss": 73.0842, + "step": 81810 + }, + { + "epoch": 0.33056315323795943, + "grad_norm": 711.11328125, + "learning_rate": 4.253641986483062e-05, + "loss": 66.8901, + "step": 81820 + }, + { + "epoch": 0.33060355450332707, + "grad_norm": 810.6575927734375, + "learning_rate": 4.2533931859766794e-05, + "loss": 80.1247, + "step": 81830 + }, + { + "epoch": 0.33064395576869465, + "grad_norm": 0.0, + "learning_rate": 4.253144351286994e-05, + "loss": 72.4615, + "step": 81840 + }, + { + "epoch": 0.3306843570340623, + "grad_norm": 2913.29443359375, + "learning_rate": 4.252895482418856e-05, + "loss": 76.5862, + "step": 81850 + }, + { + "epoch": 0.33072475829942993, + "grad_norm": 1155.0936279296875, + "learning_rate": 4.252646579377119e-05, + "loss": 84.624, + "step": 81860 + }, + { + "epoch": 0.3307651595647976, + "grad_norm": 587.8722534179688, + "learning_rate": 4.252397642166633e-05, + "loss": 72.8901, + "step": 81870 + }, + { + "epoch": 0.3308055608301652, + "grad_norm": 840.6516723632812, + "learning_rate": 4.252148670792254e-05, + "loss": 92.0446, + "step": 81880 + }, + { + "epoch": 0.33084596209553285, + "grad_norm": 701.7250366210938, + "learning_rate": 4.251899665258835e-05, + "loss": 30.5136, + "step": 81890 + }, + { + "epoch": 0.33088636336090044, + "grad_norm": 0.0, + "learning_rate": 4.2516506255712296e-05, + "loss": 71.1064, + "step": 81900 + }, + { + "epoch": 0.3309267646262681, + "grad_norm": 485.8135986328125, + "learning_rate": 4.251401551734293e-05, + "loss": 77.7925, + "step": 81910 + }, + { + "epoch": 0.3309671658916357, + "grad_norm": 918.968994140625, + "learning_rate": 4.2511524437528825e-05, + "loss": 79.0183, + "step": 81920 + }, + { + "epoch": 0.33100756715700336, + "grad_norm": 808.7003173828125, + "learning_rate": 4.250903301631853e-05, + "loss": 45.6422, + "step": 81930 + }, + { + "epoch": 0.331047968422371, + "grad_norm": 1006.4502563476562, + "learning_rate": 4.250654125376062e-05, + "loss": 81.9783, + "step": 81940 + }, + { + "epoch": 0.33108836968773864, + "grad_norm": 926.7799682617188, + "learning_rate": 4.250404914990367e-05, + "loss": 51.5458, + "step": 81950 + }, + { + "epoch": 0.3311287709531063, + "grad_norm": 422.4704284667969, + "learning_rate": 4.250155670479628e-05, + "loss": 68.1897, + "step": 81960 + }, + { + "epoch": 0.33116917221847386, + "grad_norm": 661.1273193359375, + "learning_rate": 4.2499063918487034e-05, + "loss": 57.1841, + "step": 81970 + }, + { + "epoch": 0.3312095734838415, + "grad_norm": 1525.314208984375, + "learning_rate": 4.2496570791024513e-05, + "loss": 85.9705, + "step": 81980 + }, + { + "epoch": 0.33124997474920914, + "grad_norm": 677.7116088867188, + "learning_rate": 4.2494077322457346e-05, + "loss": 49.9874, + "step": 81990 + }, + { + "epoch": 0.3312903760145768, + "grad_norm": 854.898193359375, + "learning_rate": 4.249158351283414e-05, + "loss": 67.6385, + "step": 82000 + }, + { + "epoch": 0.3313307772799444, + "grad_norm": 836.3336791992188, + "learning_rate": 4.24890893622035e-05, + "loss": 79.6089, + "step": 82010 + }, + { + "epoch": 0.33137117854531206, + "grad_norm": 509.12847900390625, + "learning_rate": 4.248659487061406e-05, + "loss": 30.2002, + "step": 82020 + }, + { + "epoch": 0.33141157981067965, + "grad_norm": 623.1883544921875, + "learning_rate": 4.248410003811445e-05, + "loss": 54.6345, + "step": 82030 + }, + { + "epoch": 0.3314519810760473, + "grad_norm": 1047.8133544921875, + "learning_rate": 4.248160486475331e-05, + "loss": 69.4941, + "step": 82040 + }, + { + "epoch": 0.3314923823414149, + "grad_norm": 2248.522216796875, + "learning_rate": 4.247910935057929e-05, + "loss": 75.4848, + "step": 82050 + }, + { + "epoch": 0.33153278360678257, + "grad_norm": 834.0875854492188, + "learning_rate": 4.2476613495641026e-05, + "loss": 56.126, + "step": 82060 + }, + { + "epoch": 0.3315731848721502, + "grad_norm": 629.9591064453125, + "learning_rate": 4.247411729998718e-05, + "loss": 64.6742, + "step": 82070 + }, + { + "epoch": 0.33161358613751785, + "grad_norm": 738.5386352539062, + "learning_rate": 4.247162076366643e-05, + "loss": 56.9134, + "step": 82080 + }, + { + "epoch": 0.33165398740288543, + "grad_norm": 734.9909057617188, + "learning_rate": 4.246912388672744e-05, + "loss": 58.0093, + "step": 82090 + }, + { + "epoch": 0.33169438866825307, + "grad_norm": 315.8343505859375, + "learning_rate": 4.246662666921888e-05, + "loss": 68.2145, + "step": 82100 + }, + { + "epoch": 0.3317347899336207, + "grad_norm": 471.6864318847656, + "learning_rate": 4.2464129111189444e-05, + "loss": 98.6437, + "step": 82110 + }, + { + "epoch": 0.33177519119898835, + "grad_norm": 913.5321044921875, + "learning_rate": 4.2461631212687816e-05, + "loss": 50.3258, + "step": 82120 + }, + { + "epoch": 0.331815592464356, + "grad_norm": 837.1478271484375, + "learning_rate": 4.24591329737627e-05, + "loss": 79.771, + "step": 82130 + }, + { + "epoch": 0.33185599372972363, + "grad_norm": 681.1831665039062, + "learning_rate": 4.24566343944628e-05, + "loss": 54.0234, + "step": 82140 + }, + { + "epoch": 0.33189639499509127, + "grad_norm": 497.15582275390625, + "learning_rate": 4.245413547483682e-05, + "loss": 79.7911, + "step": 82150 + }, + { + "epoch": 0.33193679626045886, + "grad_norm": 463.25885009765625, + "learning_rate": 4.245163621493349e-05, + "loss": 60.4234, + "step": 82160 + }, + { + "epoch": 0.3319771975258265, + "grad_norm": 566.7998046875, + "learning_rate": 4.244913661480152e-05, + "loss": 47.7919, + "step": 82170 + }, + { + "epoch": 0.33201759879119414, + "grad_norm": 1045.619384765625, + "learning_rate": 4.2446636674489645e-05, + "loss": 67.3056, + "step": 82180 + }, + { + "epoch": 0.3320580000565618, + "grad_norm": 605.4349975585938, + "learning_rate": 4.244413639404662e-05, + "loss": 68.3609, + "step": 82190 + }, + { + "epoch": 0.3320984013219294, + "grad_norm": 562.8687133789062, + "learning_rate": 4.244163577352116e-05, + "loss": 64.8594, + "step": 82200 + }, + { + "epoch": 0.33213880258729706, + "grad_norm": 830.25927734375, + "learning_rate": 4.243913481296205e-05, + "loss": 42.151, + "step": 82210 + }, + { + "epoch": 0.33217920385266464, + "grad_norm": 570.14453125, + "learning_rate": 4.243663351241801e-05, + "loss": 91.821, + "step": 82220 + }, + { + "epoch": 0.3322196051180323, + "grad_norm": 499.4716796875, + "learning_rate": 4.243413187193783e-05, + "loss": 56.719, + "step": 82230 + }, + { + "epoch": 0.3322600063833999, + "grad_norm": 536.1835327148438, + "learning_rate": 4.2431629891570266e-05, + "loss": 71.9568, + "step": 82240 + }, + { + "epoch": 0.33230040764876756, + "grad_norm": 907.7794799804688, + "learning_rate": 4.242912757136412e-05, + "loss": 80.2039, + "step": 82250 + }, + { + "epoch": 0.3323408089141352, + "grad_norm": 896.99267578125, + "learning_rate": 4.2426624911368146e-05, + "loss": 61.2477, + "step": 82260 + }, + { + "epoch": 0.33238121017950284, + "grad_norm": 3104.49169921875, + "learning_rate": 4.242412191163115e-05, + "loss": 64.1988, + "step": 82270 + }, + { + "epoch": 0.3324216114448705, + "grad_norm": 739.8251342773438, + "learning_rate": 4.242161857220193e-05, + "loss": 99.7195, + "step": 82280 + }, + { + "epoch": 0.33246201271023806, + "grad_norm": 717.78662109375, + "learning_rate": 4.241911489312927e-05, + "loss": 69.5272, + "step": 82290 + }, + { + "epoch": 0.3325024139756057, + "grad_norm": 494.52197265625, + "learning_rate": 4.241661087446202e-05, + "loss": 46.656, + "step": 82300 + }, + { + "epoch": 0.33254281524097334, + "grad_norm": 355.43536376953125, + "learning_rate": 4.2414106516248964e-05, + "loss": 56.7551, + "step": 82310 + }, + { + "epoch": 0.332583216506341, + "grad_norm": 1103.3970947265625, + "learning_rate": 4.241160181853894e-05, + "loss": 68.9124, + "step": 82320 + }, + { + "epoch": 0.3326236177717086, + "grad_norm": 742.4725952148438, + "learning_rate": 4.240909678138077e-05, + "loss": 83.4965, + "step": 82330 + }, + { + "epoch": 0.33266401903707626, + "grad_norm": 1117.4141845703125, + "learning_rate": 4.24065914048233e-05, + "loss": 71.4723, + "step": 82340 + }, + { + "epoch": 0.33270442030244385, + "grad_norm": 539.0693969726562, + "learning_rate": 4.2404085688915364e-05, + "loss": 53.2493, + "step": 82350 + }, + { + "epoch": 0.3327448215678115, + "grad_norm": 4054.252197265625, + "learning_rate": 4.240157963370582e-05, + "loss": 99.1258, + "step": 82360 + }, + { + "epoch": 0.33278522283317913, + "grad_norm": 826.72509765625, + "learning_rate": 4.2399073239243526e-05, + "loss": 68.9222, + "step": 82370 + }, + { + "epoch": 0.33282562409854677, + "grad_norm": 831.2509155273438, + "learning_rate": 4.239656650557734e-05, + "loss": 73.0231, + "step": 82380 + }, + { + "epoch": 0.3328660253639144, + "grad_norm": 409.9931335449219, + "learning_rate": 4.239405943275613e-05, + "loss": 45.4566, + "step": 82390 + }, + { + "epoch": 0.33290642662928205, + "grad_norm": 926.7503051757812, + "learning_rate": 4.2391552020828775e-05, + "loss": 77.0996, + "step": 82400 + }, + { + "epoch": 0.33294682789464963, + "grad_norm": 1059.4488525390625, + "learning_rate": 4.2389044269844155e-05, + "loss": 95.9397, + "step": 82410 + }, + { + "epoch": 0.3329872291600173, + "grad_norm": 0.0, + "learning_rate": 4.238653617985118e-05, + "loss": 49.413, + "step": 82420 + }, + { + "epoch": 0.3330276304253849, + "grad_norm": 1252.39453125, + "learning_rate": 4.238402775089871e-05, + "loss": 66.4441, + "step": 82430 + }, + { + "epoch": 0.33306803169075255, + "grad_norm": 1263.342529296875, + "learning_rate": 4.238151898303569e-05, + "loss": 76.9728, + "step": 82440 + }, + { + "epoch": 0.3331084329561202, + "grad_norm": 473.049072265625, + "learning_rate": 4.2379009876311e-05, + "loss": 63.0306, + "step": 82450 + }, + { + "epoch": 0.33314883422148783, + "grad_norm": 1100.3658447265625, + "learning_rate": 4.237650043077357e-05, + "loss": 107.0959, + "step": 82460 + }, + { + "epoch": 0.3331892354868555, + "grad_norm": 731.7568359375, + "learning_rate": 4.237399064647231e-05, + "loss": 105.6519, + "step": 82470 + }, + { + "epoch": 0.33322963675222306, + "grad_norm": 569.7168579101562, + "learning_rate": 4.237148052345616e-05, + "loss": 89.5767, + "step": 82480 + }, + { + "epoch": 0.3332700380175907, + "grad_norm": 650.82958984375, + "learning_rate": 4.236897006177405e-05, + "loss": 84.1082, + "step": 82490 + }, + { + "epoch": 0.33331043928295834, + "grad_norm": 2010.1734619140625, + "learning_rate": 4.2366459261474933e-05, + "loss": 60.609, + "step": 82500 + }, + { + "epoch": 0.333350840548326, + "grad_norm": 1313.4893798828125, + "learning_rate": 4.2363948122607756e-05, + "loss": 72.3615, + "step": 82510 + }, + { + "epoch": 0.3333912418136936, + "grad_norm": 538.3892211914062, + "learning_rate": 4.236143664522146e-05, + "loss": 78.7254, + "step": 82520 + }, + { + "epoch": 0.33343164307906126, + "grad_norm": 678.4175415039062, + "learning_rate": 4.235892482936502e-05, + "loss": 67.0122, + "step": 82530 + }, + { + "epoch": 0.33347204434442884, + "grad_norm": 1527.4241943359375, + "learning_rate": 4.2356412675087406e-05, + "loss": 46.1357, + "step": 82540 + }, + { + "epoch": 0.3335124456097965, + "grad_norm": 889.2577514648438, + "learning_rate": 4.23539001824376e-05, + "loss": 85.7127, + "step": 82550 + }, + { + "epoch": 0.3335528468751641, + "grad_norm": 621.6854858398438, + "learning_rate": 4.2351387351464565e-05, + "loss": 62.1339, + "step": 82560 + }, + { + "epoch": 0.33359324814053176, + "grad_norm": 606.4068603515625, + "learning_rate": 4.2348874182217305e-05, + "loss": 56.7306, + "step": 82570 + }, + { + "epoch": 0.3336336494058994, + "grad_norm": 785.7623901367188, + "learning_rate": 4.2346360674744815e-05, + "loss": 49.6323, + "step": 82580 + }, + { + "epoch": 0.33367405067126704, + "grad_norm": 1377.6890869140625, + "learning_rate": 4.234384682909608e-05, + "loss": 84.6193, + "step": 82590 + }, + { + "epoch": 0.3337144519366347, + "grad_norm": 2395.7802734375, + "learning_rate": 4.234133264532012e-05, + "loss": 70.3222, + "step": 82600 + }, + { + "epoch": 0.33375485320200227, + "grad_norm": 746.2875366210938, + "learning_rate": 4.2338818123465966e-05, + "loss": 41.8322, + "step": 82610 + }, + { + "epoch": 0.3337952544673699, + "grad_norm": 461.2134094238281, + "learning_rate": 4.2336303263582624e-05, + "loss": 40.6642, + "step": 82620 + }, + { + "epoch": 0.33383565573273755, + "grad_norm": 469.310546875, + "learning_rate": 4.233378806571912e-05, + "loss": 67.8555, + "step": 82630 + }, + { + "epoch": 0.3338760569981052, + "grad_norm": 827.2962036132812, + "learning_rate": 4.2331272529924495e-05, + "loss": 74.3388, + "step": 82640 + }, + { + "epoch": 0.3339164582634728, + "grad_norm": 544.0149536132812, + "learning_rate": 4.2328756656247795e-05, + "loss": 79.4444, + "step": 82650 + }, + { + "epoch": 0.33395685952884047, + "grad_norm": 383.1783142089844, + "learning_rate": 4.2326240444738055e-05, + "loss": 51.7502, + "step": 82660 + }, + { + "epoch": 0.33399726079420805, + "grad_norm": 1237.60009765625, + "learning_rate": 4.232372389544434e-05, + "loss": 73.6413, + "step": 82670 + }, + { + "epoch": 0.3340376620595757, + "grad_norm": 309.4773864746094, + "learning_rate": 4.232120700841571e-05, + "loss": 59.2111, + "step": 82680 + }, + { + "epoch": 0.33407806332494333, + "grad_norm": 1590.2628173828125, + "learning_rate": 4.2318689783701224e-05, + "loss": 59.3465, + "step": 82690 + }, + { + "epoch": 0.33411846459031097, + "grad_norm": 464.6656799316406, + "learning_rate": 4.2316172221349973e-05, + "loss": 38.0717, + "step": 82700 + }, + { + "epoch": 0.3341588658556786, + "grad_norm": 2365.112060546875, + "learning_rate": 4.231365432141103e-05, + "loss": 52.5256, + "step": 82710 + }, + { + "epoch": 0.33419926712104625, + "grad_norm": 905.1858520507812, + "learning_rate": 4.231113608393348e-05, + "loss": 78.6967, + "step": 82720 + }, + { + "epoch": 0.33423966838641384, + "grad_norm": 399.5769348144531, + "learning_rate": 4.2308617508966414e-05, + "loss": 79.4581, + "step": 82730 + }, + { + "epoch": 0.3342800696517815, + "grad_norm": 2698.52685546875, + "learning_rate": 4.230609859655895e-05, + "loss": 78.356, + "step": 82740 + }, + { + "epoch": 0.3343204709171491, + "grad_norm": 1030.6182861328125, + "learning_rate": 4.230357934676017e-05, + "loss": 57.3243, + "step": 82750 + }, + { + "epoch": 0.33436087218251676, + "grad_norm": 1275.8216552734375, + "learning_rate": 4.230105975961921e-05, + "loss": 77.0945, + "step": 82760 + }, + { + "epoch": 0.3344012734478844, + "grad_norm": 846.1837768554688, + "learning_rate": 4.229853983518518e-05, + "loss": 58.413, + "step": 82770 + }, + { + "epoch": 0.33444167471325204, + "grad_norm": 502.4288330078125, + "learning_rate": 4.229601957350722e-05, + "loss": 55.0714, + "step": 82780 + }, + { + "epoch": 0.3344820759786197, + "grad_norm": 1429.5887451171875, + "learning_rate": 4.229349897463445e-05, + "loss": 74.8495, + "step": 82790 + }, + { + "epoch": 0.33452247724398726, + "grad_norm": 534.1192016601562, + "learning_rate": 4.2290978038616e-05, + "loss": 69.4769, + "step": 82800 + }, + { + "epoch": 0.3345628785093549, + "grad_norm": 625.2440185546875, + "learning_rate": 4.228845676550105e-05, + "loss": 68.7517, + "step": 82810 + }, + { + "epoch": 0.33460327977472254, + "grad_norm": 529.8661499023438, + "learning_rate": 4.2285935155338724e-05, + "loss": 77.1349, + "step": 82820 + }, + { + "epoch": 0.3346436810400902, + "grad_norm": 1316.5428466796875, + "learning_rate": 4.22834132081782e-05, + "loss": 61.7906, + "step": 82830 + }, + { + "epoch": 0.3346840823054578, + "grad_norm": 483.9060974121094, + "learning_rate": 4.2280890924068625e-05, + "loss": 62.0083, + "step": 82840 + }, + { + "epoch": 0.33472448357082546, + "grad_norm": 882.261962890625, + "learning_rate": 4.22783683030592e-05, + "loss": 62.3824, + "step": 82850 + }, + { + "epoch": 0.33476488483619304, + "grad_norm": 733.6650390625, + "learning_rate": 4.227584534519907e-05, + "loss": 81.4946, + "step": 82860 + }, + { + "epoch": 0.3348052861015607, + "grad_norm": 604.4740600585938, + "learning_rate": 4.227332205053746e-05, + "loss": 72.0598, + "step": 82870 + }, + { + "epoch": 0.3348456873669283, + "grad_norm": 4985.6357421875, + "learning_rate": 4.2270798419123534e-05, + "loss": 66.5251, + "step": 82880 + }, + { + "epoch": 0.33488608863229596, + "grad_norm": 477.78289794921875, + "learning_rate": 4.2268274451006506e-05, + "loss": 116.5259, + "step": 82890 + }, + { + "epoch": 0.3349264898976636, + "grad_norm": 1549.6126708984375, + "learning_rate": 4.226575014623557e-05, + "loss": 49.6759, + "step": 82900 + }, + { + "epoch": 0.33496689116303124, + "grad_norm": 416.32879638671875, + "learning_rate": 4.2263225504859955e-05, + "loss": 69.3102, + "step": 82910 + }, + { + "epoch": 0.3350072924283989, + "grad_norm": 575.8262939453125, + "learning_rate": 4.226070052692886e-05, + "loss": 54.391, + "step": 82920 + }, + { + "epoch": 0.33504769369376647, + "grad_norm": 318.236572265625, + "learning_rate": 4.2258175212491537e-05, + "loss": 58.843, + "step": 82930 + }, + { + "epoch": 0.3350880949591341, + "grad_norm": 439.44488525390625, + "learning_rate": 4.2255649561597186e-05, + "loss": 62.2686, + "step": 82940 + }, + { + "epoch": 0.33512849622450175, + "grad_norm": 857.3777465820312, + "learning_rate": 4.225312357429508e-05, + "loss": 68.3515, + "step": 82950 + }, + { + "epoch": 0.3351688974898694, + "grad_norm": 1923.6248779296875, + "learning_rate": 4.225059725063444e-05, + "loss": 75.4591, + "step": 82960 + }, + { + "epoch": 0.33520929875523703, + "grad_norm": 1054.3583984375, + "learning_rate": 4.2248070590664525e-05, + "loss": 57.7459, + "step": 82970 + }, + { + "epoch": 0.33524970002060467, + "grad_norm": 974.0528564453125, + "learning_rate": 4.224554359443459e-05, + "loss": 52.0075, + "step": 82980 + }, + { + "epoch": 0.33529010128597225, + "grad_norm": 1275.845947265625, + "learning_rate": 4.22430162619939e-05, + "loss": 94.6089, + "step": 82990 + }, + { + "epoch": 0.3353305025513399, + "grad_norm": 1231.898681640625, + "learning_rate": 4.224048859339175e-05, + "loss": 65.2636, + "step": 83000 + }, + { + "epoch": 0.33537090381670753, + "grad_norm": 592.545166015625, + "learning_rate": 4.223796058867738e-05, + "loss": 74.3102, + "step": 83010 + }, + { + "epoch": 0.3354113050820752, + "grad_norm": 570.7125244140625, + "learning_rate": 4.22354322479001e-05, + "loss": 77.6862, + "step": 83020 + }, + { + "epoch": 0.3354517063474428, + "grad_norm": 814.4917602539062, + "learning_rate": 4.22329035711092e-05, + "loss": 78.4505, + "step": 83030 + }, + { + "epoch": 0.33549210761281045, + "grad_norm": 1354.454345703125, + "learning_rate": 4.223037455835397e-05, + "loss": 56.5575, + "step": 83040 + }, + { + "epoch": 0.33553250887817804, + "grad_norm": 1030.1226806640625, + "learning_rate": 4.2227845209683716e-05, + "loss": 53.6785, + "step": 83050 + }, + { + "epoch": 0.3355729101435457, + "grad_norm": 1466.65966796875, + "learning_rate": 4.222531552514775e-05, + "loss": 68.2507, + "step": 83060 + }, + { + "epoch": 0.3356133114089133, + "grad_norm": 705.6240844726562, + "learning_rate": 4.2222785504795394e-05, + "loss": 92.5086, + "step": 83070 + }, + { + "epoch": 0.33565371267428096, + "grad_norm": 703.1721801757812, + "learning_rate": 4.2220255148675956e-05, + "loss": 59.856, + "step": 83080 + }, + { + "epoch": 0.3356941139396486, + "grad_norm": 523.6635131835938, + "learning_rate": 4.221772445683878e-05, + "loss": 93.9165, + "step": 83090 + }, + { + "epoch": 0.33573451520501624, + "grad_norm": 915.0527954101562, + "learning_rate": 4.221519342933321e-05, + "loss": 62.3113, + "step": 83100 + }, + { + "epoch": 0.3357749164703839, + "grad_norm": 5522.94091796875, + "learning_rate": 4.221266206620859e-05, + "loss": 62.9414, + "step": 83110 + }, + { + "epoch": 0.33581531773575146, + "grad_norm": 619.2235717773438, + "learning_rate": 4.221013036751424e-05, + "loss": 51.0544, + "step": 83120 + }, + { + "epoch": 0.3358557190011191, + "grad_norm": 860.1320190429688, + "learning_rate": 4.220759833329955e-05, + "loss": 57.2342, + "step": 83130 + }, + { + "epoch": 0.33589612026648674, + "grad_norm": 1275.2235107421875, + "learning_rate": 4.2205065963613864e-05, + "loss": 72.2249, + "step": 83140 + }, + { + "epoch": 0.3359365215318544, + "grad_norm": 1409.1829833984375, + "learning_rate": 4.220253325850657e-05, + "loss": 75.4809, + "step": 83150 + }, + { + "epoch": 0.335976922797222, + "grad_norm": 155.7539825439453, + "learning_rate": 4.220000021802702e-05, + "loss": 66.316, + "step": 83160 + }, + { + "epoch": 0.33601732406258966, + "grad_norm": 519.6751098632812, + "learning_rate": 4.219746684222462e-05, + "loss": 46.7393, + "step": 83170 + }, + { + "epoch": 0.33605772532795725, + "grad_norm": 3975.830078125, + "learning_rate": 4.219493313114875e-05, + "loss": 113.0992, + "step": 83180 + }, + { + "epoch": 0.3360981265933249, + "grad_norm": 741.3836059570312, + "learning_rate": 4.219239908484881e-05, + "loss": 62.3929, + "step": 83190 + }, + { + "epoch": 0.3361385278586925, + "grad_norm": 1000.0194091796875, + "learning_rate": 4.218986470337419e-05, + "loss": 63.8103, + "step": 83200 + }, + { + "epoch": 0.33617892912406017, + "grad_norm": 915.8265380859375, + "learning_rate": 4.21873299867743e-05, + "loss": 64.819, + "step": 83210 + }, + { + "epoch": 0.3362193303894278, + "grad_norm": 1064.971923828125, + "learning_rate": 4.218479493509858e-05, + "loss": 75.7704, + "step": 83220 + }, + { + "epoch": 0.33625973165479545, + "grad_norm": 381.1853942871094, + "learning_rate": 4.218225954839643e-05, + "loss": 44.5471, + "step": 83230 + }, + { + "epoch": 0.3363001329201631, + "grad_norm": 2388.5380859375, + "learning_rate": 4.217972382671729e-05, + "loss": 56.8206, + "step": 83240 + }, + { + "epoch": 0.33634053418553067, + "grad_norm": 2224.6982421875, + "learning_rate": 4.2177187770110576e-05, + "loss": 75.5993, + "step": 83250 + }, + { + "epoch": 0.3363809354508983, + "grad_norm": 584.9492797851562, + "learning_rate": 4.2174651378625754e-05, + "loss": 76.4168, + "step": 83260 + }, + { + "epoch": 0.33642133671626595, + "grad_norm": 576.220947265625, + "learning_rate": 4.217211465231226e-05, + "loss": 69.955, + "step": 83270 + }, + { + "epoch": 0.3364617379816336, + "grad_norm": 419.0504455566406, + "learning_rate": 4.2169577591219545e-05, + "loss": 61.2648, + "step": 83280 + }, + { + "epoch": 0.33650213924700123, + "grad_norm": 649.897705078125, + "learning_rate": 4.216704019539707e-05, + "loss": 50.4646, + "step": 83290 + }, + { + "epoch": 0.33654254051236887, + "grad_norm": 1460.0035400390625, + "learning_rate": 4.2164502464894316e-05, + "loss": 66.0741, + "step": 83300 + }, + { + "epoch": 0.33658294177773646, + "grad_norm": 618.8856201171875, + "learning_rate": 4.216196439976076e-05, + "loss": 57.7886, + "step": 83310 + }, + { + "epoch": 0.3366233430431041, + "grad_norm": 540.6898193359375, + "learning_rate": 4.2159426000045854e-05, + "loss": 62.811, + "step": 83320 + }, + { + "epoch": 0.33666374430847174, + "grad_norm": 629.9652099609375, + "learning_rate": 4.215688726579911e-05, + "loss": 61.0531, + "step": 83330 + }, + { + "epoch": 0.3367041455738394, + "grad_norm": 3191.65185546875, + "learning_rate": 4.2154348197070017e-05, + "loss": 70.6365, + "step": 83340 + }, + { + "epoch": 0.336744546839207, + "grad_norm": 483.31396484375, + "learning_rate": 4.215180879390808e-05, + "loss": 71.8596, + "step": 83350 + }, + { + "epoch": 0.33678494810457466, + "grad_norm": 633.4132080078125, + "learning_rate": 4.2149269056362794e-05, + "loss": 65.0201, + "step": 83360 + }, + { + "epoch": 0.33682534936994224, + "grad_norm": 1588.8582763671875, + "learning_rate": 4.214672898448367e-05, + "loss": 81.9684, + "step": 83370 + }, + { + "epoch": 0.3368657506353099, + "grad_norm": 1885.4788818359375, + "learning_rate": 4.214418857832025e-05, + "loss": 81.2159, + "step": 83380 + }, + { + "epoch": 0.3369061519006775, + "grad_norm": 2098.720947265625, + "learning_rate": 4.214164783792205e-05, + "loss": 78.4835, + "step": 83390 + }, + { + "epoch": 0.33694655316604516, + "grad_norm": 934.3864135742188, + "learning_rate": 4.213910676333859e-05, + "loss": 51.1806, + "step": 83400 + }, + { + "epoch": 0.3369869544314128, + "grad_norm": 561.3240356445312, + "learning_rate": 4.213656535461942e-05, + "loss": 83.0985, + "step": 83410 + }, + { + "epoch": 0.33702735569678044, + "grad_norm": 517.56982421875, + "learning_rate": 4.213402361181409e-05, + "loss": 52.2328, + "step": 83420 + }, + { + "epoch": 0.3370677569621481, + "grad_norm": 501.65325927734375, + "learning_rate": 4.213148153497215e-05, + "loss": 45.5168, + "step": 83430 + }, + { + "epoch": 0.33710815822751566, + "grad_norm": 619.2743530273438, + "learning_rate": 4.212893912414316e-05, + "loss": 54.3897, + "step": 83440 + }, + { + "epoch": 0.3371485594928833, + "grad_norm": 281.7502746582031, + "learning_rate": 4.212639637937668e-05, + "loss": 48.6456, + "step": 83450 + }, + { + "epoch": 0.33718896075825094, + "grad_norm": 1214.2874755859375, + "learning_rate": 4.212385330072228e-05, + "loss": 80.0758, + "step": 83460 + }, + { + "epoch": 0.3372293620236186, + "grad_norm": 639.830322265625, + "learning_rate": 4.2121309888229544e-05, + "loss": 64.5304, + "step": 83470 + }, + { + "epoch": 0.3372697632889862, + "grad_norm": 823.2178955078125, + "learning_rate": 4.2118766141948066e-05, + "loss": 52.8296, + "step": 83480 + }, + { + "epoch": 0.33731016455435386, + "grad_norm": 429.5492858886719, + "learning_rate": 4.211622206192742e-05, + "loss": 63.3038, + "step": 83490 + }, + { + "epoch": 0.33735056581972145, + "grad_norm": 0.0, + "learning_rate": 4.211367764821722e-05, + "loss": 82.3349, + "step": 83500 + }, + { + "epoch": 0.3373909670850891, + "grad_norm": 974.5023803710938, + "learning_rate": 4.211113290086706e-05, + "loss": 43.6207, + "step": 83510 + }, + { + "epoch": 0.33743136835045673, + "grad_norm": 804.0608520507812, + "learning_rate": 4.2108587819926554e-05, + "loss": 66.1226, + "step": 83520 + }, + { + "epoch": 0.33747176961582437, + "grad_norm": 269.8977355957031, + "learning_rate": 4.2106042405445325e-05, + "loss": 82.0022, + "step": 83530 + }, + { + "epoch": 0.337512170881192, + "grad_norm": 1011.4495849609375, + "learning_rate": 4.210349665747299e-05, + "loss": 62.9397, + "step": 83540 + }, + { + "epoch": 0.33755257214655965, + "grad_norm": 568.0720825195312, + "learning_rate": 4.210095057605917e-05, + "loss": 80.0488, + "step": 83550 + }, + { + "epoch": 0.3375929734119273, + "grad_norm": 3479.609375, + "learning_rate": 4.209840416125353e-05, + "loss": 88.0878, + "step": 83560 + }, + { + "epoch": 0.3376333746772949, + "grad_norm": 0.0, + "learning_rate": 4.20958574131057e-05, + "loss": 100.4558, + "step": 83570 + }, + { + "epoch": 0.3376737759426625, + "grad_norm": 2387.772216796875, + "learning_rate": 4.209331033166531e-05, + "loss": 77.3918, + "step": 83580 + }, + { + "epoch": 0.33771417720803015, + "grad_norm": 710.8726806640625, + "learning_rate": 4.209076291698205e-05, + "loss": 53.3156, + "step": 83590 + }, + { + "epoch": 0.3377545784733978, + "grad_norm": 847.5494384765625, + "learning_rate": 4.208821516910557e-05, + "loss": 64.4423, + "step": 83600 + }, + { + "epoch": 0.33779497973876543, + "grad_norm": 540.9591674804688, + "learning_rate": 4.208566708808554e-05, + "loss": 54.3956, + "step": 83610 + }, + { + "epoch": 0.3378353810041331, + "grad_norm": 426.9322814941406, + "learning_rate": 4.2083118673971613e-05, + "loss": 80.7343, + "step": 83620 + }, + { + "epoch": 0.33787578226950066, + "grad_norm": 530.9072875976562, + "learning_rate": 4.2080569926813503e-05, + "loss": 62.1767, + "step": 83630 + }, + { + "epoch": 0.3379161835348683, + "grad_norm": 3503.007080078125, + "learning_rate": 4.20780208466609e-05, + "loss": 95.5601, + "step": 83640 + }, + { + "epoch": 0.33795658480023594, + "grad_norm": 764.01708984375, + "learning_rate": 4.207547143356347e-05, + "loss": 44.9169, + "step": 83650 + }, + { + "epoch": 0.3379969860656036, + "grad_norm": 898.1088256835938, + "learning_rate": 4.207292168757095e-05, + "loss": 42.9797, + "step": 83660 + }, + { + "epoch": 0.3380373873309712, + "grad_norm": 616.1493530273438, + "learning_rate": 4.2070371608733025e-05, + "loss": 55.619, + "step": 83670 + }, + { + "epoch": 0.33807778859633886, + "grad_norm": 1211.4007568359375, + "learning_rate": 4.206782119709942e-05, + "loss": 92.8507, + "step": 83680 + }, + { + "epoch": 0.33811818986170644, + "grad_norm": 1338.9266357421875, + "learning_rate": 4.206527045271985e-05, + "loss": 60.5956, + "step": 83690 + }, + { + "epoch": 0.3381585911270741, + "grad_norm": 5582.28515625, + "learning_rate": 4.206271937564405e-05, + "loss": 60.9006, + "step": 83700 + }, + { + "epoch": 0.3381989923924417, + "grad_norm": 1023.3121337890625, + "learning_rate": 4.206016796592174e-05, + "loss": 69.0761, + "step": 83710 + }, + { + "epoch": 0.33823939365780936, + "grad_norm": 665.3640747070312, + "learning_rate": 4.2057616223602684e-05, + "loss": 63.9492, + "step": 83720 + }, + { + "epoch": 0.338279794923177, + "grad_norm": 360.83563232421875, + "learning_rate": 4.205506414873661e-05, + "loss": 40.3897, + "step": 83730 + }, + { + "epoch": 0.33832019618854464, + "grad_norm": 1596.3583984375, + "learning_rate": 4.205251174137329e-05, + "loss": 72.7595, + "step": 83740 + }, + { + "epoch": 0.3383605974539123, + "grad_norm": 361.8247985839844, + "learning_rate": 4.2049959001562464e-05, + "loss": 98.5569, + "step": 83750 + }, + { + "epoch": 0.33840099871927987, + "grad_norm": 818.7006225585938, + "learning_rate": 4.204740592935392e-05, + "loss": 88.2147, + "step": 83760 + }, + { + "epoch": 0.3384413999846475, + "grad_norm": 434.5075378417969, + "learning_rate": 4.2044852524797406e-05, + "loss": 104.2029, + "step": 83770 + }, + { + "epoch": 0.33848180125001515, + "grad_norm": 722.4409790039062, + "learning_rate": 4.204229878794273e-05, + "loss": 48.2412, + "step": 83780 + }, + { + "epoch": 0.3385222025153828, + "grad_norm": 328.6600036621094, + "learning_rate": 4.203974471883966e-05, + "loss": 50.057, + "step": 83790 + }, + { + "epoch": 0.3385626037807504, + "grad_norm": 381.5543518066406, + "learning_rate": 4.2037190317538e-05, + "loss": 62.9453, + "step": 83800 + }, + { + "epoch": 0.33860300504611807, + "grad_norm": 1003.5863037109375, + "learning_rate": 4.2034635584087535e-05, + "loss": 97.3556, + "step": 83810 + }, + { + "epoch": 0.33864340631148565, + "grad_norm": 257.1349182128906, + "learning_rate": 4.203208051853808e-05, + "loss": 65.2049, + "step": 83820 + }, + { + "epoch": 0.3386838075768533, + "grad_norm": 542.9520263671875, + "learning_rate": 4.202952512093945e-05, + "loss": 60.1434, + "step": 83830 + }, + { + "epoch": 0.33872420884222093, + "grad_norm": 742.0493774414062, + "learning_rate": 4.202696939134146e-05, + "loss": 75.3315, + "step": 83840 + }, + { + "epoch": 0.33876461010758857, + "grad_norm": 839.355712890625, + "learning_rate": 4.202441332979394e-05, + "loss": 53.2438, + "step": 83850 + }, + { + "epoch": 0.3388050113729562, + "grad_norm": 616.1432495117188, + "learning_rate": 4.20218569363467e-05, + "loss": 64.6932, + "step": 83860 + }, + { + "epoch": 0.33884541263832385, + "grad_norm": 956.6621704101562, + "learning_rate": 4.2019300211049615e-05, + "loss": 66.2041, + "step": 83870 + }, + { + "epoch": 0.3388858139036915, + "grad_norm": 1587.0950927734375, + "learning_rate": 4.2016743153952505e-05, + "loss": 72.81, + "step": 83880 + }, + { + "epoch": 0.3389262151690591, + "grad_norm": 1762.2025146484375, + "learning_rate": 4.201418576510523e-05, + "loss": 55.4421, + "step": 83890 + }, + { + "epoch": 0.3389666164344267, + "grad_norm": 874.0090942382812, + "learning_rate": 4.201162804455763e-05, + "loss": 62.957, + "step": 83900 + }, + { + "epoch": 0.33900701769979436, + "grad_norm": 457.5177307128906, + "learning_rate": 4.2009069992359595e-05, + "loss": 100.793, + "step": 83910 + }, + { + "epoch": 0.339047418965162, + "grad_norm": 469.1245422363281, + "learning_rate": 4.200651160856098e-05, + "loss": 66.5053, + "step": 83920 + }, + { + "epoch": 0.33908782023052964, + "grad_norm": 394.78167724609375, + "learning_rate": 4.200395289321167e-05, + "loss": 35.0095, + "step": 83930 + }, + { + "epoch": 0.3391282214958973, + "grad_norm": 1044.5858154296875, + "learning_rate": 4.2001393846361536e-05, + "loss": 99.2867, + "step": 83940 + }, + { + "epoch": 0.33916862276126486, + "grad_norm": 793.2132568359375, + "learning_rate": 4.199883446806048e-05, + "loss": 31.189, + "step": 83950 + }, + { + "epoch": 0.3392090240266325, + "grad_norm": 441.0299072265625, + "learning_rate": 4.19962747583584e-05, + "loss": 78.0899, + "step": 83960 + }, + { + "epoch": 0.33924942529200014, + "grad_norm": 1687.0687255859375, + "learning_rate": 4.1993714717305185e-05, + "loss": 110.2709, + "step": 83970 + }, + { + "epoch": 0.3392898265573678, + "grad_norm": 645.5243530273438, + "learning_rate": 4.199115434495076e-05, + "loss": 70.5179, + "step": 83980 + }, + { + "epoch": 0.3393302278227354, + "grad_norm": 889.2882690429688, + "learning_rate": 4.1988593641345024e-05, + "loss": 59.327, + "step": 83990 + }, + { + "epoch": 0.33937062908810306, + "grad_norm": 545.3060913085938, + "learning_rate": 4.198603260653792e-05, + "loss": 62.0888, + "step": 84000 + }, + { + "epoch": 0.33941103035347064, + "grad_norm": 437.35064697265625, + "learning_rate": 4.1983471240579356e-05, + "loss": 48.4227, + "step": 84010 + }, + { + "epoch": 0.3394514316188383, + "grad_norm": 662.9463500976562, + "learning_rate": 4.198090954351928e-05, + "loss": 80.6067, + "step": 84020 + }, + { + "epoch": 0.3394918328842059, + "grad_norm": 952.4712524414062, + "learning_rate": 4.197834751540762e-05, + "loss": 84.5905, + "step": 84030 + }, + { + "epoch": 0.33953223414957356, + "grad_norm": 781.6859130859375, + "learning_rate": 4.197578515629435e-05, + "loss": 60.8523, + "step": 84040 + }, + { + "epoch": 0.3395726354149412, + "grad_norm": 975.0667724609375, + "learning_rate": 4.1973222466229404e-05, + "loss": 54.1651, + "step": 84050 + }, + { + "epoch": 0.33961303668030884, + "grad_norm": 2085.77392578125, + "learning_rate": 4.197065944526275e-05, + "loss": 60.0403, + "step": 84060 + }, + { + "epoch": 0.3396534379456765, + "grad_norm": 599.7440185546875, + "learning_rate": 4.196809609344434e-05, + "loss": 56.4642, + "step": 84070 + }, + { + "epoch": 0.33969383921104407, + "grad_norm": 1469.0364990234375, + "learning_rate": 4.196553241082418e-05, + "loss": 57.0351, + "step": 84080 + }, + { + "epoch": 0.3397342404764117, + "grad_norm": 579.5526123046875, + "learning_rate": 4.1962968397452216e-05, + "loss": 58.5323, + "step": 84090 + }, + { + "epoch": 0.33977464174177935, + "grad_norm": 639.9765625, + "learning_rate": 4.1960404053378454e-05, + "loss": 65.8241, + "step": 84100 + }, + { + "epoch": 0.339815043007147, + "grad_norm": 1099.93017578125, + "learning_rate": 4.1957839378652886e-05, + "loss": 81.618, + "step": 84110 + }, + { + "epoch": 0.33985544427251463, + "grad_norm": 490.1031494140625, + "learning_rate": 4.1955274373325506e-05, + "loss": 71.3405, + "step": 84120 + }, + { + "epoch": 0.33989584553788227, + "grad_norm": 2551.330322265625, + "learning_rate": 4.1952709037446324e-05, + "loss": 63.7435, + "step": 84130 + }, + { + "epoch": 0.33993624680324985, + "grad_norm": 930.4701538085938, + "learning_rate": 4.1950143371065355e-05, + "loss": 58.0226, + "step": 84140 + }, + { + "epoch": 0.3399766480686175, + "grad_norm": 856.0897216796875, + "learning_rate": 4.194757737423261e-05, + "loss": 58.936, + "step": 84150 + }, + { + "epoch": 0.34001704933398513, + "grad_norm": 592.7817993164062, + "learning_rate": 4.194501104699812e-05, + "loss": 55.8844, + "step": 84160 + }, + { + "epoch": 0.3400574505993528, + "grad_norm": 1155.2713623046875, + "learning_rate": 4.194244438941192e-05, + "loss": 62.0951, + "step": 84170 + }, + { + "epoch": 0.3400978518647204, + "grad_norm": 353.23583984375, + "learning_rate": 4.193987740152404e-05, + "loss": 59.3539, + "step": 84180 + }, + { + "epoch": 0.34013825313008805, + "grad_norm": 1153.6064453125, + "learning_rate": 4.193731008338453e-05, + "loss": 73.7305, + "step": 84190 + }, + { + "epoch": 0.3401786543954557, + "grad_norm": 524.4332885742188, + "learning_rate": 4.193474243504343e-05, + "loss": 77.0145, + "step": 84200 + }, + { + "epoch": 0.3402190556608233, + "grad_norm": 718.3153686523438, + "learning_rate": 4.193217445655082e-05, + "loss": 61.5601, + "step": 84210 + }, + { + "epoch": 0.3402594569261909, + "grad_norm": 1005.5006713867188, + "learning_rate": 4.192960614795675e-05, + "loss": 53.4194, + "step": 84220 + }, + { + "epoch": 0.34029985819155856, + "grad_norm": 1067.0313720703125, + "learning_rate": 4.192703750931129e-05, + "loss": 62.9559, + "step": 84230 + }, + { + "epoch": 0.3403402594569262, + "grad_norm": 794.4739990234375, + "learning_rate": 4.192446854066452e-05, + "loss": 46.9849, + "step": 84240 + }, + { + "epoch": 0.34038066072229384, + "grad_norm": 821.4888305664062, + "learning_rate": 4.192189924206652e-05, + "loss": 54.5941, + "step": 84250 + }, + { + "epoch": 0.3404210619876615, + "grad_norm": 876.5982666015625, + "learning_rate": 4.191932961356739e-05, + "loss": 91.2899, + "step": 84260 + }, + { + "epoch": 0.34046146325302906, + "grad_norm": 780.8380737304688, + "learning_rate": 4.1916759655217206e-05, + "loss": 53.8854, + "step": 84270 + }, + { + "epoch": 0.3405018645183967, + "grad_norm": 2006.7110595703125, + "learning_rate": 4.1914189367066094e-05, + "loss": 64.6178, + "step": 84280 + }, + { + "epoch": 0.34054226578376434, + "grad_norm": 663.4932861328125, + "learning_rate": 4.191161874916415e-05, + "loss": 71.5971, + "step": 84290 + }, + { + "epoch": 0.340582667049132, + "grad_norm": 1891.99658203125, + "learning_rate": 4.1909047801561484e-05, + "loss": 87.5498, + "step": 84300 + }, + { + "epoch": 0.3406230683144996, + "grad_norm": 483.8343811035156, + "learning_rate": 4.1906476524308235e-05, + "loss": 87.025, + "step": 84310 + }, + { + "epoch": 0.34066346957986726, + "grad_norm": 1536.576171875, + "learning_rate": 4.1903904917454516e-05, + "loss": 88.3288, + "step": 84320 + }, + { + "epoch": 0.34070387084523485, + "grad_norm": 708.4118041992188, + "learning_rate": 4.190133298105047e-05, + "loss": 59.6797, + "step": 84330 + }, + { + "epoch": 0.3407442721106025, + "grad_norm": 837.2510986328125, + "learning_rate": 4.189876071514624e-05, + "loss": 60.1374, + "step": 84340 + }, + { + "epoch": 0.3407846733759701, + "grad_norm": 2082.153564453125, + "learning_rate": 4.189618811979197e-05, + "loss": 65.0576, + "step": 84350 + }, + { + "epoch": 0.34082507464133777, + "grad_norm": 1336.408447265625, + "learning_rate": 4.18936151950378e-05, + "loss": 76.66, + "step": 84360 + }, + { + "epoch": 0.3408654759067054, + "grad_norm": 2176.240234375, + "learning_rate": 4.189104194093392e-05, + "loss": 84.4399, + "step": 84370 + }, + { + "epoch": 0.34090587717207305, + "grad_norm": 602.0006103515625, + "learning_rate": 4.1888468357530476e-05, + "loss": 68.7765, + "step": 84380 + }, + { + "epoch": 0.3409462784374407, + "grad_norm": 614.801513671875, + "learning_rate": 4.188589444487765e-05, + "loss": 63.4444, + "step": 84390 + }, + { + "epoch": 0.34098667970280827, + "grad_norm": 1069.0560302734375, + "learning_rate": 4.188332020302561e-05, + "loss": 65.1551, + "step": 84400 + }, + { + "epoch": 0.3410270809681759, + "grad_norm": 667.7435913085938, + "learning_rate": 4.1880745632024554e-05, + "loss": 62.9196, + "step": 84410 + }, + { + "epoch": 0.34106748223354355, + "grad_norm": 668.0552368164062, + "learning_rate": 4.187817073192468e-05, + "loss": 40.3692, + "step": 84420 + }, + { + "epoch": 0.3411078834989112, + "grad_norm": 943.7819213867188, + "learning_rate": 4.187559550277617e-05, + "loss": 57.6345, + "step": 84430 + }, + { + "epoch": 0.34114828476427883, + "grad_norm": 314.5076599121094, + "learning_rate": 4.187301994462924e-05, + "loss": 59.5329, + "step": 84440 + }, + { + "epoch": 0.34118868602964647, + "grad_norm": 759.6746826171875, + "learning_rate": 4.1870444057534095e-05, + "loss": 58.5424, + "step": 84450 + }, + { + "epoch": 0.34122908729501406, + "grad_norm": 802.6051635742188, + "learning_rate": 4.1867867841540964e-05, + "loss": 58.658, + "step": 84460 + }, + { + "epoch": 0.3412694885603817, + "grad_norm": 1856.4595947265625, + "learning_rate": 4.186529129670006e-05, + "loss": 88.8393, + "step": 84470 + }, + { + "epoch": 0.34130988982574934, + "grad_norm": 921.8723754882812, + "learning_rate": 4.1862714423061624e-05, + "loss": 62.6543, + "step": 84480 + }, + { + "epoch": 0.341350291091117, + "grad_norm": 756.6406860351562, + "learning_rate": 4.186013722067588e-05, + "loss": 70.8543, + "step": 84490 + }, + { + "epoch": 0.3413906923564846, + "grad_norm": 2013.621826171875, + "learning_rate": 4.185755968959308e-05, + "loss": 62.2787, + "step": 84500 + }, + { + "epoch": 0.34143109362185226, + "grad_norm": 831.4722900390625, + "learning_rate": 4.185498182986349e-05, + "loss": 42.0386, + "step": 84510 + }, + { + "epoch": 0.3414714948872199, + "grad_norm": 723.7086181640625, + "learning_rate": 4.185240364153734e-05, + "loss": 67.1348, + "step": 84520 + }, + { + "epoch": 0.3415118961525875, + "grad_norm": 754.1615600585938, + "learning_rate": 4.184982512466491e-05, + "loss": 80.2152, + "step": 84530 + }, + { + "epoch": 0.3415522974179551, + "grad_norm": 326.5327453613281, + "learning_rate": 4.1847246279296464e-05, + "loss": 81.1519, + "step": 84540 + }, + { + "epoch": 0.34159269868332276, + "grad_norm": 528.33935546875, + "learning_rate": 4.184466710548227e-05, + "loss": 64.1323, + "step": 84550 + }, + { + "epoch": 0.3416330999486904, + "grad_norm": 682.2901611328125, + "learning_rate": 4.184208760327263e-05, + "loss": 64.297, + "step": 84560 + }, + { + "epoch": 0.34167350121405804, + "grad_norm": 516.5701904296875, + "learning_rate": 4.183950777271781e-05, + "loss": 54.8298, + "step": 84570 + }, + { + "epoch": 0.3417139024794257, + "grad_norm": 674.0258178710938, + "learning_rate": 4.183692761386813e-05, + "loss": 66.7959, + "step": 84580 + }, + { + "epoch": 0.34175430374479326, + "grad_norm": 317.5458984375, + "learning_rate": 4.183434712677387e-05, + "loss": 43.3866, + "step": 84590 + }, + { + "epoch": 0.3417947050101609, + "grad_norm": 1114.966064453125, + "learning_rate": 4.183176631148534e-05, + "loss": 63.7891, + "step": 84600 + }, + { + "epoch": 0.34183510627552854, + "grad_norm": 887.722412109375, + "learning_rate": 4.1829185168052877e-05, + "loss": 87.129, + "step": 84610 + }, + { + "epoch": 0.3418755075408962, + "grad_norm": 735.17822265625, + "learning_rate": 4.182660369652677e-05, + "loss": 52.123, + "step": 84620 + }, + { + "epoch": 0.3419159088062638, + "grad_norm": 470.5625305175781, + "learning_rate": 4.182402189695736e-05, + "loss": 82.246, + "step": 84630 + }, + { + "epoch": 0.34195631007163146, + "grad_norm": 1080.039794921875, + "learning_rate": 4.1821439769395e-05, + "loss": 85.6906, + "step": 84640 + }, + { + "epoch": 0.34199671133699905, + "grad_norm": 529.3787841796875, + "learning_rate": 4.181885731388999e-05, + "loss": 66.6203, + "step": 84650 + }, + { + "epoch": 0.3420371126023667, + "grad_norm": 710.7547607421875, + "learning_rate": 4.1816274530492713e-05, + "loss": 64.3684, + "step": 84660 + }, + { + "epoch": 0.34207751386773433, + "grad_norm": 1438.0509033203125, + "learning_rate": 4.18136914192535e-05, + "loss": 71.3715, + "step": 84670 + }, + { + "epoch": 0.34211791513310197, + "grad_norm": 779.0742797851562, + "learning_rate": 4.181110798022271e-05, + "loss": 64.4421, + "step": 84680 + }, + { + "epoch": 0.3421583163984696, + "grad_norm": 415.0228576660156, + "learning_rate": 4.180852421345072e-05, + "loss": 58.738, + "step": 84690 + }, + { + "epoch": 0.34219871766383725, + "grad_norm": 530.8126220703125, + "learning_rate": 4.180594011898791e-05, + "loss": 64.9684, + "step": 84700 + }, + { + "epoch": 0.3422391189292049, + "grad_norm": 553.169677734375, + "learning_rate": 4.1803355696884625e-05, + "loss": 58.3446, + "step": 84710 + }, + { + "epoch": 0.3422795201945725, + "grad_norm": 766.584228515625, + "learning_rate": 4.180077094719128e-05, + "loss": 66.1009, + "step": 84720 + }, + { + "epoch": 0.3423199214599401, + "grad_norm": 801.9248657226562, + "learning_rate": 4.179818586995825e-05, + "loss": 49.8386, + "step": 84730 + }, + { + "epoch": 0.34236032272530775, + "grad_norm": 1037.901611328125, + "learning_rate": 4.1795600465235947e-05, + "loss": 79.0902, + "step": 84740 + }, + { + "epoch": 0.3424007239906754, + "grad_norm": 871.1192626953125, + "learning_rate": 4.179301473307476e-05, + "loss": 89.6048, + "step": 84750 + }, + { + "epoch": 0.34244112525604303, + "grad_norm": 257.207763671875, + "learning_rate": 4.179042867352511e-05, + "loss": 53.3181, + "step": 84760 + }, + { + "epoch": 0.3424815265214107, + "grad_norm": 1137.479736328125, + "learning_rate": 4.17878422866374e-05, + "loss": 64.5483, + "step": 84770 + }, + { + "epoch": 0.34252192778677826, + "grad_norm": 1104.8631591796875, + "learning_rate": 4.1785255572462066e-05, + "loss": 50.5376, + "step": 84780 + }, + { + "epoch": 0.3425623290521459, + "grad_norm": 674.0195922851562, + "learning_rate": 4.178266853104954e-05, + "loss": 61.1695, + "step": 84790 + }, + { + "epoch": 0.34260273031751354, + "grad_norm": 647.3651733398438, + "learning_rate": 4.178008116245024e-05, + "loss": 73.4516, + "step": 84800 + }, + { + "epoch": 0.3426431315828812, + "grad_norm": 1168.34228515625, + "learning_rate": 4.1777493466714624e-05, + "loss": 62.1275, + "step": 84810 + }, + { + "epoch": 0.3426835328482488, + "grad_norm": 2343.4892578125, + "learning_rate": 4.177490544389313e-05, + "loss": 64.6482, + "step": 84820 + }, + { + "epoch": 0.34272393411361646, + "grad_norm": 817.9552001953125, + "learning_rate": 4.177231709403622e-05, + "loss": 64.0418, + "step": 84830 + }, + { + "epoch": 0.3427643353789841, + "grad_norm": 753.8648681640625, + "learning_rate": 4.176972841719435e-05, + "loss": 57.1917, + "step": 84840 + }, + { + "epoch": 0.3428047366443517, + "grad_norm": 510.9037780761719, + "learning_rate": 4.1767139413418e-05, + "loss": 54.1073, + "step": 84850 + }, + { + "epoch": 0.3428451379097193, + "grad_norm": 641.147705078125, + "learning_rate": 4.176455008275764e-05, + "loss": 81.8066, + "step": 84860 + }, + { + "epoch": 0.34288553917508696, + "grad_norm": 1563.3798828125, + "learning_rate": 4.1761960425263735e-05, + "loss": 71.83, + "step": 84870 + }, + { + "epoch": 0.3429259404404546, + "grad_norm": 1007.089111328125, + "learning_rate": 4.1759370440986775e-05, + "loss": 65.9258, + "step": 84880 + }, + { + "epoch": 0.34296634170582224, + "grad_norm": 694.405517578125, + "learning_rate": 4.175678012997727e-05, + "loss": 53.0368, + "step": 84890 + }, + { + "epoch": 0.3430067429711899, + "grad_norm": 787.7047119140625, + "learning_rate": 4.1754189492285714e-05, + "loss": 78.8049, + "step": 84900 + }, + { + "epoch": 0.34304714423655747, + "grad_norm": 1134.1265869140625, + "learning_rate": 4.17515985279626e-05, + "loss": 52.2765, + "step": 84910 + }, + { + "epoch": 0.3430875455019251, + "grad_norm": 984.3347778320312, + "learning_rate": 4.174900723705845e-05, + "loss": 79.6157, + "step": 84920 + }, + { + "epoch": 0.34312794676729275, + "grad_norm": 671.3541259765625, + "learning_rate": 4.174641561962378e-05, + "loss": 68.477, + "step": 84930 + }, + { + "epoch": 0.3431683480326604, + "grad_norm": 870.8594970703125, + "learning_rate": 4.174382367570912e-05, + "loss": 60.4037, + "step": 84940 + }, + { + "epoch": 0.343208749298028, + "grad_norm": 11791.9111328125, + "learning_rate": 4.174123140536499e-05, + "loss": 129.9077, + "step": 84950 + }, + { + "epoch": 0.34324915056339567, + "grad_norm": 656.0787963867188, + "learning_rate": 4.1738638808641936e-05, + "loss": 68.1256, + "step": 84960 + }, + { + "epoch": 0.34328955182876325, + "grad_norm": 549.08935546875, + "learning_rate": 4.17360458855905e-05, + "loss": 46.7033, + "step": 84970 + }, + { + "epoch": 0.3433299530941309, + "grad_norm": 748.6014404296875, + "learning_rate": 4.1733452636261244e-05, + "loss": 37.716, + "step": 84980 + }, + { + "epoch": 0.34337035435949853, + "grad_norm": 870.7546997070312, + "learning_rate": 4.173085906070471e-05, + "loss": 46.5251, + "step": 84990 + }, + { + "epoch": 0.34341075562486617, + "grad_norm": 525.0897216796875, + "learning_rate": 4.172826515897146e-05, + "loss": 88.4015, + "step": 85000 + }, + { + "epoch": 0.3434511568902338, + "grad_norm": 1008.6449584960938, + "learning_rate": 4.172567093111207e-05, + "loss": 69.9759, + "step": 85010 + }, + { + "epoch": 0.34349155815560145, + "grad_norm": 694.2141723632812, + "learning_rate": 4.172307637717711e-05, + "loss": 64.9771, + "step": 85020 + }, + { + "epoch": 0.3435319594209691, + "grad_norm": 601.18505859375, + "learning_rate": 4.172048149721717e-05, + "loss": 72.7544, + "step": 85030 + }, + { + "epoch": 0.3435723606863367, + "grad_norm": 1000.3665771484375, + "learning_rate": 4.171788629128284e-05, + "loss": 79.2617, + "step": 85040 + }, + { + "epoch": 0.3436127619517043, + "grad_norm": 380.0011901855469, + "learning_rate": 4.1715290759424705e-05, + "loss": 77.5077, + "step": 85050 + }, + { + "epoch": 0.34365316321707196, + "grad_norm": 674.123779296875, + "learning_rate": 4.1712694901693374e-05, + "loss": 104.6444, + "step": 85060 + }, + { + "epoch": 0.3436935644824396, + "grad_norm": 2173.708251953125, + "learning_rate": 4.171009871813944e-05, + "loss": 66.0904, + "step": 85070 + }, + { + "epoch": 0.34373396574780724, + "grad_norm": 1144.228759765625, + "learning_rate": 4.170750220881354e-05, + "loss": 66.5176, + "step": 85080 + }, + { + "epoch": 0.3437743670131749, + "grad_norm": 648.3857421875, + "learning_rate": 4.1704905373766286e-05, + "loss": 58.1772, + "step": 85090 + }, + { + "epoch": 0.34381476827854246, + "grad_norm": 1063.3370361328125, + "learning_rate": 4.170230821304829e-05, + "loss": 56.7559, + "step": 85100 + }, + { + "epoch": 0.3438551695439101, + "grad_norm": 704.5231323242188, + "learning_rate": 4.169971072671021e-05, + "loss": 68.8396, + "step": 85110 + }, + { + "epoch": 0.34389557080927774, + "grad_norm": 857.602294921875, + "learning_rate": 4.169711291480266e-05, + "loss": 49.2389, + "step": 85120 + }, + { + "epoch": 0.3439359720746454, + "grad_norm": 1297.68701171875, + "learning_rate": 4.16945147773763e-05, + "loss": 61.9332, + "step": 85130 + }, + { + "epoch": 0.343976373340013, + "grad_norm": 603.5067138671875, + "learning_rate": 4.169191631448178e-05, + "loss": 83.4856, + "step": 85140 + }, + { + "epoch": 0.34401677460538066, + "grad_norm": 541.5191040039062, + "learning_rate": 4.1689317526169766e-05, + "loss": 44.5568, + "step": 85150 + }, + { + "epoch": 0.34405717587074824, + "grad_norm": 589.892578125, + "learning_rate": 4.168671841249091e-05, + "loss": 56.9382, + "step": 85160 + }, + { + "epoch": 0.3440975771361159, + "grad_norm": 555.4111328125, + "learning_rate": 4.168411897349588e-05, + "loss": 83.5068, + "step": 85170 + }, + { + "epoch": 0.3441379784014835, + "grad_norm": 547.9937133789062, + "learning_rate": 4.168151920923536e-05, + "loss": 34.5336, + "step": 85180 + }, + { + "epoch": 0.34417837966685116, + "grad_norm": 1771.55078125, + "learning_rate": 4.1678919119760054e-05, + "loss": 90.6765, + "step": 85190 + }, + { + "epoch": 0.3442187809322188, + "grad_norm": 696.42236328125, + "learning_rate": 4.1676318705120616e-05, + "loss": 59.9467, + "step": 85200 + }, + { + "epoch": 0.34425918219758644, + "grad_norm": 877.6763916015625, + "learning_rate": 4.167371796536777e-05, + "loss": 95.2035, + "step": 85210 + }, + { + "epoch": 0.3442995834629541, + "grad_norm": 1338.104248046875, + "learning_rate": 4.1671116900552194e-05, + "loss": 66.9077, + "step": 85220 + }, + { + "epoch": 0.34433998472832167, + "grad_norm": 992.7899169921875, + "learning_rate": 4.166851551072462e-05, + "loss": 60.782, + "step": 85230 + }, + { + "epoch": 0.3443803859936893, + "grad_norm": 1082.4932861328125, + "learning_rate": 4.166591379593575e-05, + "loss": 61.3283, + "step": 85240 + }, + { + "epoch": 0.34442078725905695, + "grad_norm": 1248.29052734375, + "learning_rate": 4.166331175623631e-05, + "loss": 65.6705, + "step": 85250 + }, + { + "epoch": 0.3444611885244246, + "grad_norm": 651.8319091796875, + "learning_rate": 4.166070939167703e-05, + "loss": 107.7256, + "step": 85260 + }, + { + "epoch": 0.34450158978979223, + "grad_norm": 950.817138671875, + "learning_rate": 4.165810670230865e-05, + "loss": 53.1864, + "step": 85270 + }, + { + "epoch": 0.34454199105515987, + "grad_norm": 871.7650756835938, + "learning_rate": 4.16555036881819e-05, + "loss": 97.4988, + "step": 85280 + }, + { + "epoch": 0.34458239232052745, + "grad_norm": 581.1294555664062, + "learning_rate": 4.1652900349347533e-05, + "loss": 52.556, + "step": 85290 + }, + { + "epoch": 0.3446227935858951, + "grad_norm": 919.3056640625, + "learning_rate": 4.165029668585629e-05, + "loss": 55.3191, + "step": 85300 + }, + { + "epoch": 0.34466319485126273, + "grad_norm": 622.2424926757812, + "learning_rate": 4.164769269775896e-05, + "loss": 107.1292, + "step": 85310 + }, + { + "epoch": 0.3447035961166304, + "grad_norm": 0.0, + "learning_rate": 4.1645088385106266e-05, + "loss": 51.1484, + "step": 85320 + }, + { + "epoch": 0.344743997381998, + "grad_norm": 2131.662353515625, + "learning_rate": 4.164248374794902e-05, + "loss": 83.7406, + "step": 85330 + }, + { + "epoch": 0.34478439864736565, + "grad_norm": 527.97509765625, + "learning_rate": 4.163987878633798e-05, + "loss": 57.3865, + "step": 85340 + }, + { + "epoch": 0.3448247999127333, + "grad_norm": 460.5814208984375, + "learning_rate": 4.163727350032394e-05, + "loss": 35.4961, + "step": 85350 + }, + { + "epoch": 0.3448652011781009, + "grad_norm": 1813.7447509765625, + "learning_rate": 4.1634667889957676e-05, + "loss": 93.811, + "step": 85360 + }, + { + "epoch": 0.3449056024434685, + "grad_norm": 516.6041259765625, + "learning_rate": 4.1632061955290017e-05, + "loss": 66.2047, + "step": 85370 + }, + { + "epoch": 0.34494600370883616, + "grad_norm": 1379.9407958984375, + "learning_rate": 4.1629455696371734e-05, + "loss": 62.7511, + "step": 85380 + }, + { + "epoch": 0.3449864049742038, + "grad_norm": 573.064453125, + "learning_rate": 4.162684911325365e-05, + "loss": 68.7898, + "step": 85390 + }, + { + "epoch": 0.34502680623957144, + "grad_norm": 620.5324096679688, + "learning_rate": 4.162424220598658e-05, + "loss": 62.7597, + "step": 85400 + }, + { + "epoch": 0.3450672075049391, + "grad_norm": 522.501708984375, + "learning_rate": 4.162163497462136e-05, + "loss": 63.0242, + "step": 85410 + }, + { + "epoch": 0.34510760877030666, + "grad_norm": 687.868896484375, + "learning_rate": 4.161902741920881e-05, + "loss": 68.5159, + "step": 85420 + }, + { + "epoch": 0.3451480100356743, + "grad_norm": 842.2858276367188, + "learning_rate": 4.1616419539799754e-05, + "loss": 46.8133, + "step": 85430 + }, + { + "epoch": 0.34518841130104194, + "grad_norm": 808.839111328125, + "learning_rate": 4.161381133644505e-05, + "loss": 58.4971, + "step": 85440 + }, + { + "epoch": 0.3452288125664096, + "grad_norm": 2333.049072265625, + "learning_rate": 4.161120280919554e-05, + "loss": 99.8404, + "step": 85450 + }, + { + "epoch": 0.3452692138317772, + "grad_norm": 667.53125, + "learning_rate": 4.160859395810208e-05, + "loss": 74.1404, + "step": 85460 + }, + { + "epoch": 0.34530961509714486, + "grad_norm": 886.0628051757812, + "learning_rate": 4.160598478321553e-05, + "loss": 62.568, + "step": 85470 + }, + { + "epoch": 0.34535001636251245, + "grad_norm": 421.406005859375, + "learning_rate": 4.160337528458676e-05, + "loss": 75.3051, + "step": 85480 + }, + { + "epoch": 0.3453904176278801, + "grad_norm": 852.6929931640625, + "learning_rate": 4.160076546226663e-05, + "loss": 67.8079, + "step": 85490 + }, + { + "epoch": 0.3454308188932477, + "grad_norm": 1610.6650390625, + "learning_rate": 4.1598155316306044e-05, + "loss": 67.7026, + "step": 85500 + }, + { + "epoch": 0.34547122015861537, + "grad_norm": 740.6041259765625, + "learning_rate": 4.1595544846755865e-05, + "loss": 69.3296, + "step": 85510 + }, + { + "epoch": 0.345511621423983, + "grad_norm": 713.525390625, + "learning_rate": 4.1592934053667004e-05, + "loss": 66.8102, + "step": 85520 + }, + { + "epoch": 0.34555202268935065, + "grad_norm": 1433.9876708984375, + "learning_rate": 4.1590322937090345e-05, + "loss": 49.2965, + "step": 85530 + }, + { + "epoch": 0.3455924239547183, + "grad_norm": 974.6792602539062, + "learning_rate": 4.15877114970768e-05, + "loss": 71.5766, + "step": 85540 + }, + { + "epoch": 0.34563282522008587, + "grad_norm": 996.3238525390625, + "learning_rate": 4.158509973367728e-05, + "loss": 82.5995, + "step": 85550 + }, + { + "epoch": 0.3456732264854535, + "grad_norm": 3350.908935546875, + "learning_rate": 4.1582487646942706e-05, + "loss": 86.8919, + "step": 85560 + }, + { + "epoch": 0.34571362775082115, + "grad_norm": 1187.7080078125, + "learning_rate": 4.157987523692399e-05, + "loss": 59.8579, + "step": 85570 + }, + { + "epoch": 0.3457540290161888, + "grad_norm": 713.9551391601562, + "learning_rate": 4.157726250367207e-05, + "loss": 61.2336, + "step": 85580 + }, + { + "epoch": 0.34579443028155643, + "grad_norm": 932.2216186523438, + "learning_rate": 4.157464944723789e-05, + "loss": 72.413, + "step": 85590 + }, + { + "epoch": 0.34583483154692407, + "grad_norm": 813.9993286132812, + "learning_rate": 4.157203606767238e-05, + "loss": 65.8375, + "step": 85600 + }, + { + "epoch": 0.34587523281229166, + "grad_norm": 648.2496948242188, + "learning_rate": 4.15694223650265e-05, + "loss": 56.1922, + "step": 85610 + }, + { + "epoch": 0.3459156340776593, + "grad_norm": 1841.92236328125, + "learning_rate": 4.156680833935119e-05, + "loss": 73.8934, + "step": 85620 + }, + { + "epoch": 0.34595603534302694, + "grad_norm": 1668.5313720703125, + "learning_rate": 4.156419399069744e-05, + "loss": 98.3084, + "step": 85630 + }, + { + "epoch": 0.3459964366083946, + "grad_norm": 451.40338134765625, + "learning_rate": 4.156157931911619e-05, + "loss": 41.3341, + "step": 85640 + }, + { + "epoch": 0.3460368378737622, + "grad_norm": 740.665771484375, + "learning_rate": 4.155896432465842e-05, + "loss": 93.8652, + "step": 85650 + }, + { + "epoch": 0.34607723913912986, + "grad_norm": 716.4408569335938, + "learning_rate": 4.155634900737513e-05, + "loss": 57.4022, + "step": 85660 + }, + { + "epoch": 0.3461176404044975, + "grad_norm": 543.2172241210938, + "learning_rate": 4.155373336731728e-05, + "loss": 49.6871, + "step": 85670 + }, + { + "epoch": 0.3461580416698651, + "grad_norm": 1185.981689453125, + "learning_rate": 4.155111740453588e-05, + "loss": 69.9553, + "step": 85680 + }, + { + "epoch": 0.3461984429352327, + "grad_norm": 817.763671875, + "learning_rate": 4.154850111908192e-05, + "loss": 54.951, + "step": 85690 + }, + { + "epoch": 0.34623884420060036, + "grad_norm": 793.4094848632812, + "learning_rate": 4.154588451100642e-05, + "loss": 98.9682, + "step": 85700 + }, + { + "epoch": 0.346279245465968, + "grad_norm": 643.4264526367188, + "learning_rate": 4.1543267580360374e-05, + "loss": 52.4739, + "step": 85710 + }, + { + "epoch": 0.34631964673133564, + "grad_norm": 1268.8812255859375, + "learning_rate": 4.154065032719481e-05, + "loss": 83.5729, + "step": 85720 + }, + { + "epoch": 0.3463600479967033, + "grad_norm": 532.7398681640625, + "learning_rate": 4.153803275156076e-05, + "loss": 52.5248, + "step": 85730 + }, + { + "epoch": 0.34640044926207086, + "grad_norm": 2376.361083984375, + "learning_rate": 4.153541485350924e-05, + "loss": 97.7747, + "step": 85740 + }, + { + "epoch": 0.3464408505274385, + "grad_norm": 2388.05908203125, + "learning_rate": 4.1532796633091296e-05, + "loss": 77.3507, + "step": 85750 + }, + { + "epoch": 0.34648125179280614, + "grad_norm": 625.3255004882812, + "learning_rate": 4.1530178090357976e-05, + "loss": 56.9691, + "step": 85760 + }, + { + "epoch": 0.3465216530581738, + "grad_norm": 792.126953125, + "learning_rate": 4.152755922536032e-05, + "loss": 51.1804, + "step": 85770 + }, + { + "epoch": 0.3465620543235414, + "grad_norm": 379.2236328125, + "learning_rate": 4.1524940038149384e-05, + "loss": 53.4376, + "step": 85780 + }, + { + "epoch": 0.34660245558890906, + "grad_norm": 1024.7281494140625, + "learning_rate": 4.152232052877624e-05, + "loss": 101.2286, + "step": 85790 + }, + { + "epoch": 0.34664285685427665, + "grad_norm": 1685.01611328125, + "learning_rate": 4.1519700697291944e-05, + "loss": 58.5797, + "step": 85800 + }, + { + "epoch": 0.3466832581196443, + "grad_norm": 642.328857421875, + "learning_rate": 4.1517080543747584e-05, + "loss": 80.0744, + "step": 85810 + }, + { + "epoch": 0.34672365938501193, + "grad_norm": 760.1344604492188, + "learning_rate": 4.151446006819423e-05, + "loss": 65.5687, + "step": 85820 + }, + { + "epoch": 0.34676406065037957, + "grad_norm": 499.570556640625, + "learning_rate": 4.151183927068298e-05, + "loss": 57.8297, + "step": 85830 + }, + { + "epoch": 0.3468044619157472, + "grad_norm": 1389.1029052734375, + "learning_rate": 4.150921815126493e-05, + "loss": 74.6479, + "step": 85840 + }, + { + "epoch": 0.34684486318111485, + "grad_norm": 1510.54150390625, + "learning_rate": 4.150659670999116e-05, + "loss": 82.4179, + "step": 85850 + }, + { + "epoch": 0.3468852644464825, + "grad_norm": 728.5758056640625, + "learning_rate": 4.150397494691279e-05, + "loss": 70.3151, + "step": 85860 + }, + { + "epoch": 0.3469256657118501, + "grad_norm": 1455.7420654296875, + "learning_rate": 4.150135286208093e-05, + "loss": 73.2886, + "step": 85870 + }, + { + "epoch": 0.3469660669772177, + "grad_norm": 328.35015869140625, + "learning_rate": 4.149873045554671e-05, + "loss": 42.4879, + "step": 85880 + }, + { + "epoch": 0.34700646824258535, + "grad_norm": 434.59527587890625, + "learning_rate": 4.1496107727361235e-05, + "loss": 45.5354, + "step": 85890 + }, + { + "epoch": 0.347046869507953, + "grad_norm": 3333.067138671875, + "learning_rate": 4.149348467757566e-05, + "loss": 98.8698, + "step": 85900 + }, + { + "epoch": 0.34708727077332063, + "grad_norm": 605.552978515625, + "learning_rate": 4.1490861306241096e-05, + "loss": 74.4398, + "step": 85910 + }, + { + "epoch": 0.3471276720386883, + "grad_norm": 1759.8612060546875, + "learning_rate": 4.148823761340871e-05, + "loss": 76.9145, + "step": 85920 + }, + { + "epoch": 0.34716807330405586, + "grad_norm": 567.1038818359375, + "learning_rate": 4.1485613599129636e-05, + "loss": 40.8438, + "step": 85930 + }, + { + "epoch": 0.3472084745694235, + "grad_norm": 3881.6103515625, + "learning_rate": 4.148298926345504e-05, + "loss": 66.3889, + "step": 85940 + }, + { + "epoch": 0.34724887583479114, + "grad_norm": 621.2008666992188, + "learning_rate": 4.148036460643608e-05, + "loss": 59.3511, + "step": 85950 + }, + { + "epoch": 0.3472892771001588, + "grad_norm": 468.0349426269531, + "learning_rate": 4.1477739628123934e-05, + "loss": 101.2036, + "step": 85960 + }, + { + "epoch": 0.3473296783655264, + "grad_norm": 1429.98095703125, + "learning_rate": 4.1475114328569776e-05, + "loss": 56.234, + "step": 85970 + }, + { + "epoch": 0.34737007963089406, + "grad_norm": 695.315673828125, + "learning_rate": 4.147248870782477e-05, + "loss": 54.0909, + "step": 85980 + }, + { + "epoch": 0.3474104808962617, + "grad_norm": 1032.339111328125, + "learning_rate": 4.146986276594012e-05, + "loss": 73.7455, + "step": 85990 + }, + { + "epoch": 0.3474508821616293, + "grad_norm": 353.8186950683594, + "learning_rate": 4.146723650296701e-05, + "loss": 55.2589, + "step": 86000 + }, + { + "epoch": 0.3474912834269969, + "grad_norm": 1116.111572265625, + "learning_rate": 4.146460991895666e-05, + "loss": 58.2723, + "step": 86010 + }, + { + "epoch": 0.34753168469236456, + "grad_norm": 416.8829040527344, + "learning_rate": 4.1461983013960245e-05, + "loss": 47.4193, + "step": 86020 + }, + { + "epoch": 0.3475720859577322, + "grad_norm": 794.2473754882812, + "learning_rate": 4.1459355788029013e-05, + "loss": 67.6769, + "step": 86030 + }, + { + "epoch": 0.34761248722309984, + "grad_norm": 727.6878051757812, + "learning_rate": 4.145672824121416e-05, + "loss": 84.4826, + "step": 86040 + }, + { + "epoch": 0.3476528884884675, + "grad_norm": 2156.322265625, + "learning_rate": 4.145410037356692e-05, + "loss": 82.6664, + "step": 86050 + }, + { + "epoch": 0.34769328975383507, + "grad_norm": 511.3225402832031, + "learning_rate": 4.145147218513852e-05, + "loss": 43.9569, + "step": 86060 + }, + { + "epoch": 0.3477336910192027, + "grad_norm": 298.62005615234375, + "learning_rate": 4.14488436759802e-05, + "loss": 45.4608, + "step": 86070 + }, + { + "epoch": 0.34777409228457035, + "grad_norm": 1355.5355224609375, + "learning_rate": 4.144621484614319e-05, + "loss": 124.826, + "step": 86080 + }, + { + "epoch": 0.347814493549938, + "grad_norm": 1087.8583984375, + "learning_rate": 4.1443585695678774e-05, + "loss": 54.7751, + "step": 86090 + }, + { + "epoch": 0.3478548948153056, + "grad_norm": 431.3752746582031, + "learning_rate": 4.1440956224638184e-05, + "loss": 63.882, + "step": 86100 + }, + { + "epoch": 0.34789529608067327, + "grad_norm": 1415.1689453125, + "learning_rate": 4.143832643307269e-05, + "loss": 55.6841, + "step": 86110 + }, + { + "epoch": 0.34793569734604085, + "grad_norm": 500.28759765625, + "learning_rate": 4.1435696321033554e-05, + "loss": 75.8191, + "step": 86120 + }, + { + "epoch": 0.3479760986114085, + "grad_norm": 725.256591796875, + "learning_rate": 4.143306588857206e-05, + "loss": 68.6684, + "step": 86130 + }, + { + "epoch": 0.34801649987677613, + "grad_norm": 1193.91357421875, + "learning_rate": 4.143043513573949e-05, + "loss": 68.4111, + "step": 86140 + }, + { + "epoch": 0.34805690114214377, + "grad_norm": 311.3026428222656, + "learning_rate": 4.1427804062587116e-05, + "loss": 60.3588, + "step": 86150 + }, + { + "epoch": 0.3480973024075114, + "grad_norm": 2180.667236328125, + "learning_rate": 4.142517266916625e-05, + "loss": 71.0806, + "step": 86160 + }, + { + "epoch": 0.34813770367287905, + "grad_norm": 561.2362060546875, + "learning_rate": 4.1422540955528186e-05, + "loss": 65.3, + "step": 86170 + }, + { + "epoch": 0.3481781049382467, + "grad_norm": 674.5610961914062, + "learning_rate": 4.141990892172424e-05, + "loss": 77.8949, + "step": 86180 + }, + { + "epoch": 0.3482185062036143, + "grad_norm": 1787.229248046875, + "learning_rate": 4.14172765678057e-05, + "loss": 88.2457, + "step": 86190 + }, + { + "epoch": 0.3482589074689819, + "grad_norm": 1689.93505859375, + "learning_rate": 4.1414643893823914e-05, + "loss": 63.5292, + "step": 86200 + }, + { + "epoch": 0.34829930873434956, + "grad_norm": 1198.643310546875, + "learning_rate": 4.141201089983019e-05, + "loss": 57.5542, + "step": 86210 + }, + { + "epoch": 0.3483397099997172, + "grad_norm": 301.3227233886719, + "learning_rate": 4.1409377585875865e-05, + "loss": 59.0172, + "step": 86220 + }, + { + "epoch": 0.34838011126508484, + "grad_norm": 4092.246826171875, + "learning_rate": 4.1406743952012275e-05, + "loss": 72.4229, + "step": 86230 + }, + { + "epoch": 0.3484205125304525, + "grad_norm": 2412.6689453125, + "learning_rate": 4.140410999829076e-05, + "loss": 118.2052, + "step": 86240 + }, + { + "epoch": 0.34846091379582006, + "grad_norm": 649.2863159179688, + "learning_rate": 4.140147572476268e-05, + "loss": 78.7629, + "step": 86250 + }, + { + "epoch": 0.3485013150611877, + "grad_norm": 634.6868286132812, + "learning_rate": 4.1398841131479395e-05, + "loss": 59.1645, + "step": 86260 + }, + { + "epoch": 0.34854171632655534, + "grad_norm": 908.8325805664062, + "learning_rate": 4.139620621849225e-05, + "loss": 60.6888, + "step": 86270 + }, + { + "epoch": 0.348582117591923, + "grad_norm": 1681.2099609375, + "learning_rate": 4.139357098585262e-05, + "loss": 82.6059, + "step": 86280 + }, + { + "epoch": 0.3486225188572906, + "grad_norm": 743.29443359375, + "learning_rate": 4.1390935433611886e-05, + "loss": 86.1345, + "step": 86290 + }, + { + "epoch": 0.34866292012265826, + "grad_norm": 906.7951049804688, + "learning_rate": 4.138829956182144e-05, + "loss": 63.9069, + "step": 86300 + }, + { + "epoch": 0.3487033213880259, + "grad_norm": 821.8445434570312, + "learning_rate": 4.138566337053264e-05, + "loss": 64.0225, + "step": 86310 + }, + { + "epoch": 0.3487437226533935, + "grad_norm": 1009.7755737304688, + "learning_rate": 4.1383026859796905e-05, + "loss": 71.5133, + "step": 86320 + }, + { + "epoch": 0.3487841239187611, + "grad_norm": 260.2582702636719, + "learning_rate": 4.138039002966563e-05, + "loss": 52.0272, + "step": 86330 + }, + { + "epoch": 0.34882452518412876, + "grad_norm": 847.3204956054688, + "learning_rate": 4.137775288019021e-05, + "loss": 59.5128, + "step": 86340 + }, + { + "epoch": 0.3488649264494964, + "grad_norm": 414.3377380371094, + "learning_rate": 4.137511541142207e-05, + "loss": 55.5041, + "step": 86350 + }, + { + "epoch": 0.34890532771486404, + "grad_norm": 2023.2427978515625, + "learning_rate": 4.137247762341262e-05, + "loss": 67.8408, + "step": 86360 + }, + { + "epoch": 0.3489457289802317, + "grad_norm": 1327.11767578125, + "learning_rate": 4.136983951621329e-05, + "loss": 50.12, + "step": 86370 + }, + { + "epoch": 0.34898613024559927, + "grad_norm": 608.749267578125, + "learning_rate": 4.136720108987552e-05, + "loss": 61.7877, + "step": 86380 + }, + { + "epoch": 0.3490265315109669, + "grad_norm": 864.80322265625, + "learning_rate": 4.136456234445073e-05, + "loss": 72.582, + "step": 86390 + }, + { + "epoch": 0.34906693277633455, + "grad_norm": 749.7403564453125, + "learning_rate": 4.136192327999037e-05, + "loss": 68.246, + "step": 86400 + }, + { + "epoch": 0.3491073340417022, + "grad_norm": 1620.177734375, + "learning_rate": 4.1359283896545895e-05, + "loss": 94.8168, + "step": 86410 + }, + { + "epoch": 0.34914773530706983, + "grad_norm": 2137.778564453125, + "learning_rate": 4.135664419416877e-05, + "loss": 54.6499, + "step": 86420 + }, + { + "epoch": 0.34918813657243747, + "grad_norm": 355.5881042480469, + "learning_rate": 4.1354004172910434e-05, + "loss": 71.6114, + "step": 86430 + }, + { + "epoch": 0.34922853783780505, + "grad_norm": 579.06005859375, + "learning_rate": 4.135136383282237e-05, + "loss": 83.3694, + "step": 86440 + }, + { + "epoch": 0.3492689391031727, + "grad_norm": 363.8780822753906, + "learning_rate": 4.134872317395604e-05, + "loss": 71.6161, + "step": 86450 + }, + { + "epoch": 0.34930934036854033, + "grad_norm": 734.9833984375, + "learning_rate": 4.134608219636294e-05, + "loss": 63.2569, + "step": 86460 + }, + { + "epoch": 0.349349741633908, + "grad_norm": 659.5712890625, + "learning_rate": 4.134344090009455e-05, + "loss": 59.2688, + "step": 86470 + }, + { + "epoch": 0.3493901428992756, + "grad_norm": 658.69677734375, + "learning_rate": 4.1340799285202376e-05, + "loss": 49.0295, + "step": 86480 + }, + { + "epoch": 0.34943054416464325, + "grad_norm": 687.8778076171875, + "learning_rate": 4.13381573517379e-05, + "loss": 84.1034, + "step": 86490 + }, + { + "epoch": 0.3494709454300109, + "grad_norm": 576.1513061523438, + "learning_rate": 4.133551509975264e-05, + "loss": 46.6568, + "step": 86500 + }, + { + "epoch": 0.3495113466953785, + "grad_norm": 435.50640869140625, + "learning_rate": 4.13328725292981e-05, + "loss": 51.9577, + "step": 86510 + }, + { + "epoch": 0.3495517479607461, + "grad_norm": 538.1187744140625, + "learning_rate": 4.13302296404258e-05, + "loss": 159.9702, + "step": 86520 + }, + { + "epoch": 0.34959214922611376, + "grad_norm": 989.9334716796875, + "learning_rate": 4.132758643318726e-05, + "loss": 90.6167, + "step": 86530 + }, + { + "epoch": 0.3496325504914814, + "grad_norm": 538.1937255859375, + "learning_rate": 4.132494290763403e-05, + "loss": 52.8154, + "step": 86540 + }, + { + "epoch": 0.34967295175684904, + "grad_norm": 2396.126708984375, + "learning_rate": 4.1322299063817624e-05, + "loss": 60.857, + "step": 86550 + }, + { + "epoch": 0.3497133530222167, + "grad_norm": 909.927978515625, + "learning_rate": 4.131965490178959e-05, + "loss": 127.2578, + "step": 86560 + }, + { + "epoch": 0.34975375428758426, + "grad_norm": 535.4166259765625, + "learning_rate": 4.131701042160149e-05, + "loss": 77.5207, + "step": 86570 + }, + { + "epoch": 0.3497941555529519, + "grad_norm": 947.1220703125, + "learning_rate": 4.131436562330487e-05, + "loss": 85.3897, + "step": 86580 + }, + { + "epoch": 0.34983455681831954, + "grad_norm": 1153.6351318359375, + "learning_rate": 4.13117205069513e-05, + "loss": 85.3414, + "step": 86590 + }, + { + "epoch": 0.3498749580836872, + "grad_norm": 1040.203857421875, + "learning_rate": 4.130907507259233e-05, + "loss": 79.24, + "step": 86600 + }, + { + "epoch": 0.3499153593490548, + "grad_norm": 737.9971313476562, + "learning_rate": 4.130642932027955e-05, + "loss": 47.7125, + "step": 86610 + }, + { + "epoch": 0.34995576061442246, + "grad_norm": 1519.4359130859375, + "learning_rate": 4.130378325006453e-05, + "loss": 59.9955, + "step": 86620 + }, + { + "epoch": 0.3499961618797901, + "grad_norm": 734.98828125, + "learning_rate": 4.130113686199887e-05, + "loss": 44.1955, + "step": 86630 + }, + { + "epoch": 0.3500365631451577, + "grad_norm": 1075.6939697265625, + "learning_rate": 4.129849015613415e-05, + "loss": 73.9399, + "step": 86640 + }, + { + "epoch": 0.3500769644105253, + "grad_norm": 3161.4248046875, + "learning_rate": 4.1295843132521973e-05, + "loss": 81.6022, + "step": 86650 + }, + { + "epoch": 0.35011736567589297, + "grad_norm": 1227.13134765625, + "learning_rate": 4.129319579121394e-05, + "loss": 67.1966, + "step": 86660 + }, + { + "epoch": 0.3501577669412606, + "grad_norm": 779.573974609375, + "learning_rate": 4.129054813226167e-05, + "loss": 72.8276, + "step": 86670 + }, + { + "epoch": 0.35019816820662825, + "grad_norm": 378.6111755371094, + "learning_rate": 4.1287900155716784e-05, + "loss": 43.1125, + "step": 86680 + }, + { + "epoch": 0.3502385694719959, + "grad_norm": 865.598876953125, + "learning_rate": 4.128525186163089e-05, + "loss": 75.8998, + "step": 86690 + }, + { + "epoch": 0.35027897073736347, + "grad_norm": 1067.35693359375, + "learning_rate": 4.128260325005564e-05, + "loss": 66.426, + "step": 86700 + }, + { + "epoch": 0.3503193720027311, + "grad_norm": 1469.167236328125, + "learning_rate": 4.127995432104264e-05, + "loss": 95.7886, + "step": 86710 + }, + { + "epoch": 0.35035977326809875, + "grad_norm": 554.118408203125, + "learning_rate": 4.127730507464356e-05, + "loss": 83.7477, + "step": 86720 + }, + { + "epoch": 0.3504001745334664, + "grad_norm": 1134.07470703125, + "learning_rate": 4.127465551091003e-05, + "loss": 123.3934, + "step": 86730 + }, + { + "epoch": 0.35044057579883403, + "grad_norm": 1663.503173828125, + "learning_rate": 4.1272005629893714e-05, + "loss": 51.4834, + "step": 86740 + }, + { + "epoch": 0.35048097706420167, + "grad_norm": 464.7097473144531, + "learning_rate": 4.1269355431646274e-05, + "loss": 47.998, + "step": 86750 + }, + { + "epoch": 0.35052137832956926, + "grad_norm": 533.934814453125, + "learning_rate": 4.126670491621938e-05, + "loss": 89.6148, + "step": 86760 + }, + { + "epoch": 0.3505617795949369, + "grad_norm": 1182.53466796875, + "learning_rate": 4.126405408366468e-05, + "loss": 58.9152, + "step": 86770 + }, + { + "epoch": 0.35060218086030454, + "grad_norm": 1748.3175048828125, + "learning_rate": 4.1261402934033886e-05, + "loss": 76.4899, + "step": 86780 + }, + { + "epoch": 0.3506425821256722, + "grad_norm": 503.2514343261719, + "learning_rate": 4.125875146737868e-05, + "loss": 94.3811, + "step": 86790 + }, + { + "epoch": 0.3506829833910398, + "grad_norm": 602.4920043945312, + "learning_rate": 4.125609968375072e-05, + "loss": 41.265, + "step": 86800 + }, + { + "epoch": 0.35072338465640746, + "grad_norm": 1253.5115966796875, + "learning_rate": 4.125344758320174e-05, + "loss": 56.0734, + "step": 86810 + }, + { + "epoch": 0.3507637859217751, + "grad_norm": 1205.50634765625, + "learning_rate": 4.125079516578344e-05, + "loss": 90.1057, + "step": 86820 + }, + { + "epoch": 0.3508041871871427, + "grad_norm": 422.2942810058594, + "learning_rate": 4.12481424315475e-05, + "loss": 60.9675, + "step": 86830 + }, + { + "epoch": 0.3508445884525103, + "grad_norm": 702.0094604492188, + "learning_rate": 4.124548938054568e-05, + "loss": 53.5536, + "step": 86840 + }, + { + "epoch": 0.35088498971787796, + "grad_norm": 604.8543090820312, + "learning_rate": 4.1242836012829665e-05, + "loss": 64.9087, + "step": 86850 + }, + { + "epoch": 0.3509253909832456, + "grad_norm": 307.5919494628906, + "learning_rate": 4.1240182328451204e-05, + "loss": 72.9601, + "step": 86860 + }, + { + "epoch": 0.35096579224861324, + "grad_norm": 1943.6920166015625, + "learning_rate": 4.123752832746203e-05, + "loss": 87.0177, + "step": 86870 + }, + { + "epoch": 0.3510061935139809, + "grad_norm": 1072.779296875, + "learning_rate": 4.123487400991388e-05, + "loss": 73.2874, + "step": 86880 + }, + { + "epoch": 0.35104659477934846, + "grad_norm": 644.7705078125, + "learning_rate": 4.1232219375858504e-05, + "loss": 82.4604, + "step": 86890 + }, + { + "epoch": 0.3510869960447161, + "grad_norm": 214.5071563720703, + "learning_rate": 4.1229564425347654e-05, + "loss": 50.0135, + "step": 86900 + }, + { + "epoch": 0.35112739731008374, + "grad_norm": 662.0516357421875, + "learning_rate": 4.122690915843309e-05, + "loss": 45.6385, + "step": 86910 + }, + { + "epoch": 0.3511677985754514, + "grad_norm": 1001.0264282226562, + "learning_rate": 4.122425357516658e-05, + "loss": 63.8673, + "step": 86920 + }, + { + "epoch": 0.351208199840819, + "grad_norm": 1112.534912109375, + "learning_rate": 4.1221597675599886e-05, + "loss": 54.2994, + "step": 86930 + }, + { + "epoch": 0.35124860110618666, + "grad_norm": 565.2496948242188, + "learning_rate": 4.1218941459784796e-05, + "loss": 50.4045, + "step": 86940 + }, + { + "epoch": 0.3512890023715543, + "grad_norm": 823.8052978515625, + "learning_rate": 4.121628492777311e-05, + "loss": 66.9651, + "step": 86950 + }, + { + "epoch": 0.3513294036369219, + "grad_norm": 875.3696899414062, + "learning_rate": 4.121362807961658e-05, + "loss": 84.8828, + "step": 86960 + }, + { + "epoch": 0.35136980490228953, + "grad_norm": 412.0173034667969, + "learning_rate": 4.1210970915367026e-05, + "loss": 78.3683, + "step": 86970 + }, + { + "epoch": 0.35141020616765717, + "grad_norm": 1158.7080078125, + "learning_rate": 4.120831343507625e-05, + "loss": 77.7848, + "step": 86980 + }, + { + "epoch": 0.3514506074330248, + "grad_norm": 261.9429016113281, + "learning_rate": 4.1205655638796065e-05, + "loss": 43.2981, + "step": 86990 + }, + { + "epoch": 0.35149100869839245, + "grad_norm": 779.2062377929688, + "learning_rate": 4.1202997526578276e-05, + "loss": 57.5505, + "step": 87000 + }, + { + "epoch": 0.3515314099637601, + "grad_norm": 742.50048828125, + "learning_rate": 4.120033909847471e-05, + "loss": 68.5733, + "step": 87010 + }, + { + "epoch": 0.3515718112291277, + "grad_norm": 610.2131958007812, + "learning_rate": 4.1197680354537186e-05, + "loss": 64.115, + "step": 87020 + }, + { + "epoch": 0.3516122124944953, + "grad_norm": 994.4957275390625, + "learning_rate": 4.119502129481755e-05, + "loss": 42.9862, + "step": 87030 + }, + { + "epoch": 0.35165261375986295, + "grad_norm": 1622.78076171875, + "learning_rate": 4.119236191936764e-05, + "loss": 55.3836, + "step": 87040 + }, + { + "epoch": 0.3516930150252306, + "grad_norm": 1582.70458984375, + "learning_rate": 4.118970222823929e-05, + "loss": 71.9621, + "step": 87050 + }, + { + "epoch": 0.35173341629059823, + "grad_norm": 1007.8423461914062, + "learning_rate": 4.118704222148436e-05, + "loss": 66.6885, + "step": 87060 + }, + { + "epoch": 0.3517738175559659, + "grad_norm": 431.42828369140625, + "learning_rate": 4.118438189915471e-05, + "loss": 83.4902, + "step": 87070 + }, + { + "epoch": 0.35181421882133346, + "grad_norm": 1616.8265380859375, + "learning_rate": 4.118172126130221e-05, + "loss": 49.96, + "step": 87080 + }, + { + "epoch": 0.3518546200867011, + "grad_norm": 358.4612121582031, + "learning_rate": 4.117906030797871e-05, + "loss": 85.6933, + "step": 87090 + }, + { + "epoch": 0.35189502135206874, + "grad_norm": 595.7027587890625, + "learning_rate": 4.1176399039236116e-05, + "loss": 67.9241, + "step": 87100 + }, + { + "epoch": 0.3519354226174364, + "grad_norm": 826.05517578125, + "learning_rate": 4.117373745512628e-05, + "loss": 92.1798, + "step": 87110 + }, + { + "epoch": 0.351975823882804, + "grad_norm": 2575.160888671875, + "learning_rate": 4.117107555570111e-05, + "loss": 80.2855, + "step": 87120 + }, + { + "epoch": 0.35201622514817166, + "grad_norm": 635.9917602539062, + "learning_rate": 4.1168413341012496e-05, + "loss": 51.3892, + "step": 87130 + }, + { + "epoch": 0.3520566264135393, + "grad_norm": 586.3646240234375, + "learning_rate": 4.116575081111235e-05, + "loss": 74.7427, + "step": 87140 + }, + { + "epoch": 0.3520970276789069, + "grad_norm": 379.6501770019531, + "learning_rate": 4.116308796605255e-05, + "loss": 42.7718, + "step": 87150 + }, + { + "epoch": 0.3521374289442745, + "grad_norm": 757.53076171875, + "learning_rate": 4.116042480588505e-05, + "loss": 79.6147, + "step": 87160 + }, + { + "epoch": 0.35217783020964216, + "grad_norm": 521.4224853515625, + "learning_rate": 4.1157761330661734e-05, + "loss": 114.8519, + "step": 87170 + }, + { + "epoch": 0.3522182314750098, + "grad_norm": 543.6570434570312, + "learning_rate": 4.115509754043454e-05, + "loss": 67.7495, + "step": 87180 + }, + { + "epoch": 0.35225863274037744, + "grad_norm": 635.4970703125, + "learning_rate": 4.115243343525541e-05, + "loss": 78.6889, + "step": 87190 + }, + { + "epoch": 0.3522990340057451, + "grad_norm": 1206.6654052734375, + "learning_rate": 4.1149769015176275e-05, + "loss": 56.5697, + "step": 87200 + }, + { + "epoch": 0.35233943527111267, + "grad_norm": 811.91455078125, + "learning_rate": 4.114710428024907e-05, + "loss": 78.0935, + "step": 87210 + }, + { + "epoch": 0.3523798365364803, + "grad_norm": 1116.96240234375, + "learning_rate": 4.114443923052577e-05, + "loss": 59.2988, + "step": 87220 + }, + { + "epoch": 0.35242023780184795, + "grad_norm": 623.7386474609375, + "learning_rate": 4.11417738660583e-05, + "loss": 56.9475, + "step": 87230 + }, + { + "epoch": 0.3524606390672156, + "grad_norm": 1740.817138671875, + "learning_rate": 4.113910818689864e-05, + "loss": 65.6938, + "step": 87240 + }, + { + "epoch": 0.3525010403325832, + "grad_norm": 752.4931640625, + "learning_rate": 4.113644219309877e-05, + "loss": 83.3021, + "step": 87250 + }, + { + "epoch": 0.35254144159795087, + "grad_norm": 1242.556640625, + "learning_rate": 4.1133775884710634e-05, + "loss": 59.4292, + "step": 87260 + }, + { + "epoch": 0.3525818428633185, + "grad_norm": 1105.26318359375, + "learning_rate": 4.113110926178625e-05, + "loss": 52.1607, + "step": 87270 + }, + { + "epoch": 0.3526222441286861, + "grad_norm": 941.704345703125, + "learning_rate": 4.112844232437757e-05, + "loss": 44.7782, + "step": 87280 + }, + { + "epoch": 0.35266264539405373, + "grad_norm": 975.8741455078125, + "learning_rate": 4.112577507253661e-05, + "loss": 63.2205, + "step": 87290 + }, + { + "epoch": 0.35270304665942137, + "grad_norm": 267.49029541015625, + "learning_rate": 4.112310750631536e-05, + "loss": 55.718, + "step": 87300 + }, + { + "epoch": 0.352743447924789, + "grad_norm": 492.7359619140625, + "learning_rate": 4.112043962576583e-05, + "loss": 87.6296, + "step": 87310 + }, + { + "epoch": 0.35278384919015665, + "grad_norm": 723.97412109375, + "learning_rate": 4.1117771430940035e-05, + "loss": 72.1795, + "step": 87320 + }, + { + "epoch": 0.3528242504555243, + "grad_norm": 1166.4947509765625, + "learning_rate": 4.111510292188998e-05, + "loss": 61.6153, + "step": 87330 + }, + { + "epoch": 0.3528646517208919, + "grad_norm": 943.3729248046875, + "learning_rate": 4.111243409866769e-05, + "loss": 78.3224, + "step": 87340 + }, + { + "epoch": 0.3529050529862595, + "grad_norm": 1874.787353515625, + "learning_rate": 4.110976496132522e-05, + "loss": 43.5677, + "step": 87350 + }, + { + "epoch": 0.35294545425162716, + "grad_norm": 567.2123413085938, + "learning_rate": 4.1107095509914584e-05, + "loss": 114.9888, + "step": 87360 + }, + { + "epoch": 0.3529858555169948, + "grad_norm": 1504.210205078125, + "learning_rate": 4.1104425744487826e-05, + "loss": 63.7791, + "step": 87370 + }, + { + "epoch": 0.35302625678236244, + "grad_norm": 252.47015380859375, + "learning_rate": 4.1101755665096996e-05, + "loss": 46.304, + "step": 87380 + }, + { + "epoch": 0.3530666580477301, + "grad_norm": 952.9064331054688, + "learning_rate": 4.109908527179415e-05, + "loss": 68.7049, + "step": 87390 + }, + { + "epoch": 0.35310705931309766, + "grad_norm": 3070.967041015625, + "learning_rate": 4.109641456463135e-05, + "loss": 70.3882, + "step": 87400 + }, + { + "epoch": 0.3531474605784653, + "grad_norm": 1895.307861328125, + "learning_rate": 4.109374354366066e-05, + "loss": 44.8805, + "step": 87410 + }, + { + "epoch": 0.35318786184383294, + "grad_norm": 1331.394287109375, + "learning_rate": 4.109107220893415e-05, + "loss": 59.9866, + "step": 87420 + }, + { + "epoch": 0.3532282631092006, + "grad_norm": 249.96646118164062, + "learning_rate": 4.1088400560503905e-05, + "loss": 66.2172, + "step": 87430 + }, + { + "epoch": 0.3532686643745682, + "grad_norm": 947.6691284179688, + "learning_rate": 4.108572859842201e-05, + "loss": 90.3438, + "step": 87440 + }, + { + "epoch": 0.35330906563993586, + "grad_norm": 573.4298706054688, + "learning_rate": 4.108305632274055e-05, + "loss": 34.3865, + "step": 87450 + }, + { + "epoch": 0.3533494669053035, + "grad_norm": 602.94384765625, + "learning_rate": 4.108038373351163e-05, + "loss": 51.0906, + "step": 87460 + }, + { + "epoch": 0.3533898681706711, + "grad_norm": 3697.798095703125, + "learning_rate": 4.107771083078735e-05, + "loss": 70.8239, + "step": 87470 + }, + { + "epoch": 0.3534302694360387, + "grad_norm": 549.9475708007812, + "learning_rate": 4.107503761461983e-05, + "loss": 48.9701, + "step": 87480 + }, + { + "epoch": 0.35347067070140636, + "grad_norm": 651.7734985351562, + "learning_rate": 4.107236408506116e-05, + "loss": 87.1631, + "step": 87490 + }, + { + "epoch": 0.353511071966774, + "grad_norm": 854.0162963867188, + "learning_rate": 4.1069690242163484e-05, + "loss": 39.3587, + "step": 87500 + }, + { + "epoch": 0.35355147323214164, + "grad_norm": 845.365234375, + "learning_rate": 4.106701608597893e-05, + "loss": 80.0646, + "step": 87510 + }, + { + "epoch": 0.3535918744975093, + "grad_norm": 571.5369262695312, + "learning_rate": 4.106434161655962e-05, + "loss": 54.8925, + "step": 87520 + }, + { + "epoch": 0.35363227576287687, + "grad_norm": 514.8745727539062, + "learning_rate": 4.106166683395769e-05, + "loss": 80.812, + "step": 87530 + }, + { + "epoch": 0.3536726770282445, + "grad_norm": 895.7685546875, + "learning_rate": 4.105899173822531e-05, + "loss": 52.7344, + "step": 87540 + }, + { + "epoch": 0.35371307829361215, + "grad_norm": 408.03485107421875, + "learning_rate": 4.1056316329414616e-05, + "loss": 63.1606, + "step": 87550 + }, + { + "epoch": 0.3537534795589798, + "grad_norm": 794.3991088867188, + "learning_rate": 4.105364060757776e-05, + "loss": 81.9505, + "step": 87560 + }, + { + "epoch": 0.35379388082434743, + "grad_norm": 308.87799072265625, + "learning_rate": 4.1050964572766923e-05, + "loss": 71.2676, + "step": 87570 + }, + { + "epoch": 0.35383428208971507, + "grad_norm": 743.8997192382812, + "learning_rate": 4.104828822503427e-05, + "loss": 54.9144, + "step": 87580 + }, + { + "epoch": 0.3538746833550827, + "grad_norm": 800.4918212890625, + "learning_rate": 4.104561156443197e-05, + "loss": 48.0048, + "step": 87590 + }, + { + "epoch": 0.3539150846204503, + "grad_norm": 217.0595245361328, + "learning_rate": 4.104293459101222e-05, + "loss": 71.4233, + "step": 87600 + }, + { + "epoch": 0.35395548588581793, + "grad_norm": 778.3648071289062, + "learning_rate": 4.104025730482719e-05, + "loss": 66.6853, + "step": 87610 + }, + { + "epoch": 0.3539958871511856, + "grad_norm": 712.687255859375, + "learning_rate": 4.103757970592909e-05, + "loss": 59.8492, + "step": 87620 + }, + { + "epoch": 0.3540362884165532, + "grad_norm": 366.0815734863281, + "learning_rate": 4.1034901794370116e-05, + "loss": 49.8096, + "step": 87630 + }, + { + "epoch": 0.35407668968192085, + "grad_norm": 1185.479736328125, + "learning_rate": 4.1032223570202474e-05, + "loss": 83.2932, + "step": 87640 + }, + { + "epoch": 0.3541170909472885, + "grad_norm": 411.51373291015625, + "learning_rate": 4.102954503347839e-05, + "loss": 45.1826, + "step": 87650 + }, + { + "epoch": 0.3541574922126561, + "grad_norm": 749.7902221679688, + "learning_rate": 4.102686618425006e-05, + "loss": 56.4452, + "step": 87660 + }, + { + "epoch": 0.3541978934780237, + "grad_norm": 573.7445068359375, + "learning_rate": 4.102418702256973e-05, + "loss": 59.8702, + "step": 87670 + }, + { + "epoch": 0.35423829474339136, + "grad_norm": 1168.4649658203125, + "learning_rate": 4.1021507548489625e-05, + "loss": 66.0522, + "step": 87680 + }, + { + "epoch": 0.354278696008759, + "grad_norm": 949.6201171875, + "learning_rate": 4.1018827762061985e-05, + "loss": 82.0665, + "step": 87690 + }, + { + "epoch": 0.35431909727412664, + "grad_norm": 1565.3529052734375, + "learning_rate": 4.101614766333904e-05, + "loss": 102.3075, + "step": 87700 + }, + { + "epoch": 0.3543594985394943, + "grad_norm": 1542.8006591796875, + "learning_rate": 4.101346725237305e-05, + "loss": 66.2509, + "step": 87710 + }, + { + "epoch": 0.35439989980486186, + "grad_norm": 527.8439331054688, + "learning_rate": 4.1010786529216284e-05, + "loss": 71.4895, + "step": 87720 + }, + { + "epoch": 0.3544403010702295, + "grad_norm": 1168.476318359375, + "learning_rate": 4.100810549392099e-05, + "loss": 85.4038, + "step": 87730 + }, + { + "epoch": 0.35448070233559714, + "grad_norm": 714.1607666015625, + "learning_rate": 4.100542414653943e-05, + "loss": 51.9524, + "step": 87740 + }, + { + "epoch": 0.3545211036009648, + "grad_norm": 2820.892578125, + "learning_rate": 4.100274248712389e-05, + "loss": 84.763, + "step": 87750 + }, + { + "epoch": 0.3545615048663324, + "grad_norm": 1056.6544189453125, + "learning_rate": 4.1000060515726647e-05, + "loss": 46.4493, + "step": 87760 + }, + { + "epoch": 0.35460190613170006, + "grad_norm": 637.3196411132812, + "learning_rate": 4.0997378232399984e-05, + "loss": 64.2221, + "step": 87770 + }, + { + "epoch": 0.3546423073970677, + "grad_norm": 1249.30078125, + "learning_rate": 4.09946956371962e-05, + "loss": 62.3628, + "step": 87780 + }, + { + "epoch": 0.3546827086624353, + "grad_norm": 562.5096435546875, + "learning_rate": 4.0992012730167584e-05, + "loss": 56.4057, + "step": 87790 + }, + { + "epoch": 0.3547231099278029, + "grad_norm": 515.3623046875, + "learning_rate": 4.098932951136645e-05, + "loss": 75.1756, + "step": 87800 + }, + { + "epoch": 0.35476351119317057, + "grad_norm": 1004.6653442382812, + "learning_rate": 4.098664598084511e-05, + "loss": 87.7658, + "step": 87810 + }, + { + "epoch": 0.3548039124585382, + "grad_norm": 514.6812133789062, + "learning_rate": 4.0983962138655873e-05, + "loss": 63.0205, + "step": 87820 + }, + { + "epoch": 0.35484431372390585, + "grad_norm": 327.2781677246094, + "learning_rate": 4.0981277984851066e-05, + "loss": 64.0254, + "step": 87830 + }, + { + "epoch": 0.3548847149892735, + "grad_norm": 945.9813842773438, + "learning_rate": 4.097859351948301e-05, + "loss": 71.4724, + "step": 87840 + }, + { + "epoch": 0.35492511625464107, + "grad_norm": 1146.121337890625, + "learning_rate": 4.0975908742604055e-05, + "loss": 74.1327, + "step": 87850 + }, + { + "epoch": 0.3549655175200087, + "grad_norm": 2756.161376953125, + "learning_rate": 4.097322365426653e-05, + "loss": 60.8623, + "step": 87860 + }, + { + "epoch": 0.35500591878537635, + "grad_norm": 616.4971923828125, + "learning_rate": 4.097053825452278e-05, + "loss": 74.3813, + "step": 87870 + }, + { + "epoch": 0.355046320050744, + "grad_norm": 796.4453735351562, + "learning_rate": 4.0967852543425175e-05, + "loss": 65.7487, + "step": 87880 + }, + { + "epoch": 0.35508672131611163, + "grad_norm": 377.42138671875, + "learning_rate": 4.0965166521026065e-05, + "loss": 46.341, + "step": 87890 + }, + { + "epoch": 0.35512712258147927, + "grad_norm": 605.7762451171875, + "learning_rate": 4.096248018737781e-05, + "loss": 81.4626, + "step": 87900 + }, + { + "epoch": 0.3551675238468469, + "grad_norm": 408.6437072753906, + "learning_rate": 4.095979354253279e-05, + "loss": 85.3279, + "step": 87910 + }, + { + "epoch": 0.3552079251122145, + "grad_norm": 673.0602416992188, + "learning_rate": 4.095710658654337e-05, + "loss": 35.6571, + "step": 87920 + }, + { + "epoch": 0.35524832637758214, + "grad_norm": 562.7644653320312, + "learning_rate": 4.0954419319461946e-05, + "loss": 68.498, + "step": 87930 + }, + { + "epoch": 0.3552887276429498, + "grad_norm": 1345.530029296875, + "learning_rate": 4.09517317413409e-05, + "loss": 54.9481, + "step": 87940 + }, + { + "epoch": 0.3553291289083174, + "grad_norm": 670.7591552734375, + "learning_rate": 4.094904385223264e-05, + "loss": 95.4115, + "step": 87950 + }, + { + "epoch": 0.35536953017368506, + "grad_norm": 6252.8359375, + "learning_rate": 4.094635565218955e-05, + "loss": 85.2645, + "step": 87960 + }, + { + "epoch": 0.3554099314390527, + "grad_norm": 699.56982421875, + "learning_rate": 4.094366714126405e-05, + "loss": 61.0998, + "step": 87970 + }, + { + "epoch": 0.3554503327044203, + "grad_norm": 572.2498779296875, + "learning_rate": 4.094097831950855e-05, + "loss": 55.9745, + "step": 87980 + }, + { + "epoch": 0.3554907339697879, + "grad_norm": 477.888671875, + "learning_rate": 4.093828918697547e-05, + "loss": 64.4699, + "step": 87990 + }, + { + "epoch": 0.35553113523515556, + "grad_norm": 1221.31982421875, + "learning_rate": 4.093559974371725e-05, + "loss": 60.6648, + "step": 88000 + }, + { + "epoch": 0.3555715365005232, + "grad_norm": 390.103759765625, + "learning_rate": 4.09329099897863e-05, + "loss": 51.6583, + "step": 88010 + }, + { + "epoch": 0.35561193776589084, + "grad_norm": 1909.0634765625, + "learning_rate": 4.0930219925235056e-05, + "loss": 62.6641, + "step": 88020 + }, + { + "epoch": 0.3556523390312585, + "grad_norm": 0.0, + "learning_rate": 4.0927529550115986e-05, + "loss": 54.3005, + "step": 88030 + }, + { + "epoch": 0.35569274029662606, + "grad_norm": 1166.9232177734375, + "learning_rate": 4.0924838864481516e-05, + "loss": 77.7679, + "step": 88040 + }, + { + "epoch": 0.3557331415619937, + "grad_norm": 736.1539916992188, + "learning_rate": 4.092214786838413e-05, + "loss": 55.0295, + "step": 88050 + }, + { + "epoch": 0.35577354282736134, + "grad_norm": 957.0504150390625, + "learning_rate": 4.0919456561876256e-05, + "loss": 72.6961, + "step": 88060 + }, + { + "epoch": 0.355813944092729, + "grad_norm": 687.1393432617188, + "learning_rate": 4.091676494501039e-05, + "loss": 60.3476, + "step": 88070 + }, + { + "epoch": 0.3558543453580966, + "grad_norm": 1779.8883056640625, + "learning_rate": 4.0914073017838996e-05, + "loss": 85.0388, + "step": 88080 + }, + { + "epoch": 0.35589474662346426, + "grad_norm": 1815.7283935546875, + "learning_rate": 4.091138078041455e-05, + "loss": 67.7276, + "step": 88090 + }, + { + "epoch": 0.3559351478888319, + "grad_norm": 510.8993835449219, + "learning_rate": 4.090868823278956e-05, + "loss": 74.9138, + "step": 88100 + }, + { + "epoch": 0.3559755491541995, + "grad_norm": 1010.1596069335938, + "learning_rate": 4.090599537501649e-05, + "loss": 70.1757, + "step": 88110 + }, + { + "epoch": 0.35601595041956713, + "grad_norm": 806.6387329101562, + "learning_rate": 4.090330220714785e-05, + "loss": 64.2958, + "step": 88120 + }, + { + "epoch": 0.35605635168493477, + "grad_norm": 1274.768310546875, + "learning_rate": 4.090060872923615e-05, + "loss": 98.0021, + "step": 88130 + }, + { + "epoch": 0.3560967529503024, + "grad_norm": 819.7002563476562, + "learning_rate": 4.089791494133389e-05, + "loss": 52.4494, + "step": 88140 + }, + { + "epoch": 0.35613715421567005, + "grad_norm": 1136.931396484375, + "learning_rate": 4.0895220843493606e-05, + "loss": 105.6924, + "step": 88150 + }, + { + "epoch": 0.3561775554810377, + "grad_norm": 2234.421630859375, + "learning_rate": 4.0892526435767795e-05, + "loss": 103.7004, + "step": 88160 + }, + { + "epoch": 0.3562179567464053, + "grad_norm": 841.349853515625, + "learning_rate": 4.088983171820901e-05, + "loss": 52.9514, + "step": 88170 + }, + { + "epoch": 0.3562583580117729, + "grad_norm": 725.3030395507812, + "learning_rate": 4.088713669086977e-05, + "loss": 49.6578, + "step": 88180 + }, + { + "epoch": 0.35629875927714055, + "grad_norm": 1026.1910400390625, + "learning_rate": 4.088444135380262e-05, + "loss": 63.2285, + "step": 88190 + }, + { + "epoch": 0.3563391605425082, + "grad_norm": 1608.8538818359375, + "learning_rate": 4.088174570706011e-05, + "loss": 65.9221, + "step": 88200 + }, + { + "epoch": 0.35637956180787583, + "grad_norm": 608.7267456054688, + "learning_rate": 4.0879049750694795e-05, + "loss": 59.0359, + "step": 88210 + }, + { + "epoch": 0.3564199630732435, + "grad_norm": 3048.426513671875, + "learning_rate": 4.0876353484759224e-05, + "loss": 100.1788, + "step": 88220 + }, + { + "epoch": 0.35646036433861106, + "grad_norm": 540.722412109375, + "learning_rate": 4.087365690930597e-05, + "loss": 66.9113, + "step": 88230 + }, + { + "epoch": 0.3565007656039787, + "grad_norm": 781.4616088867188, + "learning_rate": 4.0870960024387596e-05, + "loss": 103.4365, + "step": 88240 + }, + { + "epoch": 0.35654116686934634, + "grad_norm": 1404.780029296875, + "learning_rate": 4.086826283005669e-05, + "loss": 58.6043, + "step": 88250 + }, + { + "epoch": 0.356581568134714, + "grad_norm": 440.78472900390625, + "learning_rate": 4.0865565326365835e-05, + "loss": 43.4884, + "step": 88260 + }, + { + "epoch": 0.3566219694000816, + "grad_norm": 425.8787841796875, + "learning_rate": 4.086286751336761e-05, + "loss": 44.2717, + "step": 88270 + }, + { + "epoch": 0.35666237066544926, + "grad_norm": 386.0218200683594, + "learning_rate": 4.0860169391114625e-05, + "loss": 52.1067, + "step": 88280 + }, + { + "epoch": 0.3567027719308169, + "grad_norm": 421.0408935546875, + "learning_rate": 4.085747095965946e-05, + "loss": 63.7093, + "step": 88290 + }, + { + "epoch": 0.3567431731961845, + "grad_norm": 688.8589477539062, + "learning_rate": 4.085477221905474e-05, + "loss": 63.6186, + "step": 88300 + }, + { + "epoch": 0.3567835744615521, + "grad_norm": 2102.650634765625, + "learning_rate": 4.085207316935308e-05, + "loss": 120.8342, + "step": 88310 + }, + { + "epoch": 0.35682397572691976, + "grad_norm": 644.7640380859375, + "learning_rate": 4.084937381060708e-05, + "loss": 71.6069, + "step": 88320 + }, + { + "epoch": 0.3568643769922874, + "grad_norm": 718.741455078125, + "learning_rate": 4.084667414286939e-05, + "loss": 49.9685, + "step": 88330 + }, + { + "epoch": 0.35690477825765504, + "grad_norm": 1353.2684326171875, + "learning_rate": 4.0843974166192614e-05, + "loss": 89.7978, + "step": 88340 + }, + { + "epoch": 0.3569451795230227, + "grad_norm": 518.8037109375, + "learning_rate": 4.0841273880629416e-05, + "loss": 68.2225, + "step": 88350 + }, + { + "epoch": 0.35698558078839027, + "grad_norm": 491.72265625, + "learning_rate": 4.083857328623243e-05, + "loss": 62.1392, + "step": 88360 + }, + { + "epoch": 0.3570259820537579, + "grad_norm": 713.7077026367188, + "learning_rate": 4.0835872383054296e-05, + "loss": 59.1912, + "step": 88370 + }, + { + "epoch": 0.35706638331912555, + "grad_norm": 546.5416259765625, + "learning_rate": 4.083317117114768e-05, + "loss": 65.1588, + "step": 88380 + }, + { + "epoch": 0.3571067845844932, + "grad_norm": 749.3577270507812, + "learning_rate": 4.083046965056524e-05, + "loss": 65.049, + "step": 88390 + }, + { + "epoch": 0.3571471858498608, + "grad_norm": 808.5428466796875, + "learning_rate": 4.082776782135964e-05, + "loss": 59.8705, + "step": 88400 + }, + { + "epoch": 0.35718758711522847, + "grad_norm": 372.74261474609375, + "learning_rate": 4.082506568358357e-05, + "loss": 62.2298, + "step": 88410 + }, + { + "epoch": 0.3572279883805961, + "grad_norm": 574.08447265625, + "learning_rate": 4.082236323728968e-05, + "loss": 56.4968, + "step": 88420 + }, + { + "epoch": 0.3572683896459637, + "grad_norm": 901.8555297851562, + "learning_rate": 4.0819660482530684e-05, + "loss": 67.4959, + "step": 88430 + }, + { + "epoch": 0.35730879091133133, + "grad_norm": 2012.2177734375, + "learning_rate": 4.0816957419359264e-05, + "loss": 60.3428, + "step": 88440 + }, + { + "epoch": 0.35734919217669897, + "grad_norm": 434.2742614746094, + "learning_rate": 4.0814254047828116e-05, + "loss": 55.9326, + "step": 88450 + }, + { + "epoch": 0.3573895934420666, + "grad_norm": 1581.9569091796875, + "learning_rate": 4.081155036798994e-05, + "loss": 80.8084, + "step": 88460 + }, + { + "epoch": 0.35742999470743425, + "grad_norm": 980.014404296875, + "learning_rate": 4.080884637989745e-05, + "loss": 53.9677, + "step": 88470 + }, + { + "epoch": 0.3574703959728019, + "grad_norm": 438.5036926269531, + "learning_rate": 4.080614208360336e-05, + "loss": 46.5922, + "step": 88480 + }, + { + "epoch": 0.3575107972381695, + "grad_norm": 1167.0660400390625, + "learning_rate": 4.080343747916039e-05, + "loss": 60.1899, + "step": 88490 + }, + { + "epoch": 0.3575511985035371, + "grad_norm": 1493.02734375, + "learning_rate": 4.080073256662127e-05, + "loss": 55.8727, + "step": 88500 + }, + { + "epoch": 0.35759159976890476, + "grad_norm": 643.5103759765625, + "learning_rate": 4.079802734603874e-05, + "loss": 75.9446, + "step": 88510 + }, + { + "epoch": 0.3576320010342724, + "grad_norm": 619.2804565429688, + "learning_rate": 4.079532181746553e-05, + "loss": 59.9599, + "step": 88520 + }, + { + "epoch": 0.35767240229964004, + "grad_norm": 699.5809936523438, + "learning_rate": 4.079261598095439e-05, + "loss": 75.2107, + "step": 88530 + }, + { + "epoch": 0.3577128035650077, + "grad_norm": 2605.017578125, + "learning_rate": 4.078990983655807e-05, + "loss": 78.6188, + "step": 88540 + }, + { + "epoch": 0.35775320483037526, + "grad_norm": 483.28961181640625, + "learning_rate": 4.078720338432933e-05, + "loss": 52.8732, + "step": 88550 + }, + { + "epoch": 0.3577936060957429, + "grad_norm": 371.9551086425781, + "learning_rate": 4.078449662432093e-05, + "loss": 67.858, + "step": 88560 + }, + { + "epoch": 0.35783400736111054, + "grad_norm": 539.9357299804688, + "learning_rate": 4.078178955658565e-05, + "loss": 62.4049, + "step": 88570 + }, + { + "epoch": 0.3578744086264782, + "grad_norm": 571.8364868164062, + "learning_rate": 4.077908218117625e-05, + "loss": 80.4749, + "step": 88580 + }, + { + "epoch": 0.3579148098918458, + "grad_norm": 1833.5181884765625, + "learning_rate": 4.077637449814552e-05, + "loss": 59.1497, + "step": 88590 + }, + { + "epoch": 0.35795521115721346, + "grad_norm": 1636.6201171875, + "learning_rate": 4.077366650754624e-05, + "loss": 69.4785, + "step": 88600 + }, + { + "epoch": 0.3579956124225811, + "grad_norm": 2520.92822265625, + "learning_rate": 4.077095820943122e-05, + "loss": 92.7431, + "step": 88610 + }, + { + "epoch": 0.3580360136879487, + "grad_norm": 781.56298828125, + "learning_rate": 4.0768249603853245e-05, + "loss": 66.4768, + "step": 88620 + }, + { + "epoch": 0.3580764149533163, + "grad_norm": 462.7001037597656, + "learning_rate": 4.0765540690865134e-05, + "loss": 39.243, + "step": 88630 + }, + { + "epoch": 0.35811681621868396, + "grad_norm": 1026.7464599609375, + "learning_rate": 4.076283147051968e-05, + "loss": 68.4941, + "step": 88640 + }, + { + "epoch": 0.3581572174840516, + "grad_norm": 558.1005859375, + "learning_rate": 4.0760121942869725e-05, + "loss": 75.5133, + "step": 88650 + }, + { + "epoch": 0.35819761874941924, + "grad_norm": 1109.0501708984375, + "learning_rate": 4.075741210796806e-05, + "loss": 89.089, + "step": 88660 + }, + { + "epoch": 0.3582380200147869, + "grad_norm": 1361.9794921875, + "learning_rate": 4.075470196586755e-05, + "loss": 81.0406, + "step": 88670 + }, + { + "epoch": 0.35827842128015447, + "grad_norm": 390.9439697265625, + "learning_rate": 4.075199151662101e-05, + "loss": 67.3965, + "step": 88680 + }, + { + "epoch": 0.3583188225455221, + "grad_norm": 433.46759033203125, + "learning_rate": 4.074928076028128e-05, + "loss": 59.471, + "step": 88690 + }, + { + "epoch": 0.35835922381088975, + "grad_norm": 3093.1083984375, + "learning_rate": 4.074656969690122e-05, + "loss": 67.4171, + "step": 88700 + }, + { + "epoch": 0.3583996250762574, + "grad_norm": 650.4321899414062, + "learning_rate": 4.0743858326533674e-05, + "loss": 57.6206, + "step": 88710 + }, + { + "epoch": 0.35844002634162503, + "grad_norm": 693.6144409179688, + "learning_rate": 4.0741146649231504e-05, + "loss": 69.4233, + "step": 88720 + }, + { + "epoch": 0.35848042760699267, + "grad_norm": 1305.1888427734375, + "learning_rate": 4.0738434665047575e-05, + "loss": 66.7869, + "step": 88730 + }, + { + "epoch": 0.3585208288723603, + "grad_norm": 671.9458618164062, + "learning_rate": 4.0735722374034764e-05, + "loss": 94.4328, + "step": 88740 + }, + { + "epoch": 0.3585612301377279, + "grad_norm": 782.94873046875, + "learning_rate": 4.073300977624594e-05, + "loss": 59.1198, + "step": 88750 + }, + { + "epoch": 0.35860163140309553, + "grad_norm": 808.3102416992188, + "learning_rate": 4.073029687173399e-05, + "loss": 65.3728, + "step": 88760 + }, + { + "epoch": 0.3586420326684632, + "grad_norm": 2808.516357421875, + "learning_rate": 4.0727583660551806e-05, + "loss": 57.111, + "step": 88770 + }, + { + "epoch": 0.3586824339338308, + "grad_norm": 905.2903442382812, + "learning_rate": 4.0724870142752284e-05, + "loss": 80.5446, + "step": 88780 + }, + { + "epoch": 0.35872283519919845, + "grad_norm": 245.458251953125, + "learning_rate": 4.0722156318388315e-05, + "loss": 81.0946, + "step": 88790 + }, + { + "epoch": 0.3587632364645661, + "grad_norm": 1166.0023193359375, + "learning_rate": 4.071944218751282e-05, + "loss": 62.5315, + "step": 88800 + }, + { + "epoch": 0.3588036377299337, + "grad_norm": 1779.5706787109375, + "learning_rate": 4.0716727750178704e-05, + "loss": 73.8024, + "step": 88810 + }, + { + "epoch": 0.3588440389953013, + "grad_norm": 3553.527587890625, + "learning_rate": 4.071401300643889e-05, + "loss": 107.6503, + "step": 88820 + }, + { + "epoch": 0.35888444026066896, + "grad_norm": 990.9931030273438, + "learning_rate": 4.0711297956346306e-05, + "loss": 51.3191, + "step": 88830 + }, + { + "epoch": 0.3589248415260366, + "grad_norm": 567.8832397460938, + "learning_rate": 4.070858259995387e-05, + "loss": 85.3655, + "step": 88840 + }, + { + "epoch": 0.35896524279140424, + "grad_norm": 978.09130859375, + "learning_rate": 4.070586693731454e-05, + "loss": 86.0509, + "step": 88850 + }, + { + "epoch": 0.3590056440567719, + "grad_norm": 654.3468017578125, + "learning_rate": 4.0703150968481246e-05, + "loss": 80.2084, + "step": 88860 + }, + { + "epoch": 0.35904604532213946, + "grad_norm": 219.86631774902344, + "learning_rate": 4.070043469350694e-05, + "loss": 61.1951, + "step": 88870 + }, + { + "epoch": 0.3590864465875071, + "grad_norm": 804.8790283203125, + "learning_rate": 4.069771811244457e-05, + "loss": 75.843, + "step": 88880 + }, + { + "epoch": 0.35912684785287474, + "grad_norm": 546.8237915039062, + "learning_rate": 4.0695001225347104e-05, + "loss": 51.4436, + "step": 88890 + }, + { + "epoch": 0.3591672491182424, + "grad_norm": 584.9393920898438, + "learning_rate": 4.0692284032267516e-05, + "loss": 50.8256, + "step": 88900 + }, + { + "epoch": 0.35920765038361, + "grad_norm": 2287.378173828125, + "learning_rate": 4.0689566533258765e-05, + "loss": 85.8929, + "step": 88910 + }, + { + "epoch": 0.35924805164897766, + "grad_norm": 798.82470703125, + "learning_rate": 4.068684872837384e-05, + "loss": 54.3008, + "step": 88920 + }, + { + "epoch": 0.3592884529143453, + "grad_norm": 631.5094604492188, + "learning_rate": 4.068413061766572e-05, + "loss": 67.0688, + "step": 88930 + }, + { + "epoch": 0.3593288541797129, + "grad_norm": 788.1356811523438, + "learning_rate": 4.068141220118741e-05, + "loss": 63.0645, + "step": 88940 + }, + { + "epoch": 0.3593692554450805, + "grad_norm": 424.5524597167969, + "learning_rate": 4.067869347899188e-05, + "loss": 54.8084, + "step": 88950 + }, + { + "epoch": 0.35940965671044817, + "grad_norm": 653.3056640625, + "learning_rate": 4.067597445113216e-05, + "loss": 50.6989, + "step": 88960 + }, + { + "epoch": 0.3594500579758158, + "grad_norm": 343.56512451171875, + "learning_rate": 4.067325511766124e-05, + "loss": 82.852, + "step": 88970 + }, + { + "epoch": 0.35949045924118345, + "grad_norm": 750.8988647460938, + "learning_rate": 4.067053547863215e-05, + "loss": 85.4746, + "step": 88980 + }, + { + "epoch": 0.3595308605065511, + "grad_norm": 1303.02197265625, + "learning_rate": 4.06678155340979e-05, + "loss": 50.7246, + "step": 88990 + }, + { + "epoch": 0.35957126177191867, + "grad_norm": 777.9240112304688, + "learning_rate": 4.066509528411152e-05, + "loss": 62.3671, + "step": 89000 + }, + { + "epoch": 0.3596116630372863, + "grad_norm": 304.01275634765625, + "learning_rate": 4.066237472872604e-05, + "loss": 68.3459, + "step": 89010 + }, + { + "epoch": 0.35965206430265395, + "grad_norm": 1499.3465576171875, + "learning_rate": 4.0659653867994496e-05, + "loss": 69.0432, + "step": 89020 + }, + { + "epoch": 0.3596924655680216, + "grad_norm": 1114.2425537109375, + "learning_rate": 4.065693270196995e-05, + "loss": 68.1484, + "step": 89030 + }, + { + "epoch": 0.35973286683338923, + "grad_norm": 738.8199462890625, + "learning_rate": 4.065421123070543e-05, + "loss": 59.836, + "step": 89040 + }, + { + "epoch": 0.35977326809875687, + "grad_norm": 616.7838134765625, + "learning_rate": 4.065148945425401e-05, + "loss": 61.9072, + "step": 89050 + }, + { + "epoch": 0.3598136693641245, + "grad_norm": 215.50350952148438, + "learning_rate": 4.064876737266874e-05, + "loss": 81.0354, + "step": 89060 + }, + { + "epoch": 0.3598540706294921, + "grad_norm": 344.2589111328125, + "learning_rate": 4.06460449860027e-05, + "loss": 61.0841, + "step": 89070 + }, + { + "epoch": 0.35989447189485974, + "grad_norm": 740.9758911132812, + "learning_rate": 4.064332229430895e-05, + "loss": 53.366, + "step": 89080 + }, + { + "epoch": 0.3599348731602274, + "grad_norm": 717.7742919921875, + "learning_rate": 4.0640599297640584e-05, + "loss": 42.3575, + "step": 89090 + }, + { + "epoch": 0.359975274425595, + "grad_norm": 1074.0352783203125, + "learning_rate": 4.063787599605068e-05, + "loss": 79.3194, + "step": 89100 + }, + { + "epoch": 0.36001567569096266, + "grad_norm": 1248.140869140625, + "learning_rate": 4.063515238959233e-05, + "loss": 60.0265, + "step": 89110 + }, + { + "epoch": 0.3600560769563303, + "grad_norm": 400.5279235839844, + "learning_rate": 4.063242847831864e-05, + "loss": 56.0902, + "step": 89120 + }, + { + "epoch": 0.3600964782216979, + "grad_norm": 4143.07568359375, + "learning_rate": 4.06297042622827e-05, + "loss": 77.4453, + "step": 89130 + }, + { + "epoch": 0.3601368794870655, + "grad_norm": 520.5917358398438, + "learning_rate": 4.062697974153764e-05, + "loss": 56.0878, + "step": 89140 + }, + { + "epoch": 0.36017728075243316, + "grad_norm": 1186.0679931640625, + "learning_rate": 4.062425491613656e-05, + "loss": 70.4133, + "step": 89150 + }, + { + "epoch": 0.3602176820178008, + "grad_norm": 972.8978271484375, + "learning_rate": 4.062152978613258e-05, + "loss": 47.6216, + "step": 89160 + }, + { + "epoch": 0.36025808328316844, + "grad_norm": 1175.881591796875, + "learning_rate": 4.061880435157884e-05, + "loss": 52.5779, + "step": 89170 + }, + { + "epoch": 0.3602984845485361, + "grad_norm": 1707.835205078125, + "learning_rate": 4.061607861252847e-05, + "loss": 49.4881, + "step": 89180 + }, + { + "epoch": 0.36033888581390366, + "grad_norm": 898.6934814453125, + "learning_rate": 4.0613352569034615e-05, + "loss": 83.9531, + "step": 89190 + }, + { + "epoch": 0.3603792870792713, + "grad_norm": 791.3284912109375, + "learning_rate": 4.0610626221150394e-05, + "loss": 56.8294, + "step": 89200 + }, + { + "epoch": 0.36041968834463894, + "grad_norm": 439.3869323730469, + "learning_rate": 4.060789956892899e-05, + "loss": 78.4056, + "step": 89210 + }, + { + "epoch": 0.3604600896100066, + "grad_norm": 306.3957214355469, + "learning_rate": 4.060517261242355e-05, + "loss": 53.2591, + "step": 89220 + }, + { + "epoch": 0.3605004908753742, + "grad_norm": 862.3554077148438, + "learning_rate": 4.060244535168723e-05, + "loss": 57.5533, + "step": 89230 + }, + { + "epoch": 0.36054089214074186, + "grad_norm": 631.755859375, + "learning_rate": 4.0599717786773204e-05, + "loss": 46.8937, + "step": 89240 + }, + { + "epoch": 0.3605812934061095, + "grad_norm": 1259.4813232421875, + "learning_rate": 4.059698991773466e-05, + "loss": 62.3537, + "step": 89250 + }, + { + "epoch": 0.3606216946714771, + "grad_norm": 467.6385192871094, + "learning_rate": 4.059426174462476e-05, + "loss": 65.602, + "step": 89260 + }, + { + "epoch": 0.36066209593684473, + "grad_norm": 492.99609375, + "learning_rate": 4.0591533267496694e-05, + "loss": 106.2255, + "step": 89270 + }, + { + "epoch": 0.36070249720221237, + "grad_norm": 893.2991943359375, + "learning_rate": 4.058880448640367e-05, + "loss": 96.5574, + "step": 89280 + }, + { + "epoch": 0.36074289846758, + "grad_norm": 645.7114868164062, + "learning_rate": 4.058607540139887e-05, + "loss": 90.8627, + "step": 89290 + }, + { + "epoch": 0.36078329973294765, + "grad_norm": 3922.514404296875, + "learning_rate": 4.0583346012535506e-05, + "loss": 111.6305, + "step": 89300 + }, + { + "epoch": 0.3608237009983153, + "grad_norm": 904.625, + "learning_rate": 4.058061631986679e-05, + "loss": 79.8804, + "step": 89310 + }, + { + "epoch": 0.3608641022636829, + "grad_norm": 384.1736755371094, + "learning_rate": 4.057788632344593e-05, + "loss": 60.1387, + "step": 89320 + }, + { + "epoch": 0.3609045035290505, + "grad_norm": 675.1691284179688, + "learning_rate": 4.0575156023326166e-05, + "loss": 60.848, + "step": 89330 + }, + { + "epoch": 0.36094490479441815, + "grad_norm": 963.3970947265625, + "learning_rate": 4.0572425419560714e-05, + "loss": 57.821, + "step": 89340 + }, + { + "epoch": 0.3609853060597858, + "grad_norm": 0.0, + "learning_rate": 4.056969451220282e-05, + "loss": 72.7515, + "step": 89350 + }, + { + "epoch": 0.36102570732515343, + "grad_norm": 610.1162719726562, + "learning_rate": 4.0566963301305705e-05, + "loss": 50.6369, + "step": 89360 + }, + { + "epoch": 0.3610661085905211, + "grad_norm": 643.9298706054688, + "learning_rate": 4.056423178692262e-05, + "loss": 39.3118, + "step": 89370 + }, + { + "epoch": 0.3611065098558887, + "grad_norm": 531.35693359375, + "learning_rate": 4.056149996910683e-05, + "loss": 75.3905, + "step": 89380 + }, + { + "epoch": 0.3611469111212563, + "grad_norm": 503.90777587890625, + "learning_rate": 4.05587678479116e-05, + "loss": 65.8769, + "step": 89390 + }, + { + "epoch": 0.36118731238662394, + "grad_norm": 1145.5985107421875, + "learning_rate": 4.055603542339016e-05, + "loss": 51.2441, + "step": 89400 + }, + { + "epoch": 0.3612277136519916, + "grad_norm": 411.92626953125, + "learning_rate": 4.055330269559581e-05, + "loss": 35.5195, + "step": 89410 + }, + { + "epoch": 0.3612681149173592, + "grad_norm": 745.9331665039062, + "learning_rate": 4.055056966458182e-05, + "loss": 58.2253, + "step": 89420 + }, + { + "epoch": 0.36130851618272686, + "grad_norm": 5621.34814453125, + "learning_rate": 4.054783633040146e-05, + "loss": 78.9581, + "step": 89430 + }, + { + "epoch": 0.3613489174480945, + "grad_norm": 5587.95263671875, + "learning_rate": 4.054510269310803e-05, + "loss": 83.4821, + "step": 89440 + }, + { + "epoch": 0.3613893187134621, + "grad_norm": 402.0248107910156, + "learning_rate": 4.0542368752754825e-05, + "loss": 57.0781, + "step": 89450 + }, + { + "epoch": 0.3614297199788297, + "grad_norm": 1605.563720703125, + "learning_rate": 4.053963450939513e-05, + "loss": 90.5012, + "step": 89460 + }, + { + "epoch": 0.36147012124419736, + "grad_norm": 471.9956359863281, + "learning_rate": 4.053689996308227e-05, + "loss": 71.2476, + "step": 89470 + }, + { + "epoch": 0.361510522509565, + "grad_norm": 0.0, + "learning_rate": 4.053416511386954e-05, + "loss": 45.8093, + "step": 89480 + }, + { + "epoch": 0.36155092377493264, + "grad_norm": 800.1768188476562, + "learning_rate": 4.0531429961810264e-05, + "loss": 46.3764, + "step": 89490 + }, + { + "epoch": 0.3615913250403003, + "grad_norm": 731.527587890625, + "learning_rate": 4.052869450695776e-05, + "loss": 78.3263, + "step": 89500 + }, + { + "epoch": 0.36163172630566787, + "grad_norm": 1999.2166748046875, + "learning_rate": 4.052595874936537e-05, + "loss": 70.2021, + "step": 89510 + }, + { + "epoch": 0.3616721275710355, + "grad_norm": 0.0, + "learning_rate": 4.0523222689086414e-05, + "loss": 75.0036, + "step": 89520 + }, + { + "epoch": 0.36171252883640315, + "grad_norm": 896.73388671875, + "learning_rate": 4.052048632617424e-05, + "loss": 73.4701, + "step": 89530 + }, + { + "epoch": 0.3617529301017708, + "grad_norm": 1283.2412109375, + "learning_rate": 4.05177496606822e-05, + "loss": 50.1073, + "step": 89540 + }, + { + "epoch": 0.3617933313671384, + "grad_norm": 543.897705078125, + "learning_rate": 4.0515012692663646e-05, + "loss": 52.8131, + "step": 89550 + }, + { + "epoch": 0.36183373263250607, + "grad_norm": 1975.94384765625, + "learning_rate": 4.051227542217192e-05, + "loss": 92.001, + "step": 89560 + }, + { + "epoch": 0.3618741338978737, + "grad_norm": 247.90350341796875, + "learning_rate": 4.0509537849260404e-05, + "loss": 31.5658, + "step": 89570 + }, + { + "epoch": 0.3619145351632413, + "grad_norm": 859.4539184570312, + "learning_rate": 4.0506799973982465e-05, + "loss": 55.6403, + "step": 89580 + }, + { + "epoch": 0.36195493642860893, + "grad_norm": 1263.11474609375, + "learning_rate": 4.0504061796391474e-05, + "loss": 59.762, + "step": 89590 + }, + { + "epoch": 0.36199533769397657, + "grad_norm": 263.7984313964844, + "learning_rate": 4.050132331654082e-05, + "loss": 36.6857, + "step": 89600 + }, + { + "epoch": 0.3620357389593442, + "grad_norm": 599.0194702148438, + "learning_rate": 4.0498584534483877e-05, + "loss": 101.6762, + "step": 89610 + }, + { + "epoch": 0.36207614022471185, + "grad_norm": 1161.4105224609375, + "learning_rate": 4.0495845450274064e-05, + "loss": 73.1191, + "step": 89620 + }, + { + "epoch": 0.3621165414900795, + "grad_norm": 977.6044921875, + "learning_rate": 4.0493106063964754e-05, + "loss": 63.5236, + "step": 89630 + }, + { + "epoch": 0.3621569427554471, + "grad_norm": 653.26708984375, + "learning_rate": 4.0490366375609376e-05, + "loss": 99.8486, + "step": 89640 + }, + { + "epoch": 0.3621973440208147, + "grad_norm": 1127.0865478515625, + "learning_rate": 4.048762638526132e-05, + "loss": 51.8638, + "step": 89650 + }, + { + "epoch": 0.36223774528618236, + "grad_norm": 565.7835693359375, + "learning_rate": 4.048488609297402e-05, + "loss": 61.2772, + "step": 89660 + }, + { + "epoch": 0.36227814655155, + "grad_norm": 774.4669799804688, + "learning_rate": 4.0482145498800884e-05, + "loss": 61.0465, + "step": 89670 + }, + { + "epoch": 0.36231854781691764, + "grad_norm": 2059.27880859375, + "learning_rate": 4.047940460279537e-05, + "loss": 80.3147, + "step": 89680 + }, + { + "epoch": 0.3623589490822853, + "grad_norm": 890.2815551757812, + "learning_rate": 4.0476663405010874e-05, + "loss": 85.4335, + "step": 89690 + }, + { + "epoch": 0.3623993503476529, + "grad_norm": 1310.5477294921875, + "learning_rate": 4.047392190550087e-05, + "loss": 51.1065, + "step": 89700 + }, + { + "epoch": 0.3624397516130205, + "grad_norm": 640.0499877929688, + "learning_rate": 4.047118010431879e-05, + "loss": 79.5663, + "step": 89710 + }, + { + "epoch": 0.36248015287838814, + "grad_norm": 773.0057373046875, + "learning_rate": 4.0468438001518084e-05, + "loss": 56.4114, + "step": 89720 + }, + { + "epoch": 0.3625205541437558, + "grad_norm": 652.1478881835938, + "learning_rate": 4.046569559715221e-05, + "loss": 52.9334, + "step": 89730 + }, + { + "epoch": 0.3625609554091234, + "grad_norm": 1459.3680419921875, + "learning_rate": 4.0462952891274655e-05, + "loss": 71.04, + "step": 89740 + }, + { + "epoch": 0.36260135667449106, + "grad_norm": 1920.9996337890625, + "learning_rate": 4.046020988393885e-05, + "loss": 61.0197, + "step": 89750 + }, + { + "epoch": 0.3626417579398587, + "grad_norm": 1488.5030517578125, + "learning_rate": 4.045746657519831e-05, + "loss": 76.9832, + "step": 89760 + }, + { + "epoch": 0.3626821592052263, + "grad_norm": 852.4432983398438, + "learning_rate": 4.04547229651065e-05, + "loss": 61.3612, + "step": 89770 + }, + { + "epoch": 0.3627225604705939, + "grad_norm": 748.500244140625, + "learning_rate": 4.0451979053716906e-05, + "loss": 61.8753, + "step": 89780 + }, + { + "epoch": 0.36276296173596156, + "grad_norm": 988.9141235351562, + "learning_rate": 4.044923484108303e-05, + "loss": 53.8847, + "step": 89790 + }, + { + "epoch": 0.3628033630013292, + "grad_norm": 1280.90673828125, + "learning_rate": 4.044649032725836e-05, + "loss": 63.5749, + "step": 89800 + }, + { + "epoch": 0.36284376426669684, + "grad_norm": 1100.82177734375, + "learning_rate": 4.044374551229641e-05, + "loss": 87.8433, + "step": 89810 + }, + { + "epoch": 0.3628841655320645, + "grad_norm": 936.5944213867188, + "learning_rate": 4.0441000396250694e-05, + "loss": 62.9258, + "step": 89820 + }, + { + "epoch": 0.36292456679743207, + "grad_norm": 701.0770874023438, + "learning_rate": 4.0438254979174725e-05, + "loss": 50.3677, + "step": 89830 + }, + { + "epoch": 0.3629649680627997, + "grad_norm": 0.0, + "learning_rate": 4.043550926112203e-05, + "loss": 60.1964, + "step": 89840 + }, + { + "epoch": 0.36300536932816735, + "grad_norm": 546.4686889648438, + "learning_rate": 4.043276324214613e-05, + "loss": 53.8509, + "step": 89850 + }, + { + "epoch": 0.363045770593535, + "grad_norm": 1106.510009765625, + "learning_rate": 4.043001692230056e-05, + "loss": 69.3517, + "step": 89860 + }, + { + "epoch": 0.36308617185890263, + "grad_norm": 665.1596069335938, + "learning_rate": 4.042727030163888e-05, + "loss": 53.5625, + "step": 89870 + }, + { + "epoch": 0.36312657312427027, + "grad_norm": 640.2094116210938, + "learning_rate": 4.042452338021461e-05, + "loss": 90.0176, + "step": 89880 + }, + { + "epoch": 0.3631669743896379, + "grad_norm": 552.5271606445312, + "learning_rate": 4.0421776158081326e-05, + "loss": 39.7842, + "step": 89890 + }, + { + "epoch": 0.3632073756550055, + "grad_norm": 609.4852905273438, + "learning_rate": 4.041902863529256e-05, + "loss": 98.3096, + "step": 89900 + }, + { + "epoch": 0.36324777692037313, + "grad_norm": 1036.63671875, + "learning_rate": 4.041628081190191e-05, + "loss": 74.554, + "step": 89910 + }, + { + "epoch": 0.3632881781857408, + "grad_norm": 296.92547607421875, + "learning_rate": 4.041353268796293e-05, + "loss": 75.188, + "step": 89920 + }, + { + "epoch": 0.3633285794511084, + "grad_norm": 404.4271240234375, + "learning_rate": 4.041078426352918e-05, + "loss": 43.2436, + "step": 89930 + }, + { + "epoch": 0.36336898071647605, + "grad_norm": 1445.6839599609375, + "learning_rate": 4.0408035538654264e-05, + "loss": 48.7288, + "step": 89940 + }, + { + "epoch": 0.3634093819818437, + "grad_norm": 745.0489501953125, + "learning_rate": 4.040528651339176e-05, + "loss": 77.1038, + "step": 89950 + }, + { + "epoch": 0.3634497832472113, + "grad_norm": 543.2232055664062, + "learning_rate": 4.0402537187795274e-05, + "loss": 56.8202, + "step": 89960 + }, + { + "epoch": 0.3634901845125789, + "grad_norm": 557.49072265625, + "learning_rate": 4.039978756191839e-05, + "loss": 62.1664, + "step": 89970 + }, + { + "epoch": 0.36353058577794656, + "grad_norm": 612.9962768554688, + "learning_rate": 4.039703763581472e-05, + "loss": 82.6836, + "step": 89980 + }, + { + "epoch": 0.3635709870433142, + "grad_norm": 921.2637939453125, + "learning_rate": 4.039428740953787e-05, + "loss": 50.0055, + "step": 89990 + }, + { + "epoch": 0.36361138830868184, + "grad_norm": 1612.236328125, + "learning_rate": 4.039153688314145e-05, + "loss": 85.7117, + "step": 90000 + }, + { + "epoch": 0.3636517895740495, + "grad_norm": 656.6378173828125, + "learning_rate": 4.038878605667912e-05, + "loss": 84.2719, + "step": 90010 + }, + { + "epoch": 0.3636921908394171, + "grad_norm": 1089.3055419921875, + "learning_rate": 4.038603493020447e-05, + "loss": 76.3893, + "step": 90020 + }, + { + "epoch": 0.3637325921047847, + "grad_norm": 954.1974487304688, + "learning_rate": 4.038328350377115e-05, + "loss": 111.787, + "step": 90030 + }, + { + "epoch": 0.36377299337015234, + "grad_norm": 725.2201538085938, + "learning_rate": 4.0380531777432794e-05, + "loss": 60.9843, + "step": 90040 + }, + { + "epoch": 0.36381339463552, + "grad_norm": 1051.9312744140625, + "learning_rate": 4.037777975124306e-05, + "loss": 75.9995, + "step": 90050 + }, + { + "epoch": 0.3638537959008876, + "grad_norm": 1385.2197265625, + "learning_rate": 4.037502742525559e-05, + "loss": 85.4595, + "step": 90060 + }, + { + "epoch": 0.36389419716625526, + "grad_norm": 505.546875, + "learning_rate": 4.037227479952404e-05, + "loss": 56.1493, + "step": 90070 + }, + { + "epoch": 0.3639345984316229, + "grad_norm": 749.1838989257812, + "learning_rate": 4.036952187410208e-05, + "loss": 63.6978, + "step": 90080 + }, + { + "epoch": 0.3639749996969905, + "grad_norm": 534.1312866210938, + "learning_rate": 4.036676864904338e-05, + "loss": 57.1218, + "step": 90090 + }, + { + "epoch": 0.3640154009623581, + "grad_norm": 921.06005859375, + "learning_rate": 4.036401512440161e-05, + "loss": 58.3411, + "step": 90100 + }, + { + "epoch": 0.36405580222772577, + "grad_norm": 679.614990234375, + "learning_rate": 4.0361261300230465e-05, + "loss": 66.4312, + "step": 90110 + }, + { + "epoch": 0.3640962034930934, + "grad_norm": 511.14501953125, + "learning_rate": 4.035850717658362e-05, + "loss": 106.1777, + "step": 90120 + }, + { + "epoch": 0.36413660475846105, + "grad_norm": 1034.344482421875, + "learning_rate": 4.035575275351476e-05, + "loss": 78.6351, + "step": 90130 + }, + { + "epoch": 0.3641770060238287, + "grad_norm": 880.7720947265625, + "learning_rate": 4.0352998031077604e-05, + "loss": 67.1489, + "step": 90140 + }, + { + "epoch": 0.36421740728919627, + "grad_norm": 1041.2879638671875, + "learning_rate": 4.035024300932584e-05, + "loss": 59.1619, + "step": 90150 + }, + { + "epoch": 0.3642578085545639, + "grad_norm": 414.6109924316406, + "learning_rate": 4.0347487688313194e-05, + "loss": 57.6462, + "step": 90160 + }, + { + "epoch": 0.36429820981993155, + "grad_norm": 803.5387573242188, + "learning_rate": 4.034473206809337e-05, + "loss": 62.8699, + "step": 90170 + }, + { + "epoch": 0.3643386110852992, + "grad_norm": 982.3478393554688, + "learning_rate": 4.0341976148720095e-05, + "loss": 97.5246, + "step": 90180 + }, + { + "epoch": 0.36437901235066683, + "grad_norm": 951.2240600585938, + "learning_rate": 4.03392199302471e-05, + "loss": 51.8766, + "step": 90190 + }, + { + "epoch": 0.36441941361603447, + "grad_norm": 754.9638061523438, + "learning_rate": 4.033646341272811e-05, + "loss": 59.1747, + "step": 90200 + }, + { + "epoch": 0.3644598148814021, + "grad_norm": 1095.536376953125, + "learning_rate": 4.033370659621687e-05, + "loss": 43.2328, + "step": 90210 + }, + { + "epoch": 0.3645002161467697, + "grad_norm": 915.0740966796875, + "learning_rate": 4.033094948076713e-05, + "loss": 77.3148, + "step": 90220 + }, + { + "epoch": 0.36454061741213734, + "grad_norm": 502.77545166015625, + "learning_rate": 4.032819206643263e-05, + "loss": 41.8404, + "step": 90230 + }, + { + "epoch": 0.364581018677505, + "grad_norm": 352.51165771484375, + "learning_rate": 4.032543435326714e-05, + "loss": 55.2335, + "step": 90240 + }, + { + "epoch": 0.3646214199428726, + "grad_norm": 1066.30615234375, + "learning_rate": 4.0322676341324415e-05, + "loss": 87.1459, + "step": 90250 + }, + { + "epoch": 0.36466182120824026, + "grad_norm": 486.5985107421875, + "learning_rate": 4.0319918030658225e-05, + "loss": 67.0294, + "step": 90260 + }, + { + "epoch": 0.3647022224736079, + "grad_norm": 637.4968872070312, + "learning_rate": 4.031715942132235e-05, + "loss": 67.5639, + "step": 90270 + }, + { + "epoch": 0.3647426237389755, + "grad_norm": 919.53759765625, + "learning_rate": 4.031440051337056e-05, + "loss": 88.9865, + "step": 90280 + }, + { + "epoch": 0.3647830250043431, + "grad_norm": 551.5164184570312, + "learning_rate": 4.031164130685665e-05, + "loss": 68.1998, + "step": 90290 + }, + { + "epoch": 0.36482342626971076, + "grad_norm": 1867.7833251953125, + "learning_rate": 4.030888180183441e-05, + "loss": 80.087, + "step": 90300 + }, + { + "epoch": 0.3648638275350784, + "grad_norm": 731.1893920898438, + "learning_rate": 4.030612199835764e-05, + "loss": 49.1685, + "step": 90310 + }, + { + "epoch": 0.36490422880044604, + "grad_norm": 1434.2115478515625, + "learning_rate": 4.030336189648014e-05, + "loss": 79.8181, + "step": 90320 + }, + { + "epoch": 0.3649446300658137, + "grad_norm": 5654.36669921875, + "learning_rate": 4.030060149625573e-05, + "loss": 106.5813, + "step": 90330 + }, + { + "epoch": 0.3649850313311813, + "grad_norm": 1383.1339111328125, + "learning_rate": 4.02978407977382e-05, + "loss": 81.1142, + "step": 90340 + }, + { + "epoch": 0.3650254325965489, + "grad_norm": 1352.9986572265625, + "learning_rate": 4.0295079800981395e-05, + "loss": 75.8624, + "step": 90350 + }, + { + "epoch": 0.36506583386191654, + "grad_norm": 821.6564331054688, + "learning_rate": 4.029231850603914e-05, + "loss": 87.5254, + "step": 90360 + }, + { + "epoch": 0.3651062351272842, + "grad_norm": 375.5804138183594, + "learning_rate": 4.028955691296526e-05, + "loss": 43.4082, + "step": 90370 + }, + { + "epoch": 0.3651466363926518, + "grad_norm": 2169.375244140625, + "learning_rate": 4.0286795021813594e-05, + "loss": 67.1946, + "step": 90380 + }, + { + "epoch": 0.36518703765801946, + "grad_norm": 7678.23828125, + "learning_rate": 4.0284032832637985e-05, + "loss": 96.252, + "step": 90390 + }, + { + "epoch": 0.3652274389233871, + "grad_norm": 1672.2906494140625, + "learning_rate": 4.028127034549229e-05, + "loss": 76.2553, + "step": 90400 + }, + { + "epoch": 0.3652678401887547, + "grad_norm": 959.1575927734375, + "learning_rate": 4.027850756043037e-05, + "loss": 55.6581, + "step": 90410 + }, + { + "epoch": 0.36530824145412233, + "grad_norm": 646.5990600585938, + "learning_rate": 4.0275744477506074e-05, + "loss": 79.772, + "step": 90420 + }, + { + "epoch": 0.36534864271948997, + "grad_norm": 905.00439453125, + "learning_rate": 4.027298109677327e-05, + "loss": 40.6796, + "step": 90430 + }, + { + "epoch": 0.3653890439848576, + "grad_norm": 800.4560546875, + "learning_rate": 4.027021741828584e-05, + "loss": 52.6281, + "step": 90440 + }, + { + "epoch": 0.36542944525022525, + "grad_norm": 507.7906188964844, + "learning_rate": 4.0267453442097664e-05, + "loss": 72.2229, + "step": 90450 + }, + { + "epoch": 0.3654698465155929, + "grad_norm": 371.3627624511719, + "learning_rate": 4.026468916826262e-05, + "loss": 65.535, + "step": 90460 + }, + { + "epoch": 0.3655102477809605, + "grad_norm": 393.8462219238281, + "learning_rate": 4.02619245968346e-05, + "loss": 75.8037, + "step": 90470 + }, + { + "epoch": 0.3655506490463281, + "grad_norm": 460.1830749511719, + "learning_rate": 4.0259159727867504e-05, + "loss": 82.6051, + "step": 90480 + }, + { + "epoch": 0.36559105031169575, + "grad_norm": 521.975341796875, + "learning_rate": 4.025639456141523e-05, + "loss": 61.9708, + "step": 90490 + }, + { + "epoch": 0.3656314515770634, + "grad_norm": 1583.7652587890625, + "learning_rate": 4.02536290975317e-05, + "loss": 64.9353, + "step": 90500 + }, + { + "epoch": 0.36567185284243103, + "grad_norm": 465.25244140625, + "learning_rate": 4.02508633362708e-05, + "loss": 65.8302, + "step": 90510 + }, + { + "epoch": 0.3657122541077987, + "grad_norm": 293.57965087890625, + "learning_rate": 4.024809727768648e-05, + "loss": 45.0099, + "step": 90520 + }, + { + "epoch": 0.3657526553731663, + "grad_norm": 536.3772583007812, + "learning_rate": 4.024533092183266e-05, + "loss": 74.4069, + "step": 90530 + }, + { + "epoch": 0.3657930566385339, + "grad_norm": 479.0234680175781, + "learning_rate": 4.024256426876325e-05, + "loss": 51.6421, + "step": 90540 + }, + { + "epoch": 0.36583345790390154, + "grad_norm": 1907.06494140625, + "learning_rate": 4.02397973185322e-05, + "loss": 77.2531, + "step": 90550 + }, + { + "epoch": 0.3658738591692692, + "grad_norm": 1270.7117919921875, + "learning_rate": 4.023703007119347e-05, + "loss": 59.475, + "step": 90560 + }, + { + "epoch": 0.3659142604346368, + "grad_norm": 380.741943359375, + "learning_rate": 4.023426252680098e-05, + "loss": 59.4132, + "step": 90570 + }, + { + "epoch": 0.36595466170000446, + "grad_norm": 1124.605224609375, + "learning_rate": 4.023149468540871e-05, + "loss": 67.3577, + "step": 90580 + }, + { + "epoch": 0.3659950629653721, + "grad_norm": 1151.210205078125, + "learning_rate": 4.02287265470706e-05, + "loss": 63.5007, + "step": 90590 + }, + { + "epoch": 0.3660354642307397, + "grad_norm": 0.0, + "learning_rate": 4.022595811184064e-05, + "loss": 38.7889, + "step": 90600 + }, + { + "epoch": 0.3660758654961073, + "grad_norm": 531.6884155273438, + "learning_rate": 4.022318937977277e-05, + "loss": 70.17, + "step": 90610 + }, + { + "epoch": 0.36611626676147496, + "grad_norm": 928.3363647460938, + "learning_rate": 4.022042035092101e-05, + "loss": 63.7609, + "step": 90620 + }, + { + "epoch": 0.3661566680268426, + "grad_norm": 1050.515380859375, + "learning_rate": 4.02176510253393e-05, + "loss": 62.498, + "step": 90630 + }, + { + "epoch": 0.36619706929221024, + "grad_norm": 1077.6502685546875, + "learning_rate": 4.021488140308165e-05, + "loss": 90.2313, + "step": 90640 + }, + { + "epoch": 0.3662374705575779, + "grad_norm": 1099.2191162109375, + "learning_rate": 4.021211148420205e-05, + "loss": 65.9301, + "step": 90650 + }, + { + "epoch": 0.3662778718229455, + "grad_norm": 633.58935546875, + "learning_rate": 4.020934126875452e-05, + "loss": 41.3643, + "step": 90660 + }, + { + "epoch": 0.3663182730883131, + "grad_norm": 1048.8428955078125, + "learning_rate": 4.0206570756793046e-05, + "loss": 63.0074, + "step": 90670 + }, + { + "epoch": 0.36635867435368075, + "grad_norm": 369.955810546875, + "learning_rate": 4.020379994837164e-05, + "loss": 76.6917, + "step": 90680 + }, + { + "epoch": 0.3663990756190484, + "grad_norm": 654.5360717773438, + "learning_rate": 4.020102884354433e-05, + "loss": 71.9556, + "step": 90690 + }, + { + "epoch": 0.366439476884416, + "grad_norm": 689.0057983398438, + "learning_rate": 4.019825744236514e-05, + "loss": 68.2348, + "step": 90700 + }, + { + "epoch": 0.36647987814978367, + "grad_norm": 408.1641540527344, + "learning_rate": 4.0195485744888096e-05, + "loss": 99.4495, + "step": 90710 + }, + { + "epoch": 0.3665202794151513, + "grad_norm": 817.236328125, + "learning_rate": 4.019271375116722e-05, + "loss": 72.3637, + "step": 90720 + }, + { + "epoch": 0.3665606806805189, + "grad_norm": 1630.006103515625, + "learning_rate": 4.018994146125659e-05, + "loss": 85.6407, + "step": 90730 + }, + { + "epoch": 0.36660108194588653, + "grad_norm": 1074.2464599609375, + "learning_rate": 4.0187168875210216e-05, + "loss": 59.7042, + "step": 90740 + }, + { + "epoch": 0.36664148321125417, + "grad_norm": 380.92657470703125, + "learning_rate": 4.018439599308217e-05, + "loss": 81.9409, + "step": 90750 + }, + { + "epoch": 0.3666818844766218, + "grad_norm": 0.0, + "learning_rate": 4.0181622814926504e-05, + "loss": 81.8555, + "step": 90760 + }, + { + "epoch": 0.36672228574198945, + "grad_norm": 703.6498413085938, + "learning_rate": 4.0178849340797285e-05, + "loss": 67.2053, + "step": 90770 + }, + { + "epoch": 0.3667626870073571, + "grad_norm": 706.4108276367188, + "learning_rate": 4.0176075570748596e-05, + "loss": 49.0458, + "step": 90780 + }, + { + "epoch": 0.3668030882727247, + "grad_norm": 2707.36572265625, + "learning_rate": 4.017330150483449e-05, + "loss": 53.4966, + "step": 90790 + }, + { + "epoch": 0.3668434895380923, + "grad_norm": 472.0765075683594, + "learning_rate": 4.017052714310906e-05, + "loss": 38.695, + "step": 90800 + }, + { + "epoch": 0.36688389080345996, + "grad_norm": 786.6532592773438, + "learning_rate": 4.0167752485626385e-05, + "loss": 52.5584, + "step": 90810 + }, + { + "epoch": 0.3669242920688276, + "grad_norm": 680.0988159179688, + "learning_rate": 4.0164977532440584e-05, + "loss": 55.2876, + "step": 90820 + }, + { + "epoch": 0.36696469333419524, + "grad_norm": 1092.18359375, + "learning_rate": 4.0162202283605725e-05, + "loss": 55.0247, + "step": 90830 + }, + { + "epoch": 0.3670050945995629, + "grad_norm": 944.074951171875, + "learning_rate": 4.015942673917593e-05, + "loss": 72.8051, + "step": 90840 + }, + { + "epoch": 0.3670454958649305, + "grad_norm": 1217.025634765625, + "learning_rate": 4.015665089920531e-05, + "loss": 74.4885, + "step": 90850 + }, + { + "epoch": 0.3670858971302981, + "grad_norm": 710.3255004882812, + "learning_rate": 4.0153874763747976e-05, + "loss": 72.1298, + "step": 90860 + }, + { + "epoch": 0.36712629839566574, + "grad_norm": 494.13946533203125, + "learning_rate": 4.015109833285805e-05, + "loss": 66.7202, + "step": 90870 + }, + { + "epoch": 0.3671666996610334, + "grad_norm": 693.8568115234375, + "learning_rate": 4.0148321606589656e-05, + "loss": 60.7179, + "step": 90880 + }, + { + "epoch": 0.367207100926401, + "grad_norm": 548.3522338867188, + "learning_rate": 4.014554458499694e-05, + "loss": 47.7584, + "step": 90890 + }, + { + "epoch": 0.36724750219176866, + "grad_norm": 885.053466796875, + "learning_rate": 4.014276726813404e-05, + "loss": 26.558, + "step": 90900 + }, + { + "epoch": 0.3672879034571363, + "grad_norm": 1183.4205322265625, + "learning_rate": 4.013998965605509e-05, + "loss": 71.821, + "step": 90910 + }, + { + "epoch": 0.3673283047225039, + "grad_norm": 1207.58203125, + "learning_rate": 4.013721174881425e-05, + "loss": 49.7049, + "step": 90920 + }, + { + "epoch": 0.3673687059878715, + "grad_norm": 1142.948486328125, + "learning_rate": 4.013443354646567e-05, + "loss": 74.3073, + "step": 90930 + }, + { + "epoch": 0.36740910725323916, + "grad_norm": 0.0, + "learning_rate": 4.0131655049063514e-05, + "loss": 50.2877, + "step": 90940 + }, + { + "epoch": 0.3674495085186068, + "grad_norm": 2414.29541015625, + "learning_rate": 4.012887625666195e-05, + "loss": 76.8267, + "step": 90950 + }, + { + "epoch": 0.36748990978397444, + "grad_norm": 682.6073608398438, + "learning_rate": 4.012609716931517e-05, + "loss": 46.8848, + "step": 90960 + }, + { + "epoch": 0.3675303110493421, + "grad_norm": 648.0629272460938, + "learning_rate": 4.012331778707732e-05, + "loss": 57.106, + "step": 90970 + }, + { + "epoch": 0.3675707123147097, + "grad_norm": 976.9451293945312, + "learning_rate": 4.012053811000262e-05, + "loss": 56.0116, + "step": 90980 + }, + { + "epoch": 0.3676111135800773, + "grad_norm": 712.3170166015625, + "learning_rate": 4.0117758138145235e-05, + "loss": 72.1889, + "step": 90990 + }, + { + "epoch": 0.36765151484544495, + "grad_norm": 579.1261596679688, + "learning_rate": 4.011497787155938e-05, + "loss": 54.2764, + "step": 91000 + }, + { + "epoch": 0.3676919161108126, + "grad_norm": 885.5346069335938, + "learning_rate": 4.0112197310299235e-05, + "loss": 78.5092, + "step": 91010 + }, + { + "epoch": 0.36773231737618023, + "grad_norm": 661.86572265625, + "learning_rate": 4.010941645441904e-05, + "loss": 68.8041, + "step": 91020 + }, + { + "epoch": 0.36777271864154787, + "grad_norm": 474.86328125, + "learning_rate": 4.010663530397298e-05, + "loss": 62.9406, + "step": 91030 + }, + { + "epoch": 0.3678131199069155, + "grad_norm": 476.5716857910156, + "learning_rate": 4.01038538590153e-05, + "loss": 102.7171, + "step": 91040 + }, + { + "epoch": 0.3678535211722831, + "grad_norm": 696.08837890625, + "learning_rate": 4.0101072119600196e-05, + "loss": 97.6508, + "step": 91050 + }, + { + "epoch": 0.36789392243765073, + "grad_norm": 751.9807739257812, + "learning_rate": 4.009829008578192e-05, + "loss": 85.6849, + "step": 91060 + }, + { + "epoch": 0.3679343237030184, + "grad_norm": 505.8662109375, + "learning_rate": 4.0095507757614717e-05, + "loss": 54.5299, + "step": 91070 + }, + { + "epoch": 0.367974724968386, + "grad_norm": 711.6263427734375, + "learning_rate": 4.009272513515281e-05, + "loss": 58.3999, + "step": 91080 + }, + { + "epoch": 0.36801512623375365, + "grad_norm": 623.3084106445312, + "learning_rate": 4.008994221845046e-05, + "loss": 65.1364, + "step": 91090 + }, + { + "epoch": 0.3680555274991213, + "grad_norm": 479.5476379394531, + "learning_rate": 4.0087159007561916e-05, + "loss": 40.831, + "step": 91100 + }, + { + "epoch": 0.3680959287644889, + "grad_norm": 969.3098754882812, + "learning_rate": 4.0084375502541446e-05, + "loss": 64.9479, + "step": 91110 + }, + { + "epoch": 0.3681363300298565, + "grad_norm": 267.7486572265625, + "learning_rate": 4.00815917034433e-05, + "loss": 107.4876, + "step": 91120 + }, + { + "epoch": 0.36817673129522416, + "grad_norm": 1258.0924072265625, + "learning_rate": 4.007880761032177e-05, + "loss": 63.7383, + "step": 91130 + }, + { + "epoch": 0.3682171325605918, + "grad_norm": 664.6405029296875, + "learning_rate": 4.0076023223231105e-05, + "loss": 65.6308, + "step": 91140 + }, + { + "epoch": 0.36825753382595944, + "grad_norm": 869.2908325195312, + "learning_rate": 4.007323854222562e-05, + "loss": 48.834, + "step": 91150 + }, + { + "epoch": 0.3682979350913271, + "grad_norm": 359.8714599609375, + "learning_rate": 4.007045356735959e-05, + "loss": 51.4937, + "step": 91160 + }, + { + "epoch": 0.3683383363566947, + "grad_norm": 454.6327209472656, + "learning_rate": 4.00676682986873e-05, + "loss": 80.0302, + "step": 91170 + }, + { + "epoch": 0.3683787376220623, + "grad_norm": 582.725341796875, + "learning_rate": 4.006488273626307e-05, + "loss": 81.7921, + "step": 91180 + }, + { + "epoch": 0.36841913888742994, + "grad_norm": 998.3577880859375, + "learning_rate": 4.006209688014119e-05, + "loss": 40.4945, + "step": 91190 + }, + { + "epoch": 0.3684595401527976, + "grad_norm": 951.7125854492188, + "learning_rate": 4.005931073037596e-05, + "loss": 54.5525, + "step": 91200 + }, + { + "epoch": 0.3684999414181652, + "grad_norm": 517.168701171875, + "learning_rate": 4.005652428702173e-05, + "loss": 52.3269, + "step": 91210 + }, + { + "epoch": 0.36854034268353286, + "grad_norm": 982.7772216796875, + "learning_rate": 4.0053737550132816e-05, + "loss": 65.8004, + "step": 91220 + }, + { + "epoch": 0.3685807439489005, + "grad_norm": 0.0, + "learning_rate": 4.005095051976353e-05, + "loss": 58.1448, + "step": 91230 + }, + { + "epoch": 0.3686211452142681, + "grad_norm": 1081.756103515625, + "learning_rate": 4.0048163195968214e-05, + "loss": 51.2047, + "step": 91240 + }, + { + "epoch": 0.3686615464796357, + "grad_norm": 837.7747802734375, + "learning_rate": 4.0045375578801214e-05, + "loss": 66.0523, + "step": 91250 + }, + { + "epoch": 0.36870194774500337, + "grad_norm": 316.6372375488281, + "learning_rate": 4.004258766831686e-05, + "loss": 64.1896, + "step": 91260 + }, + { + "epoch": 0.368742349010371, + "grad_norm": 820.6416015625, + "learning_rate": 4.0039799464569524e-05, + "loss": 83.5785, + "step": 91270 + }, + { + "epoch": 0.36878275027573865, + "grad_norm": 1047.304443359375, + "learning_rate": 4.003701096761355e-05, + "loss": 64.335, + "step": 91280 + }, + { + "epoch": 0.3688231515411063, + "grad_norm": 882.9609375, + "learning_rate": 4.0034222177503314e-05, + "loss": 74.856, + "step": 91290 + }, + { + "epoch": 0.36886355280647387, + "grad_norm": 758.5153198242188, + "learning_rate": 4.003143309429317e-05, + "loss": 57.2044, + "step": 91300 + }, + { + "epoch": 0.3689039540718415, + "grad_norm": 637.7432861328125, + "learning_rate": 4.0028643718037496e-05, + "loss": 67.143, + "step": 91310 + }, + { + "epoch": 0.36894435533720915, + "grad_norm": 982.8681640625, + "learning_rate": 4.0025854048790677e-05, + "loss": 56.672, + "step": 91320 + }, + { + "epoch": 0.3689847566025768, + "grad_norm": 975.6199951171875, + "learning_rate": 4.00230640866071e-05, + "loss": 77.8675, + "step": 91330 + }, + { + "epoch": 0.36902515786794443, + "grad_norm": 463.59063720703125, + "learning_rate": 4.0020273831541155e-05, + "loss": 47.0041, + "step": 91340 + }, + { + "epoch": 0.36906555913331207, + "grad_norm": 1927.79541015625, + "learning_rate": 4.001748328364724e-05, + "loss": 100.6176, + "step": 91350 + }, + { + "epoch": 0.3691059603986797, + "grad_norm": 614.8819580078125, + "learning_rate": 4.001469244297975e-05, + "loss": 63.3884, + "step": 91360 + }, + { + "epoch": 0.3691463616640473, + "grad_norm": 578.9991455078125, + "learning_rate": 4.00119013095931e-05, + "loss": 38.7527, + "step": 91370 + }, + { + "epoch": 0.36918676292941494, + "grad_norm": 424.689697265625, + "learning_rate": 4.0009109883541715e-05, + "loss": 71.5254, + "step": 91380 + }, + { + "epoch": 0.3692271641947826, + "grad_norm": 592.7898559570312, + "learning_rate": 4.000631816488001e-05, + "loss": 88.0966, + "step": 91390 + }, + { + "epoch": 0.3692675654601502, + "grad_norm": 699.6017456054688, + "learning_rate": 4.000352615366239e-05, + "loss": 65.126, + "step": 91400 + }, + { + "epoch": 0.36930796672551786, + "grad_norm": 3042.03662109375, + "learning_rate": 4.0000733849943313e-05, + "loss": 60.7142, + "step": 91410 + }, + { + "epoch": 0.3693483679908855, + "grad_norm": 799.2810668945312, + "learning_rate": 3.999794125377721e-05, + "loss": 36.586, + "step": 91420 + }, + { + "epoch": 0.3693887692562531, + "grad_norm": 968.9716796875, + "learning_rate": 3.999514836521851e-05, + "loss": 69.6768, + "step": 91430 + }, + { + "epoch": 0.3694291705216207, + "grad_norm": 1178.7864990234375, + "learning_rate": 3.999235518432168e-05, + "loss": 61.2758, + "step": 91440 + }, + { + "epoch": 0.36946957178698836, + "grad_norm": 490.9415283203125, + "learning_rate": 3.998956171114116e-05, + "loss": 92.8936, + "step": 91450 + }, + { + "epoch": 0.369509973052356, + "grad_norm": 429.26019287109375, + "learning_rate": 3.998676794573142e-05, + "loss": 54.8671, + "step": 91460 + }, + { + "epoch": 0.36955037431772364, + "grad_norm": 556.7578125, + "learning_rate": 3.998397388814693e-05, + "loss": 50.8339, + "step": 91470 + }, + { + "epoch": 0.3695907755830913, + "grad_norm": 745.5537719726562, + "learning_rate": 3.9981179538442146e-05, + "loss": 75.3068, + "step": 91480 + }, + { + "epoch": 0.3696311768484589, + "grad_norm": 1237.29638671875, + "learning_rate": 3.9978384896671564e-05, + "loss": 71.3501, + "step": 91490 + }, + { + "epoch": 0.3696715781138265, + "grad_norm": 846.2611083984375, + "learning_rate": 3.997558996288965e-05, + "loss": 68.3017, + "step": 91500 + }, + { + "epoch": 0.36971197937919414, + "grad_norm": 1087.683837890625, + "learning_rate": 3.9972794737150895e-05, + "loss": 89.0834, + "step": 91510 + }, + { + "epoch": 0.3697523806445618, + "grad_norm": 462.5960998535156, + "learning_rate": 3.996999921950981e-05, + "loss": 72.2417, + "step": 91520 + }, + { + "epoch": 0.3697927819099294, + "grad_norm": 623.9275512695312, + "learning_rate": 3.9967203410020875e-05, + "loss": 79.5964, + "step": 91530 + }, + { + "epoch": 0.36983318317529706, + "grad_norm": 2321.183837890625, + "learning_rate": 3.99644073087386e-05, + "loss": 100.5441, + "step": 91540 + }, + { + "epoch": 0.3698735844406647, + "grad_norm": 931.0929565429688, + "learning_rate": 3.9961610915717515e-05, + "loss": 69.4745, + "step": 91550 + }, + { + "epoch": 0.3699139857060323, + "grad_norm": 459.2430419921875, + "learning_rate": 3.9958814231012115e-05, + "loss": 48.2317, + "step": 91560 + }, + { + "epoch": 0.36995438697139993, + "grad_norm": 2218.796630859375, + "learning_rate": 3.9956017254676923e-05, + "loss": 73.4239, + "step": 91570 + }, + { + "epoch": 0.36999478823676757, + "grad_norm": 579.8865966796875, + "learning_rate": 3.995321998676648e-05, + "loss": 69.1222, + "step": 91580 + }, + { + "epoch": 0.3700351895021352, + "grad_norm": 805.66943359375, + "learning_rate": 3.995042242733532e-05, + "loss": 40.6279, + "step": 91590 + }, + { + "epoch": 0.37007559076750285, + "grad_norm": 1477.2010498046875, + "learning_rate": 3.9947624576437975e-05, + "loss": 54.183, + "step": 91600 + }, + { + "epoch": 0.3701159920328705, + "grad_norm": 1422.080078125, + "learning_rate": 3.994482643412899e-05, + "loss": 44.8291, + "step": 91610 + }, + { + "epoch": 0.3701563932982381, + "grad_norm": 596.1489868164062, + "learning_rate": 3.994202800046292e-05, + "loss": 50.5651, + "step": 91620 + }, + { + "epoch": 0.3701967945636057, + "grad_norm": 697.937744140625, + "learning_rate": 3.993922927549432e-05, + "loss": 51.1245, + "step": 91630 + }, + { + "epoch": 0.37023719582897335, + "grad_norm": 660.627197265625, + "learning_rate": 3.993643025927776e-05, + "loss": 73.9116, + "step": 91640 + }, + { + "epoch": 0.370277597094341, + "grad_norm": 735.249267578125, + "learning_rate": 3.9933630951867805e-05, + "loss": 41.1959, + "step": 91650 + }, + { + "epoch": 0.37031799835970863, + "grad_norm": 1717.015380859375, + "learning_rate": 3.9930831353319023e-05, + "loss": 66.5677, + "step": 91660 + }, + { + "epoch": 0.3703583996250763, + "grad_norm": 628.260009765625, + "learning_rate": 3.992803146368599e-05, + "loss": 50.1199, + "step": 91670 + }, + { + "epoch": 0.3703988008904439, + "grad_norm": 1197.982666015625, + "learning_rate": 3.99252312830233e-05, + "loss": 53.4161, + "step": 91680 + }, + { + "epoch": 0.3704392021558115, + "grad_norm": 759.8359375, + "learning_rate": 3.992243081138555e-05, + "loss": 49.2217, + "step": 91690 + }, + { + "epoch": 0.37047960342117914, + "grad_norm": 881.2600708007812, + "learning_rate": 3.9919630048827314e-05, + "loss": 75.7815, + "step": 91700 + }, + { + "epoch": 0.3705200046865468, + "grad_norm": 356.75958251953125, + "learning_rate": 3.991682899540322e-05, + "loss": 67.6897, + "step": 91710 + }, + { + "epoch": 0.3705604059519144, + "grad_norm": 1126.136474609375, + "learning_rate": 3.9914027651167866e-05, + "loss": 67.9846, + "step": 91720 + }, + { + "epoch": 0.37060080721728206, + "grad_norm": 467.5633544921875, + "learning_rate": 3.9911226016175866e-05, + "loss": 77.1551, + "step": 91730 + }, + { + "epoch": 0.3706412084826497, + "grad_norm": 590.6211547851562, + "learning_rate": 3.990842409048183e-05, + "loss": 52.3267, + "step": 91740 + }, + { + "epoch": 0.3706816097480173, + "grad_norm": 2134.7158203125, + "learning_rate": 3.99056218741404e-05, + "loss": 89.5028, + "step": 91750 + }, + { + "epoch": 0.3707220110133849, + "grad_norm": 545.798583984375, + "learning_rate": 3.990281936720619e-05, + "loss": 59.62, + "step": 91760 + }, + { + "epoch": 0.37076241227875256, + "grad_norm": 2029.9058837890625, + "learning_rate": 3.990001656973385e-05, + "loss": 59.6643, + "step": 91770 + }, + { + "epoch": 0.3708028135441202, + "grad_norm": 413.22491455078125, + "learning_rate": 3.9897213481778006e-05, + "loss": 48.1997, + "step": 91780 + }, + { + "epoch": 0.37084321480948784, + "grad_norm": 1251.12744140625, + "learning_rate": 3.9894410103393323e-05, + "loss": 59.8171, + "step": 91790 + }, + { + "epoch": 0.3708836160748555, + "grad_norm": 520.3812866210938, + "learning_rate": 3.989160643463445e-05, + "loss": 71.2307, + "step": 91800 + }, + { + "epoch": 0.3709240173402231, + "grad_norm": 866.8963012695312, + "learning_rate": 3.988880247555604e-05, + "loss": 102.0608, + "step": 91810 + }, + { + "epoch": 0.3709644186055907, + "grad_norm": 546.447998046875, + "learning_rate": 3.9885998226212764e-05, + "loss": 48.1409, + "step": 91820 + }, + { + "epoch": 0.37100481987095835, + "grad_norm": 1045.4818115234375, + "learning_rate": 3.988319368665928e-05, + "loss": 96.8548, + "step": 91830 + }, + { + "epoch": 0.371045221136326, + "grad_norm": 1263.72314453125, + "learning_rate": 3.988038885695028e-05, + "loss": 51.555, + "step": 91840 + }, + { + "epoch": 0.3710856224016936, + "grad_norm": 929.8193359375, + "learning_rate": 3.987758373714044e-05, + "loss": 48.9912, + "step": 91850 + }, + { + "epoch": 0.37112602366706127, + "grad_norm": 1874.9107666015625, + "learning_rate": 3.987477832728444e-05, + "loss": 48.1193, + "step": 91860 + }, + { + "epoch": 0.3711664249324289, + "grad_norm": 449.92388916015625, + "learning_rate": 3.987197262743697e-05, + "loss": 60.4172, + "step": 91870 + }, + { + "epoch": 0.3712068261977965, + "grad_norm": 3075.2314453125, + "learning_rate": 3.986916663765275e-05, + "loss": 66.7121, + "step": 91880 + }, + { + "epoch": 0.37124722746316413, + "grad_norm": 805.412841796875, + "learning_rate": 3.9866360357986467e-05, + "loss": 60.1465, + "step": 91890 + }, + { + "epoch": 0.37128762872853177, + "grad_norm": 925.1229248046875, + "learning_rate": 3.9863553788492834e-05, + "loss": 40.3686, + "step": 91900 + }, + { + "epoch": 0.3713280299938994, + "grad_norm": 600.6539306640625, + "learning_rate": 3.9860746929226567e-05, + "loss": 66.7164, + "step": 91910 + }, + { + "epoch": 0.37136843125926705, + "grad_norm": 1517.1585693359375, + "learning_rate": 3.985793978024239e-05, + "loss": 42.1975, + "step": 91920 + }, + { + "epoch": 0.3714088325246347, + "grad_norm": 1709.787353515625, + "learning_rate": 3.985513234159502e-05, + "loss": 61.1718, + "step": 91930 + }, + { + "epoch": 0.3714492337900023, + "grad_norm": 610.5380249023438, + "learning_rate": 3.985232461333921e-05, + "loss": 66.061, + "step": 91940 + }, + { + "epoch": 0.3714896350553699, + "grad_norm": 1254.192138671875, + "learning_rate": 3.984951659552968e-05, + "loss": 61.3981, + "step": 91950 + }, + { + "epoch": 0.37153003632073756, + "grad_norm": 725.9743041992188, + "learning_rate": 3.984670828822118e-05, + "loss": 50.754, + "step": 91960 + }, + { + "epoch": 0.3715704375861052, + "grad_norm": 898.7955322265625, + "learning_rate": 3.984389969146846e-05, + "loss": 80.66, + "step": 91970 + }, + { + "epoch": 0.37161083885147284, + "grad_norm": 1205.29345703125, + "learning_rate": 3.9841090805326264e-05, + "loss": 88.857, + "step": 91980 + }, + { + "epoch": 0.3716512401168405, + "grad_norm": 304.8878479003906, + "learning_rate": 3.983828162984937e-05, + "loss": 39.1823, + "step": 91990 + }, + { + "epoch": 0.3716916413822081, + "grad_norm": 474.86700439453125, + "learning_rate": 3.983547216509254e-05, + "loss": 55.8331, + "step": 92000 + }, + { + "epoch": 0.3717320426475757, + "grad_norm": 401.64044189453125, + "learning_rate": 3.9832662411110536e-05, + "loss": 69.07, + "step": 92010 + }, + { + "epoch": 0.37177244391294334, + "grad_norm": 338.5098571777344, + "learning_rate": 3.9829852367958144e-05, + "loss": 49.6506, + "step": 92020 + }, + { + "epoch": 0.371812845178311, + "grad_norm": 616.5515747070312, + "learning_rate": 3.9827042035690145e-05, + "loss": 55.922, + "step": 92030 + }, + { + "epoch": 0.3718532464436786, + "grad_norm": 696.16357421875, + "learning_rate": 3.9824231414361324e-05, + "loss": 56.3399, + "step": 92040 + }, + { + "epoch": 0.37189364770904626, + "grad_norm": 555.5895385742188, + "learning_rate": 3.982142050402649e-05, + "loss": 57.3369, + "step": 92050 + }, + { + "epoch": 0.3719340489744139, + "grad_norm": 777.5509033203125, + "learning_rate": 3.9818609304740414e-05, + "loss": 72.6481, + "step": 92060 + }, + { + "epoch": 0.3719744502397815, + "grad_norm": 752.47705078125, + "learning_rate": 3.981579781655794e-05, + "loss": 71.8169, + "step": 92070 + }, + { + "epoch": 0.3720148515051491, + "grad_norm": 973.05419921875, + "learning_rate": 3.981298603953385e-05, + "loss": 84.1057, + "step": 92080 + }, + { + "epoch": 0.37205525277051676, + "grad_norm": 956.1757202148438, + "learning_rate": 3.9810173973722974e-05, + "loss": 57.5237, + "step": 92090 + }, + { + "epoch": 0.3720956540358844, + "grad_norm": 606.1538696289062, + "learning_rate": 3.980736161918013e-05, + "loss": 51.6493, + "step": 92100 + }, + { + "epoch": 0.37213605530125204, + "grad_norm": 401.66314697265625, + "learning_rate": 3.980454897596014e-05, + "loss": 74.039, + "step": 92110 + }, + { + "epoch": 0.3721764565666197, + "grad_norm": 469.2101745605469, + "learning_rate": 3.980173604411786e-05, + "loss": 62.7403, + "step": 92120 + }, + { + "epoch": 0.3722168578319873, + "grad_norm": 814.7521362304688, + "learning_rate": 3.979892282370811e-05, + "loss": 93.9603, + "step": 92130 + }, + { + "epoch": 0.3722572590973549, + "grad_norm": 1050.54296875, + "learning_rate": 3.979610931478574e-05, + "loss": 79.703, + "step": 92140 + }, + { + "epoch": 0.37229766036272255, + "grad_norm": 426.4966735839844, + "learning_rate": 3.97932955174056e-05, + "loss": 74.3453, + "step": 92150 + }, + { + "epoch": 0.3723380616280902, + "grad_norm": 587.7415161132812, + "learning_rate": 3.979048143162255e-05, + "loss": 58.7905, + "step": 92160 + }, + { + "epoch": 0.37237846289345783, + "grad_norm": 1028.9864501953125, + "learning_rate": 3.978766705749145e-05, + "loss": 49.0045, + "step": 92170 + }, + { + "epoch": 0.37241886415882547, + "grad_norm": 722.3172607421875, + "learning_rate": 3.9784852395067166e-05, + "loss": 50.1978, + "step": 92180 + }, + { + "epoch": 0.3724592654241931, + "grad_norm": 1028.6400146484375, + "learning_rate": 3.978203744440457e-05, + "loss": 65.8073, + "step": 92190 + }, + { + "epoch": 0.3724996666895607, + "grad_norm": 350.545654296875, + "learning_rate": 3.977922220555855e-05, + "loss": 66.0355, + "step": 92200 + }, + { + "epoch": 0.37254006795492833, + "grad_norm": 888.4639282226562, + "learning_rate": 3.977640667858398e-05, + "loss": 61.8959, + "step": 92210 + }, + { + "epoch": 0.372580469220296, + "grad_norm": 938.3932495117188, + "learning_rate": 3.977359086353576e-05, + "loss": 69.4164, + "step": 92220 + }, + { + "epoch": 0.3726208704856636, + "grad_norm": 1005.269775390625, + "learning_rate": 3.977077476046877e-05, + "loss": 65.6192, + "step": 92230 + }, + { + "epoch": 0.37266127175103125, + "grad_norm": 557.5337524414062, + "learning_rate": 3.976795836943793e-05, + "loss": 59.2249, + "step": 92240 + }, + { + "epoch": 0.3727016730163989, + "grad_norm": 2857.20703125, + "learning_rate": 3.976514169049814e-05, + "loss": 68.5465, + "step": 92250 + }, + { + "epoch": 0.3727420742817665, + "grad_norm": 1169.2440185546875, + "learning_rate": 3.97623247237043e-05, + "loss": 46.9372, + "step": 92260 + }, + { + "epoch": 0.3727824755471341, + "grad_norm": 561.6387329101562, + "learning_rate": 3.9759507469111346e-05, + "loss": 59.3901, + "step": 92270 + }, + { + "epoch": 0.37282287681250176, + "grad_norm": 835.4718017578125, + "learning_rate": 3.9756689926774196e-05, + "loss": 85.8709, + "step": 92280 + }, + { + "epoch": 0.3728632780778694, + "grad_norm": 789.12841796875, + "learning_rate": 3.975387209674778e-05, + "loss": 71.775, + "step": 92290 + }, + { + "epoch": 0.37290367934323704, + "grad_norm": 0.0, + "learning_rate": 3.9751053979087035e-05, + "loss": 47.8555, + "step": 92300 + }, + { + "epoch": 0.3729440806086047, + "grad_norm": 776.5151977539062, + "learning_rate": 3.9748235573846894e-05, + "loss": 70.7797, + "step": 92310 + }, + { + "epoch": 0.3729844818739723, + "grad_norm": 987.219482421875, + "learning_rate": 3.97454168810823e-05, + "loss": 74.3305, + "step": 92320 + }, + { + "epoch": 0.3730248831393399, + "grad_norm": 508.5974426269531, + "learning_rate": 3.974259790084822e-05, + "loss": 80.4925, + "step": 92330 + }, + { + "epoch": 0.37306528440470754, + "grad_norm": 784.2496948242188, + "learning_rate": 3.973977863319961e-05, + "loss": 75.617, + "step": 92340 + }, + { + "epoch": 0.3731056856700752, + "grad_norm": 418.7388916015625, + "learning_rate": 3.973695907819142e-05, + "loss": 49.6771, + "step": 92350 + }, + { + "epoch": 0.3731460869354428, + "grad_norm": 1385.4676513671875, + "learning_rate": 3.973413923587862e-05, + "loss": 54.8952, + "step": 92360 + }, + { + "epoch": 0.37318648820081046, + "grad_norm": 1004.26611328125, + "learning_rate": 3.97313191063162e-05, + "loss": 66.2516, + "step": 92370 + }, + { + "epoch": 0.3732268894661781, + "grad_norm": 331.77972412109375, + "learning_rate": 3.9728498689559126e-05, + "loss": 62.8471, + "step": 92380 + }, + { + "epoch": 0.3732672907315457, + "grad_norm": 799.3560180664062, + "learning_rate": 3.972567798566238e-05, + "loss": 76.5567, + "step": 92390 + }, + { + "epoch": 0.3733076919969133, + "grad_norm": 733.6048583984375, + "learning_rate": 3.9722856994680966e-05, + "loss": 60.599, + "step": 92400 + }, + { + "epoch": 0.37334809326228097, + "grad_norm": 610.0665893554688, + "learning_rate": 3.9720035716669876e-05, + "loss": 58.7812, + "step": 92410 + }, + { + "epoch": 0.3733884945276486, + "grad_norm": 757.6383666992188, + "learning_rate": 3.971721415168411e-05, + "loss": 72.6537, + "step": 92420 + }, + { + "epoch": 0.37342889579301625, + "grad_norm": 628.03515625, + "learning_rate": 3.971439229977869e-05, + "loss": 58.028, + "step": 92430 + }, + { + "epoch": 0.3734692970583839, + "grad_norm": 567.295654296875, + "learning_rate": 3.9711570161008596e-05, + "loss": 54.5905, + "step": 92440 + }, + { + "epoch": 0.3735096983237515, + "grad_norm": 453.08258056640625, + "learning_rate": 3.9708747735428886e-05, + "loss": 53.7484, + "step": 92450 + }, + { + "epoch": 0.3735500995891191, + "grad_norm": 426.8596496582031, + "learning_rate": 3.9705925023094554e-05, + "loss": 73.5093, + "step": 92460 + }, + { + "epoch": 0.37359050085448675, + "grad_norm": 372.20684814453125, + "learning_rate": 3.970310202406064e-05, + "loss": 44.4281, + "step": 92470 + }, + { + "epoch": 0.3736309021198544, + "grad_norm": 714.4765014648438, + "learning_rate": 3.970027873838219e-05, + "loss": 78.6218, + "step": 92480 + }, + { + "epoch": 0.37367130338522203, + "grad_norm": 709.8063354492188, + "learning_rate": 3.969745516611424e-05, + "loss": 55.8991, + "step": 92490 + }, + { + "epoch": 0.37371170465058967, + "grad_norm": 840.8602294921875, + "learning_rate": 3.969463130731183e-05, + "loss": 68.27, + "step": 92500 + }, + { + "epoch": 0.3737521059159573, + "grad_norm": 1879.0968017578125, + "learning_rate": 3.969180716203002e-05, + "loss": 98.6429, + "step": 92510 + }, + { + "epoch": 0.3737925071813249, + "grad_norm": 925.2473754882812, + "learning_rate": 3.9688982730323865e-05, + "loss": 74.3842, + "step": 92520 + }, + { + "epoch": 0.37383290844669254, + "grad_norm": 1487.61962890625, + "learning_rate": 3.968615801224843e-05, + "loss": 81.2509, + "step": 92530 + }, + { + "epoch": 0.3738733097120602, + "grad_norm": 585.5298461914062, + "learning_rate": 3.968333300785878e-05, + "loss": 89.9086, + "step": 92540 + }, + { + "epoch": 0.3739137109774278, + "grad_norm": 598.8243408203125, + "learning_rate": 3.968050771720999e-05, + "loss": 52.4405, + "step": 92550 + }, + { + "epoch": 0.37395411224279546, + "grad_norm": 451.022705078125, + "learning_rate": 3.967768214035715e-05, + "loss": 71.7542, + "step": 92560 + }, + { + "epoch": 0.3739945135081631, + "grad_norm": 768.9769287109375, + "learning_rate": 3.967485627735534e-05, + "loss": 92.9239, + "step": 92570 + }, + { + "epoch": 0.3740349147735307, + "grad_norm": 854.9164428710938, + "learning_rate": 3.967203012825965e-05, + "loss": 71.4999, + "step": 92580 + }, + { + "epoch": 0.3740753160388983, + "grad_norm": 471.7916259765625, + "learning_rate": 3.966920369312518e-05, + "loss": 47.2625, + "step": 92590 + }, + { + "epoch": 0.37411571730426596, + "grad_norm": 478.0005187988281, + "learning_rate": 3.966637697200703e-05, + "loss": 56.9574, + "step": 92600 + }, + { + "epoch": 0.3741561185696336, + "grad_norm": 968.0185546875, + "learning_rate": 3.9663549964960314e-05, + "loss": 62.3624, + "step": 92610 + }, + { + "epoch": 0.37419651983500124, + "grad_norm": 867.2205810546875, + "learning_rate": 3.966072267204014e-05, + "loss": 33.5701, + "step": 92620 + }, + { + "epoch": 0.3742369211003689, + "grad_norm": 941.1777954101562, + "learning_rate": 3.965789509330163e-05, + "loss": 78.525, + "step": 92630 + }, + { + "epoch": 0.3742773223657365, + "grad_norm": 612.6570434570312, + "learning_rate": 3.965506722879991e-05, + "loss": 40.913, + "step": 92640 + }, + { + "epoch": 0.3743177236311041, + "grad_norm": 127.48741149902344, + "learning_rate": 3.965223907859011e-05, + "loss": 63.8331, + "step": 92650 + }, + { + "epoch": 0.37435812489647174, + "grad_norm": 443.2701721191406, + "learning_rate": 3.964941064272736e-05, + "loss": 71.8824, + "step": 92660 + }, + { + "epoch": 0.3743985261618394, + "grad_norm": 431.3937072753906, + "learning_rate": 3.9646581921266815e-05, + "loss": 74.9664, + "step": 92670 + }, + { + "epoch": 0.374438927427207, + "grad_norm": 852.2191772460938, + "learning_rate": 3.964375291426361e-05, + "loss": 51.0597, + "step": 92680 + }, + { + "epoch": 0.37447932869257466, + "grad_norm": 493.4492492675781, + "learning_rate": 3.96409236217729e-05, + "loss": 39.8483, + "step": 92690 + }, + { + "epoch": 0.3745197299579423, + "grad_norm": 363.88836669921875, + "learning_rate": 3.963809404384985e-05, + "loss": 44.7268, + "step": 92700 + }, + { + "epoch": 0.3745601312233099, + "grad_norm": 2506.282470703125, + "learning_rate": 3.9635264180549624e-05, + "loss": 79.4526, + "step": 92710 + }, + { + "epoch": 0.37460053248867753, + "grad_norm": 1399.570556640625, + "learning_rate": 3.963243403192739e-05, + "loss": 62.9623, + "step": 92720 + }, + { + "epoch": 0.37464093375404517, + "grad_norm": 502.1692810058594, + "learning_rate": 3.9629603598038314e-05, + "loss": 73.1174, + "step": 92730 + }, + { + "epoch": 0.3746813350194128, + "grad_norm": 478.8690185546875, + "learning_rate": 3.962677287893758e-05, + "loss": 66.815, + "step": 92740 + }, + { + "epoch": 0.37472173628478045, + "grad_norm": 563.9537963867188, + "learning_rate": 3.962394187468039e-05, + "loss": 72.0108, + "step": 92750 + }, + { + "epoch": 0.3747621375501481, + "grad_norm": 761.1668090820312, + "learning_rate": 3.962111058532192e-05, + "loss": 42.6949, + "step": 92760 + }, + { + "epoch": 0.37480253881551573, + "grad_norm": 795.1704711914062, + "learning_rate": 3.961827901091737e-05, + "loss": 58.1634, + "step": 92770 + }, + { + "epoch": 0.3748429400808833, + "grad_norm": 811.5784301757812, + "learning_rate": 3.9615447151521945e-05, + "loss": 75.2723, + "step": 92780 + }, + { + "epoch": 0.37488334134625095, + "grad_norm": 444.902099609375, + "learning_rate": 3.961261500719085e-05, + "loss": 62.2803, + "step": 92790 + }, + { + "epoch": 0.3749237426116186, + "grad_norm": 530.8676147460938, + "learning_rate": 3.960978257797931e-05, + "loss": 62.1482, + "step": 92800 + }, + { + "epoch": 0.37496414387698623, + "grad_norm": 230.55746459960938, + "learning_rate": 3.9606949863942526e-05, + "loss": 60.5643, + "step": 92810 + }, + { + "epoch": 0.3750045451423539, + "grad_norm": 656.6939697265625, + "learning_rate": 3.960411686513574e-05, + "loss": 49.9396, + "step": 92820 + }, + { + "epoch": 0.3750449464077215, + "grad_norm": 1161.34765625, + "learning_rate": 3.960128358161418e-05, + "loss": 50.9719, + "step": 92830 + }, + { + "epoch": 0.3750853476730891, + "grad_norm": 768.6620483398438, + "learning_rate": 3.9598450013433075e-05, + "loss": 68.8406, + "step": 92840 + }, + { + "epoch": 0.37512574893845674, + "grad_norm": 512.8631591796875, + "learning_rate": 3.9595616160647674e-05, + "loss": 61.0759, + "step": 92850 + }, + { + "epoch": 0.3751661502038244, + "grad_norm": 341.27178955078125, + "learning_rate": 3.959278202331322e-05, + "loss": 50.0951, + "step": 92860 + }, + { + "epoch": 0.375206551469192, + "grad_norm": 1139.0494384765625, + "learning_rate": 3.9589947601484974e-05, + "loss": 72.3378, + "step": 92870 + }, + { + "epoch": 0.37524695273455966, + "grad_norm": 703.4913940429688, + "learning_rate": 3.9587112895218184e-05, + "loss": 57.7523, + "step": 92880 + }, + { + "epoch": 0.3752873539999273, + "grad_norm": 1080.4427490234375, + "learning_rate": 3.958427790456811e-05, + "loss": 68.2021, + "step": 92890 + }, + { + "epoch": 0.3753277552652949, + "grad_norm": 669.9296264648438, + "learning_rate": 3.958144262959004e-05, + "loss": 71.9688, + "step": 92900 + }, + { + "epoch": 0.3753681565306625, + "grad_norm": 945.6260986328125, + "learning_rate": 3.9578607070339235e-05, + "loss": 72.3592, + "step": 92910 + }, + { + "epoch": 0.37540855779603016, + "grad_norm": 310.77203369140625, + "learning_rate": 3.957577122687098e-05, + "loss": 53.5487, + "step": 92920 + }, + { + "epoch": 0.3754489590613978, + "grad_norm": 602.9820556640625, + "learning_rate": 3.957293509924056e-05, + "loss": 61.1102, + "step": 92930 + }, + { + "epoch": 0.37548936032676544, + "grad_norm": 257.7418518066406, + "learning_rate": 3.9570098687503274e-05, + "loss": 56.4356, + "step": 92940 + }, + { + "epoch": 0.3755297615921331, + "grad_norm": 896.652587890625, + "learning_rate": 3.9567261991714404e-05, + "loss": 63.6779, + "step": 92950 + }, + { + "epoch": 0.3755701628575007, + "grad_norm": 1012.0924072265625, + "learning_rate": 3.9564425011929265e-05, + "loss": 68.4286, + "step": 92960 + }, + { + "epoch": 0.3756105641228683, + "grad_norm": 525.5440673828125, + "learning_rate": 3.956158774820316e-05, + "loss": 53.852, + "step": 92970 + }, + { + "epoch": 0.37565096538823595, + "grad_norm": 657.0477905273438, + "learning_rate": 3.955875020059141e-05, + "loss": 79.606, + "step": 92980 + }, + { + "epoch": 0.3756913666536036, + "grad_norm": 1086.502197265625, + "learning_rate": 3.955591236914933e-05, + "loss": 46.6891, + "step": 92990 + }, + { + "epoch": 0.3757317679189712, + "grad_norm": 1276.398193359375, + "learning_rate": 3.955307425393224e-05, + "loss": 83.8669, + "step": 93000 + }, + { + "epoch": 0.37577216918433887, + "grad_norm": 1241.0797119140625, + "learning_rate": 3.955023585499547e-05, + "loss": 65.6787, + "step": 93010 + }, + { + "epoch": 0.3758125704497065, + "grad_norm": 516.9462890625, + "learning_rate": 3.954739717239437e-05, + "loss": 80.84, + "step": 93020 + }, + { + "epoch": 0.3758529717150741, + "grad_norm": 475.4250183105469, + "learning_rate": 3.954455820618427e-05, + "loss": 72.3098, + "step": 93030 + }, + { + "epoch": 0.37589337298044173, + "grad_norm": 950.8383178710938, + "learning_rate": 3.954171895642052e-05, + "loss": 76.2437, + "step": 93040 + }, + { + "epoch": 0.37593377424580937, + "grad_norm": 631.9634399414062, + "learning_rate": 3.953887942315847e-05, + "loss": 42.5692, + "step": 93050 + }, + { + "epoch": 0.375974175511177, + "grad_norm": 942.7606201171875, + "learning_rate": 3.953603960645349e-05, + "loss": 79.1041, + "step": 93060 + }, + { + "epoch": 0.37601457677654465, + "grad_norm": 1219.8829345703125, + "learning_rate": 3.953319950636092e-05, + "loss": 107.0019, + "step": 93070 + }, + { + "epoch": 0.3760549780419123, + "grad_norm": 688.3053588867188, + "learning_rate": 3.953035912293616e-05, + "loss": 64.1869, + "step": 93080 + }, + { + "epoch": 0.37609537930727993, + "grad_norm": 632.4610595703125, + "learning_rate": 3.9527518456234544e-05, + "loss": 57.2202, + "step": 93090 + }, + { + "epoch": 0.3761357805726475, + "grad_norm": 323.7148742675781, + "learning_rate": 3.95246775063115e-05, + "loss": 45.6848, + "step": 93100 + }, + { + "epoch": 0.37617618183801516, + "grad_norm": 789.0226440429688, + "learning_rate": 3.952183627322238e-05, + "loss": 76.8052, + "step": 93110 + }, + { + "epoch": 0.3762165831033828, + "grad_norm": 631.6780395507812, + "learning_rate": 3.951899475702259e-05, + "loss": 57.1333, + "step": 93120 + }, + { + "epoch": 0.37625698436875044, + "grad_norm": 1251.869140625, + "learning_rate": 3.951615295776752e-05, + "loss": 120.3193, + "step": 93130 + }, + { + "epoch": 0.3762973856341181, + "grad_norm": 525.0264892578125, + "learning_rate": 3.951331087551257e-05, + "loss": 77.0742, + "step": 93140 + }, + { + "epoch": 0.3763377868994857, + "grad_norm": 927.7319946289062, + "learning_rate": 3.951046851031315e-05, + "loss": 68.461, + "step": 93150 + }, + { + "epoch": 0.3763781881648533, + "grad_norm": 388.8170471191406, + "learning_rate": 3.950762586222468e-05, + "loss": 33.0051, + "step": 93160 + }, + { + "epoch": 0.37641858943022094, + "grad_norm": 462.57025146484375, + "learning_rate": 3.950478293130258e-05, + "loss": 57.6939, + "step": 93170 + }, + { + "epoch": 0.3764589906955886, + "grad_norm": 818.913818359375, + "learning_rate": 3.950193971760226e-05, + "loss": 47.7335, + "step": 93180 + }, + { + "epoch": 0.3764993919609562, + "grad_norm": 2200.9609375, + "learning_rate": 3.949909622117918e-05, + "loss": 64.2052, + "step": 93190 + }, + { + "epoch": 0.37653979322632386, + "grad_norm": 908.902099609375, + "learning_rate": 3.9496252442088733e-05, + "loss": 36.8062, + "step": 93200 + }, + { + "epoch": 0.3765801944916915, + "grad_norm": 740.1572265625, + "learning_rate": 3.949340838038639e-05, + "loss": 68.5629, + "step": 93210 + }, + { + "epoch": 0.3766205957570591, + "grad_norm": 1135.758544921875, + "learning_rate": 3.949056403612758e-05, + "loss": 62.5657, + "step": 93220 + }, + { + "epoch": 0.3766609970224267, + "grad_norm": 2026.0989990234375, + "learning_rate": 3.9487719409367774e-05, + "loss": 74.2239, + "step": 93230 + }, + { + "epoch": 0.37670139828779436, + "grad_norm": 864.1495361328125, + "learning_rate": 3.948487450016242e-05, + "loss": 92.9084, + "step": 93240 + }, + { + "epoch": 0.376741799553162, + "grad_norm": 810.767333984375, + "learning_rate": 3.948202930856697e-05, + "loss": 80.3417, + "step": 93250 + }, + { + "epoch": 0.37678220081852964, + "grad_norm": 170.60748291015625, + "learning_rate": 3.947918383463691e-05, + "loss": 58.1501, + "step": 93260 + }, + { + "epoch": 0.3768226020838973, + "grad_norm": 521.3902587890625, + "learning_rate": 3.947633807842771e-05, + "loss": 24.7328, + "step": 93270 + }, + { + "epoch": 0.3768630033492649, + "grad_norm": 615.25048828125, + "learning_rate": 3.947349203999484e-05, + "loss": 69.8992, + "step": 93280 + }, + { + "epoch": 0.3769034046146325, + "grad_norm": 616.8837280273438, + "learning_rate": 3.9470645719393794e-05, + "loss": 48.8142, + "step": 93290 + }, + { + "epoch": 0.37694380588000015, + "grad_norm": 1704.8671875, + "learning_rate": 3.946779911668006e-05, + "loss": 62.0936, + "step": 93300 + }, + { + "epoch": 0.3769842071453678, + "grad_norm": 816.0055541992188, + "learning_rate": 3.9464952231909135e-05, + "loss": 56.5077, + "step": 93310 + }, + { + "epoch": 0.37702460841073543, + "grad_norm": 501.01641845703125, + "learning_rate": 3.946210506513651e-05, + "loss": 51.1529, + "step": 93320 + }, + { + "epoch": 0.37706500967610307, + "grad_norm": 1191.9462890625, + "learning_rate": 3.945925761641771e-05, + "loss": 56.0396, + "step": 93330 + }, + { + "epoch": 0.3771054109414707, + "grad_norm": 910.802734375, + "learning_rate": 3.945640988580824e-05, + "loss": 83.711, + "step": 93340 + }, + { + "epoch": 0.3771458122068383, + "grad_norm": 1067.3336181640625, + "learning_rate": 3.9453561873363615e-05, + "loss": 57.1342, + "step": 93350 + }, + { + "epoch": 0.37718621347220593, + "grad_norm": 894.1754150390625, + "learning_rate": 3.945071357913935e-05, + "loss": 53.6304, + "step": 93360 + }, + { + "epoch": 0.3772266147375736, + "grad_norm": 760.7733764648438, + "learning_rate": 3.9447865003191e-05, + "loss": 67.4268, + "step": 93370 + }, + { + "epoch": 0.3772670160029412, + "grad_norm": 771.8887939453125, + "learning_rate": 3.9445016145574074e-05, + "loss": 69.5092, + "step": 93380 + }, + { + "epoch": 0.37730741726830885, + "grad_norm": 459.5794372558594, + "learning_rate": 3.9442167006344124e-05, + "loss": 46.507, + "step": 93390 + }, + { + "epoch": 0.3773478185336765, + "grad_norm": 2548.902099609375, + "learning_rate": 3.943931758555669e-05, + "loss": 73.9357, + "step": 93400 + }, + { + "epoch": 0.37738821979904413, + "grad_norm": 1343.5716552734375, + "learning_rate": 3.9436467883267334e-05, + "loss": 52.8962, + "step": 93410 + }, + { + "epoch": 0.3774286210644117, + "grad_norm": 1176.486083984375, + "learning_rate": 3.9433617899531597e-05, + "loss": 71.4715, + "step": 93420 + }, + { + "epoch": 0.37746902232977936, + "grad_norm": 3818.821533203125, + "learning_rate": 3.943076763440505e-05, + "loss": 73.5444, + "step": 93430 + }, + { + "epoch": 0.377509423595147, + "grad_norm": 1363.684814453125, + "learning_rate": 3.942791708794326e-05, + "loss": 45.8224, + "step": 93440 + }, + { + "epoch": 0.37754982486051464, + "grad_norm": 602.17138671875, + "learning_rate": 3.9425066260201796e-05, + "loss": 56.5135, + "step": 93450 + }, + { + "epoch": 0.3775902261258823, + "grad_norm": 533.1290893554688, + "learning_rate": 3.942221515123623e-05, + "loss": 47.6091, + "step": 93460 + }, + { + "epoch": 0.3776306273912499, + "grad_norm": 868.53125, + "learning_rate": 3.941936376110217e-05, + "loss": 60.5942, + "step": 93470 + }, + { + "epoch": 0.3776710286566175, + "grad_norm": 581.7073364257812, + "learning_rate": 3.9416512089855184e-05, + "loss": 52.1139, + "step": 93480 + }, + { + "epoch": 0.37771142992198514, + "grad_norm": 983.8074951171875, + "learning_rate": 3.941366013755087e-05, + "loss": 46.983, + "step": 93490 + }, + { + "epoch": 0.3777518311873528, + "grad_norm": 1439.6392822265625, + "learning_rate": 3.941080790424484e-05, + "loss": 56.4244, + "step": 93500 + }, + { + "epoch": 0.3777922324527204, + "grad_norm": 1822.520751953125, + "learning_rate": 3.940795538999268e-05, + "loss": 77.834, + "step": 93510 + }, + { + "epoch": 0.37783263371808806, + "grad_norm": 437.8319396972656, + "learning_rate": 3.940510259485002e-05, + "loss": 47.9851, + "step": 93520 + }, + { + "epoch": 0.3778730349834557, + "grad_norm": 1477.3594970703125, + "learning_rate": 3.9402249518872456e-05, + "loss": 104.0979, + "step": 93530 + }, + { + "epoch": 0.3779134362488233, + "grad_norm": 517.4141235351562, + "learning_rate": 3.939939616211563e-05, + "loss": 46.596, + "step": 93540 + }, + { + "epoch": 0.3779538375141909, + "grad_norm": 673.5197143554688, + "learning_rate": 3.9396542524635175e-05, + "loss": 61.4171, + "step": 93550 + }, + { + "epoch": 0.37799423877955857, + "grad_norm": 679.6529541015625, + "learning_rate": 3.939368860648669e-05, + "loss": 50.3042, + "step": 93560 + }, + { + "epoch": 0.3780346400449262, + "grad_norm": 1323.72314453125, + "learning_rate": 3.939083440772585e-05, + "loss": 52.2302, + "step": 93570 + }, + { + "epoch": 0.37807504131029385, + "grad_norm": 1001.907958984375, + "learning_rate": 3.938797992840828e-05, + "loss": 66.2964, + "step": 93580 + }, + { + "epoch": 0.3781154425756615, + "grad_norm": 615.739990234375, + "learning_rate": 3.9385125168589635e-05, + "loss": 72.0375, + "step": 93590 + }, + { + "epoch": 0.3781558438410291, + "grad_norm": 190.5300750732422, + "learning_rate": 3.938227012832557e-05, + "loss": 38.2591, + "step": 93600 + }, + { + "epoch": 0.3781962451063967, + "grad_norm": 4424.119140625, + "learning_rate": 3.9379414807671736e-05, + "loss": 86.606, + "step": 93610 + }, + { + "epoch": 0.37823664637176435, + "grad_norm": 1783.222900390625, + "learning_rate": 3.937655920668382e-05, + "loss": 45.5719, + "step": 93620 + }, + { + "epoch": 0.378277047637132, + "grad_norm": 508.1633605957031, + "learning_rate": 3.937370332541747e-05, + "loss": 46.5244, + "step": 93630 + }, + { + "epoch": 0.37831744890249963, + "grad_norm": 1679.250732421875, + "learning_rate": 3.937084716392838e-05, + "loss": 105.5736, + "step": 93640 + }, + { + "epoch": 0.37835785016786727, + "grad_norm": 992.9168090820312, + "learning_rate": 3.936799072227222e-05, + "loss": 66.7025, + "step": 93650 + }, + { + "epoch": 0.3783982514332349, + "grad_norm": 709.3222045898438, + "learning_rate": 3.936513400050469e-05, + "loss": 49.8676, + "step": 93660 + }, + { + "epoch": 0.3784386526986025, + "grad_norm": 1248.9063720703125, + "learning_rate": 3.936227699868147e-05, + "loss": 67.6602, + "step": 93670 + }, + { + "epoch": 0.37847905396397014, + "grad_norm": 494.0985107421875, + "learning_rate": 3.9359419716858274e-05, + "loss": 52.3046, + "step": 93680 + }, + { + "epoch": 0.3785194552293378, + "grad_norm": 317.8280334472656, + "learning_rate": 3.9356562155090795e-05, + "loss": 37.2544, + "step": 93690 + }, + { + "epoch": 0.3785598564947054, + "grad_norm": 2666.53662109375, + "learning_rate": 3.935370431343475e-05, + "loss": 82.7156, + "step": 93700 + }, + { + "epoch": 0.37860025776007306, + "grad_norm": 1855.157958984375, + "learning_rate": 3.935084619194584e-05, + "loss": 79.1015, + "step": 93710 + }, + { + "epoch": 0.3786406590254407, + "grad_norm": 621.322265625, + "learning_rate": 3.93479877906798e-05, + "loss": 57.4279, + "step": 93720 + }, + { + "epoch": 0.37868106029080834, + "grad_norm": 785.6000366210938, + "learning_rate": 3.934512910969235e-05, + "loss": 75.8258, + "step": 93730 + }, + { + "epoch": 0.3787214615561759, + "grad_norm": 507.967041015625, + "learning_rate": 3.934227014903922e-05, + "loss": 63.3973, + "step": 93740 + }, + { + "epoch": 0.37876186282154356, + "grad_norm": 1563.5101318359375, + "learning_rate": 3.933941090877615e-05, + "loss": 52.865, + "step": 93750 + }, + { + "epoch": 0.3788022640869112, + "grad_norm": 571.6914672851562, + "learning_rate": 3.933655138895889e-05, + "loss": 61.8429, + "step": 93760 + }, + { + "epoch": 0.37884266535227884, + "grad_norm": 927.9751586914062, + "learning_rate": 3.9333691589643177e-05, + "loss": 51.3117, + "step": 93770 + }, + { + "epoch": 0.3788830666176465, + "grad_norm": 582.5885009765625, + "learning_rate": 3.9330831510884755e-05, + "loss": 85.4374, + "step": 93780 + }, + { + "epoch": 0.3789234678830141, + "grad_norm": 681.2293701171875, + "learning_rate": 3.932797115273941e-05, + "loss": 75.4987, + "step": 93790 + }, + { + "epoch": 0.3789638691483817, + "grad_norm": 720.52392578125, + "learning_rate": 3.932511051526289e-05, + "loss": 83.3338, + "step": 93800 + }, + { + "epoch": 0.37900427041374934, + "grad_norm": 888.4859008789062, + "learning_rate": 3.9322249598510955e-05, + "loss": 67.421, + "step": 93810 + }, + { + "epoch": 0.379044671679117, + "grad_norm": 1525.38134765625, + "learning_rate": 3.93193884025394e-05, + "loss": 85.0812, + "step": 93820 + }, + { + "epoch": 0.3790850729444846, + "grad_norm": 705.2191772460938, + "learning_rate": 3.931652692740399e-05, + "loss": 67.6637, + "step": 93830 + }, + { + "epoch": 0.37912547420985226, + "grad_norm": 1749.32568359375, + "learning_rate": 3.931366517316052e-05, + "loss": 68.8921, + "step": 93840 + }, + { + "epoch": 0.3791658754752199, + "grad_norm": 751.9481201171875, + "learning_rate": 3.9310803139864775e-05, + "loss": 123.6256, + "step": 93850 + }, + { + "epoch": 0.3792062767405875, + "grad_norm": 476.33636474609375, + "learning_rate": 3.9307940827572555e-05, + "loss": 50.916, + "step": 93860 + }, + { + "epoch": 0.37924667800595513, + "grad_norm": 267.0869140625, + "learning_rate": 3.9305078236339666e-05, + "loss": 57.5847, + "step": 93870 + }, + { + "epoch": 0.37928707927132277, + "grad_norm": 953.2073364257812, + "learning_rate": 3.930221536622191e-05, + "loss": 61.0358, + "step": 93880 + }, + { + "epoch": 0.3793274805366904, + "grad_norm": 630.3345336914062, + "learning_rate": 3.9299352217275105e-05, + "loss": 70.866, + "step": 93890 + }, + { + "epoch": 0.37936788180205805, + "grad_norm": 356.209228515625, + "learning_rate": 3.9296488789555066e-05, + "loss": 45.0516, + "step": 93900 + }, + { + "epoch": 0.3794082830674257, + "grad_norm": 410.72320556640625, + "learning_rate": 3.9293625083117616e-05, + "loss": 73.401, + "step": 93910 + }, + { + "epoch": 0.37944868433279333, + "grad_norm": 855.3485107421875, + "learning_rate": 3.9290761098018585e-05, + "loss": 55.5836, + "step": 93920 + }, + { + "epoch": 0.3794890855981609, + "grad_norm": 787.3507690429688, + "learning_rate": 3.928789683431381e-05, + "loss": 55.8925, + "step": 93930 + }, + { + "epoch": 0.37952948686352855, + "grad_norm": 600.7017822265625, + "learning_rate": 3.928503229205913e-05, + "loss": 60.9399, + "step": 93940 + }, + { + "epoch": 0.3795698881288962, + "grad_norm": 561.0108032226562, + "learning_rate": 3.928216747131039e-05, + "loss": 69.8876, + "step": 93950 + }, + { + "epoch": 0.37961028939426383, + "grad_norm": 1152.1119384765625, + "learning_rate": 3.927930237212345e-05, + "loss": 76.2245, + "step": 93960 + }, + { + "epoch": 0.3796506906596315, + "grad_norm": 743.5120239257812, + "learning_rate": 3.9276436994554144e-05, + "loss": 57.824, + "step": 93970 + }, + { + "epoch": 0.3796910919249991, + "grad_norm": 733.8156127929688, + "learning_rate": 3.927357133865836e-05, + "loss": 42.9731, + "step": 93980 + }, + { + "epoch": 0.3797314931903667, + "grad_norm": 431.87158203125, + "learning_rate": 3.927070540449195e-05, + "loss": 63.5863, + "step": 93990 + }, + { + "epoch": 0.37977189445573434, + "grad_norm": 751.9428100585938, + "learning_rate": 3.92678391921108e-05, + "loss": 78.9025, + "step": 94000 + }, + { + "epoch": 0.379812295721102, + "grad_norm": 767.0701293945312, + "learning_rate": 3.926497270157077e-05, + "loss": 42.9222, + "step": 94010 + }, + { + "epoch": 0.3798526969864696, + "grad_norm": 768.52197265625, + "learning_rate": 3.926210593292775e-05, + "loss": 48.6413, + "step": 94020 + }, + { + "epoch": 0.37989309825183726, + "grad_norm": 618.26806640625, + "learning_rate": 3.925923888623764e-05, + "loss": 54.2861, + "step": 94030 + }, + { + "epoch": 0.3799334995172049, + "grad_norm": 647.1188354492188, + "learning_rate": 3.925637156155633e-05, + "loss": 70.6201, + "step": 94040 + }, + { + "epoch": 0.37997390078257254, + "grad_norm": 886.6661987304688, + "learning_rate": 3.925350395893971e-05, + "loss": 85.2603, + "step": 94050 + }, + { + "epoch": 0.3800143020479401, + "grad_norm": 1664.85693359375, + "learning_rate": 3.925063607844369e-05, + "loss": 55.2438, + "step": 94060 + }, + { + "epoch": 0.38005470331330776, + "grad_norm": 788.0062866210938, + "learning_rate": 3.9247767920124176e-05, + "loss": 73.1715, + "step": 94070 + }, + { + "epoch": 0.3800951045786754, + "grad_norm": 435.3290710449219, + "learning_rate": 3.924489948403711e-05, + "loss": 70.93, + "step": 94080 + }, + { + "epoch": 0.38013550584404304, + "grad_norm": 143.54469299316406, + "learning_rate": 3.924203077023839e-05, + "loss": 31.3308, + "step": 94090 + }, + { + "epoch": 0.3801759071094107, + "grad_norm": 1035.6220703125, + "learning_rate": 3.923916177878394e-05, + "loss": 59.6959, + "step": 94100 + }, + { + "epoch": 0.3802163083747783, + "grad_norm": 1633.117431640625, + "learning_rate": 3.9236292509729697e-05, + "loss": 77.1727, + "step": 94110 + }, + { + "epoch": 0.3802567096401459, + "grad_norm": 421.79974365234375, + "learning_rate": 3.9233422963131616e-05, + "loss": 45.4822, + "step": 94120 + }, + { + "epoch": 0.38029711090551355, + "grad_norm": 953.0307006835938, + "learning_rate": 3.9230553139045617e-05, + "loss": 98.6893, + "step": 94130 + }, + { + "epoch": 0.3803375121708812, + "grad_norm": 1626.166015625, + "learning_rate": 3.922768303752766e-05, + "loss": 73.0103, + "step": 94140 + }, + { + "epoch": 0.3803779134362488, + "grad_norm": 1201.143310546875, + "learning_rate": 3.92248126586337e-05, + "loss": 49.0338, + "step": 94150 + }, + { + "epoch": 0.38041831470161647, + "grad_norm": 1199.5015869140625, + "learning_rate": 3.922194200241969e-05, + "loss": 49.4027, + "step": 94160 + }, + { + "epoch": 0.3804587159669841, + "grad_norm": 948.0160522460938, + "learning_rate": 3.92190710689416e-05, + "loss": 47.4217, + "step": 94170 + }, + { + "epoch": 0.3804991172323517, + "grad_norm": 1361.0826416015625, + "learning_rate": 3.92161998582554e-05, + "loss": 68.6793, + "step": 94180 + }, + { + "epoch": 0.38053951849771933, + "grad_norm": 691.0305786132812, + "learning_rate": 3.9213328370417065e-05, + "loss": 86.5888, + "step": 94190 + }, + { + "epoch": 0.38057991976308697, + "grad_norm": 448.7333984375, + "learning_rate": 3.9210456605482576e-05, + "loss": 45.7264, + "step": 94200 + }, + { + "epoch": 0.3806203210284546, + "grad_norm": 926.4880981445312, + "learning_rate": 3.920758456350792e-05, + "loss": 73.8021, + "step": 94210 + }, + { + "epoch": 0.38066072229382225, + "grad_norm": 542.78662109375, + "learning_rate": 3.9204712244549085e-05, + "loss": 65.7361, + "step": 94220 + }, + { + "epoch": 0.3807011235591899, + "grad_norm": 3470.885986328125, + "learning_rate": 3.9201839648662074e-05, + "loss": 63.5717, + "step": 94230 + }, + { + "epoch": 0.38074152482455753, + "grad_norm": 596.5708618164062, + "learning_rate": 3.919896677590289e-05, + "loss": 69.1542, + "step": 94240 + }, + { + "epoch": 0.3807819260899251, + "grad_norm": 797.0637817382812, + "learning_rate": 3.919609362632753e-05, + "loss": 106.799, + "step": 94250 + }, + { + "epoch": 0.38082232735529276, + "grad_norm": 641.5678100585938, + "learning_rate": 3.9193220199992025e-05, + "loss": 42.721, + "step": 94260 + }, + { + "epoch": 0.3808627286206604, + "grad_norm": 370.0324401855469, + "learning_rate": 3.919034649695238e-05, + "loss": 47.1425, + "step": 94270 + }, + { + "epoch": 0.38090312988602804, + "grad_norm": 1143.82861328125, + "learning_rate": 3.918747251726463e-05, + "loss": 51.9892, + "step": 94280 + }, + { + "epoch": 0.3809435311513957, + "grad_norm": 859.6338500976562, + "learning_rate": 3.9184598260984795e-05, + "loss": 64.1427, + "step": 94290 + }, + { + "epoch": 0.3809839324167633, + "grad_norm": 423.8070068359375, + "learning_rate": 3.9181723728168916e-05, + "loss": 75.3932, + "step": 94300 + }, + { + "epoch": 0.3810243336821309, + "grad_norm": 945.8095703125, + "learning_rate": 3.9178848918873027e-05, + "loss": 47.6868, + "step": 94310 + }, + { + "epoch": 0.38106473494749854, + "grad_norm": 1396.765625, + "learning_rate": 3.9175973833153186e-05, + "loss": 53.5173, + "step": 94320 + }, + { + "epoch": 0.3811051362128662, + "grad_norm": 1253.411376953125, + "learning_rate": 3.9173098471065434e-05, + "loss": 48.6057, + "step": 94330 + }, + { + "epoch": 0.3811455374782338, + "grad_norm": 627.4695434570312, + "learning_rate": 3.9170222832665825e-05, + "loss": 68.0112, + "step": 94340 + }, + { + "epoch": 0.38118593874360146, + "grad_norm": 918.7919311523438, + "learning_rate": 3.9167346918010425e-05, + "loss": 102.3109, + "step": 94350 + }, + { + "epoch": 0.3812263400089691, + "grad_norm": 1036.47265625, + "learning_rate": 3.9164470727155314e-05, + "loss": 77.9767, + "step": 94360 + }, + { + "epoch": 0.3812667412743367, + "grad_norm": 1388.8292236328125, + "learning_rate": 3.916159426015655e-05, + "loss": 74.0313, + "step": 94370 + }, + { + "epoch": 0.3813071425397043, + "grad_norm": 761.83203125, + "learning_rate": 3.9158717517070214e-05, + "loss": 58.2245, + "step": 94380 + }, + { + "epoch": 0.38134754380507196, + "grad_norm": 416.04351806640625, + "learning_rate": 3.915584049795239e-05, + "loss": 50.2299, + "step": 94390 + }, + { + "epoch": 0.3813879450704396, + "grad_norm": 1973.710205078125, + "learning_rate": 3.915296320285917e-05, + "loss": 56.33, + "step": 94400 + }, + { + "epoch": 0.38142834633580724, + "grad_norm": 895.8132934570312, + "learning_rate": 3.915008563184664e-05, + "loss": 54.553, + "step": 94410 + }, + { + "epoch": 0.3814687476011749, + "grad_norm": 521.8209838867188, + "learning_rate": 3.9147207784970914e-05, + "loss": 69.8902, + "step": 94420 + }, + { + "epoch": 0.3815091488665425, + "grad_norm": 1060.228515625, + "learning_rate": 3.914432966228808e-05, + "loss": 72.3957, + "step": 94430 + }, + { + "epoch": 0.3815495501319101, + "grad_norm": 478.51885986328125, + "learning_rate": 3.914145126385426e-05, + "loss": 68.0388, + "step": 94440 + }, + { + "epoch": 0.38158995139727775, + "grad_norm": 855.6203002929688, + "learning_rate": 3.9138572589725576e-05, + "loss": 52.0944, + "step": 94450 + }, + { + "epoch": 0.3816303526626454, + "grad_norm": 2904.56298828125, + "learning_rate": 3.9135693639958125e-05, + "loss": 50.8609, + "step": 94460 + }, + { + "epoch": 0.38167075392801303, + "grad_norm": 1457.2967529296875, + "learning_rate": 3.913281441460806e-05, + "loss": 86.8889, + "step": 94470 + }, + { + "epoch": 0.38171115519338067, + "grad_norm": 745.7070922851562, + "learning_rate": 3.91299349137315e-05, + "loss": 63.1558, + "step": 94480 + }, + { + "epoch": 0.3817515564587483, + "grad_norm": 832.9826049804688, + "learning_rate": 3.912705513738458e-05, + "loss": 59.3702, + "step": 94490 + }, + { + "epoch": 0.3817919577241159, + "grad_norm": 768.7559204101562, + "learning_rate": 3.912417508562345e-05, + "loss": 65.5276, + "step": 94500 + }, + { + "epoch": 0.38183235898948353, + "grad_norm": 657.02734375, + "learning_rate": 3.912129475850426e-05, + "loss": 71.6129, + "step": 94510 + }, + { + "epoch": 0.3818727602548512, + "grad_norm": 1496.1007080078125, + "learning_rate": 3.911841415608315e-05, + "loss": 83.0548, + "step": 94520 + }, + { + "epoch": 0.3819131615202188, + "grad_norm": 1321.64697265625, + "learning_rate": 3.911553327841629e-05, + "loss": 42.8167, + "step": 94530 + }, + { + "epoch": 0.38195356278558645, + "grad_norm": 434.47039794921875, + "learning_rate": 3.9112652125559845e-05, + "loss": 28.5604, + "step": 94540 + }, + { + "epoch": 0.3819939640509541, + "grad_norm": 1304.2777099609375, + "learning_rate": 3.910977069756998e-05, + "loss": 54.2573, + "step": 94550 + }, + { + "epoch": 0.38203436531632173, + "grad_norm": 1068.87939453125, + "learning_rate": 3.9106888994502864e-05, + "loss": 34.1891, + "step": 94560 + }, + { + "epoch": 0.3820747665816893, + "grad_norm": 4277.7587890625, + "learning_rate": 3.9104007016414695e-05, + "loss": 87.1632, + "step": 94570 + }, + { + "epoch": 0.38211516784705696, + "grad_norm": 96.375244140625, + "learning_rate": 3.910112476336164e-05, + "loss": 76.1936, + "step": 94580 + }, + { + "epoch": 0.3821555691124246, + "grad_norm": 775.3291015625, + "learning_rate": 3.90982422353999e-05, + "loss": 74.7171, + "step": 94590 + }, + { + "epoch": 0.38219597037779224, + "grad_norm": 696.175537109375, + "learning_rate": 3.909535943258567e-05, + "loss": 54.9566, + "step": 94600 + }, + { + "epoch": 0.3822363716431599, + "grad_norm": 724.5578002929688, + "learning_rate": 3.909247635497516e-05, + "loss": 77.6065, + "step": 94610 + }, + { + "epoch": 0.3822767729085275, + "grad_norm": 393.4891662597656, + "learning_rate": 3.9089593002624555e-05, + "loss": 65.3597, + "step": 94620 + }, + { + "epoch": 0.3823171741738951, + "grad_norm": 1047.53662109375, + "learning_rate": 3.908670937559008e-05, + "loss": 63.4875, + "step": 94630 + }, + { + "epoch": 0.38235757543926274, + "grad_norm": 1051.636962890625, + "learning_rate": 3.908382547392796e-05, + "loss": 39.8985, + "step": 94640 + }, + { + "epoch": 0.3823979767046304, + "grad_norm": 705.0889892578125, + "learning_rate": 3.908094129769442e-05, + "loss": 52.4874, + "step": 94650 + }, + { + "epoch": 0.382438377969998, + "grad_norm": 1593.9310302734375, + "learning_rate": 3.907805684694566e-05, + "loss": 82.5799, + "step": 94660 + }, + { + "epoch": 0.38247877923536566, + "grad_norm": 462.7720947265625, + "learning_rate": 3.9075172121737945e-05, + "loss": 48.7856, + "step": 94670 + }, + { + "epoch": 0.3825191805007333, + "grad_norm": 1333.7330322265625, + "learning_rate": 3.907228712212751e-05, + "loss": 88.1809, + "step": 94680 + }, + { + "epoch": 0.3825595817661009, + "grad_norm": 606.8469848632812, + "learning_rate": 3.906940184817057e-05, + "loss": 63.5642, + "step": 94690 + }, + { + "epoch": 0.3825999830314685, + "grad_norm": 280.5736389160156, + "learning_rate": 3.906651629992342e-05, + "loss": 54.9645, + "step": 94700 + }, + { + "epoch": 0.38264038429683617, + "grad_norm": 1871.5345458984375, + "learning_rate": 3.906363047744229e-05, + "loss": 162.7685, + "step": 94710 + }, + { + "epoch": 0.3826807855622038, + "grad_norm": 605.5426025390625, + "learning_rate": 3.9060744380783435e-05, + "loss": 43.6138, + "step": 94720 + }, + { + "epoch": 0.38272118682757145, + "grad_norm": 2624.923583984375, + "learning_rate": 3.9057858010003137e-05, + "loss": 108.0628, + "step": 94730 + }, + { + "epoch": 0.3827615880929391, + "grad_norm": 978.3139038085938, + "learning_rate": 3.905497136515766e-05, + "loss": 51.3885, + "step": 94740 + }, + { + "epoch": 0.3828019893583067, + "grad_norm": 2022.71435546875, + "learning_rate": 3.905208444630327e-05, + "loss": 58.5064, + "step": 94750 + }, + { + "epoch": 0.3828423906236743, + "grad_norm": 395.371826171875, + "learning_rate": 3.9049197253496264e-05, + "loss": 60.7483, + "step": 94760 + }, + { + "epoch": 0.38288279188904195, + "grad_norm": 848.9832153320312, + "learning_rate": 3.904630978679292e-05, + "loss": 57.199, + "step": 94770 + }, + { + "epoch": 0.3829231931544096, + "grad_norm": 1061.2220458984375, + "learning_rate": 3.9043422046249544e-05, + "loss": 64.7823, + "step": 94780 + }, + { + "epoch": 0.38296359441977723, + "grad_norm": 824.2564086914062, + "learning_rate": 3.904053403192242e-05, + "loss": 58.263, + "step": 94790 + }, + { + "epoch": 0.38300399568514487, + "grad_norm": 1173.652099609375, + "learning_rate": 3.903764574386786e-05, + "loss": 80.1155, + "step": 94800 + }, + { + "epoch": 0.3830443969505125, + "grad_norm": 596.0165405273438, + "learning_rate": 3.903475718214217e-05, + "loss": 64.6889, + "step": 94810 + }, + { + "epoch": 0.3830847982158801, + "grad_norm": 544.8338012695312, + "learning_rate": 3.9031868346801656e-05, + "loss": 90.0342, + "step": 94820 + }, + { + "epoch": 0.38312519948124774, + "grad_norm": 582.4503173828125, + "learning_rate": 3.902897923790265e-05, + "loss": 67.4698, + "step": 94830 + }, + { + "epoch": 0.3831656007466154, + "grad_norm": 1065.5283203125, + "learning_rate": 3.902608985550147e-05, + "loss": 64.2469, + "step": 94840 + }, + { + "epoch": 0.383206002011983, + "grad_norm": 718.9613037109375, + "learning_rate": 3.902320019965445e-05, + "loss": 50.7413, + "step": 94850 + }, + { + "epoch": 0.38324640327735066, + "grad_norm": 497.2271423339844, + "learning_rate": 3.902031027041793e-05, + "loss": 46.6418, + "step": 94860 + }, + { + "epoch": 0.3832868045427183, + "grad_norm": 767.0538330078125, + "learning_rate": 3.901742006784822e-05, + "loss": 86.6283, + "step": 94870 + }, + { + "epoch": 0.38332720580808594, + "grad_norm": 101.88063049316406, + "learning_rate": 3.9014529592001705e-05, + "loss": 80.5774, + "step": 94880 + }, + { + "epoch": 0.3833676070734535, + "grad_norm": 479.095947265625, + "learning_rate": 3.901163884293472e-05, + "loss": 44.5601, + "step": 94890 + }, + { + "epoch": 0.38340800833882116, + "grad_norm": 560.9264526367188, + "learning_rate": 3.900874782070362e-05, + "loss": 68.0652, + "step": 94900 + }, + { + "epoch": 0.3834484096041888, + "grad_norm": 530.4874877929688, + "learning_rate": 3.900585652536477e-05, + "loss": 87.2366, + "step": 94910 + }, + { + "epoch": 0.38348881086955644, + "grad_norm": 779.4183959960938, + "learning_rate": 3.900296495697453e-05, + "loss": 70.1493, + "step": 94920 + }, + { + "epoch": 0.3835292121349241, + "grad_norm": 0.0, + "learning_rate": 3.9000073115589286e-05, + "loss": 59.7547, + "step": 94930 + }, + { + "epoch": 0.3835696134002917, + "grad_norm": 516.5095825195312, + "learning_rate": 3.899718100126541e-05, + "loss": 50.7623, + "step": 94940 + }, + { + "epoch": 0.3836100146656593, + "grad_norm": 569.387939453125, + "learning_rate": 3.899428861405928e-05, + "loss": 83.759, + "step": 94950 + }, + { + "epoch": 0.38365041593102694, + "grad_norm": 586.040771484375, + "learning_rate": 3.899139595402729e-05, + "loss": 42.7253, + "step": 94960 + }, + { + "epoch": 0.3836908171963946, + "grad_norm": 2201.12646484375, + "learning_rate": 3.898850302122583e-05, + "loss": 56.9618, + "step": 94970 + }, + { + "epoch": 0.3837312184617622, + "grad_norm": 511.3911437988281, + "learning_rate": 3.898560981571131e-05, + "loss": 55.5062, + "step": 94980 + }, + { + "epoch": 0.38377161972712986, + "grad_norm": 383.2709045410156, + "learning_rate": 3.8982716337540115e-05, + "loss": 60.895, + "step": 94990 + }, + { + "epoch": 0.3838120209924975, + "grad_norm": 768.9598388671875, + "learning_rate": 3.897982258676867e-05, + "loss": 59.8323, + "step": 95000 + }, + { + "epoch": 0.3838524222578651, + "grad_norm": 1006.3447875976562, + "learning_rate": 3.897692856345339e-05, + "loss": 43.6415, + "step": 95010 + }, + { + "epoch": 0.38389282352323273, + "grad_norm": 774.2186889648438, + "learning_rate": 3.897403426765069e-05, + "loss": 101.709, + "step": 95020 + }, + { + "epoch": 0.38393322478860037, + "grad_norm": 1448.4361572265625, + "learning_rate": 3.8971139699417e-05, + "loss": 91.2065, + "step": 95030 + }, + { + "epoch": 0.383973626053968, + "grad_norm": 771.1356811523438, + "learning_rate": 3.896824485880874e-05, + "loss": 60.503, + "step": 95040 + }, + { + "epoch": 0.38401402731933565, + "grad_norm": 726.0272827148438, + "learning_rate": 3.8965349745882365e-05, + "loss": 70.719, + "step": 95050 + }, + { + "epoch": 0.3840544285847033, + "grad_norm": 791.23193359375, + "learning_rate": 3.896245436069431e-05, + "loss": 83.7272, + "step": 95060 + }, + { + "epoch": 0.38409482985007093, + "grad_norm": 805.988037109375, + "learning_rate": 3.8959558703301015e-05, + "loss": 62.7216, + "step": 95070 + }, + { + "epoch": 0.3841352311154385, + "grad_norm": 318.8636169433594, + "learning_rate": 3.895666277375892e-05, + "loss": 59.8062, + "step": 95080 + }, + { + "epoch": 0.38417563238080615, + "grad_norm": 0.0, + "learning_rate": 3.8953766572124515e-05, + "loss": 46.7181, + "step": 95090 + }, + { + "epoch": 0.3842160336461738, + "grad_norm": 616.468017578125, + "learning_rate": 3.895087009845425e-05, + "loss": 55.656, + "step": 95100 + }, + { + "epoch": 0.38425643491154143, + "grad_norm": 872.3570556640625, + "learning_rate": 3.8947973352804584e-05, + "loss": 78.4981, + "step": 95110 + }, + { + "epoch": 0.3842968361769091, + "grad_norm": 765.6026000976562, + "learning_rate": 3.894507633523199e-05, + "loss": 54.1042, + "step": 95120 + }, + { + "epoch": 0.3843372374422767, + "grad_norm": 903.8804931640625, + "learning_rate": 3.894217904579296e-05, + "loss": 79.6192, + "step": 95130 + }, + { + "epoch": 0.3843776387076443, + "grad_norm": 725.60498046875, + "learning_rate": 3.8939281484543974e-05, + "loss": 84.4251, + "step": 95140 + }, + { + "epoch": 0.38441803997301194, + "grad_norm": 1229.2586669921875, + "learning_rate": 3.893638365154152e-05, + "loss": 125.7261, + "step": 95150 + }, + { + "epoch": 0.3844584412383796, + "grad_norm": 727.9944458007812, + "learning_rate": 3.8933485546842094e-05, + "loss": 55.8314, + "step": 95160 + }, + { + "epoch": 0.3844988425037472, + "grad_norm": 732.46337890625, + "learning_rate": 3.893058717050218e-05, + "loss": 43.0125, + "step": 95170 + }, + { + "epoch": 0.38453924376911486, + "grad_norm": 948.4493408203125, + "learning_rate": 3.892768852257831e-05, + "loss": 48.8631, + "step": 95180 + }, + { + "epoch": 0.3845796450344825, + "grad_norm": 395.610595703125, + "learning_rate": 3.892478960312698e-05, + "loss": 97.7245, + "step": 95190 + }, + { + "epoch": 0.38462004629985014, + "grad_norm": 480.57672119140625, + "learning_rate": 3.8921890412204705e-05, + "loss": 67.3274, + "step": 95200 + }, + { + "epoch": 0.3846604475652177, + "grad_norm": 601.1005859375, + "learning_rate": 3.891899094986801e-05, + "loss": 76.2722, + "step": 95210 + }, + { + "epoch": 0.38470084883058536, + "grad_norm": 1527.212646484375, + "learning_rate": 3.891609121617342e-05, + "loss": 30.4016, + "step": 95220 + }, + { + "epoch": 0.384741250095953, + "grad_norm": 598.6000366210938, + "learning_rate": 3.8913191211177464e-05, + "loss": 38.4764, + "step": 95230 + }, + { + "epoch": 0.38478165136132064, + "grad_norm": 1061.5889892578125, + "learning_rate": 3.891029093493669e-05, + "loss": 57.333, + "step": 95240 + }, + { + "epoch": 0.3848220526266883, + "grad_norm": 719.52392578125, + "learning_rate": 3.8907390387507625e-05, + "loss": 66.065, + "step": 95250 + }, + { + "epoch": 0.3848624538920559, + "grad_norm": 723.9736328125, + "learning_rate": 3.890448956894682e-05, + "loss": 41.1686, + "step": 95260 + }, + { + "epoch": 0.3849028551574235, + "grad_norm": 789.1625366210938, + "learning_rate": 3.8901588479310846e-05, + "loss": 112.5225, + "step": 95270 + }, + { + "epoch": 0.38494325642279115, + "grad_norm": 321.8541259765625, + "learning_rate": 3.889868711865624e-05, + "loss": 43.6828, + "step": 95280 + }, + { + "epoch": 0.3849836576881588, + "grad_norm": 449.05401611328125, + "learning_rate": 3.8895785487039574e-05, + "loss": 82.2265, + "step": 95290 + }, + { + "epoch": 0.3850240589535264, + "grad_norm": 677.4368286132812, + "learning_rate": 3.8892883584517415e-05, + "loss": 52.7545, + "step": 95300 + }, + { + "epoch": 0.38506446021889407, + "grad_norm": 960.1495361328125, + "learning_rate": 3.888998141114634e-05, + "loss": 51.1408, + "step": 95310 + }, + { + "epoch": 0.3851048614842617, + "grad_norm": 3223.043701171875, + "learning_rate": 3.8887078966982925e-05, + "loss": 75.4026, + "step": 95320 + }, + { + "epoch": 0.3851452627496293, + "grad_norm": 1701.433837890625, + "learning_rate": 3.888417625208376e-05, + "loss": 65.9606, + "step": 95330 + }, + { + "epoch": 0.38518566401499693, + "grad_norm": 1047.423583984375, + "learning_rate": 3.888127326650542e-05, + "loss": 53.6032, + "step": 95340 + }, + { + "epoch": 0.38522606528036457, + "grad_norm": 591.1990356445312, + "learning_rate": 3.887837001030452e-05, + "loss": 67.6553, + "step": 95350 + }, + { + "epoch": 0.3852664665457322, + "grad_norm": 1093.99365234375, + "learning_rate": 3.887546648353765e-05, + "loss": 44.322, + "step": 95360 + }, + { + "epoch": 0.38530686781109985, + "grad_norm": 545.5051879882812, + "learning_rate": 3.887256268626142e-05, + "loss": 76.8624, + "step": 95370 + }, + { + "epoch": 0.3853472690764675, + "grad_norm": 1276.1842041015625, + "learning_rate": 3.886965861853244e-05, + "loss": 48.7556, + "step": 95380 + }, + { + "epoch": 0.38538767034183513, + "grad_norm": 739.0816650390625, + "learning_rate": 3.886675428040732e-05, + "loss": 62.3025, + "step": 95390 + }, + { + "epoch": 0.3854280716072027, + "grad_norm": 655.6229858398438, + "learning_rate": 3.8863849671942685e-05, + "loss": 51.2894, + "step": 95400 + }, + { + "epoch": 0.38546847287257036, + "grad_norm": 835.5565795898438, + "learning_rate": 3.886094479319517e-05, + "loss": 63.1423, + "step": 95410 + }, + { + "epoch": 0.385508874137938, + "grad_norm": 983.1041259765625, + "learning_rate": 3.885803964422139e-05, + "loss": 65.168, + "step": 95420 + }, + { + "epoch": 0.38554927540330564, + "grad_norm": 928.8274536132812, + "learning_rate": 3.885513422507799e-05, + "loss": 54.2019, + "step": 95430 + }, + { + "epoch": 0.3855896766686733, + "grad_norm": 716.9531860351562, + "learning_rate": 3.885222853582163e-05, + "loss": 69.1313, + "step": 95440 + }, + { + "epoch": 0.3856300779340409, + "grad_norm": 1002.2490234375, + "learning_rate": 3.8849322576508934e-05, + "loss": 57.0536, + "step": 95450 + }, + { + "epoch": 0.3856704791994085, + "grad_norm": 721.2386474609375, + "learning_rate": 3.884641634719657e-05, + "loss": 75.6969, + "step": 95460 + }, + { + "epoch": 0.38571088046477614, + "grad_norm": 673.9929809570312, + "learning_rate": 3.884350984794118e-05, + "loss": 68.2813, + "step": 95470 + }, + { + "epoch": 0.3857512817301438, + "grad_norm": 728.1707153320312, + "learning_rate": 3.8840603078799445e-05, + "loss": 53.9483, + "step": 95480 + }, + { + "epoch": 0.3857916829955114, + "grad_norm": 823.33740234375, + "learning_rate": 3.883769603982803e-05, + "loss": 51.6572, + "step": 95490 + }, + { + "epoch": 0.38583208426087906, + "grad_norm": 1103.5313720703125, + "learning_rate": 3.883478873108361e-05, + "loss": 49.7478, + "step": 95500 + }, + { + "epoch": 0.3858724855262467, + "grad_norm": 1030.14404296875, + "learning_rate": 3.883188115262285e-05, + "loss": 52.5702, + "step": 95510 + }, + { + "epoch": 0.38591288679161434, + "grad_norm": 937.8312377929688, + "learning_rate": 3.8828973304502446e-05, + "loss": 66.6772, + "step": 95520 + }, + { + "epoch": 0.3859532880569819, + "grad_norm": 1410.5428466796875, + "learning_rate": 3.88260651867791e-05, + "loss": 55.7531, + "step": 95530 + }, + { + "epoch": 0.38599368932234956, + "grad_norm": 480.3839111328125, + "learning_rate": 3.8823156799509484e-05, + "loss": 61.0244, + "step": 95540 + }, + { + "epoch": 0.3860340905877172, + "grad_norm": 471.7522888183594, + "learning_rate": 3.8820248142750316e-05, + "loss": 86.2028, + "step": 95550 + }, + { + "epoch": 0.38607449185308484, + "grad_norm": 531.3958129882812, + "learning_rate": 3.881733921655829e-05, + "loss": 88.4188, + "step": 95560 + }, + { + "epoch": 0.3861148931184525, + "grad_norm": 878.5484008789062, + "learning_rate": 3.881443002099012e-05, + "loss": 64.7605, + "step": 95570 + }, + { + "epoch": 0.3861552943838201, + "grad_norm": 371.7911682128906, + "learning_rate": 3.8811520556102535e-05, + "loss": 51.486, + "step": 95580 + }, + { + "epoch": 0.3861956956491877, + "grad_norm": 529.8397216796875, + "learning_rate": 3.880861082195224e-05, + "loss": 72.443, + "step": 95590 + }, + { + "epoch": 0.38623609691455535, + "grad_norm": 988.7250366210938, + "learning_rate": 3.880570081859597e-05, + "loss": 83.2871, + "step": 95600 + }, + { + "epoch": 0.386276498179923, + "grad_norm": 433.3272705078125, + "learning_rate": 3.880279054609045e-05, + "loss": 45.9184, + "step": 95610 + }, + { + "epoch": 0.38631689944529063, + "grad_norm": 1444.31787109375, + "learning_rate": 3.8799880004492425e-05, + "loss": 67.2129, + "step": 95620 + }, + { + "epoch": 0.38635730071065827, + "grad_norm": 507.1636047363281, + "learning_rate": 3.879696919385864e-05, + "loss": 44.7271, + "step": 95630 + }, + { + "epoch": 0.3863977019760259, + "grad_norm": 1155.373046875, + "learning_rate": 3.879405811424583e-05, + "loss": 66.5983, + "step": 95640 + }, + { + "epoch": 0.3864381032413935, + "grad_norm": 490.9678955078125, + "learning_rate": 3.879114676571076e-05, + "loss": 60.9269, + "step": 95650 + }, + { + "epoch": 0.38647850450676113, + "grad_norm": 0.0, + "learning_rate": 3.878823514831018e-05, + "loss": 56.5793, + "step": 95660 + }, + { + "epoch": 0.3865189057721288, + "grad_norm": 1740.7720947265625, + "learning_rate": 3.878532326210086e-05, + "loss": 74.9253, + "step": 95670 + }, + { + "epoch": 0.3865593070374964, + "grad_norm": 437.5474853515625, + "learning_rate": 3.8782411107139564e-05, + "loss": 280.9561, + "step": 95680 + }, + { + "epoch": 0.38659970830286405, + "grad_norm": 1296.4512939453125, + "learning_rate": 3.877949868348307e-05, + "loss": 73.3886, + "step": 95690 + }, + { + "epoch": 0.3866401095682317, + "grad_norm": 372.0828552246094, + "learning_rate": 3.877658599118815e-05, + "loss": 70.0043, + "step": 95700 + }, + { + "epoch": 0.38668051083359933, + "grad_norm": 462.0464782714844, + "learning_rate": 3.87736730303116e-05, + "loss": 69.3731, + "step": 95710 + }, + { + "epoch": 0.3867209120989669, + "grad_norm": 674.0639038085938, + "learning_rate": 3.87707598009102e-05, + "loss": 82.2932, + "step": 95720 + }, + { + "epoch": 0.38676131336433456, + "grad_norm": 281.2317810058594, + "learning_rate": 3.8767846303040746e-05, + "loss": 66.2907, + "step": 95730 + }, + { + "epoch": 0.3868017146297022, + "grad_norm": 531.0217895507812, + "learning_rate": 3.876493253676004e-05, + "loss": 68.4629, + "step": 95740 + }, + { + "epoch": 0.38684211589506984, + "grad_norm": 871.7710571289062, + "learning_rate": 3.8762018502124894e-05, + "loss": 61.8916, + "step": 95750 + }, + { + "epoch": 0.3868825171604375, + "grad_norm": 600.0103759765625, + "learning_rate": 3.875910419919211e-05, + "loss": 86.7091, + "step": 95760 + }, + { + "epoch": 0.3869229184258051, + "grad_norm": 631.7959594726562, + "learning_rate": 3.87561896280185e-05, + "loss": 57.5459, + "step": 95770 + }, + { + "epoch": 0.3869633196911727, + "grad_norm": 459.5212097167969, + "learning_rate": 3.8753274788660894e-05, + "loss": 38.7924, + "step": 95780 + }, + { + "epoch": 0.38700372095654034, + "grad_norm": 722.750244140625, + "learning_rate": 3.875035968117612e-05, + "loss": 61.4237, + "step": 95790 + }, + { + "epoch": 0.387044122221908, + "grad_norm": 1434.1351318359375, + "learning_rate": 3.8747444305621e-05, + "loss": 45.202, + "step": 95800 + }, + { + "epoch": 0.3870845234872756, + "grad_norm": 1343.0447998046875, + "learning_rate": 3.874452866205237e-05, + "loss": 74.4229, + "step": 95810 + }, + { + "epoch": 0.38712492475264326, + "grad_norm": 851.3370971679688, + "learning_rate": 3.874161275052709e-05, + "loss": 57.2729, + "step": 95820 + }, + { + "epoch": 0.3871653260180109, + "grad_norm": 519.22216796875, + "learning_rate": 3.873869657110198e-05, + "loss": 52.2334, + "step": 95830 + }, + { + "epoch": 0.38720572728337854, + "grad_norm": 1355.063720703125, + "learning_rate": 3.873578012383393e-05, + "loss": 80.8646, + "step": 95840 + }, + { + "epoch": 0.3872461285487461, + "grad_norm": 615.1976318359375, + "learning_rate": 3.873286340877975e-05, + "loss": 58.7473, + "step": 95850 + }, + { + "epoch": 0.38728652981411377, + "grad_norm": 780.4457397460938, + "learning_rate": 3.8729946425996345e-05, + "loss": 72.4027, + "step": 95860 + }, + { + "epoch": 0.3873269310794814, + "grad_norm": 345.5774841308594, + "learning_rate": 3.8727029175540554e-05, + "loss": 61.2218, + "step": 95870 + }, + { + "epoch": 0.38736733234484905, + "grad_norm": 546.84228515625, + "learning_rate": 3.872411165746927e-05, + "loss": 47.3672, + "step": 95880 + }, + { + "epoch": 0.3874077336102167, + "grad_norm": 1022.2073364257812, + "learning_rate": 3.872119387183936e-05, + "loss": 74.9494, + "step": 95890 + }, + { + "epoch": 0.3874481348755843, + "grad_norm": 789.552978515625, + "learning_rate": 3.8718275818707715e-05, + "loss": 64.7529, + "step": 95900 + }, + { + "epoch": 0.3874885361409519, + "grad_norm": 446.9588623046875, + "learning_rate": 3.8715357498131214e-05, + "loss": 66.2357, + "step": 95910 + }, + { + "epoch": 0.38752893740631955, + "grad_norm": 432.23651123046875, + "learning_rate": 3.871243891016676e-05, + "loss": 65.1847, + "step": 95920 + }, + { + "epoch": 0.3875693386716872, + "grad_norm": 382.0699462890625, + "learning_rate": 3.870952005487125e-05, + "loss": 46.396, + "step": 95930 + }, + { + "epoch": 0.38760973993705483, + "grad_norm": 553.678955078125, + "learning_rate": 3.870660093230159e-05, + "loss": 59.2287, + "step": 95940 + }, + { + "epoch": 0.38765014120242247, + "grad_norm": 397.8702392578125, + "learning_rate": 3.870368154251469e-05, + "loss": 57.8399, + "step": 95950 + }, + { + "epoch": 0.3876905424677901, + "grad_norm": 776.6229248046875, + "learning_rate": 3.870076188556746e-05, + "loss": 49.7638, + "step": 95960 + }, + { + "epoch": 0.3877309437331577, + "grad_norm": 940.7283325195312, + "learning_rate": 3.869784196151682e-05, + "loss": 69.5812, + "step": 95970 + }, + { + "epoch": 0.38777134499852534, + "grad_norm": 1250.880859375, + "learning_rate": 3.869492177041971e-05, + "loss": 86.8493, + "step": 95980 + }, + { + "epoch": 0.387811746263893, + "grad_norm": 718.7807006835938, + "learning_rate": 3.8692001312333036e-05, + "loss": 64.5188, + "step": 95990 + }, + { + "epoch": 0.3878521475292606, + "grad_norm": 1685.44921875, + "learning_rate": 3.868908058731376e-05, + "loss": 81.74, + "step": 96000 + }, + { + "epoch": 0.38789254879462826, + "grad_norm": 1482.5496826171875, + "learning_rate": 3.8686159595418805e-05, + "loss": 85.64, + "step": 96010 + }, + { + "epoch": 0.3879329500599959, + "grad_norm": 494.4794006347656, + "learning_rate": 3.868323833670512e-05, + "loss": 75.5731, + "step": 96020 + }, + { + "epoch": 0.38797335132536354, + "grad_norm": 1048.5169677734375, + "learning_rate": 3.868031681122966e-05, + "loss": 100.7712, + "step": 96030 + }, + { + "epoch": 0.3880137525907311, + "grad_norm": 431.1014099121094, + "learning_rate": 3.867739501904938e-05, + "loss": 41.6055, + "step": 96040 + }, + { + "epoch": 0.38805415385609876, + "grad_norm": 670.1493530273438, + "learning_rate": 3.867447296022124e-05, + "loss": 55.9565, + "step": 96050 + }, + { + "epoch": 0.3880945551214664, + "grad_norm": 826.1647338867188, + "learning_rate": 3.8671550634802216e-05, + "loss": 64.8045, + "step": 96060 + }, + { + "epoch": 0.38813495638683404, + "grad_norm": 408.30645751953125, + "learning_rate": 3.866862804284928e-05, + "loss": 48.8189, + "step": 96070 + }, + { + "epoch": 0.3881753576522017, + "grad_norm": 523.0665283203125, + "learning_rate": 3.8665705184419386e-05, + "loss": 39.3677, + "step": 96080 + }, + { + "epoch": 0.3882157589175693, + "grad_norm": 1171.9281005859375, + "learning_rate": 3.8662782059569546e-05, + "loss": 76.4366, + "step": 96090 + }, + { + "epoch": 0.3882561601829369, + "grad_norm": 688.5985717773438, + "learning_rate": 3.865985866835673e-05, + "loss": 54.2586, + "step": 96100 + }, + { + "epoch": 0.38829656144830454, + "grad_norm": 703.115234375, + "learning_rate": 3.865693501083794e-05, + "loss": 41.2508, + "step": 96110 + }, + { + "epoch": 0.3883369627136722, + "grad_norm": 764.5994262695312, + "learning_rate": 3.865401108707017e-05, + "loss": 70.6168, + "step": 96120 + }, + { + "epoch": 0.3883773639790398, + "grad_norm": 653.46240234375, + "learning_rate": 3.8651086897110424e-05, + "loss": 95.4823, + "step": 96130 + }, + { + "epoch": 0.38841776524440746, + "grad_norm": 1796.30029296875, + "learning_rate": 3.864816244101571e-05, + "loss": 83.386, + "step": 96140 + }, + { + "epoch": 0.3884581665097751, + "grad_norm": 552.94677734375, + "learning_rate": 3.8645237718843044e-05, + "loss": 56.03, + "step": 96150 + }, + { + "epoch": 0.38849856777514274, + "grad_norm": 776.851806640625, + "learning_rate": 3.864231273064944e-05, + "loss": 82.2406, + "step": 96160 + }, + { + "epoch": 0.38853896904051033, + "grad_norm": 465.9934997558594, + "learning_rate": 3.8639387476491926e-05, + "loss": 68.4747, + "step": 96170 + }, + { + "epoch": 0.38857937030587797, + "grad_norm": 2382.02685546875, + "learning_rate": 3.863646195642754e-05, + "loss": 96.2178, + "step": 96180 + }, + { + "epoch": 0.3886197715712456, + "grad_norm": 781.694580078125, + "learning_rate": 3.8633536170513296e-05, + "loss": 69.0198, + "step": 96190 + }, + { + "epoch": 0.38866017283661325, + "grad_norm": 822.1322631835938, + "learning_rate": 3.8630610118806254e-05, + "loss": 56.8671, + "step": 96200 + }, + { + "epoch": 0.3887005741019809, + "grad_norm": 518.6736450195312, + "learning_rate": 3.862768380136345e-05, + "loss": 43.1214, + "step": 96210 + }, + { + "epoch": 0.38874097536734853, + "grad_norm": 1137.239013671875, + "learning_rate": 3.862475721824193e-05, + "loss": 77.5319, + "step": 96220 + }, + { + "epoch": 0.3887813766327161, + "grad_norm": 113.21827697753906, + "learning_rate": 3.862183036949875e-05, + "loss": 64.3933, + "step": 96230 + }, + { + "epoch": 0.38882177789808375, + "grad_norm": 1088.9544677734375, + "learning_rate": 3.861890325519098e-05, + "loss": 69.0374, + "step": 96240 + }, + { + "epoch": 0.3888621791634514, + "grad_norm": 1370.9381103515625, + "learning_rate": 3.861597587537568e-05, + "loss": 59.8645, + "step": 96250 + }, + { + "epoch": 0.38890258042881903, + "grad_norm": 523.1607055664062, + "learning_rate": 3.861304823010991e-05, + "loss": 71.4097, + "step": 96260 + }, + { + "epoch": 0.3889429816941867, + "grad_norm": 959.0990600585938, + "learning_rate": 3.861012031945077e-05, + "loss": 42.5465, + "step": 96270 + }, + { + "epoch": 0.3889833829595543, + "grad_norm": 660.4682006835938, + "learning_rate": 3.8607192143455326e-05, + "loss": 55.936, + "step": 96280 + }, + { + "epoch": 0.3890237842249219, + "grad_norm": 748.14990234375, + "learning_rate": 3.860426370218067e-05, + "loss": 43.6322, + "step": 96290 + }, + { + "epoch": 0.38906418549028954, + "grad_norm": 584.9511108398438, + "learning_rate": 3.860133499568387e-05, + "loss": 70.2999, + "step": 96300 + }, + { + "epoch": 0.3891045867556572, + "grad_norm": 1015.5634765625, + "learning_rate": 3.859840602402206e-05, + "loss": 109.6341, + "step": 96310 + }, + { + "epoch": 0.3891449880210248, + "grad_norm": 1177.64697265625, + "learning_rate": 3.859547678725231e-05, + "loss": 63.7178, + "step": 96320 + }, + { + "epoch": 0.38918538928639246, + "grad_norm": 480.65618896484375, + "learning_rate": 3.859254728543175e-05, + "loss": 62.31, + "step": 96330 + }, + { + "epoch": 0.3892257905517601, + "grad_norm": 1283.1507568359375, + "learning_rate": 3.8589617518617485e-05, + "loss": 61.4974, + "step": 96340 + }, + { + "epoch": 0.38926619181712774, + "grad_norm": 1167.147216796875, + "learning_rate": 3.858668748686662e-05, + "loss": 48.8035, + "step": 96350 + }, + { + "epoch": 0.3893065930824953, + "grad_norm": 793.9154052734375, + "learning_rate": 3.858375719023629e-05, + "loss": 68.4192, + "step": 96360 + }, + { + "epoch": 0.38934699434786296, + "grad_norm": 838.4378051757812, + "learning_rate": 3.8580826628783625e-05, + "loss": 86.5221, + "step": 96370 + }, + { + "epoch": 0.3893873956132306, + "grad_norm": 797.8261108398438, + "learning_rate": 3.857789580256575e-05, + "loss": 63.2042, + "step": 96380 + }, + { + "epoch": 0.38942779687859824, + "grad_norm": 601.6024169921875, + "learning_rate": 3.857496471163981e-05, + "loss": 52.494, + "step": 96390 + }, + { + "epoch": 0.3894681981439659, + "grad_norm": 693.2849731445312, + "learning_rate": 3.8572033356062943e-05, + "loss": 88.3605, + "step": 96400 + }, + { + "epoch": 0.3895085994093335, + "grad_norm": 1151.2178955078125, + "learning_rate": 3.8569101735892296e-05, + "loss": 47.6969, + "step": 96410 + }, + { + "epoch": 0.3895490006747011, + "grad_norm": 534.7440185546875, + "learning_rate": 3.856616985118502e-05, + "loss": 56.3229, + "step": 96420 + }, + { + "epoch": 0.38958940194006875, + "grad_norm": 883.9851684570312, + "learning_rate": 3.8563237701998286e-05, + "loss": 44.7732, + "step": 96430 + }, + { + "epoch": 0.3896298032054364, + "grad_norm": 903.25341796875, + "learning_rate": 3.856030528838925e-05, + "loss": 49.8167, + "step": 96440 + }, + { + "epoch": 0.389670204470804, + "grad_norm": 1717.411376953125, + "learning_rate": 3.8557372610415074e-05, + "loss": 56.202, + "step": 96450 + }, + { + "epoch": 0.38971060573617167, + "grad_norm": 1218.4908447265625, + "learning_rate": 3.8554439668132946e-05, + "loss": 48.3608, + "step": 96460 + }, + { + "epoch": 0.3897510070015393, + "grad_norm": 884.7052001953125, + "learning_rate": 3.855150646160003e-05, + "loss": 46.3305, + "step": 96470 + }, + { + "epoch": 0.38979140826690695, + "grad_norm": 360.12017822265625, + "learning_rate": 3.854857299087353e-05, + "loss": 37.3111, + "step": 96480 + }, + { + "epoch": 0.38983180953227453, + "grad_norm": 1448.187744140625, + "learning_rate": 3.8545639256010625e-05, + "loss": 71.6807, + "step": 96490 + }, + { + "epoch": 0.38987221079764217, + "grad_norm": 935.8035278320312, + "learning_rate": 3.85427052570685e-05, + "loss": 60.7646, + "step": 96500 + }, + { + "epoch": 0.3899126120630098, + "grad_norm": 805.13037109375, + "learning_rate": 3.853977099410436e-05, + "loss": 46.3225, + "step": 96510 + }, + { + "epoch": 0.38995301332837745, + "grad_norm": 562.0592041015625, + "learning_rate": 3.853683646717543e-05, + "loss": 62.2224, + "step": 96520 + }, + { + "epoch": 0.3899934145937451, + "grad_norm": 1327.85498046875, + "learning_rate": 3.853390167633889e-05, + "loss": 66.2915, + "step": 96530 + }, + { + "epoch": 0.39003381585911273, + "grad_norm": 1803.8189697265625, + "learning_rate": 3.8530966621651976e-05, + "loss": 89.2004, + "step": 96540 + }, + { + "epoch": 0.3900742171244803, + "grad_norm": 363.34783935546875, + "learning_rate": 3.8528031303171895e-05, + "loss": 38.5132, + "step": 96550 + }, + { + "epoch": 0.39011461838984796, + "grad_norm": 1082.673095703125, + "learning_rate": 3.852509572095588e-05, + "loss": 58.43, + "step": 96560 + }, + { + "epoch": 0.3901550196552156, + "grad_norm": 1270.4439697265625, + "learning_rate": 3.852215987506117e-05, + "loss": 55.3348, + "step": 96570 + }, + { + "epoch": 0.39019542092058324, + "grad_norm": 478.66815185546875, + "learning_rate": 3.851922376554499e-05, + "loss": 59.5862, + "step": 96580 + }, + { + "epoch": 0.3902358221859509, + "grad_norm": 772.7987670898438, + "learning_rate": 3.851628739246457e-05, + "loss": 53.7853, + "step": 96590 + }, + { + "epoch": 0.3902762234513185, + "grad_norm": 353.50360107421875, + "learning_rate": 3.851335075587718e-05, + "loss": 68.1417, + "step": 96600 + }, + { + "epoch": 0.3903166247166861, + "grad_norm": 3968.27587890625, + "learning_rate": 3.8510413855840056e-05, + "loss": 92.6137, + "step": 96610 + }, + { + "epoch": 0.39035702598205374, + "grad_norm": 624.1312255859375, + "learning_rate": 3.850747669241046e-05, + "loss": 74.6902, + "step": 96620 + }, + { + "epoch": 0.3903974272474214, + "grad_norm": 1058.5302734375, + "learning_rate": 3.850453926564565e-05, + "loss": 67.8454, + "step": 96630 + }, + { + "epoch": 0.390437828512789, + "grad_norm": 1056.72021484375, + "learning_rate": 3.85016015756029e-05, + "loss": 88.1807, + "step": 96640 + }, + { + "epoch": 0.39047822977815666, + "grad_norm": 1824.6705322265625, + "learning_rate": 3.849866362233947e-05, + "loss": 71.9311, + "step": 96650 + }, + { + "epoch": 0.3905186310435243, + "grad_norm": 608.2755126953125, + "learning_rate": 3.849572540591264e-05, + "loss": 45.0875, + "step": 96660 + }, + { + "epoch": 0.39055903230889194, + "grad_norm": 665.3267822265625, + "learning_rate": 3.84927869263797e-05, + "loss": 67.2472, + "step": 96670 + }, + { + "epoch": 0.3905994335742595, + "grad_norm": 752.25927734375, + "learning_rate": 3.848984818379793e-05, + "loss": 108.7115, + "step": 96680 + }, + { + "epoch": 0.39063983483962716, + "grad_norm": 393.086181640625, + "learning_rate": 3.848690917822463e-05, + "loss": 61.483, + "step": 96690 + }, + { + "epoch": 0.3906802361049948, + "grad_norm": 1095.4296875, + "learning_rate": 3.8483969909717087e-05, + "loss": 57.5574, + "step": 96700 + }, + { + "epoch": 0.39072063737036244, + "grad_norm": 841.97705078125, + "learning_rate": 3.8481030378332614e-05, + "loss": 68.264, + "step": 96710 + }, + { + "epoch": 0.3907610386357301, + "grad_norm": 410.4032897949219, + "learning_rate": 3.84780905841285e-05, + "loss": 69.9679, + "step": 96720 + }, + { + "epoch": 0.3908014399010977, + "grad_norm": 513.5309448242188, + "learning_rate": 3.8475150527162085e-05, + "loss": 80.5341, + "step": 96730 + }, + { + "epoch": 0.3908418411664653, + "grad_norm": 451.75445556640625, + "learning_rate": 3.847221020749067e-05, + "loss": 45.8152, + "step": 96740 + }, + { + "epoch": 0.39088224243183295, + "grad_norm": 1032.3883056640625, + "learning_rate": 3.8469269625171576e-05, + "loss": 84.8716, + "step": 96750 + }, + { + "epoch": 0.3909226436972006, + "grad_norm": 472.958251953125, + "learning_rate": 3.846632878026214e-05, + "loss": 49.6663, + "step": 96760 + }, + { + "epoch": 0.39096304496256823, + "grad_norm": 969.470703125, + "learning_rate": 3.8463387672819696e-05, + "loss": 63.3029, + "step": 96770 + }, + { + "epoch": 0.39100344622793587, + "grad_norm": 1277.8431396484375, + "learning_rate": 3.846044630290158e-05, + "loss": 56.5187, + "step": 96780 + }, + { + "epoch": 0.3910438474933035, + "grad_norm": 364.2802734375, + "learning_rate": 3.845750467056511e-05, + "loss": 59.6623, + "step": 96790 + }, + { + "epoch": 0.39108424875867115, + "grad_norm": 982.3886108398438, + "learning_rate": 3.8454562775867684e-05, + "loss": 85.9636, + "step": 96800 + }, + { + "epoch": 0.39112465002403873, + "grad_norm": 416.16619873046875, + "learning_rate": 3.8451620618866616e-05, + "loss": 84.5701, + "step": 96810 + }, + { + "epoch": 0.3911650512894064, + "grad_norm": 627.9135131835938, + "learning_rate": 3.844867819961928e-05, + "loss": 44.0341, + "step": 96820 + }, + { + "epoch": 0.391205452554774, + "grad_norm": 1361.487060546875, + "learning_rate": 3.8445735518183043e-05, + "loss": 48.6817, + "step": 96830 + }, + { + "epoch": 0.39124585382014165, + "grad_norm": 1403.5098876953125, + "learning_rate": 3.8442792574615275e-05, + "loss": 49.1378, + "step": 96840 + }, + { + "epoch": 0.3912862550855093, + "grad_norm": 1468.5533447265625, + "learning_rate": 3.843984936897334e-05, + "loss": 71.1556, + "step": 96850 + }, + { + "epoch": 0.39132665635087693, + "grad_norm": 2194.382080078125, + "learning_rate": 3.843690590131462e-05, + "loss": 89.6682, + "step": 96860 + }, + { + "epoch": 0.3913670576162445, + "grad_norm": 899.1420288085938, + "learning_rate": 3.84339621716965e-05, + "loss": 59.679, + "step": 96870 + }, + { + "epoch": 0.39140745888161216, + "grad_norm": 726.7706298828125, + "learning_rate": 3.843101818017637e-05, + "loss": 81.8773, + "step": 96880 + }, + { + "epoch": 0.3914478601469798, + "grad_norm": 441.8895568847656, + "learning_rate": 3.8428073926811625e-05, + "loss": 87.1567, + "step": 96890 + }, + { + "epoch": 0.39148826141234744, + "grad_norm": 788.19921875, + "learning_rate": 3.842512941165968e-05, + "loss": 81.7942, + "step": 96900 + }, + { + "epoch": 0.3915286626777151, + "grad_norm": 0.0, + "learning_rate": 3.842218463477791e-05, + "loss": 45.6026, + "step": 96910 + }, + { + "epoch": 0.3915690639430827, + "grad_norm": 864.9554443359375, + "learning_rate": 3.841923959622375e-05, + "loss": 49.6541, + "step": 96920 + }, + { + "epoch": 0.3916094652084503, + "grad_norm": 903.1978149414062, + "learning_rate": 3.84162942960546e-05, + "loss": 76.4913, + "step": 96930 + }, + { + "epoch": 0.39164986647381794, + "grad_norm": 848.7103271484375, + "learning_rate": 3.841334873432789e-05, + "loss": 67.0842, + "step": 96940 + }, + { + "epoch": 0.3916902677391856, + "grad_norm": 898.5715942382812, + "learning_rate": 3.841040291110103e-05, + "loss": 40.5335, + "step": 96950 + }, + { + "epoch": 0.3917306690045532, + "grad_norm": 422.8970031738281, + "learning_rate": 3.840745682643147e-05, + "loss": 62.1978, + "step": 96960 + }, + { + "epoch": 0.39177107026992086, + "grad_norm": 404.5113220214844, + "learning_rate": 3.840451048037663e-05, + "loss": 56.2681, + "step": 96970 + }, + { + "epoch": 0.3918114715352885, + "grad_norm": 1311.5926513671875, + "learning_rate": 3.8401563872993966e-05, + "loss": 63.1407, + "step": 96980 + }, + { + "epoch": 0.39185187280065614, + "grad_norm": 477.2160339355469, + "learning_rate": 3.839861700434091e-05, + "loss": 51.8024, + "step": 96990 + }, + { + "epoch": 0.3918922740660237, + "grad_norm": 835.2833862304688, + "learning_rate": 3.8395669874474915e-05, + "loss": 48.2391, + "step": 97000 + }, + { + "epoch": 0.39193267533139137, + "grad_norm": 906.5626831054688, + "learning_rate": 3.839272248345344e-05, + "loss": 61.7122, + "step": 97010 + }, + { + "epoch": 0.391973076596759, + "grad_norm": 321.434326171875, + "learning_rate": 3.838977483133395e-05, + "loss": 66.5921, + "step": 97020 + }, + { + "epoch": 0.39201347786212665, + "grad_norm": 408.02197265625, + "learning_rate": 3.838682691817391e-05, + "loss": 47.1528, + "step": 97030 + }, + { + "epoch": 0.3920538791274943, + "grad_norm": 953.2274780273438, + "learning_rate": 3.8383878744030776e-05, + "loss": 47.6821, + "step": 97040 + }, + { + "epoch": 0.3920942803928619, + "grad_norm": 541.6514282226562, + "learning_rate": 3.8380930308962036e-05, + "loss": 60.5129, + "step": 97050 + }, + { + "epoch": 0.3921346816582295, + "grad_norm": 443.59527587890625, + "learning_rate": 3.837798161302518e-05, + "loss": 67.278, + "step": 97060 + }, + { + "epoch": 0.39217508292359715, + "grad_norm": 1681.9339599609375, + "learning_rate": 3.8375032656277684e-05, + "loss": 59.9378, + "step": 97070 + }, + { + "epoch": 0.3922154841889648, + "grad_norm": 696.7039184570312, + "learning_rate": 3.837208343877703e-05, + "loss": 49.9206, + "step": 97080 + }, + { + "epoch": 0.39225588545433243, + "grad_norm": 885.4489135742188, + "learning_rate": 3.8369133960580724e-05, + "loss": 69.6553, + "step": 97090 + }, + { + "epoch": 0.39229628671970007, + "grad_norm": 688.5819702148438, + "learning_rate": 3.836618422174628e-05, + "loss": 80.361, + "step": 97100 + }, + { + "epoch": 0.3923366879850677, + "grad_norm": 757.7618408203125, + "learning_rate": 3.83632342223312e-05, + "loss": 49.0418, + "step": 97110 + }, + { + "epoch": 0.39237708925043535, + "grad_norm": 435.7521057128906, + "learning_rate": 3.836028396239297e-05, + "loss": 52.3195, + "step": 97120 + }, + { + "epoch": 0.39241749051580294, + "grad_norm": 622.2557983398438, + "learning_rate": 3.8357333441989134e-05, + "loss": 69.8973, + "step": 97130 + }, + { + "epoch": 0.3924578917811706, + "grad_norm": 2009.5267333984375, + "learning_rate": 3.835438266117721e-05, + "loss": 63.6782, + "step": 97140 + }, + { + "epoch": 0.3924982930465382, + "grad_norm": 851.5496826171875, + "learning_rate": 3.835143162001472e-05, + "loss": 79.4912, + "step": 97150 + }, + { + "epoch": 0.39253869431190586, + "grad_norm": 496.26544189453125, + "learning_rate": 3.834848031855919e-05, + "loss": 53.252, + "step": 97160 + }, + { + "epoch": 0.3925790955772735, + "grad_norm": 664.895263671875, + "learning_rate": 3.8345528756868164e-05, + "loss": 51.9402, + "step": 97170 + }, + { + "epoch": 0.39261949684264114, + "grad_norm": 963.7388305664062, + "learning_rate": 3.8342576934999184e-05, + "loss": 57.3356, + "step": 97180 + }, + { + "epoch": 0.3926598981080087, + "grad_norm": 706.6784057617188, + "learning_rate": 3.83396248530098e-05, + "loss": 74.5728, + "step": 97190 + }, + { + "epoch": 0.39270029937337636, + "grad_norm": 486.8099365234375, + "learning_rate": 3.8336672510957574e-05, + "loss": 64.1127, + "step": 97200 + }, + { + "epoch": 0.392740700638744, + "grad_norm": 563.300048828125, + "learning_rate": 3.833371990890003e-05, + "loss": 53.4073, + "step": 97210 + }, + { + "epoch": 0.39278110190411164, + "grad_norm": 382.8632507324219, + "learning_rate": 3.8330767046894765e-05, + "loss": 44.1544, + "step": 97220 + }, + { + "epoch": 0.3928215031694793, + "grad_norm": 944.963134765625, + "learning_rate": 3.8327813924999326e-05, + "loss": 64.7648, + "step": 97230 + }, + { + "epoch": 0.3928619044348469, + "grad_norm": 428.9286193847656, + "learning_rate": 3.83248605432713e-05, + "loss": 57.617, + "step": 97240 + }, + { + "epoch": 0.3929023057002145, + "grad_norm": 742.8692016601562, + "learning_rate": 3.832190690176825e-05, + "loss": 44.4798, + "step": 97250 + }, + { + "epoch": 0.39294270696558214, + "grad_norm": 139.9200439453125, + "learning_rate": 3.831895300054777e-05, + "loss": 73.2557, + "step": 97260 + }, + { + "epoch": 0.3929831082309498, + "grad_norm": 2734.72802734375, + "learning_rate": 3.8315998839667445e-05, + "loss": 73.5291, + "step": 97270 + }, + { + "epoch": 0.3930235094963174, + "grad_norm": 1699.1915283203125, + "learning_rate": 3.8313044419184873e-05, + "loss": 92.5831, + "step": 97280 + }, + { + "epoch": 0.39306391076168506, + "grad_norm": 1669.2913818359375, + "learning_rate": 3.831008973915764e-05, + "loss": 54.4022, + "step": 97290 + }, + { + "epoch": 0.3931043120270527, + "grad_norm": 1141.6053466796875, + "learning_rate": 3.830713479964335e-05, + "loss": 85.7646, + "step": 97300 + }, + { + "epoch": 0.39314471329242034, + "grad_norm": 1941.885986328125, + "learning_rate": 3.8304179600699626e-05, + "loss": 74.6156, + "step": 97310 + }, + { + "epoch": 0.39318511455778793, + "grad_norm": 877.8158569335938, + "learning_rate": 3.830122414238406e-05, + "loss": 56.3947, + "step": 97320 + }, + { + "epoch": 0.39322551582315557, + "grad_norm": 600.1071166992188, + "learning_rate": 3.829826842475429e-05, + "loss": 56.1246, + "step": 97330 + }, + { + "epoch": 0.3932659170885232, + "grad_norm": 1540.666259765625, + "learning_rate": 3.8295312447867924e-05, + "loss": 58.5337, + "step": 97340 + }, + { + "epoch": 0.39330631835389085, + "grad_norm": 620.038330078125, + "learning_rate": 3.82923562117826e-05, + "loss": 40.1906, + "step": 97350 + }, + { + "epoch": 0.3933467196192585, + "grad_norm": 649.1076049804688, + "learning_rate": 3.828939971655595e-05, + "loss": 86.4342, + "step": 97360 + }, + { + "epoch": 0.39338712088462613, + "grad_norm": 1530.02001953125, + "learning_rate": 3.828644296224562e-05, + "loss": 61.6796, + "step": 97370 + }, + { + "epoch": 0.3934275221499937, + "grad_norm": 1663.7215576171875, + "learning_rate": 3.8283485948909224e-05, + "loss": 59.6131, + "step": 97380 + }, + { + "epoch": 0.39346792341536135, + "grad_norm": 509.4885559082031, + "learning_rate": 3.828052867660445e-05, + "loss": 60.7145, + "step": 97390 + }, + { + "epoch": 0.393508324680729, + "grad_norm": 586.28955078125, + "learning_rate": 3.827757114538892e-05, + "loss": 95.243, + "step": 97400 + }, + { + "epoch": 0.39354872594609663, + "grad_norm": 808.3604736328125, + "learning_rate": 3.82746133553203e-05, + "loss": 66.3657, + "step": 97410 + }, + { + "epoch": 0.3935891272114643, + "grad_norm": 307.7069396972656, + "learning_rate": 3.827165530645627e-05, + "loss": 69.5492, + "step": 97420 + }, + { + "epoch": 0.3936295284768319, + "grad_norm": 1287.857666015625, + "learning_rate": 3.8268696998854486e-05, + "loss": 74.0201, + "step": 97430 + }, + { + "epoch": 0.3936699297421995, + "grad_norm": 1741.658935546875, + "learning_rate": 3.826573843257262e-05, + "loss": 95.2432, + "step": 97440 + }, + { + "epoch": 0.39371033100756714, + "grad_norm": 530.0614624023438, + "learning_rate": 3.826277960766835e-05, + "loss": 76.4804, + "step": 97450 + }, + { + "epoch": 0.3937507322729348, + "grad_norm": 449.9317932128906, + "learning_rate": 3.8259820524199374e-05, + "loss": 58.2512, + "step": 97460 + }, + { + "epoch": 0.3937911335383024, + "grad_norm": 273.4659729003906, + "learning_rate": 3.8256861182223366e-05, + "loss": 50.8024, + "step": 97470 + }, + { + "epoch": 0.39383153480367006, + "grad_norm": 1517.554931640625, + "learning_rate": 3.8253901581798016e-05, + "loss": 65.5294, + "step": 97480 + }, + { + "epoch": 0.3938719360690377, + "grad_norm": 286.39971923828125, + "learning_rate": 3.825094172298104e-05, + "loss": 41.5933, + "step": 97490 + }, + { + "epoch": 0.39391233733440534, + "grad_norm": 1205.6583251953125, + "learning_rate": 3.824798160583012e-05, + "loss": 59.1793, + "step": 97500 + }, + { + "epoch": 0.3939527385997729, + "grad_norm": 1654.855224609375, + "learning_rate": 3.824502123040299e-05, + "loss": 71.1765, + "step": 97510 + }, + { + "epoch": 0.39399313986514056, + "grad_norm": 636.5582275390625, + "learning_rate": 3.824206059675736e-05, + "loss": 76.6066, + "step": 97520 + }, + { + "epoch": 0.3940335411305082, + "grad_norm": 878.0634765625, + "learning_rate": 3.823909970495092e-05, + "loss": 66.1705, + "step": 97530 + }, + { + "epoch": 0.39407394239587584, + "grad_norm": 733.1356811523438, + "learning_rate": 3.8236138555041434e-05, + "loss": 86.5831, + "step": 97540 + }, + { + "epoch": 0.3941143436612435, + "grad_norm": 372.1419982910156, + "learning_rate": 3.823317714708661e-05, + "loss": 52.7755, + "step": 97550 + }, + { + "epoch": 0.3941547449266111, + "grad_norm": 978.9861450195312, + "learning_rate": 3.823021548114417e-05, + "loss": 55.6883, + "step": 97560 + }, + { + "epoch": 0.3941951461919787, + "grad_norm": 339.8622741699219, + "learning_rate": 3.822725355727188e-05, + "loss": 51.7447, + "step": 97570 + }, + { + "epoch": 0.39423554745734635, + "grad_norm": 610.7191162109375, + "learning_rate": 3.8224291375527464e-05, + "loss": 49.7207, + "step": 97580 + }, + { + "epoch": 0.394275948722714, + "grad_norm": 1472.842041015625, + "learning_rate": 3.822132893596869e-05, + "loss": 55.3662, + "step": 97590 + }, + { + "epoch": 0.3943163499880816, + "grad_norm": 558.3397827148438, + "learning_rate": 3.821836623865329e-05, + "loss": 91.5254, + "step": 97600 + }, + { + "epoch": 0.39435675125344927, + "grad_norm": 770.9660034179688, + "learning_rate": 3.821540328363905e-05, + "loss": 58.9525, + "step": 97610 + }, + { + "epoch": 0.3943971525188169, + "grad_norm": 900.81982421875, + "learning_rate": 3.821244007098371e-05, + "loss": 49.8541, + "step": 97620 + }, + { + "epoch": 0.39443755378418455, + "grad_norm": 472.5091247558594, + "learning_rate": 3.820947660074504e-05, + "loss": 70.776, + "step": 97630 + }, + { + "epoch": 0.39447795504955213, + "grad_norm": 1419.308349609375, + "learning_rate": 3.820651287298084e-05, + "loss": 62.8083, + "step": 97640 + }, + { + "epoch": 0.39451835631491977, + "grad_norm": 1251.2393798828125, + "learning_rate": 3.8203548887748865e-05, + "loss": 98.8956, + "step": 97650 + }, + { + "epoch": 0.3945587575802874, + "grad_norm": 568.4578857421875, + "learning_rate": 3.8200584645106904e-05, + "loss": 52.2117, + "step": 97660 + }, + { + "epoch": 0.39459915884565505, + "grad_norm": 1025.6453857421875, + "learning_rate": 3.819762014511275e-05, + "loss": 77.8451, + "step": 97670 + }, + { + "epoch": 0.3946395601110227, + "grad_norm": 1376.6019287109375, + "learning_rate": 3.81946553878242e-05, + "loss": 63.783, + "step": 97680 + }, + { + "epoch": 0.39467996137639033, + "grad_norm": 1392.58544921875, + "learning_rate": 3.819169037329905e-05, + "loss": 51.8016, + "step": 97690 + }, + { + "epoch": 0.3947203626417579, + "grad_norm": 737.3711547851562, + "learning_rate": 3.8188725101595094e-05, + "loss": 44.1351, + "step": 97700 + }, + { + "epoch": 0.39476076390712556, + "grad_norm": 675.7130737304688, + "learning_rate": 3.818575957277016e-05, + "loss": 70.7781, + "step": 97710 + }, + { + "epoch": 0.3948011651724932, + "grad_norm": 522.35302734375, + "learning_rate": 3.8182793786882065e-05, + "loss": 54.7861, + "step": 97720 + }, + { + "epoch": 0.39484156643786084, + "grad_norm": 1283.5611572265625, + "learning_rate": 3.817982774398861e-05, + "loss": 86.0404, + "step": 97730 + }, + { + "epoch": 0.3948819677032285, + "grad_norm": 837.8106079101562, + "learning_rate": 3.817686144414762e-05, + "loss": 72.8473, + "step": 97740 + }, + { + "epoch": 0.3949223689685961, + "grad_norm": 2973.33740234375, + "learning_rate": 3.8173894887416945e-05, + "loss": 72.2817, + "step": 97750 + }, + { + "epoch": 0.3949627702339637, + "grad_norm": 433.107421875, + "learning_rate": 3.8170928073854396e-05, + "loss": 71.6905, + "step": 97760 + }, + { + "epoch": 0.39500317149933134, + "grad_norm": 559.8467407226562, + "learning_rate": 3.816796100351783e-05, + "loss": 59.3247, + "step": 97770 + }, + { + "epoch": 0.395043572764699, + "grad_norm": 676.9853515625, + "learning_rate": 3.8164993676465074e-05, + "loss": 59.816, + "step": 97780 + }, + { + "epoch": 0.3950839740300666, + "grad_norm": 1209.71923828125, + "learning_rate": 3.816202609275401e-05, + "loss": 78.3244, + "step": 97790 + }, + { + "epoch": 0.39512437529543426, + "grad_norm": 407.7916259765625, + "learning_rate": 3.8159058252442446e-05, + "loss": 40.6203, + "step": 97800 + }, + { + "epoch": 0.3951647765608019, + "grad_norm": 0.0, + "learning_rate": 3.815609015558829e-05, + "loss": 68.3833, + "step": 97810 + }, + { + "epoch": 0.39520517782616954, + "grad_norm": 235.8328094482422, + "learning_rate": 3.815312180224937e-05, + "loss": 47.2443, + "step": 97820 + }, + { + "epoch": 0.3952455790915371, + "grad_norm": 620.359619140625, + "learning_rate": 3.8150153192483566e-05, + "loss": 53.1402, + "step": 97830 + }, + { + "epoch": 0.39528598035690476, + "grad_norm": 1514.925537109375, + "learning_rate": 3.814718432634876e-05, + "loss": 83.4849, + "step": 97840 + }, + { + "epoch": 0.3953263816222724, + "grad_norm": 195.32774353027344, + "learning_rate": 3.8144215203902834e-05, + "loss": 58.4113, + "step": 97850 + }, + { + "epoch": 0.39536678288764004, + "grad_norm": 610.07568359375, + "learning_rate": 3.814124582520365e-05, + "loss": 47.7628, + "step": 97860 + }, + { + "epoch": 0.3954071841530077, + "grad_norm": 1019.016845703125, + "learning_rate": 3.813827619030913e-05, + "loss": 66.7314, + "step": 97870 + }, + { + "epoch": 0.3954475854183753, + "grad_norm": 340.5806884765625, + "learning_rate": 3.813530629927714e-05, + "loss": 37.2535, + "step": 97880 + }, + { + "epoch": 0.3954879866837429, + "grad_norm": 705.3890991210938, + "learning_rate": 3.81323361521656e-05, + "loss": 63.5229, + "step": 97890 + }, + { + "epoch": 0.39552838794911055, + "grad_norm": 911.9359741210938, + "learning_rate": 3.81293657490324e-05, + "loss": 107.5619, + "step": 97900 + }, + { + "epoch": 0.3955687892144782, + "grad_norm": 448.5742492675781, + "learning_rate": 3.812639508993545e-05, + "loss": 67.1163, + "step": 97910 + }, + { + "epoch": 0.39560919047984583, + "grad_norm": 292.3797607421875, + "learning_rate": 3.8123424174932674e-05, + "loss": 46.8763, + "step": 97920 + }, + { + "epoch": 0.39564959174521347, + "grad_norm": 516.19287109375, + "learning_rate": 3.812045300408199e-05, + "loss": 60.135, + "step": 97930 + }, + { + "epoch": 0.3956899930105811, + "grad_norm": 1726.6441650390625, + "learning_rate": 3.811748157744132e-05, + "loss": 111.1981, + "step": 97940 + }, + { + "epoch": 0.39573039427594875, + "grad_norm": 617.226806640625, + "learning_rate": 3.8114509895068586e-05, + "loss": 40.3687, + "step": 97950 + }, + { + "epoch": 0.39577079554131633, + "grad_norm": 1564.2078857421875, + "learning_rate": 3.811153795702174e-05, + "loss": 57.8685, + "step": 97960 + }, + { + "epoch": 0.395811196806684, + "grad_norm": 543.6961669921875, + "learning_rate": 3.81085657633587e-05, + "loss": 63.6772, + "step": 97970 + }, + { + "epoch": 0.3958515980720516, + "grad_norm": 619.5729370117188, + "learning_rate": 3.810559331413743e-05, + "loss": 53.4549, + "step": 97980 + }, + { + "epoch": 0.39589199933741925, + "grad_norm": 1818.6915283203125, + "learning_rate": 3.810262060941587e-05, + "loss": 46.7142, + "step": 97990 + }, + { + "epoch": 0.3959324006027869, + "grad_norm": 423.151611328125, + "learning_rate": 3.8099647649251986e-05, + "loss": 68.5805, + "step": 98000 + }, + { + "epoch": 0.39597280186815453, + "grad_norm": 797.4796752929688, + "learning_rate": 3.809667443370372e-05, + "loss": 43.375, + "step": 98010 + }, + { + "epoch": 0.3960132031335221, + "grad_norm": 478.58349609375, + "learning_rate": 3.809370096282902e-05, + "loss": 63.259, + "step": 98020 + }, + { + "epoch": 0.39605360439888976, + "grad_norm": 477.88470458984375, + "learning_rate": 3.8090727236685906e-05, + "loss": 85.8146, + "step": 98030 + }, + { + "epoch": 0.3960940056642574, + "grad_norm": 358.82342529296875, + "learning_rate": 3.808775325533232e-05, + "loss": 46.4715, + "step": 98040 + }, + { + "epoch": 0.39613440692962504, + "grad_norm": 1070.005615234375, + "learning_rate": 3.808477901882624e-05, + "loss": 50.1758, + "step": 98050 + }, + { + "epoch": 0.3961748081949927, + "grad_norm": 568.8511352539062, + "learning_rate": 3.808180452722566e-05, + "loss": 53.4005, + "step": 98060 + }, + { + "epoch": 0.3962152094603603, + "grad_norm": 523.6063232421875, + "learning_rate": 3.8078829780588564e-05, + "loss": 65.6613, + "step": 98070 + }, + { + "epoch": 0.3962556107257279, + "grad_norm": 1423.616943359375, + "learning_rate": 3.8075854778972955e-05, + "loss": 88.5826, + "step": 98080 + }, + { + "epoch": 0.39629601199109554, + "grad_norm": 821.159912109375, + "learning_rate": 3.807287952243682e-05, + "loss": 52.51, + "step": 98090 + }, + { + "epoch": 0.3963364132564632, + "grad_norm": 1087.4752197265625, + "learning_rate": 3.8069904011038165e-05, + "loss": 62.1011, + "step": 98100 + }, + { + "epoch": 0.3963768145218308, + "grad_norm": 981.5172119140625, + "learning_rate": 3.806692824483501e-05, + "loss": 43.4204, + "step": 98110 + }, + { + "epoch": 0.39641721578719846, + "grad_norm": 1140.0291748046875, + "learning_rate": 3.806395222388536e-05, + "loss": 67.1618, + "step": 98120 + }, + { + "epoch": 0.3964576170525661, + "grad_norm": 863.1648559570312, + "learning_rate": 3.8060975948247223e-05, + "loss": 71.7122, + "step": 98130 + }, + { + "epoch": 0.39649801831793374, + "grad_norm": 631.3798217773438, + "learning_rate": 3.805799941797865e-05, + "loss": 42.8692, + "step": 98140 + }, + { + "epoch": 0.3965384195833013, + "grad_norm": 1376.2213134765625, + "learning_rate": 3.805502263313765e-05, + "loss": 55.8479, + "step": 98150 + }, + { + "epoch": 0.39657882084866897, + "grad_norm": 476.51446533203125, + "learning_rate": 3.805204559378227e-05, + "loss": 57.2946, + "step": 98160 + }, + { + "epoch": 0.3966192221140366, + "grad_norm": 941.9556884765625, + "learning_rate": 3.804906829997053e-05, + "loss": 71.0839, + "step": 98170 + }, + { + "epoch": 0.39665962337940425, + "grad_norm": 764.8499755859375, + "learning_rate": 3.804609075176049e-05, + "loss": 61.4329, + "step": 98180 + }, + { + "epoch": 0.3967000246447719, + "grad_norm": 804.7611694335938, + "learning_rate": 3.8043112949210194e-05, + "loss": 50.3516, + "step": 98190 + }, + { + "epoch": 0.3967404259101395, + "grad_norm": 757.18798828125, + "learning_rate": 3.80401348923777e-05, + "loss": 65.6766, + "step": 98200 + }, + { + "epoch": 0.3967808271755071, + "grad_norm": 802.1757202148438, + "learning_rate": 3.803715658132105e-05, + "loss": 51.4916, + "step": 98210 + }, + { + "epoch": 0.39682122844087475, + "grad_norm": 677.951904296875, + "learning_rate": 3.803417801609833e-05, + "loss": 40.4412, + "step": 98220 + }, + { + "epoch": 0.3968616297062424, + "grad_norm": 960.4522094726562, + "learning_rate": 3.803119919676761e-05, + "loss": 53.576, + "step": 98230 + }, + { + "epoch": 0.39690203097161003, + "grad_norm": 1985.3116455078125, + "learning_rate": 3.802822012338694e-05, + "loss": 62.2902, + "step": 98240 + }, + { + "epoch": 0.39694243223697767, + "grad_norm": 1060.7294921875, + "learning_rate": 3.802524079601442e-05, + "loss": 59.7074, + "step": 98250 + }, + { + "epoch": 0.3969828335023453, + "grad_norm": 926.2281494140625, + "learning_rate": 3.802226121470811e-05, + "loss": 67.3111, + "step": 98260 + }, + { + "epoch": 0.39702323476771295, + "grad_norm": 558.5460205078125, + "learning_rate": 3.8019281379526114e-05, + "loss": 52.2698, + "step": 98270 + }, + { + "epoch": 0.39706363603308054, + "grad_norm": 1760.0323486328125, + "learning_rate": 3.8016301290526534e-05, + "loss": 63.7224, + "step": 98280 + }, + { + "epoch": 0.3971040372984482, + "grad_norm": 717.9991455078125, + "learning_rate": 3.8013320947767464e-05, + "loss": 61.1942, + "step": 98290 + }, + { + "epoch": 0.3971444385638158, + "grad_norm": 302.7607421875, + "learning_rate": 3.8010340351306997e-05, + "loss": 52.8783, + "step": 98300 + }, + { + "epoch": 0.39718483982918346, + "grad_norm": 603.1386108398438, + "learning_rate": 3.800735950120324e-05, + "loss": 61.837, + "step": 98310 + }, + { + "epoch": 0.3972252410945511, + "grad_norm": 714.36474609375, + "learning_rate": 3.8004378397514315e-05, + "loss": 57.4805, + "step": 98320 + }, + { + "epoch": 0.39726564235991874, + "grad_norm": 578.4430541992188, + "learning_rate": 3.800139704029835e-05, + "loss": 44.0688, + "step": 98330 + }, + { + "epoch": 0.3973060436252863, + "grad_norm": 556.3411254882812, + "learning_rate": 3.7998415429613444e-05, + "loss": 58.5663, + "step": 98340 + }, + { + "epoch": 0.39734644489065396, + "grad_norm": 402.5372314453125, + "learning_rate": 3.7995433565517735e-05, + "loss": 47.3463, + "step": 98350 + }, + { + "epoch": 0.3973868461560216, + "grad_norm": 652.9655151367188, + "learning_rate": 3.799245144806937e-05, + "loss": 77.4669, + "step": 98360 + }, + { + "epoch": 0.39742724742138924, + "grad_norm": 351.4555358886719, + "learning_rate": 3.7989469077326466e-05, + "loss": 55.3536, + "step": 98370 + }, + { + "epoch": 0.3974676486867569, + "grad_norm": 530.9819946289062, + "learning_rate": 3.798648645334718e-05, + "loss": 48.7565, + "step": 98380 + }, + { + "epoch": 0.3975080499521245, + "grad_norm": 997.8034057617188, + "learning_rate": 3.798350357618965e-05, + "loss": 69.4074, + "step": 98390 + }, + { + "epoch": 0.3975484512174921, + "grad_norm": 1051.143310546875, + "learning_rate": 3.798052044591204e-05, + "loss": 68.8657, + "step": 98400 + }, + { + "epoch": 0.39758885248285974, + "grad_norm": 763.1336059570312, + "learning_rate": 3.79775370625725e-05, + "loss": 65.6765, + "step": 98410 + }, + { + "epoch": 0.3976292537482274, + "grad_norm": 727.5631713867188, + "learning_rate": 3.797455342622919e-05, + "loss": 55.0416, + "step": 98420 + }, + { + "epoch": 0.397669655013595, + "grad_norm": 747.4799194335938, + "learning_rate": 3.797156953694028e-05, + "loss": 62.9093, + "step": 98430 + }, + { + "epoch": 0.39771005627896266, + "grad_norm": 568.0397338867188, + "learning_rate": 3.796858539476394e-05, + "loss": 57.0158, + "step": 98440 + }, + { + "epoch": 0.3977504575443303, + "grad_norm": 1187.3994140625, + "learning_rate": 3.7965600999758356e-05, + "loss": 75.5707, + "step": 98450 + }, + { + "epoch": 0.39779085880969794, + "grad_norm": 1377.0982666015625, + "learning_rate": 3.796261635198171e-05, + "loss": 110.1248, + "step": 98460 + }, + { + "epoch": 0.39783126007506553, + "grad_norm": 541.7080078125, + "learning_rate": 3.7959631451492176e-05, + "loss": 113.9989, + "step": 98470 + }, + { + "epoch": 0.39787166134043317, + "grad_norm": 1125.3931884765625, + "learning_rate": 3.7956646298347956e-05, + "loss": 53.4357, + "step": 98480 + }, + { + "epoch": 0.3979120626058008, + "grad_norm": 1457.5439453125, + "learning_rate": 3.795366089260725e-05, + "loss": 47.1348, + "step": 98490 + }, + { + "epoch": 0.39795246387116845, + "grad_norm": 639.0778198242188, + "learning_rate": 3.795067523432826e-05, + "loss": 53.8182, + "step": 98500 + }, + { + "epoch": 0.3979928651365361, + "grad_norm": 473.65032958984375, + "learning_rate": 3.794768932356918e-05, + "loss": 63.5976, + "step": 98510 + }, + { + "epoch": 0.39803326640190373, + "grad_norm": 1148.6119384765625, + "learning_rate": 3.7944703160388234e-05, + "loss": 43.6071, + "step": 98520 + }, + { + "epoch": 0.3980736676672713, + "grad_norm": 2093.45849609375, + "learning_rate": 3.794171674484363e-05, + "loss": 60.3479, + "step": 98530 + }, + { + "epoch": 0.39811406893263895, + "grad_norm": 1044.666748046875, + "learning_rate": 3.793873007699361e-05, + "loss": 88.364, + "step": 98540 + }, + { + "epoch": 0.3981544701980066, + "grad_norm": 597.9827270507812, + "learning_rate": 3.7935743156896375e-05, + "loss": 54.5251, + "step": 98550 + }, + { + "epoch": 0.39819487146337423, + "grad_norm": 353.5838928222656, + "learning_rate": 3.793275598461017e-05, + "loss": 66.4365, + "step": 98560 + }, + { + "epoch": 0.3982352727287419, + "grad_norm": 399.634521484375, + "learning_rate": 3.792976856019323e-05, + "loss": 31.9752, + "step": 98570 + }, + { + "epoch": 0.3982756739941095, + "grad_norm": 915.1165771484375, + "learning_rate": 3.792678088370379e-05, + "loss": 70.9123, + "step": 98580 + }, + { + "epoch": 0.39831607525947715, + "grad_norm": 863.623046875, + "learning_rate": 3.792379295520011e-05, + "loss": 69.3178, + "step": 98590 + }, + { + "epoch": 0.39835647652484474, + "grad_norm": 773.9990844726562, + "learning_rate": 3.792080477474043e-05, + "loss": 75.029, + "step": 98600 + }, + { + "epoch": 0.3983968777902124, + "grad_norm": 889.9473876953125, + "learning_rate": 3.7917816342383005e-05, + "loss": 80.714, + "step": 98610 + }, + { + "epoch": 0.39843727905558, + "grad_norm": 551.6591186523438, + "learning_rate": 3.7914827658186103e-05, + "loss": 59.7947, + "step": 98620 + }, + { + "epoch": 0.39847768032094766, + "grad_norm": 457.0266418457031, + "learning_rate": 3.791183872220798e-05, + "loss": 64.1721, + "step": 98630 + }, + { + "epoch": 0.3985180815863153, + "grad_norm": 778.4760131835938, + "learning_rate": 3.790884953450692e-05, + "loss": 53.5698, + "step": 98640 + }, + { + "epoch": 0.39855848285168294, + "grad_norm": 628.935791015625, + "learning_rate": 3.790586009514119e-05, + "loss": 45.5346, + "step": 98650 + }, + { + "epoch": 0.3985988841170505, + "grad_norm": 390.69134521484375, + "learning_rate": 3.790287040416908e-05, + "loss": 80.0498, + "step": 98660 + }, + { + "epoch": 0.39863928538241816, + "grad_norm": 585.2010498046875, + "learning_rate": 3.7899880461648865e-05, + "loss": 45.9213, + "step": 98670 + }, + { + "epoch": 0.3986796866477858, + "grad_norm": 537.3295288085938, + "learning_rate": 3.789689026763883e-05, + "loss": 52.588, + "step": 98680 + }, + { + "epoch": 0.39872008791315344, + "grad_norm": 1386.0994873046875, + "learning_rate": 3.789389982219729e-05, + "loss": 69.4656, + "step": 98690 + }, + { + "epoch": 0.3987604891785211, + "grad_norm": 471.1151123046875, + "learning_rate": 3.789090912538253e-05, + "loss": 52.704, + "step": 98700 + }, + { + "epoch": 0.3988008904438887, + "grad_norm": 407.30584716796875, + "learning_rate": 3.7887918177252855e-05, + "loss": 86.2156, + "step": 98710 + }, + { + "epoch": 0.3988412917092563, + "grad_norm": 563.2343139648438, + "learning_rate": 3.788492697786658e-05, + "loss": 79.5212, + "step": 98720 + }, + { + "epoch": 0.39888169297462395, + "grad_norm": 3062.3173828125, + "learning_rate": 3.788193552728204e-05, + "loss": 67.4379, + "step": 98730 + }, + { + "epoch": 0.3989220942399916, + "grad_norm": 581.1060791015625, + "learning_rate": 3.7878943825557516e-05, + "loss": 53.4574, + "step": 98740 + }, + { + "epoch": 0.3989624955053592, + "grad_norm": 495.8964538574219, + "learning_rate": 3.787595187275136e-05, + "loss": 58.9977, + "step": 98750 + }, + { + "epoch": 0.39900289677072687, + "grad_norm": 576.1397705078125, + "learning_rate": 3.7872959668921884e-05, + "loss": 87.9499, + "step": 98760 + }, + { + "epoch": 0.3990432980360945, + "grad_norm": 429.25439453125, + "learning_rate": 3.786996721412745e-05, + "loss": 42.8542, + "step": 98770 + }, + { + "epoch": 0.39908369930146215, + "grad_norm": 403.6635437011719, + "learning_rate": 3.7866974508426354e-05, + "loss": 43.5901, + "step": 98780 + }, + { + "epoch": 0.39912410056682973, + "grad_norm": 498.6553039550781, + "learning_rate": 3.786398155187698e-05, + "loss": 81.6215, + "step": 98790 + }, + { + "epoch": 0.39916450183219737, + "grad_norm": 779.7023315429688, + "learning_rate": 3.786098834453766e-05, + "loss": 47.5752, + "step": 98800 + }, + { + "epoch": 0.399204903097565, + "grad_norm": 615.2477416992188, + "learning_rate": 3.7857994886466755e-05, + "loss": 53.3112, + "step": 98810 + }, + { + "epoch": 0.39924530436293265, + "grad_norm": 2094.180908203125, + "learning_rate": 3.7855001177722615e-05, + "loss": 62.0935, + "step": 98820 + }, + { + "epoch": 0.3992857056283003, + "grad_norm": 477.36724853515625, + "learning_rate": 3.785200721836361e-05, + "loss": 75.4263, + "step": 98830 + }, + { + "epoch": 0.39932610689366793, + "grad_norm": 2158.408447265625, + "learning_rate": 3.7849013008448115e-05, + "loss": 99.4542, + "step": 98840 + }, + { + "epoch": 0.3993665081590355, + "grad_norm": 1072.518310546875, + "learning_rate": 3.784601854803449e-05, + "loss": 77.213, + "step": 98850 + }, + { + "epoch": 0.39940690942440316, + "grad_norm": 796.7136840820312, + "learning_rate": 3.784302383718113e-05, + "loss": 62.9405, + "step": 98860 + }, + { + "epoch": 0.3994473106897708, + "grad_norm": 716.7407836914062, + "learning_rate": 3.784002887594639e-05, + "loss": 66.7786, + "step": 98870 + }, + { + "epoch": 0.39948771195513844, + "grad_norm": 526.4265747070312, + "learning_rate": 3.783703366438868e-05, + "loss": 47.3223, + "step": 98880 + }, + { + "epoch": 0.3995281132205061, + "grad_norm": 420.97637939453125, + "learning_rate": 3.783403820256639e-05, + "loss": 43.0085, + "step": 98890 + }, + { + "epoch": 0.3995685144858737, + "grad_norm": 931.0668334960938, + "learning_rate": 3.783104249053793e-05, + "loss": 84.5414, + "step": 98900 + }, + { + "epoch": 0.39960891575124136, + "grad_norm": 729.2132568359375, + "learning_rate": 3.782804652836168e-05, + "loss": 59.9172, + "step": 98910 + }, + { + "epoch": 0.39964931701660894, + "grad_norm": 518.9829711914062, + "learning_rate": 3.782505031609607e-05, + "loss": 64.4889, + "step": 98920 + }, + { + "epoch": 0.3996897182819766, + "grad_norm": 756.1053466796875, + "learning_rate": 3.782205385379948e-05, + "loss": 73.3509, + "step": 98930 + }, + { + "epoch": 0.3997301195473442, + "grad_norm": 1349.8555908203125, + "learning_rate": 3.781905714153037e-05, + "loss": 70.6158, + "step": 98940 + }, + { + "epoch": 0.39977052081271186, + "grad_norm": 573.3903198242188, + "learning_rate": 3.781606017934713e-05, + "loss": 48.6632, + "step": 98950 + }, + { + "epoch": 0.3998109220780795, + "grad_norm": 510.5514221191406, + "learning_rate": 3.78130629673082e-05, + "loss": 65.3422, + "step": 98960 + }, + { + "epoch": 0.39985132334344714, + "grad_norm": 676.6386108398438, + "learning_rate": 3.781006550547202e-05, + "loss": 49.1923, + "step": 98970 + }, + { + "epoch": 0.3998917246088147, + "grad_norm": 1945.87353515625, + "learning_rate": 3.780706779389701e-05, + "loss": 109.869, + "step": 98980 + }, + { + "epoch": 0.39993212587418236, + "grad_norm": 607.2948608398438, + "learning_rate": 3.7804069832641615e-05, + "loss": 59.3958, + "step": 98990 + }, + { + "epoch": 0.39997252713955, + "grad_norm": 3488.2783203125, + "learning_rate": 3.780107162176429e-05, + "loss": 71.8975, + "step": 99000 + }, + { + "epoch": 0.40001292840491764, + "grad_norm": 869.2059326171875, + "learning_rate": 3.779807316132349e-05, + "loss": 71.5209, + "step": 99010 + }, + { + "epoch": 0.4000533296702853, + "grad_norm": 1111.3924560546875, + "learning_rate": 3.779507445137766e-05, + "loss": 72.5968, + "step": 99020 + }, + { + "epoch": 0.4000937309356529, + "grad_norm": 590.8203125, + "learning_rate": 3.779207549198527e-05, + "loss": 68.4518, + "step": 99030 + }, + { + "epoch": 0.4001341322010205, + "grad_norm": 1010.8024291992188, + "learning_rate": 3.778907628320477e-05, + "loss": 55.1932, + "step": 99040 + }, + { + "epoch": 0.40017453346638815, + "grad_norm": 1730.03271484375, + "learning_rate": 3.778607682509465e-05, + "loss": 72.5133, + "step": 99050 + }, + { + "epoch": 0.4002149347317558, + "grad_norm": 706.1375122070312, + "learning_rate": 3.7783077117713386e-05, + "loss": 51.3278, + "step": 99060 + }, + { + "epoch": 0.40025533599712343, + "grad_norm": 491.18145751953125, + "learning_rate": 3.778007716111945e-05, + "loss": 57.2659, + "step": 99070 + }, + { + "epoch": 0.40029573726249107, + "grad_norm": 967.4299926757812, + "learning_rate": 3.777707695537133e-05, + "loss": 54.0169, + "step": 99080 + }, + { + "epoch": 0.4003361385278587, + "grad_norm": 586.3837280273438, + "learning_rate": 3.777407650052751e-05, + "loss": 72.3556, + "step": 99090 + }, + { + "epoch": 0.40037653979322635, + "grad_norm": 618.3201904296875, + "learning_rate": 3.77710757966465e-05, + "loss": 57.029, + "step": 99100 + }, + { + "epoch": 0.40041694105859393, + "grad_norm": 753.272705078125, + "learning_rate": 3.7768074843786796e-05, + "loss": 61.8438, + "step": 99110 + }, + { + "epoch": 0.4004573423239616, + "grad_norm": 249.07408142089844, + "learning_rate": 3.776507364200689e-05, + "loss": 66.9332, + "step": 99120 + }, + { + "epoch": 0.4004977435893292, + "grad_norm": 371.7924499511719, + "learning_rate": 3.77620721913653e-05, + "loss": 57.5363, + "step": 99130 + }, + { + "epoch": 0.40053814485469685, + "grad_norm": 1456.956787109375, + "learning_rate": 3.7759070491920544e-05, + "loss": 66.1014, + "step": 99140 + }, + { + "epoch": 0.4005785461200645, + "grad_norm": 700.562744140625, + "learning_rate": 3.775606854373115e-05, + "loss": 60.8896, + "step": 99150 + }, + { + "epoch": 0.40061894738543213, + "grad_norm": 712.740966796875, + "learning_rate": 3.775306634685562e-05, + "loss": 57.9598, + "step": 99160 + }, + { + "epoch": 0.4006593486507997, + "grad_norm": 2491.177490234375, + "learning_rate": 3.7750063901352494e-05, + "loss": 66.2804, + "step": 99170 + }, + { + "epoch": 0.40069974991616736, + "grad_norm": 930.8803100585938, + "learning_rate": 3.774706120728032e-05, + "loss": 61.9584, + "step": 99180 + }, + { + "epoch": 0.400740151181535, + "grad_norm": 488.9775085449219, + "learning_rate": 3.774405826469762e-05, + "loss": 51.8706, + "step": 99190 + }, + { + "epoch": 0.40078055244690264, + "grad_norm": 1185.0855712890625, + "learning_rate": 3.7741055073662946e-05, + "loss": 59.6434, + "step": 99200 + }, + { + "epoch": 0.4008209537122703, + "grad_norm": 1065.01025390625, + "learning_rate": 3.773805163423484e-05, + "loss": 90.2759, + "step": 99210 + }, + { + "epoch": 0.4008613549776379, + "grad_norm": 1346.4959716796875, + "learning_rate": 3.773504794647187e-05, + "loss": 47.388, + "step": 99220 + }, + { + "epoch": 0.40090175624300556, + "grad_norm": 672.511962890625, + "learning_rate": 3.7732044010432564e-05, + "loss": 43.0481, + "step": 99230 + }, + { + "epoch": 0.40094215750837314, + "grad_norm": 655.6102294921875, + "learning_rate": 3.772903982617552e-05, + "loss": 57.7771, + "step": 99240 + }, + { + "epoch": 0.4009825587737408, + "grad_norm": 1324.095703125, + "learning_rate": 3.7726035393759285e-05, + "loss": 57.1472, + "step": 99250 + }, + { + "epoch": 0.4010229600391084, + "grad_norm": 1157.129150390625, + "learning_rate": 3.772303071324244e-05, + "loss": 60.1481, + "step": 99260 + }, + { + "epoch": 0.40106336130447606, + "grad_norm": 784.1769409179688, + "learning_rate": 3.772002578468356e-05, + "loss": 101.2776, + "step": 99270 + }, + { + "epoch": 0.4011037625698437, + "grad_norm": 682.7789306640625, + "learning_rate": 3.771702060814123e-05, + "loss": 86.0453, + "step": 99280 + }, + { + "epoch": 0.40114416383521134, + "grad_norm": 3423.19921875, + "learning_rate": 3.771401518367403e-05, + "loss": 74.2687, + "step": 99290 + }, + { + "epoch": 0.4011845651005789, + "grad_norm": 672.9019775390625, + "learning_rate": 3.771100951134057e-05, + "loss": 55.2397, + "step": 99300 + }, + { + "epoch": 0.40122496636594657, + "grad_norm": 642.078857421875, + "learning_rate": 3.770800359119943e-05, + "loss": 34.1067, + "step": 99310 + }, + { + "epoch": 0.4012653676313142, + "grad_norm": 986.0284423828125, + "learning_rate": 3.770499742330922e-05, + "loss": 64.7369, + "step": 99320 + }, + { + "epoch": 0.40130576889668185, + "grad_norm": 416.7942199707031, + "learning_rate": 3.770199100772853e-05, + "loss": 63.9787, + "step": 99330 + }, + { + "epoch": 0.4013461701620495, + "grad_norm": 1163.7266845703125, + "learning_rate": 3.7698984344515997e-05, + "loss": 55.3137, + "step": 99340 + }, + { + "epoch": 0.4013865714274171, + "grad_norm": 591.773681640625, + "learning_rate": 3.769597743373023e-05, + "loss": 54.0791, + "step": 99350 + }, + { + "epoch": 0.4014269726927847, + "grad_norm": 1694.82373046875, + "learning_rate": 3.769297027542985e-05, + "loss": 46.7332, + "step": 99360 + }, + { + "epoch": 0.40146737395815235, + "grad_norm": 717.4148559570312, + "learning_rate": 3.768996286967347e-05, + "loss": 55.8938, + "step": 99370 + }, + { + "epoch": 0.40150777522352, + "grad_norm": 591.4462890625, + "learning_rate": 3.768695521651973e-05, + "loss": 46.4734, + "step": 99380 + }, + { + "epoch": 0.40154817648888763, + "grad_norm": 1100.54150390625, + "learning_rate": 3.7683947316027276e-05, + "loss": 63.5928, + "step": 99390 + }, + { + "epoch": 0.40158857775425527, + "grad_norm": 421.6627197265625, + "learning_rate": 3.7680939168254733e-05, + "loss": 59.3417, + "step": 99400 + }, + { + "epoch": 0.4016289790196229, + "grad_norm": 307.5633544921875, + "learning_rate": 3.767793077326075e-05, + "loss": 94.9799, + "step": 99410 + }, + { + "epoch": 0.40166938028499055, + "grad_norm": 743.0738525390625, + "learning_rate": 3.767492213110397e-05, + "loss": 58.2898, + "step": 99420 + }, + { + "epoch": 0.40170978155035814, + "grad_norm": 724.935791015625, + "learning_rate": 3.767191324184308e-05, + "loss": 54.7535, + "step": 99430 + }, + { + "epoch": 0.4017501828157258, + "grad_norm": 1314.9210205078125, + "learning_rate": 3.7668904105536706e-05, + "loss": 76.4102, + "step": 99440 + }, + { + "epoch": 0.4017905840810934, + "grad_norm": 920.00537109375, + "learning_rate": 3.7665894722243525e-05, + "loss": 58.301, + "step": 99450 + }, + { + "epoch": 0.40183098534646106, + "grad_norm": 1673.596435546875, + "learning_rate": 3.76628850920222e-05, + "loss": 57.4322, + "step": 99460 + }, + { + "epoch": 0.4018713866118287, + "grad_norm": 826.5586547851562, + "learning_rate": 3.7659875214931426e-05, + "loss": 69.376, + "step": 99470 + }, + { + "epoch": 0.40191178787719634, + "grad_norm": 581.40283203125, + "learning_rate": 3.765686509102985e-05, + "loss": 36.2128, + "step": 99480 + }, + { + "epoch": 0.4019521891425639, + "grad_norm": 326.0263977050781, + "learning_rate": 3.765385472037618e-05, + "loss": 64.3921, + "step": 99490 + }, + { + "epoch": 0.40199259040793156, + "grad_norm": 729.6510009765625, + "learning_rate": 3.765084410302909e-05, + "loss": 63.8964, + "step": 99500 + }, + { + "epoch": 0.4020329916732992, + "grad_norm": 920.1556396484375, + "learning_rate": 3.76478332390473e-05, + "loss": 41.994, + "step": 99510 + }, + { + "epoch": 0.40207339293866684, + "grad_norm": 888.444091796875, + "learning_rate": 3.764482212848948e-05, + "loss": 70.7583, + "step": 99520 + }, + { + "epoch": 0.4021137942040345, + "grad_norm": 909.1885375976562, + "learning_rate": 3.7641810771414335e-05, + "loss": 67.3313, + "step": 99530 + }, + { + "epoch": 0.4021541954694021, + "grad_norm": 1571.4930419921875, + "learning_rate": 3.763879916788059e-05, + "loss": 66.0156, + "step": 99540 + }, + { + "epoch": 0.40219459673476976, + "grad_norm": 724.2689208984375, + "learning_rate": 3.763578731794695e-05, + "loss": 66.495, + "step": 99550 + }, + { + "epoch": 0.40223499800013734, + "grad_norm": 734.593505859375, + "learning_rate": 3.7632775221672115e-05, + "loss": 41.2429, + "step": 99560 + }, + { + "epoch": 0.402275399265505, + "grad_norm": 1102.271728515625, + "learning_rate": 3.7629762879114835e-05, + "loss": 71.2054, + "step": 99570 + }, + { + "epoch": 0.4023158005308726, + "grad_norm": 1053.3270263671875, + "learning_rate": 3.7626750290333824e-05, + "loss": 49.4548, + "step": 99580 + }, + { + "epoch": 0.40235620179624026, + "grad_norm": 774.8560180664062, + "learning_rate": 3.7623737455387814e-05, + "loss": 53.1317, + "step": 99590 + }, + { + "epoch": 0.4023966030616079, + "grad_norm": 792.8966064453125, + "learning_rate": 3.762072437433555e-05, + "loss": 63.3754, + "step": 99600 + }, + { + "epoch": 0.40243700432697554, + "grad_norm": 648.0517578125, + "learning_rate": 3.761771104723576e-05, + "loss": 41.7011, + "step": 99610 + }, + { + "epoch": 0.40247740559234313, + "grad_norm": 816.7391967773438, + "learning_rate": 3.76146974741472e-05, + "loss": 55.6581, + "step": 99620 + }, + { + "epoch": 0.40251780685771077, + "grad_norm": 2065.635498046875, + "learning_rate": 3.761168365512862e-05, + "loss": 76.0005, + "step": 99630 + }, + { + "epoch": 0.4025582081230784, + "grad_norm": 1137.087890625, + "learning_rate": 3.760866959023877e-05, + "loss": 58.8005, + "step": 99640 + }, + { + "epoch": 0.40259860938844605, + "grad_norm": 1612.660400390625, + "learning_rate": 3.760565527953641e-05, + "loss": 52.3195, + "step": 99650 + }, + { + "epoch": 0.4026390106538137, + "grad_norm": 1130.4898681640625, + "learning_rate": 3.7602640723080315e-05, + "loss": 58.5023, + "step": 99660 + }, + { + "epoch": 0.40267941191918133, + "grad_norm": 1282.1605224609375, + "learning_rate": 3.7599625920929254e-05, + "loss": 60.2157, + "step": 99670 + }, + { + "epoch": 0.4027198131845489, + "grad_norm": 626.7401733398438, + "learning_rate": 3.759661087314199e-05, + "loss": 61.224, + "step": 99680 + }, + { + "epoch": 0.40276021444991655, + "grad_norm": 403.7524108886719, + "learning_rate": 3.759359557977732e-05, + "loss": 43.2286, + "step": 99690 + }, + { + "epoch": 0.4028006157152842, + "grad_norm": 459.4110107421875, + "learning_rate": 3.759058004089402e-05, + "loss": 63.8654, + "step": 99700 + }, + { + "epoch": 0.40284101698065183, + "grad_norm": 689.9656982421875, + "learning_rate": 3.758756425655089e-05, + "loss": 55.9281, + "step": 99710 + }, + { + "epoch": 0.4028814182460195, + "grad_norm": 493.5693664550781, + "learning_rate": 3.7584548226806696e-05, + "loss": 46.4668, + "step": 99720 + }, + { + "epoch": 0.4029218195113871, + "grad_norm": 568.452880859375, + "learning_rate": 3.758153195172026e-05, + "loss": 85.0147, + "step": 99730 + }, + { + "epoch": 0.40296222077675475, + "grad_norm": 775.391845703125, + "learning_rate": 3.7578515431350384e-05, + "loss": 62.4054, + "step": 99740 + }, + { + "epoch": 0.40300262204212234, + "grad_norm": 1037.2498779296875, + "learning_rate": 3.757549866575588e-05, + "loss": 58.9337, + "step": 99750 + }, + { + "epoch": 0.40304302330749, + "grad_norm": 487.4396667480469, + "learning_rate": 3.757248165499555e-05, + "loss": 78.7348, + "step": 99760 + }, + { + "epoch": 0.4030834245728576, + "grad_norm": 1064.0037841796875, + "learning_rate": 3.7569464399128215e-05, + "loss": 43.4251, + "step": 99770 + }, + { + "epoch": 0.40312382583822526, + "grad_norm": 417.7305603027344, + "learning_rate": 3.75664468982127e-05, + "loss": 43.3443, + "step": 99780 + }, + { + "epoch": 0.4031642271035929, + "grad_norm": 1124.982177734375, + "learning_rate": 3.756342915230784e-05, + "loss": 72.6339, + "step": 99790 + }, + { + "epoch": 0.40320462836896054, + "grad_norm": 1895.08935546875, + "learning_rate": 3.7560411161472456e-05, + "loss": 87.87, + "step": 99800 + }, + { + "epoch": 0.4032450296343281, + "grad_norm": 885.612060546875, + "learning_rate": 3.755739292576539e-05, + "loss": 68.6611, + "step": 99810 + }, + { + "epoch": 0.40328543089969576, + "grad_norm": 1059.715576171875, + "learning_rate": 3.7554374445245474e-05, + "loss": 50.6557, + "step": 99820 + }, + { + "epoch": 0.4033258321650634, + "grad_norm": 1289.69677734375, + "learning_rate": 3.755135571997158e-05, + "loss": 55.3921, + "step": 99830 + }, + { + "epoch": 0.40336623343043104, + "grad_norm": 948.4808959960938, + "learning_rate": 3.7548336750002544e-05, + "loss": 52.4533, + "step": 99840 + }, + { + "epoch": 0.4034066346957987, + "grad_norm": 768.9962768554688, + "learning_rate": 3.7545317535397214e-05, + "loss": 52.6331, + "step": 99850 + }, + { + "epoch": 0.4034470359611663, + "grad_norm": 806.0327758789062, + "learning_rate": 3.754229807621446e-05, + "loss": 58.2938, + "step": 99860 + }, + { + "epoch": 0.40348743722653396, + "grad_norm": 631.0634155273438, + "learning_rate": 3.753927837251315e-05, + "loss": 68.7876, + "step": 99870 + }, + { + "epoch": 0.40352783849190155, + "grad_norm": 574.1565551757812, + "learning_rate": 3.753625842435216e-05, + "loss": 45.7764, + "step": 99880 + }, + { + "epoch": 0.4035682397572692, + "grad_norm": 1697.2900390625, + "learning_rate": 3.753323823179035e-05, + "loss": 65.6436, + "step": 99890 + }, + { + "epoch": 0.4036086410226368, + "grad_norm": 260.11083984375, + "learning_rate": 3.7530217794886606e-05, + "loss": 71.5105, + "step": 99900 + }, + { + "epoch": 0.40364904228800447, + "grad_norm": 932.4937133789062, + "learning_rate": 3.752719711369982e-05, + "loss": 61.679, + "step": 99910 + }, + { + "epoch": 0.4036894435533721, + "grad_norm": 676.4361572265625, + "learning_rate": 3.752417618828888e-05, + "loss": 90.9519, + "step": 99920 + }, + { + "epoch": 0.40372984481873975, + "grad_norm": 6873.72802734375, + "learning_rate": 3.752115501871267e-05, + "loss": 99.3489, + "step": 99930 + }, + { + "epoch": 0.40377024608410733, + "grad_norm": 0.0, + "learning_rate": 3.75181336050301e-05, + "loss": 58.3508, + "step": 99940 + }, + { + "epoch": 0.40381064734947497, + "grad_norm": 619.0031127929688, + "learning_rate": 3.751511194730007e-05, + "loss": 58.6171, + "step": 99950 + }, + { + "epoch": 0.4038510486148426, + "grad_norm": 963.3147583007812, + "learning_rate": 3.751209004558149e-05, + "loss": 60.5256, + "step": 99960 + }, + { + "epoch": 0.40389144988021025, + "grad_norm": 720.6400756835938, + "learning_rate": 3.750906789993327e-05, + "loss": 59.5601, + "step": 99970 + }, + { + "epoch": 0.4039318511455779, + "grad_norm": 1478.5609130859375, + "learning_rate": 3.7506045510414335e-05, + "loss": 46.86, + "step": 99980 + }, + { + "epoch": 0.40397225241094553, + "grad_norm": 427.1527099609375, + "learning_rate": 3.7503022877083606e-05, + "loss": 57.4738, + "step": 99990 + }, + { + "epoch": 0.4040126536763131, + "grad_norm": 967.2838134765625, + "learning_rate": 3.7500000000000003e-05, + "loss": 77.0952, + "step": 100000 + }, + { + "epoch": 0.40405305494168076, + "grad_norm": 896.5092163085938, + "learning_rate": 3.749697687922247e-05, + "loss": 49.8017, + "step": 100010 + }, + { + "epoch": 0.4040934562070484, + "grad_norm": 663.4006958007812, + "learning_rate": 3.749395351480993e-05, + "loss": 45.9336, + "step": 100020 + }, + { + "epoch": 0.40413385747241604, + "grad_norm": 471.2986145019531, + "learning_rate": 3.749092990682134e-05, + "loss": 62.4995, + "step": 100030 + }, + { + "epoch": 0.4041742587377837, + "grad_norm": 956.7597045898438, + "learning_rate": 3.748790605531565e-05, + "loss": 62.3297, + "step": 100040 + }, + { + "epoch": 0.4042146600031513, + "grad_norm": 2078.440185546875, + "learning_rate": 3.748488196035179e-05, + "loss": 129.3817, + "step": 100050 + }, + { + "epoch": 0.40425506126851896, + "grad_norm": 4852.7265625, + "learning_rate": 3.748185762198873e-05, + "loss": 86.8261, + "step": 100060 + }, + { + "epoch": 0.40429546253388654, + "grad_norm": 1042.48828125, + "learning_rate": 3.747883304028543e-05, + "loss": 51.4865, + "step": 100070 + }, + { + "epoch": 0.4043358637992542, + "grad_norm": 758.50048828125, + "learning_rate": 3.7475808215300854e-05, + "loss": 47.7026, + "step": 100080 + }, + { + "epoch": 0.4043762650646218, + "grad_norm": 769.8150024414062, + "learning_rate": 3.7472783147093985e-05, + "loss": 85.2325, + "step": 100090 + }, + { + "epoch": 0.40441666632998946, + "grad_norm": 777.2942504882812, + "learning_rate": 3.746975783572377e-05, + "loss": 67.7482, + "step": 100100 + }, + { + "epoch": 0.4044570675953571, + "grad_norm": 605.9794921875, + "learning_rate": 3.746673228124922e-05, + "loss": 53.6268, + "step": 100110 + }, + { + "epoch": 0.40449746886072474, + "grad_norm": 1080.2708740234375, + "learning_rate": 3.7463706483729296e-05, + "loss": 69.3192, + "step": 100120 + }, + { + "epoch": 0.4045378701260923, + "grad_norm": 725.3931884765625, + "learning_rate": 3.7460680443223004e-05, + "loss": 42.6117, + "step": 100130 + }, + { + "epoch": 0.40457827139145996, + "grad_norm": 629.57470703125, + "learning_rate": 3.745765415978933e-05, + "loss": 69.1875, + "step": 100140 + }, + { + "epoch": 0.4046186726568276, + "grad_norm": 788.4440307617188, + "learning_rate": 3.7454627633487274e-05, + "loss": 86.3007, + "step": 100150 + }, + { + "epoch": 0.40465907392219524, + "grad_norm": 879.8585815429688, + "learning_rate": 3.7451600864375844e-05, + "loss": 81.9851, + "step": 100160 + }, + { + "epoch": 0.4046994751875629, + "grad_norm": 578.5433349609375, + "learning_rate": 3.7448573852514035e-05, + "loss": 56.7397, + "step": 100170 + }, + { + "epoch": 0.4047398764529305, + "grad_norm": 731.2473754882812, + "learning_rate": 3.744554659796088e-05, + "loss": 54.396, + "step": 100180 + }, + { + "epoch": 0.40478027771829816, + "grad_norm": 718.974609375, + "learning_rate": 3.744251910077538e-05, + "loss": 47.8276, + "step": 100190 + }, + { + "epoch": 0.40482067898366575, + "grad_norm": 714.3526000976562, + "learning_rate": 3.7439491361016564e-05, + "loss": 68.6014, + "step": 100200 + }, + { + "epoch": 0.4048610802490334, + "grad_norm": 681.6476440429688, + "learning_rate": 3.743646337874346e-05, + "loss": 68.5919, + "step": 100210 + }, + { + "epoch": 0.40490148151440103, + "grad_norm": 709.8939819335938, + "learning_rate": 3.743343515401511e-05, + "loss": 60.53, + "step": 100220 + }, + { + "epoch": 0.40494188277976867, + "grad_norm": 730.64892578125, + "learning_rate": 3.743040668689053e-05, + "loss": 91.2893, + "step": 100230 + }, + { + "epoch": 0.4049822840451363, + "grad_norm": 1242.37451171875, + "learning_rate": 3.742737797742878e-05, + "loss": 50.5919, + "step": 100240 + }, + { + "epoch": 0.40502268531050395, + "grad_norm": 1279.793701171875, + "learning_rate": 3.742434902568889e-05, + "loss": 47.2868, + "step": 100250 + }, + { + "epoch": 0.40506308657587153, + "grad_norm": 446.11175537109375, + "learning_rate": 3.742131983172992e-05, + "loss": 43.8764, + "step": 100260 + }, + { + "epoch": 0.4051034878412392, + "grad_norm": 478.0585021972656, + "learning_rate": 3.741829039561092e-05, + "loss": 67.3198, + "step": 100270 + }, + { + "epoch": 0.4051438891066068, + "grad_norm": 1781.1514892578125, + "learning_rate": 3.741526071739097e-05, + "loss": 55.5198, + "step": 100280 + }, + { + "epoch": 0.40518429037197445, + "grad_norm": 601.347412109375, + "learning_rate": 3.741223079712911e-05, + "loss": 50.489, + "step": 100290 + }, + { + "epoch": 0.4052246916373421, + "grad_norm": 631.7415161132812, + "learning_rate": 3.7409200634884426e-05, + "loss": 52.5421, + "step": 100300 + }, + { + "epoch": 0.40526509290270973, + "grad_norm": 431.8924255371094, + "learning_rate": 3.740617023071598e-05, + "loss": 59.2278, + "step": 100310 + }, + { + "epoch": 0.4053054941680773, + "grad_norm": 1287.239501953125, + "learning_rate": 3.740313958468287e-05, + "loss": 63.1438, + "step": 100320 + }, + { + "epoch": 0.40534589543344496, + "grad_norm": 769.814208984375, + "learning_rate": 3.7400108696844156e-05, + "loss": 56.3332, + "step": 100330 + }, + { + "epoch": 0.4053862966988126, + "grad_norm": 372.637451171875, + "learning_rate": 3.739707756725894e-05, + "loss": 61.8536, + "step": 100340 + }, + { + "epoch": 0.40542669796418024, + "grad_norm": 537.6292114257812, + "learning_rate": 3.739404619598632e-05, + "loss": 75.01, + "step": 100350 + }, + { + "epoch": 0.4054670992295479, + "grad_norm": 1282.1539306640625, + "learning_rate": 3.7391014583085385e-05, + "loss": 42.7495, + "step": 100360 + }, + { + "epoch": 0.4055075004949155, + "grad_norm": 1255.515869140625, + "learning_rate": 3.738798272861525e-05, + "loss": 79.5692, + "step": 100370 + }, + { + "epoch": 0.40554790176028316, + "grad_norm": 1217.02294921875, + "learning_rate": 3.7384950632634995e-05, + "loss": 55.6681, + "step": 100380 + }, + { + "epoch": 0.40558830302565074, + "grad_norm": 480.9776306152344, + "learning_rate": 3.7381918295203774e-05, + "loss": 43.6625, + "step": 100390 + }, + { + "epoch": 0.4056287042910184, + "grad_norm": 861.3986206054688, + "learning_rate": 3.7378885716380664e-05, + "loss": 48.5241, + "step": 100400 + }, + { + "epoch": 0.405669105556386, + "grad_norm": 490.213134765625, + "learning_rate": 3.737585289622482e-05, + "loss": 60.6238, + "step": 100410 + }, + { + "epoch": 0.40570950682175366, + "grad_norm": 0.0, + "learning_rate": 3.7372819834795335e-05, + "loss": 82.8279, + "step": 100420 + }, + { + "epoch": 0.4057499080871213, + "grad_norm": 3635.81689453125, + "learning_rate": 3.736978653215136e-05, + "loss": 79.4986, + "step": 100430 + }, + { + "epoch": 0.40579030935248894, + "grad_norm": 803.3690795898438, + "learning_rate": 3.736675298835203e-05, + "loss": 84.2047, + "step": 100440 + }, + { + "epoch": 0.4058307106178565, + "grad_norm": 1111.7283935546875, + "learning_rate": 3.7363719203456495e-05, + "loss": 51.9508, + "step": 100450 + }, + { + "epoch": 0.40587111188322417, + "grad_norm": 472.92242431640625, + "learning_rate": 3.736068517752388e-05, + "loss": 47.1429, + "step": 100460 + }, + { + "epoch": 0.4059115131485918, + "grad_norm": 638.3177490234375, + "learning_rate": 3.735765091061334e-05, + "loss": 74.3898, + "step": 100470 + }, + { + "epoch": 0.40595191441395945, + "grad_norm": 877.7601318359375, + "learning_rate": 3.7354616402784035e-05, + "loss": 63.0587, + "step": 100480 + }, + { + "epoch": 0.4059923156793271, + "grad_norm": 625.0929565429688, + "learning_rate": 3.735158165409514e-05, + "loss": 38.2794, + "step": 100490 + }, + { + "epoch": 0.4060327169446947, + "grad_norm": 756.3575439453125, + "learning_rate": 3.7348546664605777e-05, + "loss": 64.2025, + "step": 100500 + }, + { + "epoch": 0.4060731182100623, + "grad_norm": 1217.9346923828125, + "learning_rate": 3.7345511434375145e-05, + "loss": 68.1389, + "step": 100510 + }, + { + "epoch": 0.40611351947542995, + "grad_norm": 873.300048828125, + "learning_rate": 3.734247596346242e-05, + "loss": 66.9642, + "step": 100520 + }, + { + "epoch": 0.4061539207407976, + "grad_norm": 959.7666015625, + "learning_rate": 3.733944025192677e-05, + "loss": 66.9598, + "step": 100530 + }, + { + "epoch": 0.40619432200616523, + "grad_norm": 789.1475830078125, + "learning_rate": 3.733640429982738e-05, + "loss": 58.7012, + "step": 100540 + }, + { + "epoch": 0.40623472327153287, + "grad_norm": 480.3835754394531, + "learning_rate": 3.7333368107223424e-05, + "loss": 43.2679, + "step": 100550 + }, + { + "epoch": 0.4062751245369005, + "grad_norm": 759.8263549804688, + "learning_rate": 3.7330331674174125e-05, + "loss": 66.052, + "step": 100560 + }, + { + "epoch": 0.40631552580226815, + "grad_norm": 859.6227416992188, + "learning_rate": 3.732729500073866e-05, + "loss": 56.3151, + "step": 100570 + }, + { + "epoch": 0.40635592706763574, + "grad_norm": 405.04998779296875, + "learning_rate": 3.732425808697622e-05, + "loss": 54.2167, + "step": 100580 + }, + { + "epoch": 0.4063963283330034, + "grad_norm": 821.4298706054688, + "learning_rate": 3.732122093294603e-05, + "loss": 67.599, + "step": 100590 + }, + { + "epoch": 0.406436729598371, + "grad_norm": 777.3472290039062, + "learning_rate": 3.731818353870729e-05, + "loss": 58.1797, + "step": 100600 + }, + { + "epoch": 0.40647713086373866, + "grad_norm": 1343.93359375, + "learning_rate": 3.731514590431922e-05, + "loss": 65.9451, + "step": 100610 + }, + { + "epoch": 0.4065175321291063, + "grad_norm": 800.9368896484375, + "learning_rate": 3.731210802984105e-05, + "loss": 67.0606, + "step": 100620 + }, + { + "epoch": 0.40655793339447394, + "grad_norm": 1185.2408447265625, + "learning_rate": 3.730906991533199e-05, + "loss": 76.7648, + "step": 100630 + }, + { + "epoch": 0.4065983346598415, + "grad_norm": 545.9066772460938, + "learning_rate": 3.7306031560851275e-05, + "loss": 42.591, + "step": 100640 + }, + { + "epoch": 0.40663873592520916, + "grad_norm": 551.5760498046875, + "learning_rate": 3.730299296645814e-05, + "loss": 81.1927, + "step": 100650 + }, + { + "epoch": 0.4066791371905768, + "grad_norm": 3141.09326171875, + "learning_rate": 3.729995413221183e-05, + "loss": 85.903, + "step": 100660 + }, + { + "epoch": 0.40671953845594444, + "grad_norm": 1117.46728515625, + "learning_rate": 3.7296915058171566e-05, + "loss": 59.2733, + "step": 100670 + }, + { + "epoch": 0.4067599397213121, + "grad_norm": 2471.260986328125, + "learning_rate": 3.729387574439662e-05, + "loss": 68.098, + "step": 100680 + }, + { + "epoch": 0.4068003409866797, + "grad_norm": 768.341552734375, + "learning_rate": 3.729083619094624e-05, + "loss": 73.1918, + "step": 100690 + }, + { + "epoch": 0.40684074225204736, + "grad_norm": 0.0, + "learning_rate": 3.7287796397879674e-05, + "loss": 28.0081, + "step": 100700 + }, + { + "epoch": 0.40688114351741494, + "grad_norm": 1264.1614990234375, + "learning_rate": 3.72847563652562e-05, + "loss": 45.6677, + "step": 100710 + }, + { + "epoch": 0.4069215447827826, + "grad_norm": 287.6547546386719, + "learning_rate": 3.7281716093135063e-05, + "loss": 60.4336, + "step": 100720 + }, + { + "epoch": 0.4069619460481502, + "grad_norm": 565.496826171875, + "learning_rate": 3.7278675581575564e-05, + "loss": 49.9453, + "step": 100730 + }, + { + "epoch": 0.40700234731351786, + "grad_norm": 513.9716796875, + "learning_rate": 3.7275634830636957e-05, + "loss": 48.138, + "step": 100740 + }, + { + "epoch": 0.4070427485788855, + "grad_norm": 798.354248046875, + "learning_rate": 3.727259384037852e-05, + "loss": 49.801, + "step": 100750 + }, + { + "epoch": 0.40708314984425314, + "grad_norm": 901.558837890625, + "learning_rate": 3.726955261085956e-05, + "loss": 38.1762, + "step": 100760 + }, + { + "epoch": 0.40712355110962073, + "grad_norm": 808.7131958007812, + "learning_rate": 3.726651114213935e-05, + "loss": 63.8376, + "step": 100770 + }, + { + "epoch": 0.40716395237498837, + "grad_norm": 726.5311279296875, + "learning_rate": 3.726346943427719e-05, + "loss": 51.7929, + "step": 100780 + }, + { + "epoch": 0.407204353640356, + "grad_norm": 367.88018798828125, + "learning_rate": 3.726042748733238e-05, + "loss": 85.8663, + "step": 100790 + }, + { + "epoch": 0.40724475490572365, + "grad_norm": 368.78338623046875, + "learning_rate": 3.725738530136422e-05, + "loss": 33.2297, + "step": 100800 + }, + { + "epoch": 0.4072851561710913, + "grad_norm": 942.588623046875, + "learning_rate": 3.7254342876432026e-05, + "loss": 74.4766, + "step": 100810 + }, + { + "epoch": 0.40732555743645893, + "grad_norm": 704.6798095703125, + "learning_rate": 3.7251300212595106e-05, + "loss": 84.4977, + "step": 100820 + }, + { + "epoch": 0.4073659587018265, + "grad_norm": 411.1459045410156, + "learning_rate": 3.724825730991279e-05, + "loss": 67.2837, + "step": 100830 + }, + { + "epoch": 0.40740635996719415, + "grad_norm": 969.4332885742188, + "learning_rate": 3.7245214168444386e-05, + "loss": 73.3558, + "step": 100840 + }, + { + "epoch": 0.4074467612325618, + "grad_norm": 857.5194091796875, + "learning_rate": 3.724217078824923e-05, + "loss": 87.0812, + "step": 100850 + }, + { + "epoch": 0.40748716249792943, + "grad_norm": 789.8101196289062, + "learning_rate": 3.723912716938665e-05, + "loss": 62.384, + "step": 100860 + }, + { + "epoch": 0.4075275637632971, + "grad_norm": 963.7822265625, + "learning_rate": 3.723608331191598e-05, + "loss": 70.5818, + "step": 100870 + }, + { + "epoch": 0.4075679650286647, + "grad_norm": 1622.1092529296875, + "learning_rate": 3.723303921589657e-05, + "loss": 73.844, + "step": 100880 + }, + { + "epoch": 0.40760836629403235, + "grad_norm": 4206.56787109375, + "learning_rate": 3.722999488138776e-05, + "loss": 86.4621, + "step": 100890 + }, + { + "epoch": 0.40764876755939994, + "grad_norm": 780.2285766601562, + "learning_rate": 3.722695030844891e-05, + "loss": 42.5627, + "step": 100900 + }, + { + "epoch": 0.4076891688247676, + "grad_norm": 1180.5223388671875, + "learning_rate": 3.7223905497139366e-05, + "loss": 59.9004, + "step": 100910 + }, + { + "epoch": 0.4077295700901352, + "grad_norm": 609.8385009765625, + "learning_rate": 3.722086044751849e-05, + "loss": 39.532, + "step": 100920 + }, + { + "epoch": 0.40776997135550286, + "grad_norm": 0.0, + "learning_rate": 3.721781515964565e-05, + "loss": 33.3144, + "step": 100930 + }, + { + "epoch": 0.4078103726208705, + "grad_norm": 1108.013671875, + "learning_rate": 3.721476963358021e-05, + "loss": 67.8884, + "step": 100940 + }, + { + "epoch": 0.40785077388623814, + "grad_norm": 503.3689270019531, + "learning_rate": 3.721172386938155e-05, + "loss": 63.0006, + "step": 100950 + }, + { + "epoch": 0.4078911751516057, + "grad_norm": 2834.478515625, + "learning_rate": 3.720867786710904e-05, + "loss": 74.9134, + "step": 100960 + }, + { + "epoch": 0.40793157641697336, + "grad_norm": 1194.1221923828125, + "learning_rate": 3.7205631626822074e-05, + "loss": 47.9969, + "step": 100970 + }, + { + "epoch": 0.407971977682341, + "grad_norm": 798.5789794921875, + "learning_rate": 3.7202585148580036e-05, + "loss": 86.6126, + "step": 100980 + }, + { + "epoch": 0.40801237894770864, + "grad_norm": 505.0960998535156, + "learning_rate": 3.7199538432442316e-05, + "loss": 59.1577, + "step": 100990 + }, + { + "epoch": 0.4080527802130763, + "grad_norm": 767.0346069335938, + "learning_rate": 3.719649147846832e-05, + "loss": 64.2209, + "step": 101000 + }, + { + "epoch": 0.4080931814784439, + "grad_norm": 577.536865234375, + "learning_rate": 3.7193444286717436e-05, + "loss": 64.3384, + "step": 101010 + }, + { + "epoch": 0.40813358274381156, + "grad_norm": 668.321533203125, + "learning_rate": 3.719039685724909e-05, + "loss": 39.2026, + "step": 101020 + }, + { + "epoch": 0.40817398400917915, + "grad_norm": 211.69288635253906, + "learning_rate": 3.718734919012267e-05, + "loss": 51.6079, + "step": 101030 + }, + { + "epoch": 0.4082143852745468, + "grad_norm": 391.42852783203125, + "learning_rate": 3.71843012853976e-05, + "loss": 42.6838, + "step": 101040 + }, + { + "epoch": 0.4082547865399144, + "grad_norm": 441.98419189453125, + "learning_rate": 3.718125314313331e-05, + "loss": 40.6746, + "step": 101050 + }, + { + "epoch": 0.40829518780528207, + "grad_norm": 772.721923828125, + "learning_rate": 3.7178204763389216e-05, + "loss": 68.107, + "step": 101060 + }, + { + "epoch": 0.4083355890706497, + "grad_norm": 286.0000305175781, + "learning_rate": 3.717515614622476e-05, + "loss": 79.75, + "step": 101070 + }, + { + "epoch": 0.40837599033601735, + "grad_norm": 966.0552368164062, + "learning_rate": 3.717210729169935e-05, + "loss": 72.2583, + "step": 101080 + }, + { + "epoch": 0.40841639160138493, + "grad_norm": 756.3565673828125, + "learning_rate": 3.7169058199872455e-05, + "loss": 69.5634, + "step": 101090 + }, + { + "epoch": 0.40845679286675257, + "grad_norm": 754.9552001953125, + "learning_rate": 3.71660088708035e-05, + "loss": 58.8931, + "step": 101100 + }, + { + "epoch": 0.4084971941321202, + "grad_norm": 1555.589599609375, + "learning_rate": 3.716295930455194e-05, + "loss": 79.8807, + "step": 101110 + }, + { + "epoch": 0.40853759539748785, + "grad_norm": 1325.0550537109375, + "learning_rate": 3.7159909501177226e-05, + "loss": 68.7145, + "step": 101120 + }, + { + "epoch": 0.4085779966628555, + "grad_norm": 620.8428955078125, + "learning_rate": 3.715685946073881e-05, + "loss": 55.6151, + "step": 101130 + }, + { + "epoch": 0.40861839792822313, + "grad_norm": 955.8616943359375, + "learning_rate": 3.7153809183296176e-05, + "loss": 66.1763, + "step": 101140 + }, + { + "epoch": 0.4086587991935907, + "grad_norm": 599.90234375, + "learning_rate": 3.715075866890876e-05, + "loss": 65.0991, + "step": 101150 + }, + { + "epoch": 0.40869920045895836, + "grad_norm": 3034.8486328125, + "learning_rate": 3.7147707917636046e-05, + "loss": 69.5837, + "step": 101160 + }, + { + "epoch": 0.408739601724326, + "grad_norm": 0.0, + "learning_rate": 3.7144656929537524e-05, + "loss": 45.1975, + "step": 101170 + }, + { + "epoch": 0.40878000298969364, + "grad_norm": 502.8624572753906, + "learning_rate": 3.714160570467266e-05, + "loss": 73.8093, + "step": 101180 + }, + { + "epoch": 0.4088204042550613, + "grad_norm": 0.0, + "learning_rate": 3.7138554243100934e-05, + "loss": 48.2301, + "step": 101190 + }, + { + "epoch": 0.4088608055204289, + "grad_norm": 648.0908203125, + "learning_rate": 3.713550254488185e-05, + "loss": 103.6744, + "step": 101200 + }, + { + "epoch": 0.40890120678579656, + "grad_norm": 1352.32421875, + "learning_rate": 3.71324506100749e-05, + "loss": 82.1861, + "step": 101210 + }, + { + "epoch": 0.40894160805116414, + "grad_norm": 1041.673583984375, + "learning_rate": 3.712939843873957e-05, + "loss": 40.6213, + "step": 101220 + }, + { + "epoch": 0.4089820093165318, + "grad_norm": 763.6616821289062, + "learning_rate": 3.7126346030935374e-05, + "loss": 55.786, + "step": 101230 + }, + { + "epoch": 0.4090224105818994, + "grad_norm": 892.0834350585938, + "learning_rate": 3.712329338672182e-05, + "loss": 60.7086, + "step": 101240 + }, + { + "epoch": 0.40906281184726706, + "grad_norm": 788.8603515625, + "learning_rate": 3.712024050615843e-05, + "loss": 53.5069, + "step": 101250 + }, + { + "epoch": 0.4091032131126347, + "grad_norm": 1101.8909912109375, + "learning_rate": 3.71171873893047e-05, + "loss": 47.3406, + "step": 101260 + }, + { + "epoch": 0.40914361437800234, + "grad_norm": 296.9659729003906, + "learning_rate": 3.711413403622017e-05, + "loss": 55.5417, + "step": 101270 + }, + { + "epoch": 0.4091840156433699, + "grad_norm": 653.2747802734375, + "learning_rate": 3.711108044696436e-05, + "loss": 71.033, + "step": 101280 + }, + { + "epoch": 0.40922441690873756, + "grad_norm": 671.96533203125, + "learning_rate": 3.710802662159679e-05, + "loss": 58.3862, + "step": 101290 + }, + { + "epoch": 0.4092648181741052, + "grad_norm": 519.48095703125, + "learning_rate": 3.710497256017702e-05, + "loss": 90.0393, + "step": 101300 + }, + { + "epoch": 0.40930521943947284, + "grad_norm": 837.6402587890625, + "learning_rate": 3.7101918262764576e-05, + "loss": 60.9863, + "step": 101310 + }, + { + "epoch": 0.4093456207048405, + "grad_norm": 907.7230224609375, + "learning_rate": 3.7098863729419e-05, + "loss": 80.4925, + "step": 101320 + }, + { + "epoch": 0.4093860219702081, + "grad_norm": 2056.220947265625, + "learning_rate": 3.709580896019985e-05, + "loss": 49.3788, + "step": 101330 + }, + { + "epoch": 0.40942642323557576, + "grad_norm": 531.6597900390625, + "learning_rate": 3.7092753955166674e-05, + "loss": 81.7104, + "step": 101340 + }, + { + "epoch": 0.40946682450094335, + "grad_norm": 714.7631225585938, + "learning_rate": 3.708969871437904e-05, + "loss": 63.8878, + "step": 101350 + }, + { + "epoch": 0.409507225766311, + "grad_norm": 1157.5352783203125, + "learning_rate": 3.7086643237896504e-05, + "loss": 41.3693, + "step": 101360 + }, + { + "epoch": 0.40954762703167863, + "grad_norm": 531.5262451171875, + "learning_rate": 3.708358752577863e-05, + "loss": 71.7268, + "step": 101370 + }, + { + "epoch": 0.40958802829704627, + "grad_norm": 447.32330322265625, + "learning_rate": 3.7080531578085e-05, + "loss": 66.6252, + "step": 101380 + }, + { + "epoch": 0.4096284295624139, + "grad_norm": 983.9207153320312, + "learning_rate": 3.707747539487519e-05, + "loss": 62.158, + "step": 101390 + }, + { + "epoch": 0.40966883082778155, + "grad_norm": 294.036376953125, + "learning_rate": 3.7074418976208766e-05, + "loss": 54.1539, + "step": 101400 + }, + { + "epoch": 0.40970923209314913, + "grad_norm": 2036.715087890625, + "learning_rate": 3.707136232214534e-05, + "loss": 53.9095, + "step": 101410 + }, + { + "epoch": 0.4097496333585168, + "grad_norm": 1761.55908203125, + "learning_rate": 3.706830543274449e-05, + "loss": 57.585, + "step": 101420 + }, + { + "epoch": 0.4097900346238844, + "grad_norm": 1144.66064453125, + "learning_rate": 3.706524830806581e-05, + "loss": 80.4368, + "step": 101430 + }, + { + "epoch": 0.40983043588925205, + "grad_norm": 457.09368896484375, + "learning_rate": 3.706219094816891e-05, + "loss": 80.4445, + "step": 101440 + }, + { + "epoch": 0.4098708371546197, + "grad_norm": 451.6939697265625, + "learning_rate": 3.705913335311338e-05, + "loss": 46.0204, + "step": 101450 + }, + { + "epoch": 0.40991123841998733, + "grad_norm": 675.3792724609375, + "learning_rate": 3.705607552295883e-05, + "loss": 29.4593, + "step": 101460 + }, + { + "epoch": 0.4099516396853549, + "grad_norm": 473.2438659667969, + "learning_rate": 3.7053017457764895e-05, + "loss": 69.6346, + "step": 101470 + }, + { + "epoch": 0.40999204095072256, + "grad_norm": 1086.820556640625, + "learning_rate": 3.704995915759117e-05, + "loss": 52.2497, + "step": 101480 + }, + { + "epoch": 0.4100324422160902, + "grad_norm": 1838.8997802734375, + "learning_rate": 3.704690062249729e-05, + "loss": 67.0797, + "step": 101490 + }, + { + "epoch": 0.41007284348145784, + "grad_norm": 1140.804931640625, + "learning_rate": 3.704384185254288e-05, + "loss": 59.0365, + "step": 101500 + }, + { + "epoch": 0.4101132447468255, + "grad_norm": 967.5361938476562, + "learning_rate": 3.7040782847787576e-05, + "loss": 62.3368, + "step": 101510 + }, + { + "epoch": 0.4101536460121931, + "grad_norm": 862.10107421875, + "learning_rate": 3.7037723608291015e-05, + "loss": 66.9464, + "step": 101520 + }, + { + "epoch": 0.41019404727756076, + "grad_norm": 333.78411865234375, + "learning_rate": 3.703466413411282e-05, + "loss": 57.9339, + "step": 101530 + }, + { + "epoch": 0.41023444854292834, + "grad_norm": 1024.932861328125, + "learning_rate": 3.703160442531266e-05, + "loss": 50.8443, + "step": 101540 + }, + { + "epoch": 0.410274849808296, + "grad_norm": 1006.3261108398438, + "learning_rate": 3.702854448195019e-05, + "loss": 66.2163, + "step": 101550 + }, + { + "epoch": 0.4103152510736636, + "grad_norm": 720.7424926757812, + "learning_rate": 3.7025484304085034e-05, + "loss": 61.6989, + "step": 101560 + }, + { + "epoch": 0.41035565233903126, + "grad_norm": 972.9140625, + "learning_rate": 3.702242389177687e-05, + "loss": 58.963, + "step": 101570 + }, + { + "epoch": 0.4103960536043989, + "grad_norm": 819.4244384765625, + "learning_rate": 3.701936324508537e-05, + "loss": 58.8996, + "step": 101580 + }, + { + "epoch": 0.41043645486976654, + "grad_norm": 1408.4876708984375, + "learning_rate": 3.7016302364070196e-05, + "loss": 104.3692, + "step": 101590 + }, + { + "epoch": 0.4104768561351341, + "grad_norm": 1001.0791625976562, + "learning_rate": 3.701324124879102e-05, + "loss": 76.1169, + "step": 101600 + }, + { + "epoch": 0.41051725740050177, + "grad_norm": 1362.98193359375, + "learning_rate": 3.701017989930752e-05, + "loss": 79.0587, + "step": 101610 + }, + { + "epoch": 0.4105576586658694, + "grad_norm": 494.7317199707031, + "learning_rate": 3.7007118315679384e-05, + "loss": 41.3256, + "step": 101620 + }, + { + "epoch": 0.41059805993123705, + "grad_norm": 418.2009582519531, + "learning_rate": 3.700405649796629e-05, + "loss": 77.2711, + "step": 101630 + }, + { + "epoch": 0.4106384611966047, + "grad_norm": 934.8997192382812, + "learning_rate": 3.700099444622794e-05, + "loss": 64.7147, + "step": 101640 + }, + { + "epoch": 0.4106788624619723, + "grad_norm": 829.1802978515625, + "learning_rate": 3.699793216052402e-05, + "loss": 57.4641, + "step": 101650 + }, + { + "epoch": 0.41071926372733997, + "grad_norm": 536.0474853515625, + "learning_rate": 3.699486964091423e-05, + "loss": 79.0541, + "step": 101660 + }, + { + "epoch": 0.41075966499270755, + "grad_norm": 808.2246704101562, + "learning_rate": 3.699180688745829e-05, + "loss": 50.0221, + "step": 101670 + }, + { + "epoch": 0.4108000662580752, + "grad_norm": 558.8209838867188, + "learning_rate": 3.6988743900215894e-05, + "loss": 52.1435, + "step": 101680 + }, + { + "epoch": 0.41084046752344283, + "grad_norm": 1482.892578125, + "learning_rate": 3.698568067924677e-05, + "loss": 49.8897, + "step": 101690 + }, + { + "epoch": 0.41088086878881047, + "grad_norm": 494.3254699707031, + "learning_rate": 3.698261722461063e-05, + "loss": 64.2092, + "step": 101700 + }, + { + "epoch": 0.4109212700541781, + "grad_norm": 455.20489501953125, + "learning_rate": 3.6979553536367194e-05, + "loss": 38.1696, + "step": 101710 + }, + { + "epoch": 0.41096167131954575, + "grad_norm": 528.5197143554688, + "learning_rate": 3.69764896145762e-05, + "loss": 42.2116, + "step": 101720 + }, + { + "epoch": 0.41100207258491334, + "grad_norm": 935.036865234375, + "learning_rate": 3.697342545929737e-05, + "loss": 76.0732, + "step": 101730 + }, + { + "epoch": 0.411042473850281, + "grad_norm": 1118.97216796875, + "learning_rate": 3.697036107059044e-05, + "loss": 37.8624, + "step": 101740 + }, + { + "epoch": 0.4110828751156486, + "grad_norm": 1046.9444580078125, + "learning_rate": 3.696729644851518e-05, + "loss": 34.6092, + "step": 101750 + }, + { + "epoch": 0.41112327638101626, + "grad_norm": 427.7893981933594, + "learning_rate": 3.696423159313129e-05, + "loss": 67.689, + "step": 101760 + }, + { + "epoch": 0.4111636776463839, + "grad_norm": 655.9987182617188, + "learning_rate": 3.696116650449856e-05, + "loss": 42.4555, + "step": 101770 + }, + { + "epoch": 0.41120407891175154, + "grad_norm": 911.2579956054688, + "learning_rate": 3.6958101182676726e-05, + "loss": 61.5867, + "step": 101780 + }, + { + "epoch": 0.4112444801771191, + "grad_norm": 2679.955078125, + "learning_rate": 3.6955035627725557e-05, + "loss": 62.913, + "step": 101790 + }, + { + "epoch": 0.41128488144248676, + "grad_norm": 736.22802734375, + "learning_rate": 3.695196983970481e-05, + "loss": 54.0211, + "step": 101800 + }, + { + "epoch": 0.4113252827078544, + "grad_norm": 1243.8758544921875, + "learning_rate": 3.694890381867425e-05, + "loss": 58.7315, + "step": 101810 + }, + { + "epoch": 0.41136568397322204, + "grad_norm": 828.197265625, + "learning_rate": 3.6945837564693666e-05, + "loss": 65.7947, + "step": 101820 + }, + { + "epoch": 0.4114060852385897, + "grad_norm": 682.7947387695312, + "learning_rate": 3.6942771077822835e-05, + "loss": 57.709, + "step": 101830 + }, + { + "epoch": 0.4114464865039573, + "grad_norm": 1457.8909912109375, + "learning_rate": 3.693970435812153e-05, + "loss": 53.162, + "step": 101840 + }, + { + "epoch": 0.41148688776932496, + "grad_norm": 1344.3275146484375, + "learning_rate": 3.693663740564953e-05, + "loss": 54.1553, + "step": 101850 + }, + { + "epoch": 0.41152728903469254, + "grad_norm": 0.0, + "learning_rate": 3.693357022046665e-05, + "loss": 47.7167, + "step": 101860 + }, + { + "epoch": 0.4115676903000602, + "grad_norm": 927.4996337890625, + "learning_rate": 3.693050280263268e-05, + "loss": 74.7277, + "step": 101870 + }, + { + "epoch": 0.4116080915654278, + "grad_norm": 401.41693115234375, + "learning_rate": 3.6927435152207406e-05, + "loss": 51.1262, + "step": 101880 + }, + { + "epoch": 0.41164849283079546, + "grad_norm": 756.7000122070312, + "learning_rate": 3.6924367269250644e-05, + "loss": 43.8885, + "step": 101890 + }, + { + "epoch": 0.4116888940961631, + "grad_norm": 1407.22412109375, + "learning_rate": 3.69212991538222e-05, + "loss": 51.1716, + "step": 101900 + }, + { + "epoch": 0.41172929536153074, + "grad_norm": 1306.109619140625, + "learning_rate": 3.691823080598189e-05, + "loss": 56.4997, + "step": 101910 + }, + { + "epoch": 0.41176969662689833, + "grad_norm": 890.89013671875, + "learning_rate": 3.6915162225789546e-05, + "loss": 88.45, + "step": 101920 + }, + { + "epoch": 0.41181009789226597, + "grad_norm": 195.86825561523438, + "learning_rate": 3.691209341330497e-05, + "loss": 70.7165, + "step": 101930 + }, + { + "epoch": 0.4118504991576336, + "grad_norm": 1073.44677734375, + "learning_rate": 3.690902436858801e-05, + "loss": 88.2749, + "step": 101940 + }, + { + "epoch": 0.41189090042300125, + "grad_norm": 310.966552734375, + "learning_rate": 3.690595509169848e-05, + "loss": 101.5226, + "step": 101950 + }, + { + "epoch": 0.4119313016883689, + "grad_norm": 515.8529663085938, + "learning_rate": 3.690288558269623e-05, + "loss": 55.0783, + "step": 101960 + }, + { + "epoch": 0.41197170295373653, + "grad_norm": 548.2571411132812, + "learning_rate": 3.68998158416411e-05, + "loss": 51.7741, + "step": 101970 + }, + { + "epoch": 0.41201210421910417, + "grad_norm": 712.4876708984375, + "learning_rate": 3.689674586859292e-05, + "loss": 55.8182, + "step": 101980 + }, + { + "epoch": 0.41205250548447175, + "grad_norm": 1289.3074951171875, + "learning_rate": 3.689367566361157e-05, + "loss": 83.5634, + "step": 101990 + }, + { + "epoch": 0.4120929067498394, + "grad_norm": 378.75164794921875, + "learning_rate": 3.689060522675689e-05, + "loss": 62.7789, + "step": 102000 + }, + { + "epoch": 0.41213330801520703, + "grad_norm": 788.7225341796875, + "learning_rate": 3.6887534558088727e-05, + "loss": 51.0795, + "step": 102010 + }, + { + "epoch": 0.4121737092805747, + "grad_norm": 419.6149597167969, + "learning_rate": 3.688446365766696e-05, + "loss": 72.9321, + "step": 102020 + }, + { + "epoch": 0.4122141105459423, + "grad_norm": 424.8109436035156, + "learning_rate": 3.688139252555146e-05, + "loss": 51.3233, + "step": 102030 + }, + { + "epoch": 0.41225451181130995, + "grad_norm": 340.1546630859375, + "learning_rate": 3.6878321161802104e-05, + "loss": 99.2203, + "step": 102040 + }, + { + "epoch": 0.41229491307667754, + "grad_norm": 572.083984375, + "learning_rate": 3.6875249566478745e-05, + "loss": 62.0742, + "step": 102050 + }, + { + "epoch": 0.4123353143420452, + "grad_norm": 639.38818359375, + "learning_rate": 3.687217773964129e-05, + "loss": 72.4136, + "step": 102060 + }, + { + "epoch": 0.4123757156074128, + "grad_norm": 405.4486389160156, + "learning_rate": 3.686910568134962e-05, + "loss": 57.4386, + "step": 102070 + }, + { + "epoch": 0.41241611687278046, + "grad_norm": 938.0454711914062, + "learning_rate": 3.686603339166362e-05, + "loss": 64.274, + "step": 102080 + }, + { + "epoch": 0.4124565181381481, + "grad_norm": 1430.1524658203125, + "learning_rate": 3.686296087064319e-05, + "loss": 62.3886, + "step": 102090 + }, + { + "epoch": 0.41249691940351574, + "grad_norm": 796.8814086914062, + "learning_rate": 3.685988811834823e-05, + "loss": 85.7702, + "step": 102100 + }, + { + "epoch": 0.4125373206688833, + "grad_norm": 1625.54833984375, + "learning_rate": 3.685681513483865e-05, + "loss": 86.8409, + "step": 102110 + }, + { + "epoch": 0.41257772193425096, + "grad_norm": 754.6946411132812, + "learning_rate": 3.685374192017436e-05, + "loss": 42.4442, + "step": 102120 + }, + { + "epoch": 0.4126181231996186, + "grad_norm": 599.8799438476562, + "learning_rate": 3.6850668474415255e-05, + "loss": 42.2114, + "step": 102130 + }, + { + "epoch": 0.41265852446498624, + "grad_norm": 919.1255493164062, + "learning_rate": 3.684759479762127e-05, + "loss": 66.864, + "step": 102140 + }, + { + "epoch": 0.4126989257303539, + "grad_norm": 352.7879943847656, + "learning_rate": 3.684452088985233e-05, + "loss": 46.0815, + "step": 102150 + }, + { + "epoch": 0.4127393269957215, + "grad_norm": 1374.6343994140625, + "learning_rate": 3.6841446751168355e-05, + "loss": 69.1762, + "step": 102160 + }, + { + "epoch": 0.41277972826108916, + "grad_norm": 597.5450439453125, + "learning_rate": 3.683837238162928e-05, + "loss": 85.0557, + "step": 102170 + }, + { + "epoch": 0.41282012952645675, + "grad_norm": 674.8424072265625, + "learning_rate": 3.683529778129503e-05, + "loss": 79.6359, + "step": 102180 + }, + { + "epoch": 0.4128605307918244, + "grad_norm": 624.8746337890625, + "learning_rate": 3.683222295022557e-05, + "loss": 39.7842, + "step": 102190 + }, + { + "epoch": 0.412900932057192, + "grad_norm": 1277.397705078125, + "learning_rate": 3.682914788848083e-05, + "loss": 58.8937, + "step": 102200 + }, + { + "epoch": 0.41294133332255967, + "grad_norm": 722.8589477539062, + "learning_rate": 3.682607259612076e-05, + "loss": 80.0508, + "step": 102210 + }, + { + "epoch": 0.4129817345879273, + "grad_norm": 824.6448364257812, + "learning_rate": 3.682299707320532e-05, + "loss": 53.1664, + "step": 102220 + }, + { + "epoch": 0.41302213585329495, + "grad_norm": 234.46766662597656, + "learning_rate": 3.681992131979446e-05, + "loss": 66.3412, + "step": 102230 + }, + { + "epoch": 0.41306253711866253, + "grad_norm": 1732.6822509765625, + "learning_rate": 3.681684533594815e-05, + "loss": 118.9763, + "step": 102240 + }, + { + "epoch": 0.41310293838403017, + "grad_norm": 985.6689453125, + "learning_rate": 3.681376912172636e-05, + "loss": 48.3016, + "step": 102250 + }, + { + "epoch": 0.4131433396493978, + "grad_norm": 534.2235107421875, + "learning_rate": 3.6810692677189046e-05, + "loss": 41.1004, + "step": 102260 + }, + { + "epoch": 0.41318374091476545, + "grad_norm": 0.0, + "learning_rate": 3.680761600239621e-05, + "loss": 47.6259, + "step": 102270 + }, + { + "epoch": 0.4132241421801331, + "grad_norm": 2476.089111328125, + "learning_rate": 3.680453909740782e-05, + "loss": 60.4867, + "step": 102280 + }, + { + "epoch": 0.41326454344550073, + "grad_norm": 733.741455078125, + "learning_rate": 3.680146196228386e-05, + "loss": 58.1537, + "step": 102290 + }, + { + "epoch": 0.41330494471086837, + "grad_norm": 543.4186401367188, + "learning_rate": 3.6798384597084325e-05, + "loss": 42.9503, + "step": 102300 + }, + { + "epoch": 0.41334534597623596, + "grad_norm": 535.3422241210938, + "learning_rate": 3.67953070018692e-05, + "loss": 76.8054, + "step": 102310 + }, + { + "epoch": 0.4133857472416036, + "grad_norm": 851.113037109375, + "learning_rate": 3.679222917669851e-05, + "loss": 53.6934, + "step": 102320 + }, + { + "epoch": 0.41342614850697124, + "grad_norm": 3898.60693359375, + "learning_rate": 3.6789151121632226e-05, + "loss": 58.586, + "step": 102330 + }, + { + "epoch": 0.4134665497723389, + "grad_norm": 425.6492919921875, + "learning_rate": 3.678607283673037e-05, + "loss": 61.4479, + "step": 102340 + }, + { + "epoch": 0.4135069510377065, + "grad_norm": 877.3761596679688, + "learning_rate": 3.678299432205296e-05, + "loss": 73.7707, + "step": 102350 + }, + { + "epoch": 0.41354735230307416, + "grad_norm": 763.9284057617188, + "learning_rate": 3.6779915577660015e-05, + "loss": 56.7363, + "step": 102360 + }, + { + "epoch": 0.41358775356844174, + "grad_norm": 1059.4683837890625, + "learning_rate": 3.677683660361155e-05, + "loss": 52.8749, + "step": 102370 + }, + { + "epoch": 0.4136281548338094, + "grad_norm": 408.92071533203125, + "learning_rate": 3.677375739996759e-05, + "loss": 53.0879, + "step": 102380 + }, + { + "epoch": 0.413668556099177, + "grad_norm": 529.644775390625, + "learning_rate": 3.677067796678817e-05, + "loss": 53.6878, + "step": 102390 + }, + { + "epoch": 0.41370895736454466, + "grad_norm": 888.7438354492188, + "learning_rate": 3.6767598304133324e-05, + "loss": 44.5858, + "step": 102400 + }, + { + "epoch": 0.4137493586299123, + "grad_norm": 1067.380615234375, + "learning_rate": 3.676451841206308e-05, + "loss": 42.4786, + "step": 102410 + }, + { + "epoch": 0.41378975989527994, + "grad_norm": 2619.344482421875, + "learning_rate": 3.67614382906375e-05, + "loss": 59.8869, + "step": 102420 + }, + { + "epoch": 0.4138301611606475, + "grad_norm": 730.7283325195312, + "learning_rate": 3.675835793991662e-05, + "loss": 53.2614, + "step": 102430 + }, + { + "epoch": 0.41387056242601516, + "grad_norm": 540.2338256835938, + "learning_rate": 3.67552773599605e-05, + "loss": 53.5669, + "step": 102440 + }, + { + "epoch": 0.4139109636913828, + "grad_norm": 424.4781799316406, + "learning_rate": 3.675219655082921e-05, + "loss": 40.3122, + "step": 102450 + }, + { + "epoch": 0.41395136495675044, + "grad_norm": 1885.5537109375, + "learning_rate": 3.6749115512582786e-05, + "loss": 59.4535, + "step": 102460 + }, + { + "epoch": 0.4139917662221181, + "grad_norm": 790.8214111328125, + "learning_rate": 3.674603424528131e-05, + "loss": 68.5578, + "step": 102470 + }, + { + "epoch": 0.4140321674874857, + "grad_norm": 788.549560546875, + "learning_rate": 3.674295274898485e-05, + "loss": 115.0106, + "step": 102480 + }, + { + "epoch": 0.41407256875285336, + "grad_norm": 3169.2861328125, + "learning_rate": 3.673987102375348e-05, + "loss": 78.3599, + "step": 102490 + }, + { + "epoch": 0.41411297001822095, + "grad_norm": 1180.6651611328125, + "learning_rate": 3.673678906964727e-05, + "loss": 69.2081, + "step": 102500 + }, + { + "epoch": 0.4141533712835886, + "grad_norm": 870.7091674804688, + "learning_rate": 3.673370688672632e-05, + "loss": 76.4967, + "step": 102510 + }, + { + "epoch": 0.41419377254895623, + "grad_norm": 535.1434326171875, + "learning_rate": 3.673062447505072e-05, + "loss": 37.986, + "step": 102520 + }, + { + "epoch": 0.41423417381432387, + "grad_norm": 574.4082641601562, + "learning_rate": 3.672754183468055e-05, + "loss": 43.2866, + "step": 102530 + }, + { + "epoch": 0.4142745750796915, + "grad_norm": 1545.2213134765625, + "learning_rate": 3.672445896567592e-05, + "loss": 84.1171, + "step": 102540 + }, + { + "epoch": 0.41431497634505915, + "grad_norm": 625.9170532226562, + "learning_rate": 3.6721375868096924e-05, + "loss": 44.562, + "step": 102550 + }, + { + "epoch": 0.41435537761042673, + "grad_norm": 948.0220336914062, + "learning_rate": 3.6718292542003666e-05, + "loss": 68.8121, + "step": 102560 + }, + { + "epoch": 0.4143957788757944, + "grad_norm": 400.49859619140625, + "learning_rate": 3.671520898745627e-05, + "loss": 59.8667, + "step": 102570 + }, + { + "epoch": 0.414436180141162, + "grad_norm": 329.8058776855469, + "learning_rate": 3.671212520451484e-05, + "loss": 76.8248, + "step": 102580 + }, + { + "epoch": 0.41447658140652965, + "grad_norm": 916.3284301757812, + "learning_rate": 3.670904119323949e-05, + "loss": 74.5609, + "step": 102590 + }, + { + "epoch": 0.4145169826718973, + "grad_norm": 561.3569946289062, + "learning_rate": 3.6705956953690364e-05, + "loss": 43.266, + "step": 102600 + }, + { + "epoch": 0.41455738393726493, + "grad_norm": 627.5663452148438, + "learning_rate": 3.670287248592758e-05, + "loss": 49.7998, + "step": 102610 + }, + { + "epoch": 0.4145977852026326, + "grad_norm": 919.4302368164062, + "learning_rate": 3.669978779001127e-05, + "loss": 72.0918, + "step": 102620 + }, + { + "epoch": 0.41463818646800016, + "grad_norm": 536.8106079101562, + "learning_rate": 3.669670286600157e-05, + "loss": 84.2724, + "step": 102630 + }, + { + "epoch": 0.4146785877333678, + "grad_norm": 926.4111328125, + "learning_rate": 3.6693617713958634e-05, + "loss": 49.7705, + "step": 102640 + }, + { + "epoch": 0.41471898899873544, + "grad_norm": 511.818359375, + "learning_rate": 3.66905323339426e-05, + "loss": 46.5629, + "step": 102650 + }, + { + "epoch": 0.4147593902641031, + "grad_norm": 1487.813720703125, + "learning_rate": 3.668744672601361e-05, + "loss": 55.226, + "step": 102660 + }, + { + "epoch": 0.4147997915294707, + "grad_norm": 1999.3150634765625, + "learning_rate": 3.668436089023184e-05, + "loss": 84.6968, + "step": 102670 + }, + { + "epoch": 0.41484019279483836, + "grad_norm": 443.5814208984375, + "learning_rate": 3.668127482665743e-05, + "loss": 59.7227, + "step": 102680 + }, + { + "epoch": 0.41488059406020594, + "grad_norm": 798.040283203125, + "learning_rate": 3.667818853535056e-05, + "loss": 88.5318, + "step": 102690 + }, + { + "epoch": 0.4149209953255736, + "grad_norm": 1065.25927734375, + "learning_rate": 3.667510201637139e-05, + "loss": 46.2704, + "step": 102700 + }, + { + "epoch": 0.4149613965909412, + "grad_norm": 213.02117919921875, + "learning_rate": 3.667201526978009e-05, + "loss": 72.9519, + "step": 102710 + }, + { + "epoch": 0.41500179785630886, + "grad_norm": 582.4226684570312, + "learning_rate": 3.6668928295636854e-05, + "loss": 43.031, + "step": 102720 + }, + { + "epoch": 0.4150421991216765, + "grad_norm": 550.1882934570312, + "learning_rate": 3.666584109400185e-05, + "loss": 56.0368, + "step": 102730 + }, + { + "epoch": 0.41508260038704414, + "grad_norm": 563.0902709960938, + "learning_rate": 3.666275366493526e-05, + "loss": 58.4682, + "step": 102740 + }, + { + "epoch": 0.4151230016524117, + "grad_norm": 1416.572998046875, + "learning_rate": 3.665966600849728e-05, + "loss": 48.4765, + "step": 102750 + }, + { + "epoch": 0.41516340291777937, + "grad_norm": 977.8565063476562, + "learning_rate": 3.665657812474812e-05, + "loss": 99.4884, + "step": 102760 + }, + { + "epoch": 0.415203804183147, + "grad_norm": 605.6978149414062, + "learning_rate": 3.6653490013747965e-05, + "loss": 62.5299, + "step": 102770 + }, + { + "epoch": 0.41524420544851465, + "grad_norm": 1297.1224365234375, + "learning_rate": 3.665040167555702e-05, + "loss": 53.5037, + "step": 102780 + }, + { + "epoch": 0.4152846067138823, + "grad_norm": 823.6423950195312, + "learning_rate": 3.664731311023549e-05, + "loss": 50.0008, + "step": 102790 + }, + { + "epoch": 0.4153250079792499, + "grad_norm": 611.2938842773438, + "learning_rate": 3.664422431784361e-05, + "loss": 50.3032, + "step": 102800 + }, + { + "epoch": 0.41536540924461757, + "grad_norm": 713.4339599609375, + "learning_rate": 3.6641135298441576e-05, + "loss": 65.9606, + "step": 102810 + }, + { + "epoch": 0.41540581050998515, + "grad_norm": 908.730712890625, + "learning_rate": 3.6638046052089616e-05, + "loss": 72.0269, + "step": 102820 + }, + { + "epoch": 0.4154462117753528, + "grad_norm": 714.7061767578125, + "learning_rate": 3.6634956578847954e-05, + "loss": 85.8837, + "step": 102830 + }, + { + "epoch": 0.41548661304072043, + "grad_norm": 490.82269287109375, + "learning_rate": 3.663186687877682e-05, + "loss": 57.8224, + "step": 102840 + }, + { + "epoch": 0.41552701430608807, + "grad_norm": 954.6878662109375, + "learning_rate": 3.662877695193646e-05, + "loss": 58.0162, + "step": 102850 + }, + { + "epoch": 0.4155674155714557, + "grad_norm": 220.2834930419922, + "learning_rate": 3.6625686798387106e-05, + "loss": 67.6011, + "step": 102860 + }, + { + "epoch": 0.41560781683682335, + "grad_norm": 1146.68994140625, + "learning_rate": 3.6622596418188995e-05, + "loss": 86.136, + "step": 102870 + }, + { + "epoch": 0.41564821810219094, + "grad_norm": 983.2677001953125, + "learning_rate": 3.661950581140239e-05, + "loss": 32.4779, + "step": 102880 + }, + { + "epoch": 0.4156886193675586, + "grad_norm": 694.6705322265625, + "learning_rate": 3.661641497808754e-05, + "loss": 52.4579, + "step": 102890 + }, + { + "epoch": 0.4157290206329262, + "grad_norm": 500.0966491699219, + "learning_rate": 3.66133239183047e-05, + "loss": 54.7675, + "step": 102900 + }, + { + "epoch": 0.41576942189829386, + "grad_norm": 847.8213500976562, + "learning_rate": 3.6610232632114124e-05, + "loss": 55.5015, + "step": 102910 + }, + { + "epoch": 0.4158098231636615, + "grad_norm": 1668.6177978515625, + "learning_rate": 3.6607141119576084e-05, + "loss": 98.7418, + "step": 102920 + }, + { + "epoch": 0.41585022442902914, + "grad_norm": 397.88885498046875, + "learning_rate": 3.6604049380750855e-05, + "loss": 49.9987, + "step": 102930 + }, + { + "epoch": 0.4158906256943968, + "grad_norm": 378.2391662597656, + "learning_rate": 3.660095741569871e-05, + "loss": 62.2791, + "step": 102940 + }, + { + "epoch": 0.41593102695976436, + "grad_norm": 539.7728271484375, + "learning_rate": 3.659786522447993e-05, + "loss": 57.2088, + "step": 102950 + }, + { + "epoch": 0.415971428225132, + "grad_norm": 925.6470336914062, + "learning_rate": 3.659477280715479e-05, + "loss": 54.7426, + "step": 102960 + }, + { + "epoch": 0.41601182949049964, + "grad_norm": 0.0, + "learning_rate": 3.659168016378359e-05, + "loss": 54.7942, + "step": 102970 + }, + { + "epoch": 0.4160522307558673, + "grad_norm": 1118.713623046875, + "learning_rate": 3.658858729442662e-05, + "loss": 45.2592, + "step": 102980 + }, + { + "epoch": 0.4160926320212349, + "grad_norm": 478.5465087890625, + "learning_rate": 3.658549419914417e-05, + "loss": 78.469, + "step": 102990 + }, + { + "epoch": 0.41613303328660256, + "grad_norm": 556.0774536132812, + "learning_rate": 3.6582400877996546e-05, + "loss": 56.5781, + "step": 103000 + }, + { + "epoch": 0.41617343455197015, + "grad_norm": 360.80340576171875, + "learning_rate": 3.6579307331044054e-05, + "loss": 52.6853, + "step": 103010 + }, + { + "epoch": 0.4162138358173378, + "grad_norm": 601.7576293945312, + "learning_rate": 3.657621355834701e-05, + "loss": 61.4476, + "step": 103020 + }, + { + "epoch": 0.4162542370827054, + "grad_norm": 592.1106567382812, + "learning_rate": 3.657311955996571e-05, + "loss": 51.4591, + "step": 103030 + }, + { + "epoch": 0.41629463834807306, + "grad_norm": 637.8342895507812, + "learning_rate": 3.657002533596049e-05, + "loss": 61.3845, + "step": 103040 + }, + { + "epoch": 0.4163350396134407, + "grad_norm": 371.7774963378906, + "learning_rate": 3.6566930886391674e-05, + "loss": 74.0396, + "step": 103050 + }, + { + "epoch": 0.41637544087880834, + "grad_norm": 375.3711242675781, + "learning_rate": 3.656383621131959e-05, + "loss": 56.7201, + "step": 103060 + }, + { + "epoch": 0.41641584214417593, + "grad_norm": 538.1626586914062, + "learning_rate": 3.656074131080457e-05, + "loss": 51.9634, + "step": 103070 + }, + { + "epoch": 0.41645624340954357, + "grad_norm": 1070.9267578125, + "learning_rate": 3.655764618490692e-05, + "loss": 92.4124, + "step": 103080 + }, + { + "epoch": 0.4164966446749112, + "grad_norm": 656.9863891601562, + "learning_rate": 3.655455083368703e-05, + "loss": 65.6765, + "step": 103090 + }, + { + "epoch": 0.41653704594027885, + "grad_norm": 841.900634765625, + "learning_rate": 3.655145525720522e-05, + "loss": 68.1387, + "step": 103100 + }, + { + "epoch": 0.4165774472056465, + "grad_norm": 409.5385437011719, + "learning_rate": 3.6548359455521836e-05, + "loss": 41.3072, + "step": 103110 + }, + { + "epoch": 0.41661784847101413, + "grad_norm": 598.8934936523438, + "learning_rate": 3.654526342869724e-05, + "loss": 64.1787, + "step": 103120 + }, + { + "epoch": 0.41665824973638177, + "grad_norm": 938.2333374023438, + "learning_rate": 3.654216717679179e-05, + "loss": 63.3869, + "step": 103130 + }, + { + "epoch": 0.41669865100174935, + "grad_norm": 0.0, + "learning_rate": 3.6539070699865853e-05, + "loss": 45.5103, + "step": 103140 + }, + { + "epoch": 0.416739052267117, + "grad_norm": 555.0664672851562, + "learning_rate": 3.653597399797979e-05, + "loss": 78.243, + "step": 103150 + }, + { + "epoch": 0.41677945353248463, + "grad_norm": 200.4738006591797, + "learning_rate": 3.6532877071193974e-05, + "loss": 45.6943, + "step": 103160 + }, + { + "epoch": 0.4168198547978523, + "grad_norm": 773.7369384765625, + "learning_rate": 3.652977991956878e-05, + "loss": 76.8474, + "step": 103170 + }, + { + "epoch": 0.4168602560632199, + "grad_norm": 1079.94775390625, + "learning_rate": 3.6526682543164595e-05, + "loss": 42.3459, + "step": 103180 + }, + { + "epoch": 0.41690065732858755, + "grad_norm": 919.0646362304688, + "learning_rate": 3.6523584942041794e-05, + "loss": 57.7047, + "step": 103190 + }, + { + "epoch": 0.41694105859395514, + "grad_norm": 891.546630859375, + "learning_rate": 3.6520487116260776e-05, + "loss": 73.0311, + "step": 103200 + }, + { + "epoch": 0.4169814598593228, + "grad_norm": 673.0984497070312, + "learning_rate": 3.6517389065881925e-05, + "loss": 52.7989, + "step": 103210 + }, + { + "epoch": 0.4170218611246904, + "grad_norm": 622.553955078125, + "learning_rate": 3.651429079096566e-05, + "loss": 61.8157, + "step": 103220 + }, + { + "epoch": 0.41706226239005806, + "grad_norm": 590.778564453125, + "learning_rate": 3.651119229157235e-05, + "loss": 42.1001, + "step": 103230 + }, + { + "epoch": 0.4171026636554257, + "grad_norm": 783.4470825195312, + "learning_rate": 3.650809356776242e-05, + "loss": 57.0175, + "step": 103240 + }, + { + "epoch": 0.41714306492079334, + "grad_norm": 1166.0804443359375, + "learning_rate": 3.6504994619596294e-05, + "loss": 72.4459, + "step": 103250 + }, + { + "epoch": 0.4171834661861609, + "grad_norm": 981.3162231445312, + "learning_rate": 3.650189544713437e-05, + "loss": 64.13, + "step": 103260 + }, + { + "epoch": 0.41722386745152856, + "grad_norm": 556.0223388671875, + "learning_rate": 3.649879605043707e-05, + "loss": 61.0497, + "step": 103270 + }, + { + "epoch": 0.4172642687168962, + "grad_norm": 1262.341064453125, + "learning_rate": 3.6495696429564823e-05, + "loss": 66.8657, + "step": 103280 + }, + { + "epoch": 0.41730466998226384, + "grad_norm": 322.91082763671875, + "learning_rate": 3.649259658457805e-05, + "loss": 62.4766, + "step": 103290 + }, + { + "epoch": 0.4173450712476315, + "grad_norm": 603.0479736328125, + "learning_rate": 3.6489496515537204e-05, + "loss": 65.0343, + "step": 103300 + }, + { + "epoch": 0.4173854725129991, + "grad_norm": 724.8381958007812, + "learning_rate": 3.648639622250269e-05, + "loss": 71.2941, + "step": 103310 + }, + { + "epoch": 0.41742587377836676, + "grad_norm": 709.285888671875, + "learning_rate": 3.648329570553498e-05, + "loss": 69.7722, + "step": 103320 + }, + { + "epoch": 0.41746627504373435, + "grad_norm": 1008.2940673828125, + "learning_rate": 3.648019496469451e-05, + "loss": 61.9354, + "step": 103330 + }, + { + "epoch": 0.417506676309102, + "grad_norm": 716.356689453125, + "learning_rate": 3.647709400004172e-05, + "loss": 102.8124, + "step": 103340 + }, + { + "epoch": 0.4175470775744696, + "grad_norm": 661.27880859375, + "learning_rate": 3.647399281163708e-05, + "loss": 54.8302, + "step": 103350 + }, + { + "epoch": 0.41758747883983727, + "grad_norm": 1043.45263671875, + "learning_rate": 3.647089139954104e-05, + "loss": 60.8454, + "step": 103360 + }, + { + "epoch": 0.4176278801052049, + "grad_norm": 862.45263671875, + "learning_rate": 3.646778976381407e-05, + "loss": 50.1279, + "step": 103370 + }, + { + "epoch": 0.41766828137057255, + "grad_norm": 924.0723266601562, + "learning_rate": 3.646468790451663e-05, + "loss": 41.2529, + "step": 103380 + }, + { + "epoch": 0.41770868263594013, + "grad_norm": 597.7651977539062, + "learning_rate": 3.64615858217092e-05, + "loss": 57.1287, + "step": 103390 + }, + { + "epoch": 0.41774908390130777, + "grad_norm": 379.25152587890625, + "learning_rate": 3.645848351545225e-05, + "loss": 62.9141, + "step": 103400 + }, + { + "epoch": 0.4177894851666754, + "grad_norm": 953.6447143554688, + "learning_rate": 3.645538098580627e-05, + "loss": 53.8604, + "step": 103410 + }, + { + "epoch": 0.41782988643204305, + "grad_norm": 575.6273193359375, + "learning_rate": 3.6452278232831735e-05, + "loss": 76.9551, + "step": 103420 + }, + { + "epoch": 0.4178702876974107, + "grad_norm": 1073.8568115234375, + "learning_rate": 3.644917525658914e-05, + "loss": 35.6009, + "step": 103430 + }, + { + "epoch": 0.41791068896277833, + "grad_norm": 1554.2855224609375, + "learning_rate": 3.644607205713898e-05, + "loss": 59.1008, + "step": 103440 + }, + { + "epoch": 0.41795109022814597, + "grad_norm": 619.3630981445312, + "learning_rate": 3.644296863454175e-05, + "loss": 48.0955, + "step": 103450 + }, + { + "epoch": 0.41799149149351356, + "grad_norm": 810.5908813476562, + "learning_rate": 3.643986498885796e-05, + "loss": 66.1217, + "step": 103460 + }, + { + "epoch": 0.4180318927588812, + "grad_norm": 2242.524658203125, + "learning_rate": 3.643676112014811e-05, + "loss": 79.2221, + "step": 103470 + }, + { + "epoch": 0.41807229402424884, + "grad_norm": 303.63385009765625, + "learning_rate": 3.643365702847272e-05, + "loss": 45.5608, + "step": 103480 + }, + { + "epoch": 0.4181126952896165, + "grad_norm": 1422.8160400390625, + "learning_rate": 3.643055271389229e-05, + "loss": 66.0965, + "step": 103490 + }, + { + "epoch": 0.4181530965549841, + "grad_norm": 741.1747436523438, + "learning_rate": 3.642744817646736e-05, + "loss": 45.8928, + "step": 103500 + }, + { + "epoch": 0.41819349782035176, + "grad_norm": 506.9162902832031, + "learning_rate": 3.642434341625844e-05, + "loss": 67.4611, + "step": 103510 + }, + { + "epoch": 0.41823389908571934, + "grad_norm": 592.9998168945312, + "learning_rate": 3.642123843332606e-05, + "loss": 63.5184, + "step": 103520 + }, + { + "epoch": 0.418274300351087, + "grad_norm": 303.4530334472656, + "learning_rate": 3.641813322773076e-05, + "loss": 50.6571, + "step": 103530 + }, + { + "epoch": 0.4183147016164546, + "grad_norm": 1942.93115234375, + "learning_rate": 3.641502779953307e-05, + "loss": 46.2943, + "step": 103540 + }, + { + "epoch": 0.41835510288182226, + "grad_norm": 748.298095703125, + "learning_rate": 3.6411922148793544e-05, + "loss": 54.8428, + "step": 103550 + }, + { + "epoch": 0.4183955041471899, + "grad_norm": 902.7544555664062, + "learning_rate": 3.640881627557271e-05, + "loss": 54.3055, + "step": 103560 + }, + { + "epoch": 0.41843590541255754, + "grad_norm": 1064.920654296875, + "learning_rate": 3.640571017993113e-05, + "loss": 58.3005, + "step": 103570 + }, + { + "epoch": 0.4184763066779251, + "grad_norm": 1428.6287841796875, + "learning_rate": 3.6402603861929374e-05, + "loss": 64.2658, + "step": 103580 + }, + { + "epoch": 0.41851670794329277, + "grad_norm": 503.11871337890625, + "learning_rate": 3.639949732162797e-05, + "loss": 61.6714, + "step": 103590 + }, + { + "epoch": 0.4185571092086604, + "grad_norm": 1219.691162109375, + "learning_rate": 3.639639055908751e-05, + "loss": 66.9562, + "step": 103600 + }, + { + "epoch": 0.41859751047402805, + "grad_norm": 223.49790954589844, + "learning_rate": 3.639328357436853e-05, + "loss": 55.7522, + "step": 103610 + }, + { + "epoch": 0.4186379117393957, + "grad_norm": 514.73681640625, + "learning_rate": 3.639017636753163e-05, + "loss": 71.0878, + "step": 103620 + }, + { + "epoch": 0.4186783130047633, + "grad_norm": 383.3266906738281, + "learning_rate": 3.638706893863739e-05, + "loss": 71.9139, + "step": 103630 + }, + { + "epoch": 0.41871871427013097, + "grad_norm": 539.1884765625, + "learning_rate": 3.638396128774636e-05, + "loss": 76.8819, + "step": 103640 + }, + { + "epoch": 0.41875911553549855, + "grad_norm": 1771.3558349609375, + "learning_rate": 3.6380853414919144e-05, + "loss": 65.5004, + "step": 103650 + }, + { + "epoch": 0.4187995168008662, + "grad_norm": 257.1325988769531, + "learning_rate": 3.6377745320216346e-05, + "loss": 84.9693, + "step": 103660 + }, + { + "epoch": 0.41883991806623383, + "grad_norm": 1277.81787109375, + "learning_rate": 3.6374637003698536e-05, + "loss": 59.5876, + "step": 103670 + }, + { + "epoch": 0.41888031933160147, + "grad_norm": 985.5901489257812, + "learning_rate": 3.637152846542633e-05, + "loss": 47.3202, + "step": 103680 + }, + { + "epoch": 0.4189207205969691, + "grad_norm": 943.6722412109375, + "learning_rate": 3.636841970546031e-05, + "loss": 52.7835, + "step": 103690 + }, + { + "epoch": 0.41896112186233675, + "grad_norm": 512.9838256835938, + "learning_rate": 3.63653107238611e-05, + "loss": 59.9564, + "step": 103700 + }, + { + "epoch": 0.41900152312770433, + "grad_norm": 886.2669067382812, + "learning_rate": 3.636220152068931e-05, + "loss": 69.5346, + "step": 103710 + }, + { + "epoch": 0.419041924393072, + "grad_norm": 866.3492431640625, + "learning_rate": 3.635909209600555e-05, + "loss": 68.2752, + "step": 103720 + }, + { + "epoch": 0.4190823256584396, + "grad_norm": 536.70947265625, + "learning_rate": 3.635598244987043e-05, + "loss": 56.4688, + "step": 103730 + }, + { + "epoch": 0.41912272692380725, + "grad_norm": 421.8968200683594, + "learning_rate": 3.6352872582344596e-05, + "loss": 63.9303, + "step": 103740 + }, + { + "epoch": 0.4191631281891749, + "grad_norm": 437.35546875, + "learning_rate": 3.634976249348867e-05, + "loss": 29.7742, + "step": 103750 + }, + { + "epoch": 0.41920352945454253, + "grad_norm": 1521.4986572265625, + "learning_rate": 3.634665218336328e-05, + "loss": 52.6372, + "step": 103760 + }, + { + "epoch": 0.4192439307199102, + "grad_norm": 814.3853759765625, + "learning_rate": 3.6343541652029064e-05, + "loss": 68.0177, + "step": 103770 + }, + { + "epoch": 0.41928433198527776, + "grad_norm": 495.0726318359375, + "learning_rate": 3.6340430899546656e-05, + "loss": 49.5452, + "step": 103780 + }, + { + "epoch": 0.4193247332506454, + "grad_norm": 1200.4290771484375, + "learning_rate": 3.633731992597672e-05, + "loss": 81.554, + "step": 103790 + }, + { + "epoch": 0.41936513451601304, + "grad_norm": 1105.92822265625, + "learning_rate": 3.633420873137988e-05, + "loss": 59.0107, + "step": 103800 + }, + { + "epoch": 0.4194055357813807, + "grad_norm": 965.9473266601562, + "learning_rate": 3.633109731581682e-05, + "loss": 52.3541, + "step": 103810 + }, + { + "epoch": 0.4194459370467483, + "grad_norm": 784.390625, + "learning_rate": 3.632798567934817e-05, + "loss": 42.5555, + "step": 103820 + }, + { + "epoch": 0.41948633831211596, + "grad_norm": 1471.4720458984375, + "learning_rate": 3.632487382203462e-05, + "loss": 61.9829, + "step": 103830 + }, + { + "epoch": 0.41952673957748354, + "grad_norm": 826.1594848632812, + "learning_rate": 3.632176174393682e-05, + "loss": 53.5257, + "step": 103840 + }, + { + "epoch": 0.4195671408428512, + "grad_norm": 526.9407958984375, + "learning_rate": 3.631864944511545e-05, + "loss": 58.7132, + "step": 103850 + }, + { + "epoch": 0.4196075421082188, + "grad_norm": 845.7454833984375, + "learning_rate": 3.6315536925631174e-05, + "loss": 44.2861, + "step": 103860 + }, + { + "epoch": 0.41964794337358646, + "grad_norm": 869.853515625, + "learning_rate": 3.631242418554469e-05, + "loss": 78.6071, + "step": 103870 + }, + { + "epoch": 0.4196883446389541, + "grad_norm": 858.0875244140625, + "learning_rate": 3.630931122491666e-05, + "loss": 53.3545, + "step": 103880 + }, + { + "epoch": 0.41972874590432174, + "grad_norm": 472.39337158203125, + "learning_rate": 3.6306198043807795e-05, + "loss": 44.7906, + "step": 103890 + }, + { + "epoch": 0.4197691471696893, + "grad_norm": 1292.262939453125, + "learning_rate": 3.630308464227877e-05, + "loss": 46.8033, + "step": 103900 + }, + { + "epoch": 0.41980954843505697, + "grad_norm": 667.8062133789062, + "learning_rate": 3.62999710203903e-05, + "loss": 75.6769, + "step": 103910 + }, + { + "epoch": 0.4198499497004246, + "grad_norm": 862.8394775390625, + "learning_rate": 3.629685717820307e-05, + "loss": 56.4103, + "step": 103920 + }, + { + "epoch": 0.41989035096579225, + "grad_norm": 348.3255920410156, + "learning_rate": 3.629374311577779e-05, + "loss": 44.0832, + "step": 103930 + }, + { + "epoch": 0.4199307522311599, + "grad_norm": 1450.09375, + "learning_rate": 3.629062883317519e-05, + "loss": 68.2507, + "step": 103940 + }, + { + "epoch": 0.4199711534965275, + "grad_norm": 537.00537109375, + "learning_rate": 3.628751433045596e-05, + "loss": 69.9783, + "step": 103950 + }, + { + "epoch": 0.42001155476189517, + "grad_norm": 581.5643310546875, + "learning_rate": 3.628439960768082e-05, + "loss": 63.0526, + "step": 103960 + }, + { + "epoch": 0.42005195602726275, + "grad_norm": 664.759521484375, + "learning_rate": 3.62812846649105e-05, + "loss": 56.3519, + "step": 103970 + }, + { + "epoch": 0.4200923572926304, + "grad_norm": 635.3744506835938, + "learning_rate": 3.6278169502205736e-05, + "loss": 61.1876, + "step": 103980 + }, + { + "epoch": 0.42013275855799803, + "grad_norm": 530.5980224609375, + "learning_rate": 3.627505411962724e-05, + "loss": 66.6494, + "step": 103990 + }, + { + "epoch": 0.42017315982336567, + "grad_norm": 758.6454467773438, + "learning_rate": 3.627193851723577e-05, + "loss": 58.0647, + "step": 104000 + }, + { + "epoch": 0.4202135610887333, + "grad_norm": 2163.636474609375, + "learning_rate": 3.6268822695092056e-05, + "loss": 46.5954, + "step": 104010 + }, + { + "epoch": 0.42025396235410095, + "grad_norm": 1194.97119140625, + "learning_rate": 3.626570665325684e-05, + "loss": 61.3154, + "step": 104020 + }, + { + "epoch": 0.42029436361946854, + "grad_norm": 1891.653076171875, + "learning_rate": 3.626259039179086e-05, + "loss": 59.1667, + "step": 104030 + }, + { + "epoch": 0.4203347648848362, + "grad_norm": 609.3426513671875, + "learning_rate": 3.6259473910754904e-05, + "loss": 41.8143, + "step": 104040 + }, + { + "epoch": 0.4203751661502038, + "grad_norm": 1182.2841796875, + "learning_rate": 3.625635721020969e-05, + "loss": 64.0979, + "step": 104050 + }, + { + "epoch": 0.42041556741557146, + "grad_norm": 960.9483642578125, + "learning_rate": 3.6253240290216e-05, + "loss": 49.4696, + "step": 104060 + }, + { + "epoch": 0.4204559686809391, + "grad_norm": 292.2507629394531, + "learning_rate": 3.62501231508346e-05, + "loss": 71.0735, + "step": 104070 + }, + { + "epoch": 0.42049636994630674, + "grad_norm": 782.7526245117188, + "learning_rate": 3.624700579212626e-05, + "loss": 55.9743, + "step": 104080 + }, + { + "epoch": 0.4205367712116744, + "grad_norm": 1657.7484130859375, + "learning_rate": 3.624388821415175e-05, + "loss": 35.3792, + "step": 104090 + }, + { + "epoch": 0.42057717247704196, + "grad_norm": 462.6631774902344, + "learning_rate": 3.624077041697185e-05, + "loss": 53.0793, + "step": 104100 + }, + { + "epoch": 0.4206175737424096, + "grad_norm": 606.4874267578125, + "learning_rate": 3.6237652400647345e-05, + "loss": 45.0725, + "step": 104110 + }, + { + "epoch": 0.42065797500777724, + "grad_norm": 842.7166137695312, + "learning_rate": 3.623453416523902e-05, + "loss": 67.8834, + "step": 104120 + }, + { + "epoch": 0.4206983762731449, + "grad_norm": 249.5082550048828, + "learning_rate": 3.623141571080766e-05, + "loss": 37.8796, + "step": 104130 + }, + { + "epoch": 0.4207387775385125, + "grad_norm": 343.2146911621094, + "learning_rate": 3.6228297037414074e-05, + "loss": 57.456, + "step": 104140 + }, + { + "epoch": 0.42077917880388016, + "grad_norm": 3797.084228515625, + "learning_rate": 3.622517814511906e-05, + "loss": 60.7318, + "step": 104150 + }, + { + "epoch": 0.42081958006924775, + "grad_norm": 617.2786254882812, + "learning_rate": 3.622205903398342e-05, + "loss": 59.6277, + "step": 104160 + }, + { + "epoch": 0.4208599813346154, + "grad_norm": 836.6347045898438, + "learning_rate": 3.6218939704067955e-05, + "loss": 69.2782, + "step": 104170 + }, + { + "epoch": 0.420900382599983, + "grad_norm": 988.6712646484375, + "learning_rate": 3.621582015543348e-05, + "loss": 71.7698, + "step": 104180 + }, + { + "epoch": 0.42094078386535067, + "grad_norm": 377.2164001464844, + "learning_rate": 3.621270038814083e-05, + "loss": 58.6837, + "step": 104190 + }, + { + "epoch": 0.4209811851307183, + "grad_norm": 498.7535095214844, + "learning_rate": 3.6209580402250815e-05, + "loss": 38.0926, + "step": 104200 + }, + { + "epoch": 0.42102158639608595, + "grad_norm": 403.9892883300781, + "learning_rate": 3.620646019782425e-05, + "loss": 51.1487, + "step": 104210 + }, + { + "epoch": 0.42106198766145353, + "grad_norm": 1094.6707763671875, + "learning_rate": 3.6203339774921976e-05, + "loss": 49.0772, + "step": 104220 + }, + { + "epoch": 0.42110238892682117, + "grad_norm": 1082.783935546875, + "learning_rate": 3.6200219133604816e-05, + "loss": 71.491, + "step": 104230 + }, + { + "epoch": 0.4211427901921888, + "grad_norm": 443.6391296386719, + "learning_rate": 3.6197098273933634e-05, + "loss": 57.1322, + "step": 104240 + }, + { + "epoch": 0.42118319145755645, + "grad_norm": 431.85430908203125, + "learning_rate": 3.619397719596924e-05, + "loss": 51.8768, + "step": 104250 + }, + { + "epoch": 0.4212235927229241, + "grad_norm": 2122.891845703125, + "learning_rate": 3.619085589977251e-05, + "loss": 62.3512, + "step": 104260 + }, + { + "epoch": 0.42126399398829173, + "grad_norm": 1347.9971923828125, + "learning_rate": 3.618773438540428e-05, + "loss": 68.3303, + "step": 104270 + }, + { + "epoch": 0.42130439525365937, + "grad_norm": 583.928955078125, + "learning_rate": 3.618461265292541e-05, + "loss": 52.5091, + "step": 104280 + }, + { + "epoch": 0.42134479651902695, + "grad_norm": 523.10546875, + "learning_rate": 3.618149070239676e-05, + "loss": 55.4588, + "step": 104290 + }, + { + "epoch": 0.4213851977843946, + "grad_norm": 891.4425659179688, + "learning_rate": 3.617836853387918e-05, + "loss": 56.3773, + "step": 104300 + }, + { + "epoch": 0.42142559904976223, + "grad_norm": 534.82470703125, + "learning_rate": 3.6175246147433563e-05, + "loss": 64.5376, + "step": 104310 + }, + { + "epoch": 0.4214660003151299, + "grad_norm": 529.0205688476562, + "learning_rate": 3.617212354312076e-05, + "loss": 50.4187, + "step": 104320 + }, + { + "epoch": 0.4215064015804975, + "grad_norm": 303.3257141113281, + "learning_rate": 3.616900072100166e-05, + "loss": 42.9221, + "step": 104330 + }, + { + "epoch": 0.42154680284586515, + "grad_norm": 1264.615478515625, + "learning_rate": 3.6165877681137136e-05, + "loss": 72.0312, + "step": 104340 + }, + { + "epoch": 0.42158720411123274, + "grad_norm": 1191.180908203125, + "learning_rate": 3.6162754423588085e-05, + "loss": 57.8124, + "step": 104350 + }, + { + "epoch": 0.4216276053766004, + "grad_norm": 366.5780944824219, + "learning_rate": 3.61596309484154e-05, + "loss": 56.5291, + "step": 104360 + }, + { + "epoch": 0.421668006641968, + "grad_norm": 512.8042602539062, + "learning_rate": 3.615650725567995e-05, + "loss": 57.7641, + "step": 104370 + }, + { + "epoch": 0.42170840790733566, + "grad_norm": 782.4150390625, + "learning_rate": 3.615338334544265e-05, + "loss": 41.4072, + "step": 104380 + }, + { + "epoch": 0.4217488091727033, + "grad_norm": 1751.27587890625, + "learning_rate": 3.615025921776439e-05, + "loss": 49.6701, + "step": 104390 + }, + { + "epoch": 0.42178921043807094, + "grad_norm": 1079.94189453125, + "learning_rate": 3.614713487270611e-05, + "loss": 46.7276, + "step": 104400 + }, + { + "epoch": 0.4218296117034386, + "grad_norm": 1060.4190673828125, + "learning_rate": 3.614401031032867e-05, + "loss": 106.2965, + "step": 104410 + }, + { + "epoch": 0.42187001296880616, + "grad_norm": 749.7593994140625, + "learning_rate": 3.614088553069303e-05, + "loss": 44.3926, + "step": 104420 + }, + { + "epoch": 0.4219104142341738, + "grad_norm": 628.2740478515625, + "learning_rate": 3.6137760533860074e-05, + "loss": 88.6748, + "step": 104430 + }, + { + "epoch": 0.42195081549954144, + "grad_norm": 461.367431640625, + "learning_rate": 3.613463531989076e-05, + "loss": 49.6176, + "step": 104440 + }, + { + "epoch": 0.4219912167649091, + "grad_norm": 873.3343505859375, + "learning_rate": 3.613150988884599e-05, + "loss": 61.126, + "step": 104450 + }, + { + "epoch": 0.4220316180302767, + "grad_norm": 393.72869873046875, + "learning_rate": 3.612838424078671e-05, + "loss": 38.0402, + "step": 104460 + }, + { + "epoch": 0.42207201929564436, + "grad_norm": 850.967041015625, + "learning_rate": 3.612525837577384e-05, + "loss": 66.1431, + "step": 104470 + }, + { + "epoch": 0.42211242056101195, + "grad_norm": 1090.09814453125, + "learning_rate": 3.6122132293868335e-05, + "loss": 82.8322, + "step": 104480 + }, + { + "epoch": 0.4221528218263796, + "grad_norm": 956.2141723632812, + "learning_rate": 3.611900599513114e-05, + "loss": 57.5093, + "step": 104490 + }, + { + "epoch": 0.4221932230917472, + "grad_norm": 407.1945495605469, + "learning_rate": 3.611587947962319e-05, + "loss": 50.0537, + "step": 104500 + }, + { + "epoch": 0.42223362435711487, + "grad_norm": 486.4323425292969, + "learning_rate": 3.6112752747405447e-05, + "loss": 52.2107, + "step": 104510 + }, + { + "epoch": 0.4222740256224825, + "grad_norm": 864.6810913085938, + "learning_rate": 3.6109625798538873e-05, + "loss": 50.7976, + "step": 104520 + }, + { + "epoch": 0.42231442688785015, + "grad_norm": 534.1397705078125, + "learning_rate": 3.6106498633084424e-05, + "loss": 48.8516, + "step": 104530 + }, + { + "epoch": 0.42235482815321773, + "grad_norm": 636.9712524414062, + "learning_rate": 3.610337125110307e-05, + "loss": 52.8733, + "step": 104540 + }, + { + "epoch": 0.42239522941858537, + "grad_norm": 1049.5821533203125, + "learning_rate": 3.610024365265577e-05, + "loss": 56.6238, + "step": 104550 + }, + { + "epoch": 0.422435630683953, + "grad_norm": 374.7794189453125, + "learning_rate": 3.6097115837803505e-05, + "loss": 73.0223, + "step": 104560 + }, + { + "epoch": 0.42247603194932065, + "grad_norm": 321.7105712890625, + "learning_rate": 3.609398780660726e-05, + "loss": 51.7502, + "step": 104570 + }, + { + "epoch": 0.4225164332146883, + "grad_norm": 833.6190185546875, + "learning_rate": 3.6090859559128e-05, + "loss": 44.2709, + "step": 104580 + }, + { + "epoch": 0.42255683448005593, + "grad_norm": 465.5598449707031, + "learning_rate": 3.6087731095426733e-05, + "loss": 54.5164, + "step": 104590 + }, + { + "epoch": 0.42259723574542357, + "grad_norm": 460.9774169921875, + "learning_rate": 3.608460241556443e-05, + "loss": 59.3148, + "step": 104600 + }, + { + "epoch": 0.42263763701079116, + "grad_norm": 390.5638427734375, + "learning_rate": 3.6081473519602105e-05, + "loss": 70.8456, + "step": 104610 + }, + { + "epoch": 0.4226780382761588, + "grad_norm": 1563.58642578125, + "learning_rate": 3.607834440760074e-05, + "loss": 57.9671, + "step": 104620 + }, + { + "epoch": 0.42271843954152644, + "grad_norm": 1309.08935546875, + "learning_rate": 3.607521507962136e-05, + "loss": 63.8607, + "step": 104630 + }, + { + "epoch": 0.4227588408068941, + "grad_norm": 2113.58447265625, + "learning_rate": 3.6072085535724956e-05, + "loss": 57.3463, + "step": 104640 + }, + { + "epoch": 0.4227992420722617, + "grad_norm": 1004.9712524414062, + "learning_rate": 3.606895577597255e-05, + "loss": 67.326, + "step": 104650 + }, + { + "epoch": 0.42283964333762936, + "grad_norm": 0.0, + "learning_rate": 3.606582580042513e-05, + "loss": 33.3827, + "step": 104660 + }, + { + "epoch": 0.42288004460299694, + "grad_norm": 594.0809936523438, + "learning_rate": 3.606269560914376e-05, + "loss": 48.728, + "step": 104670 + }, + { + "epoch": 0.4229204458683646, + "grad_norm": 891.5621948242188, + "learning_rate": 3.6059565202189435e-05, + "loss": 63.9774, + "step": 104680 + }, + { + "epoch": 0.4229608471337322, + "grad_norm": 622.961669921875, + "learning_rate": 3.605643457962319e-05, + "loss": 53.7502, + "step": 104690 + }, + { + "epoch": 0.42300124839909986, + "grad_norm": 899.34716796875, + "learning_rate": 3.605330374150607e-05, + "loss": 62.4448, + "step": 104700 + }, + { + "epoch": 0.4230416496644675, + "grad_norm": 698.8828735351562, + "learning_rate": 3.60501726878991e-05, + "loss": 72.48, + "step": 104710 + }, + { + "epoch": 0.42308205092983514, + "grad_norm": 591.2288208007812, + "learning_rate": 3.604704141886332e-05, + "loss": 50.3654, + "step": 104720 + }, + { + "epoch": 0.4231224521952028, + "grad_norm": 1411.56640625, + "learning_rate": 3.6043909934459785e-05, + "loss": 68.7279, + "step": 104730 + }, + { + "epoch": 0.42316285346057037, + "grad_norm": 3371.13134765625, + "learning_rate": 3.604077823474954e-05, + "loss": 72.1287, + "step": 104740 + }, + { + "epoch": 0.423203254725938, + "grad_norm": 0.0, + "learning_rate": 3.603764631979363e-05, + "loss": 61.998, + "step": 104750 + }, + { + "epoch": 0.42324365599130565, + "grad_norm": 704.408935546875, + "learning_rate": 3.603451418965313e-05, + "loss": 27.3794, + "step": 104760 + }, + { + "epoch": 0.4232840572566733, + "grad_norm": 973.9273681640625, + "learning_rate": 3.60313818443891e-05, + "loss": 87.7188, + "step": 104770 + }, + { + "epoch": 0.4233244585220409, + "grad_norm": 908.014404296875, + "learning_rate": 3.602824928406259e-05, + "loss": 51.2298, + "step": 104780 + }, + { + "epoch": 0.42336485978740857, + "grad_norm": 693.9036254882812, + "learning_rate": 3.602511650873469e-05, + "loss": 97.5675, + "step": 104790 + }, + { + "epoch": 0.42340526105277615, + "grad_norm": 1070.6943359375, + "learning_rate": 3.602198351846647e-05, + "loss": 70.0561, + "step": 104800 + }, + { + "epoch": 0.4234456623181438, + "grad_norm": 970.8385620117188, + "learning_rate": 3.6018850313319e-05, + "loss": 71.5289, + "step": 104810 + }, + { + "epoch": 0.42348606358351143, + "grad_norm": 1016.22021484375, + "learning_rate": 3.6015716893353376e-05, + "loss": 53.878, + "step": 104820 + }, + { + "epoch": 0.42352646484887907, + "grad_norm": 532.7132568359375, + "learning_rate": 3.601258325863067e-05, + "loss": 62.0343, + "step": 104830 + }, + { + "epoch": 0.4235668661142467, + "grad_norm": 94.07963562011719, + "learning_rate": 3.600944940921199e-05, + "loss": 67.1461, + "step": 104840 + }, + { + "epoch": 0.42360726737961435, + "grad_norm": 921.68017578125, + "learning_rate": 3.6006315345158434e-05, + "loss": 65.9748, + "step": 104850 + }, + { + "epoch": 0.42364766864498193, + "grad_norm": 1337.1324462890625, + "learning_rate": 3.600318106653108e-05, + "loss": 60.1088, + "step": 104860 + }, + { + "epoch": 0.4236880699103496, + "grad_norm": 522.1008911132812, + "learning_rate": 3.600004657339105e-05, + "loss": 55.0299, + "step": 104870 + }, + { + "epoch": 0.4237284711757172, + "grad_norm": 452.1679992675781, + "learning_rate": 3.5996911865799454e-05, + "loss": 42.4841, + "step": 104880 + }, + { + "epoch": 0.42376887244108485, + "grad_norm": 712.0603637695312, + "learning_rate": 3.59937769438174e-05, + "loss": 57.7705, + "step": 104890 + }, + { + "epoch": 0.4238092737064525, + "grad_norm": 486.09613037109375, + "learning_rate": 3.5990641807506e-05, + "loss": 39.141, + "step": 104900 + }, + { + "epoch": 0.42384967497182013, + "grad_norm": 1278.7674560546875, + "learning_rate": 3.598750645692638e-05, + "loss": 59.3348, + "step": 104910 + }, + { + "epoch": 0.4238900762371878, + "grad_norm": 1631.982666015625, + "learning_rate": 3.5984370892139666e-05, + "loss": 68.6455, + "step": 104920 + }, + { + "epoch": 0.42393047750255536, + "grad_norm": 466.52874755859375, + "learning_rate": 3.598123511320699e-05, + "loss": 37.1398, + "step": 104930 + }, + { + "epoch": 0.423970878767923, + "grad_norm": 1008.3381958007812, + "learning_rate": 3.597809912018947e-05, + "loss": 65.8413, + "step": 104940 + }, + { + "epoch": 0.42401128003329064, + "grad_norm": 738.3510131835938, + "learning_rate": 3.597496291314827e-05, + "loss": 67.1094, + "step": 104950 + }, + { + "epoch": 0.4240516812986583, + "grad_norm": 337.72796630859375, + "learning_rate": 3.5971826492144504e-05, + "loss": 63.2226, + "step": 104960 + }, + { + "epoch": 0.4240920825640259, + "grad_norm": 295.6142272949219, + "learning_rate": 3.5968689857239345e-05, + "loss": 65.1638, + "step": 104970 + }, + { + "epoch": 0.42413248382939356, + "grad_norm": 797.707275390625, + "learning_rate": 3.596555300849392e-05, + "loss": 77.6298, + "step": 104980 + }, + { + "epoch": 0.42417288509476114, + "grad_norm": 476.190185546875, + "learning_rate": 3.5962415945969405e-05, + "loss": 69.4243, + "step": 104990 + }, + { + "epoch": 0.4242132863601288, + "grad_norm": 412.16644287109375, + "learning_rate": 3.5959278669726935e-05, + "loss": 40.4091, + "step": 105000 + }, + { + "epoch": 0.4242536876254964, + "grad_norm": 565.7656860351562, + "learning_rate": 3.595614117982769e-05, + "loss": 68.7301, + "step": 105010 + }, + { + "epoch": 0.42429408889086406, + "grad_norm": 733.7787475585938, + "learning_rate": 3.5953003476332835e-05, + "loss": 54.2594, + "step": 105020 + }, + { + "epoch": 0.4243344901562317, + "grad_norm": 842.4686279296875, + "learning_rate": 3.5949865559303536e-05, + "loss": 65.4532, + "step": 105030 + }, + { + "epoch": 0.42437489142159934, + "grad_norm": 1525.9725341796875, + "learning_rate": 3.594672742880097e-05, + "loss": 51.1515, + "step": 105040 + }, + { + "epoch": 0.424415292686967, + "grad_norm": 1515.8177490234375, + "learning_rate": 3.594358908488632e-05, + "loss": 42.8702, + "step": 105050 + }, + { + "epoch": 0.42445569395233457, + "grad_norm": 922.44091796875, + "learning_rate": 3.594045052762076e-05, + "loss": 57.4078, + "step": 105060 + }, + { + "epoch": 0.4244960952177022, + "grad_norm": 1049.3338623046875, + "learning_rate": 3.5937311757065494e-05, + "loss": 55.3803, + "step": 105070 + }, + { + "epoch": 0.42453649648306985, + "grad_norm": 563.2704467773438, + "learning_rate": 3.5934172773281696e-05, + "loss": 28.5142, + "step": 105080 + }, + { + "epoch": 0.4245768977484375, + "grad_norm": 655.0813598632812, + "learning_rate": 3.593103357633058e-05, + "loss": 41.6843, + "step": 105090 + }, + { + "epoch": 0.4246172990138051, + "grad_norm": 684.439453125, + "learning_rate": 3.592789416627332e-05, + "loss": 50.5486, + "step": 105100 + }, + { + "epoch": 0.42465770027917277, + "grad_norm": 765.9920654296875, + "learning_rate": 3.592475454317115e-05, + "loss": 68.0954, + "step": 105110 + }, + { + "epoch": 0.42469810154454035, + "grad_norm": 1379.385009765625, + "learning_rate": 3.592161470708526e-05, + "loss": 65.1052, + "step": 105120 + }, + { + "epoch": 0.424738502809908, + "grad_norm": 296.92437744140625, + "learning_rate": 3.591847465807687e-05, + "loss": 46.8323, + "step": 105130 + }, + { + "epoch": 0.42477890407527563, + "grad_norm": 554.3091430664062, + "learning_rate": 3.59153343962072e-05, + "loss": 54.8523, + "step": 105140 + }, + { + "epoch": 0.42481930534064327, + "grad_norm": 529.84521484375, + "learning_rate": 3.5912193921537476e-05, + "loss": 44.9619, + "step": 105150 + }, + { + "epoch": 0.4248597066060109, + "grad_norm": 571.56591796875, + "learning_rate": 3.5909053234128895e-05, + "loss": 44.5782, + "step": 105160 + }, + { + "epoch": 0.42490010787137855, + "grad_norm": 598.4688110351562, + "learning_rate": 3.590591233404271e-05, + "loss": 75.1092, + "step": 105170 + }, + { + "epoch": 0.42494050913674614, + "grad_norm": 1925.43798828125, + "learning_rate": 3.590277122134015e-05, + "loss": 52.7716, + "step": 105180 + }, + { + "epoch": 0.4249809104021138, + "grad_norm": 435.9007263183594, + "learning_rate": 3.5899629896082454e-05, + "loss": 35.1308, + "step": 105190 + }, + { + "epoch": 0.4250213116674814, + "grad_norm": 492.9730224609375, + "learning_rate": 3.5896488358330856e-05, + "loss": 47.9135, + "step": 105200 + }, + { + "epoch": 0.42506171293284906, + "grad_norm": 1603.56591796875, + "learning_rate": 3.5893346608146607e-05, + "loss": 62.7425, + "step": 105210 + }, + { + "epoch": 0.4251021141982167, + "grad_norm": 715.2278442382812, + "learning_rate": 3.5890204645590964e-05, + "loss": 127.7718, + "step": 105220 + }, + { + "epoch": 0.42514251546358434, + "grad_norm": 572.968017578125, + "learning_rate": 3.588706247072518e-05, + "loss": 65.1566, + "step": 105230 + }, + { + "epoch": 0.425182916728952, + "grad_norm": 859.6871948242188, + "learning_rate": 3.588392008361049e-05, + "loss": 49.4484, + "step": 105240 + }, + { + "epoch": 0.42522331799431956, + "grad_norm": 1073.6341552734375, + "learning_rate": 3.588077748430819e-05, + "loss": 65.2147, + "step": 105250 + }, + { + "epoch": 0.4252637192596872, + "grad_norm": 802.9885864257812, + "learning_rate": 3.587763467287953e-05, + "loss": 44.3761, + "step": 105260 + }, + { + "epoch": 0.42530412052505484, + "grad_norm": 1480.44921875, + "learning_rate": 3.587449164938578e-05, + "loss": 53.1393, + "step": 105270 + }, + { + "epoch": 0.4253445217904225, + "grad_norm": 544.9711303710938, + "learning_rate": 3.5871348413888204e-05, + "loss": 64.2403, + "step": 105280 + }, + { + "epoch": 0.4253849230557901, + "grad_norm": 1001.657958984375, + "learning_rate": 3.586820496644811e-05, + "loss": 61.8277, + "step": 105290 + }, + { + "epoch": 0.42542532432115776, + "grad_norm": 1063.820068359375, + "learning_rate": 3.586506130712676e-05, + "loss": 47.9627, + "step": 105300 + }, + { + "epoch": 0.42546572558652535, + "grad_norm": 1031.9720458984375, + "learning_rate": 3.5861917435985445e-05, + "loss": 58.5947, + "step": 105310 + }, + { + "epoch": 0.425506126851893, + "grad_norm": 376.46630859375, + "learning_rate": 3.585877335308546e-05, + "loss": 62.4376, + "step": 105320 + }, + { + "epoch": 0.4255465281172606, + "grad_norm": 1131.775390625, + "learning_rate": 3.5855629058488095e-05, + "loss": 71.483, + "step": 105330 + }, + { + "epoch": 0.42558692938262827, + "grad_norm": 687.2857666015625, + "learning_rate": 3.585248455225466e-05, + "loss": 69.5148, + "step": 105340 + }, + { + "epoch": 0.4256273306479959, + "grad_norm": 473.1686706542969, + "learning_rate": 3.584933983444644e-05, + "loss": 89.6963, + "step": 105350 + }, + { + "epoch": 0.42566773191336355, + "grad_norm": 742.19873046875, + "learning_rate": 3.5846194905124757e-05, + "loss": 67.2487, + "step": 105360 + }, + { + "epoch": 0.4257081331787312, + "grad_norm": 369.0582275390625, + "learning_rate": 3.584304976435092e-05, + "loss": 84.1498, + "step": 105370 + }, + { + "epoch": 0.42574853444409877, + "grad_norm": 409.1021728515625, + "learning_rate": 3.5839904412186256e-05, + "loss": 63.912, + "step": 105380 + }, + { + "epoch": 0.4257889357094664, + "grad_norm": 777.0023193359375, + "learning_rate": 3.583675884869206e-05, + "loss": 52.3509, + "step": 105390 + }, + { + "epoch": 0.42582933697483405, + "grad_norm": 345.2575988769531, + "learning_rate": 3.5833613073929684e-05, + "loss": 37.5077, + "step": 105400 + }, + { + "epoch": 0.4258697382402017, + "grad_norm": 1329.019287109375, + "learning_rate": 3.583046708796043e-05, + "loss": 46.7334, + "step": 105410 + }, + { + "epoch": 0.42591013950556933, + "grad_norm": 574.7626953125, + "learning_rate": 3.582732089084566e-05, + "loss": 49.3083, + "step": 105420 + }, + { + "epoch": 0.42595054077093697, + "grad_norm": 0.0, + "learning_rate": 3.582417448264669e-05, + "loss": 53.3416, + "step": 105430 + }, + { + "epoch": 0.42599094203630455, + "grad_norm": 1714.5047607421875, + "learning_rate": 3.582102786342485e-05, + "loss": 67.8292, + "step": 105440 + }, + { + "epoch": 0.4260313433016722, + "grad_norm": 1370.195068359375, + "learning_rate": 3.581788103324152e-05, + "loss": 58.7996, + "step": 105450 + }, + { + "epoch": 0.42607174456703983, + "grad_norm": 1272.4110107421875, + "learning_rate": 3.581473399215802e-05, + "loss": 70.6126, + "step": 105460 + }, + { + "epoch": 0.4261121458324075, + "grad_norm": 662.2067260742188, + "learning_rate": 3.581158674023572e-05, + "loss": 51.143, + "step": 105470 + }, + { + "epoch": 0.4261525470977751, + "grad_norm": 803.9454956054688, + "learning_rate": 3.5808439277535964e-05, + "loss": 38.5995, + "step": 105480 + }, + { + "epoch": 0.42619294836314275, + "grad_norm": 507.4480285644531, + "learning_rate": 3.580529160412013e-05, + "loss": 56.6335, + "step": 105490 + }, + { + "epoch": 0.42623334962851034, + "grad_norm": 673.6394653320312, + "learning_rate": 3.580214372004956e-05, + "loss": 71.4622, + "step": 105500 + }, + { + "epoch": 0.426273750893878, + "grad_norm": 506.5180969238281, + "learning_rate": 3.579899562538564e-05, + "loss": 34.8514, + "step": 105510 + }, + { + "epoch": 0.4263141521592456, + "grad_norm": 912.122802734375, + "learning_rate": 3.5795847320189746e-05, + "loss": 57.6717, + "step": 105520 + }, + { + "epoch": 0.42635455342461326, + "grad_norm": 888.0779418945312, + "learning_rate": 3.5792698804523245e-05, + "loss": 55.5145, + "step": 105530 + }, + { + "epoch": 0.4263949546899809, + "grad_norm": 670.391845703125, + "learning_rate": 3.5789550078447526e-05, + "loss": 78.2021, + "step": 105540 + }, + { + "epoch": 0.42643535595534854, + "grad_norm": 418.4927673339844, + "learning_rate": 3.5786401142023975e-05, + "loss": 73.2309, + "step": 105550 + }, + { + "epoch": 0.4264757572207162, + "grad_norm": 579.839599609375, + "learning_rate": 3.5783251995313985e-05, + "loss": 49.0248, + "step": 105560 + }, + { + "epoch": 0.42651615848608376, + "grad_norm": 1221.287841796875, + "learning_rate": 3.5780102638378936e-05, + "loss": 48.0422, + "step": 105570 + }, + { + "epoch": 0.4265565597514514, + "grad_norm": 576.9136962890625, + "learning_rate": 3.577695307128024e-05, + "loss": 48.1065, + "step": 105580 + }, + { + "epoch": 0.42659696101681904, + "grad_norm": 682.1599731445312, + "learning_rate": 3.57738032940793e-05, + "loss": 80.5689, + "step": 105590 + }, + { + "epoch": 0.4266373622821867, + "grad_norm": 952.7507934570312, + "learning_rate": 3.577065330683751e-05, + "loss": 59.2839, + "step": 105600 + }, + { + "epoch": 0.4266777635475543, + "grad_norm": 755.3192749023438, + "learning_rate": 3.5767503109616296e-05, + "loss": 63.4179, + "step": 105610 + }, + { + "epoch": 0.42671816481292196, + "grad_norm": 528.3074340820312, + "learning_rate": 3.576435270247706e-05, + "loss": 49.0063, + "step": 105620 + }, + { + "epoch": 0.42675856607828955, + "grad_norm": 1181.072509765625, + "learning_rate": 3.5761202085481235e-05, + "loss": 53.1849, + "step": 105630 + }, + { + "epoch": 0.4267989673436572, + "grad_norm": 825.5591430664062, + "learning_rate": 3.575805125869022e-05, + "loss": 54.1307, + "step": 105640 + }, + { + "epoch": 0.4268393686090248, + "grad_norm": 1657.726806640625, + "learning_rate": 3.5754900222165465e-05, + "loss": 84.8316, + "step": 105650 + }, + { + "epoch": 0.42687976987439247, + "grad_norm": 707.2351684570312, + "learning_rate": 3.5751748975968394e-05, + "loss": 71.4824, + "step": 105660 + }, + { + "epoch": 0.4269201711397601, + "grad_norm": 514.521484375, + "learning_rate": 3.574859752016045e-05, + "loss": 60.5219, + "step": 105670 + }, + { + "epoch": 0.42696057240512775, + "grad_norm": 661.9301147460938, + "learning_rate": 3.574544585480305e-05, + "loss": 45.8945, + "step": 105680 + }, + { + "epoch": 0.4270009736704954, + "grad_norm": 416.9234924316406, + "learning_rate": 3.574229397995765e-05, + "loss": 63.908, + "step": 105690 + }, + { + "epoch": 0.42704137493586297, + "grad_norm": 835.8658447265625, + "learning_rate": 3.573914189568571e-05, + "loss": 77.679, + "step": 105700 + }, + { + "epoch": 0.4270817762012306, + "grad_norm": 1938.5263671875, + "learning_rate": 3.5735989602048665e-05, + "loss": 62.1259, + "step": 105710 + }, + { + "epoch": 0.42712217746659825, + "grad_norm": 825.2261962890625, + "learning_rate": 3.573283709910798e-05, + "loss": 50.9862, + "step": 105720 + }, + { + "epoch": 0.4271625787319659, + "grad_norm": 1108.54345703125, + "learning_rate": 3.572968438692509e-05, + "loss": 75.7806, + "step": 105730 + }, + { + "epoch": 0.42720297999733353, + "grad_norm": 338.0368347167969, + "learning_rate": 3.5726531465561504e-05, + "loss": 40.806, + "step": 105740 + }, + { + "epoch": 0.42724338126270117, + "grad_norm": 876.4752197265625, + "learning_rate": 3.572337833507865e-05, + "loss": 63.9452, + "step": 105750 + }, + { + "epoch": 0.42728378252806876, + "grad_norm": 429.4320983886719, + "learning_rate": 3.572022499553802e-05, + "loss": 79.8431, + "step": 105760 + }, + { + "epoch": 0.4273241837934364, + "grad_norm": 749.8831787109375, + "learning_rate": 3.5717071447001083e-05, + "loss": 54.6913, + "step": 105770 + }, + { + "epoch": 0.42736458505880404, + "grad_norm": 902.8178100585938, + "learning_rate": 3.571391768952932e-05, + "loss": 45.6217, + "step": 105780 + }, + { + "epoch": 0.4274049863241717, + "grad_norm": 1094.285400390625, + "learning_rate": 3.571076372318422e-05, + "loss": 76.6032, + "step": 105790 + }, + { + "epoch": 0.4274453875895393, + "grad_norm": 1502.871337890625, + "learning_rate": 3.570760954802726e-05, + "loss": 80.5316, + "step": 105800 + }, + { + "epoch": 0.42748578885490696, + "grad_norm": 572.2935180664062, + "learning_rate": 3.5704455164119945e-05, + "loss": 64.48, + "step": 105810 + }, + { + "epoch": 0.42752619012027454, + "grad_norm": 742.1455688476562, + "learning_rate": 3.5701300571523755e-05, + "loss": 82.0077, + "step": 105820 + }, + { + "epoch": 0.4275665913856422, + "grad_norm": 520.0400390625, + "learning_rate": 3.569814577030022e-05, + "loss": 74.5518, + "step": 105830 + }, + { + "epoch": 0.4276069926510098, + "grad_norm": 967.4630126953125, + "learning_rate": 3.569499076051081e-05, + "loss": 51.6548, + "step": 105840 + }, + { + "epoch": 0.42764739391637746, + "grad_norm": 1097.524658203125, + "learning_rate": 3.5691835542217054e-05, + "loss": 66.9995, + "step": 105850 + }, + { + "epoch": 0.4276877951817451, + "grad_norm": 1097.96533203125, + "learning_rate": 3.5688680115480455e-05, + "loss": 55.4627, + "step": 105860 + }, + { + "epoch": 0.42772819644711274, + "grad_norm": 936.7929077148438, + "learning_rate": 3.5685524480362543e-05, + "loss": 79.1348, + "step": 105870 + }, + { + "epoch": 0.4277685977124804, + "grad_norm": 427.792724609375, + "learning_rate": 3.568236863692482e-05, + "loss": 57.1244, + "step": 105880 + }, + { + "epoch": 0.42780899897784797, + "grad_norm": 430.7917785644531, + "learning_rate": 3.567921258522883e-05, + "loss": 45.7751, + "step": 105890 + }, + { + "epoch": 0.4278494002432156, + "grad_norm": 1268.4412841796875, + "learning_rate": 3.567605632533608e-05, + "loss": 51.2699, + "step": 105900 + }, + { + "epoch": 0.42788980150858325, + "grad_norm": 156.56045532226562, + "learning_rate": 3.5672899857308134e-05, + "loss": 58.0844, + "step": 105910 + }, + { + "epoch": 0.4279302027739509, + "grad_norm": 829.1176147460938, + "learning_rate": 3.56697431812065e-05, + "loss": 55.2921, + "step": 105920 + }, + { + "epoch": 0.4279706040393185, + "grad_norm": 610.2461547851562, + "learning_rate": 3.566658629709273e-05, + "loss": 50.3534, + "step": 105930 + }, + { + "epoch": 0.42801100530468617, + "grad_norm": 1424.289306640625, + "learning_rate": 3.566342920502837e-05, + "loss": 63.0704, + "step": 105940 + }, + { + "epoch": 0.42805140657005375, + "grad_norm": 3496.262939453125, + "learning_rate": 3.5660271905074974e-05, + "loss": 59.1756, + "step": 105950 + }, + { + "epoch": 0.4280918078354214, + "grad_norm": 413.6312561035156, + "learning_rate": 3.565711439729408e-05, + "loss": 55.2588, + "step": 105960 + }, + { + "epoch": 0.42813220910078903, + "grad_norm": 632.3339233398438, + "learning_rate": 3.565395668174725e-05, + "loss": 69.6535, + "step": 105970 + }, + { + "epoch": 0.42817261036615667, + "grad_norm": 1112.0599365234375, + "learning_rate": 3.565079875849605e-05, + "loss": 51.4274, + "step": 105980 + }, + { + "epoch": 0.4282130116315243, + "grad_norm": 127.4210433959961, + "learning_rate": 3.564764062760205e-05, + "loss": 64.7097, + "step": 105990 + }, + { + "epoch": 0.42825341289689195, + "grad_norm": 1343.2515869140625, + "learning_rate": 3.564448228912682e-05, + "loss": 50.9704, + "step": 106000 + }, + { + "epoch": 0.4282938141622596, + "grad_norm": 794.98779296875, + "learning_rate": 3.564132374313192e-05, + "loss": 38.4331, + "step": 106010 + }, + { + "epoch": 0.4283342154276272, + "grad_norm": 195.28814697265625, + "learning_rate": 3.5638164989678935e-05, + "loss": 61.8604, + "step": 106020 + }, + { + "epoch": 0.4283746166929948, + "grad_norm": 1086.4222412109375, + "learning_rate": 3.563500602882945e-05, + "loss": 65.9661, + "step": 106030 + }, + { + "epoch": 0.42841501795836245, + "grad_norm": 737.5653076171875, + "learning_rate": 3.5631846860645044e-05, + "loss": 55.4186, + "step": 106040 + }, + { + "epoch": 0.4284554192237301, + "grad_norm": 864.1533203125, + "learning_rate": 3.562868748518732e-05, + "loss": 82.7763, + "step": 106050 + }, + { + "epoch": 0.42849582048909773, + "grad_norm": 1645.748046875, + "learning_rate": 3.562552790251785e-05, + "loss": 70.2854, + "step": 106060 + }, + { + "epoch": 0.4285362217544654, + "grad_norm": 754.142578125, + "learning_rate": 3.562236811269824e-05, + "loss": 55.0108, + "step": 106070 + }, + { + "epoch": 0.42857662301983296, + "grad_norm": 408.4161376953125, + "learning_rate": 3.56192081157901e-05, + "loss": 69.7249, + "step": 106080 + }, + { + "epoch": 0.4286170242852006, + "grad_norm": 308.8442077636719, + "learning_rate": 3.561604791185503e-05, + "loss": 54.3556, + "step": 106090 + }, + { + "epoch": 0.42865742555056824, + "grad_norm": 1148.19677734375, + "learning_rate": 3.561288750095465e-05, + "loss": 57.7566, + "step": 106100 + }, + { + "epoch": 0.4286978268159359, + "grad_norm": 589.8562622070312, + "learning_rate": 3.560972688315055e-05, + "loss": 65.197, + "step": 106110 + }, + { + "epoch": 0.4287382280813035, + "grad_norm": 1979.934326171875, + "learning_rate": 3.5606566058504375e-05, + "loss": 67.4663, + "step": 106120 + }, + { + "epoch": 0.42877862934667116, + "grad_norm": 638.1552124023438, + "learning_rate": 3.560340502707773e-05, + "loss": 40.0643, + "step": 106130 + }, + { + "epoch": 0.42881903061203874, + "grad_norm": 1344.94580078125, + "learning_rate": 3.560024378893224e-05, + "loss": 71.7906, + "step": 106140 + }, + { + "epoch": 0.4288594318774064, + "grad_norm": 541.87353515625, + "learning_rate": 3.559708234412954e-05, + "loss": 89.126, + "step": 106150 + }, + { + "epoch": 0.428899833142774, + "grad_norm": 1129.01708984375, + "learning_rate": 3.559392069273127e-05, + "loss": 74.1695, + "step": 106160 + }, + { + "epoch": 0.42894023440814166, + "grad_norm": 741.240234375, + "learning_rate": 3.559075883479906e-05, + "loss": 77.3889, + "step": 106170 + }, + { + "epoch": 0.4289806356735093, + "grad_norm": 373.0411682128906, + "learning_rate": 3.558759677039455e-05, + "loss": 85.0204, + "step": 106180 + }, + { + "epoch": 0.42902103693887694, + "grad_norm": 705.1157836914062, + "learning_rate": 3.558443449957939e-05, + "loss": 77.1509, + "step": 106190 + }, + { + "epoch": 0.4290614382042446, + "grad_norm": 565.6890258789062, + "learning_rate": 3.5581272022415244e-05, + "loss": 62.9252, + "step": 106200 + }, + { + "epoch": 0.42910183946961217, + "grad_norm": 2649.929443359375, + "learning_rate": 3.5578109338963736e-05, + "loss": 79.2853, + "step": 106210 + }, + { + "epoch": 0.4291422407349798, + "grad_norm": 645.3712768554688, + "learning_rate": 3.557494644928654e-05, + "loss": 52.6527, + "step": 106220 + }, + { + "epoch": 0.42918264200034745, + "grad_norm": 446.864013671875, + "learning_rate": 3.5571783353445325e-05, + "loss": 54.5988, + "step": 106230 + }, + { + "epoch": 0.4292230432657151, + "grad_norm": 1004.2807006835938, + "learning_rate": 3.5568620051501756e-05, + "loss": 67.0789, + "step": 106240 + }, + { + "epoch": 0.4292634445310827, + "grad_norm": 611.4927368164062, + "learning_rate": 3.556545654351749e-05, + "loss": 38.8004, + "step": 106250 + }, + { + "epoch": 0.42930384579645037, + "grad_norm": 868.6104125976562, + "learning_rate": 3.556229282955421e-05, + "loss": 44.7674, + "step": 106260 + }, + { + "epoch": 0.42934424706181795, + "grad_norm": 1725.8865966796875, + "learning_rate": 3.5559128909673595e-05, + "loss": 71.4952, + "step": 106270 + }, + { + "epoch": 0.4293846483271856, + "grad_norm": 654.6998291015625, + "learning_rate": 3.555596478393733e-05, + "loss": 48.7473, + "step": 106280 + }, + { + "epoch": 0.42942504959255323, + "grad_norm": 1723.461669921875, + "learning_rate": 3.555280045240709e-05, + "loss": 62.5021, + "step": 106290 + }, + { + "epoch": 0.42946545085792087, + "grad_norm": 989.3924560546875, + "learning_rate": 3.554963591514457e-05, + "loss": 64.8296, + "step": 106300 + }, + { + "epoch": 0.4295058521232885, + "grad_norm": 767.1257934570312, + "learning_rate": 3.554647117221147e-05, + "loss": 58.58, + "step": 106310 + }, + { + "epoch": 0.42954625338865615, + "grad_norm": 958.0810546875, + "learning_rate": 3.554330622366949e-05, + "loss": 82.886, + "step": 106320 + }, + { + "epoch": 0.42958665465402374, + "grad_norm": 1372.3026123046875, + "learning_rate": 3.554014106958032e-05, + "loss": 91.1392, + "step": 106330 + }, + { + "epoch": 0.4296270559193914, + "grad_norm": 765.9713745117188, + "learning_rate": 3.5536975710005677e-05, + "loss": 43.6938, + "step": 106340 + }, + { + "epoch": 0.429667457184759, + "grad_norm": 2553.733642578125, + "learning_rate": 3.553381014500727e-05, + "loss": 54.1921, + "step": 106350 + }, + { + "epoch": 0.42970785845012666, + "grad_norm": 1080.475341796875, + "learning_rate": 3.5530644374646815e-05, + "loss": 62.0937, + "step": 106360 + }, + { + "epoch": 0.4297482597154943, + "grad_norm": 2564.04638671875, + "learning_rate": 3.5527478398986015e-05, + "loss": 63.5857, + "step": 106370 + }, + { + "epoch": 0.42978866098086194, + "grad_norm": 299.32989501953125, + "learning_rate": 3.552431221808661e-05, + "loss": 44.5124, + "step": 106380 + }, + { + "epoch": 0.4298290622462296, + "grad_norm": 631.6810302734375, + "learning_rate": 3.5521145832010314e-05, + "loss": 49.5127, + "step": 106390 + }, + { + "epoch": 0.42986946351159716, + "grad_norm": 1121.8922119140625, + "learning_rate": 3.551797924081887e-05, + "loss": 58.7079, + "step": 106400 + }, + { + "epoch": 0.4299098647769648, + "grad_norm": 657.3575439453125, + "learning_rate": 3.5514812444574004e-05, + "loss": 50.7894, + "step": 106410 + }, + { + "epoch": 0.42995026604233244, + "grad_norm": 662.6183471679688, + "learning_rate": 3.551164544333745e-05, + "loss": 79.3807, + "step": 106420 + }, + { + "epoch": 0.4299906673077001, + "grad_norm": 593.0071411132812, + "learning_rate": 3.550847823717096e-05, + "loss": 65.9797, + "step": 106430 + }, + { + "epoch": 0.4300310685730677, + "grad_norm": 755.3797607421875, + "learning_rate": 3.5505310826136286e-05, + "loss": 51.0179, + "step": 106440 + }, + { + "epoch": 0.43007146983843536, + "grad_norm": 1839.5137939453125, + "learning_rate": 3.5502143210295165e-05, + "loss": 62.1264, + "step": 106450 + }, + { + "epoch": 0.43011187110380295, + "grad_norm": 722.4067993164062, + "learning_rate": 3.549897538970934e-05, + "loss": 63.9707, + "step": 106460 + }, + { + "epoch": 0.4301522723691706, + "grad_norm": 465.209716796875, + "learning_rate": 3.54958073644406e-05, + "loss": 51.9835, + "step": 106470 + }, + { + "epoch": 0.4301926736345382, + "grad_norm": 348.4991760253906, + "learning_rate": 3.5492639134550695e-05, + "loss": 84.2273, + "step": 106480 + }, + { + "epoch": 0.43023307489990587, + "grad_norm": 1531.664794921875, + "learning_rate": 3.548947070010138e-05, + "loss": 81.6072, + "step": 106490 + }, + { + "epoch": 0.4302734761652735, + "grad_norm": 797.4585571289062, + "learning_rate": 3.548630206115443e-05, + "loss": 56.6595, + "step": 106500 + }, + { + "epoch": 0.43031387743064115, + "grad_norm": 412.25360107421875, + "learning_rate": 3.5483133217771625e-05, + "loss": 57.9216, + "step": 106510 + }, + { + "epoch": 0.4303542786960088, + "grad_norm": 647.94921875, + "learning_rate": 3.5479964170014746e-05, + "loss": 99.7013, + "step": 106520 + }, + { + "epoch": 0.43039467996137637, + "grad_norm": 649.0404052734375, + "learning_rate": 3.547679491794557e-05, + "loss": 46.1641, + "step": 106530 + }, + { + "epoch": 0.430435081226744, + "grad_norm": 434.8841552734375, + "learning_rate": 3.547362546162588e-05, + "loss": 61.1318, + "step": 106540 + }, + { + "epoch": 0.43047548249211165, + "grad_norm": 733.9017333984375, + "learning_rate": 3.547045580111746e-05, + "loss": 49.06, + "step": 106550 + }, + { + "epoch": 0.4305158837574793, + "grad_norm": 721.9252319335938, + "learning_rate": 3.546728593648213e-05, + "loss": 61.6053, + "step": 106560 + }, + { + "epoch": 0.43055628502284693, + "grad_norm": 553.4278564453125, + "learning_rate": 3.546411586778167e-05, + "loss": 59.98, + "step": 106570 + }, + { + "epoch": 0.43059668628821457, + "grad_norm": 410.0889892578125, + "learning_rate": 3.546094559507787e-05, + "loss": 44.391, + "step": 106580 + }, + { + "epoch": 0.43063708755358215, + "grad_norm": 429.70361328125, + "learning_rate": 3.5457775118432556e-05, + "loss": 90.1694, + "step": 106590 + }, + { + "epoch": 0.4306774888189498, + "grad_norm": 899.5925903320312, + "learning_rate": 3.545460443790753e-05, + "loss": 79.3271, + "step": 106600 + }, + { + "epoch": 0.43071789008431743, + "grad_norm": 1410.063232421875, + "learning_rate": 3.545143355356462e-05, + "loss": 75.9909, + "step": 106610 + }, + { + "epoch": 0.4307582913496851, + "grad_norm": 1006.2098999023438, + "learning_rate": 3.544826246546563e-05, + "loss": 61.4801, + "step": 106620 + }, + { + "epoch": 0.4307986926150527, + "grad_norm": 618.147216796875, + "learning_rate": 3.544509117367238e-05, + "loss": 56.4724, + "step": 106630 + }, + { + "epoch": 0.43083909388042035, + "grad_norm": 866.3955078125, + "learning_rate": 3.544191967824669e-05, + "loss": 47.1831, + "step": 106640 + }, + { + "epoch": 0.43087949514578794, + "grad_norm": 802.1903686523438, + "learning_rate": 3.543874797925042e-05, + "loss": 62.7335, + "step": 106650 + }, + { + "epoch": 0.4309198964111556, + "grad_norm": 722.4039306640625, + "learning_rate": 3.543557607674537e-05, + "loss": 57.5358, + "step": 106660 + }, + { + "epoch": 0.4309602976765232, + "grad_norm": 467.41937255859375, + "learning_rate": 3.543240397079339e-05, + "loss": 42.7253, + "step": 106670 + }, + { + "epoch": 0.43100069894189086, + "grad_norm": 421.191162109375, + "learning_rate": 3.542923166145633e-05, + "loss": 60.0862, + "step": 106680 + }, + { + "epoch": 0.4310411002072585, + "grad_norm": 797.8417358398438, + "learning_rate": 3.542605914879603e-05, + "loss": 69.1785, + "step": 106690 + }, + { + "epoch": 0.43108150147262614, + "grad_norm": 1393.1776123046875, + "learning_rate": 3.542288643287434e-05, + "loss": 42.2223, + "step": 106700 + }, + { + "epoch": 0.4311219027379938, + "grad_norm": 699.2664184570312, + "learning_rate": 3.5419713513753114e-05, + "loss": 71.7247, + "step": 106710 + }, + { + "epoch": 0.43116230400336136, + "grad_norm": 1001.225830078125, + "learning_rate": 3.54165403914942e-05, + "loss": 54.1351, + "step": 106720 + }, + { + "epoch": 0.431202705268729, + "grad_norm": 587.420166015625, + "learning_rate": 3.541336706615947e-05, + "loss": 62.8683, + "step": 106730 + }, + { + "epoch": 0.43124310653409664, + "grad_norm": 481.1246643066406, + "learning_rate": 3.541019353781079e-05, + "loss": 48.2253, + "step": 106740 + }, + { + "epoch": 0.4312835077994643, + "grad_norm": 694.5806274414062, + "learning_rate": 3.540701980651003e-05, + "loss": 50.7718, + "step": 106750 + }, + { + "epoch": 0.4313239090648319, + "grad_norm": 1048.5947265625, + "learning_rate": 3.540384587231906e-05, + "loss": 78.4464, + "step": 106760 + }, + { + "epoch": 0.43136431033019956, + "grad_norm": 372.85406494140625, + "learning_rate": 3.540067173529976e-05, + "loss": 48.4107, + "step": 106770 + }, + { + "epoch": 0.43140471159556715, + "grad_norm": 2308.911865234375, + "learning_rate": 3.5397497395514004e-05, + "loss": 73.4827, + "step": 106780 + }, + { + "epoch": 0.4314451128609348, + "grad_norm": 183.2915802001953, + "learning_rate": 3.5394322853023694e-05, + "loss": 35.4445, + "step": 106790 + }, + { + "epoch": 0.4314855141263024, + "grad_norm": 4619.44140625, + "learning_rate": 3.53911481078907e-05, + "loss": 50.2226, + "step": 106800 + }, + { + "epoch": 0.43152591539167007, + "grad_norm": 873.1148071289062, + "learning_rate": 3.5387973160176926e-05, + "loss": 51.0404, + "step": 106810 + }, + { + "epoch": 0.4315663166570377, + "grad_norm": 480.3205871582031, + "learning_rate": 3.538479800994426e-05, + "loss": 41.806, + "step": 106820 + }, + { + "epoch": 0.43160671792240535, + "grad_norm": 560.2127685546875, + "learning_rate": 3.538162265725462e-05, + "loss": 34.7096, + "step": 106830 + }, + { + "epoch": 0.431647119187773, + "grad_norm": 800.5224609375, + "learning_rate": 3.5378447102169895e-05, + "loss": 83.1514, + "step": 106840 + }, + { + "epoch": 0.43168752045314057, + "grad_norm": 403.2615661621094, + "learning_rate": 3.537527134475201e-05, + "loss": 81.7792, + "step": 106850 + }, + { + "epoch": 0.4317279217185082, + "grad_norm": 613.8062744140625, + "learning_rate": 3.537209538506286e-05, + "loss": 51.7972, + "step": 106860 + }, + { + "epoch": 0.43176832298387585, + "grad_norm": 794.0146484375, + "learning_rate": 3.5368919223164374e-05, + "loss": 78.0091, + "step": 106870 + }, + { + "epoch": 0.4318087242492435, + "grad_norm": 877.7191162109375, + "learning_rate": 3.536574285911847e-05, + "loss": 47.9676, + "step": 106880 + }, + { + "epoch": 0.43184912551461113, + "grad_norm": 1015.1839599609375, + "learning_rate": 3.5362566292987076e-05, + "loss": 56.4783, + "step": 106890 + }, + { + "epoch": 0.43188952677997877, + "grad_norm": 519.8533935546875, + "learning_rate": 3.535938952483211e-05, + "loss": 53.7836, + "step": 106900 + }, + { + "epoch": 0.43192992804534636, + "grad_norm": 356.93109130859375, + "learning_rate": 3.5356212554715506e-05, + "loss": 63.2428, + "step": 106910 + }, + { + "epoch": 0.431970329310714, + "grad_norm": 694.5162963867188, + "learning_rate": 3.535303538269922e-05, + "loss": 33.9129, + "step": 106920 + }, + { + "epoch": 0.43201073057608164, + "grad_norm": 796.4173583984375, + "learning_rate": 3.534985800884517e-05, + "loss": 58.3042, + "step": 106930 + }, + { + "epoch": 0.4320511318414493, + "grad_norm": 0.0, + "learning_rate": 3.5346680433215316e-05, + "loss": 38.0315, + "step": 106940 + }, + { + "epoch": 0.4320915331068169, + "grad_norm": 570.5447387695312, + "learning_rate": 3.5343502655871594e-05, + "loss": 58.1119, + "step": 106950 + }, + { + "epoch": 0.43213193437218456, + "grad_norm": 2234.41162109375, + "learning_rate": 3.534032467687597e-05, + "loss": 50.3924, + "step": 106960 + }, + { + "epoch": 0.43217233563755214, + "grad_norm": 470.1455993652344, + "learning_rate": 3.533714649629039e-05, + "loss": 51.558, + "step": 106970 + }, + { + "epoch": 0.4322127369029198, + "grad_norm": 641.6006469726562, + "learning_rate": 3.533396811417682e-05, + "loss": 46.6653, + "step": 106980 + }, + { + "epoch": 0.4322531381682874, + "grad_norm": 1771.727294921875, + "learning_rate": 3.533078953059721e-05, + "loss": 42.6348, + "step": 106990 + }, + { + "epoch": 0.43229353943365506, + "grad_norm": 885.59765625, + "learning_rate": 3.532761074561355e-05, + "loss": 58.1762, + "step": 107000 + }, + { + "epoch": 0.4323339406990227, + "grad_norm": 618.5126953125, + "learning_rate": 3.5324431759287796e-05, + "loss": 43.6578, + "step": 107010 + }, + { + "epoch": 0.43237434196439034, + "grad_norm": 740.6704711914062, + "learning_rate": 3.532125257168193e-05, + "loss": 49.2011, + "step": 107020 + }, + { + "epoch": 0.432414743229758, + "grad_norm": 713.0264892578125, + "learning_rate": 3.531807318285793e-05, + "loss": 67.6269, + "step": 107030 + }, + { + "epoch": 0.43245514449512557, + "grad_norm": 793.6551513671875, + "learning_rate": 3.531489359287779e-05, + "loss": 78.4188, + "step": 107040 + }, + { + "epoch": 0.4324955457604932, + "grad_norm": 1205.0350341796875, + "learning_rate": 3.531171380180348e-05, + "loss": 77.0156, + "step": 107050 + }, + { + "epoch": 0.43253594702586085, + "grad_norm": 1664.7984619140625, + "learning_rate": 3.530853380969701e-05, + "loss": 51.596, + "step": 107060 + }, + { + "epoch": 0.4325763482912285, + "grad_norm": 353.55523681640625, + "learning_rate": 3.5305353616620355e-05, + "loss": 36.4841, + "step": 107070 + }, + { + "epoch": 0.4326167495565961, + "grad_norm": 607.7645263671875, + "learning_rate": 3.5302173222635524e-05, + "loss": 56.0603, + "step": 107080 + }, + { + "epoch": 0.43265715082196377, + "grad_norm": 688.8688354492188, + "learning_rate": 3.529899262780453e-05, + "loss": 60.969, + "step": 107090 + }, + { + "epoch": 0.43269755208733135, + "grad_norm": 431.1716003417969, + "learning_rate": 3.529581183218937e-05, + "loss": 43.1526, + "step": 107100 + }, + { + "epoch": 0.432737953352699, + "grad_norm": 1323.6634521484375, + "learning_rate": 3.529263083585206e-05, + "loss": 64.5691, + "step": 107110 + }, + { + "epoch": 0.43277835461806663, + "grad_norm": 650.7156372070312, + "learning_rate": 3.528944963885461e-05, + "loss": 24.5781, + "step": 107120 + }, + { + "epoch": 0.43281875588343427, + "grad_norm": 1326.6907958984375, + "learning_rate": 3.528626824125905e-05, + "loss": 55.9819, + "step": 107130 + }, + { + "epoch": 0.4328591571488019, + "grad_norm": 727.043701171875, + "learning_rate": 3.528308664312739e-05, + "loss": 44.3257, + "step": 107140 + }, + { + "epoch": 0.43289955841416955, + "grad_norm": 719.6072998046875, + "learning_rate": 3.527990484452166e-05, + "loss": 46.9241, + "step": 107150 + }, + { + "epoch": 0.4329399596795372, + "grad_norm": 1634.4859619140625, + "learning_rate": 3.527672284550389e-05, + "loss": 87.1335, + "step": 107160 + }, + { + "epoch": 0.4329803609449048, + "grad_norm": 281.7586975097656, + "learning_rate": 3.527354064613612e-05, + "loss": 70.6858, + "step": 107170 + }, + { + "epoch": 0.4330207622102724, + "grad_norm": 1065.198974609375, + "learning_rate": 3.5270358246480386e-05, + "loss": 79.6831, + "step": 107180 + }, + { + "epoch": 0.43306116347564005, + "grad_norm": 263.0101623535156, + "learning_rate": 3.526717564659873e-05, + "loss": 61.8488, + "step": 107190 + }, + { + "epoch": 0.4331015647410077, + "grad_norm": 918.0792236328125, + "learning_rate": 3.52639928465532e-05, + "loss": 56.7985, + "step": 107200 + }, + { + "epoch": 0.43314196600637533, + "grad_norm": 454.3590393066406, + "learning_rate": 3.526080984640585e-05, + "loss": 59.4921, + "step": 107210 + }, + { + "epoch": 0.433182367271743, + "grad_norm": 332.873291015625, + "learning_rate": 3.525762664621872e-05, + "loss": 34.6081, + "step": 107220 + }, + { + "epoch": 0.43322276853711056, + "grad_norm": 1424.8919677734375, + "learning_rate": 3.5254443246053886e-05, + "loss": 54.6382, + "step": 107230 + }, + { + "epoch": 0.4332631698024782, + "grad_norm": 1493.1998291015625, + "learning_rate": 3.5251259645973394e-05, + "loss": 77.3529, + "step": 107240 + }, + { + "epoch": 0.43330357106784584, + "grad_norm": 310.91943359375, + "learning_rate": 3.524807584603932e-05, + "loss": 58.498, + "step": 107250 + }, + { + "epoch": 0.4333439723332135, + "grad_norm": 713.1577758789062, + "learning_rate": 3.5244891846313736e-05, + "loss": 48.7548, + "step": 107260 + }, + { + "epoch": 0.4333843735985811, + "grad_norm": 746.4721069335938, + "learning_rate": 3.5241707646858703e-05, + "loss": 67.363, + "step": 107270 + }, + { + "epoch": 0.43342477486394876, + "grad_norm": 1737.3177490234375, + "learning_rate": 3.523852324773631e-05, + "loss": 58.3859, + "step": 107280 + }, + { + "epoch": 0.43346517612931634, + "grad_norm": 240.0654754638672, + "learning_rate": 3.523533864900863e-05, + "loss": 89.2563, + "step": 107290 + }, + { + "epoch": 0.433505577394684, + "grad_norm": 697.5162353515625, + "learning_rate": 3.523215385073777e-05, + "loss": 66.2658, + "step": 107300 + }, + { + "epoch": 0.4335459786600516, + "grad_norm": 527.6311645507812, + "learning_rate": 3.52289688529858e-05, + "loss": 39.8144, + "step": 107310 + }, + { + "epoch": 0.43358637992541926, + "grad_norm": 1482.4761962890625, + "learning_rate": 3.5225783655814796e-05, + "loss": 56.7785, + "step": 107320 + }, + { + "epoch": 0.4336267811907869, + "grad_norm": 1850.587646484375, + "learning_rate": 3.522259825928689e-05, + "loss": 69.2113, + "step": 107330 + }, + { + "epoch": 0.43366718245615454, + "grad_norm": 821.7341918945312, + "learning_rate": 3.5219412663464167e-05, + "loss": 85.9736, + "step": 107340 + }, + { + "epoch": 0.4337075837215222, + "grad_norm": 1092.7030029296875, + "learning_rate": 3.521622686840873e-05, + "loss": 74.6831, + "step": 107350 + }, + { + "epoch": 0.43374798498688977, + "grad_norm": 633.818115234375, + "learning_rate": 3.521304087418269e-05, + "loss": 64.548, + "step": 107360 + }, + { + "epoch": 0.4337883862522574, + "grad_norm": 994.1685791015625, + "learning_rate": 3.520985468084816e-05, + "loss": 31.1137, + "step": 107370 + }, + { + "epoch": 0.43382878751762505, + "grad_norm": 887.774169921875, + "learning_rate": 3.520666828846726e-05, + "loss": 52.128, + "step": 107380 + }, + { + "epoch": 0.4338691887829927, + "grad_norm": 478.0501708984375, + "learning_rate": 3.52034816971021e-05, + "loss": 60.6667, + "step": 107390 + }, + { + "epoch": 0.4339095900483603, + "grad_norm": 1626.81884765625, + "learning_rate": 3.5200294906814824e-05, + "loss": 108.8241, + "step": 107400 + }, + { + "epoch": 0.43394999131372797, + "grad_norm": 482.78741455078125, + "learning_rate": 3.519710791766754e-05, + "loss": 57.1852, + "step": 107410 + }, + { + "epoch": 0.43399039257909555, + "grad_norm": 536.291259765625, + "learning_rate": 3.5193920729722384e-05, + "loss": 55.354, + "step": 107420 + }, + { + "epoch": 0.4340307938444632, + "grad_norm": 395.05181884765625, + "learning_rate": 3.51907333430415e-05, + "loss": 51.6174, + "step": 107430 + }, + { + "epoch": 0.43407119510983083, + "grad_norm": 504.0698547363281, + "learning_rate": 3.5187545757687015e-05, + "loss": 48.5033, + "step": 107440 + }, + { + "epoch": 0.43411159637519847, + "grad_norm": 938.1822509765625, + "learning_rate": 3.518435797372109e-05, + "loss": 77.9516, + "step": 107450 + }, + { + "epoch": 0.4341519976405661, + "grad_norm": 542.053955078125, + "learning_rate": 3.5181169991205866e-05, + "loss": 50.1242, + "step": 107460 + }, + { + "epoch": 0.43419239890593375, + "grad_norm": 945.6251831054688, + "learning_rate": 3.517798181020348e-05, + "loss": 82.4342, + "step": 107470 + }, + { + "epoch": 0.4342328001713014, + "grad_norm": 841.2509765625, + "learning_rate": 3.517479343077611e-05, + "loss": 48.6936, + "step": 107480 + }, + { + "epoch": 0.434273201436669, + "grad_norm": 1135.741943359375, + "learning_rate": 3.517160485298589e-05, + "loss": 57.5404, + "step": 107490 + }, + { + "epoch": 0.4343136027020366, + "grad_norm": 634.0276489257812, + "learning_rate": 3.516841607689501e-05, + "loss": 57.2011, + "step": 107500 + }, + { + "epoch": 0.43435400396740426, + "grad_norm": 684.9427490234375, + "learning_rate": 3.516522710256562e-05, + "loss": 63.644, + "step": 107510 + }, + { + "epoch": 0.4343944052327719, + "grad_norm": 707.3285522460938, + "learning_rate": 3.516203793005989e-05, + "loss": 44.6688, + "step": 107520 + }, + { + "epoch": 0.43443480649813954, + "grad_norm": 765.8327026367188, + "learning_rate": 3.515884855944e-05, + "loss": 46.0717, + "step": 107530 + }, + { + "epoch": 0.4344752077635072, + "grad_norm": 708.8120727539062, + "learning_rate": 3.515565899076813e-05, + "loss": 47.5356, + "step": 107540 + }, + { + "epoch": 0.43451560902887476, + "grad_norm": 779.5784912109375, + "learning_rate": 3.5152469224106454e-05, + "loss": 50.02, + "step": 107550 + }, + { + "epoch": 0.4345560102942424, + "grad_norm": 638.3084106445312, + "learning_rate": 3.514927925951717e-05, + "loss": 69.4422, + "step": 107560 + }, + { + "epoch": 0.43459641155961004, + "grad_norm": 0.0, + "learning_rate": 3.5146089097062456e-05, + "loss": 48.1131, + "step": 107570 + }, + { + "epoch": 0.4346368128249777, + "grad_norm": 613.4315185546875, + "learning_rate": 3.514289873680451e-05, + "loss": 39.7531, + "step": 107580 + }, + { + "epoch": 0.4346772140903453, + "grad_norm": 596.2418212890625, + "learning_rate": 3.513970817880554e-05, + "loss": 46.1025, + "step": 107590 + }, + { + "epoch": 0.43471761535571296, + "grad_norm": 742.696044921875, + "learning_rate": 3.513651742312774e-05, + "loss": 50.7197, + "step": 107600 + }, + { + "epoch": 0.43475801662108055, + "grad_norm": 517.7498168945312, + "learning_rate": 3.51333264698333e-05, + "loss": 72.2126, + "step": 107610 + }, + { + "epoch": 0.4347984178864482, + "grad_norm": 336.8522644042969, + "learning_rate": 3.5130135318984456e-05, + "loss": 97.5587, + "step": 107620 + }, + { + "epoch": 0.4348388191518158, + "grad_norm": 1882.3516845703125, + "learning_rate": 3.512694397064341e-05, + "loss": 46.2149, + "step": 107630 + }, + { + "epoch": 0.43487922041718347, + "grad_norm": 852.6275024414062, + "learning_rate": 3.512375242487236e-05, + "loss": 59.0864, + "step": 107640 + }, + { + "epoch": 0.4349196216825511, + "grad_norm": 1158.6456298828125, + "learning_rate": 3.512056068173356e-05, + "loss": 86.1055, + "step": 107650 + }, + { + "epoch": 0.43496002294791875, + "grad_norm": 1050.2406005859375, + "learning_rate": 3.511736874128922e-05, + "loss": 47.8806, + "step": 107660 + }, + { + "epoch": 0.4350004242132864, + "grad_norm": 576.0054931640625, + "learning_rate": 3.5114176603601564e-05, + "loss": 44.2818, + "step": 107670 + }, + { + "epoch": 0.43504082547865397, + "grad_norm": 3012.95849609375, + "learning_rate": 3.511098426873283e-05, + "loss": 75.0666, + "step": 107680 + }, + { + "epoch": 0.4350812267440216, + "grad_norm": 1107.3419189453125, + "learning_rate": 3.5107791736745244e-05, + "loss": 47.5885, + "step": 107690 + }, + { + "epoch": 0.43512162800938925, + "grad_norm": 486.3786926269531, + "learning_rate": 3.5104599007701054e-05, + "loss": 52.94, + "step": 107700 + }, + { + "epoch": 0.4351620292747569, + "grad_norm": 1440.493896484375, + "learning_rate": 3.510140608166251e-05, + "loss": 44.7074, + "step": 107710 + }, + { + "epoch": 0.43520243054012453, + "grad_norm": 665.9118041992188, + "learning_rate": 3.5098212958691854e-05, + "loss": 84.7358, + "step": 107720 + }, + { + "epoch": 0.43524283180549217, + "grad_norm": 670.6646118164062, + "learning_rate": 3.509501963885134e-05, + "loss": 47.459, + "step": 107730 + }, + { + "epoch": 0.43528323307085975, + "grad_norm": 1127.4244384765625, + "learning_rate": 3.509182612220322e-05, + "loss": 66.771, + "step": 107740 + }, + { + "epoch": 0.4353236343362274, + "grad_norm": 1750.5572509765625, + "learning_rate": 3.5088632408809755e-05, + "loss": 100.8368, + "step": 107750 + }, + { + "epoch": 0.43536403560159503, + "grad_norm": 694.012451171875, + "learning_rate": 3.50854384987332e-05, + "loss": 67.874, + "step": 107760 + }, + { + "epoch": 0.4354044368669627, + "grad_norm": 543.4619750976562, + "learning_rate": 3.508224439203583e-05, + "loss": 48.8384, + "step": 107770 + }, + { + "epoch": 0.4354448381323303, + "grad_norm": 1029.557373046875, + "learning_rate": 3.5079050088779926e-05, + "loss": 41.3493, + "step": 107780 + }, + { + "epoch": 0.43548523939769795, + "grad_norm": 404.4317321777344, + "learning_rate": 3.5075855589027746e-05, + "loss": 59.9128, + "step": 107790 + }, + { + "epoch": 0.4355256406630656, + "grad_norm": 459.72357177734375, + "learning_rate": 3.507266089284157e-05, + "loss": 49.0815, + "step": 107800 + }, + { + "epoch": 0.4355660419284332, + "grad_norm": 1144.3963623046875, + "learning_rate": 3.506946600028368e-05, + "loss": 93.688, + "step": 107810 + }, + { + "epoch": 0.4356064431938008, + "grad_norm": 1337.915771484375, + "learning_rate": 3.5066270911416373e-05, + "loss": 41.8152, + "step": 107820 + }, + { + "epoch": 0.43564684445916846, + "grad_norm": 537.2900390625, + "learning_rate": 3.506307562630194e-05, + "loss": 33.384, + "step": 107830 + }, + { + "epoch": 0.4356872457245361, + "grad_norm": 676.332275390625, + "learning_rate": 3.5059880145002654e-05, + "loss": 72.9059, + "step": 107840 + }, + { + "epoch": 0.43572764698990374, + "grad_norm": 1162.9178466796875, + "learning_rate": 3.505668446758083e-05, + "loss": 46.9766, + "step": 107850 + }, + { + "epoch": 0.4357680482552714, + "grad_norm": 1080.3890380859375, + "learning_rate": 3.505348859409876e-05, + "loss": 53.5876, + "step": 107860 + }, + { + "epoch": 0.43580844952063896, + "grad_norm": 1019.6876220703125, + "learning_rate": 3.5050292524618764e-05, + "loss": 39.1764, + "step": 107870 + }, + { + "epoch": 0.4358488507860066, + "grad_norm": 521.6009521484375, + "learning_rate": 3.5047096259203135e-05, + "loss": 69.4503, + "step": 107880 + }, + { + "epoch": 0.43588925205137424, + "grad_norm": 1152.7193603515625, + "learning_rate": 3.5043899797914187e-05, + "loss": 74.8687, + "step": 107890 + }, + { + "epoch": 0.4359296533167419, + "grad_norm": 993.6572265625, + "learning_rate": 3.504070314081425e-05, + "loss": 64.1491, + "step": 107900 + }, + { + "epoch": 0.4359700545821095, + "grad_norm": 3389.17578125, + "learning_rate": 3.503750628796563e-05, + "loss": 62.572, + "step": 107910 + }, + { + "epoch": 0.43601045584747716, + "grad_norm": 1042.504150390625, + "learning_rate": 3.503430923943066e-05, + "loss": 51.6757, + "step": 107920 + }, + { + "epoch": 0.43605085711284475, + "grad_norm": 1820.093505859375, + "learning_rate": 3.503111199527167e-05, + "loss": 46.9761, + "step": 107930 + }, + { + "epoch": 0.4360912583782124, + "grad_norm": 530.4893798828125, + "learning_rate": 3.5027914555550976e-05, + "loss": 80.2376, + "step": 107940 + }, + { + "epoch": 0.43613165964358, + "grad_norm": 673.11767578125, + "learning_rate": 3.502471692033094e-05, + "loss": 36.759, + "step": 107950 + }, + { + "epoch": 0.43617206090894767, + "grad_norm": 979.2339477539062, + "learning_rate": 3.5021519089673876e-05, + "loss": 53.1278, + "step": 107960 + }, + { + "epoch": 0.4362124621743153, + "grad_norm": 533.0077514648438, + "learning_rate": 3.501832106364213e-05, + "loss": 44.0872, + "step": 107970 + }, + { + "epoch": 0.43625286343968295, + "grad_norm": 2183.581298828125, + "learning_rate": 3.501512284229807e-05, + "loss": 74.3517, + "step": 107980 + }, + { + "epoch": 0.4362932647050506, + "grad_norm": 1532.984375, + "learning_rate": 3.5011924425704036e-05, + "loss": 55.7305, + "step": 107990 + }, + { + "epoch": 0.43633366597041817, + "grad_norm": 794.1177978515625, + "learning_rate": 3.5008725813922386e-05, + "loss": 39.9372, + "step": 108000 + }, + { + "epoch": 0.4363740672357858, + "grad_norm": 489.5924072265625, + "learning_rate": 3.5005527007015455e-05, + "loss": 51.468, + "step": 108010 + }, + { + "epoch": 0.43641446850115345, + "grad_norm": 1078.9996337890625, + "learning_rate": 3.500232800504563e-05, + "loss": 69.4806, + "step": 108020 + }, + { + "epoch": 0.4364548697665211, + "grad_norm": 880.2926025390625, + "learning_rate": 3.499912880807528e-05, + "loss": 53.5359, + "step": 108030 + }, + { + "epoch": 0.43649527103188873, + "grad_norm": 827.0285034179688, + "learning_rate": 3.4995929416166756e-05, + "loss": 50.6137, + "step": 108040 + }, + { + "epoch": 0.43653567229725637, + "grad_norm": 720.4219970703125, + "learning_rate": 3.499272982938244e-05, + "loss": 35.6677, + "step": 108050 + }, + { + "epoch": 0.43657607356262396, + "grad_norm": 739.00927734375, + "learning_rate": 3.4989530047784716e-05, + "loss": 70.8884, + "step": 108060 + }, + { + "epoch": 0.4366164748279916, + "grad_norm": 387.0387878417969, + "learning_rate": 3.498633007143596e-05, + "loss": 47.103, + "step": 108070 + }, + { + "epoch": 0.43665687609335924, + "grad_norm": 912.795654296875, + "learning_rate": 3.498312990039856e-05, + "loss": 56.658, + "step": 108080 + }, + { + "epoch": 0.4366972773587269, + "grad_norm": 1193.864990234375, + "learning_rate": 3.497992953473491e-05, + "loss": 77.9862, + "step": 108090 + }, + { + "epoch": 0.4367376786240945, + "grad_norm": 711.3486938476562, + "learning_rate": 3.4976728974507384e-05, + "loss": 31.2605, + "step": 108100 + }, + { + "epoch": 0.43677807988946216, + "grad_norm": 1228.09033203125, + "learning_rate": 3.497352821977839e-05, + "loss": 42.339, + "step": 108110 + }, + { + "epoch": 0.4368184811548298, + "grad_norm": 524.43798828125, + "learning_rate": 3.497032727061034e-05, + "loss": 59.911, + "step": 108120 + }, + { + "epoch": 0.4368588824201974, + "grad_norm": 255.4713134765625, + "learning_rate": 3.496712612706561e-05, + "loss": 63.1904, + "step": 108130 + }, + { + "epoch": 0.436899283685565, + "grad_norm": 261.4523620605469, + "learning_rate": 3.4963924789206636e-05, + "loss": 88.5668, + "step": 108140 + }, + { + "epoch": 0.43693968495093266, + "grad_norm": 680.0599365234375, + "learning_rate": 3.496072325709582e-05, + "loss": 45.9959, + "step": 108150 + }, + { + "epoch": 0.4369800862163003, + "grad_norm": 402.27239990234375, + "learning_rate": 3.495752153079557e-05, + "loss": 50.0955, + "step": 108160 + }, + { + "epoch": 0.43702048748166794, + "grad_norm": 0.0, + "learning_rate": 3.495431961036832e-05, + "loss": 44.4594, + "step": 108170 + }, + { + "epoch": 0.4370608887470356, + "grad_norm": 455.03326416015625, + "learning_rate": 3.495111749587647e-05, + "loss": 51.1414, + "step": 108180 + }, + { + "epoch": 0.43710129001240317, + "grad_norm": 438.03314208984375, + "learning_rate": 3.494791518738247e-05, + "loss": 112.6708, + "step": 108190 + }, + { + "epoch": 0.4371416912777708, + "grad_norm": 547.6517333984375, + "learning_rate": 3.494471268494875e-05, + "loss": 31.8104, + "step": 108200 + }, + { + "epoch": 0.43718209254313845, + "grad_norm": 541.253173828125, + "learning_rate": 3.494150998863772e-05, + "loss": 63.8281, + "step": 108210 + }, + { + "epoch": 0.4372224938085061, + "grad_norm": 710.3536376953125, + "learning_rate": 3.4938307098511846e-05, + "loss": 43.2936, + "step": 108220 + }, + { + "epoch": 0.4372628950738737, + "grad_norm": 795.8914184570312, + "learning_rate": 3.493510401463355e-05, + "loss": 48.5519, + "step": 108230 + }, + { + "epoch": 0.43730329633924137, + "grad_norm": 1529.8331298828125, + "learning_rate": 3.493190073706529e-05, + "loss": 42.2938, + "step": 108240 + }, + { + "epoch": 0.43734369760460895, + "grad_norm": 1139.19189453125, + "learning_rate": 3.4928697265869515e-05, + "loss": 43.6591, + "step": 108250 + }, + { + "epoch": 0.4373840988699766, + "grad_norm": 427.7257080078125, + "learning_rate": 3.492549360110868e-05, + "loss": 126.2329, + "step": 108260 + }, + { + "epoch": 0.43742450013534423, + "grad_norm": 489.84521484375, + "learning_rate": 3.4922289742845224e-05, + "loss": 51.3288, + "step": 108270 + }, + { + "epoch": 0.43746490140071187, + "grad_norm": 0.0, + "learning_rate": 3.491908569114164e-05, + "loss": 57.9718, + "step": 108280 + }, + { + "epoch": 0.4375053026660795, + "grad_norm": 901.902099609375, + "learning_rate": 3.491588144606035e-05, + "loss": 58.3395, + "step": 108290 + }, + { + "epoch": 0.43754570393144715, + "grad_norm": 549.3409423828125, + "learning_rate": 3.491267700766386e-05, + "loss": 82.0675, + "step": 108300 + }, + { + "epoch": 0.4375861051968148, + "grad_norm": 490.4176025390625, + "learning_rate": 3.490947237601462e-05, + "loss": 52.1806, + "step": 108310 + }, + { + "epoch": 0.4376265064621824, + "grad_norm": 650.8810424804688, + "learning_rate": 3.4906267551175124e-05, + "loss": 76.2948, + "step": 108320 + }, + { + "epoch": 0.43766690772755, + "grad_norm": 959.3955688476562, + "learning_rate": 3.4903062533207834e-05, + "loss": 76.9859, + "step": 108330 + }, + { + "epoch": 0.43770730899291765, + "grad_norm": 609.113037109375, + "learning_rate": 3.489985732217525e-05, + "loss": 34.4562, + "step": 108340 + }, + { + "epoch": 0.4377477102582853, + "grad_norm": 2289.5068359375, + "learning_rate": 3.4896651918139845e-05, + "loss": 71.8435, + "step": 108350 + }, + { + "epoch": 0.43778811152365293, + "grad_norm": 0.0, + "learning_rate": 3.489344632116412e-05, + "loss": 54.8811, + "step": 108360 + }, + { + "epoch": 0.4378285127890206, + "grad_norm": 416.5489196777344, + "learning_rate": 3.489024053131056e-05, + "loss": 54.7843, + "step": 108370 + }, + { + "epoch": 0.43786891405438816, + "grad_norm": 850.0728149414062, + "learning_rate": 3.488703454864167e-05, + "loss": 83.0363, + "step": 108380 + }, + { + "epoch": 0.4379093153197558, + "grad_norm": 580.010498046875, + "learning_rate": 3.488382837321995e-05, + "loss": 45.165, + "step": 108390 + }, + { + "epoch": 0.43794971658512344, + "grad_norm": 684.89111328125, + "learning_rate": 3.488062200510791e-05, + "loss": 78.0684, + "step": 108400 + }, + { + "epoch": 0.4379901178504911, + "grad_norm": 1013.9144287109375, + "learning_rate": 3.487741544436806e-05, + "loss": 63.4015, + "step": 108410 + }, + { + "epoch": 0.4380305191158587, + "grad_norm": 758.853759765625, + "learning_rate": 3.48742086910629e-05, + "loss": 71.6265, + "step": 108420 + }, + { + "epoch": 0.43807092038122636, + "grad_norm": 398.92095947265625, + "learning_rate": 3.487100174525498e-05, + "loss": 43.4276, + "step": 108430 + }, + { + "epoch": 0.438111321646594, + "grad_norm": 313.7893371582031, + "learning_rate": 3.4867794607006784e-05, + "loss": 48.0121, + "step": 108440 + }, + { + "epoch": 0.4381517229119616, + "grad_norm": 1349.3988037109375, + "learning_rate": 3.486458727638085e-05, + "loss": 70.9725, + "step": 108450 + }, + { + "epoch": 0.4381921241773292, + "grad_norm": 433.0930480957031, + "learning_rate": 3.486137975343971e-05, + "loss": 61.4865, + "step": 108460 + }, + { + "epoch": 0.43823252544269686, + "grad_norm": 1083.5059814453125, + "learning_rate": 3.48581720382459e-05, + "loss": 54.0927, + "step": 108470 + }, + { + "epoch": 0.4382729267080645, + "grad_norm": 2715.4462890625, + "learning_rate": 3.485496413086195e-05, + "loss": 87.5101, + "step": 108480 + }, + { + "epoch": 0.43831332797343214, + "grad_norm": 566.393798828125, + "learning_rate": 3.4851756031350394e-05, + "loss": 56.3198, + "step": 108490 + }, + { + "epoch": 0.4383537292387998, + "grad_norm": 815.7106323242188, + "learning_rate": 3.484854773977378e-05, + "loss": 61.3737, + "step": 108500 + }, + { + "epoch": 0.43839413050416737, + "grad_norm": 807.414306640625, + "learning_rate": 3.4845339256194666e-05, + "loss": 44.4692, + "step": 108510 + }, + { + "epoch": 0.438434531769535, + "grad_norm": 984.805419921875, + "learning_rate": 3.484213058067559e-05, + "loss": 75.2083, + "step": 108520 + }, + { + "epoch": 0.43847493303490265, + "grad_norm": 1864.611083984375, + "learning_rate": 3.483892171327911e-05, + "loss": 86.4765, + "step": 108530 + }, + { + "epoch": 0.4385153343002703, + "grad_norm": 1487.40869140625, + "learning_rate": 3.4835712654067785e-05, + "loss": 76.399, + "step": 108540 + }, + { + "epoch": 0.4385557355656379, + "grad_norm": 735.3146362304688, + "learning_rate": 3.483250340310418e-05, + "loss": 70.6352, + "step": 108550 + }, + { + "epoch": 0.43859613683100557, + "grad_norm": 807.2081298828125, + "learning_rate": 3.482929396045087e-05, + "loss": 48.8148, + "step": 108560 + }, + { + "epoch": 0.43863653809637315, + "grad_norm": 632.7271118164062, + "learning_rate": 3.48260843261704e-05, + "loss": 77.7649, + "step": 108570 + }, + { + "epoch": 0.4386769393617408, + "grad_norm": 897.5272216796875, + "learning_rate": 3.482287450032536e-05, + "loss": 51.7576, + "step": 108580 + }, + { + "epoch": 0.43871734062710843, + "grad_norm": 753.651611328125, + "learning_rate": 3.4819664482978325e-05, + "loss": 71.9576, + "step": 108590 + }, + { + "epoch": 0.43875774189247607, + "grad_norm": 524.542724609375, + "learning_rate": 3.481645427419188e-05, + "loss": 51.1556, + "step": 108600 + }, + { + "epoch": 0.4387981431578437, + "grad_norm": 578.78173828125, + "learning_rate": 3.48132438740286e-05, + "loss": 42.8052, + "step": 108610 + }, + { + "epoch": 0.43883854442321135, + "grad_norm": 634.4146118164062, + "learning_rate": 3.481003328255108e-05, + "loss": 56.4694, + "step": 108620 + }, + { + "epoch": 0.438878945688579, + "grad_norm": 823.702392578125, + "learning_rate": 3.480682249982191e-05, + "loss": 51.0967, + "step": 108630 + }, + { + "epoch": 0.4389193469539466, + "grad_norm": 625.2899169921875, + "learning_rate": 3.4803611525903685e-05, + "loss": 55.0041, + "step": 108640 + }, + { + "epoch": 0.4389597482193142, + "grad_norm": 251.00901794433594, + "learning_rate": 3.480040036085901e-05, + "loss": 52.5765, + "step": 108650 + }, + { + "epoch": 0.43900014948468186, + "grad_norm": 1115.88134765625, + "learning_rate": 3.479718900475049e-05, + "loss": 83.0873, + "step": 108660 + }, + { + "epoch": 0.4390405507500495, + "grad_norm": 9209.01953125, + "learning_rate": 3.479397745764071e-05, + "loss": 142.5869, + "step": 108670 + }, + { + "epoch": 0.43908095201541714, + "grad_norm": 704.7980346679688, + "learning_rate": 3.479076571959231e-05, + "loss": 47.6946, + "step": 108680 + }, + { + "epoch": 0.4391213532807848, + "grad_norm": 1096.587890625, + "learning_rate": 3.4787553790667896e-05, + "loss": 57.3606, + "step": 108690 + }, + { + "epoch": 0.43916175454615236, + "grad_norm": 685.4246215820312, + "learning_rate": 3.4784341670930065e-05, + "loss": 52.4217, + "step": 108700 + }, + { + "epoch": 0.43920215581152, + "grad_norm": 385.501220703125, + "learning_rate": 3.478112936044146e-05, + "loss": 43.921, + "step": 108710 + }, + { + "epoch": 0.43924255707688764, + "grad_norm": 648.174560546875, + "learning_rate": 3.477791685926471e-05, + "loss": 48.4695, + "step": 108720 + }, + { + "epoch": 0.4392829583422553, + "grad_norm": 201.7768096923828, + "learning_rate": 3.4774704167462434e-05, + "loss": 63.3514, + "step": 108730 + }, + { + "epoch": 0.4393233596076229, + "grad_norm": 579.0665283203125, + "learning_rate": 3.477149128509727e-05, + "loss": 59.8261, + "step": 108740 + }, + { + "epoch": 0.43936376087299056, + "grad_norm": 727.160888671875, + "learning_rate": 3.476827821223184e-05, + "loss": 48.0612, + "step": 108750 + }, + { + "epoch": 0.4394041621383582, + "grad_norm": 842.6546020507812, + "learning_rate": 3.4765064948928814e-05, + "loss": 52.5195, + "step": 108760 + }, + { + "epoch": 0.4394445634037258, + "grad_norm": 680.0587158203125, + "learning_rate": 3.4761851495250816e-05, + "loss": 83.2483, + "step": 108770 + }, + { + "epoch": 0.4394849646690934, + "grad_norm": 946.002197265625, + "learning_rate": 3.475863785126049e-05, + "loss": 61.0262, + "step": 108780 + }, + { + "epoch": 0.43952536593446107, + "grad_norm": 1292.424072265625, + "learning_rate": 3.47554240170205e-05, + "loss": 117.6964, + "step": 108790 + }, + { + "epoch": 0.4395657671998287, + "grad_norm": 593.932861328125, + "learning_rate": 3.475220999259349e-05, + "loss": 60.331, + "step": 108800 + }, + { + "epoch": 0.43960616846519635, + "grad_norm": 740.8704833984375, + "learning_rate": 3.4748995778042136e-05, + "loss": 88.3325, + "step": 108810 + }, + { + "epoch": 0.439646569730564, + "grad_norm": 495.8048400878906, + "learning_rate": 3.474578137342909e-05, + "loss": 46.6653, + "step": 108820 + }, + { + "epoch": 0.43968697099593157, + "grad_norm": 907.8876342773438, + "learning_rate": 3.474256677881701e-05, + "loss": 65.5185, + "step": 108830 + }, + { + "epoch": 0.4397273722612992, + "grad_norm": 1308.182861328125, + "learning_rate": 3.473935199426858e-05, + "loss": 109.5993, + "step": 108840 + }, + { + "epoch": 0.43976777352666685, + "grad_norm": 538.4534912109375, + "learning_rate": 3.4736137019846465e-05, + "loss": 47.8979, + "step": 108850 + }, + { + "epoch": 0.4398081747920345, + "grad_norm": 302.4723815917969, + "learning_rate": 3.4732921855613355e-05, + "loss": 60.7093, + "step": 108860 + }, + { + "epoch": 0.43984857605740213, + "grad_norm": 1104.8883056640625, + "learning_rate": 3.472970650163191e-05, + "loss": 61.7136, + "step": 108870 + }, + { + "epoch": 0.43988897732276977, + "grad_norm": 1319.9013671875, + "learning_rate": 3.4726490957964834e-05, + "loss": 71.9735, + "step": 108880 + }, + { + "epoch": 0.43992937858813735, + "grad_norm": 3414.708251953125, + "learning_rate": 3.472327522467481e-05, + "loss": 74.4029, + "step": 108890 + }, + { + "epoch": 0.439969779853505, + "grad_norm": 442.96405029296875, + "learning_rate": 3.4720059301824525e-05, + "loss": 74.8008, + "step": 108900 + }, + { + "epoch": 0.44001018111887263, + "grad_norm": 849.4617919921875, + "learning_rate": 3.4716843189476687e-05, + "loss": 67.8839, + "step": 108910 + }, + { + "epoch": 0.4400505823842403, + "grad_norm": 464.7969970703125, + "learning_rate": 3.471362688769398e-05, + "loss": 55.6521, + "step": 108920 + }, + { + "epoch": 0.4400909836496079, + "grad_norm": 426.9241943359375, + "learning_rate": 3.471041039653913e-05, + "loss": 53.6479, + "step": 108930 + }, + { + "epoch": 0.44013138491497555, + "grad_norm": 926.24853515625, + "learning_rate": 3.4707193716074816e-05, + "loss": 58.2524, + "step": 108940 + }, + { + "epoch": 0.4401717861803432, + "grad_norm": 1493.707763671875, + "learning_rate": 3.470397684636377e-05, + "loss": 60.5845, + "step": 108950 + }, + { + "epoch": 0.4402121874457108, + "grad_norm": 715.8504638671875, + "learning_rate": 3.4700759787468695e-05, + "loss": 61.8466, + "step": 108960 + }, + { + "epoch": 0.4402525887110784, + "grad_norm": 539.120849609375, + "learning_rate": 3.469754253945232e-05, + "loss": 50.327, + "step": 108970 + }, + { + "epoch": 0.44029298997644606, + "grad_norm": 644.7427978515625, + "learning_rate": 3.4694325102377355e-05, + "loss": 74.642, + "step": 108980 + }, + { + "epoch": 0.4403333912418137, + "grad_norm": 702.2979736328125, + "learning_rate": 3.469110747630653e-05, + "loss": 56.1246, + "step": 108990 + }, + { + "epoch": 0.44037379250718134, + "grad_norm": 1032.9031982421875, + "learning_rate": 3.4687889661302576e-05, + "loss": 56.0517, + "step": 109000 + }, + { + "epoch": 0.440414193772549, + "grad_norm": 972.0687866210938, + "learning_rate": 3.468467165742823e-05, + "loss": 56.0222, + "step": 109010 + }, + { + "epoch": 0.44045459503791656, + "grad_norm": 598.9658203125, + "learning_rate": 3.468145346474622e-05, + "loss": 57.363, + "step": 109020 + }, + { + "epoch": 0.4404949963032842, + "grad_norm": 616.7764282226562, + "learning_rate": 3.4678235083319296e-05, + "loss": 82.0413, + "step": 109030 + }, + { + "epoch": 0.44053539756865184, + "grad_norm": 318.1781005859375, + "learning_rate": 3.467501651321019e-05, + "loss": 62.9576, + "step": 109040 + }, + { + "epoch": 0.4405757988340195, + "grad_norm": 718.73681640625, + "learning_rate": 3.467179775448166e-05, + "loss": 66.6093, + "step": 109050 + }, + { + "epoch": 0.4406162000993871, + "grad_norm": 973.098388671875, + "learning_rate": 3.466857880719645e-05, + "loss": 65.0112, + "step": 109060 + }, + { + "epoch": 0.44065660136475476, + "grad_norm": 1736.9281005859375, + "learning_rate": 3.466535967141732e-05, + "loss": 67.5672, + "step": 109070 + }, + { + "epoch": 0.4406970026301224, + "grad_norm": 1171.598388671875, + "learning_rate": 3.466214034720702e-05, + "loss": 62.2778, + "step": 109080 + }, + { + "epoch": 0.44073740389549, + "grad_norm": 981.1380004882812, + "learning_rate": 3.4658920834628335e-05, + "loss": 72.6023, + "step": 109090 + }, + { + "epoch": 0.4407778051608576, + "grad_norm": 858.4905395507812, + "learning_rate": 3.4655701133744e-05, + "loss": 43.7386, + "step": 109100 + }, + { + "epoch": 0.44081820642622527, + "grad_norm": 734.6194458007812, + "learning_rate": 3.465248124461681e-05, + "loss": 41.7183, + "step": 109110 + }, + { + "epoch": 0.4408586076915929, + "grad_norm": 2607.31787109375, + "learning_rate": 3.4649261167309526e-05, + "loss": 105.1528, + "step": 109120 + }, + { + "epoch": 0.44089900895696055, + "grad_norm": 675.8436279296875, + "learning_rate": 3.464604090188493e-05, + "loss": 75.2605, + "step": 109130 + }, + { + "epoch": 0.4409394102223282, + "grad_norm": 472.8738098144531, + "learning_rate": 3.46428204484058e-05, + "loss": 50.3132, + "step": 109140 + }, + { + "epoch": 0.44097981148769577, + "grad_norm": 629.1958618164062, + "learning_rate": 3.463959980693492e-05, + "loss": 47.585, + "step": 109150 + }, + { + "epoch": 0.4410202127530634, + "grad_norm": 714.365478515625, + "learning_rate": 3.4636378977535075e-05, + "loss": 62.9666, + "step": 109160 + }, + { + "epoch": 0.44106061401843105, + "grad_norm": 1306.74169921875, + "learning_rate": 3.4633157960269056e-05, + "loss": 67.3224, + "step": 109170 + }, + { + "epoch": 0.4411010152837987, + "grad_norm": 810.0322265625, + "learning_rate": 3.462993675519968e-05, + "loss": 45.5099, + "step": 109180 + }, + { + "epoch": 0.44114141654916633, + "grad_norm": 822.7820434570312, + "learning_rate": 3.462671536238972e-05, + "loss": 47.5919, + "step": 109190 + }, + { + "epoch": 0.44118181781453397, + "grad_norm": 719.748046875, + "learning_rate": 3.462349378190199e-05, + "loss": 58.4106, + "step": 109200 + }, + { + "epoch": 0.44122221907990156, + "grad_norm": 1387.738525390625, + "learning_rate": 3.4620272013799286e-05, + "loss": 55.8598, + "step": 109210 + }, + { + "epoch": 0.4412626203452692, + "grad_norm": 291.135009765625, + "learning_rate": 3.461705005814444e-05, + "loss": 56.4821, + "step": 109220 + }, + { + "epoch": 0.44130302161063684, + "grad_norm": 2201.89453125, + "learning_rate": 3.4613827915000244e-05, + "loss": 112.0971, + "step": 109230 + }, + { + "epoch": 0.4413434228760045, + "grad_norm": 828.1795043945312, + "learning_rate": 3.461060558442952e-05, + "loss": 52.2993, + "step": 109240 + }, + { + "epoch": 0.4413838241413721, + "grad_norm": 493.1712341308594, + "learning_rate": 3.460738306649509e-05, + "loss": 38.8712, + "step": 109250 + }, + { + "epoch": 0.44142422540673976, + "grad_norm": 310.04742431640625, + "learning_rate": 3.4604160361259796e-05, + "loss": 54.1063, + "step": 109260 + }, + { + "epoch": 0.4414646266721074, + "grad_norm": 1105.275634765625, + "learning_rate": 3.460093746878644e-05, + "loss": 57.73, + "step": 109270 + }, + { + "epoch": 0.441505027937475, + "grad_norm": 422.4311828613281, + "learning_rate": 3.459771438913787e-05, + "loss": 76.8583, + "step": 109280 + }, + { + "epoch": 0.4415454292028426, + "grad_norm": 533.1647338867188, + "learning_rate": 3.459449112237691e-05, + "loss": 70.399, + "step": 109290 + }, + { + "epoch": 0.44158583046821026, + "grad_norm": 453.3349304199219, + "learning_rate": 3.459126766856641e-05, + "loss": 47.0437, + "step": 109300 + }, + { + "epoch": 0.4416262317335779, + "grad_norm": 868.0079345703125, + "learning_rate": 3.458804402776921e-05, + "loss": 49.0908, + "step": 109310 + }, + { + "epoch": 0.44166663299894554, + "grad_norm": 986.170166015625, + "learning_rate": 3.458482020004815e-05, + "loss": 54.6284, + "step": 109320 + }, + { + "epoch": 0.4417070342643132, + "grad_norm": 447.4167785644531, + "learning_rate": 3.4581596185466094e-05, + "loss": 59.609, + "step": 109330 + }, + { + "epoch": 0.44174743552968077, + "grad_norm": 641.2802124023438, + "learning_rate": 3.457837198408588e-05, + "loss": 85.7572, + "step": 109340 + }, + { + "epoch": 0.4417878367950484, + "grad_norm": 808.4306640625, + "learning_rate": 3.457514759597038e-05, + "loss": 64.826, + "step": 109350 + }, + { + "epoch": 0.44182823806041605, + "grad_norm": 202.7559051513672, + "learning_rate": 3.457192302118244e-05, + "loss": 52.6859, + "step": 109360 + }, + { + "epoch": 0.4418686393257837, + "grad_norm": 1533.5028076171875, + "learning_rate": 3.4568698259784945e-05, + "loss": 71.5681, + "step": 109370 + }, + { + "epoch": 0.4419090405911513, + "grad_norm": 788.40771484375, + "learning_rate": 3.4565473311840735e-05, + "loss": 56.7714, + "step": 109380 + }, + { + "epoch": 0.44194944185651897, + "grad_norm": 659.1009521484375, + "learning_rate": 3.4562248177412715e-05, + "loss": 44.7925, + "step": 109390 + }, + { + "epoch": 0.44198984312188655, + "grad_norm": 695.70068359375, + "learning_rate": 3.455902285656373e-05, + "loss": 95.6126, + "step": 109400 + }, + { + "epoch": 0.4420302443872542, + "grad_norm": 536.9435424804688, + "learning_rate": 3.4555797349356676e-05, + "loss": 52.6949, + "step": 109410 + }, + { + "epoch": 0.44207064565262183, + "grad_norm": 928.5408935546875, + "learning_rate": 3.455257165585444e-05, + "loss": 65.1302, + "step": 109420 + }, + { + "epoch": 0.44211104691798947, + "grad_norm": 635.9898071289062, + "learning_rate": 3.454934577611989e-05, + "loss": 88.1777, + "step": 109430 + }, + { + "epoch": 0.4421514481833571, + "grad_norm": 1163.3731689453125, + "learning_rate": 3.454611971021593e-05, + "loss": 75.0465, + "step": 109440 + }, + { + "epoch": 0.44219184944872475, + "grad_norm": 1179.76708984375, + "learning_rate": 3.454289345820546e-05, + "loss": 72.196, + "step": 109450 + }, + { + "epoch": 0.4422322507140924, + "grad_norm": 407.4288024902344, + "learning_rate": 3.453966702015137e-05, + "loss": 37.4781, + "step": 109460 + }, + { + "epoch": 0.44227265197946, + "grad_norm": 1263.00732421875, + "learning_rate": 3.453644039611656e-05, + "loss": 55.1117, + "step": 109470 + }, + { + "epoch": 0.4423130532448276, + "grad_norm": 441.8350830078125, + "learning_rate": 3.453321358616393e-05, + "loss": 34.3389, + "step": 109480 + }, + { + "epoch": 0.44235345451019525, + "grad_norm": 813.268798828125, + "learning_rate": 3.452998659035639e-05, + "loss": 61.2214, + "step": 109490 + }, + { + "epoch": 0.4423938557755629, + "grad_norm": 1138.2327880859375, + "learning_rate": 3.452675940875686e-05, + "loss": 69.0981, + "step": 109500 + }, + { + "epoch": 0.44243425704093053, + "grad_norm": 1215.769287109375, + "learning_rate": 3.452353204142824e-05, + "loss": 52.1015, + "step": 109510 + }, + { + "epoch": 0.4424746583062982, + "grad_norm": 169.15260314941406, + "learning_rate": 3.452030448843347e-05, + "loss": 49.8006, + "step": 109520 + }, + { + "epoch": 0.44251505957166576, + "grad_norm": 372.5907287597656, + "learning_rate": 3.451707674983546e-05, + "loss": 67.884, + "step": 109530 + }, + { + "epoch": 0.4425554608370334, + "grad_norm": 1219.9879150390625, + "learning_rate": 3.451384882569714e-05, + "loss": 48.5552, + "step": 109540 + }, + { + "epoch": 0.44259586210240104, + "grad_norm": 1142.11865234375, + "learning_rate": 3.4510620716081446e-05, + "loss": 50.2349, + "step": 109550 + }, + { + "epoch": 0.4426362633677687, + "grad_norm": 1006.7579956054688, + "learning_rate": 3.45073924210513e-05, + "loss": 100.4007, + "step": 109560 + }, + { + "epoch": 0.4426766646331363, + "grad_norm": 714.3269653320312, + "learning_rate": 3.4504163940669634e-05, + "loss": 65.3141, + "step": 109570 + }, + { + "epoch": 0.44271706589850396, + "grad_norm": 1006.07177734375, + "learning_rate": 3.4500935274999413e-05, + "loss": 98.7038, + "step": 109580 + }, + { + "epoch": 0.4427574671638716, + "grad_norm": 1100.7425537109375, + "learning_rate": 3.449770642410356e-05, + "loss": 58.6847, + "step": 109590 + }, + { + "epoch": 0.4427978684292392, + "grad_norm": 1185.401123046875, + "learning_rate": 3.4494477388045035e-05, + "loss": 56.3779, + "step": 109600 + }, + { + "epoch": 0.4428382696946068, + "grad_norm": 1830.9031982421875, + "learning_rate": 3.449124816688677e-05, + "loss": 71.803, + "step": 109610 + }, + { + "epoch": 0.44287867095997446, + "grad_norm": 639.5352172851562, + "learning_rate": 3.448801876069176e-05, + "loss": 59.7502, + "step": 109620 + }, + { + "epoch": 0.4429190722253421, + "grad_norm": 818.8923950195312, + "learning_rate": 3.4484789169522927e-05, + "loss": 65.0993, + "step": 109630 + }, + { + "epoch": 0.44295947349070974, + "grad_norm": 520.5197143554688, + "learning_rate": 3.448155939344324e-05, + "loss": 58.1046, + "step": 109640 + }, + { + "epoch": 0.4429998747560774, + "grad_norm": 2226.27978515625, + "learning_rate": 3.4478329432515674e-05, + "loss": 49.6613, + "step": 109650 + }, + { + "epoch": 0.44304027602144497, + "grad_norm": 3536.548095703125, + "learning_rate": 3.44750992868032e-05, + "loss": 60.4391, + "step": 109660 + }, + { + "epoch": 0.4430806772868126, + "grad_norm": 922.1171875, + "learning_rate": 3.447186895636879e-05, + "loss": 49.5081, + "step": 109670 + }, + { + "epoch": 0.44312107855218025, + "grad_norm": 485.9327697753906, + "learning_rate": 3.4468638441275415e-05, + "loss": 52.8482, + "step": 109680 + }, + { + "epoch": 0.4431614798175479, + "grad_norm": 1202.439208984375, + "learning_rate": 3.4465407741586056e-05, + "loss": 51.3917, + "step": 109690 + }, + { + "epoch": 0.4432018810829155, + "grad_norm": 3666.6962890625, + "learning_rate": 3.4462176857363704e-05, + "loss": 50.0755, + "step": 109700 + }, + { + "epoch": 0.44324228234828317, + "grad_norm": 566.9150390625, + "learning_rate": 3.445894578867134e-05, + "loss": 77.9221, + "step": 109710 + }, + { + "epoch": 0.44328268361365075, + "grad_norm": 695.4548950195312, + "learning_rate": 3.445571453557196e-05, + "loss": 92.9399, + "step": 109720 + }, + { + "epoch": 0.4433230848790184, + "grad_norm": 1101.68212890625, + "learning_rate": 3.445248309812856e-05, + "loss": 67.3483, + "step": 109730 + }, + { + "epoch": 0.44336348614438603, + "grad_norm": 782.0620727539062, + "learning_rate": 3.4449251476404135e-05, + "loss": 47.7574, + "step": 109740 + }, + { + "epoch": 0.44340388740975367, + "grad_norm": 404.2615661621094, + "learning_rate": 3.444601967046168e-05, + "loss": 68.9839, + "step": 109750 + }, + { + "epoch": 0.4434442886751213, + "grad_norm": 1344.5721435546875, + "learning_rate": 3.444278768036421e-05, + "loss": 51.2115, + "step": 109760 + }, + { + "epoch": 0.44348468994048895, + "grad_norm": 453.09136962890625, + "learning_rate": 3.443955550617474e-05, + "loss": 62.8084, + "step": 109770 + }, + { + "epoch": 0.4435250912058566, + "grad_norm": 690.1050415039062, + "learning_rate": 3.443632314795627e-05, + "loss": 37.6234, + "step": 109780 + }, + { + "epoch": 0.4435654924712242, + "grad_norm": 942.2594604492188, + "learning_rate": 3.443309060577182e-05, + "loss": 57.9371, + "step": 109790 + }, + { + "epoch": 0.4436058937365918, + "grad_norm": 2099.522216796875, + "learning_rate": 3.442985787968442e-05, + "loss": 96.0077, + "step": 109800 + }, + { + "epoch": 0.44364629500195946, + "grad_norm": 894.1741333007812, + "learning_rate": 3.4426624969757083e-05, + "loss": 43.2961, + "step": 109810 + }, + { + "epoch": 0.4436866962673271, + "grad_norm": 515.2462158203125, + "learning_rate": 3.442339187605283e-05, + "loss": 44.3043, + "step": 109820 + }, + { + "epoch": 0.44372709753269474, + "grad_norm": 1050.0601806640625, + "learning_rate": 3.442015859863472e-05, + "loss": 80.5312, + "step": 109830 + }, + { + "epoch": 0.4437674987980624, + "grad_norm": 2025.021484375, + "learning_rate": 3.4416925137565754e-05, + "loss": 60.7582, + "step": 109840 + }, + { + "epoch": 0.44380790006342996, + "grad_norm": 1199.0517578125, + "learning_rate": 3.4413691492908985e-05, + "loss": 62.2759, + "step": 109850 + }, + { + "epoch": 0.4438483013287976, + "grad_norm": 781.955078125, + "learning_rate": 3.441045766472745e-05, + "loss": 55.6269, + "step": 109860 + }, + { + "epoch": 0.44388870259416524, + "grad_norm": 1648.44140625, + "learning_rate": 3.440722365308421e-05, + "loss": 57.5258, + "step": 109870 + }, + { + "epoch": 0.4439291038595329, + "grad_norm": 1811.8358154296875, + "learning_rate": 3.440398945804229e-05, + "loss": 78.8444, + "step": 109880 + }, + { + "epoch": 0.4439695051249005, + "grad_norm": 678.7551879882812, + "learning_rate": 3.440075507966476e-05, + "loss": 58.5628, + "step": 109890 + }, + { + "epoch": 0.44400990639026816, + "grad_norm": 1020.2061157226562, + "learning_rate": 3.439752051801467e-05, + "loss": 41.0844, + "step": 109900 + }, + { + "epoch": 0.4440503076556358, + "grad_norm": 1242.7587890625, + "learning_rate": 3.439428577315508e-05, + "loss": 50.6694, + "step": 109910 + }, + { + "epoch": 0.4440907089210034, + "grad_norm": 1669.455810546875, + "learning_rate": 3.439105084514905e-05, + "loss": 61.132, + "step": 109920 + }, + { + "epoch": 0.444131110186371, + "grad_norm": 357.2478942871094, + "learning_rate": 3.4387815734059654e-05, + "loss": 64.2863, + "step": 109930 + }, + { + "epoch": 0.44417151145173867, + "grad_norm": 696.1216430664062, + "learning_rate": 3.438458043994995e-05, + "loss": 52.3681, + "step": 109940 + }, + { + "epoch": 0.4442119127171063, + "grad_norm": 1389.9837646484375, + "learning_rate": 3.438134496288302e-05, + "loss": 59.619, + "step": 109950 + }, + { + "epoch": 0.44425231398247395, + "grad_norm": 819.2481689453125, + "learning_rate": 3.437810930292195e-05, + "loss": 71.3323, + "step": 109960 + }, + { + "epoch": 0.4442927152478416, + "grad_norm": 541.750732421875, + "learning_rate": 3.43748734601298e-05, + "loss": 73.0371, + "step": 109970 + }, + { + "epoch": 0.44433311651320917, + "grad_norm": 923.3477172851562, + "learning_rate": 3.437163743456967e-05, + "loss": 50.583, + "step": 109980 + }, + { + "epoch": 0.4443735177785768, + "grad_norm": 1519.56005859375, + "learning_rate": 3.436840122630464e-05, + "loss": 63.0997, + "step": 109990 + }, + { + "epoch": 0.44441391904394445, + "grad_norm": 1023.0885620117188, + "learning_rate": 3.436516483539781e-05, + "loss": 35.0977, + "step": 110000 } ], "logging_steps": 10,