{ "best_metric": 0.36434125900268555, "best_model_checkpoint": "./vit-base-beans/checkpoint-3200", "epoch": 2.0, "eval_steps": 100, "global_step": 3228, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006195786864931847, "grad_norm": 1.9642935991287231, "learning_rate": 0.00019938042131350682, "loss": 3.9212, "step": 10 }, { "epoch": 0.012391573729863693, "grad_norm": 1.8843648433685303, "learning_rate": 0.00019876084262701366, "loss": 4.1331, "step": 20 }, { "epoch": 0.01858736059479554, "grad_norm": 1.7067281007766724, "learning_rate": 0.00019814126394052047, "loss": 4.0633, "step": 30 }, { "epoch": 0.024783147459727387, "grad_norm": 1.7453334331512451, "learning_rate": 0.00019752168525402728, "loss": 3.9709, "step": 40 }, { "epoch": 0.030978934324659233, "grad_norm": 2.055915355682373, "learning_rate": 0.0001969021065675341, "loss": 3.8473, "step": 50 }, { "epoch": 0.03717472118959108, "grad_norm": 1.8063888549804688, "learning_rate": 0.00019628252788104092, "loss": 3.7734, "step": 60 }, { "epoch": 0.04337050805452292, "grad_norm": 1.8397654294967651, "learning_rate": 0.0001956629491945477, "loss": 3.8279, "step": 70 }, { "epoch": 0.04956629491945477, "grad_norm": 2.0568203926086426, "learning_rate": 0.00019504337050805452, "loss": 3.7613, "step": 80 }, { "epoch": 0.055762081784386616, "grad_norm": 2.3892436027526855, "learning_rate": 0.00019442379182156135, "loss": 3.6098, "step": 90 }, { "epoch": 0.061957868649318466, "grad_norm": 2.0522940158843994, "learning_rate": 0.00019380421313506816, "loss": 3.5924, "step": 100 }, { "epoch": 0.061957868649318466, "eval_accuracy": 0.1926889714993804, "eval_loss": 3.567479133605957, "eval_runtime": 134.6311, "eval_samples_per_second": 47.953, "eval_steps_per_second": 5.994, "step": 100 }, { "epoch": 0.06815365551425032, "grad_norm": 2.773609161376953, "learning_rate": 0.00019318463444857497, "loss": 3.5271, "step": 110 }, { "epoch": 0.07434944237918216, "grad_norm": 2.709200620651245, "learning_rate": 0.00019256505576208178, "loss": 3.5848, "step": 120 }, { "epoch": 0.080545229244114, "grad_norm": 2.2230889797210693, "learning_rate": 0.00019194547707558862, "loss": 3.419, "step": 130 }, { "epoch": 0.08674101610904585, "grad_norm": 2.003593683242798, "learning_rate": 0.00019132589838909543, "loss": 3.4474, "step": 140 }, { "epoch": 0.09293680297397769, "grad_norm": 2.4022834300994873, "learning_rate": 0.00019070631970260224, "loss": 3.3324, "step": 150 }, { "epoch": 0.09913258983890955, "grad_norm": 2.2967748641967773, "learning_rate": 0.00019008674101610905, "loss": 3.1463, "step": 160 }, { "epoch": 0.10532837670384139, "grad_norm": 2.498286724090576, "learning_rate": 0.0001894671623296159, "loss": 3.2931, "step": 170 }, { "epoch": 0.11152416356877323, "grad_norm": 2.418286085128784, "learning_rate": 0.0001888475836431227, "loss": 3.24, "step": 180 }, { "epoch": 0.11771995043370508, "grad_norm": 2.7124204635620117, "learning_rate": 0.0001882280049566295, "loss": 3.1876, "step": 190 }, { "epoch": 0.12391573729863693, "grad_norm": 2.0705409049987793, "learning_rate": 0.00018760842627013632, "loss": 3.0189, "step": 200 }, { "epoch": 0.12391573729863693, "eval_accuracy": 0.30467781908302355, "eval_loss": 3.0312585830688477, "eval_runtime": 106.8938, "eval_samples_per_second": 60.396, "eval_steps_per_second": 7.55, "step": 200 }, { "epoch": 0.13011152416356878, "grad_norm": 2.31803822517395, "learning_rate": 0.00018698884758364313, "loss": 2.8399, "step": 210 }, { "epoch": 0.13630731102850063, "grad_norm": 2.759551525115967, "learning_rate": 0.00018636926889714994, "loss": 2.9606, "step": 220 }, { "epoch": 0.14250309789343246, "grad_norm": 2.6668756008148193, "learning_rate": 0.00018574969021065675, "loss": 2.8822, "step": 230 }, { "epoch": 0.14869888475836432, "grad_norm": 2.4296159744262695, "learning_rate": 0.00018513011152416359, "loss": 3.0186, "step": 240 }, { "epoch": 0.15489467162329615, "grad_norm": 2.854280948638916, "learning_rate": 0.0001845105328376704, "loss": 2.85, "step": 250 }, { "epoch": 0.161090458488228, "grad_norm": 2.2501845359802246, "learning_rate": 0.0001838909541511772, "loss": 2.7842, "step": 260 }, { "epoch": 0.16728624535315986, "grad_norm": 2.2110307216644287, "learning_rate": 0.00018327137546468402, "loss": 2.6941, "step": 270 }, { "epoch": 0.1734820322180917, "grad_norm": 3.233548164367676, "learning_rate": 0.00018265179677819085, "loss": 2.7383, "step": 280 }, { "epoch": 0.17967781908302355, "grad_norm": 3.4657251834869385, "learning_rate": 0.00018203221809169766, "loss": 2.8092, "step": 290 }, { "epoch": 0.18587360594795538, "grad_norm": 2.545624256134033, "learning_rate": 0.00018141263940520447, "loss": 2.5541, "step": 300 }, { "epoch": 0.18587360594795538, "eval_accuracy": 0.3956009913258984, "eval_loss": 2.557483434677124, "eval_runtime": 106.1092, "eval_samples_per_second": 60.843, "eval_steps_per_second": 7.605, "step": 300 }, { "epoch": 0.19206939281288724, "grad_norm": 3.7214767932891846, "learning_rate": 0.00018079306071871128, "loss": 2.673, "step": 310 }, { "epoch": 0.1982651796778191, "grad_norm": 2.865368604660034, "learning_rate": 0.00018017348203221812, "loss": 2.4952, "step": 320 }, { "epoch": 0.20446096654275092, "grad_norm": 2.8324084281921387, "learning_rate": 0.0001795539033457249, "loss": 2.5799, "step": 330 }, { "epoch": 0.21065675340768278, "grad_norm": 2.980775833129883, "learning_rate": 0.00017893432465923171, "loss": 2.518, "step": 340 }, { "epoch": 0.21685254027261464, "grad_norm": 3.9543583393096924, "learning_rate": 0.00017831474597273855, "loss": 2.3632, "step": 350 }, { "epoch": 0.22304832713754646, "grad_norm": 6.1679368019104, "learning_rate": 0.00017769516728624536, "loss": 2.4168, "step": 360 }, { "epoch": 0.22924411400247832, "grad_norm": 3.2867679595947266, "learning_rate": 0.00017707558859975217, "loss": 2.4561, "step": 370 }, { "epoch": 0.23543990086741015, "grad_norm": 3.4070069789886475, "learning_rate": 0.00017645600991325898, "loss": 2.3094, "step": 380 }, { "epoch": 0.241635687732342, "grad_norm": 2.8205995559692383, "learning_rate": 0.00017583643122676582, "loss": 2.2237, "step": 390 }, { "epoch": 0.24783147459727387, "grad_norm": 3.1000003814697266, "learning_rate": 0.00017521685254027263, "loss": 2.114, "step": 400 }, { "epoch": 0.24783147459727387, "eval_accuracy": 0.45709417596034696, "eval_loss": 2.233164072036743, "eval_runtime": 106.5812, "eval_samples_per_second": 60.574, "eval_steps_per_second": 7.572, "step": 400 }, { "epoch": 0.2540272614622057, "grad_norm": 3.512111186981201, "learning_rate": 0.00017459727385377944, "loss": 2.1117, "step": 410 }, { "epoch": 0.26022304832713755, "grad_norm": 4.639825344085693, "learning_rate": 0.00017397769516728625, "loss": 2.0458, "step": 420 }, { "epoch": 0.2664188351920694, "grad_norm": 3.5039663314819336, "learning_rate": 0.00017335811648079309, "loss": 2.1726, "step": 430 }, { "epoch": 0.27261462205700127, "grad_norm": 3.6471774578094482, "learning_rate": 0.0001727385377942999, "loss": 2.1414, "step": 440 }, { "epoch": 0.2788104089219331, "grad_norm": 4.325891971588135, "learning_rate": 0.0001721189591078067, "loss": 2.0651, "step": 450 }, { "epoch": 0.2850061957868649, "grad_norm": 2.3341152667999268, "learning_rate": 0.00017149938042131352, "loss": 2.0514, "step": 460 }, { "epoch": 0.29120198265179675, "grad_norm": 3.562957525253296, "learning_rate": 0.00017087980173482033, "loss": 1.8995, "step": 470 }, { "epoch": 0.29739776951672864, "grad_norm": 4.375968933105469, "learning_rate": 0.00017026022304832714, "loss": 1.981, "step": 480 }, { "epoch": 0.30359355638166047, "grad_norm": 2.5124247074127197, "learning_rate": 0.00016964064436183395, "loss": 1.8624, "step": 490 }, { "epoch": 0.3097893432465923, "grad_norm": 4.64390230178833, "learning_rate": 0.00016902106567534078, "loss": 1.9624, "step": 500 }, { "epoch": 0.3097893432465923, "eval_accuracy": 0.559634448574969, "eval_loss": 1.945489764213562, "eval_runtime": 107.2688, "eval_samples_per_second": 60.185, "eval_steps_per_second": 7.523, "step": 500 }, { "epoch": 0.3159851301115242, "grad_norm": 3.2166125774383545, "learning_rate": 0.0001684014869888476, "loss": 1.8621, "step": 510 }, { "epoch": 0.322180916976456, "grad_norm": 3.2079391479492188, "learning_rate": 0.0001677819083023544, "loss": 2.048, "step": 520 }, { "epoch": 0.32837670384138784, "grad_norm": 4.8925652503967285, "learning_rate": 0.0001671623296158612, "loss": 1.9486, "step": 530 }, { "epoch": 0.3345724907063197, "grad_norm": 3.3474984169006348, "learning_rate": 0.00016654275092936805, "loss": 1.9194, "step": 540 }, { "epoch": 0.34076827757125155, "grad_norm": 3.6939406394958496, "learning_rate": 0.00016592317224287486, "loss": 1.8698, "step": 550 }, { "epoch": 0.3469640644361834, "grad_norm": 3.176316022872925, "learning_rate": 0.00016530359355638167, "loss": 1.7831, "step": 560 }, { "epoch": 0.35315985130111527, "grad_norm": 3.4785337448120117, "learning_rate": 0.00016468401486988848, "loss": 1.7184, "step": 570 }, { "epoch": 0.3593556381660471, "grad_norm": 2.730374336242676, "learning_rate": 0.00016406443618339532, "loss": 1.6898, "step": 580 }, { "epoch": 0.3655514250309789, "grad_norm": 3.3291196823120117, "learning_rate": 0.0001634448574969021, "loss": 1.6437, "step": 590 }, { "epoch": 0.37174721189591076, "grad_norm": 4.465322017669678, "learning_rate": 0.0001628252788104089, "loss": 1.6749, "step": 600 }, { "epoch": 0.37174721189591076, "eval_accuracy": 0.5786864931846345, "eval_loss": 1.7369675636291504, "eval_runtime": 106.0837, "eval_samples_per_second": 60.858, "eval_steps_per_second": 7.607, "step": 600 }, { "epoch": 0.37794299876084264, "grad_norm": 4.176516056060791, "learning_rate": 0.00016220570012391575, "loss": 1.6242, "step": 610 }, { "epoch": 0.38413878562577447, "grad_norm": 4.0990495681762695, "learning_rate": 0.00016158612143742256, "loss": 1.774, "step": 620 }, { "epoch": 0.3903345724907063, "grad_norm": 5.12111234664917, "learning_rate": 0.00016096654275092937, "loss": 1.7558, "step": 630 }, { "epoch": 0.3965303593556382, "grad_norm": 4.448328018188477, "learning_rate": 0.00016034696406443618, "loss": 1.6905, "step": 640 }, { "epoch": 0.40272614622057, "grad_norm": 4.468796253204346, "learning_rate": 0.00015972738537794301, "loss": 1.5202, "step": 650 }, { "epoch": 0.40892193308550184, "grad_norm": 4.141849517822266, "learning_rate": 0.00015910780669144982, "loss": 1.5273, "step": 660 }, { "epoch": 0.41511771995043373, "grad_norm": 4.007165431976318, "learning_rate": 0.00015848822800495664, "loss": 1.5843, "step": 670 }, { "epoch": 0.42131350681536556, "grad_norm": 3.8219428062438965, "learning_rate": 0.00015786864931846345, "loss": 1.6011, "step": 680 }, { "epoch": 0.4275092936802974, "grad_norm": 4.7010345458984375, "learning_rate": 0.00015724907063197028, "loss": 1.6389, "step": 690 }, { "epoch": 0.43370508054522927, "grad_norm": 3.79860258102417, "learning_rate": 0.0001566294919454771, "loss": 1.5852, "step": 700 }, { "epoch": 0.43370508054522927, "eval_accuracy": 0.6438971499380421, "eval_loss": 1.4947177171707153, "eval_runtime": 106.0976, "eval_samples_per_second": 60.85, "eval_steps_per_second": 7.606, "step": 700 }, { "epoch": 0.4399008674101611, "grad_norm": 3.5789403915405273, "learning_rate": 0.0001560099132589839, "loss": 1.526, "step": 710 }, { "epoch": 0.44609665427509293, "grad_norm": 4.297870635986328, "learning_rate": 0.0001553903345724907, "loss": 1.5452, "step": 720 }, { "epoch": 0.45229244114002476, "grad_norm": 5.054442882537842, "learning_rate": 0.00015477075588599752, "loss": 1.3529, "step": 730 }, { "epoch": 0.45848822800495664, "grad_norm": 5.724175930023193, "learning_rate": 0.00015415117719950433, "loss": 1.4825, "step": 740 }, { "epoch": 0.4646840148698885, "grad_norm": 3.8260886669158936, "learning_rate": 0.00015353159851301114, "loss": 1.4692, "step": 750 }, { "epoch": 0.4708798017348203, "grad_norm": 3.232948064804077, "learning_rate": 0.00015291201982651798, "loss": 1.43, "step": 760 }, { "epoch": 0.4770755885997522, "grad_norm": 6.948119163513184, "learning_rate": 0.0001522924411400248, "loss": 1.5213, "step": 770 }, { "epoch": 0.483271375464684, "grad_norm": 5.678015232086182, "learning_rate": 0.0001516728624535316, "loss": 1.2681, "step": 780 }, { "epoch": 0.48946716232961585, "grad_norm": 3.9260432720184326, "learning_rate": 0.0001510532837670384, "loss": 1.4108, "step": 790 }, { "epoch": 0.49566294919454773, "grad_norm": 5.349125862121582, "learning_rate": 0.00015043370508054525, "loss": 1.1875, "step": 800 }, { "epoch": 0.49566294919454773, "eval_accuracy": 0.6468401486988847, "eval_loss": 1.4151387214660645, "eval_runtime": 105.6692, "eval_samples_per_second": 61.096, "eval_steps_per_second": 7.637, "step": 800 }, { "epoch": 0.5018587360594795, "grad_norm": 4.983338832855225, "learning_rate": 0.00014981412639405206, "loss": 1.3196, "step": 810 }, { "epoch": 0.5080545229244114, "grad_norm": 5.539896488189697, "learning_rate": 0.00014919454770755887, "loss": 1.4704, "step": 820 }, { "epoch": 0.5142503097893433, "grad_norm": 4.018556594848633, "learning_rate": 0.0001485749690210657, "loss": 1.2208, "step": 830 }, { "epoch": 0.5204460966542751, "grad_norm": 4.443668842315674, "learning_rate": 0.00014795539033457251, "loss": 1.2335, "step": 840 }, { "epoch": 0.5266418835192069, "grad_norm": 6.135488033294678, "learning_rate": 0.0001473358116480793, "loss": 1.4955, "step": 850 }, { "epoch": 0.5328376703841388, "grad_norm": 5.287814617156982, "learning_rate": 0.0001467162329615861, "loss": 1.2792, "step": 860 }, { "epoch": 0.5390334572490706, "grad_norm": 5.035948276519775, "learning_rate": 0.00014609665427509294, "loss": 1.2946, "step": 870 }, { "epoch": 0.5452292441140025, "grad_norm": 4.492195129394531, "learning_rate": 0.00014547707558859975, "loss": 1.2686, "step": 880 }, { "epoch": 0.5514250309789344, "grad_norm": 4.308188438415527, "learning_rate": 0.00014485749690210656, "loss": 1.3539, "step": 890 }, { "epoch": 0.5576208178438662, "grad_norm": 5.699028015136719, "learning_rate": 0.00014423791821561337, "loss": 1.5114, "step": 900 }, { "epoch": 0.5576208178438662, "eval_accuracy": 0.682001239157373, "eval_loss": 1.2709109783172607, "eval_runtime": 105.9622, "eval_samples_per_second": 60.927, "eval_steps_per_second": 7.616, "step": 900 }, { "epoch": 0.563816604708798, "grad_norm": 3.6500446796417236, "learning_rate": 0.0001436183395291202, "loss": 1.1699, "step": 910 }, { "epoch": 0.5700123915737298, "grad_norm": 5.26973819732666, "learning_rate": 0.00014299876084262702, "loss": 1.2285, "step": 920 }, { "epoch": 0.5762081784386617, "grad_norm": 5.995537757873535, "learning_rate": 0.00014237918215613383, "loss": 1.2008, "step": 930 }, { "epoch": 0.5824039653035935, "grad_norm": 4.208325386047363, "learning_rate": 0.00014175960346964067, "loss": 0.9493, "step": 940 }, { "epoch": 0.5885997521685254, "grad_norm": 4.716500282287598, "learning_rate": 0.00014114002478314748, "loss": 1.1492, "step": 950 }, { "epoch": 0.5947955390334573, "grad_norm": 3.3192636966705322, "learning_rate": 0.0001405204460966543, "loss": 1.3701, "step": 960 }, { "epoch": 0.6009913258983891, "grad_norm": 4.523627758026123, "learning_rate": 0.0001399008674101611, "loss": 1.1221, "step": 970 }, { "epoch": 0.6071871127633209, "grad_norm": 4.424323081970215, "learning_rate": 0.0001392812887236679, "loss": 1.2023, "step": 980 }, { "epoch": 0.6133828996282528, "grad_norm": 5.295013427734375, "learning_rate": 0.00013866171003717472, "loss": 1.0532, "step": 990 }, { "epoch": 0.6195786864931846, "grad_norm": 4.994614124298096, "learning_rate": 0.00013804213135068153, "loss": 1.3122, "step": 1000 }, { "epoch": 0.6195786864931846, "eval_accuracy": 0.6939281288723668, "eval_loss": 1.1940184831619263, "eval_runtime": 106.4084, "eval_samples_per_second": 60.672, "eval_steps_per_second": 7.584, "step": 1000 }, { "epoch": 0.6257744733581165, "grad_norm": 2.611729145050049, "learning_rate": 0.00013742255266418837, "loss": 0.9277, "step": 1010 }, { "epoch": 0.6319702602230484, "grad_norm": 2.6009323596954346, "learning_rate": 0.00013680297397769518, "loss": 1.0508, "step": 1020 }, { "epoch": 0.6381660470879802, "grad_norm": 3.2199230194091797, "learning_rate": 0.00013618339529120199, "loss": 1.11, "step": 1030 }, { "epoch": 0.644361833952912, "grad_norm": 2.762774705886841, "learning_rate": 0.0001355638166047088, "loss": 1.0821, "step": 1040 }, { "epoch": 0.6505576208178439, "grad_norm": 3.8875350952148438, "learning_rate": 0.00013494423791821563, "loss": 1.0406, "step": 1050 }, { "epoch": 0.6567534076827757, "grad_norm": 4.314332008361816, "learning_rate": 0.00013432465923172244, "loss": 1.1308, "step": 1060 }, { "epoch": 0.6629491945477075, "grad_norm": 3.6009325981140137, "learning_rate": 0.00013370508054522925, "loss": 1.1907, "step": 1070 }, { "epoch": 0.6691449814126395, "grad_norm": 5.977869033813477, "learning_rate": 0.00013308550185873606, "loss": 1.1071, "step": 1080 }, { "epoch": 0.6753407682775713, "grad_norm": 5.390667915344238, "learning_rate": 0.0001324659231722429, "loss": 1.0829, "step": 1090 }, { "epoch": 0.6815365551425031, "grad_norm": 6.445249080657959, "learning_rate": 0.0001318463444857497, "loss": 1.0721, "step": 1100 }, { "epoch": 0.6815365551425031, "eval_accuracy": 0.7261462205700124, "eval_loss": 1.0756527185440063, "eval_runtime": 104.6325, "eval_samples_per_second": 61.702, "eval_steps_per_second": 7.713, "step": 1100 }, { "epoch": 0.6877323420074349, "grad_norm": 3.1488468647003174, "learning_rate": 0.0001312267657992565, "loss": 0.9928, "step": 1110 }, { "epoch": 0.6939281288723668, "grad_norm": 3.6752538681030273, "learning_rate": 0.00013060718711276333, "loss": 0.8412, "step": 1120 }, { "epoch": 0.7001239157372986, "grad_norm": 6.186413764953613, "learning_rate": 0.00012998760842627014, "loss": 0.8979, "step": 1130 }, { "epoch": 0.7063197026022305, "grad_norm": 4.457529544830322, "learning_rate": 0.00012936802973977695, "loss": 1.0586, "step": 1140 }, { "epoch": 0.7125154894671624, "grad_norm": 3.7016208171844482, "learning_rate": 0.00012874845105328376, "loss": 0.7632, "step": 1150 }, { "epoch": 0.7187112763320942, "grad_norm": 3.913440227508545, "learning_rate": 0.0001281288723667906, "loss": 0.9905, "step": 1160 }, { "epoch": 0.724907063197026, "grad_norm": 4.720458984375, "learning_rate": 0.0001275092936802974, "loss": 0.8703, "step": 1170 }, { "epoch": 0.7311028500619579, "grad_norm": 4.232792854309082, "learning_rate": 0.00012688971499380422, "loss": 1.2989, "step": 1180 }, { "epoch": 0.7372986369268897, "grad_norm": 5.886707305908203, "learning_rate": 0.00012627013630731103, "loss": 1.1108, "step": 1190 }, { "epoch": 0.7434944237918215, "grad_norm": 3.9497408866882324, "learning_rate": 0.00012565055762081787, "loss": 0.8249, "step": 1200 }, { "epoch": 0.7434944237918215, "eval_accuracy": 0.7575898389095415, "eval_loss": 0.9666171669960022, "eval_runtime": 106.5314, "eval_samples_per_second": 60.602, "eval_steps_per_second": 7.575, "step": 1200 }, { "epoch": 0.7496902106567535, "grad_norm": 5.659217357635498, "learning_rate": 0.00012503097893432468, "loss": 0.8484, "step": 1210 }, { "epoch": 0.7558859975216853, "grad_norm": 4.372608661651611, "learning_rate": 0.00012441140024783149, "loss": 0.9891, "step": 1220 }, { "epoch": 0.7620817843866171, "grad_norm": 7.296494007110596, "learning_rate": 0.0001237918215613383, "loss": 0.919, "step": 1230 }, { "epoch": 0.7682775712515489, "grad_norm": 2.1547601222991943, "learning_rate": 0.0001231722428748451, "loss": 0.8611, "step": 1240 }, { "epoch": 0.7744733581164808, "grad_norm": 4.57709264755249, "learning_rate": 0.00012255266418835192, "loss": 0.893, "step": 1250 }, { "epoch": 0.7806691449814126, "grad_norm": 4.922760009765625, "learning_rate": 0.00012193308550185874, "loss": 0.9478, "step": 1260 }, { "epoch": 0.7868649318463445, "grad_norm": 4.814465045928955, "learning_rate": 0.00012131350681536555, "loss": 0.9416, "step": 1270 }, { "epoch": 0.7930607187112764, "grad_norm": 5.223793029785156, "learning_rate": 0.00012069392812887237, "loss": 0.7468, "step": 1280 }, { "epoch": 0.7992565055762082, "grad_norm": 7.333277702331543, "learning_rate": 0.00012007434944237918, "loss": 0.8581, "step": 1290 }, { "epoch": 0.80545229244114, "grad_norm": 2.8909173011779785, "learning_rate": 0.000119454770755886, "loss": 0.7944, "step": 1300 }, { "epoch": 0.80545229244114, "eval_accuracy": 0.7707558859975217, "eval_loss": 0.9101163744926453, "eval_runtime": 106.3936, "eval_samples_per_second": 60.68, "eval_steps_per_second": 7.585, "step": 1300 }, { "epoch": 0.8116480793060719, "grad_norm": 5.546220302581787, "learning_rate": 0.00011883519206939282, "loss": 1.066, "step": 1310 }, { "epoch": 0.8178438661710037, "grad_norm": 5.582309246063232, "learning_rate": 0.00011821561338289964, "loss": 0.7992, "step": 1320 }, { "epoch": 0.8240396530359355, "grad_norm": 6.598534107208252, "learning_rate": 0.00011759603469640645, "loss": 0.8142, "step": 1330 }, { "epoch": 0.8302354399008675, "grad_norm": 3.366227865219116, "learning_rate": 0.00011697645600991327, "loss": 0.8297, "step": 1340 }, { "epoch": 0.8364312267657993, "grad_norm": 4.621030807495117, "learning_rate": 0.00011635687732342008, "loss": 0.8083, "step": 1350 }, { "epoch": 0.8426270136307311, "grad_norm": 4.285297870635986, "learning_rate": 0.00011573729863692691, "loss": 0.8572, "step": 1360 }, { "epoch": 0.8488228004956629, "grad_norm": 5.127432823181152, "learning_rate": 0.0001151177199504337, "loss": 0.7723, "step": 1370 }, { "epoch": 0.8550185873605948, "grad_norm": 8.046000480651855, "learning_rate": 0.00011449814126394051, "loss": 0.6129, "step": 1380 }, { "epoch": 0.8612143742255266, "grad_norm": 3.8149867057800293, "learning_rate": 0.00011387856257744734, "loss": 0.7718, "step": 1390 }, { "epoch": 0.8674101610904585, "grad_norm": 7.305781364440918, "learning_rate": 0.00011325898389095415, "loss": 0.8032, "step": 1400 }, { "epoch": 0.8674101610904585, "eval_accuracy": 0.7690520446096655, "eval_loss": 0.901136040687561, "eval_runtime": 106.4417, "eval_samples_per_second": 60.653, "eval_steps_per_second": 7.582, "step": 1400 }, { "epoch": 0.8736059479553904, "grad_norm": 5.850032329559326, "learning_rate": 0.00011263940520446097, "loss": 0.9454, "step": 1410 }, { "epoch": 0.8798017348203222, "grad_norm": 5.336400032043457, "learning_rate": 0.00011201982651796778, "loss": 0.7002, "step": 1420 }, { "epoch": 0.885997521685254, "grad_norm": 3.1872918605804443, "learning_rate": 0.0001114002478314746, "loss": 0.8848, "step": 1430 }, { "epoch": 0.8921933085501859, "grad_norm": 5.940222263336182, "learning_rate": 0.00011078066914498142, "loss": 0.8746, "step": 1440 }, { "epoch": 0.8983890954151177, "grad_norm": 4.074731349945068, "learning_rate": 0.00011016109045848824, "loss": 0.9426, "step": 1450 }, { "epoch": 0.9045848822800495, "grad_norm": 4.467647552490234, "learning_rate": 0.00010954151177199505, "loss": 0.7319, "step": 1460 }, { "epoch": 0.9107806691449815, "grad_norm": 4.298549175262451, "learning_rate": 0.00010892193308550187, "loss": 0.7648, "step": 1470 }, { "epoch": 0.9169764560099133, "grad_norm": 5.923393249511719, "learning_rate": 0.00010830235439900868, "loss": 0.66, "step": 1480 }, { "epoch": 0.9231722428748451, "grad_norm": 3.252465009689331, "learning_rate": 0.0001076827757125155, "loss": 0.6732, "step": 1490 }, { "epoch": 0.929368029739777, "grad_norm": 5.450772285461426, "learning_rate": 0.0001070631970260223, "loss": 0.7479, "step": 1500 }, { "epoch": 0.929368029739777, "eval_accuracy": 0.8066914498141264, "eval_loss": 0.7409122586250305, "eval_runtime": 106.9474, "eval_samples_per_second": 60.366, "eval_steps_per_second": 7.546, "step": 1500 }, { "epoch": 0.9355638166047088, "grad_norm": 3.8321099281311035, "learning_rate": 0.00010644361833952911, "loss": 0.6568, "step": 1510 }, { "epoch": 0.9417596034696406, "grad_norm": 2.7507283687591553, "learning_rate": 0.00010582403965303594, "loss": 0.7641, "step": 1520 }, { "epoch": 0.9479553903345725, "grad_norm": 4.787430286407471, "learning_rate": 0.00010520446096654275, "loss": 0.8192, "step": 1530 }, { "epoch": 0.9541511771995044, "grad_norm": 5.063214302062988, "learning_rate": 0.00010458488228004957, "loss": 0.7953, "step": 1540 }, { "epoch": 0.9603469640644362, "grad_norm": 3.0914242267608643, "learning_rate": 0.00010396530359355638, "loss": 0.6188, "step": 1550 }, { "epoch": 0.966542750929368, "grad_norm": 2.4344420433044434, "learning_rate": 0.0001033457249070632, "loss": 0.7707, "step": 1560 }, { "epoch": 0.9727385377942999, "grad_norm": 5.563531398773193, "learning_rate": 0.00010272614622057001, "loss": 0.657, "step": 1570 }, { "epoch": 0.9789343246592317, "grad_norm": 2.2125167846679688, "learning_rate": 0.00010210656753407684, "loss": 0.7362, "step": 1580 }, { "epoch": 0.9851301115241635, "grad_norm": 7.253428936004639, "learning_rate": 0.00010148698884758365, "loss": 0.974, "step": 1590 }, { "epoch": 0.9913258983890955, "grad_norm": 5.762598037719727, "learning_rate": 0.00010086741016109047, "loss": 0.5997, "step": 1600 }, { "epoch": 0.9913258983890955, "eval_accuracy": 0.8110285006195787, "eval_loss": 0.7325805425643921, "eval_runtime": 105.9884, "eval_samples_per_second": 60.912, "eval_steps_per_second": 7.614, "step": 1600 }, { "epoch": 0.9975216852540273, "grad_norm": 7.013967037200928, "learning_rate": 0.00010024783147459728, "loss": 0.7562, "step": 1610 }, { "epoch": 1.003717472118959, "grad_norm": 4.252784252166748, "learning_rate": 9.962825278810409e-05, "loss": 0.5769, "step": 1620 }, { "epoch": 1.009913258983891, "grad_norm": 2.1795663833618164, "learning_rate": 9.900867410161091e-05, "loss": 0.4631, "step": 1630 }, { "epoch": 1.016109045848823, "grad_norm": 2.469095468521118, "learning_rate": 9.838909541511772e-05, "loss": 0.3613, "step": 1640 }, { "epoch": 1.0223048327137547, "grad_norm": 4.4682297706604, "learning_rate": 9.776951672862455e-05, "loss": 0.4184, "step": 1650 }, { "epoch": 1.0285006195786865, "grad_norm": 1.6269049644470215, "learning_rate": 9.714993804213134e-05, "loss": 0.3376, "step": 1660 }, { "epoch": 1.0346964064436184, "grad_norm": 4.2464423179626465, "learning_rate": 9.653035935563817e-05, "loss": 0.4305, "step": 1670 }, { "epoch": 1.0408921933085502, "grad_norm": 3.627943992614746, "learning_rate": 9.591078066914498e-05, "loss": 0.3696, "step": 1680 }, { "epoch": 1.047087980173482, "grad_norm": 2.6817383766174316, "learning_rate": 9.52912019826518e-05, "loss": 0.3897, "step": 1690 }, { "epoch": 1.0532837670384139, "grad_norm": 7.518842697143555, "learning_rate": 9.467162329615861e-05, "loss": 0.5005, "step": 1700 }, { "epoch": 1.0532837670384139, "eval_accuracy": 0.8210966542750929, "eval_loss": 0.6769081950187683, "eval_runtime": 105.6921, "eval_samples_per_second": 61.083, "eval_steps_per_second": 7.635, "step": 1700 }, { "epoch": 1.0594795539033457, "grad_norm": 3.0254740715026855, "learning_rate": 9.405204460966544e-05, "loss": 0.3827, "step": 1710 }, { "epoch": 1.0656753407682775, "grad_norm": 4.391673564910889, "learning_rate": 9.343246592317225e-05, "loss": 0.3535, "step": 1720 }, { "epoch": 1.0718711276332094, "grad_norm": 3.73157000541687, "learning_rate": 9.281288723667906e-05, "loss": 0.3238, "step": 1730 }, { "epoch": 1.0780669144981412, "grad_norm": 2.160573720932007, "learning_rate": 9.219330855018588e-05, "loss": 0.383, "step": 1740 }, { "epoch": 1.084262701363073, "grad_norm": 4.27864408493042, "learning_rate": 9.157372986369269e-05, "loss": 0.3361, "step": 1750 }, { "epoch": 1.090458488228005, "grad_norm": 3.1258535385131836, "learning_rate": 9.095415117719951e-05, "loss": 0.2797, "step": 1760 }, { "epoch": 1.096654275092937, "grad_norm": 3.8895909786224365, "learning_rate": 9.033457249070632e-05, "loss": 0.4367, "step": 1770 }, { "epoch": 1.1028500619578687, "grad_norm": 2.674630880355835, "learning_rate": 8.971499380421315e-05, "loss": 0.3925, "step": 1780 }, { "epoch": 1.1090458488228006, "grad_norm": 7.267265319824219, "learning_rate": 8.909541511771994e-05, "loss": 0.4627, "step": 1790 }, { "epoch": 1.1152416356877324, "grad_norm": 4.650302886962891, "learning_rate": 8.847583643122677e-05, "loss": 0.4107, "step": 1800 }, { "epoch": 1.1152416356877324, "eval_accuracy": 0.837360594795539, "eval_loss": 0.6374781131744385, "eval_runtime": 106.1067, "eval_samples_per_second": 60.844, "eval_steps_per_second": 7.606, "step": 1800 }, { "epoch": 1.1214374225526642, "grad_norm": 2.1244664192199707, "learning_rate": 8.785625774473358e-05, "loss": 0.3463, "step": 1810 }, { "epoch": 1.127633209417596, "grad_norm": 1.8372740745544434, "learning_rate": 8.72366790582404e-05, "loss": 0.3116, "step": 1820 }, { "epoch": 1.1338289962825279, "grad_norm": 3.6969428062438965, "learning_rate": 8.661710037174722e-05, "loss": 0.4069, "step": 1830 }, { "epoch": 1.1400247831474597, "grad_norm": 3.857111930847168, "learning_rate": 8.599752168525403e-05, "loss": 0.4478, "step": 1840 }, { "epoch": 1.1462205700123915, "grad_norm": 2.005557060241699, "learning_rate": 8.537794299876086e-05, "loss": 0.2794, "step": 1850 }, { "epoch": 1.1524163568773234, "grad_norm": 5.883118629455566, "learning_rate": 8.475836431226765e-05, "loss": 0.3732, "step": 1860 }, { "epoch": 1.1586121437422552, "grad_norm": 5.240428924560547, "learning_rate": 8.413878562577448e-05, "loss": 0.3042, "step": 1870 }, { "epoch": 1.164807930607187, "grad_norm": 2.857640027999878, "learning_rate": 8.351920693928129e-05, "loss": 0.3013, "step": 1880 }, { "epoch": 1.1710037174721188, "grad_norm": 5.086670398712158, "learning_rate": 8.289962825278811e-05, "loss": 0.2786, "step": 1890 }, { "epoch": 1.177199504337051, "grad_norm": 4.9353413581848145, "learning_rate": 8.228004956629492e-05, "loss": 0.4596, "step": 1900 }, { "epoch": 1.177199504337051, "eval_accuracy": 0.8303903345724907, "eval_loss": 0.6301799416542053, "eval_runtime": 106.1215, "eval_samples_per_second": 60.836, "eval_steps_per_second": 7.604, "step": 1900 }, { "epoch": 1.1833952912019827, "grad_norm": 3.3738842010498047, "learning_rate": 8.166047087980174e-05, "loss": 0.2521, "step": 1910 }, { "epoch": 1.1895910780669146, "grad_norm": 3.060638427734375, "learning_rate": 8.104089219330855e-05, "loss": 0.3277, "step": 1920 }, { "epoch": 1.1957868649318464, "grad_norm": 3.8044564723968506, "learning_rate": 8.042131350681536e-05, "loss": 0.3403, "step": 1930 }, { "epoch": 1.2019826517967782, "grad_norm": 4.681379795074463, "learning_rate": 7.980173482032219e-05, "loss": 0.3816, "step": 1940 }, { "epoch": 1.20817843866171, "grad_norm": 5.634707450866699, "learning_rate": 7.9182156133829e-05, "loss": 0.3161, "step": 1950 }, { "epoch": 1.2143742255266419, "grad_norm": 2.39424204826355, "learning_rate": 7.856257744733582e-05, "loss": 0.3638, "step": 1960 }, { "epoch": 1.2205700123915737, "grad_norm": 2.7920138835906982, "learning_rate": 7.794299876084263e-05, "loss": 0.2685, "step": 1970 }, { "epoch": 1.2267657992565055, "grad_norm": 6.918692111968994, "learning_rate": 7.732342007434946e-05, "loss": 0.2768, "step": 1980 }, { "epoch": 1.2329615861214374, "grad_norm": 4.783863544464111, "learning_rate": 7.670384138785625e-05, "loss": 0.3513, "step": 1990 }, { "epoch": 1.2391573729863692, "grad_norm": 1.5945993661880493, "learning_rate": 7.608426270136308e-05, "loss": 0.2544, "step": 2000 }, { "epoch": 1.2391573729863692, "eval_accuracy": 0.8399938042131351, "eval_loss": 0.5804997086524963, "eval_runtime": 106.7403, "eval_samples_per_second": 60.483, "eval_steps_per_second": 7.56, "step": 2000 }, { "epoch": 1.2453531598513012, "grad_norm": 0.8452507853507996, "learning_rate": 7.546468401486989e-05, "loss": 0.2581, "step": 2010 }, { "epoch": 1.251548946716233, "grad_norm": 3.4717066287994385, "learning_rate": 7.484510532837671e-05, "loss": 0.2793, "step": 2020 }, { "epoch": 1.257744733581165, "grad_norm": 3.5942156314849854, "learning_rate": 7.422552664188352e-05, "loss": 0.2798, "step": 2030 }, { "epoch": 1.2639405204460967, "grad_norm": 5.311221599578857, "learning_rate": 7.360594795539034e-05, "loss": 0.3501, "step": 2040 }, { "epoch": 1.2701363073110286, "grad_norm": 3.8793325424194336, "learning_rate": 7.298636926889715e-05, "loss": 0.3645, "step": 2050 }, { "epoch": 1.2763320941759604, "grad_norm": 2.781317949295044, "learning_rate": 7.236679058240396e-05, "loss": 0.3294, "step": 2060 }, { "epoch": 1.2825278810408922, "grad_norm": 1.0684071779251099, "learning_rate": 7.174721189591079e-05, "loss": 0.2507, "step": 2070 }, { "epoch": 1.288723667905824, "grad_norm": 1.6585029363632202, "learning_rate": 7.11276332094176e-05, "loss": 0.3451, "step": 2080 }, { "epoch": 1.2949194547707559, "grad_norm": 1.9376587867736816, "learning_rate": 7.050805452292442e-05, "loss": 0.3423, "step": 2090 }, { "epoch": 1.3011152416356877, "grad_norm": 2.892873525619507, "learning_rate": 6.988847583643123e-05, "loss": 0.2983, "step": 2100 }, { "epoch": 1.3011152416356877, "eval_accuracy": 0.8500619578686494, "eval_loss": 0.5480403304100037, "eval_runtime": 107.2113, "eval_samples_per_second": 60.218, "eval_steps_per_second": 7.527, "step": 2100 }, { "epoch": 1.3073110285006195, "grad_norm": 0.8831340670585632, "learning_rate": 6.926889714993805e-05, "loss": 0.3744, "step": 2110 }, { "epoch": 1.3135068153655514, "grad_norm": 5.404819011688232, "learning_rate": 6.864931846344485e-05, "loss": 0.2856, "step": 2120 }, { "epoch": 1.3197026022304832, "grad_norm": 5.324650287628174, "learning_rate": 6.802973977695167e-05, "loss": 0.2055, "step": 2130 }, { "epoch": 1.325898389095415, "grad_norm": 1.5021191835403442, "learning_rate": 6.741016109045848e-05, "loss": 0.3426, "step": 2140 }, { "epoch": 1.3320941759603468, "grad_norm": 3.320554256439209, "learning_rate": 6.679058240396531e-05, "loss": 0.1865, "step": 2150 }, { "epoch": 1.3382899628252787, "grad_norm": 6.635782241821289, "learning_rate": 6.617100371747212e-05, "loss": 0.2956, "step": 2160 }, { "epoch": 1.3444857496902107, "grad_norm": 7.695481300354004, "learning_rate": 6.555142503097894e-05, "loss": 0.2729, "step": 2170 }, { "epoch": 1.3506815365551426, "grad_norm": 2.3640730381011963, "learning_rate": 6.493184634448575e-05, "loss": 0.2946, "step": 2180 }, { "epoch": 1.3568773234200744, "grad_norm": 6.867854595184326, "learning_rate": 6.431226765799256e-05, "loss": 0.269, "step": 2190 }, { "epoch": 1.3630731102850062, "grad_norm": 5.83229923248291, "learning_rate": 6.369268897149939e-05, "loss": 0.3214, "step": 2200 }, { "epoch": 1.3630731102850062, "eval_accuracy": 0.8683395291201983, "eval_loss": 0.5052544474601746, "eval_runtime": 106.9004, "eval_samples_per_second": 60.393, "eval_steps_per_second": 7.549, "step": 2200 }, { "epoch": 1.369268897149938, "grad_norm": 4.513510227203369, "learning_rate": 6.30731102850062e-05, "loss": 0.2435, "step": 2210 }, { "epoch": 1.3754646840148699, "grad_norm": 0.9935147762298584, "learning_rate": 6.245353159851302e-05, "loss": 0.237, "step": 2220 }, { "epoch": 1.3816604708798017, "grad_norm": 5.048427104949951, "learning_rate": 6.183395291201983e-05, "loss": 0.1811, "step": 2230 }, { "epoch": 1.3878562577447335, "grad_norm": 4.882187366485596, "learning_rate": 6.121437422552665e-05, "loss": 0.2591, "step": 2240 }, { "epoch": 1.3940520446096654, "grad_norm": 3.1441776752471924, "learning_rate": 6.0594795539033456e-05, "loss": 0.2503, "step": 2250 }, { "epoch": 1.4002478314745972, "grad_norm": 5.158385753631592, "learning_rate": 5.997521685254027e-05, "loss": 0.3771, "step": 2260 }, { "epoch": 1.4064436183395292, "grad_norm": 4.472280979156494, "learning_rate": 5.935563816604709e-05, "loss": 0.2673, "step": 2270 }, { "epoch": 1.412639405204461, "grad_norm": 3.1100497245788574, "learning_rate": 5.8736059479553906e-05, "loss": 0.3471, "step": 2280 }, { "epoch": 1.418835192069393, "grad_norm": 1.5550055503845215, "learning_rate": 5.811648079306072e-05, "loss": 0.2452, "step": 2290 }, { "epoch": 1.4250309789343247, "grad_norm": 1.051927924156189, "learning_rate": 5.749690210656754e-05, "loss": 0.2384, "step": 2300 }, { "epoch": 1.4250309789343247, "eval_accuracy": 0.8712825278810409, "eval_loss": 0.4928523004055023, "eval_runtime": 106.1123, "eval_samples_per_second": 60.841, "eval_steps_per_second": 7.605, "step": 2300 }, { "epoch": 1.4312267657992566, "grad_norm": 3.616842031478882, "learning_rate": 5.687732342007436e-05, "loss": 0.251, "step": 2310 }, { "epoch": 1.4374225526641884, "grad_norm": 3.94722580909729, "learning_rate": 5.625774473358117e-05, "loss": 0.1996, "step": 2320 }, { "epoch": 1.4436183395291202, "grad_norm": 6.4395222663879395, "learning_rate": 5.5638166047087984e-05, "loss": 0.3224, "step": 2330 }, { "epoch": 1.449814126394052, "grad_norm": 2.801499366760254, "learning_rate": 5.50185873605948e-05, "loss": 0.2018, "step": 2340 }, { "epoch": 1.4560099132589839, "grad_norm": 4.119659900665283, "learning_rate": 5.439900867410162e-05, "loss": 0.2661, "step": 2350 }, { "epoch": 1.4622057001239157, "grad_norm": 8.405607223510742, "learning_rate": 5.3779429987608434e-05, "loss": 0.3412, "step": 2360 }, { "epoch": 1.4684014869888475, "grad_norm": 3.3943393230438232, "learning_rate": 5.315985130111525e-05, "loss": 0.303, "step": 2370 }, { "epoch": 1.4745972738537794, "grad_norm": 3.990785837173462, "learning_rate": 5.2540272614622054e-05, "loss": 0.1961, "step": 2380 }, { "epoch": 1.4807930607187112, "grad_norm": 8.141942977905273, "learning_rate": 5.192069392812887e-05, "loss": 0.2683, "step": 2390 }, { "epoch": 1.486988847583643, "grad_norm": 2.5247440338134766, "learning_rate": 5.130111524163569e-05, "loss": 0.2397, "step": 2400 }, { "epoch": 1.486988847583643, "eval_accuracy": 0.8742255266418835, "eval_loss": 0.4664279520511627, "eval_runtime": 106.5781, "eval_samples_per_second": 60.575, "eval_steps_per_second": 7.572, "step": 2400 }, { "epoch": 1.4931846344485749, "grad_norm": 2.132350206375122, "learning_rate": 5.0681536555142505e-05, "loss": 0.3258, "step": 2410 }, { "epoch": 1.4993804213135067, "grad_norm": 4.359376430511475, "learning_rate": 5.006195786864932e-05, "loss": 0.2781, "step": 2420 }, { "epoch": 1.5055762081784385, "grad_norm": 7.171940326690674, "learning_rate": 4.944237918215613e-05, "loss": 0.2828, "step": 2430 }, { "epoch": 1.5117719950433703, "grad_norm": 1.664962649345398, "learning_rate": 4.882280049566295e-05, "loss": 0.206, "step": 2440 }, { "epoch": 1.5179677819083024, "grad_norm": 7.0570268630981445, "learning_rate": 4.820322180916977e-05, "loss": 0.2365, "step": 2450 }, { "epoch": 1.5241635687732342, "grad_norm": 4.6403279304504395, "learning_rate": 4.758364312267658e-05, "loss": 0.3308, "step": 2460 }, { "epoch": 1.530359355638166, "grad_norm": 0.445726215839386, "learning_rate": 4.69640644361834e-05, "loss": 0.171, "step": 2470 }, { "epoch": 1.5365551425030979, "grad_norm": 6.475937366485596, "learning_rate": 4.6344485749690216e-05, "loss": 0.2941, "step": 2480 }, { "epoch": 1.5427509293680297, "grad_norm": 1.953753113746643, "learning_rate": 4.5724907063197026e-05, "loss": 0.1691, "step": 2490 }, { "epoch": 1.5489467162329615, "grad_norm": 4.187342166900635, "learning_rate": 4.510532837670384e-05, "loss": 0.3448, "step": 2500 }, { "epoch": 1.5489467162329615, "eval_accuracy": 0.8754646840148699, "eval_loss": 0.46897682547569275, "eval_runtime": 107.2096, "eval_samples_per_second": 60.218, "eval_steps_per_second": 7.527, "step": 2500 }, { "epoch": 1.5551425030978936, "grad_norm": 4.653651714324951, "learning_rate": 4.448574969021066e-05, "loss": 0.2035, "step": 2510 }, { "epoch": 1.5613382899628254, "grad_norm": 1.159033179283142, "learning_rate": 4.3866171003717476e-05, "loss": 0.2354, "step": 2520 }, { "epoch": 1.5675340768277573, "grad_norm": 0.841773271560669, "learning_rate": 4.3246592317224286e-05, "loss": 0.1738, "step": 2530 }, { "epoch": 1.573729863692689, "grad_norm": 6.38914155960083, "learning_rate": 4.26270136307311e-05, "loss": 0.2743, "step": 2540 }, { "epoch": 1.579925650557621, "grad_norm": 0.2980528473854065, "learning_rate": 4.200743494423792e-05, "loss": 0.2383, "step": 2550 }, { "epoch": 1.5861214374225527, "grad_norm": 3.9257161617279053, "learning_rate": 4.1387856257744737e-05, "loss": 0.1728, "step": 2560 }, { "epoch": 1.5923172242874846, "grad_norm": 4.1586785316467285, "learning_rate": 4.0768277571251553e-05, "loss": 0.2226, "step": 2570 }, { "epoch": 1.5985130111524164, "grad_norm": 1.591169834136963, "learning_rate": 4.014869888475837e-05, "loss": 0.1998, "step": 2580 }, { "epoch": 1.6047087980173482, "grad_norm": 1.5540215969085693, "learning_rate": 3.952912019826518e-05, "loss": 0.101, "step": 2590 }, { "epoch": 1.61090458488228, "grad_norm": 1.4260759353637695, "learning_rate": 3.8909541511772e-05, "loss": 0.3129, "step": 2600 }, { "epoch": 1.61090458488228, "eval_accuracy": 0.8842936802973977, "eval_loss": 0.4350809156894684, "eval_runtime": 106.4924, "eval_samples_per_second": 60.624, "eval_steps_per_second": 7.578, "step": 2600 }, { "epoch": 1.6171003717472119, "grad_norm": 4.335544586181641, "learning_rate": 3.8289962825278814e-05, "loss": 0.2159, "step": 2610 }, { "epoch": 1.6232961586121437, "grad_norm": 1.1200919151306152, "learning_rate": 3.7670384138785624e-05, "loss": 0.2679, "step": 2620 }, { "epoch": 1.6294919454770755, "grad_norm": 2.773334264755249, "learning_rate": 3.705080545229244e-05, "loss": 0.2927, "step": 2630 }, { "epoch": 1.6356877323420074, "grad_norm": 2.635826349258423, "learning_rate": 3.643122676579926e-05, "loss": 0.2937, "step": 2640 }, { "epoch": 1.6418835192069392, "grad_norm": 2.025951385498047, "learning_rate": 3.5811648079306074e-05, "loss": 0.2392, "step": 2650 }, { "epoch": 1.648079306071871, "grad_norm": 9.39108943939209, "learning_rate": 3.5192069392812884e-05, "loss": 0.3255, "step": 2660 }, { "epoch": 1.6542750929368029, "grad_norm": 1.8526005744934082, "learning_rate": 3.45724907063197e-05, "loss": 0.1944, "step": 2670 }, { "epoch": 1.6604708798017347, "grad_norm": 0.7823792695999146, "learning_rate": 3.3952912019826525e-05, "loss": 0.149, "step": 2680 }, { "epoch": 1.6666666666666665, "grad_norm": 7.365529537200928, "learning_rate": 3.3333333333333335e-05, "loss": 0.312, "step": 2690 }, { "epoch": 1.6728624535315983, "grad_norm": 3.5820822715759277, "learning_rate": 3.271375464684015e-05, "loss": 0.1027, "step": 2700 }, { "epoch": 1.6728624535315983, "eval_accuracy": 0.8846034696406444, "eval_loss": 0.4310809373855591, "eval_runtime": 106.7243, "eval_samples_per_second": 60.492, "eval_steps_per_second": 7.562, "step": 2700 }, { "epoch": 1.6790582403965304, "grad_norm": 0.8703699111938477, "learning_rate": 3.209417596034697e-05, "loss": 0.1744, "step": 2710 }, { "epoch": 1.6852540272614622, "grad_norm": 3.368072986602783, "learning_rate": 3.147459727385378e-05, "loss": 0.2112, "step": 2720 }, { "epoch": 1.691449814126394, "grad_norm": 2.1907970905303955, "learning_rate": 3.0855018587360595e-05, "loss": 0.227, "step": 2730 }, { "epoch": 1.6976456009913259, "grad_norm": 3.725156784057617, "learning_rate": 3.0235439900867412e-05, "loss": 0.184, "step": 2740 }, { "epoch": 1.7038413878562577, "grad_norm": 0.17684808373451233, "learning_rate": 2.9615861214374226e-05, "loss": 0.1559, "step": 2750 }, { "epoch": 1.7100371747211895, "grad_norm": 5.654155731201172, "learning_rate": 2.8996282527881043e-05, "loss": 0.2551, "step": 2760 }, { "epoch": 1.7162329615861216, "grad_norm": 1.2272543907165527, "learning_rate": 2.837670384138786e-05, "loss": 0.2352, "step": 2770 }, { "epoch": 1.7224287484510534, "grad_norm": 0.7282238006591797, "learning_rate": 2.7757125154894676e-05, "loss": 0.2647, "step": 2780 }, { "epoch": 1.7286245353159853, "grad_norm": 2.949235677719116, "learning_rate": 2.7137546468401486e-05, "loss": 0.1656, "step": 2790 }, { "epoch": 1.734820322180917, "grad_norm": 5.691123962402344, "learning_rate": 2.6517967781908303e-05, "loss": 0.2086, "step": 2800 }, { "epoch": 1.734820322180917, "eval_accuracy": 0.8897149938042132, "eval_loss": 0.4087870121002197, "eval_runtime": 107.499, "eval_samples_per_second": 60.056, "eval_steps_per_second": 7.507, "step": 2800 }, { "epoch": 1.741016109045849, "grad_norm": 0.18745893239974976, "learning_rate": 2.589838909541512e-05, "loss": 0.1878, "step": 2810 }, { "epoch": 1.7472118959107807, "grad_norm": 6.888800621032715, "learning_rate": 2.5278810408921933e-05, "loss": 0.1727, "step": 2820 }, { "epoch": 1.7534076827757126, "grad_norm": 1.913245439529419, "learning_rate": 2.465923172242875e-05, "loss": 0.1732, "step": 2830 }, { "epoch": 1.7596034696406444, "grad_norm": 2.651405096054077, "learning_rate": 2.4039653035935564e-05, "loss": 0.1928, "step": 2840 }, { "epoch": 1.7657992565055762, "grad_norm": 1.3924200534820557, "learning_rate": 2.342007434944238e-05, "loss": 0.1397, "step": 2850 }, { "epoch": 1.771995043370508, "grad_norm": 7.028463840484619, "learning_rate": 2.2800495662949194e-05, "loss": 0.2461, "step": 2860 }, { "epoch": 1.77819083023544, "grad_norm": 1.74459707736969, "learning_rate": 2.218091697645601e-05, "loss": 0.137, "step": 2870 }, { "epoch": 1.7843866171003717, "grad_norm": 0.35962000489234924, "learning_rate": 2.1561338289962827e-05, "loss": 0.1767, "step": 2880 }, { "epoch": 1.7905824039653035, "grad_norm": 1.6832956075668335, "learning_rate": 2.094175960346964e-05, "loss": 0.2574, "step": 2890 }, { "epoch": 1.7967781908302354, "grad_norm": 5.7223029136657715, "learning_rate": 2.0322180916976458e-05, "loss": 0.1683, "step": 2900 }, { "epoch": 1.7967781908302354, "eval_accuracy": 0.8918835192069393, "eval_loss": 0.41334882378578186, "eval_runtime": 106.6472, "eval_samples_per_second": 60.536, "eval_steps_per_second": 7.567, "step": 2900 }, { "epoch": 1.8029739776951672, "grad_norm": 6.862188339233398, "learning_rate": 1.970260223048327e-05, "loss": 0.1923, "step": 2910 }, { "epoch": 1.809169764560099, "grad_norm": 1.456938624382019, "learning_rate": 1.9083023543990088e-05, "loss": 0.167, "step": 2920 }, { "epoch": 1.8153655514250309, "grad_norm": 1.2363471984863281, "learning_rate": 1.8463444857496905e-05, "loss": 0.1129, "step": 2930 }, { "epoch": 1.8215613382899627, "grad_norm": 2.139641761779785, "learning_rate": 1.7843866171003718e-05, "loss": 0.2046, "step": 2940 }, { "epoch": 1.8277571251548945, "grad_norm": 7.9193620681762695, "learning_rate": 1.7224287484510535e-05, "loss": 0.1843, "step": 2950 }, { "epoch": 1.8339529120198264, "grad_norm": 0.4252839684486389, "learning_rate": 1.660470879801735e-05, "loss": 0.1719, "step": 2960 }, { "epoch": 1.8401486988847584, "grad_norm": 2.5650665760040283, "learning_rate": 1.5985130111524162e-05, "loss": 0.2294, "step": 2970 }, { "epoch": 1.8463444857496902, "grad_norm": 4.639560222625732, "learning_rate": 1.536555142503098e-05, "loss": 0.163, "step": 2980 }, { "epoch": 1.852540272614622, "grad_norm": 1.6396369934082031, "learning_rate": 1.4745972738537794e-05, "loss": 0.0855, "step": 2990 }, { "epoch": 1.858736059479554, "grad_norm": 3.6492340564727783, "learning_rate": 1.412639405204461e-05, "loss": 0.2767, "step": 3000 }, { "epoch": 1.858736059479554, "eval_accuracy": 0.8963754646840149, "eval_loss": 0.38507798314094543, "eval_runtime": 106.3075, "eval_samples_per_second": 60.73, "eval_steps_per_second": 7.591, "step": 3000 }, { "epoch": 1.8649318463444857, "grad_norm": 2.8184783458709717, "learning_rate": 1.3506815365551426e-05, "loss": 0.1552, "step": 3010 }, { "epoch": 1.8711276332094176, "grad_norm": 0.4257136881351471, "learning_rate": 1.288723667905824e-05, "loss": 0.169, "step": 3020 }, { "epoch": 1.8773234200743496, "grad_norm": 6.326307773590088, "learning_rate": 1.2267657992565058e-05, "loss": 0.1842, "step": 3030 }, { "epoch": 1.8835192069392814, "grad_norm": 0.6836357712745667, "learning_rate": 1.1648079306071871e-05, "loss": 0.1587, "step": 3040 }, { "epoch": 1.8897149938042133, "grad_norm": 6.755892753601074, "learning_rate": 1.1028500619578686e-05, "loss": 0.1954, "step": 3050 }, { "epoch": 1.895910780669145, "grad_norm": 2.74873948097229, "learning_rate": 1.0408921933085503e-05, "loss": 0.1719, "step": 3060 }, { "epoch": 1.902106567534077, "grad_norm": 4.230051517486572, "learning_rate": 9.789343246592318e-06, "loss": 0.1562, "step": 3070 }, { "epoch": 1.9083023543990087, "grad_norm": 5.3604512214660645, "learning_rate": 9.169764560099132e-06, "loss": 0.2365, "step": 3080 }, { "epoch": 1.9144981412639406, "grad_norm": 0.3976893723011017, "learning_rate": 8.550185873605949e-06, "loss": 0.1322, "step": 3090 }, { "epoch": 1.9206939281288724, "grad_norm": 3.6880292892456055, "learning_rate": 7.930607187112764e-06, "loss": 0.1582, "step": 3100 }, { "epoch": 1.9206939281288724, "eval_accuracy": 0.9017967781908303, "eval_loss": 0.3703024089336395, "eval_runtime": 107.4091, "eval_samples_per_second": 60.107, "eval_steps_per_second": 7.513, "step": 3100 }, { "epoch": 1.9268897149938042, "grad_norm": 9.823491096496582, "learning_rate": 7.31102850061958e-06, "loss": 0.2671, "step": 3110 }, { "epoch": 1.933085501858736, "grad_norm": 5.724573135375977, "learning_rate": 6.691449814126394e-06, "loss": 0.2088, "step": 3120 }, { "epoch": 1.939281288723668, "grad_norm": 6.375148773193359, "learning_rate": 6.071871127633209e-06, "loss": 0.1613, "step": 3130 }, { "epoch": 1.9454770755885997, "grad_norm": 3.657437324523926, "learning_rate": 5.452292441140025e-06, "loss": 0.1824, "step": 3140 }, { "epoch": 1.9516728624535316, "grad_norm": 0.8294070959091187, "learning_rate": 4.832713754646841e-06, "loss": 0.1733, "step": 3150 }, { "epoch": 1.9578686493184634, "grad_norm": 2.991377592086792, "learning_rate": 4.213135068153655e-06, "loss": 0.2467, "step": 3160 }, { "epoch": 1.9640644361833952, "grad_norm": 3.435967445373535, "learning_rate": 3.5935563816604712e-06, "loss": 0.1882, "step": 3170 }, { "epoch": 1.970260223048327, "grad_norm": 4.247952938079834, "learning_rate": 2.9739776951672864e-06, "loss": 0.1783, "step": 3180 }, { "epoch": 1.9764560099132589, "grad_norm": 0.5351110100746155, "learning_rate": 2.3543990086741015e-06, "loss": 0.1373, "step": 3190 }, { "epoch": 1.9826517967781907, "grad_norm": 2.167306661605835, "learning_rate": 1.7348203221809173e-06, "loss": 0.1421, "step": 3200 }, { "epoch": 1.9826517967781907, "eval_accuracy": 0.90272614622057, "eval_loss": 0.36434125900268555, "eval_runtime": 107.4144, "eval_samples_per_second": 60.104, "eval_steps_per_second": 7.513, "step": 3200 }, { "epoch": 1.9888475836431225, "grad_norm": 1.093064785003662, "learning_rate": 1.1152416356877324e-06, "loss": 0.1911, "step": 3210 }, { "epoch": 1.9950433705080544, "grad_norm": 3.7282161712646484, "learning_rate": 4.956629491945477e-07, "loss": 0.2041, "step": 3220 }, { "epoch": 2.0, "step": 3228, "total_flos": 4.004423768814723e+18, "train_loss": 0.9726454161726114, "train_runtime": 5145.5507, "train_samples_per_second": 10.036, "train_steps_per_second": 0.627 } ], "logging_steps": 10, "max_steps": 3228, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.004423768814723e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }