{ "best_metric": 0.9130434782608695, "best_model_checkpoint": "CTMAE2_CS_V7_1/checkpoint-13209", "epoch": 49.02, "eval_steps": 500, "global_step": 38850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002574002574002574, "grad_norm": 5.2134599685668945, "learning_rate": 2.5740025740025742e-08, "loss": 0.6904, "step": 10 }, { "epoch": 0.0005148005148005148, "grad_norm": 5.708760738372803, "learning_rate": 5.1480051480051484e-08, "loss": 0.6958, "step": 20 }, { "epoch": 0.0007722007722007722, "grad_norm": 5.368264675140381, "learning_rate": 7.722007722007723e-08, "loss": 0.696, "step": 30 }, { "epoch": 0.0010296010296010295, "grad_norm": 6.501317977905273, "learning_rate": 1.0296010296010297e-07, "loss": 0.7008, "step": 40 }, { "epoch": 0.001287001287001287, "grad_norm": 6.664251804351807, "learning_rate": 1.287001287001287e-07, "loss": 0.6545, "step": 50 }, { "epoch": 0.0015444015444015444, "grad_norm": 5.930378437042236, "learning_rate": 1.5444015444015445e-07, "loss": 0.6797, "step": 60 }, { "epoch": 0.0018018018018018018, "grad_norm": 8.338624954223633, "learning_rate": 1.801801801801802e-07, "loss": 0.6907, "step": 70 }, { "epoch": 0.002059202059202059, "grad_norm": 5.39778995513916, "learning_rate": 2.0592020592020594e-07, "loss": 0.6943, "step": 80 }, { "epoch": 0.0023166023166023165, "grad_norm": 6.76181173324585, "learning_rate": 2.3166023166023168e-07, "loss": 0.6661, "step": 90 }, { "epoch": 0.002574002574002574, "grad_norm": 6.561943054199219, "learning_rate": 2.574002574002574e-07, "loss": 0.6456, "step": 100 }, { "epoch": 0.0028314028314028314, "grad_norm": 9.402544975280762, "learning_rate": 2.8314028314028316e-07, "loss": 0.6365, "step": 110 }, { "epoch": 0.003088803088803089, "grad_norm": 6.95511531829834, "learning_rate": 3.088803088803089e-07, "loss": 0.6886, "step": 120 }, { "epoch": 0.0033462033462033462, "grad_norm": 12.216158866882324, "learning_rate": 3.3462033462033465e-07, "loss": 0.7131, "step": 130 }, { "epoch": 0.0036036036036036037, "grad_norm": 7.642661094665527, "learning_rate": 3.603603603603604e-07, "loss": 0.5935, "step": 140 }, { "epoch": 0.003861003861003861, "grad_norm": 6.88040018081665, "learning_rate": 3.8610038610038613e-07, "loss": 0.6159, "step": 150 }, { "epoch": 0.004118404118404118, "grad_norm": 7.177238941192627, "learning_rate": 4.1184041184041187e-07, "loss": 0.6105, "step": 160 }, { "epoch": 0.004375804375804376, "grad_norm": 7.751633644104004, "learning_rate": 4.375804375804376e-07, "loss": 0.5116, "step": 170 }, { "epoch": 0.004633204633204633, "grad_norm": 5.686230659484863, "learning_rate": 4.6332046332046336e-07, "loss": 0.7037, "step": 180 }, { "epoch": 0.004890604890604891, "grad_norm": 7.677026271820068, "learning_rate": 4.890604890604891e-07, "loss": 0.7483, "step": 190 }, { "epoch": 0.005148005148005148, "grad_norm": 9.224215507507324, "learning_rate": 5.148005148005148e-07, "loss": 0.6478, "step": 200 }, { "epoch": 0.005405405405405406, "grad_norm": 15.320013046264648, "learning_rate": 5.405405405405406e-07, "loss": 0.6244, "step": 210 }, { "epoch": 0.005662805662805663, "grad_norm": 4.737372875213623, "learning_rate": 5.662805662805663e-07, "loss": 0.5908, "step": 220 }, { "epoch": 0.005920205920205921, "grad_norm": 7.637610912322998, "learning_rate": 5.920205920205921e-07, "loss": 0.7024, "step": 230 }, { "epoch": 0.006177606177606178, "grad_norm": 60.20543670654297, "learning_rate": 6.177606177606178e-07, "loss": 0.6378, "step": 240 }, { "epoch": 0.006435006435006435, "grad_norm": 24.914642333984375, "learning_rate": 6.435006435006435e-07, "loss": 0.6039, "step": 250 }, { "epoch": 0.0066924066924066924, "grad_norm": 28.513675689697266, "learning_rate": 6.692406692406693e-07, "loss": 0.7308, "step": 260 }, { "epoch": 0.0069498069498069494, "grad_norm": 6.873077869415283, "learning_rate": 6.94980694980695e-07, "loss": 0.8579, "step": 270 }, { "epoch": 0.007207207207207207, "grad_norm": 7.60503625869751, "learning_rate": 7.207207207207208e-07, "loss": 0.7404, "step": 280 }, { "epoch": 0.007464607464607464, "grad_norm": 25.57830810546875, "learning_rate": 7.464607464607465e-07, "loss": 0.4125, "step": 290 }, { "epoch": 0.007722007722007722, "grad_norm": 5.907918930053711, "learning_rate": 7.722007722007723e-07, "loss": 0.7342, "step": 300 }, { "epoch": 0.00797940797940798, "grad_norm": 6.313519477844238, "learning_rate": 7.97940797940798e-07, "loss": 0.7723, "step": 310 }, { "epoch": 0.008236808236808236, "grad_norm": 30.61429214477539, "learning_rate": 8.236808236808237e-07, "loss": 0.831, "step": 320 }, { "epoch": 0.008494208494208495, "grad_norm": 5.046879768371582, "learning_rate": 8.494208494208495e-07, "loss": 0.5865, "step": 330 }, { "epoch": 0.008751608751608752, "grad_norm": 3.990790367126465, "learning_rate": 8.751608751608752e-07, "loss": 0.994, "step": 340 }, { "epoch": 0.009009009009009009, "grad_norm": 5.086954593658447, "learning_rate": 9.00900900900901e-07, "loss": 0.6809, "step": 350 }, { "epoch": 0.009266409266409266, "grad_norm": 28.828678131103516, "learning_rate": 9.266409266409267e-07, "loss": 0.8819, "step": 360 }, { "epoch": 0.009523809523809525, "grad_norm": 5.523406028747559, "learning_rate": 9.523809523809525e-07, "loss": 0.9103, "step": 370 }, { "epoch": 0.009781209781209782, "grad_norm": 80.04660034179688, "learning_rate": 9.781209781209782e-07, "loss": 1.1861, "step": 380 }, { "epoch": 0.010038610038610039, "grad_norm": 32.42625045776367, "learning_rate": 1.0038610038610038e-06, "loss": 0.5041, "step": 390 }, { "epoch": 0.010296010296010296, "grad_norm": 8.370193481445312, "learning_rate": 1.0296010296010297e-06, "loss": 1.3096, "step": 400 }, { "epoch": 0.010553410553410553, "grad_norm": 97.4281005859375, "learning_rate": 1.0553410553410555e-06, "loss": 0.8997, "step": 410 }, { "epoch": 0.010810810810810811, "grad_norm": 9.947352409362793, "learning_rate": 1.0810810810810812e-06, "loss": 1.1698, "step": 420 }, { "epoch": 0.011068211068211068, "grad_norm": 154.74952697753906, "learning_rate": 1.106821106821107e-06, "loss": 0.7863, "step": 430 }, { "epoch": 0.011325611325611325, "grad_norm": 8.229863166809082, "learning_rate": 1.1325611325611326e-06, "loss": 0.7175, "step": 440 }, { "epoch": 0.011583011583011582, "grad_norm": 7.129251956939697, "learning_rate": 1.1583011583011585e-06, "loss": 0.7535, "step": 450 }, { "epoch": 0.011840411840411841, "grad_norm": 6.870824337005615, "learning_rate": 1.1840411840411841e-06, "loss": 0.2594, "step": 460 }, { "epoch": 0.012097812097812098, "grad_norm": 40.79051971435547, "learning_rate": 1.20978120978121e-06, "loss": 0.7685, "step": 470 }, { "epoch": 0.012355212355212355, "grad_norm": 72.0915298461914, "learning_rate": 1.2355212355212356e-06, "loss": 0.9368, "step": 480 }, { "epoch": 0.012612612612612612, "grad_norm": 93.46292114257812, "learning_rate": 1.2612612612612613e-06, "loss": 0.8962, "step": 490 }, { "epoch": 0.01287001287001287, "grad_norm": 56.67453384399414, "learning_rate": 1.287001287001287e-06, "loss": 1.1966, "step": 500 }, { "epoch": 0.013127413127413128, "grad_norm": 3.2957065105438232, "learning_rate": 1.3127413127413127e-06, "loss": 0.7705, "step": 510 }, { "epoch": 0.013384813384813385, "grad_norm": 0.5807011723518372, "learning_rate": 1.3384813384813386e-06, "loss": 1.4432, "step": 520 }, { "epoch": 0.013642213642213642, "grad_norm": 2.716315746307373, "learning_rate": 1.3642213642213642e-06, "loss": 0.764, "step": 530 }, { "epoch": 0.013899613899613899, "grad_norm": 97.65186309814453, "learning_rate": 1.38996138996139e-06, "loss": 0.8481, "step": 540 }, { "epoch": 0.014157014157014158, "grad_norm": 0.27565112709999084, "learning_rate": 1.415701415701416e-06, "loss": 0.4895, "step": 550 }, { "epoch": 0.014414414414414415, "grad_norm": 0.5415809750556946, "learning_rate": 1.4414414414414416e-06, "loss": 0.8587, "step": 560 }, { "epoch": 0.014671814671814672, "grad_norm": 0.5196000933647156, "learning_rate": 1.4671814671814674e-06, "loss": 1.4641, "step": 570 }, { "epoch": 0.014929214929214929, "grad_norm": 59.97761917114258, "learning_rate": 1.492921492921493e-06, "loss": 2.4882, "step": 580 }, { "epoch": 0.015186615186615187, "grad_norm": 139.7994842529297, "learning_rate": 1.5186615186615189e-06, "loss": 1.3921, "step": 590 }, { "epoch": 0.015444015444015444, "grad_norm": 130.2068328857422, "learning_rate": 1.5444015444015445e-06, "loss": 2.1871, "step": 600 }, { "epoch": 0.015701415701415703, "grad_norm": 149.9769744873047, "learning_rate": 1.5701415701415704e-06, "loss": 1.6264, "step": 610 }, { "epoch": 0.01595881595881596, "grad_norm": 3.2275736331939697, "learning_rate": 1.595881595881596e-06, "loss": 1.2513, "step": 620 }, { "epoch": 0.016216216216216217, "grad_norm": 100.76615905761719, "learning_rate": 1.6216216216216219e-06, "loss": 2.0362, "step": 630 }, { "epoch": 0.016473616473616472, "grad_norm": 0.8999217748641968, "learning_rate": 1.6473616473616475e-06, "loss": 1.23, "step": 640 }, { "epoch": 0.01673101673101673, "grad_norm": 0.44757452607154846, "learning_rate": 1.6731016731016733e-06, "loss": 0.5529, "step": 650 }, { "epoch": 0.01698841698841699, "grad_norm": 0.33385616540908813, "learning_rate": 1.698841698841699e-06, "loss": 0.5156, "step": 660 }, { "epoch": 0.017245817245817245, "grad_norm": 0.26416975259780884, "learning_rate": 1.7245817245817248e-06, "loss": 1.5602, "step": 670 }, { "epoch": 0.017503217503217504, "grad_norm": 0.3138328790664673, "learning_rate": 1.7503217503217505e-06, "loss": 1.4557, "step": 680 }, { "epoch": 0.01776061776061776, "grad_norm": 52.74371337890625, "learning_rate": 1.7760617760617763e-06, "loss": 1.5494, "step": 690 }, { "epoch": 0.018018018018018018, "grad_norm": 50.65621566772461, "learning_rate": 1.801801801801802e-06, "loss": 1.0334, "step": 700 }, { "epoch": 0.018275418275418277, "grad_norm": 0.3436495363712311, "learning_rate": 1.8275418275418278e-06, "loss": 1.9683, "step": 710 }, { "epoch": 0.018532818532818532, "grad_norm": 0.81874018907547, "learning_rate": 1.8532818532818534e-06, "loss": 1.778, "step": 720 }, { "epoch": 0.01879021879021879, "grad_norm": 0.46802252531051636, "learning_rate": 1.8790218790218793e-06, "loss": 1.3622, "step": 730 }, { "epoch": 0.01904761904761905, "grad_norm": 63.82124328613281, "learning_rate": 1.904761904761905e-06, "loss": 2.0994, "step": 740 }, { "epoch": 0.019305019305019305, "grad_norm": 18.252229690551758, "learning_rate": 1.9305019305019305e-06, "loss": 1.2912, "step": 750 }, { "epoch": 0.019562419562419563, "grad_norm": 0.2647660970687866, "learning_rate": 1.9562419562419564e-06, "loss": 1.5519, "step": 760 }, { "epoch": 0.01981981981981982, "grad_norm": 0.2828594148159027, "learning_rate": 1.9819819819819822e-06, "loss": 1.0024, "step": 770 }, { "epoch": 0.02, "eval_accuracy": 0.45652173913043476, "eval_loss": 2.8390824794769287, "eval_runtime": 17.7944, "eval_samples_per_second": 2.585, "eval_steps_per_second": 2.585, "step": 777 }, { "epoch": 1.00007722007722, "grad_norm": 79.82716369628906, "learning_rate": 2.0077220077220077e-06, "loss": 0.7829, "step": 780 }, { "epoch": 1.0003346203346204, "grad_norm": 0.16030679643154144, "learning_rate": 2.0334620334620335e-06, "loss": 1.0346, "step": 790 }, { "epoch": 1.0005920205920207, "grad_norm": 57.7988166809082, "learning_rate": 2.0592020592020594e-06, "loss": 3.4014, "step": 800 }, { "epoch": 1.0008494208494207, "grad_norm": 0.7285271286964417, "learning_rate": 2.084942084942085e-06, "loss": 1.16, "step": 810 }, { "epoch": 1.001106821106821, "grad_norm": 0.50090491771698, "learning_rate": 2.110682110682111e-06, "loss": 1.1828, "step": 820 }, { "epoch": 1.0013642213642213, "grad_norm": 61.112510681152344, "learning_rate": 2.1364221364221365e-06, "loss": 1.7173, "step": 830 }, { "epoch": 1.0016216216216216, "grad_norm": 3.5773725509643555, "learning_rate": 2.1621621621621623e-06, "loss": 1.2737, "step": 840 }, { "epoch": 1.001879021879022, "grad_norm": 0.7043317556381226, "learning_rate": 2.187902187902188e-06, "loss": 1.804, "step": 850 }, { "epoch": 1.0021364221364222, "grad_norm": 0.6512853503227234, "learning_rate": 2.213642213642214e-06, "loss": 2.3513, "step": 860 }, { "epoch": 1.0023938223938225, "grad_norm": 41.23583221435547, "learning_rate": 2.2393822393822394e-06, "loss": 1.359, "step": 870 }, { "epoch": 1.0026512226512228, "grad_norm": 2.1360208988189697, "learning_rate": 2.2651222651222653e-06, "loss": 1.4268, "step": 880 }, { "epoch": 1.0029086229086228, "grad_norm": 0.1973283439874649, "learning_rate": 2.290862290862291e-06, "loss": 0.842, "step": 890 }, { "epoch": 1.003166023166023, "grad_norm": 80.92622375488281, "learning_rate": 2.316602316602317e-06, "loss": 1.9044, "step": 900 }, { "epoch": 1.0034234234234234, "grad_norm": 1.2733196020126343, "learning_rate": 2.3423423423423424e-06, "loss": 3.3045, "step": 910 }, { "epoch": 1.0036808236808237, "grad_norm": 79.40094757080078, "learning_rate": 2.3680823680823683e-06, "loss": 1.1075, "step": 920 }, { "epoch": 1.003938223938224, "grad_norm": 3.9844746589660645, "learning_rate": 2.393822393822394e-06, "loss": 1.1913, "step": 930 }, { "epoch": 1.0041956241956242, "grad_norm": 1.1007922887802124, "learning_rate": 2.41956241956242e-06, "loss": 0.9658, "step": 940 }, { "epoch": 1.0044530244530245, "grad_norm": 0.7749175429344177, "learning_rate": 2.4453024453024454e-06, "loss": 0.9729, "step": 950 }, { "epoch": 1.0047104247104248, "grad_norm": 58.21550369262695, "learning_rate": 2.4710424710424712e-06, "loss": 2.7515, "step": 960 }, { "epoch": 1.0049678249678249, "grad_norm": 0.8086639642715454, "learning_rate": 2.496782496782497e-06, "loss": 1.7731, "step": 970 }, { "epoch": 1.0052252252252252, "grad_norm": 0.3528954088687897, "learning_rate": 2.5225225225225225e-06, "loss": 1.5213, "step": 980 }, { "epoch": 1.0054826254826255, "grad_norm": 1.0501034259796143, "learning_rate": 2.5482625482625484e-06, "loss": 0.8826, "step": 990 }, { "epoch": 1.0057400257400257, "grad_norm": 64.3667984008789, "learning_rate": 2.574002574002574e-06, "loss": 2.4557, "step": 1000 }, { "epoch": 1.005997425997426, "grad_norm": 0.4949258267879486, "learning_rate": 2.5997425997426e-06, "loss": 1.8877, "step": 1010 }, { "epoch": 1.0062548262548263, "grad_norm": 44.34648895263672, "learning_rate": 2.6254826254826255e-06, "loss": 2.103, "step": 1020 }, { "epoch": 1.0065122265122266, "grad_norm": 1.014761209487915, "learning_rate": 2.6512226512226513e-06, "loss": 0.7794, "step": 1030 }, { "epoch": 1.0067696267696267, "grad_norm": 0.24838252365589142, "learning_rate": 2.676962676962677e-06, "loss": 0.0121, "step": 1040 }, { "epoch": 1.007027027027027, "grad_norm": 0.22434230148792267, "learning_rate": 2.702702702702703e-06, "loss": 1.5669, "step": 1050 }, { "epoch": 1.0072844272844272, "grad_norm": 0.5810171961784363, "learning_rate": 2.7284427284427284e-06, "loss": 1.016, "step": 1060 }, { "epoch": 1.0075418275418275, "grad_norm": 68.941650390625, "learning_rate": 2.7541827541827543e-06, "loss": 2.4654, "step": 1070 }, { "epoch": 1.0077992277992278, "grad_norm": 45.62387466430664, "learning_rate": 2.77992277992278e-06, "loss": 2.2378, "step": 1080 }, { "epoch": 1.008056628056628, "grad_norm": 51.002010345458984, "learning_rate": 2.805662805662806e-06, "loss": 1.5515, "step": 1090 }, { "epoch": 1.0083140283140284, "grad_norm": 1.32701575756073, "learning_rate": 2.831402831402832e-06, "loss": 1.6134, "step": 1100 }, { "epoch": 1.0085714285714287, "grad_norm": 3.001387357711792, "learning_rate": 2.8571428571428573e-06, "loss": 2.4355, "step": 1110 }, { "epoch": 1.0088288288288287, "grad_norm": 2.806532144546509, "learning_rate": 2.882882882882883e-06, "loss": 0.5442, "step": 1120 }, { "epoch": 1.009086229086229, "grad_norm": 33.64170455932617, "learning_rate": 2.908622908622909e-06, "loss": 1.8864, "step": 1130 }, { "epoch": 1.0093436293436293, "grad_norm": 46.05695724487305, "learning_rate": 2.934362934362935e-06, "loss": 1.3197, "step": 1140 }, { "epoch": 1.0096010296010296, "grad_norm": 0.5466468334197998, "learning_rate": 2.9601029601029602e-06, "loss": 0.8938, "step": 1150 }, { "epoch": 1.0098584298584299, "grad_norm": 0.12830115854740143, "learning_rate": 2.985842985842986e-06, "loss": 0.3857, "step": 1160 }, { "epoch": 1.0101158301158302, "grad_norm": 0.15344300866127014, "learning_rate": 3.011583011583012e-06, "loss": 0.5299, "step": 1170 }, { "epoch": 1.0103732303732305, "grad_norm": 0.11964306980371475, "learning_rate": 3.0373230373230378e-06, "loss": 0.0049, "step": 1180 }, { "epoch": 1.0106306306306305, "grad_norm": 46.51392364501953, "learning_rate": 3.063063063063063e-06, "loss": 3.7267, "step": 1190 }, { "epoch": 1.0108880308880308, "grad_norm": 1.6805133819580078, "learning_rate": 3.088803088803089e-06, "loss": 1.7001, "step": 1200 }, { "epoch": 1.011145431145431, "grad_norm": 47.035987854003906, "learning_rate": 3.114543114543115e-06, "loss": 1.5832, "step": 1210 }, { "epoch": 1.0114028314028314, "grad_norm": 0.3172648251056671, "learning_rate": 3.1402831402831407e-06, "loss": 1.4229, "step": 1220 }, { "epoch": 1.0116602316602317, "grad_norm": 0.3255147337913513, "learning_rate": 3.166023166023166e-06, "loss": 1.6876, "step": 1230 }, { "epoch": 1.011917631917632, "grad_norm": 1.1769987344741821, "learning_rate": 3.191763191763192e-06, "loss": 2.4022, "step": 1240 }, { "epoch": 1.0121750321750322, "grad_norm": 0.6336804628372192, "learning_rate": 3.217503217503218e-06, "loss": 0.6996, "step": 1250 }, { "epoch": 1.0124324324324325, "grad_norm": 47.805816650390625, "learning_rate": 3.2432432432432437e-06, "loss": 1.6999, "step": 1260 }, { "epoch": 1.0126898326898326, "grad_norm": 0.20486073195934296, "learning_rate": 3.268983268983269e-06, "loss": 0.9389, "step": 1270 }, { "epoch": 1.0129472329472329, "grad_norm": 39.7364387512207, "learning_rate": 3.294723294723295e-06, "loss": 1.3614, "step": 1280 }, { "epoch": 1.0132046332046332, "grad_norm": 0.8014540672302246, "learning_rate": 3.320463320463321e-06, "loss": 1.3432, "step": 1290 }, { "epoch": 1.0134620334620335, "grad_norm": 0.6335378289222717, "learning_rate": 3.3462033462033467e-06, "loss": 1.2641, "step": 1300 }, { "epoch": 1.0137194337194337, "grad_norm": 0.7638051509857178, "learning_rate": 3.371943371943372e-06, "loss": 1.7139, "step": 1310 }, { "epoch": 1.013976833976834, "grad_norm": 49.99736785888672, "learning_rate": 3.397683397683398e-06, "loss": 1.675, "step": 1320 }, { "epoch": 1.0142342342342343, "grad_norm": 59.96963882446289, "learning_rate": 3.423423423423424e-06, "loss": 2.2552, "step": 1330 }, { "epoch": 1.0144916344916346, "grad_norm": 0.4418865144252777, "learning_rate": 3.4491634491634496e-06, "loss": 1.1082, "step": 1340 }, { "epoch": 1.0147490347490347, "grad_norm": 1.3214060068130493, "learning_rate": 3.4749034749034755e-06, "loss": 1.2404, "step": 1350 }, { "epoch": 1.015006435006435, "grad_norm": 0.9876627922058105, "learning_rate": 3.500643500643501e-06, "loss": 1.9432, "step": 1360 }, { "epoch": 1.0152638352638352, "grad_norm": 50.07007598876953, "learning_rate": 3.5263835263835268e-06, "loss": 1.6856, "step": 1370 }, { "epoch": 1.0155212355212355, "grad_norm": 0.20070776343345642, "learning_rate": 3.5521235521235526e-06, "loss": 0.7285, "step": 1380 }, { "epoch": 1.0157786357786358, "grad_norm": 43.47827911376953, "learning_rate": 3.5778635778635785e-06, "loss": 2.393, "step": 1390 }, { "epoch": 1.016036036036036, "grad_norm": 51.43101501464844, "learning_rate": 3.603603603603604e-06, "loss": 1.4497, "step": 1400 }, { "epoch": 1.0162934362934364, "grad_norm": 40.5467529296875, "learning_rate": 3.6293436293436297e-06, "loss": 1.2371, "step": 1410 }, { "epoch": 1.0165508365508364, "grad_norm": 0.8846995830535889, "learning_rate": 3.6550836550836556e-06, "loss": 0.4215, "step": 1420 }, { "epoch": 1.0168082368082367, "grad_norm": 0.22001178562641144, "learning_rate": 3.6808236808236814e-06, "loss": 1.9439, "step": 1430 }, { "epoch": 1.017065637065637, "grad_norm": 156.90634155273438, "learning_rate": 3.706563706563707e-06, "loss": 2.5799, "step": 1440 }, { "epoch": 1.0173230373230373, "grad_norm": 2.8918557167053223, "learning_rate": 3.7323037323037327e-06, "loss": 0.479, "step": 1450 }, { "epoch": 1.0175804375804376, "grad_norm": 49.26845169067383, "learning_rate": 3.7580437580437585e-06, "loss": 1.8774, "step": 1460 }, { "epoch": 1.0178378378378379, "grad_norm": 0.13710249960422516, "learning_rate": 3.7837837837837844e-06, "loss": 0.4813, "step": 1470 }, { "epoch": 1.0180952380952382, "grad_norm": 0.31490692496299744, "learning_rate": 3.80952380952381e-06, "loss": 1.5386, "step": 1480 }, { "epoch": 1.0183526383526385, "grad_norm": 0.6874269247055054, "learning_rate": 3.835263835263835e-06, "loss": 0.4951, "step": 1490 }, { "epoch": 1.0186100386100385, "grad_norm": 0.10348591953516006, "learning_rate": 3.861003861003861e-06, "loss": 1.0332, "step": 1500 }, { "epoch": 1.0188674388674388, "grad_norm": 2.6218488216400146, "learning_rate": 3.886743886743887e-06, "loss": 2.4594, "step": 1510 }, { "epoch": 1.019124839124839, "grad_norm": 0.5357375741004944, "learning_rate": 3.912483912483913e-06, "loss": 0.864, "step": 1520 }, { "epoch": 1.0193822393822394, "grad_norm": 36.62129211425781, "learning_rate": 3.938223938223939e-06, "loss": 2.2063, "step": 1530 }, { "epoch": 1.0196396396396397, "grad_norm": 0.9374207854270935, "learning_rate": 3.9639639639639645e-06, "loss": 0.4099, "step": 1540 }, { "epoch": 1.01989703989704, "grad_norm": 44.0263671875, "learning_rate": 3.98970398970399e-06, "loss": 1.8322, "step": 1550 }, { "epoch": 1.02, "eval_accuracy": 0.45652173913043476, "eval_loss": 2.382408857345581, "eval_runtime": 16.4743, "eval_samples_per_second": 2.792, "eval_steps_per_second": 2.792, "step": 1554 }, { "epoch": 2.00015444015444, "grad_norm": 39.1295166015625, "learning_rate": 4.015444015444015e-06, "loss": 1.4183, "step": 1560 }, { "epoch": 2.0004118404118403, "grad_norm": 0.612335741519928, "learning_rate": 4.041184041184041e-06, "loss": 1.6604, "step": 1570 }, { "epoch": 2.000669240669241, "grad_norm": 0.7441545724868774, "learning_rate": 4.066924066924067e-06, "loss": 0.4295, "step": 1580 }, { "epoch": 2.000926640926641, "grad_norm": 43.12367248535156, "learning_rate": 4.092664092664093e-06, "loss": 1.8161, "step": 1590 }, { "epoch": 2.0011840411840414, "grad_norm": 81.20256805419922, "learning_rate": 4.118404118404119e-06, "loss": 0.9656, "step": 1600 }, { "epoch": 2.0014414414414414, "grad_norm": 0.589714765548706, "learning_rate": 4.1441441441441446e-06, "loss": 2.417, "step": 1610 }, { "epoch": 2.0016988416988415, "grad_norm": 2.8620784282684326, "learning_rate": 4.16988416988417e-06, "loss": 1.7992, "step": 1620 }, { "epoch": 2.001956241956242, "grad_norm": 28.455774307250977, "learning_rate": 4.195624195624196e-06, "loss": 1.4184, "step": 1630 }, { "epoch": 2.002213642213642, "grad_norm": 1.0315122604370117, "learning_rate": 4.221364221364222e-06, "loss": 1.5888, "step": 1640 }, { "epoch": 2.0024710424710426, "grad_norm": 0.40549129247665405, "learning_rate": 4.247104247104247e-06, "loss": 0.3045, "step": 1650 }, { "epoch": 2.0027284427284426, "grad_norm": 51.82611846923828, "learning_rate": 4.272844272844273e-06, "loss": 0.9216, "step": 1660 }, { "epoch": 2.002985842985843, "grad_norm": 55.43302536010742, "learning_rate": 4.298584298584299e-06, "loss": 1.9498, "step": 1670 }, { "epoch": 2.003243243243243, "grad_norm": 117.26138305664062, "learning_rate": 4.324324324324325e-06, "loss": 2.0472, "step": 1680 }, { "epoch": 2.0035006435006437, "grad_norm": 0.6769011616706848, "learning_rate": 4.3500643500643505e-06, "loss": 0.9034, "step": 1690 }, { "epoch": 2.003758043758044, "grad_norm": 2.4109325408935547, "learning_rate": 4.375804375804376e-06, "loss": 0.5953, "step": 1700 }, { "epoch": 2.004015444015444, "grad_norm": 41.65446472167969, "learning_rate": 4.401544401544402e-06, "loss": 2.514, "step": 1710 }, { "epoch": 2.0042728442728444, "grad_norm": 0.39666905999183655, "learning_rate": 4.427284427284428e-06, "loss": 1.5683, "step": 1720 }, { "epoch": 2.0045302445302444, "grad_norm": 1.444018006324768, "learning_rate": 4.453024453024453e-06, "loss": 0.508, "step": 1730 }, { "epoch": 2.004787644787645, "grad_norm": 0.13277959823608398, "learning_rate": 4.478764478764479e-06, "loss": 0.385, "step": 1740 }, { "epoch": 2.005045045045045, "grad_norm": 1.3261194229125977, "learning_rate": 4.504504504504505e-06, "loss": 1.6581, "step": 1750 }, { "epoch": 2.0053024453024455, "grad_norm": 81.54496002197266, "learning_rate": 4.530244530244531e-06, "loss": 3.6516, "step": 1760 }, { "epoch": 2.0055598455598456, "grad_norm": 84.0995864868164, "learning_rate": 4.5559845559845564e-06, "loss": 1.1699, "step": 1770 }, { "epoch": 2.0058172458172456, "grad_norm": 0.608340859413147, "learning_rate": 4.581724581724582e-06, "loss": 0.9071, "step": 1780 }, { "epoch": 2.006074646074646, "grad_norm": 0.5994428992271423, "learning_rate": 4.607464607464608e-06, "loss": 1.7902, "step": 1790 }, { "epoch": 2.006332046332046, "grad_norm": 0.7909229397773743, "learning_rate": 4.633204633204634e-06, "loss": 2.3984, "step": 1800 }, { "epoch": 2.0065894465894467, "grad_norm": 1.1406596899032593, "learning_rate": 4.658944658944659e-06, "loss": 0.9614, "step": 1810 }, { "epoch": 2.0068468468468468, "grad_norm": 0.39752158522605896, "learning_rate": 4.684684684684685e-06, "loss": 0.9369, "step": 1820 }, { "epoch": 2.0071042471042473, "grad_norm": 34.1922607421875, "learning_rate": 4.710424710424711e-06, "loss": 1.55, "step": 1830 }, { "epoch": 2.0073616473616473, "grad_norm": 34.1176643371582, "learning_rate": 4.7361647361647365e-06, "loss": 1.9668, "step": 1840 }, { "epoch": 2.0076190476190474, "grad_norm": 2.4576947689056396, "learning_rate": 4.761904761904762e-06, "loss": 1.8566, "step": 1850 }, { "epoch": 2.007876447876448, "grad_norm": 0.6945159435272217, "learning_rate": 4.787644787644788e-06, "loss": 0.7815, "step": 1860 }, { "epoch": 2.008133848133848, "grad_norm": 0.2894461154937744, "learning_rate": 4.813384813384814e-06, "loss": 1.2931, "step": 1870 }, { "epoch": 2.0083912483912485, "grad_norm": 2.950002431869507, "learning_rate": 4.83912483912484e-06, "loss": 1.9119, "step": 1880 }, { "epoch": 2.0086486486486486, "grad_norm": 1.8771700859069824, "learning_rate": 4.864864864864866e-06, "loss": 1.8962, "step": 1890 }, { "epoch": 2.008906048906049, "grad_norm": 30.816640853881836, "learning_rate": 4.890604890604891e-06, "loss": 0.8887, "step": 1900 }, { "epoch": 2.009163449163449, "grad_norm": 0.21323242783546448, "learning_rate": 4.916344916344917e-06, "loss": 0.9562, "step": 1910 }, { "epoch": 2.0094208494208496, "grad_norm": 0.3352386951446533, "learning_rate": 4.9420849420849425e-06, "loss": 1.3885, "step": 1920 }, { "epoch": 2.0096782496782497, "grad_norm": 0.2881447374820709, "learning_rate": 4.967824967824968e-06, "loss": 0.8771, "step": 1930 }, { "epoch": 2.0099356499356498, "grad_norm": 0.4859983026981354, "learning_rate": 4.993564993564994e-06, "loss": 2.1814, "step": 1940 }, { "epoch": 2.0101930501930503, "grad_norm": 0.3629135489463806, "learning_rate": 5.01930501930502e-06, "loss": 1.2579, "step": 1950 }, { "epoch": 2.0104504504504503, "grad_norm": 32.26611328125, "learning_rate": 5.045045045045045e-06, "loss": 1.1767, "step": 1960 }, { "epoch": 2.010707850707851, "grad_norm": 0.28480803966522217, "learning_rate": 5.070785070785072e-06, "loss": 0.4105, "step": 1970 }, { "epoch": 2.010965250965251, "grad_norm": 0.5369265079498291, "learning_rate": 5.096525096525097e-06, "loss": 1.3313, "step": 1980 }, { "epoch": 2.0112226512226514, "grad_norm": 0.269264817237854, "learning_rate": 5.122265122265123e-06, "loss": 0.9514, "step": 1990 }, { "epoch": 2.0114800514800515, "grad_norm": 0.19434118270874023, "learning_rate": 5.148005148005148e-06, "loss": 0.9533, "step": 2000 }, { "epoch": 2.0117374517374516, "grad_norm": 32.72032165527344, "learning_rate": 5.173745173745173e-06, "loss": 2.4204, "step": 2010 }, { "epoch": 2.011994851994852, "grad_norm": 1.2949801683425903, "learning_rate": 5.1994851994852e-06, "loss": 2.0395, "step": 2020 }, { "epoch": 2.012252252252252, "grad_norm": 32.453060150146484, "learning_rate": 5.225225225225226e-06, "loss": 1.7792, "step": 2030 }, { "epoch": 2.0125096525096526, "grad_norm": 33.515010833740234, "learning_rate": 5.250965250965251e-06, "loss": 1.3822, "step": 2040 }, { "epoch": 2.0127670527670527, "grad_norm": 1.7297950983047485, "learning_rate": 5.276705276705278e-06, "loss": 1.2671, "step": 2050 }, { "epoch": 2.013024453024453, "grad_norm": 44.402427673339844, "learning_rate": 5.302445302445303e-06, "loss": 1.9451, "step": 2060 }, { "epoch": 2.0132818532818533, "grad_norm": 44.23021697998047, "learning_rate": 5.328185328185329e-06, "loss": 1.3187, "step": 2070 }, { "epoch": 2.0135392535392533, "grad_norm": 0.1740846484899521, "learning_rate": 5.353925353925354e-06, "loss": 0.7822, "step": 2080 }, { "epoch": 2.013796653796654, "grad_norm": 42.054649353027344, "learning_rate": 5.379665379665379e-06, "loss": 1.4139, "step": 2090 }, { "epoch": 2.014054054054054, "grad_norm": 0.2841881215572357, "learning_rate": 5.405405405405406e-06, "loss": 1.3738, "step": 2100 }, { "epoch": 2.0143114543114544, "grad_norm": 0.4734525680541992, "learning_rate": 5.431145431145432e-06, "loss": 1.9621, "step": 2110 }, { "epoch": 2.0145688545688545, "grad_norm": 0.4707898795604706, "learning_rate": 5.456885456885457e-06, "loss": 0.6735, "step": 2120 }, { "epoch": 2.014826254826255, "grad_norm": 0.5470343232154846, "learning_rate": 5.4826254826254836e-06, "loss": 0.9792, "step": 2130 }, { "epoch": 2.015083655083655, "grad_norm": 0.32799121737480164, "learning_rate": 5.5083655083655086e-06, "loss": 1.867, "step": 2140 }, { "epoch": 2.015341055341055, "grad_norm": 0.09990080446004868, "learning_rate": 5.534105534105535e-06, "loss": 0.376, "step": 2150 }, { "epoch": 2.0155984555984556, "grad_norm": 39.74260711669922, "learning_rate": 5.55984555984556e-06, "loss": 2.8961, "step": 2160 }, { "epoch": 2.0158558558558557, "grad_norm": 2.921924352645874, "learning_rate": 5.585585585585585e-06, "loss": 0.3304, "step": 2170 }, { "epoch": 2.016113256113256, "grad_norm": 1.005566120147705, "learning_rate": 5.611325611325612e-06, "loss": 0.4781, "step": 2180 }, { "epoch": 2.0163706563706563, "grad_norm": 43.54568099975586, "learning_rate": 5.637065637065637e-06, "loss": 1.4212, "step": 2190 }, { "epoch": 2.0166280566280568, "grad_norm": 0.2699664235115051, "learning_rate": 5.662805662805664e-06, "loss": 2.2329, "step": 2200 }, { "epoch": 2.016885456885457, "grad_norm": 1.8670907020568848, "learning_rate": 5.6885456885456895e-06, "loss": 1.4189, "step": 2210 }, { "epoch": 2.0171428571428573, "grad_norm": 0.6736155152320862, "learning_rate": 5.7142857142857145e-06, "loss": 1.6321, "step": 2220 }, { "epoch": 2.0174002574002574, "grad_norm": 0.12274114787578583, "learning_rate": 5.740025740025741e-06, "loss": 1.6314, "step": 2230 }, { "epoch": 2.0176576576576575, "grad_norm": 1.2040308713912964, "learning_rate": 5.765765765765766e-06, "loss": 0.6115, "step": 2240 }, { "epoch": 2.017915057915058, "grad_norm": 2.5511398315429688, "learning_rate": 5.791505791505791e-06, "loss": 0.7289, "step": 2250 }, { "epoch": 2.018172458172458, "grad_norm": 0.15484865009784698, "learning_rate": 5.817245817245818e-06, "loss": 1.058, "step": 2260 }, { "epoch": 2.0184298584298586, "grad_norm": 40.67464828491211, "learning_rate": 5.842985842985843e-06, "loss": 1.3452, "step": 2270 }, { "epoch": 2.0186872586872586, "grad_norm": 0.10303802043199539, "learning_rate": 5.86872586872587e-06, "loss": 1.1189, "step": 2280 }, { "epoch": 2.018944658944659, "grad_norm": 1.1726830005645752, "learning_rate": 5.894465894465895e-06, "loss": 2.4109, "step": 2290 }, { "epoch": 2.019202059202059, "grad_norm": 0.34925177693367004, "learning_rate": 5.9202059202059204e-06, "loss": 0.6467, "step": 2300 }, { "epoch": 2.0194594594594593, "grad_norm": 36.52827072143555, "learning_rate": 5.945945945945947e-06, "loss": 1.2837, "step": 2310 }, { "epoch": 2.0197168597168598, "grad_norm": 58.13482666015625, "learning_rate": 5.971685971685972e-06, "loss": 2.3565, "step": 2320 }, { "epoch": 2.01997425997426, "grad_norm": 0.45640629529953003, "learning_rate": 5.997425997425997e-06, "loss": 0.8493, "step": 2330 }, { "epoch": 2.02, "eval_accuracy": 0.45652173913043476, "eval_loss": 1.1137661933898926, "eval_runtime": 16.6817, "eval_samples_per_second": 2.758, "eval_steps_per_second": 2.758, "step": 2331 }, { "epoch": 3.0002316602316603, "grad_norm": 33.54939651489258, "learning_rate": 6.023166023166024e-06, "loss": 0.4011, "step": 2340 }, { "epoch": 3.0004890604890604, "grad_norm": 3.8214738368988037, "learning_rate": 6.048906048906049e-06, "loss": 1.5293, "step": 2350 }, { "epoch": 3.000746460746461, "grad_norm": 4.559745788574219, "learning_rate": 6.0746460746460755e-06, "loss": 0.6424, "step": 2360 }, { "epoch": 3.001003861003861, "grad_norm": 0.11879268288612366, "learning_rate": 6.1003861003861005e-06, "loss": 0.7711, "step": 2370 }, { "epoch": 3.0012612612612615, "grad_norm": 62.36249542236328, "learning_rate": 6.126126126126126e-06, "loss": 1.7833, "step": 2380 }, { "epoch": 3.0015186615186615, "grad_norm": 15.011298179626465, "learning_rate": 6.151866151866153e-06, "loss": 0.8435, "step": 2390 }, { "epoch": 3.0017760617760616, "grad_norm": 21.581642150878906, "learning_rate": 6.177606177606178e-06, "loss": 0.5523, "step": 2400 }, { "epoch": 3.002033462033462, "grad_norm": 29.18716049194336, "learning_rate": 6.203346203346203e-06, "loss": 1.2024, "step": 2410 }, { "epoch": 3.002290862290862, "grad_norm": 0.08250798285007477, "learning_rate": 6.22908622908623e-06, "loss": 2.0342, "step": 2420 }, { "epoch": 3.0025482625482627, "grad_norm": 0.7816446423530579, "learning_rate": 6.254826254826255e-06, "loss": 1.2861, "step": 2430 }, { "epoch": 3.0028056628056627, "grad_norm": 70.71495056152344, "learning_rate": 6.2805662805662815e-06, "loss": 1.8372, "step": 2440 }, { "epoch": 3.0030630630630633, "grad_norm": 57.512603759765625, "learning_rate": 6.3063063063063065e-06, "loss": 1.9071, "step": 2450 }, { "epoch": 3.0033204633204633, "grad_norm": 1.428179144859314, "learning_rate": 6.332046332046332e-06, "loss": 1.127, "step": 2460 }, { "epoch": 3.0035778635778634, "grad_norm": 124.6517333984375, "learning_rate": 6.357786357786358e-06, "loss": 1.237, "step": 2470 }, { "epoch": 3.003835263835264, "grad_norm": 2.761348247528076, "learning_rate": 6.383526383526384e-06, "loss": 0.9261, "step": 2480 }, { "epoch": 3.004092664092664, "grad_norm": 0.9238646626472473, "learning_rate": 6.409266409266411e-06, "loss": 1.2299, "step": 2490 }, { "epoch": 3.0043500643500645, "grad_norm": 128.46592712402344, "learning_rate": 6.435006435006436e-06, "loss": 1.2574, "step": 2500 }, { "epoch": 3.0046074646074645, "grad_norm": 0.5994557738304138, "learning_rate": 6.460746460746461e-06, "loss": 1.0424, "step": 2510 }, { "epoch": 3.004864864864865, "grad_norm": 0.7990912795066833, "learning_rate": 6.486486486486487e-06, "loss": 1.3062, "step": 2520 }, { "epoch": 3.005122265122265, "grad_norm": 48.199005126953125, "learning_rate": 6.512226512226512e-06, "loss": 2.3465, "step": 2530 }, { "epoch": 3.005379665379665, "grad_norm": 1.4259648323059082, "learning_rate": 6.537966537966538e-06, "loss": 1.165, "step": 2540 }, { "epoch": 3.0056370656370657, "grad_norm": 0.08506966382265091, "learning_rate": 6.563706563706564e-06, "loss": 0.0076, "step": 2550 }, { "epoch": 3.0058944658944657, "grad_norm": 0.06992141902446747, "learning_rate": 6.58944658944659e-06, "loss": 1.07, "step": 2560 }, { "epoch": 3.0061518661518662, "grad_norm": 0.048462025821208954, "learning_rate": 6.615186615186616e-06, "loss": 0.9731, "step": 2570 }, { "epoch": 3.0064092664092663, "grad_norm": 0.43919050693511963, "learning_rate": 6.640926640926642e-06, "loss": 1.4364, "step": 2580 }, { "epoch": 3.006666666666667, "grad_norm": 0.037599463015794754, "learning_rate": 6.666666666666667e-06, "loss": 0.698, "step": 2590 }, { "epoch": 3.006924066924067, "grad_norm": 0.03720109537243843, "learning_rate": 6.692406692406693e-06, "loss": 0.0058, "step": 2600 }, { "epoch": 3.0071814671814674, "grad_norm": 0.0219118595123291, "learning_rate": 6.718146718146718e-06, "loss": 0.5981, "step": 2610 }, { "epoch": 3.0074388674388675, "grad_norm": 46.0898551940918, "learning_rate": 6.743886743886744e-06, "loss": 1.8813, "step": 2620 }, { "epoch": 3.0076962676962675, "grad_norm": 84.8902359008789, "learning_rate": 6.76962676962677e-06, "loss": 2.6947, "step": 2630 }, { "epoch": 3.007953667953668, "grad_norm": 0.8112554550170898, "learning_rate": 6.795366795366796e-06, "loss": 1.4373, "step": 2640 }, { "epoch": 3.008211068211068, "grad_norm": 4.990564823150635, "learning_rate": 6.821106821106822e-06, "loss": 1.9307, "step": 2650 }, { "epoch": 3.0084684684684686, "grad_norm": 25.768632888793945, "learning_rate": 6.846846846846848e-06, "loss": 1.3059, "step": 2660 }, { "epoch": 3.0087258687258687, "grad_norm": 54.58412170410156, "learning_rate": 6.872586872586873e-06, "loss": 0.9245, "step": 2670 }, { "epoch": 3.008983268983269, "grad_norm": 47.7413330078125, "learning_rate": 6.898326898326899e-06, "loss": 1.2352, "step": 2680 }, { "epoch": 3.0092406692406692, "grad_norm": 1.7431989908218384, "learning_rate": 6.924066924066924e-06, "loss": 1.2804, "step": 2690 }, { "epoch": 3.0094980694980693, "grad_norm": 0.4777579605579376, "learning_rate": 6.949806949806951e-06, "loss": 0.7333, "step": 2700 }, { "epoch": 3.00975546975547, "grad_norm": 56.5815544128418, "learning_rate": 6.975546975546976e-06, "loss": 1.6311, "step": 2710 }, { "epoch": 3.01001287001287, "grad_norm": 84.84012603759766, "learning_rate": 7.001287001287002e-06, "loss": 3.0436, "step": 2720 }, { "epoch": 3.0102702702702704, "grad_norm": 2.14959716796875, "learning_rate": 7.027027027027028e-06, "loss": 0.6488, "step": 2730 }, { "epoch": 3.0105276705276705, "grad_norm": 0.05591985583305359, "learning_rate": 7.0527670527670535e-06, "loss": 0.8272, "step": 2740 }, { "epoch": 3.010785070785071, "grad_norm": 0.3322344422340393, "learning_rate": 7.0785070785070785e-06, "loss": 0.5012, "step": 2750 }, { "epoch": 3.011042471042471, "grad_norm": 0.042405061423778534, "learning_rate": 7.104247104247105e-06, "loss": 1.7783, "step": 2760 }, { "epoch": 3.011299871299871, "grad_norm": 0.10109010338783264, "learning_rate": 7.12998712998713e-06, "loss": 1.5399, "step": 2770 }, { "epoch": 3.0115572715572716, "grad_norm": 1.0357915163040161, "learning_rate": 7.155727155727157e-06, "loss": 1.998, "step": 2780 }, { "epoch": 3.0118146718146717, "grad_norm": 3.6725423336029053, "learning_rate": 7.181467181467182e-06, "loss": 1.5494, "step": 2790 }, { "epoch": 3.012072072072072, "grad_norm": 0.21130582690238953, "learning_rate": 7.207207207207208e-06, "loss": 0.8854, "step": 2800 }, { "epoch": 3.0123294723294722, "grad_norm": 80.49689483642578, "learning_rate": 7.232947232947234e-06, "loss": 0.7072, "step": 2810 }, { "epoch": 3.0125868725868727, "grad_norm": 0.08989892154932022, "learning_rate": 7.2586872586872595e-06, "loss": 1.1194, "step": 2820 }, { "epoch": 3.012844272844273, "grad_norm": 2.3026134967803955, "learning_rate": 7.2844272844272845e-06, "loss": 1.0552, "step": 2830 }, { "epoch": 3.0131016731016733, "grad_norm": 26.29865264892578, "learning_rate": 7.310167310167311e-06, "loss": 0.5594, "step": 2840 }, { "epoch": 3.0133590733590734, "grad_norm": 22.02301597595215, "learning_rate": 7.335907335907336e-06, "loss": 0.4249, "step": 2850 }, { "epoch": 3.0136164736164734, "grad_norm": 118.84080505371094, "learning_rate": 7.361647361647363e-06, "loss": 0.9131, "step": 2860 }, { "epoch": 3.013873873873874, "grad_norm": 109.1220474243164, "learning_rate": 7.387387387387388e-06, "loss": 1.1904, "step": 2870 }, { "epoch": 3.014131274131274, "grad_norm": 9.534757614135742, "learning_rate": 7.413127413127414e-06, "loss": 1.3307, "step": 2880 }, { "epoch": 3.0143886743886745, "grad_norm": 108.12349700927734, "learning_rate": 7.4388674388674395e-06, "loss": 0.3219, "step": 2890 }, { "epoch": 3.0146460746460746, "grad_norm": 3.933692693710327, "learning_rate": 7.464607464607465e-06, "loss": 0.9342, "step": 2900 }, { "epoch": 3.014903474903475, "grad_norm": 164.84881591796875, "learning_rate": 7.49034749034749e-06, "loss": 1.1618, "step": 2910 }, { "epoch": 3.015160875160875, "grad_norm": 0.16784685850143433, "learning_rate": 7.516087516087517e-06, "loss": 0.9138, "step": 2920 }, { "epoch": 3.0154182754182752, "grad_norm": 0.22646842896938324, "learning_rate": 7.541827541827542e-06, "loss": 0.6134, "step": 2930 }, { "epoch": 3.0156756756756757, "grad_norm": 18.39739227294922, "learning_rate": 7.567567567567569e-06, "loss": 0.2646, "step": 2940 }, { "epoch": 3.015933075933076, "grad_norm": 0.6532003283500671, "learning_rate": 7.593307593307594e-06, "loss": 0.7952, "step": 2950 }, { "epoch": 3.0161904761904763, "grad_norm": 1.6719454526901245, "learning_rate": 7.61904761904762e-06, "loss": 2.2333, "step": 2960 }, { "epoch": 3.0164478764478764, "grad_norm": 0.13571570813655853, "learning_rate": 7.644787644787645e-06, "loss": 1.1252, "step": 2970 }, { "epoch": 3.016705276705277, "grad_norm": 0.48596158623695374, "learning_rate": 7.67052767052767e-06, "loss": 1.1305, "step": 2980 }, { "epoch": 3.016962676962677, "grad_norm": 2.5286593437194824, "learning_rate": 7.696267696267697e-06, "loss": 1.0435, "step": 2990 }, { "epoch": 3.017220077220077, "grad_norm": 40.78852081298828, "learning_rate": 7.722007722007722e-06, "loss": 1.9916, "step": 3000 }, { "epoch": 3.0174774774774775, "grad_norm": 0.17999140918254852, "learning_rate": 7.747747747747749e-06, "loss": 0.9862, "step": 3010 }, { "epoch": 3.0177348777348776, "grad_norm": 2.1499030590057373, "learning_rate": 7.773487773487774e-06, "loss": 1.7006, "step": 3020 }, { "epoch": 3.017992277992278, "grad_norm": 0.820077121257782, "learning_rate": 7.7992277992278e-06, "loss": 1.6318, "step": 3030 }, { "epoch": 3.018249678249678, "grad_norm": 0.7950751781463623, "learning_rate": 7.824967824967826e-06, "loss": 1.7085, "step": 3040 }, { "epoch": 3.0185070785070787, "grad_norm": 65.81083679199219, "learning_rate": 7.850707850707852e-06, "loss": 0.9623, "step": 3050 }, { "epoch": 3.0187644787644787, "grad_norm": 0.8133057355880737, "learning_rate": 7.876447876447877e-06, "loss": 2.0652, "step": 3060 }, { "epoch": 3.0190218790218792, "grad_norm": 0.6378155946731567, "learning_rate": 7.902187902187904e-06, "loss": 0.3423, "step": 3070 }, { "epoch": 3.0192792792792793, "grad_norm": 1.3878402709960938, "learning_rate": 7.927927927927929e-06, "loss": 0.751, "step": 3080 }, { "epoch": 3.0195366795366794, "grad_norm": 0.09455589950084686, "learning_rate": 7.953667953667954e-06, "loss": 2.3321, "step": 3090 }, { "epoch": 3.01979407979408, "grad_norm": 0.12332921475172043, "learning_rate": 7.97940797940798e-06, "loss": 0.4768, "step": 3100 }, { "epoch": 3.02, "eval_accuracy": 0.5, "eval_loss": 1.7404745817184448, "eval_runtime": 15.4534, "eval_samples_per_second": 2.977, "eval_steps_per_second": 2.977, "step": 3108 }, { "epoch": 4.00005148005148, "grad_norm": 0.7716472744941711, "learning_rate": 8.005148005148006e-06, "loss": 0.7142, "step": 3110 }, { "epoch": 4.00030888030888, "grad_norm": 136.4551239013672, "learning_rate": 8.03088803088803e-06, "loss": 1.2605, "step": 3120 }, { "epoch": 4.0005662805662805, "grad_norm": 0.08112074434757233, "learning_rate": 8.056628056628057e-06, "loss": 0.3466, "step": 3130 }, { "epoch": 4.000823680823681, "grad_norm": 0.0227906946092844, "learning_rate": 8.082368082368082e-06, "loss": 0.0025, "step": 3140 }, { "epoch": 4.0010810810810815, "grad_norm": 0.3076695203781128, "learning_rate": 8.108108108108109e-06, "loss": 2.6437, "step": 3150 }, { "epoch": 4.001338481338482, "grad_norm": 31.70879554748535, "learning_rate": 8.133848133848134e-06, "loss": 1.7806, "step": 3160 }, { "epoch": 4.001595881595882, "grad_norm": 0.3824242949485779, "learning_rate": 8.159588159588159e-06, "loss": 0.9026, "step": 3170 }, { "epoch": 4.001853281853282, "grad_norm": 1.8017970323562622, "learning_rate": 8.185328185328186e-06, "loss": 1.5216, "step": 3180 }, { "epoch": 4.002110682110682, "grad_norm": 0.25629478693008423, "learning_rate": 8.211068211068212e-06, "loss": 0.3092, "step": 3190 }, { "epoch": 4.002368082368083, "grad_norm": 0.0877305120229721, "learning_rate": 8.236808236808237e-06, "loss": 1.7358, "step": 3200 }, { "epoch": 4.002625482625483, "grad_norm": 87.11286926269531, "learning_rate": 8.262548262548264e-06, "loss": 1.9197, "step": 3210 }, { "epoch": 4.002882882882883, "grad_norm": 52.01739501953125, "learning_rate": 8.288288288288289e-06, "loss": 1.8059, "step": 3220 }, { "epoch": 4.003140283140283, "grad_norm": 1.063254952430725, "learning_rate": 8.314028314028316e-06, "loss": 0.2557, "step": 3230 }, { "epoch": 4.003397683397683, "grad_norm": 33.61056137084961, "learning_rate": 8.33976833976834e-06, "loss": 2.4951, "step": 3240 }, { "epoch": 4.003655083655084, "grad_norm": 40.32854461669922, "learning_rate": 8.365508365508366e-06, "loss": 1.8026, "step": 3250 }, { "epoch": 4.003912483912484, "grad_norm": 1.654442548751831, "learning_rate": 8.391248391248393e-06, "loss": 0.595, "step": 3260 }, { "epoch": 4.004169884169884, "grad_norm": 7.549476146697998, "learning_rate": 8.416988416988418e-06, "loss": 0.9348, "step": 3270 }, { "epoch": 4.004427284427284, "grad_norm": 0.09671440720558167, "learning_rate": 8.442728442728444e-06, "loss": 0.4664, "step": 3280 }, { "epoch": 4.004684684684685, "grad_norm": 0.10881602019071579, "learning_rate": 8.46846846846847e-06, "loss": 1.231, "step": 3290 }, { "epoch": 4.004942084942085, "grad_norm": 0.24008144438266754, "learning_rate": 8.494208494208494e-06, "loss": 0.7618, "step": 3300 }, { "epoch": 4.005199485199485, "grad_norm": 42.40687561035156, "learning_rate": 8.519948519948521e-06, "loss": 2.8717, "step": 3310 }, { "epoch": 4.005456885456885, "grad_norm": 59.698265075683594, "learning_rate": 8.545688545688546e-06, "loss": 1.3768, "step": 3320 }, { "epoch": 4.005714285714285, "grad_norm": 30.52323341369629, "learning_rate": 8.571428571428571e-06, "loss": 0.449, "step": 3330 }, { "epoch": 4.005971685971686, "grad_norm": 0.7098885774612427, "learning_rate": 8.597168597168598e-06, "loss": 0.3891, "step": 3340 }, { "epoch": 4.006229086229086, "grad_norm": 0.10432929545640945, "learning_rate": 8.622908622908623e-06, "loss": 1.038, "step": 3350 }, { "epoch": 4.006486486486486, "grad_norm": 35.44326400756836, "learning_rate": 8.64864864864865e-06, "loss": 2.8441, "step": 3360 }, { "epoch": 4.0067438867438865, "grad_norm": 0.09243518859148026, "learning_rate": 8.674388674388674e-06, "loss": 0.5229, "step": 3370 }, { "epoch": 4.007001287001287, "grad_norm": 59.27909469604492, "learning_rate": 8.700128700128701e-06, "loss": 1.4812, "step": 3380 }, { "epoch": 4.0072586872586875, "grad_norm": 0.09579314291477203, "learning_rate": 8.725868725868728e-06, "loss": 0.9448, "step": 3390 }, { "epoch": 4.007516087516088, "grad_norm": 78.7119140625, "learning_rate": 8.751608751608753e-06, "loss": 1.3743, "step": 3400 }, { "epoch": 4.007773487773488, "grad_norm": 17.389902114868164, "learning_rate": 8.777348777348778e-06, "loss": 0.3665, "step": 3410 }, { "epoch": 4.008030888030888, "grad_norm": 0.03131213039159775, "learning_rate": 8.803088803088804e-06, "loss": 0.95, "step": 3420 }, { "epoch": 4.008288288288289, "grad_norm": 0.07040321081876755, "learning_rate": 8.82882882882883e-06, "loss": 1.618, "step": 3430 }, { "epoch": 4.008545688545689, "grad_norm": 0.0936579629778862, "learning_rate": 8.854568854568856e-06, "loss": 1.8872, "step": 3440 }, { "epoch": 4.008803088803089, "grad_norm": 0.3549942970275879, "learning_rate": 8.880308880308881e-06, "loss": 0.8224, "step": 3450 }, { "epoch": 4.009060489060489, "grad_norm": 51.08911895751953, "learning_rate": 8.906048906048906e-06, "loss": 0.9038, "step": 3460 }, { "epoch": 4.009317889317889, "grad_norm": 96.68467712402344, "learning_rate": 8.931788931788933e-06, "loss": 1.0418, "step": 3470 }, { "epoch": 4.00957528957529, "grad_norm": 2.3784525394439697, "learning_rate": 8.957528957528958e-06, "loss": 0.4836, "step": 3480 }, { "epoch": 4.00983268983269, "grad_norm": 0.043230555951595306, "learning_rate": 8.983268983268984e-06, "loss": 2.1446, "step": 3490 }, { "epoch": 4.01009009009009, "grad_norm": 0.10071835666894913, "learning_rate": 9.00900900900901e-06, "loss": 0.8607, "step": 3500 }, { "epoch": 4.01034749034749, "grad_norm": 151.3938751220703, "learning_rate": 9.034749034749034e-06, "loss": 1.0037, "step": 3510 }, { "epoch": 4.010604890604891, "grad_norm": 1.0010333061218262, "learning_rate": 9.060489060489061e-06, "loss": 1.6101, "step": 3520 }, { "epoch": 4.010862290862291, "grad_norm": 139.9682159423828, "learning_rate": 9.086229086229086e-06, "loss": 0.8691, "step": 3530 }, { "epoch": 4.011119691119691, "grad_norm": 50.917869567871094, "learning_rate": 9.111969111969113e-06, "loss": 1.0019, "step": 3540 }, { "epoch": 4.011377091377091, "grad_norm": 31.801929473876953, "learning_rate": 9.137709137709138e-06, "loss": 1.0467, "step": 3550 }, { "epoch": 4.011634491634491, "grad_norm": 1.3450016975402832, "learning_rate": 9.163449163449165e-06, "loss": 0.5833, "step": 3560 }, { "epoch": 4.011891891891892, "grad_norm": 0.3178759217262268, "learning_rate": 9.189189189189191e-06, "loss": 0.2232, "step": 3570 }, { "epoch": 4.012149292149292, "grad_norm": 85.76224517822266, "learning_rate": 9.214929214929216e-06, "loss": 0.9149, "step": 3580 }, { "epoch": 4.012406692406692, "grad_norm": 0.0713811069726944, "learning_rate": 9.240669240669241e-06, "loss": 1.2048, "step": 3590 }, { "epoch": 4.012664092664092, "grad_norm": 261.92388916015625, "learning_rate": 9.266409266409268e-06, "loss": 0.9989, "step": 3600 }, { "epoch": 4.012921492921493, "grad_norm": 0.05194203555583954, "learning_rate": 9.292149292149293e-06, "loss": 1.0579, "step": 3610 }, { "epoch": 4.013178893178893, "grad_norm": 0.05077645182609558, "learning_rate": 9.317889317889318e-06, "loss": 0.6101, "step": 3620 }, { "epoch": 4.0134362934362935, "grad_norm": 0.11396818608045578, "learning_rate": 9.343629343629345e-06, "loss": 2.4917, "step": 3630 }, { "epoch": 4.0136936936936936, "grad_norm": 66.50660705566406, "learning_rate": 9.36936936936937e-06, "loss": 0.9722, "step": 3640 }, { "epoch": 4.013951093951094, "grad_norm": 91.65625762939453, "learning_rate": 9.395109395109396e-06, "loss": 1.8263, "step": 3650 }, { "epoch": 4.014208494208495, "grad_norm": 0.4844493567943573, "learning_rate": 9.420849420849421e-06, "loss": 1.0256, "step": 3660 }, { "epoch": 4.014465894465895, "grad_norm": 0.10221810638904572, "learning_rate": 9.446589446589446e-06, "loss": 0.8021, "step": 3670 }, { "epoch": 4.014723294723295, "grad_norm": 133.58883666992188, "learning_rate": 9.472329472329473e-06, "loss": 1.4417, "step": 3680 }, { "epoch": 4.014980694980695, "grad_norm": 105.06199645996094, "learning_rate": 9.498069498069498e-06, "loss": 1.2516, "step": 3690 }, { "epoch": 4.015238095238095, "grad_norm": 0.04352974891662598, "learning_rate": 9.523809523809525e-06, "loss": 0.9863, "step": 3700 }, { "epoch": 4.015495495495496, "grad_norm": 0.12624229490756989, "learning_rate": 9.54954954954955e-06, "loss": 0.5863, "step": 3710 }, { "epoch": 4.015752895752896, "grad_norm": 0.05648575723171234, "learning_rate": 9.575289575289576e-06, "loss": 1.0716, "step": 3720 }, { "epoch": 4.016010296010296, "grad_norm": 1.002746820449829, "learning_rate": 9.601029601029601e-06, "loss": 1.0243, "step": 3730 }, { "epoch": 4.016267696267696, "grad_norm": 0.12015024572610855, "learning_rate": 9.626769626769628e-06, "loss": 1.0932, "step": 3740 }, { "epoch": 4.016525096525097, "grad_norm": 86.13858032226562, "learning_rate": 9.652509652509653e-06, "loss": 1.452, "step": 3750 }, { "epoch": 4.016782496782497, "grad_norm": 378.3923034667969, "learning_rate": 9.67824967824968e-06, "loss": 1.5833, "step": 3760 }, { "epoch": 4.017039897039897, "grad_norm": 0.2414206564426422, "learning_rate": 9.703989703989705e-06, "loss": 0.6792, "step": 3770 }, { "epoch": 4.017297297297297, "grad_norm": 0.05312476307153702, "learning_rate": 9.729729729729732e-06, "loss": 0.2052, "step": 3780 }, { "epoch": 4.017554697554697, "grad_norm": 435.47015380859375, "learning_rate": 9.755469755469757e-06, "loss": 2.0852, "step": 3790 }, { "epoch": 4.017812097812098, "grad_norm": 94.0691909790039, "learning_rate": 9.781209781209782e-06, "loss": 0.6855, "step": 3800 }, { "epoch": 4.018069498069498, "grad_norm": 39.46242904663086, "learning_rate": 9.806949806949808e-06, "loss": 1.4943, "step": 3810 }, { "epoch": 4.018326898326898, "grad_norm": 0.44313281774520874, "learning_rate": 9.832689832689833e-06, "loss": 1.179, "step": 3820 }, { "epoch": 4.018584298584298, "grad_norm": 0.845443069934845, "learning_rate": 9.858429858429858e-06, "loss": 0.7002, "step": 3830 }, { "epoch": 4.018841698841699, "grad_norm": 0.0821545198559761, "learning_rate": 9.884169884169885e-06, "loss": 0.8125, "step": 3840 }, { "epoch": 4.019099099099099, "grad_norm": 20.311683654785156, "learning_rate": 9.90990990990991e-06, "loss": 1.363, "step": 3850 }, { "epoch": 4.019356499356499, "grad_norm": 0.4300023317337036, "learning_rate": 9.935649935649937e-06, "loss": 0.0073, "step": 3860 }, { "epoch": 4.0196138996138995, "grad_norm": 0.026708662509918213, "learning_rate": 9.961389961389962e-06, "loss": 0.6242, "step": 3870 }, { "epoch": 4.0198712998712995, "grad_norm": 0.08312055468559265, "learning_rate": 9.987129987129988e-06, "loss": 2.0611, "step": 3880 }, { "epoch": 4.02, "eval_accuracy": 0.782608695652174, "eval_loss": 1.0069572925567627, "eval_runtime": 15.454, "eval_samples_per_second": 2.977, "eval_steps_per_second": 2.977, "step": 3885 }, { "epoch": 5.0001287001287, "grad_norm": 0.08920583873987198, "learning_rate": 9.998569998569999e-06, "loss": 1.1077, "step": 3890 }, { "epoch": 5.0003861003861, "grad_norm": 0.2523305416107178, "learning_rate": 9.995709995709997e-06, "loss": 1.4093, "step": 3900 }, { "epoch": 5.000643500643501, "grad_norm": 124.69507598876953, "learning_rate": 9.992849992849994e-06, "loss": 0.9293, "step": 3910 }, { "epoch": 5.000900900900901, "grad_norm": 0.04359631985425949, "learning_rate": 9.989989989989992e-06, "loss": 0.6195, "step": 3920 }, { "epoch": 5.001158301158301, "grad_norm": 0.5823188424110413, "learning_rate": 9.987129987129988e-06, "loss": 1.2996, "step": 3930 }, { "epoch": 5.001415701415701, "grad_norm": 8.539837837219238, "learning_rate": 9.984269984269985e-06, "loss": 1.1665, "step": 3940 }, { "epoch": 5.001673101673101, "grad_norm": 48.99810028076172, "learning_rate": 9.981409981409981e-06, "loss": 0.9725, "step": 3950 }, { "epoch": 5.001930501930502, "grad_norm": 7.8650803565979, "learning_rate": 9.97854997854998e-06, "loss": 0.4707, "step": 3960 }, { "epoch": 5.002187902187902, "grad_norm": 0.041543882340192795, "learning_rate": 9.975689975689976e-06, "loss": 1.2242, "step": 3970 }, { "epoch": 5.002445302445302, "grad_norm": 0.11458486318588257, "learning_rate": 9.972829972829974e-06, "loss": 0.5848, "step": 3980 }, { "epoch": 5.0027027027027025, "grad_norm": 0.022899646311998367, "learning_rate": 9.96996996996997e-06, "loss": 1.6402, "step": 3990 }, { "epoch": 5.0029601029601025, "grad_norm": 0.3953354060649872, "learning_rate": 9.967109967109969e-06, "loss": 2.1309, "step": 4000 }, { "epoch": 5.0032175032175035, "grad_norm": 1.6589573621749878, "learning_rate": 9.964249964249965e-06, "loss": 0.7909, "step": 4010 }, { "epoch": 5.0034749034749035, "grad_norm": 59.53203201293945, "learning_rate": 9.961389961389962e-06, "loss": 1.8867, "step": 4020 }, { "epoch": 5.003732303732304, "grad_norm": 9.76834774017334, "learning_rate": 9.958529958529958e-06, "loss": 0.8285, "step": 4030 }, { "epoch": 5.003989703989704, "grad_norm": 0.16586628556251526, "learning_rate": 9.955669955669956e-06, "loss": 0.3786, "step": 4040 }, { "epoch": 5.004247104247105, "grad_norm": 0.14669224619865417, "learning_rate": 9.952809952809953e-06, "loss": 0.4211, "step": 4050 }, { "epoch": 5.004504504504505, "grad_norm": 158.1543731689453, "learning_rate": 9.949949949949951e-06, "loss": 0.1001, "step": 4060 }, { "epoch": 5.004761904761905, "grad_norm": 2.3664519786834717, "learning_rate": 9.947089947089947e-06, "loss": 0.4621, "step": 4070 }, { "epoch": 5.005019305019305, "grad_norm": 48.814117431640625, "learning_rate": 9.944229944229946e-06, "loss": 2.4297, "step": 4080 }, { "epoch": 5.005276705276705, "grad_norm": 46.63951873779297, "learning_rate": 9.941369941369942e-06, "loss": 1.5123, "step": 4090 }, { "epoch": 5.005534105534106, "grad_norm": 0.18840543925762177, "learning_rate": 9.938509938509938e-06, "loss": 0.542, "step": 4100 }, { "epoch": 5.005791505791506, "grad_norm": 0.28926903009414673, "learning_rate": 9.935649935649937e-06, "loss": 1.427, "step": 4110 }, { "epoch": 5.006048906048906, "grad_norm": 0.14491912722587585, "learning_rate": 9.932789932789933e-06, "loss": 0.9667, "step": 4120 }, { "epoch": 5.006306306306306, "grad_norm": 0.11946146190166473, "learning_rate": 9.929929929929931e-06, "loss": 0.4551, "step": 4130 }, { "epoch": 5.006563706563707, "grad_norm": 15.428871154785156, "learning_rate": 9.927069927069928e-06, "loss": 1.3655, "step": 4140 }, { "epoch": 5.006821106821107, "grad_norm": 126.84295654296875, "learning_rate": 9.924209924209926e-06, "loss": 0.9207, "step": 4150 }, { "epoch": 5.007078507078507, "grad_norm": 0.029610566794872284, "learning_rate": 9.921349921349922e-06, "loss": 0.004, "step": 4160 }, { "epoch": 5.007335907335907, "grad_norm": 0.1009824275970459, "learning_rate": 9.91848991848992e-06, "loss": 3.1177, "step": 4170 }, { "epoch": 5.007593307593307, "grad_norm": 0.4252803325653076, "learning_rate": 9.915629915629917e-06, "loss": 1.466, "step": 4180 }, { "epoch": 5.007850707850708, "grad_norm": 0.5266346335411072, "learning_rate": 9.912769912769913e-06, "loss": 0.7741, "step": 4190 }, { "epoch": 5.008108108108108, "grad_norm": 0.19914963841438293, "learning_rate": 9.90990990990991e-06, "loss": 0.9405, "step": 4200 }, { "epoch": 5.008365508365508, "grad_norm": 0.14336754381656647, "learning_rate": 9.907049907049908e-06, "loss": 0.5279, "step": 4210 }, { "epoch": 5.008622908622908, "grad_norm": 0.16508668661117554, "learning_rate": 9.904189904189905e-06, "loss": 1.5353, "step": 4220 }, { "epoch": 5.008880308880308, "grad_norm": 33.556251525878906, "learning_rate": 9.901329901329903e-06, "loss": 1.7293, "step": 4230 }, { "epoch": 5.009137709137709, "grad_norm": 0.7371513843536377, "learning_rate": 9.8984698984699e-06, "loss": 1.4752, "step": 4240 }, { "epoch": 5.0093951093951095, "grad_norm": 33.60354995727539, "learning_rate": 9.895609895609897e-06, "loss": 1.7302, "step": 4250 }, { "epoch": 5.0096525096525095, "grad_norm": 2.943221092224121, "learning_rate": 9.892749892749894e-06, "loss": 0.2602, "step": 4260 }, { "epoch": 5.00990990990991, "grad_norm": 0.029436398297548294, "learning_rate": 9.88988988988989e-06, "loss": 0.7335, "step": 4270 }, { "epoch": 5.0101673101673105, "grad_norm": 1.5921765565872192, "learning_rate": 9.887029887029887e-06, "loss": 1.551, "step": 4280 }, { "epoch": 5.010424710424711, "grad_norm": 0.15461337566375732, "learning_rate": 9.884169884169885e-06, "loss": 0.9979, "step": 4290 }, { "epoch": 5.010682110682111, "grad_norm": 0.40406131744384766, "learning_rate": 9.881309881309881e-06, "loss": 1.3223, "step": 4300 }, { "epoch": 5.010939510939511, "grad_norm": 154.21099853515625, "learning_rate": 9.87844987844988e-06, "loss": 1.8915, "step": 4310 }, { "epoch": 5.011196911196911, "grad_norm": 0.4178062081336975, "learning_rate": 9.875589875589876e-06, "loss": 0.2505, "step": 4320 }, { "epoch": 5.011454311454312, "grad_norm": 39.81876754760742, "learning_rate": 9.872729872729874e-06, "loss": 1.4891, "step": 4330 }, { "epoch": 5.011711711711712, "grad_norm": 0.10712433606386185, "learning_rate": 9.86986986986987e-06, "loss": 1.139, "step": 4340 }, { "epoch": 5.011969111969112, "grad_norm": 0.09909055382013321, "learning_rate": 9.867009867009867e-06, "loss": 0.4782, "step": 4350 }, { "epoch": 5.012226512226512, "grad_norm": 0.033393606543540955, "learning_rate": 9.864149864149865e-06, "loss": 1.3209, "step": 4360 }, { "epoch": 5.012483912483913, "grad_norm": 178.87274169921875, "learning_rate": 9.861289861289862e-06, "loss": 1.907, "step": 4370 }, { "epoch": 5.012741312741313, "grad_norm": 41.26455307006836, "learning_rate": 9.858429858429858e-06, "loss": 2.6711, "step": 4380 }, { "epoch": 5.012998712998713, "grad_norm": 0.3698756992816925, "learning_rate": 9.855569855569856e-06, "loss": 0.7055, "step": 4390 }, { "epoch": 5.013256113256113, "grad_norm": 240.4568328857422, "learning_rate": 9.852709852709853e-06, "loss": 0.5967, "step": 4400 }, { "epoch": 5.013513513513513, "grad_norm": 0.27150461077690125, "learning_rate": 9.849849849849851e-06, "loss": 1.4454, "step": 4410 }, { "epoch": 5.013770913770914, "grad_norm": 0.4859977960586548, "learning_rate": 9.846989846989847e-06, "loss": 0.5113, "step": 4420 }, { "epoch": 5.014028314028314, "grad_norm": 0.10837042331695557, "learning_rate": 9.844129844129846e-06, "loss": 1.0488, "step": 4430 }, { "epoch": 5.014285714285714, "grad_norm": 0.09987702965736389, "learning_rate": 9.841269841269842e-06, "loss": 0.2134, "step": 4440 }, { "epoch": 5.014543114543114, "grad_norm": 0.07767640054225922, "learning_rate": 9.838409838409839e-06, "loss": 1.6714, "step": 4450 }, { "epoch": 5.014800514800514, "grad_norm": 0.2725449502468109, "learning_rate": 9.835549835549837e-06, "loss": 0.8876, "step": 4460 }, { "epoch": 5.015057915057915, "grad_norm": 20.469709396362305, "learning_rate": 9.832689832689833e-06, "loss": 1.102, "step": 4470 }, { "epoch": 5.015315315315315, "grad_norm": 0.8056849241256714, "learning_rate": 9.829829829829831e-06, "loss": 0.9803, "step": 4480 }, { "epoch": 5.0155727155727154, "grad_norm": 0.11724364757537842, "learning_rate": 9.826969826969828e-06, "loss": 1.1717, "step": 4490 }, { "epoch": 5.0158301158301155, "grad_norm": 47.51169204711914, "learning_rate": 9.824109824109826e-06, "loss": 1.3517, "step": 4500 }, { "epoch": 5.0160875160875165, "grad_norm": 0.021014723926782608, "learning_rate": 9.821249821249822e-06, "loss": 1.8496, "step": 4510 }, { "epoch": 5.0163449163449165, "grad_norm": 109.34514617919922, "learning_rate": 9.818389818389819e-06, "loss": 0.5541, "step": 4520 }, { "epoch": 5.016602316602317, "grad_norm": 47.571590423583984, "learning_rate": 9.815529815529815e-06, "loss": 0.9756, "step": 4530 }, { "epoch": 5.016859716859717, "grad_norm": 0.08140508085489273, "learning_rate": 9.812669812669814e-06, "loss": 0.9688, "step": 4540 }, { "epoch": 5.017117117117117, "grad_norm": 300.04425048828125, "learning_rate": 9.80980980980981e-06, "loss": 0.9426, "step": 4550 }, { "epoch": 5.017374517374518, "grad_norm": 68.44358825683594, "learning_rate": 9.806949806949808e-06, "loss": 0.8523, "step": 4560 }, { "epoch": 5.017631917631918, "grad_norm": 0.3220089077949524, "learning_rate": 9.804089804089805e-06, "loss": 0.8468, "step": 4570 }, { "epoch": 5.017889317889318, "grad_norm": 101.19426727294922, "learning_rate": 9.801229801229803e-06, "loss": 0.557, "step": 4580 }, { "epoch": 5.018146718146718, "grad_norm": 0.06881493330001831, "learning_rate": 9.7983697983698e-06, "loss": 1.4231, "step": 4590 }, { "epoch": 5.018404118404119, "grad_norm": 0.06560848653316498, "learning_rate": 9.795509795509796e-06, "loss": 1.1197, "step": 4600 }, { "epoch": 5.018661518661519, "grad_norm": 43.2938117980957, "learning_rate": 9.792649792649794e-06, "loss": 1.691, "step": 4610 }, { "epoch": 5.018918918918919, "grad_norm": 227.61483764648438, "learning_rate": 9.78978978978979e-06, "loss": 1.1375, "step": 4620 }, { "epoch": 5.019176319176319, "grad_norm": 0.2670377790927887, "learning_rate": 9.786929786929787e-06, "loss": 1.6694, "step": 4630 }, { "epoch": 5.019433719433719, "grad_norm": 2.1211440563201904, "learning_rate": 9.784069784069785e-06, "loss": 1.3324, "step": 4640 }, { "epoch": 5.01969111969112, "grad_norm": 667.440185546875, "learning_rate": 9.781209781209782e-06, "loss": 1.2619, "step": 4650 }, { "epoch": 5.01994851994852, "grad_norm": 0.2644244432449341, "learning_rate": 9.77834977834978e-06, "loss": 0.4943, "step": 4660 }, { "epoch": 5.02, "eval_accuracy": 0.45652173913043476, "eval_loss": 3.257457971572876, "eval_runtime": 16.4996, "eval_samples_per_second": 2.788, "eval_steps_per_second": 2.788, "step": 4662 }, { "epoch": 6.000205920205921, "grad_norm": 0.05469132587313652, "learning_rate": 9.775489775489776e-06, "loss": 1.8461, "step": 4670 }, { "epoch": 6.000463320463321, "grad_norm": 0.45225292444229126, "learning_rate": 9.772629772629774e-06, "loss": 1.9015, "step": 4680 }, { "epoch": 6.000720720720721, "grad_norm": 33.26803970336914, "learning_rate": 9.76976976976977e-06, "loss": 2.2968, "step": 4690 }, { "epoch": 6.000978120978121, "grad_norm": 29.261432647705078, "learning_rate": 9.766909766909767e-06, "loss": 1.1083, "step": 4700 }, { "epoch": 6.001235521235521, "grad_norm": 0.2970881462097168, "learning_rate": 9.764049764049764e-06, "loss": 1.5378, "step": 4710 }, { "epoch": 6.001492921492922, "grad_norm": 0.820044755935669, "learning_rate": 9.761189761189762e-06, "loss": 0.7441, "step": 4720 }, { "epoch": 6.001750321750322, "grad_norm": 0.10087312757968903, "learning_rate": 9.758329758329758e-06, "loss": 0.3919, "step": 4730 }, { "epoch": 6.002007722007722, "grad_norm": 0.0689639076590538, "learning_rate": 9.755469755469757e-06, "loss": 1.4962, "step": 4740 }, { "epoch": 6.002265122265122, "grad_norm": 0.08053558319807053, "learning_rate": 9.752609752609753e-06, "loss": 1.8286, "step": 4750 }, { "epoch": 6.002522522522523, "grad_norm": 0.11258360743522644, "learning_rate": 9.749749749749751e-06, "loss": 0.9527, "step": 4760 }, { "epoch": 6.002779922779923, "grad_norm": 2.33036470413208, "learning_rate": 9.746889746889748e-06, "loss": 1.0479, "step": 4770 }, { "epoch": 6.003037323037323, "grad_norm": 2.5247790813446045, "learning_rate": 9.744029744029744e-06, "loss": 0.889, "step": 4780 }, { "epoch": 6.003294723294723, "grad_norm": 0.057729437947273254, "learning_rate": 9.741169741169742e-06, "loss": 1.0201, "step": 4790 }, { "epoch": 6.003552123552123, "grad_norm": 70.08110046386719, "learning_rate": 9.738309738309739e-06, "loss": 1.1606, "step": 4800 }, { "epoch": 6.003809523809524, "grad_norm": 0.26452142000198364, "learning_rate": 9.735449735449735e-06, "loss": 0.7754, "step": 4810 }, { "epoch": 6.004066924066924, "grad_norm": 0.436287522315979, "learning_rate": 9.732589732589733e-06, "loss": 0.9025, "step": 4820 }, { "epoch": 6.004324324324324, "grad_norm": 80.23675537109375, "learning_rate": 9.729729729729732e-06, "loss": 0.8798, "step": 4830 }, { "epoch": 6.004581724581724, "grad_norm": 187.31350708007812, "learning_rate": 9.726869726869728e-06, "loss": 1.0004, "step": 4840 }, { "epoch": 6.004839124839124, "grad_norm": 0.10163071006536484, "learning_rate": 9.724009724009724e-06, "loss": 1.1963, "step": 4850 }, { "epoch": 6.005096525096525, "grad_norm": 0.16315636038780212, "learning_rate": 9.721149721149723e-06, "loss": 1.364, "step": 4860 }, { "epoch": 6.005353925353925, "grad_norm": 0.1200859546661377, "learning_rate": 9.718289718289719e-06, "loss": 2.1892, "step": 4870 }, { "epoch": 6.0056113256113255, "grad_norm": 118.95958709716797, "learning_rate": 9.715429715429716e-06, "loss": 1.5139, "step": 4880 }, { "epoch": 6.0058687258687256, "grad_norm": 0.24599742889404297, "learning_rate": 9.712569712569714e-06, "loss": 0.8105, "step": 4890 }, { "epoch": 6.0061261261261265, "grad_norm": 1.2496775388717651, "learning_rate": 9.70970970970971e-06, "loss": 0.3488, "step": 4900 }, { "epoch": 6.006383526383527, "grad_norm": 32.44328689575195, "learning_rate": 9.706849706849708e-06, "loss": 0.8115, "step": 4910 }, { "epoch": 6.006640926640927, "grad_norm": 0.39588698744773865, "learning_rate": 9.703989703989705e-06, "loss": 1.2346, "step": 4920 }, { "epoch": 6.006898326898327, "grad_norm": 95.88199615478516, "learning_rate": 9.701129701129703e-06, "loss": 1.3044, "step": 4930 }, { "epoch": 6.007155727155727, "grad_norm": 0.2991234064102173, "learning_rate": 9.6982696982697e-06, "loss": 0.5547, "step": 4940 }, { "epoch": 6.007413127413128, "grad_norm": 0.5006702542304993, "learning_rate": 9.695409695409696e-06, "loss": 0.7136, "step": 4950 }, { "epoch": 6.007670527670528, "grad_norm": 0.5361593961715698, "learning_rate": 9.692549692549692e-06, "loss": 1.1746, "step": 4960 }, { "epoch": 6.007927927927928, "grad_norm": 0.11017898470163345, "learning_rate": 9.68968968968969e-06, "loss": 1.8545, "step": 4970 }, { "epoch": 6.008185328185328, "grad_norm": 0.18768690526485443, "learning_rate": 9.686829686829687e-06, "loss": 1.9508, "step": 4980 }, { "epoch": 6.008442728442729, "grad_norm": 0.10016202181577682, "learning_rate": 9.683969683969685e-06, "loss": 0.9797, "step": 4990 }, { "epoch": 6.008700128700129, "grad_norm": 309.5798645019531, "learning_rate": 9.681109681109682e-06, "loss": 1.8029, "step": 5000 }, { "epoch": 6.008957528957529, "grad_norm": 0.2282564789056778, "learning_rate": 9.67824967824968e-06, "loss": 1.4809, "step": 5010 }, { "epoch": 6.009214929214929, "grad_norm": 25.585124969482422, "learning_rate": 9.675389675389676e-06, "loss": 0.7676, "step": 5020 }, { "epoch": 6.009472329472329, "grad_norm": 614.1220092773438, "learning_rate": 9.672529672529673e-06, "loss": 1.0285, "step": 5030 }, { "epoch": 6.00972972972973, "grad_norm": 1062.7408447265625, "learning_rate": 9.669669669669671e-06, "loss": 1.3201, "step": 5040 }, { "epoch": 6.00998712998713, "grad_norm": 1.943227767944336, "learning_rate": 9.666809666809667e-06, "loss": 0.4388, "step": 5050 }, { "epoch": 6.01024453024453, "grad_norm": 0.21532581746578217, "learning_rate": 9.663949663949664e-06, "loss": 0.7339, "step": 5060 }, { "epoch": 6.01050193050193, "grad_norm": 0.018828408792614937, "learning_rate": 9.661089661089662e-06, "loss": 1.2573, "step": 5070 }, { "epoch": 6.01075933075933, "grad_norm": 0.5531067252159119, "learning_rate": 9.658229658229659e-06, "loss": 1.4139, "step": 5080 }, { "epoch": 6.011016731016731, "grad_norm": 0.6515811085700989, "learning_rate": 9.655369655369657e-06, "loss": 0.4051, "step": 5090 }, { "epoch": 6.011274131274131, "grad_norm": 0.48128044605255127, "learning_rate": 9.652509652509653e-06, "loss": 0.8952, "step": 5100 }, { "epoch": 6.011531531531531, "grad_norm": 0.38696253299713135, "learning_rate": 9.649649649649651e-06, "loss": 0.4374, "step": 5110 }, { "epoch": 6.0117889317889315, "grad_norm": 4.012783050537109, "learning_rate": 9.646789646789648e-06, "loss": 2.2389, "step": 5120 }, { "epoch": 6.012046332046332, "grad_norm": 569.133056640625, "learning_rate": 9.643929643929644e-06, "loss": 0.286, "step": 5130 }, { "epoch": 6.0123037323037325, "grad_norm": 27.13798713684082, "learning_rate": 9.64106964106964e-06, "loss": 1.8604, "step": 5140 }, { "epoch": 6.012561132561133, "grad_norm": 70.15827941894531, "learning_rate": 9.638209638209639e-06, "loss": 1.1294, "step": 5150 }, { "epoch": 6.012818532818533, "grad_norm": 43.90142059326172, "learning_rate": 9.635349635349635e-06, "loss": 1.5029, "step": 5160 }, { "epoch": 6.013075933075933, "grad_norm": 1.4846750497817993, "learning_rate": 9.632489632489634e-06, "loss": 0.3423, "step": 5170 }, { "epoch": 6.013333333333334, "grad_norm": 1.0613987445831299, "learning_rate": 9.62962962962963e-06, "loss": 0.3034, "step": 5180 }, { "epoch": 6.013590733590734, "grad_norm": 17.084869384765625, "learning_rate": 9.626769626769628e-06, "loss": 0.5023, "step": 5190 }, { "epoch": 6.013848133848134, "grad_norm": 0.020237158983945847, "learning_rate": 9.623909623909625e-06, "loss": 0.602, "step": 5200 }, { "epoch": 6.014105534105534, "grad_norm": 0.02380361221730709, "learning_rate": 9.621049621049621e-06, "loss": 0.8313, "step": 5210 }, { "epoch": 6.014362934362935, "grad_norm": 0.016284339129924774, "learning_rate": 9.61818961818962e-06, "loss": 1.5708, "step": 5220 }, { "epoch": 6.014620334620335, "grad_norm": 615.75, "learning_rate": 9.615329615329616e-06, "loss": 2.1004, "step": 5230 }, { "epoch": 6.014877734877735, "grad_norm": 0.3784148395061493, "learning_rate": 9.612469612469614e-06, "loss": 1.5562, "step": 5240 }, { "epoch": 6.015135135135135, "grad_norm": 0.0682976171374321, "learning_rate": 9.60960960960961e-06, "loss": 1.2095, "step": 5250 }, { "epoch": 6.015392535392535, "grad_norm": 0.5212209820747375, "learning_rate": 9.606749606749609e-06, "loss": 0.5701, "step": 5260 }, { "epoch": 6.015649935649936, "grad_norm": 458.25054931640625, "learning_rate": 9.603889603889605e-06, "loss": 1.0626, "step": 5270 }, { "epoch": 6.015907335907336, "grad_norm": 50.43803024291992, "learning_rate": 9.601029601029601e-06, "loss": 2.7078, "step": 5280 }, { "epoch": 6.016164736164736, "grad_norm": 0.5311137437820435, "learning_rate": 9.5981695981696e-06, "loss": 0.8123, "step": 5290 }, { "epoch": 6.016422136422136, "grad_norm": 0.015996312722563744, "learning_rate": 9.595309595309596e-06, "loss": 0.5862, "step": 5300 }, { "epoch": 6.016679536679536, "grad_norm": 0.4782085716724396, "learning_rate": 9.592449592449593e-06, "loss": 0.0025, "step": 5310 }, { "epoch": 6.016936936936937, "grad_norm": 0.11970794200897217, "learning_rate": 9.58958958958959e-06, "loss": 2.5401, "step": 5320 }, { "epoch": 6.017194337194337, "grad_norm": 134.60182189941406, "learning_rate": 9.586729586729587e-06, "loss": 1.5374, "step": 5330 }, { "epoch": 6.017451737451737, "grad_norm": 0.026819629594683647, "learning_rate": 9.583869583869585e-06, "loss": 0.2047, "step": 5340 }, { "epoch": 6.017709137709137, "grad_norm": 30.610950469970703, "learning_rate": 9.581009581009582e-06, "loss": 1.9448, "step": 5350 }, { "epoch": 6.017966537966538, "grad_norm": 23.234464645385742, "learning_rate": 9.57814957814958e-06, "loss": 2.3533, "step": 5360 }, { "epoch": 6.018223938223938, "grad_norm": 3.35427188873291, "learning_rate": 9.575289575289576e-06, "loss": 0.6203, "step": 5370 }, { "epoch": 6.0184813384813385, "grad_norm": 0.10274845361709595, "learning_rate": 9.572429572429573e-06, "loss": 0.8995, "step": 5380 }, { "epoch": 6.0187387387387385, "grad_norm": 49.04452896118164, "learning_rate": 9.56956956956957e-06, "loss": 2.0313, "step": 5390 }, { "epoch": 6.018996138996139, "grad_norm": 0.2745307385921478, "learning_rate": 9.566709566709568e-06, "loss": 1.3825, "step": 5400 }, { "epoch": 6.01925353925354, "grad_norm": 34.342376708984375, "learning_rate": 9.563849563849564e-06, "loss": 1.4412, "step": 5410 }, { "epoch": 6.01951093951094, "grad_norm": 0.6465485095977783, "learning_rate": 9.560989560989562e-06, "loss": 0.6112, "step": 5420 }, { "epoch": 6.01976833976834, "grad_norm": 0.03590095415711403, "learning_rate": 9.558129558129559e-06, "loss": 0.708, "step": 5430 }, { "epoch": 6.02, "eval_accuracy": 0.45652173913043476, "eval_loss": 2.1190643310546875, "eval_runtime": 15.4643, "eval_samples_per_second": 2.975, "eval_steps_per_second": 2.975, "step": 5439 }, { "epoch": 7.00002574002574, "grad_norm": 0.48325133323669434, "learning_rate": 9.555269555269557e-06, "loss": 2.154, "step": 5440 }, { "epoch": 7.00028314028314, "grad_norm": 0.226426899433136, "learning_rate": 9.552409552409553e-06, "loss": 0.6806, "step": 5450 }, { "epoch": 7.00054054054054, "grad_norm": 296.4757995605469, "learning_rate": 9.54954954954955e-06, "loss": 0.646, "step": 5460 }, { "epoch": 7.00079794079794, "grad_norm": 26.78754997253418, "learning_rate": 9.546689546689546e-06, "loss": 0.6618, "step": 5470 }, { "epoch": 7.001055341055341, "grad_norm": 0.030151739716529846, "learning_rate": 9.543829543829544e-06, "loss": 1.7167, "step": 5480 }, { "epoch": 7.001312741312741, "grad_norm": 179.15652465820312, "learning_rate": 9.540969540969541e-06, "loss": 1.1285, "step": 5490 }, { "epoch": 7.0015701415701415, "grad_norm": 0.09001732617616653, "learning_rate": 9.538109538109539e-06, "loss": 0.4645, "step": 5500 }, { "epoch": 7.0018275418275415, "grad_norm": 173.9577178955078, "learning_rate": 9.535249535249535e-06, "loss": 1.4252, "step": 5510 }, { "epoch": 7.0020849420849425, "grad_norm": 12.78044319152832, "learning_rate": 9.532389532389534e-06, "loss": 0.8229, "step": 5520 }, { "epoch": 7.0023423423423425, "grad_norm": 0.49420520663261414, "learning_rate": 9.52952952952953e-06, "loss": 0.023, "step": 5530 }, { "epoch": 7.002599742599743, "grad_norm": 34.80965042114258, "learning_rate": 9.526669526669528e-06, "loss": 0.6871, "step": 5540 }, { "epoch": 7.002857142857143, "grad_norm": 2.7141544818878174, "learning_rate": 9.523809523809525e-06, "loss": 2.8692, "step": 5550 }, { "epoch": 7.003114543114543, "grad_norm": 27.715356826782227, "learning_rate": 9.520949520949521e-06, "loss": 2.216, "step": 5560 }, { "epoch": 7.003371943371944, "grad_norm": 0.8110296726226807, "learning_rate": 9.51808951808952e-06, "loss": 0.6849, "step": 5570 }, { "epoch": 7.003629343629344, "grad_norm": 11.079747200012207, "learning_rate": 9.515229515229516e-06, "loss": 1.0239, "step": 5580 }, { "epoch": 7.003886743886744, "grad_norm": 0.3448202311992645, "learning_rate": 9.512369512369514e-06, "loss": 0.67, "step": 5590 }, { "epoch": 7.004144144144144, "grad_norm": 0.313346266746521, "learning_rate": 9.50950950950951e-06, "loss": 0.4556, "step": 5600 }, { "epoch": 7.004401544401545, "grad_norm": 24.44046401977539, "learning_rate": 9.506649506649509e-06, "loss": 1.7268, "step": 5610 }, { "epoch": 7.004658944658945, "grad_norm": 23.968717575073242, "learning_rate": 9.503789503789505e-06, "loss": 1.6574, "step": 5620 }, { "epoch": 7.004916344916345, "grad_norm": 1.9536049365997314, "learning_rate": 9.500929500929502e-06, "loss": 0.6711, "step": 5630 }, { "epoch": 7.005173745173745, "grad_norm": 0.11860974878072739, "learning_rate": 9.498069498069498e-06, "loss": 0.0218, "step": 5640 }, { "epoch": 7.005431145431145, "grad_norm": 3.811457872390747, "learning_rate": 9.495209495209496e-06, "loss": 0.4929, "step": 5650 }, { "epoch": 7.005688545688546, "grad_norm": 0.047364648431539536, "learning_rate": 9.492349492349493e-06, "loss": 0.5272, "step": 5660 }, { "epoch": 7.005945945945946, "grad_norm": 0.2796438932418823, "learning_rate": 9.489489489489491e-06, "loss": 1.3004, "step": 5670 }, { "epoch": 7.006203346203346, "grad_norm": 82.55836486816406, "learning_rate": 9.486629486629487e-06, "loss": 0.6588, "step": 5680 }, { "epoch": 7.006460746460746, "grad_norm": 0.2848183512687683, "learning_rate": 9.483769483769485e-06, "loss": 1.5836, "step": 5690 }, { "epoch": 7.006718146718146, "grad_norm": 60.738807678222656, "learning_rate": 9.480909480909482e-06, "loss": 0.8493, "step": 5700 }, { "epoch": 7.006975546975547, "grad_norm": 0.10891953110694885, "learning_rate": 9.478049478049478e-06, "loss": 0.2529, "step": 5710 }, { "epoch": 7.007232947232947, "grad_norm": 393.4809875488281, "learning_rate": 9.475189475189477e-06, "loss": 0.5602, "step": 5720 }, { "epoch": 7.007490347490347, "grad_norm": 0.00841040350496769, "learning_rate": 9.472329472329473e-06, "loss": 1.9119, "step": 5730 }, { "epoch": 7.0077477477477474, "grad_norm": 312.6628723144531, "learning_rate": 9.46946946946947e-06, "loss": 1.7177, "step": 5740 }, { "epoch": 7.008005148005148, "grad_norm": 2.4715847969055176, "learning_rate": 9.466609466609468e-06, "loss": 1.079, "step": 5750 }, { "epoch": 7.0082625482625485, "grad_norm": 21.34333610534668, "learning_rate": 9.463749463749464e-06, "loss": 0.3107, "step": 5760 }, { "epoch": 7.0085199485199485, "grad_norm": 359.7504577636719, "learning_rate": 9.460889460889462e-06, "loss": 1.2887, "step": 5770 }, { "epoch": 7.008777348777349, "grad_norm": 80.45604705810547, "learning_rate": 9.458029458029459e-06, "loss": 0.8233, "step": 5780 }, { "epoch": 7.009034749034749, "grad_norm": 0.030902400612831116, "learning_rate": 9.455169455169457e-06, "loss": 0.1018, "step": 5790 }, { "epoch": 7.00929214929215, "grad_norm": 25.676687240600586, "learning_rate": 9.452309452309453e-06, "loss": 1.71, "step": 5800 }, { "epoch": 7.00954954954955, "grad_norm": 2.5907270908355713, "learning_rate": 9.44944944944945e-06, "loss": 2.2116, "step": 5810 }, { "epoch": 7.00980694980695, "grad_norm": 3.035196542739868, "learning_rate": 9.446589446589446e-06, "loss": 0.181, "step": 5820 }, { "epoch": 7.01006435006435, "grad_norm": 0.8250662088394165, "learning_rate": 9.443729443729445e-06, "loss": 1.0846, "step": 5830 }, { "epoch": 7.010321750321751, "grad_norm": 0.5744351148605347, "learning_rate": 9.440869440869441e-06, "loss": 1.3217, "step": 5840 }, { "epoch": 7.010579150579151, "grad_norm": 33.13734436035156, "learning_rate": 9.43800943800944e-06, "loss": 1.1113, "step": 5850 }, { "epoch": 7.010836550836551, "grad_norm": 0.7868608832359314, "learning_rate": 9.435149435149436e-06, "loss": 2.172, "step": 5860 }, { "epoch": 7.011093951093951, "grad_norm": 1.6519296169281006, "learning_rate": 9.432289432289434e-06, "loss": 0.0257, "step": 5870 }, { "epoch": 7.011351351351351, "grad_norm": 396.9693908691406, "learning_rate": 9.42942942942943e-06, "loss": 0.5133, "step": 5880 }, { "epoch": 7.011608751608752, "grad_norm": 22.835020065307617, "learning_rate": 9.426569426569427e-06, "loss": 2.0565, "step": 5890 }, { "epoch": 7.011866151866152, "grad_norm": 54.88117980957031, "learning_rate": 9.423709423709423e-06, "loss": 1.8865, "step": 5900 }, { "epoch": 7.012123552123552, "grad_norm": 0.0457146093249321, "learning_rate": 9.420849420849421e-06, "loss": 0.0163, "step": 5910 }, { "epoch": 7.012380952380952, "grad_norm": 41.79448318481445, "learning_rate": 9.417989417989418e-06, "loss": 1.0209, "step": 5920 }, { "epoch": 7.012638352638352, "grad_norm": 270.59869384765625, "learning_rate": 9.415129415129416e-06, "loss": 1.266, "step": 5930 }, { "epoch": 7.012895752895753, "grad_norm": 3.5031347274780273, "learning_rate": 9.412269412269412e-06, "loss": 0.6264, "step": 5940 }, { "epoch": 7.013153153153153, "grad_norm": 0.2792441248893738, "learning_rate": 9.40940940940941e-06, "loss": 1.3217, "step": 5950 }, { "epoch": 7.013410553410553, "grad_norm": 938.5770874023438, "learning_rate": 9.406549406549407e-06, "loss": 0.7044, "step": 5960 }, { "epoch": 7.013667953667953, "grad_norm": 0.05712044611573219, "learning_rate": 9.403689403689405e-06, "loss": 0.4378, "step": 5970 }, { "epoch": 7.013925353925354, "grad_norm": 87.4327621459961, "learning_rate": 9.400829400829402e-06, "loss": 1.763, "step": 5980 }, { "epoch": 7.014182754182754, "grad_norm": 0.3371102511882782, "learning_rate": 9.397969397969398e-06, "loss": 0.9069, "step": 5990 }, { "epoch": 7.0144401544401545, "grad_norm": 0.3371520936489105, "learning_rate": 9.395109395109396e-06, "loss": 0.8725, "step": 6000 }, { "epoch": 7.0146975546975545, "grad_norm": 96.59081268310547, "learning_rate": 9.392249392249393e-06, "loss": 1.4305, "step": 6010 }, { "epoch": 7.014954954954955, "grad_norm": 0.3758425712585449, "learning_rate": 9.389389389389391e-06, "loss": 1.7905, "step": 6020 }, { "epoch": 7.0152123552123555, "grad_norm": 0.03138767182826996, "learning_rate": 9.386529386529387e-06, "loss": 2.0007, "step": 6030 }, { "epoch": 7.015469755469756, "grad_norm": 33.08185958862305, "learning_rate": 9.383669383669386e-06, "loss": 0.9456, "step": 6040 }, { "epoch": 7.015727155727156, "grad_norm": 0.5954800248146057, "learning_rate": 9.380809380809382e-06, "loss": 1.2856, "step": 6050 }, { "epoch": 7.015984555984556, "grad_norm": 0.17369569838047028, "learning_rate": 9.377949377949379e-06, "loss": 0.5541, "step": 6060 }, { "epoch": 7.016241956241957, "grad_norm": 0.3233203887939453, "learning_rate": 9.375089375089375e-06, "loss": 1.2611, "step": 6070 }, { "epoch": 7.016499356499357, "grad_norm": 40.826045989990234, "learning_rate": 9.372229372229373e-06, "loss": 1.7143, "step": 6080 }, { "epoch": 7.016756756756757, "grad_norm": 56.034400939941406, "learning_rate": 9.36936936936937e-06, "loss": 0.9582, "step": 6090 }, { "epoch": 7.017014157014157, "grad_norm": 8.84603214263916, "learning_rate": 9.366509366509368e-06, "loss": 0.018, "step": 6100 }, { "epoch": 7.017271557271557, "grad_norm": 0.7207472920417786, "learning_rate": 9.363649363649364e-06, "loss": 0.507, "step": 6110 }, { "epoch": 7.017528957528958, "grad_norm": 0.1129591315984726, "learning_rate": 9.360789360789362e-06, "loss": 0.4114, "step": 6120 }, { "epoch": 7.017786357786358, "grad_norm": 0.03248320892453194, "learning_rate": 9.357929357929359e-06, "loss": 1.3072, "step": 6130 }, { "epoch": 7.018043758043758, "grad_norm": 41.5610237121582, "learning_rate": 9.355069355069355e-06, "loss": 1.705, "step": 6140 }, { "epoch": 7.018301158301158, "grad_norm": 0.6427077651023865, "learning_rate": 9.352209352209352e-06, "loss": 0.9994, "step": 6150 }, { "epoch": 7.018558558558558, "grad_norm": 0.4536170959472656, "learning_rate": 9.34934934934935e-06, "loss": 0.4102, "step": 6160 }, { "epoch": 7.018815958815959, "grad_norm": 0.4406795799732208, "learning_rate": 9.346489346489346e-06, "loss": 0.4422, "step": 6170 }, { "epoch": 7.019073359073359, "grad_norm": 0.02585803158581257, "learning_rate": 9.343629343629345e-06, "loss": 0.4257, "step": 6180 }, { "epoch": 7.019330759330759, "grad_norm": 0.07975375652313232, "learning_rate": 9.340769340769341e-06, "loss": 0.5653, "step": 6190 }, { "epoch": 7.019588159588159, "grad_norm": 0.0948067232966423, "learning_rate": 9.33790933790934e-06, "loss": 1.3287, "step": 6200 }, { "epoch": 7.01984555984556, "grad_norm": 0.33225017786026, "learning_rate": 9.335049335049336e-06, "loss": 0.5518, "step": 6210 }, { "epoch": 7.02, "eval_accuracy": 0.7608695652173914, "eval_loss": 1.4169539213180542, "eval_runtime": 15.4728, "eval_samples_per_second": 2.973, "eval_steps_per_second": 2.973, "step": 6216 }, { "epoch": 8.00010296010296, "grad_norm": 0.03941204398870468, "learning_rate": 9.332189332189334e-06, "loss": 1.5208, "step": 6220 }, { "epoch": 8.00036036036036, "grad_norm": 0.024762960150837898, "learning_rate": 9.32932932932933e-06, "loss": 0.002, "step": 6230 }, { "epoch": 8.00061776061776, "grad_norm": 398.88885498046875, "learning_rate": 9.326469326469327e-06, "loss": 0.6583, "step": 6240 }, { "epoch": 8.000875160875161, "grad_norm": 0.023645753040909767, "learning_rate": 9.323609323609323e-06, "loss": 1.3758, "step": 6250 }, { "epoch": 8.001132561132561, "grad_norm": 0.012583833187818527, "learning_rate": 9.320749320749321e-06, "loss": 0.8099, "step": 6260 }, { "epoch": 8.001389961389961, "grad_norm": 0.020370107144117355, "learning_rate": 9.317889317889318e-06, "loss": 1.0169, "step": 6270 }, { "epoch": 8.001647361647361, "grad_norm": 0.051698509603738785, "learning_rate": 9.315029315029316e-06, "loss": 0.4926, "step": 6280 }, { "epoch": 8.001904761904761, "grad_norm": 0.9180232882499695, "learning_rate": 9.312169312169313e-06, "loss": 1.2576, "step": 6290 }, { "epoch": 8.002162162162163, "grad_norm": 1456.8536376953125, "learning_rate": 9.30930930930931e-06, "loss": 0.2036, "step": 6300 }, { "epoch": 8.002419562419563, "grad_norm": 164.44483947753906, "learning_rate": 9.306449306449307e-06, "loss": 2.9082, "step": 6310 }, { "epoch": 8.002676962676963, "grad_norm": 1.0428372621536255, "learning_rate": 9.303589303589304e-06, "loss": 0.3385, "step": 6320 }, { "epoch": 8.002934362934363, "grad_norm": 0.04050123319029808, "learning_rate": 9.300729300729302e-06, "loss": 0.4214, "step": 6330 }, { "epoch": 8.003191763191763, "grad_norm": 0.06089348345994949, "learning_rate": 9.297869297869298e-06, "loss": 0.8934, "step": 6340 }, { "epoch": 8.003449163449163, "grad_norm": 0.10277870297431946, "learning_rate": 9.295009295009296e-06, "loss": 1.4394, "step": 6350 }, { "epoch": 8.003706563706563, "grad_norm": 0.1762549728155136, "learning_rate": 9.292149292149293e-06, "loss": 2.8751, "step": 6360 }, { "epoch": 8.003963963963963, "grad_norm": 9.25487232208252, "learning_rate": 9.289289289289291e-06, "loss": 0.4291, "step": 6370 }, { "epoch": 8.004221364221364, "grad_norm": 0.05675046518445015, "learning_rate": 9.286429286429288e-06, "loss": 0.3791, "step": 6380 }, { "epoch": 8.004478764478764, "grad_norm": 0.13098298013210297, "learning_rate": 9.283569283569284e-06, "loss": 1.0009, "step": 6390 }, { "epoch": 8.004736164736165, "grad_norm": 0.7045866847038269, "learning_rate": 9.28070928070928e-06, "loss": 1.6997, "step": 6400 }, { "epoch": 8.004993564993566, "grad_norm": 0.40885546803474426, "learning_rate": 9.277849277849279e-06, "loss": 1.2252, "step": 6410 }, { "epoch": 8.005250965250966, "grad_norm": 0.27662187814712524, "learning_rate": 9.274989274989275e-06, "loss": 0.927, "step": 6420 }, { "epoch": 8.005508365508366, "grad_norm": 0.2577447295188904, "learning_rate": 9.272129272129273e-06, "loss": 0.4666, "step": 6430 }, { "epoch": 8.005765765765766, "grad_norm": 0.04225856065750122, "learning_rate": 9.26926926926927e-06, "loss": 1.812, "step": 6440 }, { "epoch": 8.006023166023166, "grad_norm": 0.5405942797660828, "learning_rate": 9.266409266409268e-06, "loss": 1.5169, "step": 6450 }, { "epoch": 8.006280566280566, "grad_norm": 40.016326904296875, "learning_rate": 9.263549263549264e-06, "loss": 2.1544, "step": 6460 }, { "epoch": 8.006537966537966, "grad_norm": 0.7114521861076355, "learning_rate": 9.260689260689263e-06, "loss": 0.3712, "step": 6470 }, { "epoch": 8.006795366795366, "grad_norm": 0.04546615853905678, "learning_rate": 9.257829257829259e-06, "loss": 0.2992, "step": 6480 }, { "epoch": 8.007052767052768, "grad_norm": 230.9104766845703, "learning_rate": 9.254969254969256e-06, "loss": 0.3405, "step": 6490 }, { "epoch": 8.007310167310168, "grad_norm": 0.03966987505555153, "learning_rate": 9.252109252109252e-06, "loss": 1.7531, "step": 6500 }, { "epoch": 8.007567567567568, "grad_norm": 0.1251051276922226, "learning_rate": 9.24924924924925e-06, "loss": 1.1201, "step": 6510 }, { "epoch": 8.007824967824968, "grad_norm": 0.15064413845539093, "learning_rate": 9.246389246389247e-06, "loss": 0.8073, "step": 6520 }, { "epoch": 8.008082368082368, "grad_norm": 27.61880874633789, "learning_rate": 9.243529243529245e-06, "loss": 0.8102, "step": 6530 }, { "epoch": 8.008339768339768, "grad_norm": 0.06560596078634262, "learning_rate": 9.240669240669241e-06, "loss": 1.1757, "step": 6540 }, { "epoch": 8.008597168597168, "grad_norm": 0.17579619586467743, "learning_rate": 9.23780923780924e-06, "loss": 0.8287, "step": 6550 }, { "epoch": 8.008854568854568, "grad_norm": 0.3297277092933655, "learning_rate": 9.234949234949236e-06, "loss": 0.4357, "step": 6560 }, { "epoch": 8.009111969111968, "grad_norm": 3.586346387863159, "learning_rate": 9.232089232089232e-06, "loss": 1.4158, "step": 6570 }, { "epoch": 8.00936936936937, "grad_norm": 94.88516998291016, "learning_rate": 9.229229229229229e-06, "loss": 1.24, "step": 6580 }, { "epoch": 8.00962676962677, "grad_norm": 112.45343780517578, "learning_rate": 9.226369226369227e-06, "loss": 1.1322, "step": 6590 }, { "epoch": 8.00988416988417, "grad_norm": 0.0703471377491951, "learning_rate": 9.223509223509223e-06, "loss": 1.4622, "step": 6600 }, { "epoch": 8.01014157014157, "grad_norm": 0.048765502870082855, "learning_rate": 9.220649220649222e-06, "loss": 1.6374, "step": 6610 }, { "epoch": 8.01039897039897, "grad_norm": 0.2779279053211212, "learning_rate": 9.217789217789218e-06, "loss": 1.0281, "step": 6620 }, { "epoch": 8.01065637065637, "grad_norm": 33.617103576660156, "learning_rate": 9.214929214929216e-06, "loss": 1.1899, "step": 6630 }, { "epoch": 8.01091377091377, "grad_norm": 54.08098602294922, "learning_rate": 9.212069212069213e-06, "loss": 1.3617, "step": 6640 }, { "epoch": 8.01117117117117, "grad_norm": 26.711761474609375, "learning_rate": 9.20920920920921e-06, "loss": 1.1732, "step": 6650 }, { "epoch": 8.01142857142857, "grad_norm": 79.53500366210938, "learning_rate": 9.206349206349207e-06, "loss": 0.6164, "step": 6660 }, { "epoch": 8.011685971685973, "grad_norm": 0.0222012996673584, "learning_rate": 9.203489203489204e-06, "loss": 0.904, "step": 6670 }, { "epoch": 8.011943371943373, "grad_norm": 37.63016128540039, "learning_rate": 9.2006292006292e-06, "loss": 1.8474, "step": 6680 }, { "epoch": 8.012200772200773, "grad_norm": 0.025222521275281906, "learning_rate": 9.197769197769198e-06, "loss": 0.8185, "step": 6690 }, { "epoch": 8.012458172458173, "grad_norm": 0.039707981050014496, "learning_rate": 9.194909194909197e-06, "loss": 1.7299, "step": 6700 }, { "epoch": 8.012715572715573, "grad_norm": 45.59052276611328, "learning_rate": 9.192049192049193e-06, "loss": 0.3896, "step": 6710 }, { "epoch": 8.012972972972973, "grad_norm": 2.0984888076782227, "learning_rate": 9.189189189189191e-06, "loss": 0.8229, "step": 6720 }, { "epoch": 8.013230373230373, "grad_norm": 0.26229918003082275, "learning_rate": 9.186329186329188e-06, "loss": 1.2765, "step": 6730 }, { "epoch": 8.013487773487773, "grad_norm": 0.5436154007911682, "learning_rate": 9.183469183469184e-06, "loss": 0.0315, "step": 6740 }, { "epoch": 8.013745173745173, "grad_norm": 0.10348271578550339, "learning_rate": 9.18060918060918e-06, "loss": 0.8857, "step": 6750 }, { "epoch": 8.014002574002575, "grad_norm": 0.26006609201431274, "learning_rate": 9.177749177749179e-06, "loss": 0.4398, "step": 6760 }, { "epoch": 8.014259974259975, "grad_norm": 0.019766027107834816, "learning_rate": 9.174889174889175e-06, "loss": 0.4339, "step": 6770 }, { "epoch": 8.014517374517375, "grad_norm": 0.3928743600845337, "learning_rate": 9.172029172029173e-06, "loss": 1.7112, "step": 6780 }, { "epoch": 8.014774774774775, "grad_norm": 76.59400939941406, "learning_rate": 9.16916916916917e-06, "loss": 1.4805, "step": 6790 }, { "epoch": 8.015032175032175, "grad_norm": 0.12240134179592133, "learning_rate": 9.166309166309168e-06, "loss": 1.3292, "step": 6800 }, { "epoch": 8.015289575289575, "grad_norm": 44.478981018066406, "learning_rate": 9.163449163449165e-06, "loss": 0.8538, "step": 6810 }, { "epoch": 8.015546975546975, "grad_norm": 0.02366860955953598, "learning_rate": 9.160589160589161e-06, "loss": 2.4329, "step": 6820 }, { "epoch": 8.015804375804375, "grad_norm": 0.42346879839897156, "learning_rate": 9.157729157729158e-06, "loss": 0.5564, "step": 6830 }, { "epoch": 8.016061776061775, "grad_norm": 208.15951538085938, "learning_rate": 9.154869154869156e-06, "loss": 0.8429, "step": 6840 }, { "epoch": 8.016319176319175, "grad_norm": 0.2946467995643616, "learning_rate": 9.152009152009152e-06, "loss": 1.0465, "step": 6850 }, { "epoch": 8.016576576576577, "grad_norm": 1.0983664989471436, "learning_rate": 9.14914914914915e-06, "loss": 1.2315, "step": 6860 }, { "epoch": 8.016833976833977, "grad_norm": 0.42272210121154785, "learning_rate": 9.146289146289147e-06, "loss": 0.5185, "step": 6870 }, { "epoch": 8.017091377091377, "grad_norm": 0.46459901332855225, "learning_rate": 9.143429143429145e-06, "loss": 2.1266, "step": 6880 }, { "epoch": 8.017348777348777, "grad_norm": 0.15095072984695435, "learning_rate": 9.140569140569141e-06, "loss": 0.5774, "step": 6890 }, { "epoch": 8.017606177606178, "grad_norm": 0.029569724574685097, "learning_rate": 9.137709137709138e-06, "loss": 0.3594, "step": 6900 }, { "epoch": 8.017863577863578, "grad_norm": 112.70240783691406, "learning_rate": 9.134849134849136e-06, "loss": 0.9142, "step": 6910 }, { "epoch": 8.018120978120978, "grad_norm": 0.057099491357803345, "learning_rate": 9.131989131989133e-06, "loss": 1.3048, "step": 6920 }, { "epoch": 8.018378378378378, "grad_norm": 231.97354125976562, "learning_rate": 9.129129129129129e-06, "loss": 0.0491, "step": 6930 }, { "epoch": 8.018635778635778, "grad_norm": 47.82123565673828, "learning_rate": 9.126269126269127e-06, "loss": 0.9094, "step": 6940 }, { "epoch": 8.01889317889318, "grad_norm": 27.74190902709961, "learning_rate": 9.123409123409124e-06, "loss": 0.9677, "step": 6950 }, { "epoch": 8.01915057915058, "grad_norm": 0.03225734457373619, "learning_rate": 9.120549120549122e-06, "loss": 1.1965, "step": 6960 }, { "epoch": 8.01940797940798, "grad_norm": 0.7187479138374329, "learning_rate": 9.117689117689118e-06, "loss": 0.9899, "step": 6970 }, { "epoch": 8.01966537966538, "grad_norm": 0.08666837960481644, "learning_rate": 9.114829114829116e-06, "loss": 0.6683, "step": 6980 }, { "epoch": 8.01992277992278, "grad_norm": 0.019565997645258904, "learning_rate": 9.111969111969113e-06, "loss": 1.2084, "step": 6990 }, { "epoch": 8.02, "eval_accuracy": 0.8478260869565217, "eval_loss": 0.6193599104881287, "eval_runtime": 16.653, "eval_samples_per_second": 2.762, "eval_steps_per_second": 2.762, "step": 6993 }, { "epoch": 9.00018018018018, "grad_norm": 0.5390567183494568, "learning_rate": 9.10910910910911e-06, "loss": 1.2249, "step": 7000 }, { "epoch": 9.00043758043758, "grad_norm": 0.24792684614658356, "learning_rate": 9.106249106249106e-06, "loss": 0.3936, "step": 7010 }, { "epoch": 9.00069498069498, "grad_norm": 0.17737895250320435, "learning_rate": 9.103389103389104e-06, "loss": 1.5842, "step": 7020 }, { "epoch": 9.00095238095238, "grad_norm": 0.08357981592416763, "learning_rate": 9.1005291005291e-06, "loss": 0.2975, "step": 7030 }, { "epoch": 9.00120978120978, "grad_norm": 0.15179596841335297, "learning_rate": 9.097669097669099e-06, "loss": 0.5302, "step": 7040 }, { "epoch": 9.00146718146718, "grad_norm": 0.021750060841441154, "learning_rate": 9.094809094809095e-06, "loss": 0.2233, "step": 7050 }, { "epoch": 9.001724581724583, "grad_norm": 357.4986267089844, "learning_rate": 9.091949091949093e-06, "loss": 1.7797, "step": 7060 }, { "epoch": 9.001981981981983, "grad_norm": 0.08269771188497543, "learning_rate": 9.08908908908909e-06, "loss": 1.0481, "step": 7070 }, { "epoch": 9.002239382239383, "grad_norm": 0.02010294981300831, "learning_rate": 9.086229086229086e-06, "loss": 1.2008, "step": 7080 }, { "epoch": 9.002496782496783, "grad_norm": 45.67758560180664, "learning_rate": 9.083369083369084e-06, "loss": 2.2412, "step": 7090 }, { "epoch": 9.002754182754183, "grad_norm": 24.944141387939453, "learning_rate": 9.08050908050908e-06, "loss": 2.3046, "step": 7100 }, { "epoch": 9.003011583011583, "grad_norm": 27.399066925048828, "learning_rate": 9.077649077649079e-06, "loss": 0.9692, "step": 7110 }, { "epoch": 9.003268983268983, "grad_norm": 2.160198450088501, "learning_rate": 9.074789074789075e-06, "loss": 0.4344, "step": 7120 }, { "epoch": 9.003526383526383, "grad_norm": 0.816569983959198, "learning_rate": 9.071929071929074e-06, "loss": 0.8706, "step": 7130 }, { "epoch": 9.003783783783783, "grad_norm": 1.0084000825881958, "learning_rate": 9.06906906906907e-06, "loss": 0.5086, "step": 7140 }, { "epoch": 9.004041184041185, "grad_norm": 0.053576670587062836, "learning_rate": 9.066209066209067e-06, "loss": 0.8812, "step": 7150 }, { "epoch": 9.004298584298585, "grad_norm": 25.146881103515625, "learning_rate": 9.063349063349065e-06, "loss": 0.5933, "step": 7160 }, { "epoch": 9.004555984555985, "grad_norm": 0.5635517239570618, "learning_rate": 9.060489060489061e-06, "loss": 1.2934, "step": 7170 }, { "epoch": 9.004813384813385, "grad_norm": 0.36832961440086365, "learning_rate": 9.057629057629058e-06, "loss": 1.8208, "step": 7180 }, { "epoch": 9.005070785070785, "grad_norm": 0.1600521355867386, "learning_rate": 9.054769054769056e-06, "loss": 0.3754, "step": 7190 }, { "epoch": 9.005328185328185, "grad_norm": 96.4287338256836, "learning_rate": 9.051909051909052e-06, "loss": 1.0538, "step": 7200 }, { "epoch": 9.005585585585585, "grad_norm": 4.262108325958252, "learning_rate": 9.04904904904905e-06, "loss": 0.4425, "step": 7210 }, { "epoch": 9.005842985842985, "grad_norm": 152.94366455078125, "learning_rate": 9.046189046189047e-06, "loss": 1.0066, "step": 7220 }, { "epoch": 9.006100386100385, "grad_norm": 0.028285371139645576, "learning_rate": 9.043329043329045e-06, "loss": 0.5249, "step": 7230 }, { "epoch": 9.006357786357785, "grad_norm": 0.15451128780841827, "learning_rate": 9.040469040469042e-06, "loss": 1.171, "step": 7240 }, { "epoch": 9.006615186615187, "grad_norm": 0.056364186108112335, "learning_rate": 9.037609037609038e-06, "loss": 0.5535, "step": 7250 }, { "epoch": 9.006872586872587, "grad_norm": 29.896148681640625, "learning_rate": 9.034749034749034e-06, "loss": 1.2453, "step": 7260 }, { "epoch": 9.007129987129987, "grad_norm": 0.19092273712158203, "learning_rate": 9.031889031889033e-06, "loss": 2.4563, "step": 7270 }, { "epoch": 9.007387387387388, "grad_norm": 2.0076355934143066, "learning_rate": 9.029029029029029e-06, "loss": 1.0114, "step": 7280 }, { "epoch": 9.007644787644788, "grad_norm": 29.020702362060547, "learning_rate": 9.026169026169027e-06, "loss": 1.0298, "step": 7290 }, { "epoch": 9.007902187902188, "grad_norm": 0.1018686518073082, "learning_rate": 9.023309023309024e-06, "loss": 1.2876, "step": 7300 }, { "epoch": 9.008159588159588, "grad_norm": 36.953407287597656, "learning_rate": 9.020449020449022e-06, "loss": 0.8832, "step": 7310 }, { "epoch": 9.008416988416988, "grad_norm": 0.04455059394240379, "learning_rate": 9.017589017589018e-06, "loss": 1.0117, "step": 7320 }, { "epoch": 9.008674388674388, "grad_norm": 34.155006408691406, "learning_rate": 9.014729014729015e-06, "loss": 0.6126, "step": 7330 }, { "epoch": 9.00893178893179, "grad_norm": 30.061325073242188, "learning_rate": 9.011869011869013e-06, "loss": 0.6105, "step": 7340 }, { "epoch": 9.00918918918919, "grad_norm": 27.52150535583496, "learning_rate": 9.00900900900901e-06, "loss": 2.3751, "step": 7350 }, { "epoch": 9.00944658944659, "grad_norm": 0.4952925443649292, "learning_rate": 9.006149006149006e-06, "loss": 0.6956, "step": 7360 }, { "epoch": 9.00970398970399, "grad_norm": 25.885974884033203, "learning_rate": 9.003289003289004e-06, "loss": 0.7876, "step": 7370 }, { "epoch": 9.00996138996139, "grad_norm": 0.8748680949211121, "learning_rate": 9.000429000429e-06, "loss": 0.9017, "step": 7380 }, { "epoch": 9.01021879021879, "grad_norm": 24.5634708404541, "learning_rate": 8.997568997568999e-06, "loss": 0.6759, "step": 7390 }, { "epoch": 9.01047619047619, "grad_norm": 24.236270904541016, "learning_rate": 8.994708994708995e-06, "loss": 0.9211, "step": 7400 }, { "epoch": 9.01073359073359, "grad_norm": 23.703262329101562, "learning_rate": 8.991848991848993e-06, "loss": 0.6189, "step": 7410 }, { "epoch": 9.01099099099099, "grad_norm": 241.41310119628906, "learning_rate": 8.98898898898899e-06, "loss": 0.8321, "step": 7420 }, { "epoch": 9.011248391248392, "grad_norm": 0.09685607999563217, "learning_rate": 8.986128986128986e-06, "loss": 0.4043, "step": 7430 }, { "epoch": 9.011505791505792, "grad_norm": 0.21560928225517273, "learning_rate": 8.983268983268984e-06, "loss": 0.2176, "step": 7440 }, { "epoch": 9.011763191763192, "grad_norm": 0.07047346979379654, "learning_rate": 8.980408980408981e-06, "loss": 0.618, "step": 7450 }, { "epoch": 9.012020592020592, "grad_norm": 0.04795054346323013, "learning_rate": 8.977548977548979e-06, "loss": 0.3734, "step": 7460 }, { "epoch": 9.012277992277992, "grad_norm": 0.0800858736038208, "learning_rate": 8.974688974688976e-06, "loss": 0.5159, "step": 7470 }, { "epoch": 9.012535392535392, "grad_norm": 0.08157502114772797, "learning_rate": 8.971828971828974e-06, "loss": 0.0621, "step": 7480 }, { "epoch": 9.012792792792792, "grad_norm": 27.266658782958984, "learning_rate": 8.96896896896897e-06, "loss": 1.2765, "step": 7490 }, { "epoch": 9.013050193050193, "grad_norm": 0.17528657615184784, "learning_rate": 8.966108966108967e-06, "loss": 2.575, "step": 7500 }, { "epoch": 9.013307593307593, "grad_norm": 0.031745266169309616, "learning_rate": 8.963248963248963e-06, "loss": 0.7197, "step": 7510 }, { "epoch": 9.013564993564994, "grad_norm": 241.27267456054688, "learning_rate": 8.960388960388961e-06, "loss": 0.671, "step": 7520 }, { "epoch": 9.013822393822394, "grad_norm": 0.04264303669333458, "learning_rate": 8.957528957528958e-06, "loss": 1.5703, "step": 7530 }, { "epoch": 9.014079794079795, "grad_norm": 0.016448037698864937, "learning_rate": 8.954668954668956e-06, "loss": 0.4535, "step": 7540 }, { "epoch": 9.014337194337195, "grad_norm": 0.028359560295939445, "learning_rate": 8.951808951808952e-06, "loss": 0.6602, "step": 7550 }, { "epoch": 9.014594594594595, "grad_norm": 0.04676564037799835, "learning_rate": 8.94894894894895e-06, "loss": 0.6422, "step": 7560 }, { "epoch": 9.014851994851995, "grad_norm": 0.31510064005851746, "learning_rate": 8.946088946088947e-06, "loss": 0.0245, "step": 7570 }, { "epoch": 9.015109395109395, "grad_norm": 0.02477765455842018, "learning_rate": 8.943228943228944e-06, "loss": 1.7651, "step": 7580 }, { "epoch": 9.015366795366795, "grad_norm": 0.023003634065389633, "learning_rate": 8.940368940368942e-06, "loss": 1.3478, "step": 7590 }, { "epoch": 9.015624195624195, "grad_norm": 0.03384900093078613, "learning_rate": 8.937508937508938e-06, "loss": 0.7807, "step": 7600 }, { "epoch": 9.015881595881597, "grad_norm": 35.01763153076172, "learning_rate": 8.934648934648935e-06, "loss": 0.6444, "step": 7610 }, { "epoch": 9.016138996138997, "grad_norm": 0.8073970675468445, "learning_rate": 8.931788931788933e-06, "loss": 0.0087, "step": 7620 }, { "epoch": 9.016396396396397, "grad_norm": 0.2138083279132843, "learning_rate": 8.92892892892893e-06, "loss": 1.3422, "step": 7630 }, { "epoch": 9.016653796653797, "grad_norm": 0.03585595265030861, "learning_rate": 8.926068926068927e-06, "loss": 0.452, "step": 7640 }, { "epoch": 9.016911196911197, "grad_norm": 0.04055719077587128, "learning_rate": 8.923208923208924e-06, "loss": 0.7551, "step": 7650 }, { "epoch": 9.017168597168597, "grad_norm": 25.380233764648438, "learning_rate": 8.920348920348922e-06, "loss": 0.7222, "step": 7660 }, { "epoch": 9.017425997425997, "grad_norm": 0.0834406390786171, "learning_rate": 8.917488917488919e-06, "loss": 0.8602, "step": 7670 }, { "epoch": 9.017683397683397, "grad_norm": 56.41606903076172, "learning_rate": 8.914628914628915e-06, "loss": 1.9781, "step": 7680 }, { "epoch": 9.017940797940797, "grad_norm": 0.08189455419778824, "learning_rate": 8.911768911768911e-06, "loss": 0.3519, "step": 7690 }, { "epoch": 9.018198198198197, "grad_norm": 0.034165192395448685, "learning_rate": 8.90890890890891e-06, "loss": 1.345, "step": 7700 }, { "epoch": 9.0184555984556, "grad_norm": 0.14859578013420105, "learning_rate": 8.906048906048906e-06, "loss": 0.2301, "step": 7710 }, { "epoch": 9.018712998713, "grad_norm": 0.040978193283081055, "learning_rate": 8.903188903188904e-06, "loss": 0.5242, "step": 7720 }, { "epoch": 9.0189703989704, "grad_norm": 1.3178437948226929, "learning_rate": 8.9003289003289e-06, "loss": 1.1051, "step": 7730 }, { "epoch": 9.0192277992278, "grad_norm": 0.22424042224884033, "learning_rate": 8.897468897468899e-06, "loss": 0.899, "step": 7740 }, { "epoch": 9.0194851994852, "grad_norm": 0.023189270868897438, "learning_rate": 8.894608894608895e-06, "loss": 1.1294, "step": 7750 }, { "epoch": 9.0197425997426, "grad_norm": 0.5692568421363831, "learning_rate": 8.891748891748892e-06, "loss": 1.3751, "step": 7760 }, { "epoch": 9.02, "grad_norm": 25.165239334106445, "learning_rate": 8.888888888888888e-06, "loss": 1.4958, "step": 7770 }, { "epoch": 9.02, "eval_accuracy": 0.8478260869565217, "eval_loss": 0.6078376173973083, "eval_runtime": 13.4317, "eval_samples_per_second": 3.425, "eval_steps_per_second": 3.425, "step": 7770 }, { "epoch": 10.0002574002574, "grad_norm": 0.1204485297203064, "learning_rate": 8.886028886028886e-06, "loss": 0.5537, "step": 7780 }, { "epoch": 10.0005148005148, "grad_norm": 0.06360988318920135, "learning_rate": 8.883168883168883e-06, "loss": 1.546, "step": 7790 }, { "epoch": 10.0007722007722, "grad_norm": 120.32550048828125, "learning_rate": 8.880308880308881e-06, "loss": 0.3902, "step": 7800 }, { "epoch": 10.0010296010296, "grad_norm": 0.15769629180431366, "learning_rate": 8.877448877448878e-06, "loss": 0.0061, "step": 7810 }, { "epoch": 10.001287001287002, "grad_norm": 0.497676819562912, "learning_rate": 8.874588874588876e-06, "loss": 0.95, "step": 7820 }, { "epoch": 10.001544401544402, "grad_norm": 6.817390441894531, "learning_rate": 8.871728871728872e-06, "loss": 0.7795, "step": 7830 }, { "epoch": 10.001801801801802, "grad_norm": 25.054075241088867, "learning_rate": 8.86886886886887e-06, "loss": 1.6924, "step": 7840 }, { "epoch": 10.002059202059202, "grad_norm": 0.5180888772010803, "learning_rate": 8.866008866008867e-06, "loss": 1.7818, "step": 7850 }, { "epoch": 10.002316602316602, "grad_norm": 96.92232513427734, "learning_rate": 8.863148863148863e-06, "loss": 1.1024, "step": 7860 }, { "epoch": 10.002574002574002, "grad_norm": 23.87799644470215, "learning_rate": 8.860288860288861e-06, "loss": 0.3728, "step": 7870 }, { "epoch": 10.002831402831402, "grad_norm": 0.5069870948791504, "learning_rate": 8.857428857428858e-06, "loss": 0.8391, "step": 7880 }, { "epoch": 10.003088803088803, "grad_norm": 0.02661108784377575, "learning_rate": 8.854568854568856e-06, "loss": 0.8022, "step": 7890 }, { "epoch": 10.003346203346203, "grad_norm": 0.21478380262851715, "learning_rate": 8.851708851708853e-06, "loss": 0.0109, "step": 7900 }, { "epoch": 10.003603603603604, "grad_norm": 3.073247194290161, "learning_rate": 8.84884884884885e-06, "loss": 1.2947, "step": 7910 }, { "epoch": 10.003861003861005, "grad_norm": 274.8877868652344, "learning_rate": 8.845988845988847e-06, "loss": 0.5697, "step": 7920 }, { "epoch": 10.004118404118405, "grad_norm": 37.88632583618164, "learning_rate": 8.843128843128844e-06, "loss": 1.3985, "step": 7930 }, { "epoch": 10.004375804375805, "grad_norm": 0.012297092005610466, "learning_rate": 8.84026884026884e-06, "loss": 0.4317, "step": 7940 }, { "epoch": 10.004633204633205, "grad_norm": 0.17633600533008575, "learning_rate": 8.837408837408838e-06, "loss": 0.6907, "step": 7950 }, { "epoch": 10.004890604890605, "grad_norm": 0.00784577988088131, "learning_rate": 8.834548834548835e-06, "loss": 1.0013, "step": 7960 }, { "epoch": 10.005148005148005, "grad_norm": 0.46426498889923096, "learning_rate": 8.831688831688833e-06, "loss": 1.081, "step": 7970 }, { "epoch": 10.005405405405405, "grad_norm": 0.21582862734794617, "learning_rate": 8.82882882882883e-06, "loss": 1.0164, "step": 7980 }, { "epoch": 10.005662805662805, "grad_norm": 159.87257385253906, "learning_rate": 8.825968825968828e-06, "loss": 1.0832, "step": 7990 }, { "epoch": 10.005920205920205, "grad_norm": 0.01719667576253414, "learning_rate": 8.823108823108824e-06, "loss": 0.9401, "step": 8000 }, { "epoch": 10.006177606177607, "grad_norm": 43.231407165527344, "learning_rate": 8.82024882024882e-06, "loss": 1.3899, "step": 8010 }, { "epoch": 10.006435006435007, "grad_norm": 0.17739209532737732, "learning_rate": 8.817388817388817e-06, "loss": 0.1192, "step": 8020 }, { "epoch": 10.006692406692407, "grad_norm": 44.295108795166016, "learning_rate": 8.814528814528815e-06, "loss": 1.9272, "step": 8030 }, { "epoch": 10.006949806949807, "grad_norm": 0.03208237141370773, "learning_rate": 8.811668811668812e-06, "loss": 0.9453, "step": 8040 }, { "epoch": 10.007207207207207, "grad_norm": 246.54135131835938, "learning_rate": 8.80880880880881e-06, "loss": 0.586, "step": 8050 }, { "epoch": 10.007464607464607, "grad_norm": 0.024292565882205963, "learning_rate": 8.805948805948806e-06, "loss": 0.306, "step": 8060 }, { "epoch": 10.007722007722007, "grad_norm": 0.043888118118047714, "learning_rate": 8.803088803088804e-06, "loss": 0.6646, "step": 8070 }, { "epoch": 10.007979407979407, "grad_norm": 0.05276341363787651, "learning_rate": 8.800228800228801e-06, "loss": 1.8646, "step": 8080 }, { "epoch": 10.008236808236807, "grad_norm": 3.0538644790649414, "learning_rate": 8.797368797368799e-06, "loss": 1.7883, "step": 8090 }, { "epoch": 10.00849420849421, "grad_norm": 0.43052423000335693, "learning_rate": 8.794508794508795e-06, "loss": 1.537, "step": 8100 }, { "epoch": 10.00875160875161, "grad_norm": 0.2308303564786911, "learning_rate": 8.791648791648792e-06, "loss": 0.6922, "step": 8110 }, { "epoch": 10.00900900900901, "grad_norm": 0.9070534706115723, "learning_rate": 8.788788788788788e-06, "loss": 0.7481, "step": 8120 }, { "epoch": 10.00926640926641, "grad_norm": 0.32801300287246704, "learning_rate": 8.785928785928787e-06, "loss": 0.6439, "step": 8130 }, { "epoch": 10.00952380952381, "grad_norm": 220.7642822265625, "learning_rate": 8.783068783068783e-06, "loss": 1.4445, "step": 8140 }, { "epoch": 10.00978120978121, "grad_norm": 0.18322208523750305, "learning_rate": 8.780208780208781e-06, "loss": 0.5422, "step": 8150 }, { "epoch": 10.01003861003861, "grad_norm": 0.15416328608989716, "learning_rate": 8.777348777348778e-06, "loss": 1.7991, "step": 8160 }, { "epoch": 10.01029601029601, "grad_norm": 0.02595771849155426, "learning_rate": 8.774488774488776e-06, "loss": 0.163, "step": 8170 }, { "epoch": 10.01055341055341, "grad_norm": 131.53567504882812, "learning_rate": 8.771628771628772e-06, "loss": 1.3368, "step": 8180 }, { "epoch": 10.010810810810812, "grad_norm": 0.39858779311180115, "learning_rate": 8.768768768768769e-06, "loss": 0.4107, "step": 8190 }, { "epoch": 10.011068211068212, "grad_norm": 0.34023207426071167, "learning_rate": 8.765908765908767e-06, "loss": 0.6566, "step": 8200 }, { "epoch": 10.011325611325612, "grad_norm": 0.4535696506500244, "learning_rate": 8.763048763048763e-06, "loss": 1.5843, "step": 8210 }, { "epoch": 10.011583011583012, "grad_norm": 0.04021540284156799, "learning_rate": 8.760188760188762e-06, "loss": 0.3503, "step": 8220 }, { "epoch": 10.011840411840412, "grad_norm": 46.56422424316406, "learning_rate": 8.757328757328758e-06, "loss": 1.2619, "step": 8230 }, { "epoch": 10.012097812097812, "grad_norm": 40.27321243286133, "learning_rate": 8.754468754468756e-06, "loss": 0.7487, "step": 8240 }, { "epoch": 10.012355212355212, "grad_norm": 25.20823097229004, "learning_rate": 8.751608751608753e-06, "loss": 1.2994, "step": 8250 }, { "epoch": 10.012612612612612, "grad_norm": 108.85746002197266, "learning_rate": 8.74874874874875e-06, "loss": 2.4518, "step": 8260 }, { "epoch": 10.012870012870012, "grad_norm": 0.0417940616607666, "learning_rate": 8.745888745888746e-06, "loss": 0.7443, "step": 8270 }, { "epoch": 10.013127413127414, "grad_norm": 0.008936642669141293, "learning_rate": 8.743028743028744e-06, "loss": 0.7142, "step": 8280 }, { "epoch": 10.013384813384814, "grad_norm": 0.036207396537065506, "learning_rate": 8.74016874016874e-06, "loss": 1.2926, "step": 8290 }, { "epoch": 10.013642213642214, "grad_norm": 0.016090108081698418, "learning_rate": 8.737308737308738e-06, "loss": 0.7859, "step": 8300 }, { "epoch": 10.013899613899614, "grad_norm": 0.013586858287453651, "learning_rate": 8.734448734448735e-06, "loss": 0.4006, "step": 8310 }, { "epoch": 10.014157014157014, "grad_norm": 0.01622426137328148, "learning_rate": 8.731588731588733e-06, "loss": 1.5877, "step": 8320 }, { "epoch": 10.014414414414414, "grad_norm": 0.1665509194135666, "learning_rate": 8.72872872872873e-06, "loss": 0.8227, "step": 8330 }, { "epoch": 10.014671814671814, "grad_norm": 0.13879388570785522, "learning_rate": 8.725868725868728e-06, "loss": 1.5522, "step": 8340 }, { "epoch": 10.014929214929214, "grad_norm": 44.89026641845703, "learning_rate": 8.723008723008724e-06, "loss": 1.3654, "step": 8350 }, { "epoch": 10.015186615186614, "grad_norm": 0.5491271018981934, "learning_rate": 8.72014872014872e-06, "loss": 1.0478, "step": 8360 }, { "epoch": 10.015444015444016, "grad_norm": 10.410693168640137, "learning_rate": 8.717288717288717e-06, "loss": 0.7527, "step": 8370 }, { "epoch": 10.015701415701416, "grad_norm": 0.08085234463214874, "learning_rate": 8.714428714428715e-06, "loss": 1.1404, "step": 8380 }, { "epoch": 10.015958815958816, "grad_norm": 0.05885207653045654, "learning_rate": 8.711568711568712e-06, "loss": 0.6541, "step": 8390 }, { "epoch": 10.016216216216216, "grad_norm": 0.12580057978630066, "learning_rate": 8.70870870870871e-06, "loss": 1.6544, "step": 8400 }, { "epoch": 10.016473616473617, "grad_norm": 0.2117607593536377, "learning_rate": 8.705848705848706e-06, "loss": 0.9909, "step": 8410 }, { "epoch": 10.016731016731017, "grad_norm": 0.22397179901599884, "learning_rate": 8.702988702988705e-06, "loss": 0.8806, "step": 8420 }, { "epoch": 10.016988416988417, "grad_norm": 25.2739200592041, "learning_rate": 8.700128700128701e-06, "loss": 1.1715, "step": 8430 }, { "epoch": 10.017245817245817, "grad_norm": 36.23960876464844, "learning_rate": 8.697268697268697e-06, "loss": 1.3008, "step": 8440 }, { "epoch": 10.017503217503217, "grad_norm": 0.24737899005413055, "learning_rate": 8.694408694408694e-06, "loss": 0.7906, "step": 8450 }, { "epoch": 10.017760617760617, "grad_norm": 37.60052490234375, "learning_rate": 8.691548691548692e-06, "loss": 1.56, "step": 8460 }, { "epoch": 10.018018018018019, "grad_norm": 56.47618865966797, "learning_rate": 8.688688688688689e-06, "loss": 1.2833, "step": 8470 }, { "epoch": 10.018275418275419, "grad_norm": 6.734436511993408, "learning_rate": 8.685828685828687e-06, "loss": 0.9528, "step": 8480 }, { "epoch": 10.018532818532819, "grad_norm": 9.789285659790039, "learning_rate": 8.682968682968683e-06, "loss": 1.0823, "step": 8490 }, { "epoch": 10.018790218790219, "grad_norm": 69.39130401611328, "learning_rate": 8.680108680108681e-06, "loss": 0.7756, "step": 8500 }, { "epoch": 10.019047619047619, "grad_norm": 0.3788280189037323, "learning_rate": 8.677248677248678e-06, "loss": 1.2168, "step": 8510 }, { "epoch": 10.019305019305019, "grad_norm": 32.36293411254883, "learning_rate": 8.674388674388674e-06, "loss": 1.4968, "step": 8520 }, { "epoch": 10.019562419562419, "grad_norm": 120.9354476928711, "learning_rate": 8.671528671528672e-06, "loss": 1.799, "step": 8530 }, { "epoch": 10.01981981981982, "grad_norm": 0.5419692993164062, "learning_rate": 8.668668668668669e-06, "loss": 1.7393, "step": 8540 }, { "epoch": 10.02, "eval_accuracy": 0.7608695652173914, "eval_loss": 0.7950884699821472, "eval_runtime": 13.4666, "eval_samples_per_second": 3.416, "eval_steps_per_second": 3.416, "step": 8547 }, { "epoch": 11.00007722007722, "grad_norm": 0.28460660576820374, "learning_rate": 8.665808665808665e-06, "loss": 0.878, "step": 8550 }, { "epoch": 11.00033462033462, "grad_norm": 24.105052947998047, "learning_rate": 8.662948662948664e-06, "loss": 1.2981, "step": 8560 }, { "epoch": 11.00059202059202, "grad_norm": 39.04997634887695, "learning_rate": 8.660088660088662e-06, "loss": 0.4637, "step": 8570 }, { "epoch": 11.000849420849422, "grad_norm": 0.8172537088394165, "learning_rate": 8.657228657228658e-06, "loss": 0.9727, "step": 8580 }, { "epoch": 11.001106821106822, "grad_norm": 0.018035829067230225, "learning_rate": 8.654368654368656e-06, "loss": 0.8219, "step": 8590 }, { "epoch": 11.001364221364222, "grad_norm": 100.51970672607422, "learning_rate": 8.651508651508653e-06, "loss": 0.4128, "step": 8600 }, { "epoch": 11.001621621621622, "grad_norm": 0.02792496792972088, "learning_rate": 8.64864864864865e-06, "loss": 1.9699, "step": 8610 }, { "epoch": 11.001879021879022, "grad_norm": 54.30400085449219, "learning_rate": 8.645788645788646e-06, "loss": 2.0642, "step": 8620 }, { "epoch": 11.002136422136422, "grad_norm": 72.56423950195312, "learning_rate": 8.642928642928644e-06, "loss": 0.299, "step": 8630 }, { "epoch": 11.002393822393822, "grad_norm": 0.7110329866409302, "learning_rate": 8.64006864006864e-06, "loss": 0.1974, "step": 8640 }, { "epoch": 11.002651222651222, "grad_norm": 587.3732299804688, "learning_rate": 8.637208637208639e-06, "loss": 0.9534, "step": 8650 }, { "epoch": 11.002908622908622, "grad_norm": 6.628912448883057, "learning_rate": 8.634348634348635e-06, "loss": 1.8159, "step": 8660 }, { "epoch": 11.003166023166024, "grad_norm": 35.30387878417969, "learning_rate": 8.631488631488633e-06, "loss": 0.3854, "step": 8670 }, { "epoch": 11.003423423423424, "grad_norm": 0.421171098947525, "learning_rate": 8.62862862862863e-06, "loss": 1.1167, "step": 8680 }, { "epoch": 11.003680823680824, "grad_norm": 0.05929442122578621, "learning_rate": 8.625768625768626e-06, "loss": 0.8298, "step": 8690 }, { "epoch": 11.003938223938224, "grad_norm": 0.041819799691438675, "learning_rate": 8.622908622908623e-06, "loss": 1.2108, "step": 8700 }, { "epoch": 11.004195624195624, "grad_norm": 0.06778862327337265, "learning_rate": 8.62004862004862e-06, "loss": 0.7713, "step": 8710 }, { "epoch": 11.004453024453024, "grad_norm": 40.76213455200195, "learning_rate": 8.617188617188617e-06, "loss": 1.4024, "step": 8720 }, { "epoch": 11.004710424710424, "grad_norm": 0.07441040128469467, "learning_rate": 8.614328614328615e-06, "loss": 0.7176, "step": 8730 }, { "epoch": 11.004967824967824, "grad_norm": 0.007878212258219719, "learning_rate": 8.611468611468612e-06, "loss": 0.0048, "step": 8740 }, { "epoch": 11.005225225225225, "grad_norm": 0.04463992267847061, "learning_rate": 8.60860860860861e-06, "loss": 0.4363, "step": 8750 }, { "epoch": 11.005482625482626, "grad_norm": 87.40447235107422, "learning_rate": 8.605748605748607e-06, "loss": 0.7171, "step": 8760 }, { "epoch": 11.005740025740026, "grad_norm": 0.3318406343460083, "learning_rate": 8.602888602888605e-06, "loss": 1.4153, "step": 8770 }, { "epoch": 11.005997425997426, "grad_norm": 0.25054681301116943, "learning_rate": 8.600028600028601e-06, "loss": 0.4377, "step": 8780 }, { "epoch": 11.006254826254827, "grad_norm": 0.2337009757757187, "learning_rate": 8.597168597168598e-06, "loss": 0.0044, "step": 8790 }, { "epoch": 11.006512226512227, "grad_norm": 0.26338788866996765, "learning_rate": 8.594308594308594e-06, "loss": 0.4798, "step": 8800 }, { "epoch": 11.006769626769627, "grad_norm": 24.581886291503906, "learning_rate": 8.591448591448592e-06, "loss": 2.3055, "step": 8810 }, { "epoch": 11.007027027027027, "grad_norm": 0.10813351720571518, "learning_rate": 8.588588588588589e-06, "loss": 0.1178, "step": 8820 }, { "epoch": 11.007284427284427, "grad_norm": 313.4660949707031, "learning_rate": 8.585728585728587e-06, "loss": 2.1168, "step": 8830 }, { "epoch": 11.007541827541827, "grad_norm": 3.0175836086273193, "learning_rate": 8.582868582868583e-06, "loss": 1.697, "step": 8840 }, { "epoch": 11.007799227799227, "grad_norm": 0.008230429142713547, "learning_rate": 8.580008580008582e-06, "loss": 1.0399, "step": 8850 }, { "epoch": 11.008056628056629, "grad_norm": 24.00910758972168, "learning_rate": 8.577148577148578e-06, "loss": 0.7707, "step": 8860 }, { "epoch": 11.008314028314029, "grad_norm": 160.81443786621094, "learning_rate": 8.574288574288574e-06, "loss": 1.0067, "step": 8870 }, { "epoch": 11.008571428571429, "grad_norm": 250.05177307128906, "learning_rate": 8.571428571428571e-06, "loss": 0.5634, "step": 8880 }, { "epoch": 11.008828828828829, "grad_norm": 0.2900729179382324, "learning_rate": 8.568568568568569e-06, "loss": 1.7788, "step": 8890 }, { "epoch": 11.009086229086229, "grad_norm": 0.0074234092608094215, "learning_rate": 8.565708565708566e-06, "loss": 1.3323, "step": 8900 }, { "epoch": 11.009343629343629, "grad_norm": 0.32027724385261536, "learning_rate": 8.562848562848564e-06, "loss": 0.5043, "step": 8910 }, { "epoch": 11.00960102960103, "grad_norm": 2.96531343460083, "learning_rate": 8.55998855998856e-06, "loss": 0.6105, "step": 8920 }, { "epoch": 11.00985842985843, "grad_norm": 0.34536412358283997, "learning_rate": 8.557128557128558e-06, "loss": 0.9915, "step": 8930 }, { "epoch": 11.01011583011583, "grad_norm": 5.191471099853516, "learning_rate": 8.554268554268555e-06, "loss": 0.5698, "step": 8940 }, { "epoch": 11.010373230373231, "grad_norm": 0.3449559509754181, "learning_rate": 8.551408551408551e-06, "loss": 0.004, "step": 8950 }, { "epoch": 11.010630630630631, "grad_norm": 0.012626473791897297, "learning_rate": 8.54854854854855e-06, "loss": 1.3268, "step": 8960 }, { "epoch": 11.010888030888031, "grad_norm": 0.15182413160800934, "learning_rate": 8.545688545688546e-06, "loss": 1.576, "step": 8970 }, { "epoch": 11.011145431145431, "grad_norm": 0.5726296305656433, "learning_rate": 8.542828542828544e-06, "loss": 0.606, "step": 8980 }, { "epoch": 11.011402831402831, "grad_norm": 0.0075800674967467785, "learning_rate": 8.53996853996854e-06, "loss": 0.4473, "step": 8990 }, { "epoch": 11.011660231660231, "grad_norm": 25.251062393188477, "learning_rate": 8.537108537108539e-06, "loss": 2.1039, "step": 9000 }, { "epoch": 11.011917631917632, "grad_norm": 22.96194076538086, "learning_rate": 8.534248534248535e-06, "loss": 2.052, "step": 9010 }, { "epoch": 11.012175032175032, "grad_norm": 75.685791015625, "learning_rate": 8.531388531388533e-06, "loss": 1.1469, "step": 9020 }, { "epoch": 11.012432432432432, "grad_norm": 0.6149288415908813, "learning_rate": 8.52852852852853e-06, "loss": 0.4877, "step": 9030 }, { "epoch": 11.012689832689833, "grad_norm": 0.15368354320526123, "learning_rate": 8.525668525668526e-06, "loss": 0.6674, "step": 9040 }, { "epoch": 11.012947232947234, "grad_norm": 277.51312255859375, "learning_rate": 8.522808522808523e-06, "loss": 2.0748, "step": 9050 }, { "epoch": 11.013204633204634, "grad_norm": 0.009949028491973877, "learning_rate": 8.519948519948521e-06, "loss": 0.382, "step": 9060 }, { "epoch": 11.013462033462034, "grad_norm": 33.68278121948242, "learning_rate": 8.517088517088517e-06, "loss": 0.9996, "step": 9070 }, { "epoch": 11.013719433719434, "grad_norm": 0.07025768607854843, "learning_rate": 8.514228514228516e-06, "loss": 0.7772, "step": 9080 }, { "epoch": 11.013976833976834, "grad_norm": 0.5183196663856506, "learning_rate": 8.511368511368512e-06, "loss": 0.0227, "step": 9090 }, { "epoch": 11.014234234234234, "grad_norm": 0.3555748462677002, "learning_rate": 8.50850850850851e-06, "loss": 2.1808, "step": 9100 }, { "epoch": 11.014491634491634, "grad_norm": 383.48785400390625, "learning_rate": 8.505648505648507e-06, "loss": 1.5736, "step": 9110 }, { "epoch": 11.014749034749034, "grad_norm": 0.4379149377346039, "learning_rate": 8.502788502788503e-06, "loss": 0.3884, "step": 9120 }, { "epoch": 11.015006435006436, "grad_norm": 0.4699688255786896, "learning_rate": 8.4999284999285e-06, "loss": 0.4054, "step": 9130 }, { "epoch": 11.015263835263836, "grad_norm": 1.8073198795318604, "learning_rate": 8.497068497068498e-06, "loss": 0.4486, "step": 9140 }, { "epoch": 11.015521235521236, "grad_norm": 297.4031066894531, "learning_rate": 8.494208494208494e-06, "loss": 1.0507, "step": 9150 }, { "epoch": 11.015778635778636, "grad_norm": 0.2800242602825165, "learning_rate": 8.491348491348492e-06, "loss": 0.8891, "step": 9160 }, { "epoch": 11.016036036036036, "grad_norm": 0.012133017182350159, "learning_rate": 8.488488488488489e-06, "loss": 0.4816, "step": 9170 }, { "epoch": 11.016293436293436, "grad_norm": 0.12698222696781158, "learning_rate": 8.485628485628487e-06, "loss": 1.0889, "step": 9180 }, { "epoch": 11.016550836550836, "grad_norm": 0.012583434581756592, "learning_rate": 8.482768482768483e-06, "loss": 1.4182, "step": 9190 }, { "epoch": 11.016808236808236, "grad_norm": 0.011629793792963028, "learning_rate": 8.47990847990848e-06, "loss": 0.3926, "step": 9200 }, { "epoch": 11.017065637065636, "grad_norm": 297.43707275390625, "learning_rate": 8.477048477048478e-06, "loss": 1.2325, "step": 9210 }, { "epoch": 11.017323037323038, "grad_norm": 1184.72265625, "learning_rate": 8.474188474188475e-06, "loss": 1.7258, "step": 9220 }, { "epoch": 11.017580437580438, "grad_norm": 0.061885394155979156, "learning_rate": 8.471328471328471e-06, "loss": 0.7547, "step": 9230 }, { "epoch": 11.017837837837838, "grad_norm": 101.12828826904297, "learning_rate": 8.46846846846847e-06, "loss": 1.0483, "step": 9240 }, { "epoch": 11.018095238095238, "grad_norm": 0.5181973576545715, "learning_rate": 8.465608465608466e-06, "loss": 0.4266, "step": 9250 }, { "epoch": 11.018352638352638, "grad_norm": 1.9151122570037842, "learning_rate": 8.462748462748464e-06, "loss": 1.0087, "step": 9260 }, { "epoch": 11.018610038610039, "grad_norm": 0.022047946229577065, "learning_rate": 8.45988845988846e-06, "loss": 0.6823, "step": 9270 }, { "epoch": 11.018867438867439, "grad_norm": 417.81298828125, "learning_rate": 8.457028457028458e-06, "loss": 0.3463, "step": 9280 }, { "epoch": 11.019124839124839, "grad_norm": 10.910456657409668, "learning_rate": 8.454168454168455e-06, "loss": 0.8723, "step": 9290 }, { "epoch": 11.019382239382239, "grad_norm": 0.32493460178375244, "learning_rate": 8.451308451308451e-06, "loss": 0.6945, "step": 9300 }, { "epoch": 11.019639639639639, "grad_norm": 0.010908570140600204, "learning_rate": 8.44844844844845e-06, "loss": 0.0103, "step": 9310 }, { "epoch": 11.01989703989704, "grad_norm": 0.07723493129014969, "learning_rate": 8.445588445588446e-06, "loss": 1.0987, "step": 9320 }, { "epoch": 11.02, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.7859626412391663, "eval_runtime": 13.4764, "eval_samples_per_second": 3.413, "eval_steps_per_second": 3.413, "step": 9324 }, { "epoch": 12.00015444015444, "grad_norm": 0.47433146834373474, "learning_rate": 8.442728442728444e-06, "loss": 1.2386, "step": 9330 }, { "epoch": 12.000411840411841, "grad_norm": 0.011013006791472435, "learning_rate": 8.43986843986844e-06, "loss": 0.378, "step": 9340 }, { "epoch": 12.000669240669241, "grad_norm": 0.00891246646642685, "learning_rate": 8.437008437008439e-06, "loss": 0.3884, "step": 9350 }, { "epoch": 12.000926640926641, "grad_norm": 0.01927793025970459, "learning_rate": 8.434148434148435e-06, "loss": 0.5555, "step": 9360 }, { "epoch": 12.001184041184041, "grad_norm": 2.8038787841796875, "learning_rate": 8.431288431288432e-06, "loss": 1.0693, "step": 9370 }, { "epoch": 12.001441441441441, "grad_norm": 802.3114013671875, "learning_rate": 8.428428428428428e-06, "loss": 1.8102, "step": 9380 }, { "epoch": 12.001698841698841, "grad_norm": 0.3516940176486969, "learning_rate": 8.425568425568426e-06, "loss": 0.9108, "step": 9390 }, { "epoch": 12.001956241956242, "grad_norm": 0.00919923186302185, "learning_rate": 8.422708422708423e-06, "loss": 0.8663, "step": 9400 }, { "epoch": 12.002213642213642, "grad_norm": 0.17423267662525177, "learning_rate": 8.419848419848421e-06, "loss": 1.6467, "step": 9410 }, { "epoch": 12.002471042471042, "grad_norm": 0.010266111232340336, "learning_rate": 8.416988416988418e-06, "loss": 0.4335, "step": 9420 }, { "epoch": 12.002728442728444, "grad_norm": 0.29146385192871094, "learning_rate": 8.414128414128416e-06, "loss": 0.7404, "step": 9430 }, { "epoch": 12.002985842985844, "grad_norm": 0.3227342963218689, "learning_rate": 8.411268411268412e-06, "loss": 0.4085, "step": 9440 }, { "epoch": 12.003243243243244, "grad_norm": 46.19145965576172, "learning_rate": 8.408408408408409e-06, "loss": 1.2482, "step": 9450 }, { "epoch": 12.003500643500644, "grad_norm": 0.007554527837783098, "learning_rate": 8.405548405548407e-06, "loss": 1.2104, "step": 9460 }, { "epoch": 12.003758043758044, "grad_norm": 0.6184163689613342, "learning_rate": 8.402688402688403e-06, "loss": 0.7161, "step": 9470 }, { "epoch": 12.004015444015444, "grad_norm": 1685.7156982421875, "learning_rate": 8.3998283998284e-06, "loss": 1.4936, "step": 9480 }, { "epoch": 12.004272844272844, "grad_norm": 410.31732177734375, "learning_rate": 8.396968396968398e-06, "loss": 1.1704, "step": 9490 }, { "epoch": 12.004530244530244, "grad_norm": 15.203624725341797, "learning_rate": 8.394108394108394e-06, "loss": 0.0113, "step": 9500 }, { "epoch": 12.004787644787644, "grad_norm": 0.34546852111816406, "learning_rate": 8.391248391248393e-06, "loss": 0.9388, "step": 9510 }, { "epoch": 12.005045045045046, "grad_norm": 0.10535982996225357, "learning_rate": 8.388388388388389e-06, "loss": 0.4243, "step": 9520 }, { "epoch": 12.005302445302446, "grad_norm": 0.005769877228885889, "learning_rate": 8.385528385528387e-06, "loss": 0.8682, "step": 9530 }, { "epoch": 12.005559845559846, "grad_norm": 0.0826544538140297, "learning_rate": 8.382668382668384e-06, "loss": 1.2218, "step": 9540 }, { "epoch": 12.005817245817246, "grad_norm": 0.014597934670746326, "learning_rate": 8.37980837980838e-06, "loss": 0.8302, "step": 9550 }, { "epoch": 12.006074646074646, "grad_norm": 40.39971160888672, "learning_rate": 8.376948376948377e-06, "loss": 1.5042, "step": 9560 }, { "epoch": 12.006332046332046, "grad_norm": 98.84669494628906, "learning_rate": 8.374088374088375e-06, "loss": 0.9766, "step": 9570 }, { "epoch": 12.006589446589446, "grad_norm": 0.027279607951641083, "learning_rate": 8.371228371228371e-06, "loss": 0.7949, "step": 9580 }, { "epoch": 12.006846846846846, "grad_norm": 0.5903051495552063, "learning_rate": 8.36836836836837e-06, "loss": 1.9912, "step": 9590 }, { "epoch": 12.007104247104246, "grad_norm": 0.3788576126098633, "learning_rate": 8.365508365508366e-06, "loss": 0.0129, "step": 9600 }, { "epoch": 12.007361647361648, "grad_norm": 0.021588651463389397, "learning_rate": 8.362648362648364e-06, "loss": 2.1048, "step": 9610 }, { "epoch": 12.007619047619048, "grad_norm": 0.36952582001686096, "learning_rate": 8.35978835978836e-06, "loss": 1.2766, "step": 9620 }, { "epoch": 12.007876447876448, "grad_norm": 0.09042210876941681, "learning_rate": 8.356928356928357e-06, "loss": 0.7546, "step": 9630 }, { "epoch": 12.008133848133848, "grad_norm": 2171.077880859375, "learning_rate": 8.354068354068355e-06, "loss": 1.81, "step": 9640 }, { "epoch": 12.008391248391248, "grad_norm": 0.14514465630054474, "learning_rate": 8.351208351208352e-06, "loss": 1.5139, "step": 9650 }, { "epoch": 12.008648648648649, "grad_norm": 0.049092844128608704, "learning_rate": 8.348348348348348e-06, "loss": 1.7561, "step": 9660 }, { "epoch": 12.008906048906049, "grad_norm": 30.588136672973633, "learning_rate": 8.345488345488346e-06, "loss": 0.4941, "step": 9670 }, { "epoch": 12.009163449163449, "grad_norm": 0.8432363271713257, "learning_rate": 8.342628342628343e-06, "loss": 0.7714, "step": 9680 }, { "epoch": 12.009420849420849, "grad_norm": 1.5549473762512207, "learning_rate": 8.33976833976834e-06, "loss": 1.1233, "step": 9690 }, { "epoch": 12.009678249678249, "grad_norm": 1.1150774955749512, "learning_rate": 8.336908336908337e-06, "loss": 0.7609, "step": 9700 }, { "epoch": 12.00993564993565, "grad_norm": 0.13375124335289001, "learning_rate": 8.334048334048335e-06, "loss": 0.3168, "step": 9710 }, { "epoch": 12.01019305019305, "grad_norm": 0.010085425339639187, "learning_rate": 8.331188331188332e-06, "loss": 0.123, "step": 9720 }, { "epoch": 12.01045045045045, "grad_norm": 0.4398421347141266, "learning_rate": 8.328328328328328e-06, "loss": 0.0059, "step": 9730 }, { "epoch": 12.01070785070785, "grad_norm": 0.03640660271048546, "learning_rate": 8.325468325468327e-06, "loss": 1.4955, "step": 9740 }, { "epoch": 12.010965250965251, "grad_norm": 0.0063200341537594795, "learning_rate": 8.322608322608323e-06, "loss": 1.0175, "step": 9750 }, { "epoch": 12.011222651222651, "grad_norm": 0.2818882167339325, "learning_rate": 8.319748319748321e-06, "loss": 0.6537, "step": 9760 }, { "epoch": 12.011480051480051, "grad_norm": 30.326845169067383, "learning_rate": 8.316888316888318e-06, "loss": 0.8632, "step": 9770 }, { "epoch": 12.011737451737451, "grad_norm": 0.115543894469738, "learning_rate": 8.314028314028316e-06, "loss": 0.3636, "step": 9780 }, { "epoch": 12.011994851994851, "grad_norm": 1787.333740234375, "learning_rate": 8.311168311168312e-06, "loss": 1.3063, "step": 9790 }, { "epoch": 12.012252252252253, "grad_norm": 0.19999530911445618, "learning_rate": 8.308308308308309e-06, "loss": 0.8213, "step": 9800 }, { "epoch": 12.012509652509653, "grad_norm": 4.1851654052734375, "learning_rate": 8.305448305448305e-06, "loss": 0.0428, "step": 9810 }, { "epoch": 12.012767052767053, "grad_norm": 0.09847444295883179, "learning_rate": 8.302588302588303e-06, "loss": 0.4367, "step": 9820 }, { "epoch": 12.013024453024453, "grad_norm": 0.3852860629558563, "learning_rate": 8.2997282997283e-06, "loss": 1.4293, "step": 9830 }, { "epoch": 12.013281853281853, "grad_norm": 0.13400162756443024, "learning_rate": 8.296868296868298e-06, "loss": 0.0083, "step": 9840 }, { "epoch": 12.013539253539253, "grad_norm": 0.18309134244918823, "learning_rate": 8.294008294008295e-06, "loss": 0.114, "step": 9850 }, { "epoch": 12.013796653796653, "grad_norm": 0.12962332367897034, "learning_rate": 8.291148291148293e-06, "loss": 1.1698, "step": 9860 }, { "epoch": 12.014054054054053, "grad_norm": 0.711496114730835, "learning_rate": 8.288288288288289e-06, "loss": 1.2569, "step": 9870 }, { "epoch": 12.014311454311454, "grad_norm": 33.74251937866211, "learning_rate": 8.285428285428286e-06, "loss": 2.4428, "step": 9880 }, { "epoch": 12.014568854568855, "grad_norm": 0.04531510919332504, "learning_rate": 8.282568282568284e-06, "loss": 1.7002, "step": 9890 }, { "epoch": 12.014826254826255, "grad_norm": 55.16579818725586, "learning_rate": 8.27970827970828e-06, "loss": 0.3482, "step": 9900 }, { "epoch": 12.015083655083656, "grad_norm": 0.34218963980674744, "learning_rate": 8.276848276848277e-06, "loss": 0.7469, "step": 9910 }, { "epoch": 12.015341055341056, "grad_norm": 39.31129837036133, "learning_rate": 8.273988273988275e-06, "loss": 1.3403, "step": 9920 }, { "epoch": 12.015598455598456, "grad_norm": 0.025866910815238953, "learning_rate": 8.271128271128271e-06, "loss": 0.6076, "step": 9930 }, { "epoch": 12.015855855855856, "grad_norm": 58.11125946044922, "learning_rate": 8.26826826826827e-06, "loss": 1.2356, "step": 9940 }, { "epoch": 12.016113256113256, "grad_norm": 0.6136714816093445, "learning_rate": 8.265408265408266e-06, "loss": 1.3545, "step": 9950 }, { "epoch": 12.016370656370656, "grad_norm": 0.12369683384895325, "learning_rate": 8.262548262548264e-06, "loss": 0.7099, "step": 9960 }, { "epoch": 12.016628056628056, "grad_norm": 0.0038368524983525276, "learning_rate": 8.25968825968826e-06, "loss": 0.5042, "step": 9970 }, { "epoch": 12.016885456885458, "grad_norm": 33.09714889526367, "learning_rate": 8.256828256828257e-06, "loss": 1.0448, "step": 9980 }, { "epoch": 12.017142857142858, "grad_norm": 0.4139602780342102, "learning_rate": 8.253968253968254e-06, "loss": 0.015, "step": 9990 }, { "epoch": 12.017400257400258, "grad_norm": 0.0036292793229222298, "learning_rate": 8.251108251108252e-06, "loss": 0.0043, "step": 10000 }, { "epoch": 12.017657657657658, "grad_norm": 0.04275906831026077, "learning_rate": 8.248248248248248e-06, "loss": 1.3682, "step": 10010 }, { "epoch": 12.017915057915058, "grad_norm": 0.4722258150577545, "learning_rate": 8.245388245388246e-06, "loss": 1.6312, "step": 10020 }, { "epoch": 12.018172458172458, "grad_norm": 24.255712509155273, "learning_rate": 8.242528242528243e-06, "loss": 1.44, "step": 10030 }, { "epoch": 12.018429858429858, "grad_norm": 1.5887428522109985, "learning_rate": 8.239668239668241e-06, "loss": 0.3472, "step": 10040 }, { "epoch": 12.018687258687258, "grad_norm": 0.007095601875334978, "learning_rate": 8.236808236808237e-06, "loss": 1.1121, "step": 10050 }, { "epoch": 12.018944658944658, "grad_norm": 0.10487360507249832, "learning_rate": 8.233948233948234e-06, "loss": 0.0206, "step": 10060 }, { "epoch": 12.01920205920206, "grad_norm": 528.3002319335938, "learning_rate": 8.231088231088232e-06, "loss": 0.8335, "step": 10070 }, { "epoch": 12.01945945945946, "grad_norm": 0.003817453049123287, "learning_rate": 8.228228228228229e-06, "loss": 1.8427, "step": 10080 }, { "epoch": 12.01971685971686, "grad_norm": 41.397342681884766, "learning_rate": 8.225368225368227e-06, "loss": 2.466, "step": 10090 }, { "epoch": 12.01997425997426, "grad_norm": 1.2678889036178589, "learning_rate": 8.222508222508223e-06, "loss": 0.8703, "step": 10100 }, { "epoch": 12.02, "eval_accuracy": 0.782608695652174, "eval_loss": 0.9878782033920288, "eval_runtime": 13.4438, "eval_samples_per_second": 3.422, "eval_steps_per_second": 3.422, "step": 10101 }, { "epoch": 13.00023166023166, "grad_norm": 34.533180236816406, "learning_rate": 8.219648219648221e-06, "loss": 1.276, "step": 10110 }, { "epoch": 13.00048906048906, "grad_norm": 0.05692519620060921, "learning_rate": 8.216788216788218e-06, "loss": 0.3495, "step": 10120 }, { "epoch": 13.000746460746461, "grad_norm": 39.88087844848633, "learning_rate": 8.213928213928214e-06, "loss": 0.8957, "step": 10130 }, { "epoch": 13.001003861003861, "grad_norm": 0.8153846859931946, "learning_rate": 8.211068211068212e-06, "loss": 0.4844, "step": 10140 }, { "epoch": 13.001261261261261, "grad_norm": 0.43959328532218933, "learning_rate": 8.208208208208209e-06, "loss": 0.3933, "step": 10150 }, { "epoch": 13.001518661518661, "grad_norm": 0.044305652379989624, "learning_rate": 8.205348205348205e-06, "loss": 0.8691, "step": 10160 }, { "epoch": 13.001776061776061, "grad_norm": 0.014474362134933472, "learning_rate": 8.202488202488204e-06, "loss": 1.1859, "step": 10170 }, { "epoch": 13.002033462033461, "grad_norm": 0.018979106098413467, "learning_rate": 8.1996281996282e-06, "loss": 0.0387, "step": 10180 }, { "epoch": 13.002290862290863, "grad_norm": 0.03073548711836338, "learning_rate": 8.196768196768198e-06, "loss": 0.9004, "step": 10190 }, { "epoch": 13.002548262548263, "grad_norm": 0.06650511175394058, "learning_rate": 8.193908193908195e-06, "loss": 1.8394, "step": 10200 }, { "epoch": 13.002805662805663, "grad_norm": 0.024794111028313637, "learning_rate": 8.191048191048193e-06, "loss": 0.8927, "step": 10210 }, { "epoch": 13.003063063063063, "grad_norm": 0.039084263145923615, "learning_rate": 8.18818818818819e-06, "loss": 0.4123, "step": 10220 }, { "epoch": 13.003320463320463, "grad_norm": 0.06459786742925644, "learning_rate": 8.185328185328186e-06, "loss": 0.3527, "step": 10230 }, { "epoch": 13.003577863577863, "grad_norm": 44.55921173095703, "learning_rate": 8.182468182468182e-06, "loss": 0.0077, "step": 10240 }, { "epoch": 13.003835263835263, "grad_norm": 0.22598184645175934, "learning_rate": 8.17960817960818e-06, "loss": 1.4898, "step": 10250 }, { "epoch": 13.004092664092664, "grad_norm": 204.12083435058594, "learning_rate": 8.176748176748177e-06, "loss": 0.0211, "step": 10260 }, { "epoch": 13.004350064350064, "grad_norm": 213.62789916992188, "learning_rate": 8.173888173888175e-06, "loss": 0.7785, "step": 10270 }, { "epoch": 13.004607464607465, "grad_norm": 0.017117006704211235, "learning_rate": 8.171028171028171e-06, "loss": 0.2757, "step": 10280 }, { "epoch": 13.004864864864865, "grad_norm": 0.24598395824432373, "learning_rate": 8.16816816816817e-06, "loss": 1.5201, "step": 10290 }, { "epoch": 13.005122265122266, "grad_norm": 0.04848215728998184, "learning_rate": 8.165308165308166e-06, "loss": 0.8018, "step": 10300 }, { "epoch": 13.005379665379666, "grad_norm": 0.005087207071483135, "learning_rate": 8.162448162448163e-06, "loss": 1.3702, "step": 10310 }, { "epoch": 13.005637065637066, "grad_norm": 0.22931011021137238, "learning_rate": 8.159588159588159e-06, "loss": 1.4593, "step": 10320 }, { "epoch": 13.005894465894466, "grad_norm": 0.7556596994400024, "learning_rate": 8.156728156728157e-06, "loss": 0.6623, "step": 10330 }, { "epoch": 13.006151866151866, "grad_norm": 178.25360107421875, "learning_rate": 8.153868153868154e-06, "loss": 0.9403, "step": 10340 }, { "epoch": 13.006409266409266, "grad_norm": 0.3808182179927826, "learning_rate": 8.151008151008152e-06, "loss": 0.9131, "step": 10350 }, { "epoch": 13.006666666666666, "grad_norm": 0.014495433308184147, "learning_rate": 8.148148148148148e-06, "loss": 0.4056, "step": 10360 }, { "epoch": 13.006924066924068, "grad_norm": 27.801916122436523, "learning_rate": 8.145288145288146e-06, "loss": 1.3077, "step": 10370 }, { "epoch": 13.007181467181468, "grad_norm": 55.3273811340332, "learning_rate": 8.142428142428143e-06, "loss": 0.6712, "step": 10380 }, { "epoch": 13.007438867438868, "grad_norm": 0.006372170057147741, "learning_rate": 8.139568139568141e-06, "loss": 1.3903, "step": 10390 }, { "epoch": 13.007696267696268, "grad_norm": 24.87656593322754, "learning_rate": 8.136708136708138e-06, "loss": 2.7086, "step": 10400 }, { "epoch": 13.007953667953668, "grad_norm": 0.33655011653900146, "learning_rate": 8.133848133848134e-06, "loss": 0.9958, "step": 10410 }, { "epoch": 13.008211068211068, "grad_norm": 0.038330886512994766, "learning_rate": 8.13098813098813e-06, "loss": 1.4135, "step": 10420 }, { "epoch": 13.008468468468468, "grad_norm": 0.3756619393825531, "learning_rate": 8.128128128128129e-06, "loss": 0.7018, "step": 10430 }, { "epoch": 13.008725868725868, "grad_norm": 0.04136725515127182, "learning_rate": 8.125268125268127e-06, "loss": 0.7366, "step": 10440 }, { "epoch": 13.008983268983268, "grad_norm": 27.85120391845703, "learning_rate": 8.122408122408123e-06, "loss": 0.8004, "step": 10450 }, { "epoch": 13.009240669240668, "grad_norm": 0.12497317045927048, "learning_rate": 8.119548119548121e-06, "loss": 0.789, "step": 10460 }, { "epoch": 13.00949806949807, "grad_norm": 0.3569711148738861, "learning_rate": 8.116688116688118e-06, "loss": 0.0059, "step": 10470 }, { "epoch": 13.00975546975547, "grad_norm": 0.012415600009262562, "learning_rate": 8.113828113828114e-06, "loss": 1.2201, "step": 10480 }, { "epoch": 13.01001287001287, "grad_norm": 0.012238414026796818, "learning_rate": 8.110968110968111e-06, "loss": 0.0005, "step": 10490 }, { "epoch": 13.01027027027027, "grad_norm": 65.99483489990234, "learning_rate": 8.108108108108109e-06, "loss": 1.8716, "step": 10500 }, { "epoch": 13.01052767052767, "grad_norm": 0.11564770340919495, "learning_rate": 8.105248105248106e-06, "loss": 1.0245, "step": 10510 }, { "epoch": 13.01078507078507, "grad_norm": 35.921592712402344, "learning_rate": 8.102388102388104e-06, "loss": 1.9181, "step": 10520 }, { "epoch": 13.01104247104247, "grad_norm": 84.63827514648438, "learning_rate": 8.0995280995281e-06, "loss": 0.584, "step": 10530 }, { "epoch": 13.01129987129987, "grad_norm": 177.37327575683594, "learning_rate": 8.096668096668098e-06, "loss": 0.7746, "step": 10540 }, { "epoch": 13.01155727155727, "grad_norm": 31.49327850341797, "learning_rate": 8.093808093808095e-06, "loss": 2.6965, "step": 10550 }, { "epoch": 13.011814671814673, "grad_norm": 0.9470750689506531, "learning_rate": 8.090948090948091e-06, "loss": 0.212, "step": 10560 }, { "epoch": 13.012072072072073, "grad_norm": 0.012930578552186489, "learning_rate": 8.088088088088088e-06, "loss": 0.7271, "step": 10570 }, { "epoch": 13.012329472329473, "grad_norm": 0.01820276491343975, "learning_rate": 8.085228085228086e-06, "loss": 0.7162, "step": 10580 }, { "epoch": 13.012586872586873, "grad_norm": 21.15532684326172, "learning_rate": 8.082368082368082e-06, "loss": 2.9765, "step": 10590 }, { "epoch": 13.012844272844273, "grad_norm": 0.6806768178939819, "learning_rate": 8.07950807950808e-06, "loss": 0.7837, "step": 10600 }, { "epoch": 13.013101673101673, "grad_norm": 15.736740112304688, "learning_rate": 8.076648076648077e-06, "loss": 1.3596, "step": 10610 }, { "epoch": 13.013359073359073, "grad_norm": 0.6283915042877197, "learning_rate": 8.073788073788075e-06, "loss": 0.6883, "step": 10620 }, { "epoch": 13.013616473616473, "grad_norm": 0.961954653263092, "learning_rate": 8.070928070928072e-06, "loss": 2.2455, "step": 10630 }, { "epoch": 13.013873873873873, "grad_norm": 0.627880334854126, "learning_rate": 8.06806806806807e-06, "loss": 0.7301, "step": 10640 }, { "epoch": 13.014131274131275, "grad_norm": 0.23738040030002594, "learning_rate": 8.065208065208066e-06, "loss": 1.0691, "step": 10650 }, { "epoch": 13.014388674388675, "grad_norm": 0.4037235677242279, "learning_rate": 8.062348062348063e-06, "loss": 0.4116, "step": 10660 }, { "epoch": 13.014646074646075, "grad_norm": 72.83759307861328, "learning_rate": 8.05948805948806e-06, "loss": 1.5558, "step": 10670 }, { "epoch": 13.014903474903475, "grad_norm": 0.330087274312973, "learning_rate": 8.056628056628057e-06, "loss": 1.1148, "step": 10680 }, { "epoch": 13.015160875160875, "grad_norm": 0.4726085364818573, "learning_rate": 8.053768053768054e-06, "loss": 1.1219, "step": 10690 }, { "epoch": 13.015418275418275, "grad_norm": 52.33597946166992, "learning_rate": 8.050908050908052e-06, "loss": 1.2237, "step": 10700 }, { "epoch": 13.015675675675675, "grad_norm": 0.02517028898000717, "learning_rate": 8.048048048048048e-06, "loss": 0.889, "step": 10710 }, { "epoch": 13.015933075933075, "grad_norm": 0.34835392236709595, "learning_rate": 8.045188045188047e-06, "loss": 0.408, "step": 10720 }, { "epoch": 13.016190476190475, "grad_norm": 87.9798583984375, "learning_rate": 8.042328042328043e-06, "loss": 1.3656, "step": 10730 }, { "epoch": 13.016447876447877, "grad_norm": 0.05181077867746353, "learning_rate": 8.03946803946804e-06, "loss": 0.9934, "step": 10740 }, { "epoch": 13.016705276705277, "grad_norm": 0.011249941773712635, "learning_rate": 8.036608036608036e-06, "loss": 0.7905, "step": 10750 }, { "epoch": 13.016962676962677, "grad_norm": 0.6939520239830017, "learning_rate": 8.033748033748034e-06, "loss": 0.8866, "step": 10760 }, { "epoch": 13.017220077220077, "grad_norm": 0.22471930086612701, "learning_rate": 8.03088803088803e-06, "loss": 0.0061, "step": 10770 }, { "epoch": 13.017477477477478, "grad_norm": 2.8579299449920654, "learning_rate": 8.028028028028029e-06, "loss": 0.6723, "step": 10780 }, { "epoch": 13.017734877734878, "grad_norm": 0.15794941782951355, "learning_rate": 8.025168025168025e-06, "loss": 1.1582, "step": 10790 }, { "epoch": 13.017992277992278, "grad_norm": 0.09103313833475113, "learning_rate": 8.022308022308023e-06, "loss": 0.4761, "step": 10800 }, { "epoch": 13.018249678249678, "grad_norm": 2.296825408935547, "learning_rate": 8.01944801944802e-06, "loss": 0.8791, "step": 10810 }, { "epoch": 13.018507078507078, "grad_norm": 0.6113599538803101, "learning_rate": 8.016588016588016e-06, "loss": 1.0967, "step": 10820 }, { "epoch": 13.01876447876448, "grad_norm": 0.026831205934286118, "learning_rate": 8.013728013728015e-06, "loss": 1.3861, "step": 10830 }, { "epoch": 13.01902187902188, "grad_norm": 0.035754233598709106, "learning_rate": 8.010868010868011e-06, "loss": 0.9776, "step": 10840 }, { "epoch": 13.01927927927928, "grad_norm": 0.055288445204496384, "learning_rate": 8.00800800800801e-06, "loss": 0.8417, "step": 10850 }, { "epoch": 13.01953667953668, "grad_norm": 5.07427978515625, "learning_rate": 8.005148005148006e-06, "loss": 0.872, "step": 10860 }, { "epoch": 13.01979407979408, "grad_norm": 0.008023238740861416, "learning_rate": 8.002288002288004e-06, "loss": 1.0987, "step": 10870 }, { "epoch": 13.02, "eval_accuracy": 0.782608695652174, "eval_loss": 1.0684303045272827, "eval_runtime": 13.5415, "eval_samples_per_second": 3.397, "eval_steps_per_second": 3.397, "step": 10878 }, { "epoch": 14.00005148005148, "grad_norm": 0.0345095694065094, "learning_rate": 7.999427999428e-06, "loss": 1.0408, "step": 10880 }, { "epoch": 14.00030888030888, "grad_norm": 567.0715942382812, "learning_rate": 7.996567996567998e-06, "loss": 0.2114, "step": 10890 }, { "epoch": 14.00056628056628, "grad_norm": 0.04684921354055405, "learning_rate": 7.993707993707995e-06, "loss": 0.4128, "step": 10900 }, { "epoch": 14.00082368082368, "grad_norm": 0.021159229800105095, "learning_rate": 7.990847990847991e-06, "loss": 1.9883, "step": 10910 }, { "epoch": 14.00108108108108, "grad_norm": 0.7736424803733826, "learning_rate": 7.987987987987988e-06, "loss": 0.8038, "step": 10920 }, { "epoch": 14.00133848133848, "grad_norm": 25.88015365600586, "learning_rate": 7.985127985127986e-06, "loss": 1.3199, "step": 10930 }, { "epoch": 14.00159588159588, "grad_norm": 27.008317947387695, "learning_rate": 7.982267982267982e-06, "loss": 1.3514, "step": 10940 }, { "epoch": 14.001853281853283, "grad_norm": 0.025671614333987236, "learning_rate": 7.97940797940798e-06, "loss": 1.9131, "step": 10950 }, { "epoch": 14.002110682110683, "grad_norm": 0.06644091010093689, "learning_rate": 7.976547976547977e-06, "loss": 0.6798, "step": 10960 }, { "epoch": 14.002368082368083, "grad_norm": 0.0720016285777092, "learning_rate": 7.973687973687975e-06, "loss": 1.4921, "step": 10970 }, { "epoch": 14.002625482625483, "grad_norm": 0.058085646480321884, "learning_rate": 7.970827970827972e-06, "loss": 1.1381, "step": 10980 }, { "epoch": 14.002882882882883, "grad_norm": 28.230207443237305, "learning_rate": 7.967967967967968e-06, "loss": 0.5997, "step": 10990 }, { "epoch": 14.003140283140283, "grad_norm": 0.015309504233300686, "learning_rate": 7.965107965107965e-06, "loss": 0.0371, "step": 11000 }, { "epoch": 14.003397683397683, "grad_norm": 42.59065246582031, "learning_rate": 7.962247962247963e-06, "loss": 0.4222, "step": 11010 }, { "epoch": 14.003655083655083, "grad_norm": 0.2624385952949524, "learning_rate": 7.95938795938796e-06, "loss": 0.7029, "step": 11020 }, { "epoch": 14.003912483912483, "grad_norm": 0.036210574209690094, "learning_rate": 7.956527956527957e-06, "loss": 1.0643, "step": 11030 }, { "epoch": 14.004169884169885, "grad_norm": 0.25612005591392517, "learning_rate": 7.953667953667954e-06, "loss": 1.8172, "step": 11040 }, { "epoch": 14.004427284427285, "grad_norm": 0.010674877092242241, "learning_rate": 7.950807950807952e-06, "loss": 1.4193, "step": 11050 }, { "epoch": 14.004684684684685, "grad_norm": 0.021124469116330147, "learning_rate": 7.947947947947949e-06, "loss": 1.3414, "step": 11060 }, { "epoch": 14.004942084942085, "grad_norm": 3.517850637435913, "learning_rate": 7.945087945087945e-06, "loss": 0.9322, "step": 11070 }, { "epoch": 14.005199485199485, "grad_norm": 0.025426078587770462, "learning_rate": 7.942227942227943e-06, "loss": 0.0065, "step": 11080 }, { "epoch": 14.005456885456885, "grad_norm": 0.015744155272841454, "learning_rate": 7.93936793936794e-06, "loss": 0.0105, "step": 11090 }, { "epoch": 14.005714285714285, "grad_norm": 0.05212106183171272, "learning_rate": 7.936507936507936e-06, "loss": 0.3777, "step": 11100 }, { "epoch": 14.005971685971685, "grad_norm": 0.008490210399031639, "learning_rate": 7.933647933647934e-06, "loss": 1.4914, "step": 11110 }, { "epoch": 14.006229086229085, "grad_norm": 0.16355255246162415, "learning_rate": 7.93078793078793e-06, "loss": 0.9386, "step": 11120 }, { "epoch": 14.006486486486487, "grad_norm": 0.5630903244018555, "learning_rate": 7.927927927927929e-06, "loss": 0.9168, "step": 11130 }, { "epoch": 14.006743886743887, "grad_norm": 27.17743682861328, "learning_rate": 7.925067925067925e-06, "loss": 0.4545, "step": 11140 }, { "epoch": 14.007001287001287, "grad_norm": 0.007702608127146959, "learning_rate": 7.922207922207924e-06, "loss": 1.1431, "step": 11150 }, { "epoch": 14.007258687258688, "grad_norm": 1.3509392738342285, "learning_rate": 7.91934791934792e-06, "loss": 0.3952, "step": 11160 }, { "epoch": 14.007516087516088, "grad_norm": 1.47977614402771, "learning_rate": 7.916487916487917e-06, "loss": 1.6739, "step": 11170 }, { "epoch": 14.007773487773488, "grad_norm": 0.06250022351741791, "learning_rate": 7.913627913627915e-06, "loss": 1.11, "step": 11180 }, { "epoch": 14.008030888030888, "grad_norm": 42.10331726074219, "learning_rate": 7.910767910767911e-06, "loss": 1.5249, "step": 11190 }, { "epoch": 14.008288288288288, "grad_norm": 0.4262247383594513, "learning_rate": 7.90790790790791e-06, "loss": 0.5517, "step": 11200 }, { "epoch": 14.008545688545688, "grad_norm": 341.49090576171875, "learning_rate": 7.905047905047906e-06, "loss": 0.8725, "step": 11210 }, { "epoch": 14.00880308880309, "grad_norm": 111.06766510009766, "learning_rate": 7.902187902187904e-06, "loss": 0.5028, "step": 11220 }, { "epoch": 14.00906048906049, "grad_norm": 0.021268639713525772, "learning_rate": 7.8993278993279e-06, "loss": 0.675, "step": 11230 }, { "epoch": 14.00931788931789, "grad_norm": 0.00978196132928133, "learning_rate": 7.896467896467897e-06, "loss": 1.7491, "step": 11240 }, { "epoch": 14.00957528957529, "grad_norm": 0.13062025606632233, "learning_rate": 7.893607893607893e-06, "loss": 0.0045, "step": 11250 }, { "epoch": 14.00983268983269, "grad_norm": 0.40098121762275696, "learning_rate": 7.890747890747892e-06, "loss": 1.331, "step": 11260 }, { "epoch": 14.01009009009009, "grad_norm": 99.57262420654297, "learning_rate": 7.887887887887888e-06, "loss": 0.8479, "step": 11270 }, { "epoch": 14.01034749034749, "grad_norm": 0.15203261375427246, "learning_rate": 7.885027885027886e-06, "loss": 0.5903, "step": 11280 }, { "epoch": 14.01060489060489, "grad_norm": 0.16804452240467072, "learning_rate": 7.882167882167883e-06, "loss": 1.2317, "step": 11290 }, { "epoch": 14.01086229086229, "grad_norm": 0.05834130197763443, "learning_rate": 7.87930787930788e-06, "loss": 0.2802, "step": 11300 }, { "epoch": 14.01111969111969, "grad_norm": 0.24336637556552887, "learning_rate": 7.876447876447877e-06, "loss": 0.891, "step": 11310 }, { "epoch": 14.011377091377092, "grad_norm": 27.7568359375, "learning_rate": 7.873587873587874e-06, "loss": 1.3452, "step": 11320 }, { "epoch": 14.011634491634492, "grad_norm": 59.793251037597656, "learning_rate": 7.870727870727872e-06, "loss": 0.5921, "step": 11330 }, { "epoch": 14.011891891891892, "grad_norm": 3.0434534549713135, "learning_rate": 7.867867867867868e-06, "loss": 1.6081, "step": 11340 }, { "epoch": 14.012149292149292, "grad_norm": 0.012520610354840755, "learning_rate": 7.865007865007865e-06, "loss": 0.5631, "step": 11350 }, { "epoch": 14.012406692406692, "grad_norm": 0.03452794998884201, "learning_rate": 7.862147862147863e-06, "loss": 0.4335, "step": 11360 }, { "epoch": 14.012664092664092, "grad_norm": 0.013125480152666569, "learning_rate": 7.85928785928786e-06, "loss": 0.6082, "step": 11370 }, { "epoch": 14.012921492921492, "grad_norm": 0.07402829825878143, "learning_rate": 7.856427856427858e-06, "loss": 1.1006, "step": 11380 }, { "epoch": 14.013178893178893, "grad_norm": 5.510627269744873, "learning_rate": 7.853567853567854e-06, "loss": 0.2657, "step": 11390 }, { "epoch": 14.013436293436293, "grad_norm": 0.4086941182613373, "learning_rate": 7.850707850707852e-06, "loss": 0.4711, "step": 11400 }, { "epoch": 14.013693693693694, "grad_norm": 49.80360412597656, "learning_rate": 7.847847847847849e-06, "loss": 0.5284, "step": 11410 }, { "epoch": 14.013951093951095, "grad_norm": 137.60855102539062, "learning_rate": 7.844987844987845e-06, "loss": 1.0677, "step": 11420 }, { "epoch": 14.014208494208495, "grad_norm": 0.006826744880527258, "learning_rate": 7.842127842127842e-06, "loss": 2.5411, "step": 11430 }, { "epoch": 14.014465894465895, "grad_norm": 0.35207125544548035, "learning_rate": 7.83926783926784e-06, "loss": 1.3689, "step": 11440 }, { "epoch": 14.014723294723295, "grad_norm": 1.623186469078064, "learning_rate": 7.836407836407836e-06, "loss": 0.5639, "step": 11450 }, { "epoch": 14.014980694980695, "grad_norm": 0.17725925147533417, "learning_rate": 7.833547833547834e-06, "loss": 0.4686, "step": 11460 }, { "epoch": 14.015238095238095, "grad_norm": 0.007551025133579969, "learning_rate": 7.830687830687831e-06, "loss": 0.4924, "step": 11470 }, { "epoch": 14.015495495495495, "grad_norm": 46.443031311035156, "learning_rate": 7.827827827827829e-06, "loss": 1.5643, "step": 11480 }, { "epoch": 14.015752895752895, "grad_norm": 0.04052784666419029, "learning_rate": 7.824967824967826e-06, "loss": 0.854, "step": 11490 }, { "epoch": 14.016010296010297, "grad_norm": 33.286373138427734, "learning_rate": 7.822107822107822e-06, "loss": 0.8945, "step": 11500 }, { "epoch": 14.016267696267697, "grad_norm": 30.311275482177734, "learning_rate": 7.81924781924782e-06, "loss": 0.7156, "step": 11510 }, { "epoch": 14.016525096525097, "grad_norm": 0.5423368215560913, "learning_rate": 7.816387816387817e-06, "loss": 0.5232, "step": 11520 }, { "epoch": 14.016782496782497, "grad_norm": 55.021217346191406, "learning_rate": 7.813527813527813e-06, "loss": 0.9088, "step": 11530 }, { "epoch": 14.017039897039897, "grad_norm": 0.35494714975357056, "learning_rate": 7.810667810667811e-06, "loss": 0.551, "step": 11540 }, { "epoch": 14.017297297297297, "grad_norm": 0.24058228731155396, "learning_rate": 7.807807807807808e-06, "loss": 0.6426, "step": 11550 }, { "epoch": 14.017554697554697, "grad_norm": 1.1704317331314087, "learning_rate": 7.804947804947806e-06, "loss": 1.029, "step": 11560 }, { "epoch": 14.017812097812097, "grad_norm": 0.269205242395401, "learning_rate": 7.802087802087804e-06, "loss": 1.8044, "step": 11570 }, { "epoch": 14.018069498069497, "grad_norm": 0.7438356280326843, "learning_rate": 7.7992277992278e-06, "loss": 0.851, "step": 11580 }, { "epoch": 14.0183268983269, "grad_norm": 0.6309325695037842, "learning_rate": 7.796367796367797e-06, "loss": 1.1659, "step": 11590 }, { "epoch": 14.0185842985843, "grad_norm": 0.8117626905441284, "learning_rate": 7.793507793507794e-06, "loss": 0.8181, "step": 11600 }, { "epoch": 14.0188416988417, "grad_norm": 0.028226610273122787, "learning_rate": 7.790647790647792e-06, "loss": 1.0875, "step": 11610 }, { "epoch": 14.0190990990991, "grad_norm": 1.9404858350753784, "learning_rate": 7.787787787787788e-06, "loss": 1.3949, "step": 11620 }, { "epoch": 14.0193564993565, "grad_norm": 7.443615913391113, "learning_rate": 7.784927784927786e-06, "loss": 0.7276, "step": 11630 }, { "epoch": 14.0196138996139, "grad_norm": 0.020177453756332397, "learning_rate": 7.782067782067783e-06, "loss": 0.9539, "step": 11640 }, { "epoch": 14.0198712998713, "grad_norm": 0.0387568436563015, "learning_rate": 7.779207779207781e-06, "loss": 0.0074, "step": 11650 }, { "epoch": 14.02, "eval_accuracy": 0.7391304347826086, "eval_loss": 1.2900571823120117, "eval_runtime": 13.4677, "eval_samples_per_second": 3.416, "eval_steps_per_second": 3.416, "step": 11655 }, { "epoch": 15.0001287001287, "grad_norm": 0.21021750569343567, "learning_rate": 7.776347776347777e-06, "loss": 0.0063, "step": 11660 }, { "epoch": 15.0003861003861, "grad_norm": 0.18497781455516815, "learning_rate": 7.773487773487774e-06, "loss": 1.5418, "step": 11670 }, { "epoch": 15.0006435006435, "grad_norm": 0.07883092761039734, "learning_rate": 7.77062777062777e-06, "loss": 1.1415, "step": 11680 }, { "epoch": 15.0009009009009, "grad_norm": 66.12890625, "learning_rate": 7.767767767767769e-06, "loss": 0.6305, "step": 11690 }, { "epoch": 15.0011583011583, "grad_norm": 78.24878692626953, "learning_rate": 7.764907764907765e-06, "loss": 0.3121, "step": 11700 }, { "epoch": 15.001415701415702, "grad_norm": 0.011892065405845642, "learning_rate": 7.762047762047763e-06, "loss": 0.0114, "step": 11710 }, { "epoch": 15.001673101673102, "grad_norm": 0.012777644209563732, "learning_rate": 7.75918775918776e-06, "loss": 1.8192, "step": 11720 }, { "epoch": 15.001930501930502, "grad_norm": 0.17456497251987457, "learning_rate": 7.756327756327758e-06, "loss": 1.0947, "step": 11730 }, { "epoch": 15.002187902187902, "grad_norm": 95.4760971069336, "learning_rate": 7.753467753467754e-06, "loss": 0.5226, "step": 11740 }, { "epoch": 15.002445302445302, "grad_norm": 334.6659851074219, "learning_rate": 7.75060775060775e-06, "loss": 1.6629, "step": 11750 }, { "epoch": 15.002702702702702, "grad_norm": 0.3331473171710968, "learning_rate": 7.747747747747749e-06, "loss": 0.5419, "step": 11760 }, { "epoch": 15.002960102960103, "grad_norm": 0.0954216867685318, "learning_rate": 7.744887744887745e-06, "loss": 0.0035, "step": 11770 }, { "epoch": 15.003217503217503, "grad_norm": 28.08690643310547, "learning_rate": 7.742027742027742e-06, "loss": 0.4808, "step": 11780 }, { "epoch": 15.003474903474903, "grad_norm": 10.32856559753418, "learning_rate": 7.73916773916774e-06, "loss": 0.0072, "step": 11790 }, { "epoch": 15.003732303732304, "grad_norm": 79.47557830810547, "learning_rate": 7.736307736307736e-06, "loss": 0.6229, "step": 11800 }, { "epoch": 15.003989703989705, "grad_norm": 0.05898858606815338, "learning_rate": 7.733447733447735e-06, "loss": 0.6668, "step": 11810 }, { "epoch": 15.004247104247105, "grad_norm": 0.010334447957575321, "learning_rate": 7.730587730587731e-06, "loss": 0.5522, "step": 11820 }, { "epoch": 15.004504504504505, "grad_norm": 0.014395112171769142, "learning_rate": 7.72772772772773e-06, "loss": 0.6832, "step": 11830 }, { "epoch": 15.004761904761905, "grad_norm": 0.40451979637145996, "learning_rate": 7.724867724867726e-06, "loss": 0.4752, "step": 11840 }, { "epoch": 15.005019305019305, "grad_norm": 0.09375791996717453, "learning_rate": 7.722007722007722e-06, "loss": 0.7253, "step": 11850 }, { "epoch": 15.005276705276705, "grad_norm": 0.011127919889986515, "learning_rate": 7.719147719147719e-06, "loss": 1.563, "step": 11860 }, { "epoch": 15.005534105534105, "grad_norm": 0.10237417370080948, "learning_rate": 7.716287716287717e-06, "loss": 0.4648, "step": 11870 }, { "epoch": 15.005791505791505, "grad_norm": 0.13473637402057648, "learning_rate": 7.713427713427713e-06, "loss": 0.006, "step": 11880 }, { "epoch": 15.006048906048907, "grad_norm": 0.09255123883485794, "learning_rate": 7.710567710567711e-06, "loss": 0.004, "step": 11890 }, { "epoch": 15.006306306306307, "grad_norm": 0.41176626086235046, "learning_rate": 7.707707707707708e-06, "loss": 1.3119, "step": 11900 }, { "epoch": 15.006563706563707, "grad_norm": 0.0077965897507965565, "learning_rate": 7.704847704847706e-06, "loss": 1.5889, "step": 11910 }, { "epoch": 15.006821106821107, "grad_norm": 0.24158236384391785, "learning_rate": 7.701987701987703e-06, "loss": 1.7263, "step": 11920 }, { "epoch": 15.007078507078507, "grad_norm": 5.002826690673828, "learning_rate": 7.699127699127699e-06, "loss": 0.8271, "step": 11930 }, { "epoch": 15.007335907335907, "grad_norm": 39.31734848022461, "learning_rate": 7.696267696267697e-06, "loss": 2.0124, "step": 11940 }, { "epoch": 15.007593307593307, "grad_norm": 0.05357268825173378, "learning_rate": 7.693407693407694e-06, "loss": 0.434, "step": 11950 }, { "epoch": 15.007850707850707, "grad_norm": 0.13800211250782013, "learning_rate": 7.690547690547692e-06, "loss": 0.5399, "step": 11960 }, { "epoch": 15.008108108108107, "grad_norm": 201.25430297851562, "learning_rate": 7.687687687687688e-06, "loss": 1.66, "step": 11970 }, { "epoch": 15.00836550836551, "grad_norm": 0.3996284008026123, "learning_rate": 7.684827684827686e-06, "loss": 0.645, "step": 11980 }, { "epoch": 15.00862290862291, "grad_norm": 95.54779052734375, "learning_rate": 7.681967681967683e-06, "loss": 1.0521, "step": 11990 }, { "epoch": 15.00888030888031, "grad_norm": 3.103736162185669, "learning_rate": 7.67910767910768e-06, "loss": 0.649, "step": 12000 }, { "epoch": 15.00913770913771, "grad_norm": 99.35127258300781, "learning_rate": 7.676247676247678e-06, "loss": 0.9866, "step": 12010 }, { "epoch": 15.00939510939511, "grad_norm": 0.011812890879809856, "learning_rate": 7.673387673387674e-06, "loss": 0.0066, "step": 12020 }, { "epoch": 15.00965250965251, "grad_norm": 0.24866342544555664, "learning_rate": 7.67052767052767e-06, "loss": 1.5628, "step": 12030 }, { "epoch": 15.00990990990991, "grad_norm": 0.011373716406524181, "learning_rate": 7.667667667667669e-06, "loss": 0.2936, "step": 12040 }, { "epoch": 15.01016731016731, "grad_norm": 0.16900913417339325, "learning_rate": 7.664807664807665e-06, "loss": 0.9559, "step": 12050 }, { "epoch": 15.01042471042471, "grad_norm": 0.31922098994255066, "learning_rate": 7.661947661947663e-06, "loss": 0.151, "step": 12060 }, { "epoch": 15.010682110682112, "grad_norm": 352.00823974609375, "learning_rate": 7.65908765908766e-06, "loss": 0.5267, "step": 12070 }, { "epoch": 15.010939510939512, "grad_norm": 457.71478271484375, "learning_rate": 7.656227656227658e-06, "loss": 0.5103, "step": 12080 }, { "epoch": 15.011196911196912, "grad_norm": 0.18350684642791748, "learning_rate": 7.653367653367654e-06, "loss": 1.0546, "step": 12090 }, { "epoch": 15.011454311454312, "grad_norm": 81.0344009399414, "learning_rate": 7.65050765050765e-06, "loss": 1.6585, "step": 12100 }, { "epoch": 15.011711711711712, "grad_norm": 34.371742248535156, "learning_rate": 7.647647647647647e-06, "loss": 0.6012, "step": 12110 }, { "epoch": 15.011969111969112, "grad_norm": 54.86530303955078, "learning_rate": 7.644787644787645e-06, "loss": 1.5436, "step": 12120 }, { "epoch": 15.012226512226512, "grad_norm": 0.05258643254637718, "learning_rate": 7.641927641927642e-06, "loss": 2.2123, "step": 12130 }, { "epoch": 15.012483912483912, "grad_norm": 486.4405212402344, "learning_rate": 7.63906763906764e-06, "loss": 0.941, "step": 12140 }, { "epoch": 15.012741312741312, "grad_norm": 33.052574157714844, "learning_rate": 7.636207636207637e-06, "loss": 0.4, "step": 12150 }, { "epoch": 15.012998712998712, "grad_norm": 0.40217721462249756, "learning_rate": 7.633347633347635e-06, "loss": 1.2872, "step": 12160 }, { "epoch": 15.013256113256114, "grad_norm": 0.026713982224464417, "learning_rate": 7.630487630487631e-06, "loss": 1.5662, "step": 12170 }, { "epoch": 15.013513513513514, "grad_norm": 25.906904220581055, "learning_rate": 7.6276276276276285e-06, "loss": 1.0581, "step": 12180 }, { "epoch": 15.013770913770914, "grad_norm": 0.21586193144321442, "learning_rate": 7.624767624767625e-06, "loss": 0.5713, "step": 12190 }, { "epoch": 15.014028314028314, "grad_norm": 53.68928909301758, "learning_rate": 7.621907621907622e-06, "loss": 0.3419, "step": 12200 }, { "epoch": 15.014285714285714, "grad_norm": 0.16979816555976868, "learning_rate": 7.61904761904762e-06, "loss": 0.0113, "step": 12210 }, { "epoch": 15.014543114543114, "grad_norm": 0.515741229057312, "learning_rate": 7.616187616187617e-06, "loss": 0.5549, "step": 12220 }, { "epoch": 15.014800514800514, "grad_norm": 110.49329376220703, "learning_rate": 7.613327613327613e-06, "loss": 0.9444, "step": 12230 }, { "epoch": 15.015057915057914, "grad_norm": 58.470523834228516, "learning_rate": 7.610467610467612e-06, "loss": 1.3908, "step": 12240 }, { "epoch": 15.015315315315314, "grad_norm": 65.19776916503906, "learning_rate": 7.607607607607608e-06, "loss": 1.0306, "step": 12250 }, { "epoch": 15.015572715572716, "grad_norm": 45.314945220947266, "learning_rate": 7.604747604747605e-06, "loss": 0.5072, "step": 12260 }, { "epoch": 15.015830115830116, "grad_norm": 0.008456357754766941, "learning_rate": 7.601887601887602e-06, "loss": 2.315, "step": 12270 }, { "epoch": 15.016087516087516, "grad_norm": 0.0190535020083189, "learning_rate": 7.5990275990276e-06, "loss": 1.7028, "step": 12280 }, { "epoch": 15.016344916344917, "grad_norm": 25.07335662841797, "learning_rate": 7.5961675961675965e-06, "loss": 2.0921, "step": 12290 }, { "epoch": 15.016602316602317, "grad_norm": 1.3651502132415771, "learning_rate": 7.593307593307594e-06, "loss": 0.3876, "step": 12300 }, { "epoch": 15.016859716859717, "grad_norm": 332.7787170410156, "learning_rate": 7.590447590447592e-06, "loss": 0.8509, "step": 12310 }, { "epoch": 15.017117117117117, "grad_norm": 0.7402872443199158, "learning_rate": 7.587587587587588e-06, "loss": 0.4219, "step": 12320 }, { "epoch": 15.017374517374517, "grad_norm": 1.146626353263855, "learning_rate": 7.584727584727586e-06, "loss": 0.6808, "step": 12330 }, { "epoch": 15.017631917631917, "grad_norm": 59.793434143066406, "learning_rate": 7.581867581867582e-06, "loss": 0.8229, "step": 12340 }, { "epoch": 15.017889317889319, "grad_norm": 0.011528403498232365, "learning_rate": 7.57900757900758e-06, "loss": 0.2795, "step": 12350 }, { "epoch": 15.018146718146719, "grad_norm": 74.38329315185547, "learning_rate": 7.576147576147577e-06, "loss": 0.5573, "step": 12360 }, { "epoch": 15.018404118404119, "grad_norm": 39.21977233886719, "learning_rate": 7.573287573287574e-06, "loss": 1.7136, "step": 12370 }, { "epoch": 15.018661518661519, "grad_norm": 87.86930847167969, "learning_rate": 7.570427570427571e-06, "loss": 1.151, "step": 12380 }, { "epoch": 15.018918918918919, "grad_norm": 0.3435974419116974, "learning_rate": 7.567567567567569e-06, "loss": 0.5449, "step": 12390 }, { "epoch": 15.019176319176319, "grad_norm": 0.020228708162903786, "learning_rate": 7.564707564707565e-06, "loss": 1.1051, "step": 12400 }, { "epoch": 15.019433719433719, "grad_norm": 48.28805923461914, "learning_rate": 7.5618475618475626e-06, "loss": 0.9372, "step": 12410 }, { "epoch": 15.01969111969112, "grad_norm": 0.9852522611618042, "learning_rate": 7.558987558987559e-06, "loss": 0.6727, "step": 12420 }, { "epoch": 15.01994851994852, "grad_norm": 0.5902912020683289, "learning_rate": 7.556127556127557e-06, "loss": 1.39, "step": 12430 }, { "epoch": 15.02, "eval_accuracy": 0.6304347826086957, "eval_loss": 1.6627668142318726, "eval_runtime": 13.4992, "eval_samples_per_second": 3.408, "eval_steps_per_second": 3.408, "step": 12432 }, { "epoch": 16.00020592020592, "grad_norm": 0.04411958158016205, "learning_rate": 7.553267553267554e-06, "loss": 1.2937, "step": 12440 }, { "epoch": 16.00046332046332, "grad_norm": 0.03254665061831474, "learning_rate": 7.550407550407551e-06, "loss": 0.6591, "step": 12450 }, { "epoch": 16.00072072072072, "grad_norm": 0.3264172375202179, "learning_rate": 7.547547547547548e-06, "loss": 0.3905, "step": 12460 }, { "epoch": 16.00097812097812, "grad_norm": 0.06069876626133919, "learning_rate": 7.544687544687546e-06, "loss": 1.8253, "step": 12470 }, { "epoch": 16.00123552123552, "grad_norm": 0.23206213116645813, "learning_rate": 7.541827541827542e-06, "loss": 0.1752, "step": 12480 }, { "epoch": 16.001492921492922, "grad_norm": 0.4681251049041748, "learning_rate": 7.53896753896754e-06, "loss": 0.5147, "step": 12490 }, { "epoch": 16.001750321750322, "grad_norm": 0.06292621046304703, "learning_rate": 7.536107536107537e-06, "loss": 1.005, "step": 12500 }, { "epoch": 16.002007722007722, "grad_norm": 30.542930603027344, "learning_rate": 7.533247533247534e-06, "loss": 0.8872, "step": 12510 }, { "epoch": 16.002265122265122, "grad_norm": 0.456661581993103, "learning_rate": 7.5303875303875305e-06, "loss": 0.517, "step": 12520 }, { "epoch": 16.002522522522522, "grad_norm": 0.6798164248466492, "learning_rate": 7.527527527527529e-06, "loss": 1.2443, "step": 12530 }, { "epoch": 16.002779922779922, "grad_norm": 3.348841905593872, "learning_rate": 7.524667524667525e-06, "loss": 0.4661, "step": 12540 }, { "epoch": 16.003037323037322, "grad_norm": 0.005561114754527807, "learning_rate": 7.5218075218075225e-06, "loss": 0.4401, "step": 12550 }, { "epoch": 16.003294723294722, "grad_norm": 0.14631526172161102, "learning_rate": 7.518947518947519e-06, "loss": 0.4638, "step": 12560 }, { "epoch": 16.003552123552122, "grad_norm": 0.02118610218167305, "learning_rate": 7.516087516087517e-06, "loss": 0.678, "step": 12570 }, { "epoch": 16.003809523809522, "grad_norm": 0.13729602098464966, "learning_rate": 7.5132275132275136e-06, "loss": 0.4831, "step": 12580 }, { "epoch": 16.004066924066922, "grad_norm": 0.3196340501308441, "learning_rate": 7.510367510367511e-06, "loss": 0.9561, "step": 12590 }, { "epoch": 16.004324324324326, "grad_norm": 0.006972214672714472, "learning_rate": 7.507507507507507e-06, "loss": 1.2584, "step": 12600 }, { "epoch": 16.004581724581726, "grad_norm": 26.036048889160156, "learning_rate": 7.5046475046475055e-06, "loss": 2.6442, "step": 12610 }, { "epoch": 16.004839124839126, "grad_norm": 58.6983757019043, "learning_rate": 7.501787501787502e-06, "loss": 2.0255, "step": 12620 }, { "epoch": 16.005096525096526, "grad_norm": 243.01780700683594, "learning_rate": 7.498927498927499e-06, "loss": 0.2647, "step": 12630 }, { "epoch": 16.005353925353926, "grad_norm": 1.2858657836914062, "learning_rate": 7.496067496067496e-06, "loss": 0.3808, "step": 12640 }, { "epoch": 16.005611325611326, "grad_norm": 1.131397008895874, "learning_rate": 7.493207493207494e-06, "loss": 0.4255, "step": 12650 }, { "epoch": 16.005868725868726, "grad_norm": 0.0808020830154419, "learning_rate": 7.49034749034749e-06, "loss": 1.2511, "step": 12660 }, { "epoch": 16.006126126126127, "grad_norm": 52.445899963378906, "learning_rate": 7.487487487487488e-06, "loss": 0.9333, "step": 12670 }, { "epoch": 16.006383526383527, "grad_norm": 0.38577455282211304, "learning_rate": 7.484627484627486e-06, "loss": 0.0374, "step": 12680 }, { "epoch": 16.006640926640927, "grad_norm": 58.25796890258789, "learning_rate": 7.481767481767482e-06, "loss": 1.0307, "step": 12690 }, { "epoch": 16.006898326898327, "grad_norm": 0.05789424851536751, "learning_rate": 7.47890747890748e-06, "loss": 0.0427, "step": 12700 }, { "epoch": 16.007155727155727, "grad_norm": 0.046346213668584824, "learning_rate": 7.476047476047477e-06, "loss": 0.4548, "step": 12710 }, { "epoch": 16.007413127413127, "grad_norm": 0.00563997495919466, "learning_rate": 7.473187473187474e-06, "loss": 0.0034, "step": 12720 }, { "epoch": 16.007670527670527, "grad_norm": 0.01808410882949829, "learning_rate": 7.470327470327471e-06, "loss": 1.1605, "step": 12730 }, { "epoch": 16.007927927927927, "grad_norm": 0.005591370165348053, "learning_rate": 7.467467467467469e-06, "loss": 1.2435, "step": 12740 }, { "epoch": 16.008185328185327, "grad_norm": 0.14639759063720703, "learning_rate": 7.464607464607465e-06, "loss": 1.8811, "step": 12750 }, { "epoch": 16.008442728442727, "grad_norm": 0.0185764841735363, "learning_rate": 7.461747461747463e-06, "loss": 0.5857, "step": 12760 }, { "epoch": 16.008700128700127, "grad_norm": 61.76352310180664, "learning_rate": 7.458887458887459e-06, "loss": 0.6318, "step": 12770 }, { "epoch": 16.008957528957527, "grad_norm": 0.20087188482284546, "learning_rate": 7.456027456027457e-06, "loss": 0.003, "step": 12780 }, { "epoch": 16.00921492921493, "grad_norm": 0.11291741579771042, "learning_rate": 7.453167453167454e-06, "loss": 0.592, "step": 12790 }, { "epoch": 16.00947232947233, "grad_norm": 25.189674377441406, "learning_rate": 7.450307450307451e-06, "loss": 0.9249, "step": 12800 }, { "epoch": 16.00972972972973, "grad_norm": 0.01021770853549242, "learning_rate": 7.447447447447448e-06, "loss": 0.4188, "step": 12810 }, { "epoch": 16.00998712998713, "grad_norm": 0.7061002850532532, "learning_rate": 7.444587444587446e-06, "loss": 0.2926, "step": 12820 }, { "epoch": 16.01024453024453, "grad_norm": 0.2825552821159363, "learning_rate": 7.441727441727442e-06, "loss": 0.0139, "step": 12830 }, { "epoch": 16.01050193050193, "grad_norm": 0.2040746808052063, "learning_rate": 7.4388674388674395e-06, "loss": 0.8342, "step": 12840 }, { "epoch": 16.01075933075933, "grad_norm": 127.69252014160156, "learning_rate": 7.436007436007436e-06, "loss": 0.6337, "step": 12850 }, { "epoch": 16.01101673101673, "grad_norm": 284.5594787597656, "learning_rate": 7.433147433147434e-06, "loss": 1.0989, "step": 12860 }, { "epoch": 16.01127413127413, "grad_norm": 0.0073196436278522015, "learning_rate": 7.430287430287431e-06, "loss": 1.6559, "step": 12870 }, { "epoch": 16.01153153153153, "grad_norm": 0.018638137727975845, "learning_rate": 7.427427427427428e-06, "loss": 1.8099, "step": 12880 }, { "epoch": 16.01178893178893, "grad_norm": 0.12463497370481491, "learning_rate": 7.4245674245674244e-06, "loss": 1.7454, "step": 12890 }, { "epoch": 16.01204633204633, "grad_norm": 0.014725065790116787, "learning_rate": 7.421707421707423e-06, "loss": 1.511, "step": 12900 }, { "epoch": 16.01230373230373, "grad_norm": 80.54376220703125, "learning_rate": 7.418847418847419e-06, "loss": 1.1794, "step": 12910 }, { "epoch": 16.01256113256113, "grad_norm": 0.05689245089888573, "learning_rate": 7.415987415987416e-06, "loss": 0.5597, "step": 12920 }, { "epoch": 16.01281853281853, "grad_norm": 63.525150299072266, "learning_rate": 7.413127413127414e-06, "loss": 1.1223, "step": 12930 }, { "epoch": 16.013075933075932, "grad_norm": 0.011247380636632442, "learning_rate": 7.410267410267411e-06, "loss": 0.6782, "step": 12940 }, { "epoch": 16.013333333333332, "grad_norm": 0.007309725042432547, "learning_rate": 7.4074074074074075e-06, "loss": 0.0069, "step": 12950 }, { "epoch": 16.013590733590732, "grad_norm": 0.19261324405670166, "learning_rate": 7.404547404547406e-06, "loss": 0.4312, "step": 12960 }, { "epoch": 16.013848133848136, "grad_norm": 0.2645350992679596, "learning_rate": 7.401687401687402e-06, "loss": 1.1204, "step": 12970 }, { "epoch": 16.014105534105536, "grad_norm": 0.005993553902953863, "learning_rate": 7.3988273988273994e-06, "loss": 0.9521, "step": 12980 }, { "epoch": 16.014362934362936, "grad_norm": 0.12547120451927185, "learning_rate": 7.395967395967396e-06, "loss": 0.0541, "step": 12990 }, { "epoch": 16.014620334620336, "grad_norm": 148.83920288085938, "learning_rate": 7.393107393107394e-06, "loss": 1.1941, "step": 13000 }, { "epoch": 16.014877734877736, "grad_norm": 0.0067084734328091145, "learning_rate": 7.3902473902473905e-06, "loss": 0.4661, "step": 13010 }, { "epoch": 16.015135135135136, "grad_norm": 0.14274343848228455, "learning_rate": 7.387387387387388e-06, "loss": 0.2892, "step": 13020 }, { "epoch": 16.015392535392536, "grad_norm": 284.9540100097656, "learning_rate": 7.384527384527384e-06, "loss": 0.3677, "step": 13030 }, { "epoch": 16.015649935649936, "grad_norm": 0.053901880979537964, "learning_rate": 7.3816673816673825e-06, "loss": 0.5037, "step": 13040 }, { "epoch": 16.015907335907336, "grad_norm": 0.137607142329216, "learning_rate": 7.37880737880738e-06, "loss": 1.7523, "step": 13050 }, { "epoch": 16.016164736164736, "grad_norm": 0.3642958700656891, "learning_rate": 7.375947375947376e-06, "loss": 1.2588, "step": 13060 }, { "epoch": 16.016422136422136, "grad_norm": 28.83527183532715, "learning_rate": 7.3730873730873744e-06, "loss": 2.3998, "step": 13070 }, { "epoch": 16.016679536679536, "grad_norm": 0.5148364901542664, "learning_rate": 7.370227370227371e-06, "loss": 0.1868, "step": 13080 }, { "epoch": 16.016936936936936, "grad_norm": 1.1156384944915771, "learning_rate": 7.367367367367368e-06, "loss": 0.5132, "step": 13090 }, { "epoch": 16.017194337194336, "grad_norm": 219.80577087402344, "learning_rate": 7.364507364507365e-06, "loss": 0.4853, "step": 13100 }, { "epoch": 16.017451737451736, "grad_norm": 0.015949252992868423, "learning_rate": 7.361647361647363e-06, "loss": 0.1669, "step": 13110 }, { "epoch": 16.017709137709137, "grad_norm": 0.011382266879081726, "learning_rate": 7.358787358787359e-06, "loss": 0.0042, "step": 13120 }, { "epoch": 16.017966537966537, "grad_norm": 0.01774335838854313, "learning_rate": 7.355927355927357e-06, "loss": 0.3558, "step": 13130 }, { "epoch": 16.018223938223937, "grad_norm": 0.006449257023632526, "learning_rate": 7.353067353067354e-06, "loss": 0.6681, "step": 13140 }, { "epoch": 16.018481338481337, "grad_norm": 1302.005126953125, "learning_rate": 7.350207350207351e-06, "loss": 0.9604, "step": 13150 }, { "epoch": 16.01873873873874, "grad_norm": 0.00848193746060133, "learning_rate": 7.347347347347348e-06, "loss": 1.6639, "step": 13160 }, { "epoch": 16.01899613899614, "grad_norm": 0.8369548916816711, "learning_rate": 7.344487344487345e-06, "loss": 0.532, "step": 13170 }, { "epoch": 16.01925353925354, "grad_norm": 0.06056103855371475, "learning_rate": 7.341627341627342e-06, "loss": 0.539, "step": 13180 }, { "epoch": 16.01951093951094, "grad_norm": 0.15646892786026, "learning_rate": 7.33876733876734e-06, "loss": 0.0049, "step": 13190 }, { "epoch": 16.01976833976834, "grad_norm": 0.008946855552494526, "learning_rate": 7.335907335907336e-06, "loss": 0.2423, "step": 13200 }, { "epoch": 16.02, "eval_accuracy": 0.9130434782608695, "eval_loss": 0.42057546973228455, "eval_runtime": 13.3647, "eval_samples_per_second": 3.442, "eval_steps_per_second": 3.442, "step": 13209 }, { "epoch": 17.00002574002574, "grad_norm": 133.33091735839844, "learning_rate": 7.333047333047334e-06, "loss": 0.813, "step": 13210 }, { "epoch": 17.00028314028314, "grad_norm": 0.12156759202480316, "learning_rate": 7.330187330187331e-06, "loss": 1.0484, "step": 13220 }, { "epoch": 17.00054054054054, "grad_norm": 0.07529735565185547, "learning_rate": 7.327327327327328e-06, "loss": 1.0572, "step": 13230 }, { "epoch": 17.00079794079794, "grad_norm": 0.008459187112748623, "learning_rate": 7.3244673244673246e-06, "loss": 0.4062, "step": 13240 }, { "epoch": 17.00105534105534, "grad_norm": 0.029139036312699318, "learning_rate": 7.321607321607323e-06, "loss": 0.5629, "step": 13250 }, { "epoch": 17.00131274131274, "grad_norm": 24.8751220703125, "learning_rate": 7.318747318747319e-06, "loss": 0.9865, "step": 13260 }, { "epoch": 17.00157014157014, "grad_norm": 0.34304794669151306, "learning_rate": 7.3158873158873165e-06, "loss": 1.1491, "step": 13270 }, { "epoch": 17.00182754182754, "grad_norm": 0.24863027036190033, "learning_rate": 7.313027313027313e-06, "loss": 1.0053, "step": 13280 }, { "epoch": 17.00208494208494, "grad_norm": 0.015837261453270912, "learning_rate": 7.310167310167311e-06, "loss": 0.4432, "step": 13290 }, { "epoch": 17.00234234234234, "grad_norm": 0.12219315022230148, "learning_rate": 7.307307307307308e-06, "loss": 0.5438, "step": 13300 }, { "epoch": 17.00259974259974, "grad_norm": 13.343441009521484, "learning_rate": 7.304447304447305e-06, "loss": 0.0071, "step": 13310 }, { "epoch": 17.002857142857142, "grad_norm": 0.2364991009235382, "learning_rate": 7.301587301587301e-06, "loss": 0.8329, "step": 13320 }, { "epoch": 17.003114543114542, "grad_norm": 0.008193550631403923, "learning_rate": 7.2987272987272996e-06, "loss": 0.0023, "step": 13330 }, { "epoch": 17.003371943371942, "grad_norm": 72.55078887939453, "learning_rate": 7.295867295867296e-06, "loss": 0.5952, "step": 13340 }, { "epoch": 17.003629343629342, "grad_norm": 0.3132573664188385, "learning_rate": 7.293007293007293e-06, "loss": 2.4965, "step": 13350 }, { "epoch": 17.003886743886746, "grad_norm": 0.3048330843448639, "learning_rate": 7.290147290147291e-06, "loss": 1.5725, "step": 13360 }, { "epoch": 17.004144144144146, "grad_norm": 0.006918249651789665, "learning_rate": 7.287287287287288e-06, "loss": 1.2566, "step": 13370 }, { "epoch": 17.004401544401546, "grad_norm": 87.47795867919922, "learning_rate": 7.2844272844272845e-06, "loss": 0.7258, "step": 13380 }, { "epoch": 17.004658944658946, "grad_norm": 691.7349243164062, "learning_rate": 7.281567281567283e-06, "loss": 0.2839, "step": 13390 }, { "epoch": 17.004916344916346, "grad_norm": 45.06636047363281, "learning_rate": 7.278707278707279e-06, "loss": 1.7674, "step": 13400 }, { "epoch": 17.005173745173746, "grad_norm": 0.014170120470225811, "learning_rate": 7.275847275847276e-06, "loss": 0.0174, "step": 13410 }, { "epoch": 17.005431145431146, "grad_norm": 0.00938112661242485, "learning_rate": 7.272987272987273e-06, "loss": 0.6797, "step": 13420 }, { "epoch": 17.005688545688546, "grad_norm": 0.02455279231071472, "learning_rate": 7.270127270127271e-06, "loss": 0.3925, "step": 13430 }, { "epoch": 17.005945945945946, "grad_norm": 0.3047299087047577, "learning_rate": 7.267267267267268e-06, "loss": 0.8212, "step": 13440 }, { "epoch": 17.006203346203346, "grad_norm": 0.0063845436088740826, "learning_rate": 7.264407264407265e-06, "loss": 0.426, "step": 13450 }, { "epoch": 17.006460746460746, "grad_norm": 0.745879590511322, "learning_rate": 7.261547261547263e-06, "loss": 0.6011, "step": 13460 }, { "epoch": 17.006718146718146, "grad_norm": 113.57637786865234, "learning_rate": 7.2586872586872595e-06, "loss": 0.8187, "step": 13470 }, { "epoch": 17.006975546975546, "grad_norm": 0.03586834669113159, "learning_rate": 7.255827255827257e-06, "loss": 0.4669, "step": 13480 }, { "epoch": 17.007232947232946, "grad_norm": 0.31438571214675903, "learning_rate": 7.252967252967253e-06, "loss": 0.2039, "step": 13490 }, { "epoch": 17.007490347490346, "grad_norm": 0.03270833194255829, "learning_rate": 7.250107250107251e-06, "loss": 1.9521, "step": 13500 }, { "epoch": 17.007747747747747, "grad_norm": 98.77244567871094, "learning_rate": 7.247247247247248e-06, "loss": 0.8431, "step": 13510 }, { "epoch": 17.008005148005147, "grad_norm": 0.7995626330375671, "learning_rate": 7.244387244387245e-06, "loss": 0.0078, "step": 13520 }, { "epoch": 17.008262548262547, "grad_norm": 24.94700813293457, "learning_rate": 7.241527241527242e-06, "loss": 1.761, "step": 13530 }, { "epoch": 17.008519948519947, "grad_norm": 0.008624196983873844, "learning_rate": 7.23866723866724e-06, "loss": 0.8168, "step": 13540 }, { "epoch": 17.00877734877735, "grad_norm": 0.05530587583780289, "learning_rate": 7.235807235807236e-06, "loss": 1.7042, "step": 13550 }, { "epoch": 17.00903474903475, "grad_norm": 0.15424524247646332, "learning_rate": 7.232947232947234e-06, "loss": 0.0123, "step": 13560 }, { "epoch": 17.00929214929215, "grad_norm": 0.36576831340789795, "learning_rate": 7.23008723008723e-06, "loss": 0.5522, "step": 13570 }, { "epoch": 17.00954954954955, "grad_norm": 0.6042425632476807, "learning_rate": 7.227227227227228e-06, "loss": 0.9601, "step": 13580 }, { "epoch": 17.00980694980695, "grad_norm": 0.17106890678405762, "learning_rate": 7.224367224367225e-06, "loss": 1.7365, "step": 13590 }, { "epoch": 17.01006435006435, "grad_norm": 152.11839294433594, "learning_rate": 7.221507221507222e-06, "loss": 0.5564, "step": 13600 }, { "epoch": 17.01032175032175, "grad_norm": 0.12229585647583008, "learning_rate": 7.218647218647219e-06, "loss": 1.4063, "step": 13610 }, { "epoch": 17.01057915057915, "grad_norm": 428.216796875, "learning_rate": 7.215787215787217e-06, "loss": 1.8207, "step": 13620 }, { "epoch": 17.01083655083655, "grad_norm": 0.4224972128868103, "learning_rate": 7.212927212927213e-06, "loss": 0.6798, "step": 13630 }, { "epoch": 17.01109395109395, "grad_norm": 73.0058364868164, "learning_rate": 7.210067210067211e-06, "loss": 1.6111, "step": 13640 }, { "epoch": 17.01135135135135, "grad_norm": 0.010296664200723171, "learning_rate": 7.207207207207208e-06, "loss": 0.6907, "step": 13650 }, { "epoch": 17.01160875160875, "grad_norm": 0.022636495530605316, "learning_rate": 7.204347204347205e-06, "loss": 0.0496, "step": 13660 }, { "epoch": 17.01186615186615, "grad_norm": 0.02433607541024685, "learning_rate": 7.2014872014872015e-06, "loss": 0.4299, "step": 13670 }, { "epoch": 17.01212355212355, "grad_norm": 0.007355282548815012, "learning_rate": 7.1986271986272e-06, "loss": 0.4069, "step": 13680 }, { "epoch": 17.01238095238095, "grad_norm": 0.00920641515403986, "learning_rate": 7.195767195767196e-06, "loss": 1.1034, "step": 13690 }, { "epoch": 17.01263835263835, "grad_norm": 37.67753601074219, "learning_rate": 7.1929071929071935e-06, "loss": 2.2347, "step": 13700 }, { "epoch": 17.01289575289575, "grad_norm": 0.05537370219826698, "learning_rate": 7.19004719004719e-06, "loss": 0.2241, "step": 13710 }, { "epoch": 17.01315315315315, "grad_norm": 1.393071174621582, "learning_rate": 7.187187187187188e-06, "loss": 0.5425, "step": 13720 }, { "epoch": 17.013410553410555, "grad_norm": 0.037601619958877563, "learning_rate": 7.184327184327185e-06, "loss": 1.6061, "step": 13730 }, { "epoch": 17.013667953667955, "grad_norm": 0.01806103065609932, "learning_rate": 7.181467181467182e-06, "loss": 1.0232, "step": 13740 }, { "epoch": 17.013925353925355, "grad_norm": 0.01816631108522415, "learning_rate": 7.178607178607178e-06, "loss": 0.0538, "step": 13750 }, { "epoch": 17.014182754182755, "grad_norm": 0.005556888412684202, "learning_rate": 7.1757471757471765e-06, "loss": 0.8444, "step": 13760 }, { "epoch": 17.014440154440155, "grad_norm": 16.42436981201172, "learning_rate": 7.172887172887173e-06, "loss": 0.4929, "step": 13770 }, { "epoch": 17.014697554697555, "grad_norm": 2.0494089126586914, "learning_rate": 7.17002717002717e-06, "loss": 0.5265, "step": 13780 }, { "epoch": 17.014954954954955, "grad_norm": 0.4712642431259155, "learning_rate": 7.167167167167167e-06, "loss": 0.0059, "step": 13790 }, { "epoch": 17.015212355212356, "grad_norm": 47.848602294921875, "learning_rate": 7.164307164307165e-06, "loss": 0.8093, "step": 13800 }, { "epoch": 17.015469755469756, "grad_norm": 0.009627053514122963, "learning_rate": 7.161447161447162e-06, "loss": 0.0029, "step": 13810 }, { "epoch": 17.015727155727156, "grad_norm": 0.01003243587911129, "learning_rate": 7.158587158587159e-06, "loss": 0.0871, "step": 13820 }, { "epoch": 17.015984555984556, "grad_norm": 0.07373271137475967, "learning_rate": 7.155727155727157e-06, "loss": 0.6212, "step": 13830 }, { "epoch": 17.016241956241956, "grad_norm": 0.007202493958175182, "learning_rate": 7.152867152867153e-06, "loss": 0.4612, "step": 13840 }, { "epoch": 17.016499356499356, "grad_norm": 0.01294265128672123, "learning_rate": 7.150007150007151e-06, "loss": 0.7214, "step": 13850 }, { "epoch": 17.016756756756756, "grad_norm": 0.005408270284533501, "learning_rate": 7.147147147147148e-06, "loss": 0.6311, "step": 13860 }, { "epoch": 17.017014157014156, "grad_norm": 215.44699096679688, "learning_rate": 7.144287144287145e-06, "loss": 1.4436, "step": 13870 }, { "epoch": 17.017271557271556, "grad_norm": 0.01759917289018631, "learning_rate": 7.141427141427142e-06, "loss": 0.5106, "step": 13880 }, { "epoch": 17.017528957528956, "grad_norm": 0.02081468515098095, "learning_rate": 7.13856713856714e-06, "loss": 1.7547, "step": 13890 }, { "epoch": 17.017786357786356, "grad_norm": 0.6049190163612366, "learning_rate": 7.1357071357071364e-06, "loss": 1.6229, "step": 13900 }, { "epoch": 17.01804375804376, "grad_norm": 0.9745853543281555, "learning_rate": 7.132847132847134e-06, "loss": 0.484, "step": 13910 }, { "epoch": 17.01830115830116, "grad_norm": 0.10427520424127579, "learning_rate": 7.12998712998713e-06, "loss": 0.0048, "step": 13920 }, { "epoch": 17.01855855855856, "grad_norm": 0.09744715690612793, "learning_rate": 7.127127127127128e-06, "loss": 0.8102, "step": 13930 }, { "epoch": 17.01881595881596, "grad_norm": 0.26503968238830566, "learning_rate": 7.124267124267125e-06, "loss": 2.1186, "step": 13940 }, { "epoch": 17.01907335907336, "grad_norm": 0.0076754712499678135, "learning_rate": 7.121407121407122e-06, "loss": 0.5523, "step": 13950 }, { "epoch": 17.01933075933076, "grad_norm": 0.008721827529370785, "learning_rate": 7.118547118547119e-06, "loss": 0.9198, "step": 13960 }, { "epoch": 17.01958815958816, "grad_norm": 0.272495836019516, "learning_rate": 7.115687115687117e-06, "loss": 1.0507, "step": 13970 }, { "epoch": 17.01984555984556, "grad_norm": 0.43529975414276123, "learning_rate": 7.112827112827113e-06, "loss": 1.4765, "step": 13980 }, { "epoch": 17.02, "eval_accuracy": 0.8478260869565217, "eval_loss": 0.5210566520690918, "eval_runtime": 13.421, "eval_samples_per_second": 3.427, "eval_steps_per_second": 3.427, "step": 13986 }, { "epoch": 18.00010296010296, "grad_norm": 138.43173217773438, "learning_rate": 7.109967109967111e-06, "loss": 2.2081, "step": 13990 }, { "epoch": 18.00036036036036, "grad_norm": 0.004014770966023207, "learning_rate": 7.107107107107107e-06, "loss": 0.3443, "step": 14000 }, { "epoch": 18.00061776061776, "grad_norm": 0.5814266204833984, "learning_rate": 7.104247104247105e-06, "loss": 0.2336, "step": 14010 }, { "epoch": 18.00087516087516, "grad_norm": 2.1018807888031006, "learning_rate": 7.101387101387102e-06, "loss": 0.4295, "step": 14020 }, { "epoch": 18.00113256113256, "grad_norm": 0.5535208582878113, "learning_rate": 7.098527098527099e-06, "loss": 0.1623, "step": 14030 }, { "epoch": 18.00138996138996, "grad_norm": 0.011099116876721382, "learning_rate": 7.0956670956670955e-06, "loss": 0.0023, "step": 14040 }, { "epoch": 18.00164736164736, "grad_norm": 0.01819237321615219, "learning_rate": 7.092807092807094e-06, "loss": 0.0015, "step": 14050 }, { "epoch": 18.00190476190476, "grad_norm": 0.003633988555520773, "learning_rate": 7.08994708994709e-06, "loss": 0.9, "step": 14060 }, { "epoch": 18.00216216216216, "grad_norm": 551.4736328125, "learning_rate": 7.087087087087087e-06, "loss": 0.7453, "step": 14070 }, { "epoch": 18.00241956241956, "grad_norm": 0.003984920680522919, "learning_rate": 7.084227084227085e-06, "loss": 0.5739, "step": 14080 }, { "epoch": 18.00267696267696, "grad_norm": 51.20891189575195, "learning_rate": 7.081367081367082e-06, "loss": 0.6431, "step": 14090 }, { "epoch": 18.00293436293436, "grad_norm": 0.01216127909719944, "learning_rate": 7.0785070785070785e-06, "loss": 0.4871, "step": 14100 }, { "epoch": 18.00319176319176, "grad_norm": 0.3225768506526947, "learning_rate": 7.075647075647077e-06, "loss": 0.0077, "step": 14110 }, { "epoch": 18.003449163449165, "grad_norm": 809.6256103515625, "learning_rate": 7.072787072787073e-06, "loss": 2.0674, "step": 14120 }, { "epoch": 18.003706563706565, "grad_norm": 6.774144649505615, "learning_rate": 7.0699270699270705e-06, "loss": 0.8586, "step": 14130 }, { "epoch": 18.003963963963965, "grad_norm": 62.32297897338867, "learning_rate": 7.067067067067067e-06, "loss": 2.386, "step": 14140 }, { "epoch": 18.004221364221365, "grad_norm": 0.009228847920894623, "learning_rate": 7.064207064207065e-06, "loss": 1.2169, "step": 14150 }, { "epoch": 18.004478764478765, "grad_norm": 0.3130154013633728, "learning_rate": 7.0613470613470616e-06, "loss": 0.0056, "step": 14160 }, { "epoch": 18.004736164736165, "grad_norm": 0.15906237065792084, "learning_rate": 7.058487058487059e-06, "loss": 0.8068, "step": 14170 }, { "epoch": 18.004993564993566, "grad_norm": 0.21492519974708557, "learning_rate": 7.055627055627057e-06, "loss": 0.8975, "step": 14180 }, { "epoch": 18.005250965250966, "grad_norm": 972.6399536132812, "learning_rate": 7.0527670527670535e-06, "loss": 0.9294, "step": 14190 }, { "epoch": 18.005508365508366, "grad_norm": 0.2181262969970703, "learning_rate": 7.049907049907051e-06, "loss": 0.3442, "step": 14200 }, { "epoch": 18.005765765765766, "grad_norm": 0.010371779091656208, "learning_rate": 7.047047047047047e-06, "loss": 0.9992, "step": 14210 }, { "epoch": 18.006023166023166, "grad_norm": 0.16542059183120728, "learning_rate": 7.0441870441870455e-06, "loss": 1.0675, "step": 14220 }, { "epoch": 18.006280566280566, "grad_norm": 0.0024146682117134333, "learning_rate": 7.041327041327042e-06, "loss": 0.8035, "step": 14230 }, { "epoch": 18.006537966537966, "grad_norm": 0.00679363775998354, "learning_rate": 7.038467038467039e-06, "loss": 1.4576, "step": 14240 }, { "epoch": 18.006795366795366, "grad_norm": 42.69426345825195, "learning_rate": 7.035607035607036e-06, "loss": 1.4004, "step": 14250 }, { "epoch": 18.007052767052766, "grad_norm": 0.24903038144111633, "learning_rate": 7.032747032747034e-06, "loss": 0.0173, "step": 14260 }, { "epoch": 18.007310167310166, "grad_norm": 0.251482754945755, "learning_rate": 7.02988702988703e-06, "loss": 0.8903, "step": 14270 }, { "epoch": 18.007567567567566, "grad_norm": 0.15955966711044312, "learning_rate": 7.027027027027028e-06, "loss": 0.5083, "step": 14280 }, { "epoch": 18.007824967824966, "grad_norm": 0.08874122053384781, "learning_rate": 7.024167024167024e-06, "loss": 0.0515, "step": 14290 }, { "epoch": 18.00808236808237, "grad_norm": 193.19786071777344, "learning_rate": 7.021307021307022e-06, "loss": 1.0134, "step": 14300 }, { "epoch": 18.00833976833977, "grad_norm": 0.0486479327082634, "learning_rate": 7.018447018447019e-06, "loss": 1.2563, "step": 14310 }, { "epoch": 18.00859716859717, "grad_norm": 0.09023340046405792, "learning_rate": 7.015587015587016e-06, "loss": 0.8956, "step": 14320 }, { "epoch": 18.00885456885457, "grad_norm": 942.4277954101562, "learning_rate": 7.012727012727013e-06, "loss": 0.6395, "step": 14330 }, { "epoch": 18.00911196911197, "grad_norm": 0.028225857764482498, "learning_rate": 7.009867009867011e-06, "loss": 1.329, "step": 14340 }, { "epoch": 18.00936936936937, "grad_norm": 0.1975196748971939, "learning_rate": 7.007007007007007e-06, "loss": 0.009, "step": 14350 }, { "epoch": 18.00962676962677, "grad_norm": 0.29792848229408264, "learning_rate": 7.004147004147005e-06, "loss": 0.4711, "step": 14360 }, { "epoch": 18.00988416988417, "grad_norm": 0.003600794356316328, "learning_rate": 7.001287001287002e-06, "loss": 0.9534, "step": 14370 }, { "epoch": 18.01014157014157, "grad_norm": 0.4418034851551056, "learning_rate": 6.998426998426999e-06, "loss": 0.0036, "step": 14380 }, { "epoch": 18.01039897039897, "grad_norm": 53.621925354003906, "learning_rate": 6.995566995566996e-06, "loss": 1.7642, "step": 14390 }, { "epoch": 18.01065637065637, "grad_norm": 0.22013112902641296, "learning_rate": 6.992706992706994e-06, "loss": 1.1519, "step": 14400 }, { "epoch": 18.01091377091377, "grad_norm": 0.009073288179934025, "learning_rate": 6.98984698984699e-06, "loss": 1.3366, "step": 14410 }, { "epoch": 18.01117117117117, "grad_norm": 0.24850936233997345, "learning_rate": 6.9869869869869876e-06, "loss": 0.0021, "step": 14420 }, { "epoch": 18.01142857142857, "grad_norm": 354.8047790527344, "learning_rate": 6.984126984126984e-06, "loss": 0.0833, "step": 14430 }, { "epoch": 18.01168597168597, "grad_norm": 0.001540736760944128, "learning_rate": 6.981266981266982e-06, "loss": 0.0097, "step": 14440 }, { "epoch": 18.01194337194337, "grad_norm": 23.982343673706055, "learning_rate": 6.978406978406979e-06, "loss": 3.2159, "step": 14450 }, { "epoch": 18.01220077220077, "grad_norm": 0.686680793762207, "learning_rate": 6.975546975546976e-06, "loss": 0.4192, "step": 14460 }, { "epoch": 18.01245817245817, "grad_norm": 0.17515744268894196, "learning_rate": 6.9726869726869724e-06, "loss": 0.0089, "step": 14470 }, { "epoch": 18.01271557271557, "grad_norm": 0.6502960920333862, "learning_rate": 6.969826969826971e-06, "loss": 0.8773, "step": 14480 }, { "epoch": 18.012972972972975, "grad_norm": 81.23369598388672, "learning_rate": 6.966966966966967e-06, "loss": 0.9279, "step": 14490 }, { "epoch": 18.013230373230375, "grad_norm": 0.2982556223869324, "learning_rate": 6.964106964106964e-06, "loss": 0.4003, "step": 14500 }, { "epoch": 18.013487773487775, "grad_norm": 0.008642779663205147, "learning_rate": 6.961246961246962e-06, "loss": 0.8602, "step": 14510 }, { "epoch": 18.013745173745175, "grad_norm": 0.18318037688732147, "learning_rate": 6.958386958386959e-06, "loss": 0.005, "step": 14520 }, { "epoch": 18.014002574002575, "grad_norm": 0.3546335697174072, "learning_rate": 6.9555269555269555e-06, "loss": 0.0034, "step": 14530 }, { "epoch": 18.014259974259975, "grad_norm": 0.041229523718357086, "learning_rate": 6.952666952666954e-06, "loss": 0.0029, "step": 14540 }, { "epoch": 18.014517374517375, "grad_norm": 0.09673618525266647, "learning_rate": 6.949806949806951e-06, "loss": 0.0661, "step": 14550 }, { "epoch": 18.014774774774775, "grad_norm": 0.030932040885090828, "learning_rate": 6.9469469469469474e-06, "loss": 1.5543, "step": 14560 }, { "epoch": 18.015032175032175, "grad_norm": 0.1227671205997467, "learning_rate": 6.944086944086945e-06, "loss": 0.0012, "step": 14570 }, { "epoch": 18.015289575289575, "grad_norm": 0.15442447364330292, "learning_rate": 6.941226941226942e-06, "loss": 0.4028, "step": 14580 }, { "epoch": 18.015546975546975, "grad_norm": 525.80224609375, "learning_rate": 6.938366938366939e-06, "loss": 0.3612, "step": 14590 }, { "epoch": 18.015804375804375, "grad_norm": 1.3030730485916138, "learning_rate": 6.935506935506936e-06, "loss": 0.0142, "step": 14600 }, { "epoch": 18.016061776061775, "grad_norm": 0.0016896520974114537, "learning_rate": 6.932646932646934e-06, "loss": 1.0281, "step": 14610 }, { "epoch": 18.016319176319175, "grad_norm": 479.7268981933594, "learning_rate": 6.9297869297869305e-06, "loss": 0.3177, "step": 14620 }, { "epoch": 18.016576576576576, "grad_norm": 0.004493164364248514, "learning_rate": 6.926926926926928e-06, "loss": 0.6365, "step": 14630 }, { "epoch": 18.016833976833976, "grad_norm": 0.15360552072525024, "learning_rate": 6.924066924066924e-06, "loss": 0.0011, "step": 14640 }, { "epoch": 18.017091377091376, "grad_norm": 28.01856231689453, "learning_rate": 6.9212069212069224e-06, "loss": 0.9447, "step": 14650 }, { "epoch": 18.017348777348776, "grad_norm": 77.52731323242188, "learning_rate": 6.918346918346919e-06, "loss": 1.6479, "step": 14660 }, { "epoch": 18.01760617760618, "grad_norm": 0.0018540214514359832, "learning_rate": 6.915486915486916e-06, "loss": 0.0014, "step": 14670 }, { "epoch": 18.01786357786358, "grad_norm": 5.362928867340088, "learning_rate": 6.912626912626913e-06, "loss": 1.5786, "step": 14680 }, { "epoch": 18.01812097812098, "grad_norm": 0.010288232006132603, "learning_rate": 6.909766909766911e-06, "loss": 1.0865, "step": 14690 }, { "epoch": 18.01837837837838, "grad_norm": 0.00446264399215579, "learning_rate": 6.906906906906907e-06, "loss": 0.8953, "step": 14700 }, { "epoch": 18.01863577863578, "grad_norm": 0.036626916378736496, "learning_rate": 6.904046904046905e-06, "loss": 1.5741, "step": 14710 }, { "epoch": 18.01889317889318, "grad_norm": 105.58182525634766, "learning_rate": 6.901186901186901e-06, "loss": 0.4651, "step": 14720 }, { "epoch": 18.01915057915058, "grad_norm": 1.4560140371322632, "learning_rate": 6.898326898326899e-06, "loss": 1.017, "step": 14730 }, { "epoch": 18.01940797940798, "grad_norm": 0.0017882263055071235, "learning_rate": 6.895466895466896e-06, "loss": 0.853, "step": 14740 }, { "epoch": 18.01966537966538, "grad_norm": 0.20019206404685974, "learning_rate": 6.892606892606893e-06, "loss": 1.3635, "step": 14750 }, { "epoch": 18.01992277992278, "grad_norm": 0.003446888877078891, "learning_rate": 6.88974688974689e-06, "loss": 0.4732, "step": 14760 }, { "epoch": 18.02, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.8912200927734375, "eval_runtime": 13.4927, "eval_samples_per_second": 3.409, "eval_steps_per_second": 3.409, "step": 14763 }, { "epoch": 19.00018018018018, "grad_norm": 0.819979727268219, "learning_rate": 6.886886886886888e-06, "loss": 0.479, "step": 14770 }, { "epoch": 19.00043758043758, "grad_norm": 0.3846934735774994, "learning_rate": 6.884026884026884e-06, "loss": 0.8816, "step": 14780 }, { "epoch": 19.00069498069498, "grad_norm": 0.28570055961608887, "learning_rate": 6.881166881166882e-06, "loss": 1.1393, "step": 14790 }, { "epoch": 19.00095238095238, "grad_norm": 0.0016494010342285037, "learning_rate": 6.878306878306879e-06, "loss": 0.0054, "step": 14800 }, { "epoch": 19.00120978120978, "grad_norm": 0.13023339211940765, "learning_rate": 6.875446875446876e-06, "loss": 0.0038, "step": 14810 }, { "epoch": 19.00146718146718, "grad_norm": 0.001983684953302145, "learning_rate": 6.872586872586873e-06, "loss": 0.4917, "step": 14820 }, { "epoch": 19.00172458172458, "grad_norm": 0.16467513144016266, "learning_rate": 6.869726869726871e-06, "loss": 0.4895, "step": 14830 }, { "epoch": 19.00198198198198, "grad_norm": 0.004174495581537485, "learning_rate": 6.866866866866867e-06, "loss": 0.0456, "step": 14840 }, { "epoch": 19.00223938223938, "grad_norm": 96.41185760498047, "learning_rate": 6.8640068640068645e-06, "loss": 0.9856, "step": 14850 }, { "epoch": 19.00249678249678, "grad_norm": 0.012012992985546589, "learning_rate": 6.861146861146861e-06, "loss": 0.7848, "step": 14860 }, { "epoch": 19.00275418275418, "grad_norm": 0.17765195667743683, "learning_rate": 6.858286858286859e-06, "loss": 0.5042, "step": 14870 }, { "epoch": 19.003011583011585, "grad_norm": 0.0029575249645859003, "learning_rate": 6.855426855426856e-06, "loss": 0.499, "step": 14880 }, { "epoch": 19.003268983268985, "grad_norm": 81.89859771728516, "learning_rate": 6.852566852566853e-06, "loss": 0.0079, "step": 14890 }, { "epoch": 19.003526383526385, "grad_norm": 0.0032242753077298403, "learning_rate": 6.849706849706849e-06, "loss": 0.0018, "step": 14900 }, { "epoch": 19.003783783783785, "grad_norm": 0.15290415287017822, "learning_rate": 6.846846846846848e-06, "loss": 1.2286, "step": 14910 }, { "epoch": 19.004041184041185, "grad_norm": 0.001420488115400076, "learning_rate": 6.843986843986845e-06, "loss": 0.478, "step": 14920 }, { "epoch": 19.004298584298585, "grad_norm": 0.009055113419890404, "learning_rate": 6.841126841126841e-06, "loss": 0.8538, "step": 14930 }, { "epoch": 19.004555984555985, "grad_norm": 0.001777785480953753, "learning_rate": 6.8382668382668395e-06, "loss": 0.0001, "step": 14940 }, { "epoch": 19.004813384813385, "grad_norm": 0.33212217688560486, "learning_rate": 6.835406835406836e-06, "loss": 1.249, "step": 14950 }, { "epoch": 19.005070785070785, "grad_norm": 0.17998294532299042, "learning_rate": 6.832546832546833e-06, "loss": 0.0113, "step": 14960 }, { "epoch": 19.005328185328185, "grad_norm": 0.004125115927308798, "learning_rate": 6.82968682968683e-06, "loss": 1.0992, "step": 14970 }, { "epoch": 19.005585585585585, "grad_norm": 0.06852870434522629, "learning_rate": 6.826826826826828e-06, "loss": 0.4464, "step": 14980 }, { "epoch": 19.005842985842985, "grad_norm": 0.272107869386673, "learning_rate": 6.823966823966824e-06, "loss": 1.0914, "step": 14990 }, { "epoch": 19.006100386100385, "grad_norm": 0.035318005830049515, "learning_rate": 6.821106821106822e-06, "loss": 0.9878, "step": 15000 }, { "epoch": 19.006357786357785, "grad_norm": 2.790893793106079, "learning_rate": 6.818246818246819e-06, "loss": 1.3259, "step": 15010 }, { "epoch": 19.006615186615186, "grad_norm": 79.72868347167969, "learning_rate": 6.815386815386816e-06, "loss": 0.5364, "step": 15020 }, { "epoch": 19.006872586872586, "grad_norm": 0.45355862379074097, "learning_rate": 6.812526812526813e-06, "loss": 0.4649, "step": 15030 }, { "epoch": 19.007129987129986, "grad_norm": 954.1507568359375, "learning_rate": 6.809666809666811e-06, "loss": 1.2683, "step": 15040 }, { "epoch": 19.007387387387386, "grad_norm": 143.90093994140625, "learning_rate": 6.8068068068068075e-06, "loss": 0.5476, "step": 15050 }, { "epoch": 19.00764478764479, "grad_norm": 0.19214926660060883, "learning_rate": 6.803946803946805e-06, "loss": 0.3, "step": 15060 }, { "epoch": 19.00790218790219, "grad_norm": 0.002590177347883582, "learning_rate": 6.801086801086801e-06, "loss": 0.002, "step": 15070 }, { "epoch": 19.00815958815959, "grad_norm": 0.030673101544380188, "learning_rate": 6.798226798226799e-06, "loss": 1.4, "step": 15080 }, { "epoch": 19.00841698841699, "grad_norm": 0.09079337120056152, "learning_rate": 6.795366795366796e-06, "loss": 0.0028, "step": 15090 }, { "epoch": 19.00867438867439, "grad_norm": 0.034733597189188004, "learning_rate": 6.792506792506793e-06, "loss": 1.5467, "step": 15100 }, { "epoch": 19.00893178893179, "grad_norm": 0.010993434116244316, "learning_rate": 6.78964678964679e-06, "loss": 0.9666, "step": 15110 }, { "epoch": 19.00918918918919, "grad_norm": 0.07968287169933319, "learning_rate": 6.786786786786788e-06, "loss": 0.9137, "step": 15120 }, { "epoch": 19.00944658944659, "grad_norm": 0.09925149381160736, "learning_rate": 6.783926783926784e-06, "loss": 0.1951, "step": 15130 }, { "epoch": 19.00970398970399, "grad_norm": 247.01406860351562, "learning_rate": 6.781066781066782e-06, "loss": 0.7136, "step": 15140 }, { "epoch": 19.00996138996139, "grad_norm": 0.009178612381219864, "learning_rate": 6.778206778206778e-06, "loss": 1.4312, "step": 15150 }, { "epoch": 19.01021879021879, "grad_norm": 0.0027663582004606724, "learning_rate": 6.775346775346776e-06, "loss": 0.7116, "step": 15160 }, { "epoch": 19.01047619047619, "grad_norm": 0.1210724487900734, "learning_rate": 6.772486772486773e-06, "loss": 0.5382, "step": 15170 }, { "epoch": 19.01073359073359, "grad_norm": 28.404563903808594, "learning_rate": 6.76962676962677e-06, "loss": 0.7048, "step": 15180 }, { "epoch": 19.01099099099099, "grad_norm": 0.0018646479584276676, "learning_rate": 6.7667667667667665e-06, "loss": 0.7556, "step": 15190 }, { "epoch": 19.01124839124839, "grad_norm": 0.003445609239861369, "learning_rate": 6.763906763906765e-06, "loss": 1.1341, "step": 15200 }, { "epoch": 19.01150579150579, "grad_norm": 0.0021248559933155775, "learning_rate": 6.761046761046761e-06, "loss": 1.7684, "step": 15210 }, { "epoch": 19.01176319176319, "grad_norm": 0.014535117894411087, "learning_rate": 6.7581867581867585e-06, "loss": 0.0021, "step": 15220 }, { "epoch": 19.01202059202059, "grad_norm": 0.07674351334571838, "learning_rate": 6.755326755326756e-06, "loss": 1.07, "step": 15230 }, { "epoch": 19.01227799227799, "grad_norm": 37.96657943725586, "learning_rate": 6.752466752466753e-06, "loss": 1.5324, "step": 15240 }, { "epoch": 19.012535392535394, "grad_norm": 0.5922670960426331, "learning_rate": 6.7496067496067496e-06, "loss": 0.5106, "step": 15250 }, { "epoch": 19.012792792792794, "grad_norm": 0.2684486210346222, "learning_rate": 6.746746746746748e-06, "loss": 0.82, "step": 15260 }, { "epoch": 19.013050193050194, "grad_norm": 0.003213370218873024, "learning_rate": 6.743886743886744e-06, "loss": 0.0033, "step": 15270 }, { "epoch": 19.013307593307594, "grad_norm": 0.004637459293007851, "learning_rate": 6.7410267410267415e-06, "loss": 0.5989, "step": 15280 }, { "epoch": 19.013564993564994, "grad_norm": 22.795841217041016, "learning_rate": 6.738166738166738e-06, "loss": 0.6718, "step": 15290 }, { "epoch": 19.013822393822394, "grad_norm": 30.51113510131836, "learning_rate": 6.735306735306736e-06, "loss": 0.9449, "step": 15300 }, { "epoch": 19.014079794079795, "grad_norm": 0.10669000446796417, "learning_rate": 6.7324467324467335e-06, "loss": 0.0043, "step": 15310 }, { "epoch": 19.014337194337195, "grad_norm": 0.10424339026212692, "learning_rate": 6.72958672958673e-06, "loss": 1.5502, "step": 15320 }, { "epoch": 19.014594594594595, "grad_norm": 0.010130718350410461, "learning_rate": 6.726726726726728e-06, "loss": 0.9745, "step": 15330 }, { "epoch": 19.014851994851995, "grad_norm": 0.04613509774208069, "learning_rate": 6.7238667238667246e-06, "loss": 1.0258, "step": 15340 }, { "epoch": 19.015109395109395, "grad_norm": 0.014101505279541016, "learning_rate": 6.721006721006722e-06, "loss": 0.4591, "step": 15350 }, { "epoch": 19.015366795366795, "grad_norm": 0.2920877933502197, "learning_rate": 6.718146718146718e-06, "loss": 0.8987, "step": 15360 }, { "epoch": 19.015624195624195, "grad_norm": 0.003366332734003663, "learning_rate": 6.7152867152867165e-06, "loss": 0.5684, "step": 15370 }, { "epoch": 19.015881595881595, "grad_norm": 0.014356138184666634, "learning_rate": 6.712426712426713e-06, "loss": 0.9558, "step": 15380 }, { "epoch": 19.016138996138995, "grad_norm": 0.033906999975442886, "learning_rate": 6.70956670956671e-06, "loss": 1.3507, "step": 15390 }, { "epoch": 19.016396396396395, "grad_norm": 0.18974842131137848, "learning_rate": 6.706706706706707e-06, "loss": 0.0257, "step": 15400 }, { "epoch": 19.016653796653795, "grad_norm": 0.008604479022324085, "learning_rate": 6.703846703846705e-06, "loss": 1.7851, "step": 15410 }, { "epoch": 19.016911196911195, "grad_norm": 0.2468060553073883, "learning_rate": 6.700986700986701e-06, "loss": 0.8377, "step": 15420 }, { "epoch": 19.0171685971686, "grad_norm": 0.025349579751491547, "learning_rate": 6.698126698126699e-06, "loss": 0.4866, "step": 15430 }, { "epoch": 19.017425997426, "grad_norm": 0.39254313707351685, "learning_rate": 6.695266695266695e-06, "loss": 0.4546, "step": 15440 }, { "epoch": 19.0176833976834, "grad_norm": 25.649742126464844, "learning_rate": 6.692406692406693e-06, "loss": 0.8261, "step": 15450 }, { "epoch": 19.0179407979408, "grad_norm": 0.19370250403881073, "learning_rate": 6.68954668954669e-06, "loss": 0.3301, "step": 15460 }, { "epoch": 19.0181981981982, "grad_norm": 70.06546783447266, "learning_rate": 6.686686686686687e-06, "loss": 1.0178, "step": 15470 }, { "epoch": 19.0184555984556, "grad_norm": 0.29361337423324585, "learning_rate": 6.6838266838266844e-06, "loss": 0.8067, "step": 15480 }, { "epoch": 19.018712998713, "grad_norm": 0.00261647067964077, "learning_rate": 6.680966680966682e-06, "loss": 1.6859, "step": 15490 }, { "epoch": 19.0189703989704, "grad_norm": 0.1030469685792923, "learning_rate": 6.678106678106678e-06, "loss": 1.0168, "step": 15500 }, { "epoch": 19.0192277992278, "grad_norm": 42.0329475402832, "learning_rate": 6.675246675246676e-06, "loss": 1.385, "step": 15510 }, { "epoch": 19.0194851994852, "grad_norm": 0.21290437877178192, "learning_rate": 6.672386672386673e-06, "loss": 0.2358, "step": 15520 }, { "epoch": 19.0197425997426, "grad_norm": 10.05051326751709, "learning_rate": 6.66952666952667e-06, "loss": 1.1642, "step": 15530 }, { "epoch": 19.02, "grad_norm": 0.021836817264556885, "learning_rate": 6.666666666666667e-06, "loss": 0.8663, "step": 15540 }, { "epoch": 19.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.4560748338699341, "eval_runtime": 13.467, "eval_samples_per_second": 3.416, "eval_steps_per_second": 3.416, "step": 15540 }, { "epoch": 20.0002574002574, "grad_norm": 0.010793312452733517, "learning_rate": 6.663806663806665e-06, "loss": 0.0118, "step": 15550 }, { "epoch": 20.0005148005148, "grad_norm": 0.004275476094335318, "learning_rate": 6.660946660946661e-06, "loss": 0.0026, "step": 15560 }, { "epoch": 20.0007722007722, "grad_norm": 0.0035458989441394806, "learning_rate": 6.658086658086659e-06, "loss": 0.8338, "step": 15570 }, { "epoch": 20.0010296010296, "grad_norm": 0.002316601574420929, "learning_rate": 6.655226655226655e-06, "loss": 1.4684, "step": 15580 }, { "epoch": 20.001287001287, "grad_norm": 0.003117610700428486, "learning_rate": 6.652366652366653e-06, "loss": 0.7738, "step": 15590 }, { "epoch": 20.0015444015444, "grad_norm": 0.004058063495904207, "learning_rate": 6.64950664950665e-06, "loss": 1.7848, "step": 15600 }, { "epoch": 20.0018018018018, "grad_norm": 3.553769111633301, "learning_rate": 6.646646646646647e-06, "loss": 1.1946, "step": 15610 }, { "epoch": 20.0020592020592, "grad_norm": 0.03767959401011467, "learning_rate": 6.6437866437866435e-06, "loss": 0.9582, "step": 15620 }, { "epoch": 20.0023166023166, "grad_norm": 0.02251305803656578, "learning_rate": 6.640926640926642e-06, "loss": 0.0018, "step": 15630 }, { "epoch": 20.002574002574004, "grad_norm": 0.14382117986679077, "learning_rate": 6.638066638066638e-06, "loss": 0.4451, "step": 15640 }, { "epoch": 20.002831402831404, "grad_norm": 0.0074038137681782246, "learning_rate": 6.6352066352066354e-06, "loss": 0.0175, "step": 15650 }, { "epoch": 20.003088803088804, "grad_norm": 309.4783630371094, "learning_rate": 6.632346632346633e-06, "loss": 1.088, "step": 15660 }, { "epoch": 20.003346203346204, "grad_norm": 0.0028939915355294943, "learning_rate": 6.62948662948663e-06, "loss": 0.7427, "step": 15670 }, { "epoch": 20.003603603603604, "grad_norm": 0.01108844019472599, "learning_rate": 6.626626626626627e-06, "loss": 0.0041, "step": 15680 }, { "epoch": 20.003861003861005, "grad_norm": 0.23344767093658447, "learning_rate": 6.623766623766624e-06, "loss": 0.6241, "step": 15690 }, { "epoch": 20.004118404118405, "grad_norm": 382.5850830078125, "learning_rate": 6.620906620906622e-06, "loss": 0.5393, "step": 15700 }, { "epoch": 20.004375804375805, "grad_norm": 0.17710717022418976, "learning_rate": 6.6180466180466185e-06, "loss": 0.0036, "step": 15710 }, { "epoch": 20.004633204633205, "grad_norm": 0.8400039076805115, "learning_rate": 6.615186615186616e-06, "loss": 0.5744, "step": 15720 }, { "epoch": 20.004890604890605, "grad_norm": 0.10408028960227966, "learning_rate": 6.612326612326613e-06, "loss": 0.1466, "step": 15730 }, { "epoch": 20.005148005148005, "grad_norm": 0.0018540999153628945, "learning_rate": 6.6094666094666104e-06, "loss": 0.4497, "step": 15740 }, { "epoch": 20.005405405405405, "grad_norm": 0.004669906571507454, "learning_rate": 6.606606606606607e-06, "loss": 0.443, "step": 15750 }, { "epoch": 20.005662805662805, "grad_norm": 91.52081298828125, "learning_rate": 6.603746603746605e-06, "loss": 2.0061, "step": 15760 }, { "epoch": 20.005920205920205, "grad_norm": 0.014538582414388657, "learning_rate": 6.6008866008866015e-06, "loss": 0.6473, "step": 15770 }, { "epoch": 20.006177606177605, "grad_norm": 0.09253435581922531, "learning_rate": 6.598026598026599e-06, "loss": 0.563, "step": 15780 }, { "epoch": 20.006435006435005, "grad_norm": 0.005112254060804844, "learning_rate": 6.595166595166595e-06, "loss": 0.0031, "step": 15790 }, { "epoch": 20.006692406692405, "grad_norm": 0.01782931201159954, "learning_rate": 6.5923065923065935e-06, "loss": 0.0068, "step": 15800 }, { "epoch": 20.006949806949805, "grad_norm": 3.834109306335449, "learning_rate": 6.58944658944659e-06, "loss": 0.6368, "step": 15810 }, { "epoch": 20.00720720720721, "grad_norm": 0.3963877856731415, "learning_rate": 6.586586586586587e-06, "loss": 0.5123, "step": 15820 }, { "epoch": 20.00746460746461, "grad_norm": 0.0013604898704215884, "learning_rate": 6.583726583726584e-06, "loss": 0.7435, "step": 15830 }, { "epoch": 20.00772200772201, "grad_norm": 0.37704506516456604, "learning_rate": 6.580866580866582e-06, "loss": 0.0023, "step": 15840 }, { "epoch": 20.00797940797941, "grad_norm": 26.52029037475586, "learning_rate": 6.578006578006578e-06, "loss": 0.1471, "step": 15850 }, { "epoch": 20.00823680823681, "grad_norm": 0.026596808806061745, "learning_rate": 6.575146575146576e-06, "loss": 0.0049, "step": 15860 }, { "epoch": 20.00849420849421, "grad_norm": 0.055981434881687164, "learning_rate": 6.572286572286572e-06, "loss": 0.5553, "step": 15870 }, { "epoch": 20.00875160875161, "grad_norm": 0.00963540282100439, "learning_rate": 6.56942656942657e-06, "loss": 0.0018, "step": 15880 }, { "epoch": 20.00900900900901, "grad_norm": 0.06317219883203506, "learning_rate": 6.566566566566567e-06, "loss": 0.4827, "step": 15890 }, { "epoch": 20.00926640926641, "grad_norm": 0.05564889311790466, "learning_rate": 6.563706563706564e-06, "loss": 1.7282, "step": 15900 }, { "epoch": 20.00952380952381, "grad_norm": 58.55592346191406, "learning_rate": 6.560846560846561e-06, "loss": 0.786, "step": 15910 }, { "epoch": 20.00978120978121, "grad_norm": 0.07347892969846725, "learning_rate": 6.557986557986559e-06, "loss": 0.3896, "step": 15920 }, { "epoch": 20.01003861003861, "grad_norm": 0.11479922384023666, "learning_rate": 6.555126555126555e-06, "loss": 1.1231, "step": 15930 }, { "epoch": 20.01029601029601, "grad_norm": 0.0013371037784963846, "learning_rate": 6.552266552266553e-06, "loss": 1.6537, "step": 15940 }, { "epoch": 20.01055341055341, "grad_norm": 0.024909818544983864, "learning_rate": 6.54940654940655e-06, "loss": 0.5062, "step": 15950 }, { "epoch": 20.01081081081081, "grad_norm": 0.26621049642562866, "learning_rate": 6.546546546546547e-06, "loss": 1.1278, "step": 15960 }, { "epoch": 20.01106821106821, "grad_norm": 0.27428120374679565, "learning_rate": 6.543686543686544e-06, "loss": 0.0021, "step": 15970 }, { "epoch": 20.01132561132561, "grad_norm": 0.19413171708583832, "learning_rate": 6.540826540826542e-06, "loss": 1.3931, "step": 15980 }, { "epoch": 20.01158301158301, "grad_norm": 0.9698719382286072, "learning_rate": 6.537966537966538e-06, "loss": 0.0043, "step": 15990 }, { "epoch": 20.01184041184041, "grad_norm": 0.0036435227375477552, "learning_rate": 6.5351065351065356e-06, "loss": 0.0025, "step": 16000 }, { "epoch": 20.012097812097814, "grad_norm": 0.005679155234247446, "learning_rate": 6.532246532246532e-06, "loss": 0.3591, "step": 16010 }, { "epoch": 20.012355212355214, "grad_norm": 0.2528136372566223, "learning_rate": 6.52938652938653e-06, "loss": 1.0381, "step": 16020 }, { "epoch": 20.012612612612614, "grad_norm": 0.008972964249551296, "learning_rate": 6.526526526526527e-06, "loss": 0.5617, "step": 16030 }, { "epoch": 20.012870012870014, "grad_norm": 0.437963604927063, "learning_rate": 6.523666523666524e-06, "loss": 1.372, "step": 16040 }, { "epoch": 20.013127413127414, "grad_norm": 0.0023576742969453335, "learning_rate": 6.520806520806522e-06, "loss": 0.7603, "step": 16050 }, { "epoch": 20.013384813384814, "grad_norm": 0.002086960943415761, "learning_rate": 6.517946517946519e-06, "loss": 1.0576, "step": 16060 }, { "epoch": 20.013642213642214, "grad_norm": 0.03679662570357323, "learning_rate": 6.515086515086516e-06, "loss": 0.6061, "step": 16070 }, { "epoch": 20.013899613899614, "grad_norm": 0.0061900317668914795, "learning_rate": 6.512226512226512e-06, "loss": 0.9334, "step": 16080 }, { "epoch": 20.014157014157014, "grad_norm": 0.041533537209033966, "learning_rate": 6.5093665093665106e-06, "loss": 1.0999, "step": 16090 }, { "epoch": 20.014414414414414, "grad_norm": 0.003179677063599229, "learning_rate": 6.506506506506507e-06, "loss": 0.9324, "step": 16100 }, { "epoch": 20.014671814671814, "grad_norm": 51.39768600463867, "learning_rate": 6.503646503646504e-06, "loss": 1.2931, "step": 16110 }, { "epoch": 20.014929214929214, "grad_norm": 31.660776138305664, "learning_rate": 6.500786500786501e-06, "loss": 0.9663, "step": 16120 }, { "epoch": 20.015186615186614, "grad_norm": 27.631328582763672, "learning_rate": 6.497926497926499e-06, "loss": 0.5043, "step": 16130 }, { "epoch": 20.015444015444015, "grad_norm": 0.10719513893127441, "learning_rate": 6.4950664950664955e-06, "loss": 1.2169, "step": 16140 }, { "epoch": 20.015701415701415, "grad_norm": 0.5144920349121094, "learning_rate": 6.492206492206493e-06, "loss": 1.0045, "step": 16150 }, { "epoch": 20.015958815958815, "grad_norm": 124.62886810302734, "learning_rate": 6.48934648934649e-06, "loss": 1.8065, "step": 16160 }, { "epoch": 20.016216216216215, "grad_norm": 47.27402114868164, "learning_rate": 6.486486486486487e-06, "loss": 0.3963, "step": 16170 }, { "epoch": 20.016473616473615, "grad_norm": 0.2665756344795227, "learning_rate": 6.483626483626484e-06, "loss": 0.7758, "step": 16180 }, { "epoch": 20.01673101673102, "grad_norm": 31.503211975097656, "learning_rate": 6.480766480766482e-06, "loss": 1.632, "step": 16190 }, { "epoch": 20.01698841698842, "grad_norm": 0.003470065537840128, "learning_rate": 6.4779064779064785e-06, "loss": 0.9137, "step": 16200 }, { "epoch": 20.01724581724582, "grad_norm": 0.24733567237854004, "learning_rate": 6.475046475046476e-06, "loss": 0.0986, "step": 16210 }, { "epoch": 20.01750321750322, "grad_norm": 2.922898292541504, "learning_rate": 6.472186472186472e-06, "loss": 0.4414, "step": 16220 }, { "epoch": 20.01776061776062, "grad_norm": 0.6893411874771118, "learning_rate": 6.4693264693264705e-06, "loss": 0.0069, "step": 16230 }, { "epoch": 20.01801801801802, "grad_norm": 0.3383210599422455, "learning_rate": 6.466466466466467e-06, "loss": 0.4328, "step": 16240 }, { "epoch": 20.01827541827542, "grad_norm": 368.6217346191406, "learning_rate": 6.463606463606464e-06, "loss": 0.9819, "step": 16250 }, { "epoch": 20.01853281853282, "grad_norm": 0.19034291803836823, "learning_rate": 6.460746460746461e-06, "loss": 0.9037, "step": 16260 }, { "epoch": 20.01879021879022, "grad_norm": 0.08844771236181259, "learning_rate": 6.457886457886459e-06, "loss": 1.7082, "step": 16270 }, { "epoch": 20.01904761904762, "grad_norm": 0.32688507437705994, "learning_rate": 6.455026455026455e-06, "loss": 0.851, "step": 16280 }, { "epoch": 20.01930501930502, "grad_norm": 1.21635901927948, "learning_rate": 6.452166452166453e-06, "loss": 0.7596, "step": 16290 }, { "epoch": 20.01956241956242, "grad_norm": 0.3342961370944977, "learning_rate": 6.449306449306449e-06, "loss": 0.0038, "step": 16300 }, { "epoch": 20.01981981981982, "grad_norm": 0.06980375945568085, "learning_rate": 6.446446446446447e-06, "loss": 1.0169, "step": 16310 }, { "epoch": 20.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.6228641271591187, "eval_runtime": 13.4391, "eval_samples_per_second": 3.423, "eval_steps_per_second": 3.423, "step": 16317 }, { "epoch": 21.00007722007722, "grad_norm": 0.05950063839554787, "learning_rate": 6.443586443586444e-06, "loss": 0.7334, "step": 16320 }, { "epoch": 21.00033462033462, "grad_norm": 0.0011386877158656716, "learning_rate": 6.440726440726441e-06, "loss": 0.6521, "step": 16330 }, { "epoch": 21.00059202059202, "grad_norm": 0.11157349497079849, "learning_rate": 6.4378664378664375e-06, "loss": 0.7863, "step": 16340 }, { "epoch": 21.00084942084942, "grad_norm": 0.13690587878227234, "learning_rate": 6.435006435006436e-06, "loss": 0.0157, "step": 16350 }, { "epoch": 21.00110682110682, "grad_norm": 0.1480686217546463, "learning_rate": 6.432146432146432e-06, "loss": 0.4204, "step": 16360 }, { "epoch": 21.00136422136422, "grad_norm": 0.0624036081135273, "learning_rate": 6.4292864292864295e-06, "loss": 1.0578, "step": 16370 }, { "epoch": 21.00162162162162, "grad_norm": 0.44504156708717346, "learning_rate": 6.426426426426427e-06, "loss": 0.3797, "step": 16380 }, { "epoch": 21.00187902187902, "grad_norm": 0.14833220839500427, "learning_rate": 6.423566423566424e-06, "loss": 0.5096, "step": 16390 }, { "epoch": 21.002136422136424, "grad_norm": 0.07981384545564651, "learning_rate": 6.420706420706421e-06, "loss": 0.4623, "step": 16400 }, { "epoch": 21.002393822393824, "grad_norm": 132.79852294921875, "learning_rate": 6.417846417846419e-06, "loss": 0.9159, "step": 16410 }, { "epoch": 21.002651222651224, "grad_norm": 0.0031174507457762957, "learning_rate": 6.414986414986416e-06, "loss": 1.6705, "step": 16420 }, { "epoch": 21.002908622908624, "grad_norm": 0.33982014656066895, "learning_rate": 6.4121264121264125e-06, "loss": 0.0024, "step": 16430 }, { "epoch": 21.003166023166024, "grad_norm": 0.06785816699266434, "learning_rate": 6.409266409266411e-06, "loss": 0.0103, "step": 16440 }, { "epoch": 21.003423423423424, "grad_norm": 0.0009388201870024204, "learning_rate": 6.406406406406407e-06, "loss": 0.604, "step": 16450 }, { "epoch": 21.003680823680824, "grad_norm": 0.3058651387691498, "learning_rate": 6.4035464035464045e-06, "loss": 0.7045, "step": 16460 }, { "epoch": 21.003938223938224, "grad_norm": 0.19235381484031677, "learning_rate": 6.400686400686401e-06, "loss": 1.2, "step": 16470 }, { "epoch": 21.004195624195624, "grad_norm": 0.18182158470153809, "learning_rate": 6.397826397826399e-06, "loss": 0.5073, "step": 16480 }, { "epoch": 21.004453024453024, "grad_norm": 0.0032803218346089125, "learning_rate": 6.394966394966396e-06, "loss": 1.8055, "step": 16490 }, { "epoch": 21.004710424710424, "grad_norm": 374.3335266113281, "learning_rate": 6.392106392106393e-06, "loss": 0.0291, "step": 16500 }, { "epoch": 21.004967824967824, "grad_norm": 0.002655749674886465, "learning_rate": 6.389246389246389e-06, "loss": 0.7674, "step": 16510 }, { "epoch": 21.005225225225225, "grad_norm": 0.035156842321157455, "learning_rate": 6.3863863863863875e-06, "loss": 0.9229, "step": 16520 }, { "epoch": 21.005482625482625, "grad_norm": 0.012515634298324585, "learning_rate": 6.383526383526384e-06, "loss": 0.5184, "step": 16530 }, { "epoch": 21.005740025740025, "grad_norm": 0.226984903216362, "learning_rate": 6.380666380666381e-06, "loss": 0.4388, "step": 16540 }, { "epoch": 21.005997425997425, "grad_norm": 0.13279147446155548, "learning_rate": 6.377806377806378e-06, "loss": 0.9475, "step": 16550 }, { "epoch": 21.006254826254825, "grad_norm": 0.13161540031433105, "learning_rate": 6.374946374946376e-06, "loss": 0.5151, "step": 16560 }, { "epoch": 21.006512226512225, "grad_norm": 0.018993711099028587, "learning_rate": 6.3720863720863724e-06, "loss": 0.4802, "step": 16570 }, { "epoch": 21.00676962676963, "grad_norm": 0.002260217210277915, "learning_rate": 6.36922636922637e-06, "loss": 0.5331, "step": 16580 }, { "epoch": 21.00702702702703, "grad_norm": 0.002179916715249419, "learning_rate": 6.366366366366366e-06, "loss": 1.0489, "step": 16590 }, { "epoch": 21.00728442728443, "grad_norm": 0.41448646783828735, "learning_rate": 6.363506363506364e-06, "loss": 0.9842, "step": 16600 }, { "epoch": 21.00754182754183, "grad_norm": 2809.028076171875, "learning_rate": 6.360646360646361e-06, "loss": 0.4465, "step": 16610 }, { "epoch": 21.00779922779923, "grad_norm": 3.043079376220703, "learning_rate": 6.357786357786358e-06, "loss": 1.0932, "step": 16620 }, { "epoch": 21.00805662805663, "grad_norm": 0.0025238539092242718, "learning_rate": 6.3549263549263555e-06, "loss": 0.7486, "step": 16630 }, { "epoch": 21.00831402831403, "grad_norm": 0.0024203872308135033, "learning_rate": 6.352066352066353e-06, "loss": 1.4116, "step": 16640 }, { "epoch": 21.00857142857143, "grad_norm": 0.18933537602424622, "learning_rate": 6.349206349206349e-06, "loss": 1.9212, "step": 16650 }, { "epoch": 21.00882882882883, "grad_norm": 0.00907233264297247, "learning_rate": 6.3463463463463474e-06, "loss": 1.2356, "step": 16660 }, { "epoch": 21.00908622908623, "grad_norm": 0.43250593543052673, "learning_rate": 6.343486343486344e-06, "loss": 0.0086, "step": 16670 }, { "epoch": 21.00934362934363, "grad_norm": 0.23632937669754028, "learning_rate": 6.340626340626341e-06, "loss": 0.2125, "step": 16680 }, { "epoch": 21.00960102960103, "grad_norm": 0.4456353187561035, "learning_rate": 6.337766337766338e-06, "loss": 0.5689, "step": 16690 }, { "epoch": 21.00985842985843, "grad_norm": 0.003951109945774078, "learning_rate": 6.334906334906336e-06, "loss": 1.4499, "step": 16700 }, { "epoch": 21.01011583011583, "grad_norm": 0.005860066507011652, "learning_rate": 6.332046332046332e-06, "loss": 0.6452, "step": 16710 }, { "epoch": 21.01037323037323, "grad_norm": 0.677861213684082, "learning_rate": 6.32918632918633e-06, "loss": 0.0304, "step": 16720 }, { "epoch": 21.01063063063063, "grad_norm": 0.22151070833206177, "learning_rate": 6.326326326326326e-06, "loss": 0.3219, "step": 16730 }, { "epoch": 21.01088803088803, "grad_norm": 0.002829265082255006, "learning_rate": 6.323466323466324e-06, "loss": 1.018, "step": 16740 }, { "epoch": 21.01114543114543, "grad_norm": 0.0027092285454273224, "learning_rate": 6.320606320606321e-06, "loss": 0.5489, "step": 16750 }, { "epoch": 21.011402831402833, "grad_norm": 0.011305739171802998, "learning_rate": 6.317746317746318e-06, "loss": 0.5165, "step": 16760 }, { "epoch": 21.011660231660233, "grad_norm": 2.433239221572876, "learning_rate": 6.3148863148863145e-06, "loss": 0.0064, "step": 16770 }, { "epoch": 21.011917631917633, "grad_norm": 107.0201416015625, "learning_rate": 6.312026312026313e-06, "loss": 1.519, "step": 16780 }, { "epoch": 21.012175032175033, "grad_norm": 0.11884047091007233, "learning_rate": 6.30916630916631e-06, "loss": 1.4826, "step": 16790 }, { "epoch": 21.012432432432433, "grad_norm": 1.756311297416687, "learning_rate": 6.3063063063063065e-06, "loss": 1.1079, "step": 16800 }, { "epoch": 21.012689832689833, "grad_norm": 0.4935521185398102, "learning_rate": 6.303446303446305e-06, "loss": 0.5533, "step": 16810 }, { "epoch": 21.012947232947234, "grad_norm": 0.14828822016716003, "learning_rate": 6.300586300586301e-06, "loss": 1.0428, "step": 16820 }, { "epoch": 21.013204633204634, "grad_norm": 0.10324635356664658, "learning_rate": 6.297726297726298e-06, "loss": 0.0046, "step": 16830 }, { "epoch": 21.013462033462034, "grad_norm": 0.1749248206615448, "learning_rate": 6.294866294866295e-06, "loss": 0.877, "step": 16840 }, { "epoch": 21.013719433719434, "grad_norm": 0.0036808913573622704, "learning_rate": 6.292006292006293e-06, "loss": 0.5626, "step": 16850 }, { "epoch": 21.013976833976834, "grad_norm": 0.007130262907594442, "learning_rate": 6.2891462891462895e-06, "loss": 0.1482, "step": 16860 }, { "epoch": 21.014234234234234, "grad_norm": 0.02497122436761856, "learning_rate": 6.286286286286287e-06, "loss": 0.4806, "step": 16870 }, { "epoch": 21.014491634491634, "grad_norm": 346.6941833496094, "learning_rate": 6.283426283426284e-06, "loss": 1.3267, "step": 16880 }, { "epoch": 21.014749034749034, "grad_norm": 0.004221619106829166, "learning_rate": 6.2805662805662815e-06, "loss": 1.259, "step": 16890 }, { "epoch": 21.015006435006434, "grad_norm": 0.0013160128146409988, "learning_rate": 6.277706277706278e-06, "loss": 0.715, "step": 16900 }, { "epoch": 21.015263835263834, "grad_norm": 0.0038280917797237635, "learning_rate": 6.274846274846276e-06, "loss": 0.5407, "step": 16910 }, { "epoch": 21.015521235521234, "grad_norm": 35.85282897949219, "learning_rate": 6.2719862719862726e-06, "loss": 0.957, "step": 16920 }, { "epoch": 21.015778635778634, "grad_norm": 0.06263106316328049, "learning_rate": 6.26912626912627e-06, "loss": 0.9689, "step": 16930 }, { "epoch": 21.016036036036034, "grad_norm": 0.09454566240310669, "learning_rate": 6.266266266266266e-06, "loss": 0.5719, "step": 16940 }, { "epoch": 21.016293436293438, "grad_norm": 60.81951904296875, "learning_rate": 6.2634062634062645e-06, "loss": 1.0382, "step": 16950 }, { "epoch": 21.016550836550838, "grad_norm": 29.10282325744629, "learning_rate": 6.260546260546261e-06, "loss": 1.8424, "step": 16960 }, { "epoch": 21.016808236808238, "grad_norm": 0.004924100823700428, "learning_rate": 6.257686257686258e-06, "loss": 0.0054, "step": 16970 }, { "epoch": 21.017065637065638, "grad_norm": 0.08713381737470627, "learning_rate": 6.254826254826255e-06, "loss": 0.0118, "step": 16980 }, { "epoch": 21.017323037323038, "grad_norm": 0.0026128473691642284, "learning_rate": 6.251966251966253e-06, "loss": 0.0031, "step": 16990 }, { "epoch": 21.01758043758044, "grad_norm": 0.0017107778694480658, "learning_rate": 6.249106249106249e-06, "loss": 1.7852, "step": 17000 }, { "epoch": 21.01783783783784, "grad_norm": 57.05978012084961, "learning_rate": 6.246246246246247e-06, "loss": 1.4597, "step": 17010 }, { "epoch": 21.01809523809524, "grad_norm": 6.505259037017822, "learning_rate": 6.243386243386243e-06, "loss": 0.449, "step": 17020 }, { "epoch": 21.01835263835264, "grad_norm": 0.7023016214370728, "learning_rate": 6.240526240526241e-06, "loss": 1.3402, "step": 17030 }, { "epoch": 21.01861003861004, "grad_norm": 0.07186837494373322, "learning_rate": 6.237666237666238e-06, "loss": 0.6516, "step": 17040 }, { "epoch": 21.01886743886744, "grad_norm": 0.006106872111558914, "learning_rate": 6.234806234806235e-06, "loss": 0.0604, "step": 17050 }, { "epoch": 21.01912483912484, "grad_norm": 0.001679259818047285, "learning_rate": 6.2319462319462325e-06, "loss": 0.4562, "step": 17060 }, { "epoch": 21.01938223938224, "grad_norm": 0.20496806502342224, "learning_rate": 6.22908622908623e-06, "loss": 1.5803, "step": 17070 }, { "epoch": 21.01963963963964, "grad_norm": 0.002377886790782213, "learning_rate": 6.226226226226226e-06, "loss": 0.4701, "step": 17080 }, { "epoch": 21.01989703989704, "grad_norm": 15.251885414123535, "learning_rate": 6.2233662233662236e-06, "loss": 0.0059, "step": 17090 }, { "epoch": 21.02, "eval_accuracy": 0.8478260869565217, "eval_loss": 0.7053625583648682, "eval_runtime": 13.4557, "eval_samples_per_second": 3.419, "eval_steps_per_second": 3.419, "step": 17094 }, { "epoch": 22.00015444015444, "grad_norm": 0.002505769720301032, "learning_rate": 6.220506220506221e-06, "loss": 0.5072, "step": 17100 }, { "epoch": 22.00041184041184, "grad_norm": 0.08272158354520798, "learning_rate": 6.217646217646218e-06, "loss": 0.5502, "step": 17110 }, { "epoch": 22.00066924066924, "grad_norm": 0.002082453342154622, "learning_rate": 6.214786214786215e-06, "loss": 1.0242, "step": 17120 }, { "epoch": 22.00092664092664, "grad_norm": 0.18459756672382355, "learning_rate": 6.211926211926213e-06, "loss": 0.4925, "step": 17130 }, { "epoch": 22.00118404118404, "grad_norm": 0.00420813262462616, "learning_rate": 6.209066209066209e-06, "loss": 0.8276, "step": 17140 }, { "epoch": 22.001441441441443, "grad_norm": 0.21216662228107452, "learning_rate": 6.206206206206207e-06, "loss": 0.2827, "step": 17150 }, { "epoch": 22.001698841698843, "grad_norm": 59.495052337646484, "learning_rate": 6.203346203346203e-06, "loss": 0.1756, "step": 17160 }, { "epoch": 22.001956241956243, "grad_norm": 0.07312590628862381, "learning_rate": 6.200486200486201e-06, "loss": 0.0012, "step": 17170 }, { "epoch": 22.002213642213643, "grad_norm": 0.2232220619916916, "learning_rate": 6.1976261976261986e-06, "loss": 0.5605, "step": 17180 }, { "epoch": 22.002471042471043, "grad_norm": 0.00914843287318945, "learning_rate": 6.194766194766195e-06, "loss": 0.5424, "step": 17190 }, { "epoch": 22.002728442728444, "grad_norm": 0.004124210216104984, "learning_rate": 6.191906191906193e-06, "loss": 0.013, "step": 17200 }, { "epoch": 22.002985842985844, "grad_norm": 133.94290161132812, "learning_rate": 6.18904618904619e-06, "loss": 1.0607, "step": 17210 }, { "epoch": 22.003243243243244, "grad_norm": 15.545845031738281, "learning_rate": 6.186186186186187e-06, "loss": 0.5845, "step": 17220 }, { "epoch": 22.003500643500644, "grad_norm": 0.2141716480255127, "learning_rate": 6.1833261833261834e-06, "loss": 1.2612, "step": 17230 }, { "epoch": 22.003758043758044, "grad_norm": 0.11907975375652313, "learning_rate": 6.180466180466182e-06, "loss": 0.5358, "step": 17240 }, { "epoch": 22.004015444015444, "grad_norm": 0.1688116192817688, "learning_rate": 6.177606177606178e-06, "loss": 0.0027, "step": 17250 }, { "epoch": 22.004272844272844, "grad_norm": 29.69304847717285, "learning_rate": 6.174746174746175e-06, "loss": 1.9443, "step": 17260 }, { "epoch": 22.004530244530244, "grad_norm": 0.001488303067162633, "learning_rate": 6.171886171886172e-06, "loss": 0.4166, "step": 17270 }, { "epoch": 22.004787644787644, "grad_norm": 0.003606772283092141, "learning_rate": 6.16902616902617e-06, "loss": 0.0036, "step": 17280 }, { "epoch": 22.005045045045044, "grad_norm": 0.23906883597373962, "learning_rate": 6.1661661661661665e-06, "loss": 0.8975, "step": 17290 }, { "epoch": 22.005302445302444, "grad_norm": 0.0013485512463375926, "learning_rate": 6.163306163306164e-06, "loss": 0.4444, "step": 17300 }, { "epoch": 22.005559845559844, "grad_norm": 0.0031310829799622297, "learning_rate": 6.160446160446161e-06, "loss": 0.0087, "step": 17310 }, { "epoch": 22.005817245817244, "grad_norm": 0.0013559467624872923, "learning_rate": 6.1575861575861584e-06, "loss": 1.025, "step": 17320 }, { "epoch": 22.006074646074644, "grad_norm": 0.11612682044506073, "learning_rate": 6.154726154726155e-06, "loss": 0.5893, "step": 17330 }, { "epoch": 22.006332046332048, "grad_norm": 0.001688244054093957, "learning_rate": 6.151866151866153e-06, "loss": 0.0039, "step": 17340 }, { "epoch": 22.006589446589448, "grad_norm": 0.017765646800398827, "learning_rate": 6.1490061490061495e-06, "loss": 1.5407, "step": 17350 }, { "epoch": 22.006846846846848, "grad_norm": 0.09817246347665787, "learning_rate": 6.146146146146147e-06, "loss": 0.0038, "step": 17360 }, { "epoch": 22.007104247104248, "grad_norm": 0.2186000794172287, "learning_rate": 6.143286143286143e-06, "loss": 0.4914, "step": 17370 }, { "epoch": 22.00736164736165, "grad_norm": 0.0013634796487167478, "learning_rate": 6.1404261404261415e-06, "loss": 0.0024, "step": 17380 }, { "epoch": 22.00761904761905, "grad_norm": 0.12472115457057953, "learning_rate": 6.137566137566138e-06, "loss": 0.615, "step": 17390 }, { "epoch": 22.00787644787645, "grad_norm": 40.63284683227539, "learning_rate": 6.134706134706135e-06, "loss": 1.4173, "step": 17400 }, { "epoch": 22.00813384813385, "grad_norm": 249.48081970214844, "learning_rate": 6.131846131846132e-06, "loss": 0.4317, "step": 17410 }, { "epoch": 22.00839124839125, "grad_norm": 0.1064755842089653, "learning_rate": 6.12898612898613e-06, "loss": 0.9979, "step": 17420 }, { "epoch": 22.00864864864865, "grad_norm": 0.002664172323420644, "learning_rate": 6.126126126126126e-06, "loss": 1.0715, "step": 17430 }, { "epoch": 22.00890604890605, "grad_norm": 0.11099614202976227, "learning_rate": 6.123266123266124e-06, "loss": 1.1395, "step": 17440 }, { "epoch": 22.00916344916345, "grad_norm": 451.1719665527344, "learning_rate": 6.12040612040612e-06, "loss": 0.595, "step": 17450 }, { "epoch": 22.00942084942085, "grad_norm": 1.3345633745193481, "learning_rate": 6.117546117546118e-06, "loss": 0.4494, "step": 17460 }, { "epoch": 22.00967824967825, "grad_norm": 0.8723010420799255, "learning_rate": 6.114686114686115e-06, "loss": 0.0143, "step": 17470 }, { "epoch": 22.00993564993565, "grad_norm": 164.72413635253906, "learning_rate": 6.111826111826112e-06, "loss": 1.8803, "step": 17480 }, { "epoch": 22.01019305019305, "grad_norm": 0.0019611294846981764, "learning_rate": 6.108966108966109e-06, "loss": 0.0016, "step": 17490 }, { "epoch": 22.01045045045045, "grad_norm": 29.908557891845703, "learning_rate": 6.106106106106107e-06, "loss": 1.4703, "step": 17500 }, { "epoch": 22.01070785070785, "grad_norm": 0.449646532535553, "learning_rate": 6.103246103246103e-06, "loss": 0.6162, "step": 17510 }, { "epoch": 22.010965250965253, "grad_norm": 0.007094630040228367, "learning_rate": 6.1003861003861005e-06, "loss": 0.5787, "step": 17520 }, { "epoch": 22.011222651222653, "grad_norm": 0.0014509069733321667, "learning_rate": 6.097526097526098e-06, "loss": 0.0015, "step": 17530 }, { "epoch": 22.011480051480053, "grad_norm": 0.011663383804261684, "learning_rate": 6.094666094666095e-06, "loss": 0.5842, "step": 17540 }, { "epoch": 22.011737451737453, "grad_norm": 0.1782633364200592, "learning_rate": 6.0918060918060925e-06, "loss": 1.0637, "step": 17550 }, { "epoch": 22.011994851994853, "grad_norm": 0.013621443882584572, "learning_rate": 6.08894608894609e-06, "loss": 0.3769, "step": 17560 }, { "epoch": 22.012252252252253, "grad_norm": 0.08306556195020676, "learning_rate": 6.086086086086087e-06, "loss": 0.5647, "step": 17570 }, { "epoch": 22.012509652509653, "grad_norm": 200.47479248046875, "learning_rate": 6.083226083226084e-06, "loss": 1.4816, "step": 17580 }, { "epoch": 22.012767052767053, "grad_norm": 0.008521323092281818, "learning_rate": 6.080366080366082e-06, "loss": 0.9749, "step": 17590 }, { "epoch": 22.013024453024453, "grad_norm": 0.0009275858174078166, "learning_rate": 6.077506077506078e-06, "loss": 2.1086, "step": 17600 }, { "epoch": 22.013281853281853, "grad_norm": 0.10126996785402298, "learning_rate": 6.0746460746460755e-06, "loss": 0.4938, "step": 17610 }, { "epoch": 22.013539253539253, "grad_norm": 0.0797620639204979, "learning_rate": 6.071786071786072e-06, "loss": 1.5423, "step": 17620 }, { "epoch": 22.013796653796653, "grad_norm": 0.19221942126750946, "learning_rate": 6.06892606892607e-06, "loss": 0.0867, "step": 17630 }, { "epoch": 22.014054054054053, "grad_norm": 0.538609504699707, "learning_rate": 6.066066066066067e-06, "loss": 0.3392, "step": 17640 }, { "epoch": 22.014311454311454, "grad_norm": 0.6996312141418457, "learning_rate": 6.063206063206064e-06, "loss": 1.3827, "step": 17650 }, { "epoch": 22.014568854568854, "grad_norm": 0.003910961095243692, "learning_rate": 6.06034606034606e-06, "loss": 0.459, "step": 17660 }, { "epoch": 22.014826254826254, "grad_norm": 0.0005794151220470667, "learning_rate": 6.057486057486059e-06, "loss": 0.5004, "step": 17670 }, { "epoch": 22.015083655083654, "grad_norm": 0.0007814746350049973, "learning_rate": 6.054626054626055e-06, "loss": 0.007, "step": 17680 }, { "epoch": 22.015341055341054, "grad_norm": 0.00357211846858263, "learning_rate": 6.051766051766052e-06, "loss": 0.0053, "step": 17690 }, { "epoch": 22.015598455598454, "grad_norm": 516.8284912109375, "learning_rate": 6.048906048906049e-06, "loss": 1.0764, "step": 17700 }, { "epoch": 22.015855855855857, "grad_norm": 16.29941177368164, "learning_rate": 6.046046046046047e-06, "loss": 1.5809, "step": 17710 }, { "epoch": 22.016113256113258, "grad_norm": 0.0014146931935101748, "learning_rate": 6.0431860431860435e-06, "loss": 0.3413, "step": 17720 }, { "epoch": 22.016370656370658, "grad_norm": 376.0869140625, "learning_rate": 6.040326040326041e-06, "loss": 2.2279, "step": 17730 }, { "epoch": 22.016628056628058, "grad_norm": 0.005802272353321314, "learning_rate": 6.037466037466037e-06, "loss": 0.6852, "step": 17740 }, { "epoch": 22.016885456885458, "grad_norm": 0.07146825641393661, "learning_rate": 6.034606034606035e-06, "loss": 0.8904, "step": 17750 }, { "epoch": 22.017142857142858, "grad_norm": 0.005169512704014778, "learning_rate": 6.031746031746032e-06, "loss": 0.9166, "step": 17760 }, { "epoch": 22.017400257400258, "grad_norm": 0.0047473483718931675, "learning_rate": 6.028886028886029e-06, "loss": 2.7498, "step": 17770 }, { "epoch": 22.017657657657658, "grad_norm": 0.001801575068384409, "learning_rate": 6.0260260260260265e-06, "loss": 0.4033, "step": 17780 }, { "epoch": 22.017915057915058, "grad_norm": 0.19776633381843567, "learning_rate": 6.023166023166024e-06, "loss": 1.0597, "step": 17790 }, { "epoch": 22.018172458172458, "grad_norm": 0.6332172751426697, "learning_rate": 6.02030602030602e-06, "loss": 0.4597, "step": 17800 }, { "epoch": 22.018429858429858, "grad_norm": 33.11475372314453, "learning_rate": 6.0174460174460185e-06, "loss": 1.4584, "step": 17810 }, { "epoch": 22.018687258687258, "grad_norm": 0.4599050283432007, "learning_rate": 6.014586014586015e-06, "loss": 0.015, "step": 17820 }, { "epoch": 22.01894465894466, "grad_norm": 0.21705041825771332, "learning_rate": 6.011726011726012e-06, "loss": 0.5132, "step": 17830 }, { "epoch": 22.01920205920206, "grad_norm": 0.8084737062454224, "learning_rate": 6.008866008866009e-06, "loss": 1.9364, "step": 17840 }, { "epoch": 22.01945945945946, "grad_norm": 0.15941214561462402, "learning_rate": 6.006006006006007e-06, "loss": 0.4626, "step": 17850 }, { "epoch": 22.01971685971686, "grad_norm": 0.001215206808410585, "learning_rate": 6.003146003146003e-06, "loss": 1.1089, "step": 17860 }, { "epoch": 22.01997425997426, "grad_norm": 0.029706159606575966, "learning_rate": 6.000286000286001e-06, "loss": 0.8826, "step": 17870 }, { "epoch": 22.02, "eval_accuracy": 0.7391304347826086, "eval_loss": 1.0915201902389526, "eval_runtime": 13.4617, "eval_samples_per_second": 3.417, "eval_steps_per_second": 3.417, "step": 17871 }, { "epoch": 23.00023166023166, "grad_norm": 0.4556454122066498, "learning_rate": 5.997425997425997e-06, "loss": 0.3958, "step": 17880 }, { "epoch": 23.00048906048906, "grad_norm": 0.3868144452571869, "learning_rate": 5.994565994565995e-06, "loss": 0.5247, "step": 17890 }, { "epoch": 23.00074646074646, "grad_norm": 0.009736313484609127, "learning_rate": 5.991705991705992e-06, "loss": 0.4699, "step": 17900 }, { "epoch": 23.001003861003863, "grad_norm": 4.145432472229004, "learning_rate": 5.988845988845989e-06, "loss": 0.442, "step": 17910 }, { "epoch": 23.001261261261263, "grad_norm": 0.5545969605445862, "learning_rate": 5.985985985985987e-06, "loss": 0.004, "step": 17920 }, { "epoch": 23.001518661518663, "grad_norm": 0.016343528404831886, "learning_rate": 5.983125983125984e-06, "loss": 0.002, "step": 17930 }, { "epoch": 23.001776061776063, "grad_norm": 0.0030438120011240244, "learning_rate": 5.980265980265981e-06, "loss": 1.1311, "step": 17940 }, { "epoch": 23.002033462033463, "grad_norm": 0.13489748537540436, "learning_rate": 5.9774059774059775e-06, "loss": 0.5768, "step": 17950 }, { "epoch": 23.002290862290863, "grad_norm": 72.87088012695312, "learning_rate": 5.974545974545976e-06, "loss": 0.9573, "step": 17960 }, { "epoch": 23.002548262548263, "grad_norm": 0.05114838853478432, "learning_rate": 5.971685971685972e-06, "loss": 0.923, "step": 17970 }, { "epoch": 23.002805662805663, "grad_norm": 0.38993966579437256, "learning_rate": 5.9688259688259695e-06, "loss": 0.0118, "step": 17980 }, { "epoch": 23.003063063063063, "grad_norm": 0.0010464813094586134, "learning_rate": 5.965965965965966e-06, "loss": 0.4669, "step": 17990 }, { "epoch": 23.003320463320463, "grad_norm": 0.0008095205412246287, "learning_rate": 5.963105963105964e-06, "loss": 0.0063, "step": 18000 }, { "epoch": 23.003577863577863, "grad_norm": 0.002685692859813571, "learning_rate": 5.9602459602459606e-06, "loss": 0.002, "step": 18010 }, { "epoch": 23.003835263835263, "grad_norm": 0.0549185648560524, "learning_rate": 5.957385957385958e-06, "loss": 0.0077, "step": 18020 }, { "epoch": 23.004092664092664, "grad_norm": 0.0013326237676665187, "learning_rate": 5.954525954525955e-06, "loss": 0.0023, "step": 18030 }, { "epoch": 23.004350064350064, "grad_norm": 0.002016945742070675, "learning_rate": 5.9516659516659525e-06, "loss": 0.4834, "step": 18040 }, { "epoch": 23.004607464607464, "grad_norm": 0.00044487594277597964, "learning_rate": 5.948805948805949e-06, "loss": 1.4307, "step": 18050 }, { "epoch": 23.004864864864864, "grad_norm": 0.09762829542160034, "learning_rate": 5.945945945945947e-06, "loss": 1.1566, "step": 18060 }, { "epoch": 23.005122265122264, "grad_norm": 124.17290496826172, "learning_rate": 5.943085943085944e-06, "loss": 1.1647, "step": 18070 }, { "epoch": 23.005379665379664, "grad_norm": 0.00485936738550663, "learning_rate": 5.940225940225941e-06, "loss": 1.4024, "step": 18080 }, { "epoch": 23.005637065637064, "grad_norm": 0.20460176467895508, "learning_rate": 5.937365937365937e-06, "loss": 0.8586, "step": 18090 }, { "epoch": 23.005894465894468, "grad_norm": 0.4110458493232727, "learning_rate": 5.9345059345059356e-06, "loss": 0.4524, "step": 18100 }, { "epoch": 23.006151866151868, "grad_norm": 0.7026017904281616, "learning_rate": 5.931645931645932e-06, "loss": 0.46, "step": 18110 }, { "epoch": 23.006409266409268, "grad_norm": 0.053139615803956985, "learning_rate": 5.928785928785929e-06, "loss": 1.507, "step": 18120 }, { "epoch": 23.006666666666668, "grad_norm": 0.06488683074712753, "learning_rate": 5.925925925925926e-06, "loss": 1.1119, "step": 18130 }, { "epoch": 23.006924066924068, "grad_norm": 0.0023442870005965233, "learning_rate": 5.923065923065924e-06, "loss": 0.4408, "step": 18140 }, { "epoch": 23.007181467181468, "grad_norm": 0.0014166425680741668, "learning_rate": 5.9202059202059204e-06, "loss": 0.604, "step": 18150 }, { "epoch": 23.007438867438868, "grad_norm": 0.05282068997621536, "learning_rate": 5.917345917345918e-06, "loss": 0.6589, "step": 18160 }, { "epoch": 23.007696267696268, "grad_norm": 0.16524852812290192, "learning_rate": 5.914485914485914e-06, "loss": 0.6691, "step": 18170 }, { "epoch": 23.007953667953668, "grad_norm": 0.19815614819526672, "learning_rate": 5.911625911625912e-06, "loss": 0.5881, "step": 18180 }, { "epoch": 23.008211068211068, "grad_norm": 0.0784997045993805, "learning_rate": 5.908765908765909e-06, "loss": 0.6896, "step": 18190 }, { "epoch": 23.008468468468468, "grad_norm": 88.85114288330078, "learning_rate": 5.905905905905906e-06, "loss": 0.5887, "step": 18200 }, { "epoch": 23.00872586872587, "grad_norm": 0.0565468929708004, "learning_rate": 5.9030459030459035e-06, "loss": 0.0018, "step": 18210 }, { "epoch": 23.00898326898327, "grad_norm": 0.5154985785484314, "learning_rate": 5.900185900185901e-06, "loss": 0.0018, "step": 18220 }, { "epoch": 23.00924066924067, "grad_norm": 6.401853084564209, "learning_rate": 5.897325897325897e-06, "loss": 0.004, "step": 18230 }, { "epoch": 23.00949806949807, "grad_norm": 0.0013837914448231459, "learning_rate": 5.894465894465895e-06, "loss": 0.9277, "step": 18240 }, { "epoch": 23.00975546975547, "grad_norm": 0.5020064115524292, "learning_rate": 5.891605891605892e-06, "loss": 0.2748, "step": 18250 }, { "epoch": 23.01001287001287, "grad_norm": 0.00070767110446468, "learning_rate": 5.888745888745889e-06, "loss": 0.8912, "step": 18260 }, { "epoch": 23.01027027027027, "grad_norm": 113.7410659790039, "learning_rate": 5.885885885885886e-06, "loss": 1.8436, "step": 18270 }, { "epoch": 23.010527670527672, "grad_norm": 0.0004945895634591579, "learning_rate": 5.883025883025884e-06, "loss": 1.3281, "step": 18280 }, { "epoch": 23.010785070785072, "grad_norm": 0.05701057240366936, "learning_rate": 5.880165880165881e-06, "loss": 1.4695, "step": 18290 }, { "epoch": 23.011042471042472, "grad_norm": 66.18766021728516, "learning_rate": 5.877305877305878e-06, "loss": 1.8692, "step": 18300 }, { "epoch": 23.011299871299872, "grad_norm": 42.15882110595703, "learning_rate": 5.874445874445876e-06, "loss": 0.6605, "step": 18310 }, { "epoch": 23.011557271557272, "grad_norm": 0.0012678230414167047, "learning_rate": 5.871585871585872e-06, "loss": 0.4896, "step": 18320 }, { "epoch": 23.011814671814673, "grad_norm": 0.0016251800116151571, "learning_rate": 5.86872586872587e-06, "loss": 0.4608, "step": 18330 }, { "epoch": 23.012072072072073, "grad_norm": 0.0016299422131851315, "learning_rate": 5.865865865865866e-06, "loss": 0.5679, "step": 18340 }, { "epoch": 23.012329472329473, "grad_norm": 68.07299041748047, "learning_rate": 5.863005863005864e-06, "loss": 1.4937, "step": 18350 }, { "epoch": 23.012586872586873, "grad_norm": 85.31804656982422, "learning_rate": 5.860145860145861e-06, "loss": 0.5221, "step": 18360 }, { "epoch": 23.012844272844273, "grad_norm": 0.05515197291970253, "learning_rate": 5.857285857285858e-06, "loss": 0.507, "step": 18370 }, { "epoch": 23.013101673101673, "grad_norm": 0.0015318605583161116, "learning_rate": 5.8544258544258545e-06, "loss": 0.5488, "step": 18380 }, { "epoch": 23.013359073359073, "grad_norm": 105.14557647705078, "learning_rate": 5.851565851565853e-06, "loss": 0.5017, "step": 18390 }, { "epoch": 23.013616473616473, "grad_norm": 0.0008998190169222653, "learning_rate": 5.848705848705849e-06, "loss": 0.4514, "step": 18400 }, { "epoch": 23.013873873873873, "grad_norm": 32.71900939941406, "learning_rate": 5.8458458458458464e-06, "loss": 1.1053, "step": 18410 }, { "epoch": 23.014131274131273, "grad_norm": 47.391929626464844, "learning_rate": 5.842985842985843e-06, "loss": 0.5652, "step": 18420 }, { "epoch": 23.014388674388673, "grad_norm": 0.0010785746853798628, "learning_rate": 5.840125840125841e-06, "loss": 0.5836, "step": 18430 }, { "epoch": 23.014646074646073, "grad_norm": 0.09901858866214752, "learning_rate": 5.8372658372658375e-06, "loss": 0.9411, "step": 18440 }, { "epoch": 23.014903474903473, "grad_norm": 0.08682543784379959, "learning_rate": 5.834405834405835e-06, "loss": 0.0068, "step": 18450 }, { "epoch": 23.015160875160873, "grad_norm": 0.0029455472249537706, "learning_rate": 5.831545831545832e-06, "loss": 0.001, "step": 18460 }, { "epoch": 23.015418275418277, "grad_norm": 0.01022744458168745, "learning_rate": 5.8286858286858295e-06, "loss": 0.6507, "step": 18470 }, { "epoch": 23.015675675675677, "grad_norm": 0.3779960870742798, "learning_rate": 5.825825825825826e-06, "loss": 0.5542, "step": 18480 }, { "epoch": 23.015933075933077, "grad_norm": 8.886568069458008, "learning_rate": 5.822965822965823e-06, "loss": 1.0689, "step": 18490 }, { "epoch": 23.016190476190477, "grad_norm": 0.0006304460694082081, "learning_rate": 5.820105820105821e-06, "loss": 0.54, "step": 18500 }, { "epoch": 23.016447876447877, "grad_norm": 122.1751937866211, "learning_rate": 5.817245817245818e-06, "loss": 0.6934, "step": 18510 }, { "epoch": 23.016705276705277, "grad_norm": 0.020981237292289734, "learning_rate": 5.814385814385814e-06, "loss": 0.0029, "step": 18520 }, { "epoch": 23.016962676962677, "grad_norm": 0.0008985757594928145, "learning_rate": 5.8115258115258125e-06, "loss": 0.0038, "step": 18530 }, { "epoch": 23.017220077220077, "grad_norm": 0.0008014594204723835, "learning_rate": 5.808665808665809e-06, "loss": 1.7014, "step": 18540 }, { "epoch": 23.017477477477478, "grad_norm": 1.0590686798095703, "learning_rate": 5.805805805805806e-06, "loss": 0.461, "step": 18550 }, { "epoch": 23.017734877734878, "grad_norm": 0.003036707639694214, "learning_rate": 5.802945802945803e-06, "loss": 0.0031, "step": 18560 }, { "epoch": 23.017992277992278, "grad_norm": 0.194211944937706, "learning_rate": 5.800085800085801e-06, "loss": 1.5044, "step": 18570 }, { "epoch": 23.018249678249678, "grad_norm": 53.92101287841797, "learning_rate": 5.797225797225797e-06, "loss": 0.4982, "step": 18580 }, { "epoch": 23.018507078507078, "grad_norm": 0.11296827346086502, "learning_rate": 5.794365794365795e-06, "loss": 0.0015, "step": 18590 }, { "epoch": 23.018764478764478, "grad_norm": 0.0718424916267395, "learning_rate": 5.791505791505791e-06, "loss": 0.0019, "step": 18600 }, { "epoch": 23.019021879021878, "grad_norm": 0.05146656185388565, "learning_rate": 5.788645788645789e-06, "loss": 1.7993, "step": 18610 }, { "epoch": 23.019279279279278, "grad_norm": 107.91058349609375, "learning_rate": 5.785785785785786e-06, "loss": 1.0372, "step": 18620 }, { "epoch": 23.019536679536678, "grad_norm": 94.91177368164062, "learning_rate": 5.782925782925783e-06, "loss": 1.1122, "step": 18630 }, { "epoch": 23.019794079794078, "grad_norm": 0.0015893593663349748, "learning_rate": 5.78006578006578e-06, "loss": 0.4632, "step": 18640 }, { "epoch": 23.02, "eval_accuracy": 0.8913043478260869, "eval_loss": 0.6939725875854492, "eval_runtime": 13.4416, "eval_samples_per_second": 3.422, "eval_steps_per_second": 3.422, "step": 18648 }, { "epoch": 24.00005148005148, "grad_norm": 0.05807127431035042, "learning_rate": 5.777205777205778e-06, "loss": 1.1037, "step": 18650 }, { "epoch": 24.00030888030888, "grad_norm": 92.4770278930664, "learning_rate": 5.774345774345775e-06, "loss": 1.2418, "step": 18660 }, { "epoch": 24.000566280566282, "grad_norm": 0.07446417212486267, "learning_rate": 5.7714857714857716e-06, "loss": 0.5503, "step": 18670 }, { "epoch": 24.000823680823682, "grad_norm": 0.0009795395890250802, "learning_rate": 5.76862576862577e-06, "loss": 0.0079, "step": 18680 }, { "epoch": 24.001081081081082, "grad_norm": 0.5915771126747131, "learning_rate": 5.765765765765766e-06, "loss": 0.0016, "step": 18690 }, { "epoch": 24.001338481338482, "grad_norm": 1.0889242887496948, "learning_rate": 5.7629057629057635e-06, "loss": 0.2133, "step": 18700 }, { "epoch": 24.001595881595883, "grad_norm": 0.0007270134519785643, "learning_rate": 5.760045760045761e-06, "loss": 0.425, "step": 18710 }, { "epoch": 24.001853281853283, "grad_norm": 0.0018572467379271984, "learning_rate": 5.757185757185758e-06, "loss": 1.1419, "step": 18720 }, { "epoch": 24.002110682110683, "grad_norm": 0.02151697687804699, "learning_rate": 5.754325754325755e-06, "loss": 0.0007, "step": 18730 }, { "epoch": 24.002368082368083, "grad_norm": 0.003686880227178335, "learning_rate": 5.751465751465753e-06, "loss": 0.0021, "step": 18740 }, { "epoch": 24.002625482625483, "grad_norm": 0.0470479391515255, "learning_rate": 5.748605748605749e-06, "loss": 0.0016, "step": 18750 }, { "epoch": 24.002882882882883, "grad_norm": 0.06280643492937088, "learning_rate": 5.7457457457457466e-06, "loss": 0.1671, "step": 18760 }, { "epoch": 24.003140283140283, "grad_norm": 0.0023778968024998903, "learning_rate": 5.742885742885743e-06, "loss": 0.5986, "step": 18770 }, { "epoch": 24.003397683397683, "grad_norm": 0.14014458656311035, "learning_rate": 5.740025740025741e-06, "loss": 0.716, "step": 18780 }, { "epoch": 24.003655083655083, "grad_norm": 0.009725264273583889, "learning_rate": 5.737165737165738e-06, "loss": 0.0008, "step": 18790 }, { "epoch": 24.003912483912483, "grad_norm": 0.16021865606307983, "learning_rate": 5.734305734305735e-06, "loss": 0.0109, "step": 18800 }, { "epoch": 24.004169884169883, "grad_norm": 0.0022326132748275995, "learning_rate": 5.7314457314457315e-06, "loss": 0.4593, "step": 18810 }, { "epoch": 24.004427284427283, "grad_norm": 48.16182327270508, "learning_rate": 5.72858572858573e-06, "loss": 0.5535, "step": 18820 }, { "epoch": 24.004684684684683, "grad_norm": 0.034395068883895874, "learning_rate": 5.725725725725726e-06, "loss": 0.6363, "step": 18830 }, { "epoch": 24.004942084942083, "grad_norm": 93.80135345458984, "learning_rate": 5.722865722865723e-06, "loss": 0.7659, "step": 18840 }, { "epoch": 24.005199485199483, "grad_norm": 0.12201082706451416, "learning_rate": 5.72000572000572e-06, "loss": 1.012, "step": 18850 }, { "epoch": 24.005456885456887, "grad_norm": 44.175174713134766, "learning_rate": 5.717145717145718e-06, "loss": 1.1665, "step": 18860 }, { "epoch": 24.005714285714287, "grad_norm": 0.003605901263654232, "learning_rate": 5.7142857142857145e-06, "loss": 1.3019, "step": 18870 }, { "epoch": 24.005971685971687, "grad_norm": 0.02306421287357807, "learning_rate": 5.711425711425712e-06, "loss": 0.0016, "step": 18880 }, { "epoch": 24.006229086229087, "grad_norm": 0.0023764725774526596, "learning_rate": 5.708565708565708e-06, "loss": 0.3606, "step": 18890 }, { "epoch": 24.006486486486487, "grad_norm": 105.87065124511719, "learning_rate": 5.7057057057057065e-06, "loss": 2.0477, "step": 18900 }, { "epoch": 24.006743886743887, "grad_norm": 33.09611511230469, "learning_rate": 5.702845702845703e-06, "loss": 0.589, "step": 18910 }, { "epoch": 24.007001287001287, "grad_norm": 0.10097678750753403, "learning_rate": 5.6999856999857e-06, "loss": 0.9933, "step": 18920 }, { "epoch": 24.007258687258688, "grad_norm": 0.0016176817007362843, "learning_rate": 5.6971256971256976e-06, "loss": 1.1839, "step": 18930 }, { "epoch": 24.007516087516088, "grad_norm": 0.006734101101756096, "learning_rate": 5.694265694265695e-06, "loss": 1.6423, "step": 18940 }, { "epoch": 24.007773487773488, "grad_norm": 0.015379363670945168, "learning_rate": 5.691405691405691e-06, "loss": 0.6437, "step": 18950 }, { "epoch": 24.008030888030888, "grad_norm": 0.16267436742782593, "learning_rate": 5.6885456885456895e-06, "loss": 0.5995, "step": 18960 }, { "epoch": 24.008288288288288, "grad_norm": 0.11775770038366318, "learning_rate": 5.685685685685686e-06, "loss": 0.5405, "step": 18970 }, { "epoch": 24.008545688545688, "grad_norm": 0.007473708596080542, "learning_rate": 5.682825682825683e-06, "loss": 0.5841, "step": 18980 }, { "epoch": 24.008803088803088, "grad_norm": 0.02219763956964016, "learning_rate": 5.67996567996568e-06, "loss": 0.6271, "step": 18990 }, { "epoch": 24.009060489060488, "grad_norm": 0.03645698353648186, "learning_rate": 5.677105677105678e-06, "loss": 0.5254, "step": 19000 }, { "epoch": 24.009317889317888, "grad_norm": 0.13147737085819244, "learning_rate": 5.674245674245674e-06, "loss": 0.5281, "step": 19010 }, { "epoch": 24.009575289575288, "grad_norm": 0.0017814674647524953, "learning_rate": 5.671385671385672e-06, "loss": 0.5026, "step": 19020 }, { "epoch": 24.009832689832688, "grad_norm": 0.5326687097549438, "learning_rate": 5.668525668525668e-06, "loss": 0.0525, "step": 19030 }, { "epoch": 24.01009009009009, "grad_norm": 0.003802061080932617, "learning_rate": 5.665665665665666e-06, "loss": 0.2906, "step": 19040 }, { "epoch": 24.010347490347492, "grad_norm": 0.12540924549102783, "learning_rate": 5.662805662805664e-06, "loss": 0.6137, "step": 19050 }, { "epoch": 24.010604890604892, "grad_norm": 0.002418956020846963, "learning_rate": 5.65994565994566e-06, "loss": 0.2951, "step": 19060 }, { "epoch": 24.010862290862292, "grad_norm": 0.000946474785450846, "learning_rate": 5.657085657085658e-06, "loss": 0.0026, "step": 19070 }, { "epoch": 24.011119691119692, "grad_norm": 0.8769616484642029, "learning_rate": 5.654225654225655e-06, "loss": 1.3478, "step": 19080 }, { "epoch": 24.011377091377092, "grad_norm": 66.09345245361328, "learning_rate": 5.651365651365652e-06, "loss": 0.4886, "step": 19090 }, { "epoch": 24.011634491634492, "grad_norm": 60.73049545288086, "learning_rate": 5.6485056485056485e-06, "loss": 3.3388, "step": 19100 }, { "epoch": 24.011891891891892, "grad_norm": 0.6036048531532288, "learning_rate": 5.645645645645647e-06, "loss": 1.0743, "step": 19110 }, { "epoch": 24.012149292149292, "grad_norm": 0.00891997292637825, "learning_rate": 5.642785642785643e-06, "loss": 1.5969, "step": 19120 }, { "epoch": 24.012406692406692, "grad_norm": 0.007564838510006666, "learning_rate": 5.6399256399256405e-06, "loss": 1.0353, "step": 19130 }, { "epoch": 24.012664092664092, "grad_norm": 0.0008338902262039483, "learning_rate": 5.637065637065637e-06, "loss": 0.2451, "step": 19140 }, { "epoch": 24.012921492921492, "grad_norm": 66.34471893310547, "learning_rate": 5.634205634205635e-06, "loss": 3.2594, "step": 19150 }, { "epoch": 24.013178893178893, "grad_norm": 0.5221831798553467, "learning_rate": 5.631345631345632e-06, "loss": 0.4028, "step": 19160 }, { "epoch": 24.013436293436293, "grad_norm": 0.0011873157927766442, "learning_rate": 5.628485628485629e-06, "loss": 0.9757, "step": 19170 }, { "epoch": 24.013693693693693, "grad_norm": 0.011443239636719227, "learning_rate": 5.625625625625626e-06, "loss": 0.8103, "step": 19180 }, { "epoch": 24.013951093951093, "grad_norm": 0.7135658860206604, "learning_rate": 5.6227656227656235e-06, "loss": 0.0047, "step": 19190 }, { "epoch": 24.014208494208493, "grad_norm": 0.09131744503974915, "learning_rate": 5.61990561990562e-06, "loss": 0.529, "step": 19200 }, { "epoch": 24.014465894465893, "grad_norm": 2919.614501953125, "learning_rate": 5.617045617045618e-06, "loss": 0.5351, "step": 19210 }, { "epoch": 24.014723294723296, "grad_norm": 0.0010455803712829947, "learning_rate": 5.614185614185615e-06, "loss": 1.8551, "step": 19220 }, { "epoch": 24.014980694980697, "grad_norm": 0.0007820248720236123, "learning_rate": 5.611325611325612e-06, "loss": 0.3175, "step": 19230 }, { "epoch": 24.015238095238097, "grad_norm": 7.954380512237549, "learning_rate": 5.6084656084656084e-06, "loss": 0.6042, "step": 19240 }, { "epoch": 24.015495495495497, "grad_norm": 0.33642780780792236, "learning_rate": 5.605605605605607e-06, "loss": 2.7028, "step": 19250 }, { "epoch": 24.015752895752897, "grad_norm": 0.17822858691215515, "learning_rate": 5.602745602745603e-06, "loss": 0.0028, "step": 19260 }, { "epoch": 24.016010296010297, "grad_norm": 0.2161465287208557, "learning_rate": 5.5998855998856e-06, "loss": 0.0047, "step": 19270 }, { "epoch": 24.016267696267697, "grad_norm": 0.16744586825370789, "learning_rate": 5.597025597025597e-06, "loss": 0.4083, "step": 19280 }, { "epoch": 24.016525096525097, "grad_norm": 0.001031736028380692, "learning_rate": 5.594165594165595e-06, "loss": 1.0542, "step": 19290 }, { "epoch": 24.016782496782497, "grad_norm": 0.45700398087501526, "learning_rate": 5.5913055913055915e-06, "loss": 0.0029, "step": 19300 }, { "epoch": 24.017039897039897, "grad_norm": 0.12631818652153015, "learning_rate": 5.588445588445589e-06, "loss": 0.3904, "step": 19310 }, { "epoch": 24.017297297297297, "grad_norm": 0.0006879584980197251, "learning_rate": 5.585585585585585e-06, "loss": 0.3659, "step": 19320 }, { "epoch": 24.017554697554697, "grad_norm": 633.3937377929688, "learning_rate": 5.5827255827255834e-06, "loss": 1.9248, "step": 19330 }, { "epoch": 24.017812097812097, "grad_norm": 0.5299988985061646, "learning_rate": 5.57986557986558e-06, "loss": 0.0055, "step": 19340 }, { "epoch": 24.018069498069497, "grad_norm": 0.06562308967113495, "learning_rate": 5.577005577005577e-06, "loss": 0.5882, "step": 19350 }, { "epoch": 24.018326898326897, "grad_norm": 0.07744754105806351, "learning_rate": 5.574145574145574e-06, "loss": 0.9198, "step": 19360 }, { "epoch": 24.018584298584297, "grad_norm": 0.8179073929786682, "learning_rate": 5.571285571285572e-06, "loss": 1.6893, "step": 19370 }, { "epoch": 24.018841698841698, "grad_norm": 0.12629270553588867, "learning_rate": 5.568425568425568e-06, "loss": 0.4581, "step": 19380 }, { "epoch": 24.019099099099098, "grad_norm": 0.1680009514093399, "learning_rate": 5.565565565565566e-06, "loss": 0.0179, "step": 19390 }, { "epoch": 24.019356499356498, "grad_norm": 0.0008770914864726365, "learning_rate": 5.562705562705563e-06, "loss": 1.0297, "step": 19400 }, { "epoch": 24.0196138996139, "grad_norm": 0.01934773102402687, "learning_rate": 5.55984555984556e-06, "loss": 0.0036, "step": 19410 }, { "epoch": 24.0198712998713, "grad_norm": 0.0009819872211664915, "learning_rate": 5.556985556985558e-06, "loss": 0.4901, "step": 19420 }, { "epoch": 24.02, "eval_accuracy": 0.8260869565217391, "eval_loss": 1.0198997259140015, "eval_runtime": 13.4522, "eval_samples_per_second": 3.42, "eval_steps_per_second": 3.42, "step": 19425 }, { "epoch": 25.000128700128702, "grad_norm": 0.0006284094415605068, "learning_rate": 5.554125554125555e-06, "loss": 0.0018, "step": 19430 }, { "epoch": 25.000386100386102, "grad_norm": 0.20130889117717743, "learning_rate": 5.551265551265552e-06, "loss": 0.4494, "step": 19440 }, { "epoch": 25.000643500643502, "grad_norm": 0.2460191547870636, "learning_rate": 5.548405548405549e-06, "loss": 0.4706, "step": 19450 }, { "epoch": 25.000900900900902, "grad_norm": 0.23789767920970917, "learning_rate": 5.545545545545547e-06, "loss": 0.0023, "step": 19460 }, { "epoch": 25.001158301158302, "grad_norm": 0.0007821908802725375, "learning_rate": 5.542685542685543e-06, "loss": 0.4147, "step": 19470 }, { "epoch": 25.001415701415702, "grad_norm": 0.10670550167560577, "learning_rate": 5.539825539825541e-06, "loss": 0.4267, "step": 19480 }, { "epoch": 25.001673101673102, "grad_norm": 0.05941932648420334, "learning_rate": 5.536965536965537e-06, "loss": 0.0058, "step": 19490 }, { "epoch": 25.001930501930502, "grad_norm": 0.06344068050384521, "learning_rate": 5.534105534105535e-06, "loss": 0.0027, "step": 19500 }, { "epoch": 25.002187902187902, "grad_norm": 0.06859828531742096, "learning_rate": 5.531245531245532e-06, "loss": 0.5773, "step": 19510 }, { "epoch": 25.002445302445302, "grad_norm": 0.14273127913475037, "learning_rate": 5.528385528385529e-06, "loss": 0.0028, "step": 19520 }, { "epoch": 25.002702702702702, "grad_norm": 0.0016670461045578122, "learning_rate": 5.5255255255255255e-06, "loss": 1.1759, "step": 19530 }, { "epoch": 25.002960102960103, "grad_norm": 0.5187317132949829, "learning_rate": 5.522665522665524e-06, "loss": 0.5372, "step": 19540 }, { "epoch": 25.003217503217503, "grad_norm": 0.0007036603637970984, "learning_rate": 5.51980551980552e-06, "loss": 1.282, "step": 19550 }, { "epoch": 25.003474903474903, "grad_norm": 899.7644653320312, "learning_rate": 5.5169455169455175e-06, "loss": 0.7085, "step": 19560 }, { "epoch": 25.003732303732303, "grad_norm": 0.0008057368686422706, "learning_rate": 5.514085514085514e-06, "loss": 0.0022, "step": 19570 }, { "epoch": 25.003989703989703, "grad_norm": 83.44464111328125, "learning_rate": 5.511225511225512e-06, "loss": 0.4203, "step": 19580 }, { "epoch": 25.004247104247103, "grad_norm": 0.32760846614837646, "learning_rate": 5.5083655083655086e-06, "loss": 0.8353, "step": 19590 }, { "epoch": 25.004504504504503, "grad_norm": 8.652959823608398, "learning_rate": 5.505505505505506e-06, "loss": 0.1742, "step": 19600 }, { "epoch": 25.004761904761907, "grad_norm": 0.0009207252878695726, "learning_rate": 5.502645502645503e-06, "loss": 0.4249, "step": 19610 }, { "epoch": 25.005019305019307, "grad_norm": 0.0004057708429172635, "learning_rate": 5.4997854997855005e-06, "loss": 0.0023, "step": 19620 }, { "epoch": 25.005276705276707, "grad_norm": 0.046492137014865875, "learning_rate": 5.496925496925497e-06, "loss": 0.7061, "step": 19630 }, { "epoch": 25.005534105534107, "grad_norm": 0.03466697037220001, "learning_rate": 5.494065494065494e-06, "loss": 0.5153, "step": 19640 }, { "epoch": 25.005791505791507, "grad_norm": 0.2154761403799057, "learning_rate": 5.491205491205492e-06, "loss": 0.5921, "step": 19650 }, { "epoch": 25.006048906048907, "grad_norm": 98.8327865600586, "learning_rate": 5.488345488345489e-06, "loss": 1.7348, "step": 19660 }, { "epoch": 25.006306306306307, "grad_norm": 0.031177762895822525, "learning_rate": 5.485485485485485e-06, "loss": 1.097, "step": 19670 }, { "epoch": 25.006563706563707, "grad_norm": 0.03502604365348816, "learning_rate": 5.4826254826254836e-06, "loss": 0.6449, "step": 19680 }, { "epoch": 25.006821106821107, "grad_norm": 0.0010554458713158965, "learning_rate": 5.47976547976548e-06, "loss": 0.0009, "step": 19690 }, { "epoch": 25.007078507078507, "grad_norm": 135.35574340820312, "learning_rate": 5.476905476905477e-06, "loss": 0.6102, "step": 19700 }, { "epoch": 25.007335907335907, "grad_norm": 0.005914623849093914, "learning_rate": 5.474045474045474e-06, "loss": 1.36, "step": 19710 }, { "epoch": 25.007593307593307, "grad_norm": 0.011718814261257648, "learning_rate": 5.471185471185472e-06, "loss": 0.0047, "step": 19720 }, { "epoch": 25.007850707850707, "grad_norm": 32.16627883911133, "learning_rate": 5.4683254683254685e-06, "loss": 1.2207, "step": 19730 }, { "epoch": 25.008108108108107, "grad_norm": 0.0004114433249924332, "learning_rate": 5.465465465465466e-06, "loss": 1.9784, "step": 19740 }, { "epoch": 25.008365508365507, "grad_norm": 0.0007554457988590002, "learning_rate": 5.462605462605462e-06, "loss": 0.4903, "step": 19750 }, { "epoch": 25.008622908622907, "grad_norm": 0.09954472631216049, "learning_rate": 5.45974545974546e-06, "loss": 0.0039, "step": 19760 }, { "epoch": 25.008880308880308, "grad_norm": 0.000810124387498945, "learning_rate": 5.456885456885457e-06, "loss": 0.5262, "step": 19770 }, { "epoch": 25.009137709137708, "grad_norm": 0.000533245038241148, "learning_rate": 5.454025454025454e-06, "loss": 0.0051, "step": 19780 }, { "epoch": 25.009395109395108, "grad_norm": 33.438385009765625, "learning_rate": 5.451165451165452e-06, "loss": 0.537, "step": 19790 }, { "epoch": 25.00965250965251, "grad_norm": 187.0354766845703, "learning_rate": 5.448305448305449e-06, "loss": 0.8539, "step": 19800 }, { "epoch": 25.00990990990991, "grad_norm": 0.11020729690790176, "learning_rate": 5.445445445445446e-06, "loss": 0.6905, "step": 19810 }, { "epoch": 25.01016731016731, "grad_norm": 0.1317060887813568, "learning_rate": 5.442585442585443e-06, "loss": 0.498, "step": 19820 }, { "epoch": 25.01042471042471, "grad_norm": 0.028832558542490005, "learning_rate": 5.439725439725441e-06, "loss": 0.004, "step": 19830 }, { "epoch": 25.01068211068211, "grad_norm": 36.885658264160156, "learning_rate": 5.436865436865437e-06, "loss": 1.3712, "step": 19840 }, { "epoch": 25.01093951093951, "grad_norm": 0.062064073979854584, "learning_rate": 5.4340054340054346e-06, "loss": 0.0027, "step": 19850 }, { "epoch": 25.01119691119691, "grad_norm": 0.003508289810270071, "learning_rate": 5.431145431145432e-06, "loss": 1.2743, "step": 19860 }, { "epoch": 25.01145431145431, "grad_norm": 0.21302658319473267, "learning_rate": 5.428285428285429e-06, "loss": 0.0033, "step": 19870 }, { "epoch": 25.011711711711712, "grad_norm": 304.8260192871094, "learning_rate": 5.425425425425426e-06, "loss": 0.4843, "step": 19880 }, { "epoch": 25.011969111969112, "grad_norm": 0.0794137567281723, "learning_rate": 5.422565422565423e-06, "loss": 0.5961, "step": 19890 }, { "epoch": 25.012226512226512, "grad_norm": 1.3407971858978271, "learning_rate": 5.41970541970542e-06, "loss": 0.3193, "step": 19900 }, { "epoch": 25.012483912483912, "grad_norm": 0.0008371305884793401, "learning_rate": 5.416845416845418e-06, "loss": 0.0019, "step": 19910 }, { "epoch": 25.012741312741312, "grad_norm": 0.00039880804251879454, "learning_rate": 5.413985413985414e-06, "loss": 1.143, "step": 19920 }, { "epoch": 25.012998712998712, "grad_norm": 0.002268696902319789, "learning_rate": 5.411125411125412e-06, "loss": 0.002, "step": 19930 }, { "epoch": 25.013256113256112, "grad_norm": 140.1703643798828, "learning_rate": 5.408265408265409e-06, "loss": 0.021, "step": 19940 }, { "epoch": 25.013513513513512, "grad_norm": 26.61545181274414, "learning_rate": 5.405405405405406e-06, "loss": 0.0069, "step": 19950 }, { "epoch": 25.013770913770912, "grad_norm": 0.00035982835106551647, "learning_rate": 5.4025454025454025e-06, "loss": 0.5916, "step": 19960 }, { "epoch": 25.014028314028312, "grad_norm": 0.11984622478485107, "learning_rate": 5.399685399685401e-06, "loss": 0.5778, "step": 19970 }, { "epoch": 25.014285714285716, "grad_norm": 0.22817088663578033, "learning_rate": 5.396825396825397e-06, "loss": 0.0021, "step": 19980 }, { "epoch": 25.014543114543116, "grad_norm": 0.2587432861328125, "learning_rate": 5.3939653939653944e-06, "loss": 0.8712, "step": 19990 }, { "epoch": 25.014800514800516, "grad_norm": 0.007341683376580477, "learning_rate": 5.391105391105391e-06, "loss": 0.6829, "step": 20000 }, { "epoch": 25.015057915057916, "grad_norm": 0.0007906981627456844, "learning_rate": 5.388245388245389e-06, "loss": 0.5817, "step": 20010 }, { "epoch": 25.015315315315316, "grad_norm": 0.0988348200917244, "learning_rate": 5.3853853853853856e-06, "loss": 1.7047, "step": 20020 }, { "epoch": 25.015572715572716, "grad_norm": 0.0006614571902900934, "learning_rate": 5.382525382525383e-06, "loss": 0.4903, "step": 20030 }, { "epoch": 25.015830115830116, "grad_norm": 0.2373206615447998, "learning_rate": 5.379665379665379e-06, "loss": 0.002, "step": 20040 }, { "epoch": 25.016087516087516, "grad_norm": 0.016258088871836662, "learning_rate": 5.3768053768053775e-06, "loss": 0.0274, "step": 20050 }, { "epoch": 25.016344916344917, "grad_norm": 148.20025634765625, "learning_rate": 5.373945373945374e-06, "loss": 0.9257, "step": 20060 }, { "epoch": 25.016602316602317, "grad_norm": 0.030567055568099022, "learning_rate": 5.371085371085371e-06, "loss": 0.0024, "step": 20070 }, { "epoch": 25.016859716859717, "grad_norm": 0.007402033545076847, "learning_rate": 5.368225368225369e-06, "loss": 0.6018, "step": 20080 }, { "epoch": 25.017117117117117, "grad_norm": 0.0003875165421050042, "learning_rate": 5.365365365365366e-06, "loss": 1.2368, "step": 20090 }, { "epoch": 25.017374517374517, "grad_norm": 0.0308153685182333, "learning_rate": 5.362505362505362e-06, "loss": 0.0537, "step": 20100 }, { "epoch": 25.017631917631917, "grad_norm": 1280.4652099609375, "learning_rate": 5.3596453596453605e-06, "loss": 2.5197, "step": 20110 }, { "epoch": 25.017889317889317, "grad_norm": 2.078805685043335, "learning_rate": 5.356785356785357e-06, "loss": 0.8314, "step": 20120 }, { "epoch": 25.018146718146717, "grad_norm": 0.001093574333935976, "learning_rate": 5.353925353925354e-06, "loss": 0.6905, "step": 20130 }, { "epoch": 25.018404118404117, "grad_norm": 0.0019031533738598228, "learning_rate": 5.351065351065351e-06, "loss": 0.0019, "step": 20140 }, { "epoch": 25.018661518661517, "grad_norm": 0.07940136641263962, "learning_rate": 5.348205348205349e-06, "loss": 1.0073, "step": 20150 }, { "epoch": 25.018918918918917, "grad_norm": 0.004731150344014168, "learning_rate": 5.345345345345346e-06, "loss": 0.0022, "step": 20160 }, { "epoch": 25.01917631917632, "grad_norm": 0.007168655749410391, "learning_rate": 5.342485342485343e-06, "loss": 1.4627, "step": 20170 }, { "epoch": 25.01943371943372, "grad_norm": 0.0005902776028960943, "learning_rate": 5.339625339625341e-06, "loss": 0.0157, "step": 20180 }, { "epoch": 25.01969111969112, "grad_norm": 1.8064913749694824, "learning_rate": 5.336765336765337e-06, "loss": 1.1586, "step": 20190 }, { "epoch": 25.01994851994852, "grad_norm": 0.08647841215133667, "learning_rate": 5.333905333905335e-06, "loss": 0.3927, "step": 20200 }, { "epoch": 25.02, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.8669257760047913, "eval_runtime": 13.4055, "eval_samples_per_second": 3.431, "eval_steps_per_second": 3.431, "step": 20202 }, { "epoch": 26.00020592020592, "grad_norm": 41.40488815307617, "learning_rate": 5.331045331045331e-06, "loss": 1.0282, "step": 20210 }, { "epoch": 26.00046332046332, "grad_norm": 1.0805187225341797, "learning_rate": 5.328185328185329e-06, "loss": 1.0007, "step": 20220 }, { "epoch": 26.00072072072072, "grad_norm": 0.017646795138716698, "learning_rate": 5.325325325325326e-06, "loss": 0.4639, "step": 20230 }, { "epoch": 26.00097812097812, "grad_norm": 0.08881297707557678, "learning_rate": 5.322465322465323e-06, "loss": 0.0305, "step": 20240 }, { "epoch": 26.00123552123552, "grad_norm": 0.09640834480524063, "learning_rate": 5.31960531960532e-06, "loss": 0.494, "step": 20250 }, { "epoch": 26.001492921492922, "grad_norm": 87.36897277832031, "learning_rate": 5.316745316745318e-06, "loss": 1.564, "step": 20260 }, { "epoch": 26.001750321750322, "grad_norm": 0.0012800369877368212, "learning_rate": 5.313885313885314e-06, "loss": 0.0018, "step": 20270 }, { "epoch": 26.002007722007722, "grad_norm": 0.09953822195529938, "learning_rate": 5.3110253110253115e-06, "loss": 0.0041, "step": 20280 }, { "epoch": 26.002265122265122, "grad_norm": 39.34917449951172, "learning_rate": 5.308165308165308e-06, "loss": 1.5296, "step": 20290 }, { "epoch": 26.002522522522522, "grad_norm": 0.09963509440422058, "learning_rate": 5.305305305305306e-06, "loss": 0.0017, "step": 20300 }, { "epoch": 26.002779922779922, "grad_norm": 0.16209664940834045, "learning_rate": 5.302445302445303e-06, "loss": 0.494, "step": 20310 }, { "epoch": 26.003037323037322, "grad_norm": 0.0013167433207854629, "learning_rate": 5.2995852995853e-06, "loss": 0.0025, "step": 20320 }, { "epoch": 26.003294723294722, "grad_norm": 0.0038293786346912384, "learning_rate": 5.296725296725297e-06, "loss": 0.0017, "step": 20330 }, { "epoch": 26.003552123552122, "grad_norm": 0.0007873664726503193, "learning_rate": 5.293865293865295e-06, "loss": 0.5308, "step": 20340 }, { "epoch": 26.003809523809522, "grad_norm": 0.07472074031829834, "learning_rate": 5.291005291005291e-06, "loss": 1.9437, "step": 20350 }, { "epoch": 26.004066924066922, "grad_norm": 0.0006641810177825391, "learning_rate": 5.288145288145289e-06, "loss": 0.0143, "step": 20360 }, { "epoch": 26.004324324324326, "grad_norm": 0.2543509304523468, "learning_rate": 5.285285285285286e-06, "loss": 0.3686, "step": 20370 }, { "epoch": 26.004581724581726, "grad_norm": 0.0021348209120333195, "learning_rate": 5.282425282425283e-06, "loss": 0.6354, "step": 20380 }, { "epoch": 26.004839124839126, "grad_norm": 0.0008653805707581341, "learning_rate": 5.2795652795652795e-06, "loss": 0.3066, "step": 20390 }, { "epoch": 26.005096525096526, "grad_norm": 0.0004059734055772424, "learning_rate": 5.276705276705278e-06, "loss": 0.0022, "step": 20400 }, { "epoch": 26.005353925353926, "grad_norm": 0.0010638394160196185, "learning_rate": 5.273845273845274e-06, "loss": 0.5565, "step": 20410 }, { "epoch": 26.005611325611326, "grad_norm": 0.0026838104240596294, "learning_rate": 5.2709852709852714e-06, "loss": 0.5538, "step": 20420 }, { "epoch": 26.005868725868726, "grad_norm": 0.0007530002039857209, "learning_rate": 5.268125268125268e-06, "loss": 0.8306, "step": 20430 }, { "epoch": 26.006126126126127, "grad_norm": 0.04654620960354805, "learning_rate": 5.265265265265266e-06, "loss": 0.0011, "step": 20440 }, { "epoch": 26.006383526383527, "grad_norm": 1473.2542724609375, "learning_rate": 5.2624052624052625e-06, "loss": 0.1808, "step": 20450 }, { "epoch": 26.006640926640927, "grad_norm": 0.08300590515136719, "learning_rate": 5.25954525954526e-06, "loss": 0.9119, "step": 20460 }, { "epoch": 26.006898326898327, "grad_norm": 0.047874584794044495, "learning_rate": 5.256685256685256e-06, "loss": 0.0016, "step": 20470 }, { "epoch": 26.007155727155727, "grad_norm": 0.4152180254459381, "learning_rate": 5.2538252538252545e-06, "loss": 1.0339, "step": 20480 }, { "epoch": 26.007413127413127, "grad_norm": 0.000853995734360069, "learning_rate": 5.250965250965251e-06, "loss": 0.9289, "step": 20490 }, { "epoch": 26.007670527670527, "grad_norm": 0.08005805313587189, "learning_rate": 5.248105248105248e-06, "loss": 1.8532, "step": 20500 }, { "epoch": 26.007927927927927, "grad_norm": 1.376071810722351, "learning_rate": 5.245245245245245e-06, "loss": 0.6187, "step": 20510 }, { "epoch": 26.008185328185327, "grad_norm": 4.983120441436768, "learning_rate": 5.242385242385243e-06, "loss": 0.5194, "step": 20520 }, { "epoch": 26.008442728442727, "grad_norm": 0.01302572712302208, "learning_rate": 5.23952523952524e-06, "loss": 0.4392, "step": 20530 }, { "epoch": 26.008700128700127, "grad_norm": 0.05677972361445427, "learning_rate": 5.236665236665237e-06, "loss": 0.4977, "step": 20540 }, { "epoch": 26.008957528957527, "grad_norm": 0.18432343006134033, "learning_rate": 5.233805233805235e-06, "loss": 1.5045, "step": 20550 }, { "epoch": 26.00921492921493, "grad_norm": 0.0020320096518844366, "learning_rate": 5.230945230945231e-06, "loss": 0.0009, "step": 20560 }, { "epoch": 26.00947232947233, "grad_norm": 0.07179369032382965, "learning_rate": 5.228085228085229e-06, "loss": 1.7177, "step": 20570 }, { "epoch": 26.00972972972973, "grad_norm": 0.043726976960897446, "learning_rate": 5.225225225225226e-06, "loss": 0.9099, "step": 20580 }, { "epoch": 26.00998712998713, "grad_norm": 0.019464250653982162, "learning_rate": 5.222365222365223e-06, "loss": 1.0313, "step": 20590 }, { "epoch": 26.01024453024453, "grad_norm": 0.3122895061969757, "learning_rate": 5.21950521950522e-06, "loss": 1.1684, "step": 20600 }, { "epoch": 26.01050193050193, "grad_norm": 0.0006997102173045278, "learning_rate": 5.216645216645218e-06, "loss": 0.0393, "step": 20610 }, { "epoch": 26.01075933075933, "grad_norm": 39.65142059326172, "learning_rate": 5.213785213785214e-06, "loss": 0.5572, "step": 20620 }, { "epoch": 26.01101673101673, "grad_norm": 0.11646867543458939, "learning_rate": 5.210925210925212e-06, "loss": 0.0028, "step": 20630 }, { "epoch": 26.01127413127413, "grad_norm": 0.06890599429607391, "learning_rate": 5.208065208065208e-06, "loss": 0.0033, "step": 20640 }, { "epoch": 26.01153153153153, "grad_norm": 0.08268891274929047, "learning_rate": 5.205205205205206e-06, "loss": 0.5089, "step": 20650 }, { "epoch": 26.01178893178893, "grad_norm": 0.03577331081032753, "learning_rate": 5.202345202345203e-06, "loss": 1.2679, "step": 20660 }, { "epoch": 26.01204633204633, "grad_norm": 0.07769101113080978, "learning_rate": 5.1994851994852e-06, "loss": 0.0016, "step": 20670 }, { "epoch": 26.01230373230373, "grad_norm": 33.56912612915039, "learning_rate": 5.1966251966251966e-06, "loss": 1.2153, "step": 20680 }, { "epoch": 26.01256113256113, "grad_norm": 0.003670428181067109, "learning_rate": 5.193765193765195e-06, "loss": 0.9139, "step": 20690 }, { "epoch": 26.01281853281853, "grad_norm": 713.4417724609375, "learning_rate": 5.190905190905191e-06, "loss": 0.4005, "step": 20700 }, { "epoch": 26.013075933075932, "grad_norm": 0.0020978457760065794, "learning_rate": 5.1880451880451885e-06, "loss": 0.4827, "step": 20710 }, { "epoch": 26.013333333333332, "grad_norm": 0.0007727963966317475, "learning_rate": 5.185185185185185e-06, "loss": 0.4697, "step": 20720 }, { "epoch": 26.013590733590732, "grad_norm": 0.014731266535818577, "learning_rate": 5.182325182325183e-06, "loss": 0.4416, "step": 20730 }, { "epoch": 26.013848133848136, "grad_norm": 0.051897432655096054, "learning_rate": 5.17946517946518e-06, "loss": 0.0006, "step": 20740 }, { "epoch": 26.014105534105536, "grad_norm": 0.0008702686754986644, "learning_rate": 5.176605176605177e-06, "loss": 2.079, "step": 20750 }, { "epoch": 26.014362934362936, "grad_norm": 0.18641412258148193, "learning_rate": 5.173745173745173e-06, "loss": 0.0022, "step": 20760 }, { "epoch": 26.014620334620336, "grad_norm": 0.0020233727991580963, "learning_rate": 5.1708851708851716e-06, "loss": 0.0086, "step": 20770 }, { "epoch": 26.014877734877736, "grad_norm": 0.013192682527005672, "learning_rate": 5.168025168025168e-06, "loss": 0.9466, "step": 20780 }, { "epoch": 26.015135135135136, "grad_norm": 1.5512306690216064, "learning_rate": 5.165165165165165e-06, "loss": 0.0028, "step": 20790 }, { "epoch": 26.015392535392536, "grad_norm": 0.25116217136383057, "learning_rate": 5.162305162305163e-06, "loss": 0.0018, "step": 20800 }, { "epoch": 26.015649935649936, "grad_norm": 0.003960348200052977, "learning_rate": 5.15944515944516e-06, "loss": 0.0007, "step": 20810 }, { "epoch": 26.015907335907336, "grad_norm": 0.002162356860935688, "learning_rate": 5.1565851565851565e-06, "loss": 0.002, "step": 20820 }, { "epoch": 26.016164736164736, "grad_norm": 0.07155408710241318, "learning_rate": 5.153725153725155e-06, "loss": 0.027, "step": 20830 }, { "epoch": 26.016422136422136, "grad_norm": 0.10129252076148987, "learning_rate": 5.150865150865151e-06, "loss": 0.0012, "step": 20840 }, { "epoch": 26.016679536679536, "grad_norm": 0.004315822850912809, "learning_rate": 5.148005148005148e-06, "loss": 0.0052, "step": 20850 }, { "epoch": 26.016936936936936, "grad_norm": 0.001369721838273108, "learning_rate": 5.145145145145145e-06, "loss": 0.0012, "step": 20860 }, { "epoch": 26.017194337194336, "grad_norm": 0.006657337304204702, "learning_rate": 5.142285142285143e-06, "loss": 0.6222, "step": 20870 }, { "epoch": 26.017451737451736, "grad_norm": 35.71303939819336, "learning_rate": 5.1394251394251395e-06, "loss": 1.2577, "step": 20880 }, { "epoch": 26.017709137709137, "grad_norm": 0.0017888675210997462, "learning_rate": 5.136565136565137e-06, "loss": 0.0088, "step": 20890 }, { "epoch": 26.017966537966537, "grad_norm": 0.025619618594646454, "learning_rate": 5.133705133705133e-06, "loss": 1.1787, "step": 20900 }, { "epoch": 26.018223938223937, "grad_norm": 557.7401123046875, "learning_rate": 5.1308451308451314e-06, "loss": 0.8761, "step": 20910 }, { "epoch": 26.018481338481337, "grad_norm": 0.19152088463306427, "learning_rate": 5.127985127985129e-06, "loss": 0.001, "step": 20920 }, { "epoch": 26.01873873873874, "grad_norm": 47.91631317138672, "learning_rate": 5.125125125125125e-06, "loss": 0.6359, "step": 20930 }, { "epoch": 26.01899613899614, "grad_norm": 0.0006033479585312307, "learning_rate": 5.122265122265123e-06, "loss": 0.4703, "step": 20940 }, { "epoch": 26.01925353925354, "grad_norm": 0.07352429628372192, "learning_rate": 5.11940511940512e-06, "loss": 0.0185, "step": 20950 }, { "epoch": 26.01951093951094, "grad_norm": 0.08161481469869614, "learning_rate": 5.116545116545117e-06, "loss": 1.2037, "step": 20960 }, { "epoch": 26.01976833976834, "grad_norm": 0.12737224996089935, "learning_rate": 5.113685113685114e-06, "loss": 0.3564, "step": 20970 }, { "epoch": 26.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.7219700813293457, "eval_runtime": 13.359, "eval_samples_per_second": 3.443, "eval_steps_per_second": 3.443, "step": 20979 }, { "epoch": 27.00002574002574, "grad_norm": 0.4492325186729431, "learning_rate": 5.110825110825112e-06, "loss": 0.0428, "step": 20980 }, { "epoch": 27.00028314028314, "grad_norm": 0.10700222104787827, "learning_rate": 5.107965107965108e-06, "loss": 0.0011, "step": 20990 }, { "epoch": 27.00054054054054, "grad_norm": 0.4348405599594116, "learning_rate": 5.105105105105106e-06, "loss": 0.2933, "step": 21000 }, { "epoch": 27.00079794079794, "grad_norm": 0.0012466449989005923, "learning_rate": 5.102245102245103e-06, "loss": 0.0769, "step": 21010 }, { "epoch": 27.00105534105534, "grad_norm": 0.041840698570013046, "learning_rate": 5.0993850993851e-06, "loss": 1.6616, "step": 21020 }, { "epoch": 27.00131274131274, "grad_norm": 0.002618017140775919, "learning_rate": 5.096525096525097e-06, "loss": 0.0011, "step": 21030 }, { "epoch": 27.00157014157014, "grad_norm": 0.13534986972808838, "learning_rate": 5.093665093665094e-06, "loss": 0.0007, "step": 21040 }, { "epoch": 27.00182754182754, "grad_norm": 37.55997848510742, "learning_rate": 5.090805090805091e-06, "loss": 0.6014, "step": 21050 }, { "epoch": 27.00208494208494, "grad_norm": 348.0312194824219, "learning_rate": 5.087945087945089e-06, "loss": 0.8582, "step": 21060 }, { "epoch": 27.00234234234234, "grad_norm": 0.5563425421714783, "learning_rate": 5.085085085085085e-06, "loss": 0.726, "step": 21070 }, { "epoch": 27.00259974259974, "grad_norm": 28.312299728393555, "learning_rate": 5.082225082225083e-06, "loss": 1.2086, "step": 21080 }, { "epoch": 27.002857142857142, "grad_norm": 0.06256058067083359, "learning_rate": 5.07936507936508e-06, "loss": 0.4858, "step": 21090 }, { "epoch": 27.003114543114542, "grad_norm": 0.005022318102419376, "learning_rate": 5.076505076505077e-06, "loss": 0.0021, "step": 21100 }, { "epoch": 27.003371943371942, "grad_norm": 0.212180495262146, "learning_rate": 5.0736450736450735e-06, "loss": 1.107, "step": 21110 }, { "epoch": 27.003629343629342, "grad_norm": 0.0004522952949628234, "learning_rate": 5.070785070785072e-06, "loss": 1.1168, "step": 21120 }, { "epoch": 27.003886743886746, "grad_norm": 0.1687791496515274, "learning_rate": 5.067925067925068e-06, "loss": 0.5336, "step": 21130 }, { "epoch": 27.004144144144146, "grad_norm": 0.008968188427388668, "learning_rate": 5.0650650650650655e-06, "loss": 0.4525, "step": 21140 }, { "epoch": 27.004401544401546, "grad_norm": 0.14755140244960785, "learning_rate": 5.062205062205062e-06, "loss": 0.0031, "step": 21150 }, { "epoch": 27.004658944658946, "grad_norm": 36.4864501953125, "learning_rate": 5.05934505934506e-06, "loss": 0.4908, "step": 21160 }, { "epoch": 27.004916344916346, "grad_norm": 0.00807279348373413, "learning_rate": 5.056485056485057e-06, "loss": 0.7936, "step": 21170 }, { "epoch": 27.005173745173746, "grad_norm": 0.4851839244365692, "learning_rate": 5.053625053625054e-06, "loss": 0.6647, "step": 21180 }, { "epoch": 27.005431145431146, "grad_norm": 27.190128326416016, "learning_rate": 5.05076505076505e-06, "loss": 1.577, "step": 21190 }, { "epoch": 27.005688545688546, "grad_norm": 0.0018694992177188396, "learning_rate": 5.0479050479050485e-06, "loss": 1.2588, "step": 21200 }, { "epoch": 27.005945945945946, "grad_norm": 0.05228858068585396, "learning_rate": 5.045045045045045e-06, "loss": 0.4246, "step": 21210 }, { "epoch": 27.006203346203346, "grad_norm": 0.20007288455963135, "learning_rate": 5.042185042185042e-06, "loss": 0.01, "step": 21220 }, { "epoch": 27.006460746460746, "grad_norm": 305.2531433105469, "learning_rate": 5.03932503932504e-06, "loss": 0.2956, "step": 21230 }, { "epoch": 27.006718146718146, "grad_norm": 0.00089081289479509, "learning_rate": 5.036465036465037e-06, "loss": 0.5486, "step": 21240 }, { "epoch": 27.006975546975546, "grad_norm": 0.48225823044776917, "learning_rate": 5.0336050336050334e-06, "loss": 0.4939, "step": 21250 }, { "epoch": 27.007232947232946, "grad_norm": 0.19505609571933746, "learning_rate": 5.030745030745032e-06, "loss": 0.9111, "step": 21260 }, { "epoch": 27.007490347490346, "grad_norm": 0.0040650321170687675, "learning_rate": 5.027885027885028e-06, "loss": 0.146, "step": 21270 }, { "epoch": 27.007747747747747, "grad_norm": 0.986337423324585, "learning_rate": 5.025025025025025e-06, "loss": 0.0111, "step": 21280 }, { "epoch": 27.008005148005147, "grad_norm": 0.1955615133047104, "learning_rate": 5.022165022165023e-06, "loss": 0.3612, "step": 21290 }, { "epoch": 27.008262548262547, "grad_norm": 0.0011161335278302431, "learning_rate": 5.01930501930502e-06, "loss": 0.0009, "step": 21300 }, { "epoch": 27.008519948519947, "grad_norm": 0.02377697266638279, "learning_rate": 5.016445016445017e-06, "loss": 0.5797, "step": 21310 }, { "epoch": 27.00877734877735, "grad_norm": 447.38330078125, "learning_rate": 5.013585013585014e-06, "loss": 0.8126, "step": 21320 }, { "epoch": 27.00903474903475, "grad_norm": 0.01540659461170435, "learning_rate": 5.010725010725012e-06, "loss": 0.5582, "step": 21330 }, { "epoch": 27.00929214929215, "grad_norm": 0.0029602760914713144, "learning_rate": 5.0078650078650084e-06, "loss": 0.4369, "step": 21340 }, { "epoch": 27.00954954954955, "grad_norm": 310.1184387207031, "learning_rate": 5.005005005005006e-06, "loss": 0.5738, "step": 21350 }, { "epoch": 27.00980694980695, "grad_norm": 0.08122654259204865, "learning_rate": 5.002145002145002e-06, "loss": 0.0019, "step": 21360 }, { "epoch": 27.01006435006435, "grad_norm": 0.0014425546396523714, "learning_rate": 4.9992849992849995e-06, "loss": 1.5169, "step": 21370 }, { "epoch": 27.01032175032175, "grad_norm": 0.01068554725497961, "learning_rate": 4.996424996424997e-06, "loss": 0.0114, "step": 21380 }, { "epoch": 27.01057915057915, "grad_norm": 0.001254943199455738, "learning_rate": 4.993564993564994e-06, "loss": 1.7242, "step": 21390 }, { "epoch": 27.01083655083655, "grad_norm": 0.010038587264716625, "learning_rate": 4.990704990704991e-06, "loss": 0.0024, "step": 21400 }, { "epoch": 27.01109395109395, "grad_norm": 0.004530706908553839, "learning_rate": 4.987844987844988e-06, "loss": 0.508, "step": 21410 }, { "epoch": 27.01135135135135, "grad_norm": 0.10551851242780685, "learning_rate": 4.984984984984985e-06, "loss": 0.5313, "step": 21420 }, { "epoch": 27.01160875160875, "grad_norm": 0.11509599536657333, "learning_rate": 4.982124982124983e-06, "loss": 0.3598, "step": 21430 }, { "epoch": 27.01186615186615, "grad_norm": 0.0032966965809464455, "learning_rate": 4.979264979264979e-06, "loss": 0.6849, "step": 21440 }, { "epoch": 27.01212355212355, "grad_norm": 0.0011607161723077297, "learning_rate": 4.976404976404976e-06, "loss": 0.2958, "step": 21450 }, { "epoch": 27.01238095238095, "grad_norm": 0.11378136277198792, "learning_rate": 4.973544973544974e-06, "loss": 0.0032, "step": 21460 }, { "epoch": 27.01263835263835, "grad_norm": 0.0004367120563983917, "learning_rate": 4.970684970684971e-06, "loss": 0.0012, "step": 21470 }, { "epoch": 27.01289575289575, "grad_norm": 0.003959926310926676, "learning_rate": 4.967824967824968e-06, "loss": 1.2426, "step": 21480 }, { "epoch": 27.01315315315315, "grad_norm": 0.3200651705265045, "learning_rate": 4.964964964964966e-06, "loss": 0.8776, "step": 21490 }, { "epoch": 27.013410553410555, "grad_norm": 28.075603485107422, "learning_rate": 4.962104962104963e-06, "loss": 0.4647, "step": 21500 }, { "epoch": 27.013667953667955, "grad_norm": 20.377717971801758, "learning_rate": 4.95924495924496e-06, "loss": 1.3318, "step": 21510 }, { "epoch": 27.013925353925355, "grad_norm": 0.09059357643127441, "learning_rate": 4.956384956384957e-06, "loss": 0.0026, "step": 21520 }, { "epoch": 27.014182754182755, "grad_norm": 0.9779305458068848, "learning_rate": 4.953524953524954e-06, "loss": 0.0012, "step": 21530 }, { "epoch": 27.014440154440155, "grad_norm": 0.045352060347795486, "learning_rate": 4.950664950664951e-06, "loss": 0.6597, "step": 21540 }, { "epoch": 27.014697554697555, "grad_norm": 0.03662479668855667, "learning_rate": 4.947804947804949e-06, "loss": 0.9173, "step": 21550 }, { "epoch": 27.014954954954955, "grad_norm": 0.10112226754426956, "learning_rate": 4.944944944944945e-06, "loss": 0.6186, "step": 21560 }, { "epoch": 27.015212355212356, "grad_norm": 0.03256876394152641, "learning_rate": 4.9420849420849425e-06, "loss": 1.1788, "step": 21570 }, { "epoch": 27.015469755469756, "grad_norm": 0.2824130654335022, "learning_rate": 4.93922493922494e-06, "loss": 1.1216, "step": 21580 }, { "epoch": 27.015727155727156, "grad_norm": 0.0007668504840694368, "learning_rate": 4.936364936364937e-06, "loss": 0.4967, "step": 21590 }, { "epoch": 27.015984555984556, "grad_norm": 0.17322510480880737, "learning_rate": 4.9335049335049336e-06, "loss": 0.6486, "step": 21600 }, { "epoch": 27.016241956241956, "grad_norm": 0.14373072981834412, "learning_rate": 4.930644930644931e-06, "loss": 1.4562, "step": 21610 }, { "epoch": 27.016499356499356, "grad_norm": 0.002229807898402214, "learning_rate": 4.927784927784928e-06, "loss": 0.5973, "step": 21620 }, { "epoch": 27.016756756756756, "grad_norm": 1.7468568086624146, "learning_rate": 4.9249249249249255e-06, "loss": 0.0018, "step": 21630 }, { "epoch": 27.017014157014156, "grad_norm": 1.5412856340408325, "learning_rate": 4.922064922064923e-06, "loss": 0.5743, "step": 21640 }, { "epoch": 27.017271557271556, "grad_norm": 9.415769577026367, "learning_rate": 4.919204919204919e-06, "loss": 0.4402, "step": 21650 }, { "epoch": 27.017528957528956, "grad_norm": 0.00046175954048521817, "learning_rate": 4.916344916344917e-06, "loss": 0.5361, "step": 21660 }, { "epoch": 27.017786357786356, "grad_norm": 6.175512313842773, "learning_rate": 4.913484913484914e-06, "loss": 0.007, "step": 21670 }, { "epoch": 27.01804375804376, "grad_norm": 0.09624309092760086, "learning_rate": 4.910624910624911e-06, "loss": 0.473, "step": 21680 }, { "epoch": 27.01830115830116, "grad_norm": 0.0006763459532521665, "learning_rate": 4.907764907764908e-06, "loss": 0.0005, "step": 21690 }, { "epoch": 27.01855855855856, "grad_norm": 0.00037406483897939324, "learning_rate": 4.904904904904905e-06, "loss": 0.8364, "step": 21700 }, { "epoch": 27.01881595881596, "grad_norm": 0.23709970712661743, "learning_rate": 4.902044902044902e-06, "loss": 0.5329, "step": 21710 }, { "epoch": 27.01907335907336, "grad_norm": 0.0009810479823499918, "learning_rate": 4.8991848991849e-06, "loss": 1.3556, "step": 21720 }, { "epoch": 27.01933075933076, "grad_norm": 0.02598871849477291, "learning_rate": 4.896324896324897e-06, "loss": 0.5565, "step": 21730 }, { "epoch": 27.01958815958816, "grad_norm": 112.14535522460938, "learning_rate": 4.8934648934648935e-06, "loss": 1.6131, "step": 21740 }, { "epoch": 27.01984555984556, "grad_norm": 0.008748788386583328, "learning_rate": 4.890604890604891e-06, "loss": 0.0017, "step": 21750 }, { "epoch": 27.02, "eval_accuracy": 0.782608695652174, "eval_loss": 1.2440866231918335, "eval_runtime": 14.5083, "eval_samples_per_second": 3.171, "eval_steps_per_second": 3.171, "step": 21756 }, { "epoch": 28.00010296010296, "grad_norm": 0.0006102527840994298, "learning_rate": 4.887744887744888e-06, "loss": 0.4715, "step": 21760 }, { "epoch": 28.00036036036036, "grad_norm": 0.0769384428858757, "learning_rate": 4.884884884884885e-06, "loss": 0.002, "step": 21770 }, { "epoch": 28.00061776061776, "grad_norm": 0.0811397060751915, "learning_rate": 4.882024882024882e-06, "loss": 0.4448, "step": 21780 }, { "epoch": 28.00087516087516, "grad_norm": 0.002516659675166011, "learning_rate": 4.879164879164879e-06, "loss": 1.1136, "step": 21790 }, { "epoch": 28.00113256113256, "grad_norm": 0.06139345467090607, "learning_rate": 4.8763048763048765e-06, "loss": 0.0014, "step": 21800 }, { "epoch": 28.00138996138996, "grad_norm": 35.70652770996094, "learning_rate": 4.873444873444874e-06, "loss": 1.0875, "step": 21810 }, { "epoch": 28.00164736164736, "grad_norm": 128.56268310546875, "learning_rate": 4.870584870584871e-06, "loss": 0.4463, "step": 21820 }, { "epoch": 28.00190476190476, "grad_norm": 0.00042608179501257837, "learning_rate": 4.867724867724868e-06, "loss": 0.4507, "step": 21830 }, { "epoch": 28.00216216216216, "grad_norm": 0.05981049686670303, "learning_rate": 4.864864864864866e-06, "loss": 1.251, "step": 21840 }, { "epoch": 28.00241956241956, "grad_norm": 0.029568631201982498, "learning_rate": 4.862004862004862e-06, "loss": 0.498, "step": 21850 }, { "epoch": 28.00267696267696, "grad_norm": 41.19403839111328, "learning_rate": 4.8591448591448596e-06, "loss": 0.8085, "step": 21860 }, { "epoch": 28.00293436293436, "grad_norm": 0.035941943526268005, "learning_rate": 4.856284856284857e-06, "loss": 0.0015, "step": 21870 }, { "epoch": 28.00319176319176, "grad_norm": 0.01107192412018776, "learning_rate": 4.853424853424854e-06, "loss": 0.8204, "step": 21880 }, { "epoch": 28.003449163449165, "grad_norm": 0.0007803246262483299, "learning_rate": 4.8505648505648515e-06, "loss": 0.0033, "step": 21890 }, { "epoch": 28.003706563706565, "grad_norm": 39.610023498535156, "learning_rate": 4.847704847704848e-06, "loss": 0.5976, "step": 21900 }, { "epoch": 28.003963963963965, "grad_norm": 0.4089675843715668, "learning_rate": 4.844844844844845e-06, "loss": 0.0006, "step": 21910 }, { "epoch": 28.004221364221365, "grad_norm": 0.0013353163376450539, "learning_rate": 4.841984841984843e-06, "loss": 0.0025, "step": 21920 }, { "epoch": 28.004478764478765, "grad_norm": 0.0005668631056323647, "learning_rate": 4.83912483912484e-06, "loss": 1.0795, "step": 21930 }, { "epoch": 28.004736164736165, "grad_norm": 0.0004358472360763699, "learning_rate": 4.836264836264836e-06, "loss": 0.8763, "step": 21940 }, { "epoch": 28.004993564993566, "grad_norm": 0.34297168254852295, "learning_rate": 4.833404833404834e-06, "loss": 0.5577, "step": 21950 }, { "epoch": 28.005250965250966, "grad_norm": 0.10034578293561935, "learning_rate": 4.830544830544831e-06, "loss": 2.3757, "step": 21960 }, { "epoch": 28.005508365508366, "grad_norm": 0.12659744918346405, "learning_rate": 4.827684827684828e-06, "loss": 0.5404, "step": 21970 }, { "epoch": 28.005765765765766, "grad_norm": 0.06474514305591583, "learning_rate": 4.824824824824826e-06, "loss": 0.4778, "step": 21980 }, { "epoch": 28.006023166023166, "grad_norm": 0.03867054730653763, "learning_rate": 4.821964821964822e-06, "loss": 0.5697, "step": 21990 }, { "epoch": 28.006280566280566, "grad_norm": 0.003705323673784733, "learning_rate": 4.8191048191048194e-06, "loss": 0.53, "step": 22000 }, { "epoch": 28.006537966537966, "grad_norm": 0.014577334746718407, "learning_rate": 4.816244816244817e-06, "loss": 0.0034, "step": 22010 }, { "epoch": 28.006795366795366, "grad_norm": 0.07273665815591812, "learning_rate": 4.813384813384814e-06, "loss": 0.5665, "step": 22020 }, { "epoch": 28.007052767052766, "grad_norm": 0.14401213824748993, "learning_rate": 4.8105248105248105e-06, "loss": 0.0013, "step": 22030 }, { "epoch": 28.007310167310166, "grad_norm": 0.0013334174873307347, "learning_rate": 4.807664807664808e-06, "loss": 0.2891, "step": 22040 }, { "epoch": 28.007567567567566, "grad_norm": 0.02023886889219284, "learning_rate": 4.804804804804805e-06, "loss": 0.0011, "step": 22050 }, { "epoch": 28.007824967824966, "grad_norm": 0.14436078071594238, "learning_rate": 4.8019448019448025e-06, "loss": 0.0016, "step": 22060 }, { "epoch": 28.00808236808237, "grad_norm": 0.03297024220228195, "learning_rate": 4.7990847990848e-06, "loss": 0.5523, "step": 22070 }, { "epoch": 28.00833976833977, "grad_norm": 507.6208801269531, "learning_rate": 4.796224796224796e-06, "loss": 0.4174, "step": 22080 }, { "epoch": 28.00859716859717, "grad_norm": 0.016587285324931145, "learning_rate": 4.793364793364794e-06, "loss": 0.6593, "step": 22090 }, { "epoch": 28.00885456885457, "grad_norm": 0.0026779465842992067, "learning_rate": 4.790504790504791e-06, "loss": 0.6397, "step": 22100 }, { "epoch": 28.00911196911197, "grad_norm": 0.14133141934871674, "learning_rate": 4.787644787644788e-06, "loss": 0.1802, "step": 22110 }, { "epoch": 28.00936936936937, "grad_norm": 0.16812646389007568, "learning_rate": 4.784784784784785e-06, "loss": 0.4667, "step": 22120 }, { "epoch": 28.00962676962677, "grad_norm": 0.08822160959243774, "learning_rate": 4.781924781924782e-06, "loss": 1.18, "step": 22130 }, { "epoch": 28.00988416988417, "grad_norm": 0.17704392969608307, "learning_rate": 4.779064779064779e-06, "loss": 0.0016, "step": 22140 }, { "epoch": 28.01014157014157, "grad_norm": 0.4867551028728485, "learning_rate": 4.776204776204777e-06, "loss": 1.0967, "step": 22150 }, { "epoch": 28.01039897039897, "grad_norm": 0.54176926612854, "learning_rate": 4.773344773344773e-06, "loss": 0.6071, "step": 22160 }, { "epoch": 28.01065637065637, "grad_norm": 0.0038197352550923824, "learning_rate": 4.7704847704847704e-06, "loss": 0.442, "step": 22170 }, { "epoch": 28.01091377091377, "grad_norm": 0.08110248297452927, "learning_rate": 4.767624767624768e-06, "loss": 1.2637, "step": 22180 }, { "epoch": 28.01117117117117, "grad_norm": 0.12915049493312836, "learning_rate": 4.764764764764765e-06, "loss": 0.3288, "step": 22190 }, { "epoch": 28.01142857142857, "grad_norm": 50.06999588012695, "learning_rate": 4.761904761904762e-06, "loss": 0.3762, "step": 22200 }, { "epoch": 28.01168597168597, "grad_norm": 0.0005307564279064536, "learning_rate": 4.75904475904476e-06, "loss": 0.1265, "step": 22210 }, { "epoch": 28.01194337194337, "grad_norm": 0.03496575355529785, "learning_rate": 4.756184756184757e-06, "loss": 0.0023, "step": 22220 }, { "epoch": 28.01220077220077, "grad_norm": 0.0009349982719868422, "learning_rate": 4.753324753324754e-06, "loss": 0.0194, "step": 22230 }, { "epoch": 28.01245817245817, "grad_norm": 0.0008056398364715278, "learning_rate": 4.750464750464751e-06, "loss": 0.0018, "step": 22240 }, { "epoch": 28.01271557271557, "grad_norm": 0.06082630529999733, "learning_rate": 4.747604747604748e-06, "loss": 0.727, "step": 22250 }, { "epoch": 28.012972972972975, "grad_norm": 0.11555928736925125, "learning_rate": 4.7447447447447454e-06, "loss": 0.0015, "step": 22260 }, { "epoch": 28.013230373230375, "grad_norm": 0.012335697188973427, "learning_rate": 4.741884741884743e-06, "loss": 0.3981, "step": 22270 }, { "epoch": 28.013487773487775, "grad_norm": 0.00791518110781908, "learning_rate": 4.739024739024739e-06, "loss": 0.5654, "step": 22280 }, { "epoch": 28.013745173745175, "grad_norm": 708.4443359375, "learning_rate": 4.7361647361647365e-06, "loss": 0.4047, "step": 22290 }, { "epoch": 28.014002574002575, "grad_norm": 0.008482186123728752, "learning_rate": 4.733304733304734e-06, "loss": 0.0004, "step": 22300 }, { "epoch": 28.014259974259975, "grad_norm": 0.08238717913627625, "learning_rate": 4.730444730444731e-06, "loss": 0.0009, "step": 22310 }, { "epoch": 28.014517374517375, "grad_norm": 0.00034206442069262266, "learning_rate": 4.7275847275847285e-06, "loss": 0.196, "step": 22320 }, { "epoch": 28.014774774774775, "grad_norm": 0.02654080092906952, "learning_rate": 4.724724724724725e-06, "loss": 0.0011, "step": 22330 }, { "epoch": 28.015032175032175, "grad_norm": 42.99850845336914, "learning_rate": 4.721864721864722e-06, "loss": 1.2377, "step": 22340 }, { "epoch": 28.015289575289575, "grad_norm": 0.653175413608551, "learning_rate": 4.71900471900472e-06, "loss": 1.3387, "step": 22350 }, { "epoch": 28.015546975546975, "grad_norm": 0.40929099917411804, "learning_rate": 4.716144716144717e-06, "loss": 0.001, "step": 22360 }, { "epoch": 28.015804375804375, "grad_norm": 0.17331182956695557, "learning_rate": 4.713284713284713e-06, "loss": 0.0152, "step": 22370 }, { "epoch": 28.016061776061775, "grad_norm": 0.05048976093530655, "learning_rate": 4.710424710424711e-06, "loss": 1.2726, "step": 22380 }, { "epoch": 28.016319176319175, "grad_norm": 0.24569039046764374, "learning_rate": 4.707564707564708e-06, "loss": 0.6923, "step": 22390 }, { "epoch": 28.016576576576576, "grad_norm": 0.011856620199978352, "learning_rate": 4.704704704704705e-06, "loss": 0.4904, "step": 22400 }, { "epoch": 28.016833976833976, "grad_norm": 0.0016780027654021978, "learning_rate": 4.701844701844703e-06, "loss": 0.0008, "step": 22410 }, { "epoch": 28.017091377091376, "grad_norm": 0.0035610671620815992, "learning_rate": 4.698984698984699e-06, "loss": 0.0015, "step": 22420 }, { "epoch": 28.017348777348776, "grad_norm": 0.0016747131012380123, "learning_rate": 4.696124696124696e-06, "loss": 0.0011, "step": 22430 }, { "epoch": 28.01760617760618, "grad_norm": 0.020335784181952477, "learning_rate": 4.693264693264694e-06, "loss": 1.5259, "step": 22440 }, { "epoch": 28.01786357786358, "grad_norm": 0.004793055355548859, "learning_rate": 4.690404690404691e-06, "loss": 0.9107, "step": 22450 }, { "epoch": 28.01812097812098, "grad_norm": 3.8998618125915527, "learning_rate": 4.6875446875446875e-06, "loss": 0.9888, "step": 22460 }, { "epoch": 28.01837837837838, "grad_norm": 0.003152688266709447, "learning_rate": 4.684684684684685e-06, "loss": 0.8276, "step": 22470 }, { "epoch": 28.01863577863578, "grad_norm": 0.07769756019115448, "learning_rate": 4.681824681824682e-06, "loss": 0.0053, "step": 22480 }, { "epoch": 28.01889317889318, "grad_norm": 0.0015963425394147635, "learning_rate": 4.6789646789646795e-06, "loss": 0.4891, "step": 22490 }, { "epoch": 28.01915057915058, "grad_norm": 0.0019973055459558964, "learning_rate": 4.676104676104676e-06, "loss": 0.0014, "step": 22500 }, { "epoch": 28.01940797940798, "grad_norm": 0.18019846081733704, "learning_rate": 4.673244673244673e-06, "loss": 1.7923, "step": 22510 }, { "epoch": 28.01966537966538, "grad_norm": 0.0022248122841119766, "learning_rate": 4.6703846703846706e-06, "loss": 0.7992, "step": 22520 }, { "epoch": 28.01992277992278, "grad_norm": 171.22250366210938, "learning_rate": 4.667524667524668e-06, "loss": 1.1658, "step": 22530 }, { "epoch": 28.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.6606096625328064, "eval_runtime": 13.404, "eval_samples_per_second": 3.432, "eval_steps_per_second": 3.432, "step": 22533 }, { "epoch": 29.00018018018018, "grad_norm": 0.1187930479645729, "learning_rate": 4.664664664664665e-06, "loss": 0.0024, "step": 22540 }, { "epoch": 29.00043758043758, "grad_norm": 0.03540468215942383, "learning_rate": 4.661804661804662e-06, "loss": 0.0007, "step": 22550 }, { "epoch": 29.00069498069498, "grad_norm": 0.008411556482315063, "learning_rate": 4.658944658944659e-06, "loss": 0.1653, "step": 22560 }, { "epoch": 29.00095238095238, "grad_norm": 208.8349151611328, "learning_rate": 4.656084656084656e-06, "loss": 0.1932, "step": 22570 }, { "epoch": 29.00120978120978, "grad_norm": 0.5143762826919556, "learning_rate": 4.653224653224654e-06, "loss": 0.0046, "step": 22580 }, { "epoch": 29.00146718146718, "grad_norm": 0.7926735281944275, "learning_rate": 4.650364650364651e-06, "loss": 0.0034, "step": 22590 }, { "epoch": 29.00172458172458, "grad_norm": 0.000883811735548079, "learning_rate": 4.647504647504648e-06, "loss": 0.3767, "step": 22600 }, { "epoch": 29.00198198198198, "grad_norm": 686.049072265625, "learning_rate": 4.6446446446446456e-06, "loss": 0.4603, "step": 22610 }, { "epoch": 29.00223938223938, "grad_norm": 0.14210084080696106, "learning_rate": 4.641784641784642e-06, "loss": 0.0021, "step": 22620 }, { "epoch": 29.00249678249678, "grad_norm": 0.0504668764770031, "learning_rate": 4.638924638924639e-06, "loss": 0.4215, "step": 22630 }, { "epoch": 29.00275418275418, "grad_norm": 0.03664263337850571, "learning_rate": 4.636064636064637e-06, "loss": 0.0006, "step": 22640 }, { "epoch": 29.003011583011585, "grad_norm": 464.8655700683594, "learning_rate": 4.633204633204634e-06, "loss": 0.0139, "step": 22650 }, { "epoch": 29.003268983268985, "grad_norm": 0.00034964046790264547, "learning_rate": 4.630344630344631e-06, "loss": 1.1042, "step": 22660 }, { "epoch": 29.003526383526385, "grad_norm": 0.0003824385639745742, "learning_rate": 4.627484627484628e-06, "loss": 0.0007, "step": 22670 }, { "epoch": 29.003783783783785, "grad_norm": 0.0016709021292626858, "learning_rate": 4.624624624624625e-06, "loss": 1.7071, "step": 22680 }, { "epoch": 29.004041184041185, "grad_norm": 0.07814698666334152, "learning_rate": 4.621764621764622e-06, "loss": 0.5468, "step": 22690 }, { "epoch": 29.004298584298585, "grad_norm": 0.06099068745970726, "learning_rate": 4.61890461890462e-06, "loss": 0.7175, "step": 22700 }, { "epoch": 29.004555984555985, "grad_norm": 0.02045220322906971, "learning_rate": 4.616044616044616e-06, "loss": 0.5742, "step": 22710 }, { "epoch": 29.004813384813385, "grad_norm": 0.029201224446296692, "learning_rate": 4.6131846131846135e-06, "loss": 0.0006, "step": 22720 }, { "epoch": 29.005070785070785, "grad_norm": 6.332117557525635, "learning_rate": 4.610324610324611e-06, "loss": 0.006, "step": 22730 }, { "epoch": 29.005328185328185, "grad_norm": 0.12955205142498016, "learning_rate": 4.607464607464608e-06, "loss": 1.337, "step": 22740 }, { "epoch": 29.005585585585585, "grad_norm": 208.47537231445312, "learning_rate": 4.604604604604605e-06, "loss": 1.1339, "step": 22750 }, { "epoch": 29.005842985842985, "grad_norm": 0.03571253642439842, "learning_rate": 4.601744601744602e-06, "loss": 0.0006, "step": 22760 }, { "epoch": 29.006100386100385, "grad_norm": 0.018318068236112595, "learning_rate": 4.598884598884599e-06, "loss": 0.0017, "step": 22770 }, { "epoch": 29.006357786357785, "grad_norm": 0.0016119711799547076, "learning_rate": 4.5960245960245966e-06, "loss": 0.272, "step": 22780 }, { "epoch": 29.006615186615186, "grad_norm": 0.030253881588578224, "learning_rate": 4.593164593164594e-06, "loss": 0.6145, "step": 22790 }, { "epoch": 29.006872586872586, "grad_norm": 1.5183641910552979, "learning_rate": 4.59030459030459e-06, "loss": 0.0013, "step": 22800 }, { "epoch": 29.007129987129986, "grad_norm": 0.010023567825555801, "learning_rate": 4.587444587444588e-06, "loss": 0.5396, "step": 22810 }, { "epoch": 29.007387387387386, "grad_norm": 1.5276458263397217, "learning_rate": 4.584584584584585e-06, "loss": 0.0042, "step": 22820 }, { "epoch": 29.00764478764479, "grad_norm": 0.007271212060004473, "learning_rate": 4.581724581724582e-06, "loss": 0.6234, "step": 22830 }, { "epoch": 29.00790218790219, "grad_norm": 0.039576660841703415, "learning_rate": 4.578864578864579e-06, "loss": 0.0006, "step": 22840 }, { "epoch": 29.00815958815959, "grad_norm": 0.0008184545440599322, "learning_rate": 4.576004576004576e-06, "loss": 0.3065, "step": 22850 }, { "epoch": 29.00841698841699, "grad_norm": 0.012131177820265293, "learning_rate": 4.573144573144573e-06, "loss": 1.2567, "step": 22860 }, { "epoch": 29.00867438867439, "grad_norm": 0.06175015866756439, "learning_rate": 4.570284570284571e-06, "loss": 0.6189, "step": 22870 }, { "epoch": 29.00893178893179, "grad_norm": 0.03581232950091362, "learning_rate": 4.567424567424568e-06, "loss": 0.0004, "step": 22880 }, { "epoch": 29.00918918918919, "grad_norm": 54.22113037109375, "learning_rate": 4.5645645645645645e-06, "loss": 1.1175, "step": 22890 }, { "epoch": 29.00944658944659, "grad_norm": 24.692176818847656, "learning_rate": 4.561704561704562e-06, "loss": 0.8245, "step": 22900 }, { "epoch": 29.00970398970399, "grad_norm": 0.0129568325355649, "learning_rate": 4.558844558844559e-06, "loss": 0.0181, "step": 22910 }, { "epoch": 29.00996138996139, "grad_norm": 0.22810792922973633, "learning_rate": 4.5559845559845564e-06, "loss": 0.4374, "step": 22920 }, { "epoch": 29.01021879021879, "grad_norm": 0.030504779890179634, "learning_rate": 4.553124553124553e-06, "loss": 0.0009, "step": 22930 }, { "epoch": 29.01047619047619, "grad_norm": 0.010466916486620903, "learning_rate": 4.55026455026455e-06, "loss": 1.5878, "step": 22940 }, { "epoch": 29.01073359073359, "grad_norm": 0.005396629683673382, "learning_rate": 4.5474045474045475e-06, "loss": 0.0026, "step": 22950 }, { "epoch": 29.01099099099099, "grad_norm": 0.20146198570728302, "learning_rate": 4.544544544544545e-06, "loss": 0.4596, "step": 22960 }, { "epoch": 29.01124839124839, "grad_norm": 139.57632446289062, "learning_rate": 4.541684541684542e-06, "loss": 1.0697, "step": 22970 }, { "epoch": 29.01150579150579, "grad_norm": 85.71121215820312, "learning_rate": 4.5388245388245395e-06, "loss": 1.3387, "step": 22980 }, { "epoch": 29.01176319176319, "grad_norm": 0.12364278733730316, "learning_rate": 4.535964535964537e-06, "loss": 0.3547, "step": 22990 }, { "epoch": 29.01202059202059, "grad_norm": 0.0017210529185831547, "learning_rate": 4.533104533104533e-06, "loss": 0.0014, "step": 23000 }, { "epoch": 29.01227799227799, "grad_norm": 32.765933990478516, "learning_rate": 4.530244530244531e-06, "loss": 0.0297, "step": 23010 }, { "epoch": 29.012535392535394, "grad_norm": 38.97309875488281, "learning_rate": 4.527384527384528e-06, "loss": 0.5779, "step": 23020 }, { "epoch": 29.012792792792794, "grad_norm": 0.22549696266651154, "learning_rate": 4.524524524524525e-06, "loss": 0.0118, "step": 23030 }, { "epoch": 29.013050193050194, "grad_norm": 162.48883056640625, "learning_rate": 4.5216645216645225e-06, "loss": 1.2432, "step": 23040 }, { "epoch": 29.013307593307594, "grad_norm": 0.01860671490430832, "learning_rate": 4.518804518804519e-06, "loss": 1.0974, "step": 23050 }, { "epoch": 29.013564993564994, "grad_norm": 0.13944114744663239, "learning_rate": 4.515944515944516e-06, "loss": 0.0012, "step": 23060 }, { "epoch": 29.013822393822394, "grad_norm": 0.0077326660975813866, "learning_rate": 4.513084513084514e-06, "loss": 0.4396, "step": 23070 }, { "epoch": 29.014079794079795, "grad_norm": 0.0013942186487838626, "learning_rate": 4.510224510224511e-06, "loss": 0.0016, "step": 23080 }, { "epoch": 29.014337194337195, "grad_norm": 0.1694478839635849, "learning_rate": 4.5073645073645074e-06, "loss": 0.9909, "step": 23090 }, { "epoch": 29.014594594594595, "grad_norm": 156.31834411621094, "learning_rate": 4.504504504504505e-06, "loss": 0.0849, "step": 23100 }, { "epoch": 29.014851994851995, "grad_norm": 0.18722569942474365, "learning_rate": 4.501644501644502e-06, "loss": 0.4487, "step": 23110 }, { "epoch": 29.015109395109395, "grad_norm": 0.0004421714984346181, "learning_rate": 4.498784498784499e-06, "loss": 0.5625, "step": 23120 }, { "epoch": 29.015366795366795, "grad_norm": 279.7695617675781, "learning_rate": 4.495924495924497e-06, "loss": 0.4133, "step": 23130 }, { "epoch": 29.015624195624195, "grad_norm": 0.001477337907999754, "learning_rate": 4.493064493064493e-06, "loss": 2.0714, "step": 23140 }, { "epoch": 29.015881595881595, "grad_norm": 0.051959581673145294, "learning_rate": 4.4902044902044905e-06, "loss": 0.6586, "step": 23150 }, { "epoch": 29.016138996138995, "grad_norm": 59.180824279785156, "learning_rate": 4.487344487344488e-06, "loss": 1.1272, "step": 23160 }, { "epoch": 29.016396396396395, "grad_norm": 0.1820637434720993, "learning_rate": 4.484484484484485e-06, "loss": 0.5311, "step": 23170 }, { "epoch": 29.016653796653795, "grad_norm": 0.001803033985197544, "learning_rate": 4.481624481624482e-06, "loss": 2.0791, "step": 23180 }, { "epoch": 29.016911196911195, "grad_norm": 0.20301368832588196, "learning_rate": 4.478764478764479e-06, "loss": 0.5542, "step": 23190 }, { "epoch": 29.0171685971686, "grad_norm": 0.002093987073749304, "learning_rate": 4.475904475904476e-06, "loss": 0.0012, "step": 23200 }, { "epoch": 29.017425997426, "grad_norm": 1884.36181640625, "learning_rate": 4.4730444730444735e-06, "loss": 0.5681, "step": 23210 }, { "epoch": 29.0176833976834, "grad_norm": 0.0037956798914819956, "learning_rate": 4.470184470184471e-06, "loss": 0.0032, "step": 23220 }, { "epoch": 29.0179407979408, "grad_norm": 2.3361432552337646, "learning_rate": 4.467324467324467e-06, "loss": 0.4418, "step": 23230 }, { "epoch": 29.0181981981982, "grad_norm": 0.03866471350193024, "learning_rate": 4.464464464464465e-06, "loss": 1.0058, "step": 23240 }, { "epoch": 29.0184555984556, "grad_norm": 0.10158059746026993, "learning_rate": 4.461604461604462e-06, "loss": 0.367, "step": 23250 }, { "epoch": 29.018712998713, "grad_norm": 0.29946258664131165, "learning_rate": 4.458744458744459e-06, "loss": 0.5789, "step": 23260 }, { "epoch": 29.0189703989704, "grad_norm": 0.004636166617274284, "learning_rate": 4.455884455884456e-06, "loss": 0.6314, "step": 23270 }, { "epoch": 29.0192277992278, "grad_norm": 0.025164945051074028, "learning_rate": 4.453024453024453e-06, "loss": 0.0007, "step": 23280 }, { "epoch": 29.0194851994852, "grad_norm": 0.3000508248806, "learning_rate": 4.45016445016445e-06, "loss": 0.9031, "step": 23290 }, { "epoch": 29.0197425997426, "grad_norm": 0.002804351970553398, "learning_rate": 4.447304447304448e-06, "loss": 0.4417, "step": 23300 }, { "epoch": 29.02, "grad_norm": 0.04149908944964409, "learning_rate": 4.444444444444444e-06, "loss": 0.6756, "step": 23310 }, { "epoch": 29.02, "eval_accuracy": 0.717391304347826, "eval_loss": 1.6570743322372437, "eval_runtime": 13.3509, "eval_samples_per_second": 3.445, "eval_steps_per_second": 3.445, "step": 23310 }, { "epoch": 30.0002574002574, "grad_norm": 0.21637240052223206, "learning_rate": 4.4415844415844415e-06, "loss": 1.6747, "step": 23320 }, { "epoch": 30.0005148005148, "grad_norm": 0.8607198596000671, "learning_rate": 4.438724438724439e-06, "loss": 0.6147, "step": 23330 }, { "epoch": 30.0007722007722, "grad_norm": 0.0019639742095023394, "learning_rate": 4.435864435864436e-06, "loss": 0.0006, "step": 23340 }, { "epoch": 30.0010296010296, "grad_norm": 1.0920131206512451, "learning_rate": 4.433004433004433e-06, "loss": 1.0313, "step": 23350 }, { "epoch": 30.001287001287, "grad_norm": 0.0037155456375330687, "learning_rate": 4.430144430144431e-06, "loss": 0.5143, "step": 23360 }, { "epoch": 30.0015444015444, "grad_norm": 0.0018675295868888497, "learning_rate": 4.427284427284428e-06, "loss": 0.0016, "step": 23370 }, { "epoch": 30.0018018018018, "grad_norm": 0.008687763474881649, "learning_rate": 4.424424424424425e-06, "loss": 0.0012, "step": 23380 }, { "epoch": 30.0020592020592, "grad_norm": 0.03639661893248558, "learning_rate": 4.421564421564422e-06, "loss": 1.2112, "step": 23390 }, { "epoch": 30.0023166023166, "grad_norm": 0.12909619510173798, "learning_rate": 4.418704418704419e-06, "loss": 0.586, "step": 23400 }, { "epoch": 30.002574002574004, "grad_norm": 0.3514103591442108, "learning_rate": 4.4158444158444165e-06, "loss": 0.4516, "step": 23410 }, { "epoch": 30.002831402831404, "grad_norm": 37.94804000854492, "learning_rate": 4.412984412984414e-06, "loss": 0.2859, "step": 23420 }, { "epoch": 30.003088803088804, "grad_norm": 0.00035340170143172145, "learning_rate": 4.41012441012441e-06, "loss": 0.1207, "step": 23430 }, { "epoch": 30.003346203346204, "grad_norm": 0.00048636618885211647, "learning_rate": 4.4072644072644076e-06, "loss": 0.5357, "step": 23440 }, { "epoch": 30.003603603603604, "grad_norm": 0.00040685260319150984, "learning_rate": 4.404404404404405e-06, "loss": 0.0143, "step": 23450 }, { "epoch": 30.003861003861005, "grad_norm": 0.0009621041244827211, "learning_rate": 4.401544401544402e-06, "loss": 0.7266, "step": 23460 }, { "epoch": 30.004118404118405, "grad_norm": 0.04314970225095749, "learning_rate": 4.3986843986843995e-06, "loss": 1.1009, "step": 23470 }, { "epoch": 30.004375804375805, "grad_norm": 0.007667004596441984, "learning_rate": 4.395824395824396e-06, "loss": 0.0003, "step": 23480 }, { "epoch": 30.004633204633205, "grad_norm": 0.038383617997169495, "learning_rate": 4.392964392964393e-06, "loss": 0.002, "step": 23490 }, { "epoch": 30.004890604890605, "grad_norm": 0.03447192534804344, "learning_rate": 4.390104390104391e-06, "loss": 0.6983, "step": 23500 }, { "epoch": 30.005148005148005, "grad_norm": 0.03259385749697685, "learning_rate": 4.387244387244388e-06, "loss": 0.4584, "step": 23510 }, { "epoch": 30.005405405405405, "grad_norm": 0.005583514925092459, "learning_rate": 4.384384384384384e-06, "loss": 0.0099, "step": 23520 }, { "epoch": 30.005662805662805, "grad_norm": 0.01332216989248991, "learning_rate": 4.381524381524382e-06, "loss": 1.1229, "step": 23530 }, { "epoch": 30.005920205920205, "grad_norm": 0.0005782814114354551, "learning_rate": 4.378664378664379e-06, "loss": 0.5327, "step": 23540 }, { "epoch": 30.006177606177605, "grad_norm": 0.10082386434078217, "learning_rate": 4.375804375804376e-06, "loss": 0.4731, "step": 23550 }, { "epoch": 30.006435006435005, "grad_norm": 0.09667433053255081, "learning_rate": 4.372944372944373e-06, "loss": 0.4831, "step": 23560 }, { "epoch": 30.006692406692405, "grad_norm": 0.06305726617574692, "learning_rate": 4.37008437008437e-06, "loss": 1.1478, "step": 23570 }, { "epoch": 30.006949806949805, "grad_norm": 0.0013677224051207304, "learning_rate": 4.3672243672243675e-06, "loss": 0.2762, "step": 23580 }, { "epoch": 30.00720720720721, "grad_norm": 0.0005206306814216077, "learning_rate": 4.364364364364365e-06, "loss": 0.0009, "step": 23590 }, { "epoch": 30.00746460746461, "grad_norm": 0.06642114371061325, "learning_rate": 4.361504361504362e-06, "loss": 0.9313, "step": 23600 }, { "epoch": 30.00772200772201, "grad_norm": 0.20846591889858246, "learning_rate": 4.3586443586443586e-06, "loss": 0.5189, "step": 23610 }, { "epoch": 30.00797940797941, "grad_norm": 0.000537847401574254, "learning_rate": 4.355784355784356e-06, "loss": 0.0006, "step": 23620 }, { "epoch": 30.00823680823681, "grad_norm": 0.052646271884441376, "learning_rate": 4.352924352924353e-06, "loss": 0.0002, "step": 23630 }, { "epoch": 30.00849420849421, "grad_norm": 0.11752662807703018, "learning_rate": 4.3500643500643505e-06, "loss": 0.6548, "step": 23640 }, { "epoch": 30.00875160875161, "grad_norm": 170.06265258789062, "learning_rate": 4.347204347204347e-06, "loss": 0.7438, "step": 23650 }, { "epoch": 30.00900900900901, "grad_norm": 0.0005917979287914932, "learning_rate": 4.344344344344344e-06, "loss": 1.3001, "step": 23660 }, { "epoch": 30.00926640926641, "grad_norm": 0.014821925200521946, "learning_rate": 4.341484341484342e-06, "loss": 0.1067, "step": 23670 }, { "epoch": 30.00952380952381, "grad_norm": 0.0017121240962296724, "learning_rate": 4.338624338624339e-06, "loss": 0.5524, "step": 23680 }, { "epoch": 30.00978120978121, "grad_norm": 0.023720256984233856, "learning_rate": 4.335764335764336e-06, "loss": 0.0051, "step": 23690 }, { "epoch": 30.01003861003861, "grad_norm": 0.5435638427734375, "learning_rate": 4.332904332904333e-06, "loss": 0.0102, "step": 23700 }, { "epoch": 30.01029601029601, "grad_norm": 0.012425548397004604, "learning_rate": 4.330044330044331e-06, "loss": 0.1018, "step": 23710 }, { "epoch": 30.01055341055341, "grad_norm": 0.03509173542261124, "learning_rate": 4.327184327184328e-06, "loss": 0.1394, "step": 23720 }, { "epoch": 30.01081081081081, "grad_norm": 92.84154510498047, "learning_rate": 4.324324324324325e-06, "loss": 2.0118, "step": 23730 }, { "epoch": 30.01106821106821, "grad_norm": 254.40797424316406, "learning_rate": 4.321464321464322e-06, "loss": 1.1617, "step": 23740 }, { "epoch": 30.01132561132561, "grad_norm": 0.0050951456651091576, "learning_rate": 4.318604318604319e-06, "loss": 0.1634, "step": 23750 }, { "epoch": 30.01158301158301, "grad_norm": 0.00045341532677412033, "learning_rate": 4.315744315744317e-06, "loss": 0.0071, "step": 23760 }, { "epoch": 30.01184041184041, "grad_norm": 0.08173135668039322, "learning_rate": 4.312884312884313e-06, "loss": 0.0273, "step": 23770 }, { "epoch": 30.012097812097814, "grad_norm": 0.0005044002318754792, "learning_rate": 4.31002431002431e-06, "loss": 0.1153, "step": 23780 }, { "epoch": 30.012355212355214, "grad_norm": 0.11497832834720612, "learning_rate": 4.307164307164308e-06, "loss": 0.0008, "step": 23790 }, { "epoch": 30.012612612612614, "grad_norm": 0.12187304347753525, "learning_rate": 4.304304304304305e-06, "loss": 1.3989, "step": 23800 }, { "epoch": 30.012870012870014, "grad_norm": 5.552525520324707, "learning_rate": 4.301444301444302e-06, "loss": 0.136, "step": 23810 }, { "epoch": 30.013127413127414, "grad_norm": 0.0017980223055928946, "learning_rate": 4.298584298584299e-06, "loss": 0.001, "step": 23820 }, { "epoch": 30.013384813384814, "grad_norm": 74.36280822753906, "learning_rate": 4.295724295724296e-06, "loss": 1.3003, "step": 23830 }, { "epoch": 30.013642213642214, "grad_norm": 0.01991387829184532, "learning_rate": 4.2928642928642934e-06, "loss": 0.0004, "step": 23840 }, { "epoch": 30.013899613899614, "grad_norm": 0.020584069192409515, "learning_rate": 4.290004290004291e-06, "loss": 0.6308, "step": 23850 }, { "epoch": 30.014157014157014, "grad_norm": 0.0006809376063756645, "learning_rate": 4.287144287144287e-06, "loss": 0.0012, "step": 23860 }, { "epoch": 30.014414414414414, "grad_norm": 0.00031621515518054366, "learning_rate": 4.2842842842842845e-06, "loss": 0.0014, "step": 23870 }, { "epoch": 30.014671814671814, "grad_norm": 190.67312622070312, "learning_rate": 4.281424281424282e-06, "loss": 1.3941, "step": 23880 }, { "epoch": 30.014929214929214, "grad_norm": 0.16922950744628906, "learning_rate": 4.278564278564279e-06, "loss": 0.7061, "step": 23890 }, { "epoch": 30.015186615186614, "grad_norm": 0.010776959359645844, "learning_rate": 4.275704275704276e-06, "loss": 0.0011, "step": 23900 }, { "epoch": 30.015444015444015, "grad_norm": 8.569811820983887, "learning_rate": 4.272844272844273e-06, "loss": 1.355, "step": 23910 }, { "epoch": 30.015701415701415, "grad_norm": 0.21295268833637238, "learning_rate": 4.26998426998427e-06, "loss": 0.6669, "step": 23920 }, { "epoch": 30.015958815958815, "grad_norm": 0.0388345941901207, "learning_rate": 4.267124267124268e-06, "loss": 0.6833, "step": 23930 }, { "epoch": 30.016216216216215, "grad_norm": 0.03433572128415108, "learning_rate": 4.264264264264265e-06, "loss": 0.0011, "step": 23940 }, { "epoch": 30.016473616473615, "grad_norm": 0.02409375086426735, "learning_rate": 4.261404261404261e-06, "loss": 0.5623, "step": 23950 }, { "epoch": 30.01673101673102, "grad_norm": 0.0004545144329313189, "learning_rate": 4.258544258544259e-06, "loss": 0.0004, "step": 23960 }, { "epoch": 30.01698841698842, "grad_norm": 0.10565972328186035, "learning_rate": 4.255684255684256e-06, "loss": 0.0009, "step": 23970 }, { "epoch": 30.01724581724582, "grad_norm": 0.0003957377339247614, "learning_rate": 4.252824252824253e-06, "loss": 0.7128, "step": 23980 }, { "epoch": 30.01750321750322, "grad_norm": 0.06979605555534363, "learning_rate": 4.24996424996425e-06, "loss": 0.0011, "step": 23990 }, { "epoch": 30.01776061776062, "grad_norm": 0.014289547689259052, "learning_rate": 4.247104247104247e-06, "loss": 0.0015, "step": 24000 }, { "epoch": 30.01801801801802, "grad_norm": 0.03242909535765648, "learning_rate": 4.2442442442442444e-06, "loss": 0.5711, "step": 24010 }, { "epoch": 30.01827541827542, "grad_norm": 0.01781926117837429, "learning_rate": 4.241384241384242e-06, "loss": 0.1299, "step": 24020 }, { "epoch": 30.01853281853282, "grad_norm": 0.030324866995215416, "learning_rate": 4.238524238524239e-06, "loss": 0.6162, "step": 24030 }, { "epoch": 30.01879021879022, "grad_norm": 103.36428833007812, "learning_rate": 4.2356642356642355e-06, "loss": 0.7377, "step": 24040 }, { "epoch": 30.01904761904762, "grad_norm": 0.0003896548296324909, "learning_rate": 4.232804232804233e-06, "loss": 0.0011, "step": 24050 }, { "epoch": 30.01930501930502, "grad_norm": 0.11918792128562927, "learning_rate": 4.22994422994423e-06, "loss": 1.2608, "step": 24060 }, { "epoch": 30.01956241956242, "grad_norm": 0.00038182828575372696, "learning_rate": 4.2270842270842275e-06, "loss": 1.147, "step": 24070 }, { "epoch": 30.01981981981982, "grad_norm": 0.1870720386505127, "learning_rate": 4.224224224224225e-06, "loss": 0.0165, "step": 24080 }, { "epoch": 30.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.7047258615493774, "eval_runtime": 13.4178, "eval_samples_per_second": 3.428, "eval_steps_per_second": 3.428, "step": 24087 }, { "epoch": 31.00007722007722, "grad_norm": 0.0019550484139472246, "learning_rate": 4.221364221364222e-06, "loss": 0.0009, "step": 24090 }, { "epoch": 31.00033462033462, "grad_norm": 0.004212299361824989, "learning_rate": 4.2185042185042194e-06, "loss": 0.4331, "step": 24100 }, { "epoch": 31.00059202059202, "grad_norm": 0.0004280452849343419, "learning_rate": 4.215644215644216e-06, "loss": 0.001, "step": 24110 }, { "epoch": 31.00084942084942, "grad_norm": 0.00045957849943079054, "learning_rate": 4.212784212784213e-06, "loss": 0.5633, "step": 24120 }, { "epoch": 31.00110682110682, "grad_norm": 0.006528822239488363, "learning_rate": 4.2099242099242105e-06, "loss": 0.0008, "step": 24130 }, { "epoch": 31.00136422136422, "grad_norm": 0.023158643394708633, "learning_rate": 4.207064207064208e-06, "loss": 0.4536, "step": 24140 }, { "epoch": 31.00162162162162, "grad_norm": 0.04101202264428139, "learning_rate": 4.204204204204204e-06, "loss": 0.9761, "step": 24150 }, { "epoch": 31.00187902187902, "grad_norm": 0.0005613996763713658, "learning_rate": 4.201344201344202e-06, "loss": 0.693, "step": 24160 }, { "epoch": 31.002136422136424, "grad_norm": 0.013050409033894539, "learning_rate": 4.198484198484199e-06, "loss": 0.0005, "step": 24170 }, { "epoch": 31.002393822393824, "grad_norm": 0.00033386447466909885, "learning_rate": 4.195624195624196e-06, "loss": 0.2389, "step": 24180 }, { "epoch": 31.002651222651224, "grad_norm": 0.0003567083622328937, "learning_rate": 4.192764192764194e-06, "loss": 0.2853, "step": 24190 }, { "epoch": 31.002908622908624, "grad_norm": 0.0013058069162070751, "learning_rate": 4.18990418990419e-06, "loss": 0.002, "step": 24200 }, { "epoch": 31.003166023166024, "grad_norm": 0.0475904643535614, "learning_rate": 4.187044187044187e-06, "loss": 0.0008, "step": 24210 }, { "epoch": 31.003423423423424, "grad_norm": 0.00036910572089254856, "learning_rate": 4.184184184184185e-06, "loss": 0.0004, "step": 24220 }, { "epoch": 31.003680823680824, "grad_norm": 0.3366129398345947, "learning_rate": 4.181324181324182e-06, "loss": 0.3606, "step": 24230 }, { "epoch": 31.003938223938224, "grad_norm": 0.0003529265522956848, "learning_rate": 4.1784641784641785e-06, "loss": 0.4474, "step": 24240 }, { "epoch": 31.004195624195624, "grad_norm": 0.2059638500213623, "learning_rate": 4.175604175604176e-06, "loss": 0.0879, "step": 24250 }, { "epoch": 31.004453024453024, "grad_norm": 0.00036747619742527604, "learning_rate": 4.172744172744173e-06, "loss": 0.0002, "step": 24260 }, { "epoch": 31.004710424710424, "grad_norm": 217.43408203125, "learning_rate": 4.16988416988417e-06, "loss": 0.5037, "step": 24270 }, { "epoch": 31.004967824967824, "grad_norm": 0.061313144862651825, "learning_rate": 4.167024167024168e-06, "loss": 0.0009, "step": 24280 }, { "epoch": 31.005225225225225, "grad_norm": 0.010008785873651505, "learning_rate": 4.164164164164164e-06, "loss": 0.0001, "step": 24290 }, { "epoch": 31.005482625482625, "grad_norm": 0.0011877252254635096, "learning_rate": 4.1613041613041615e-06, "loss": 2.6035, "step": 24300 }, { "epoch": 31.005740025740025, "grad_norm": 0.049051184207201004, "learning_rate": 4.158444158444159e-06, "loss": 0.4011, "step": 24310 }, { "epoch": 31.005997425997425, "grad_norm": 0.004785649012774229, "learning_rate": 4.155584155584156e-06, "loss": 0.6697, "step": 24320 }, { "epoch": 31.006254826254825, "grad_norm": 0.00534382788464427, "learning_rate": 4.152724152724153e-06, "loss": 0.4767, "step": 24330 }, { "epoch": 31.006512226512225, "grad_norm": 742.736083984375, "learning_rate": 4.14986414986415e-06, "loss": 0.5331, "step": 24340 }, { "epoch": 31.00676962676963, "grad_norm": 0.00026568840257823467, "learning_rate": 4.147004147004147e-06, "loss": 0.2242, "step": 24350 }, { "epoch": 31.00702702702703, "grad_norm": 0.0005520674167200923, "learning_rate": 4.1441441441441446e-06, "loss": 0.6095, "step": 24360 }, { "epoch": 31.00728442728443, "grad_norm": 0.019691677764058113, "learning_rate": 4.141284141284142e-06, "loss": 0.0005, "step": 24370 }, { "epoch": 31.00754182754183, "grad_norm": 0.022847900167107582, "learning_rate": 4.138424138424138e-06, "loss": 0.0009, "step": 24380 }, { "epoch": 31.00779922779923, "grad_norm": 0.00487444456666708, "learning_rate": 4.135564135564136e-06, "loss": 0.9965, "step": 24390 }, { "epoch": 31.00805662805663, "grad_norm": 1136.6373291015625, "learning_rate": 4.132704132704133e-06, "loss": 1.1896, "step": 24400 }, { "epoch": 31.00831402831403, "grad_norm": 0.0009329996537417173, "learning_rate": 4.12984412984413e-06, "loss": 0.4841, "step": 24410 }, { "epoch": 31.00857142857143, "grad_norm": 0.0002942239516414702, "learning_rate": 4.126984126984127e-06, "loss": 0.4838, "step": 24420 }, { "epoch": 31.00882882882883, "grad_norm": 0.000892312207724899, "learning_rate": 4.124124124124124e-06, "loss": 0.0002, "step": 24430 }, { "epoch": 31.00908622908623, "grad_norm": 84.1487808227539, "learning_rate": 4.121264121264121e-06, "loss": 0.5793, "step": 24440 }, { "epoch": 31.00934362934363, "grad_norm": 0.0861109271645546, "learning_rate": 4.118404118404119e-06, "loss": 0.7574, "step": 24450 }, { "epoch": 31.00960102960103, "grad_norm": 0.053891342133283615, "learning_rate": 4.115544115544116e-06, "loss": 0.4293, "step": 24460 }, { "epoch": 31.00985842985843, "grad_norm": 0.022826682776212692, "learning_rate": 4.112684112684113e-06, "loss": 0.1316, "step": 24470 }, { "epoch": 31.01011583011583, "grad_norm": 0.012865704484283924, "learning_rate": 4.109824109824111e-06, "loss": 0.5083, "step": 24480 }, { "epoch": 31.01037323037323, "grad_norm": 0.0002833681064657867, "learning_rate": 4.106964106964107e-06, "loss": 0.9789, "step": 24490 }, { "epoch": 31.01063063063063, "grad_norm": 0.1123170480132103, "learning_rate": 4.1041041041041045e-06, "loss": 0.3515, "step": 24500 }, { "epoch": 31.01088803088803, "grad_norm": 0.026177385821938515, "learning_rate": 4.101244101244102e-06, "loss": 0.0005, "step": 24510 }, { "epoch": 31.01114543114543, "grad_norm": 0.00027682280051521957, "learning_rate": 4.098384098384099e-06, "loss": 0.6438, "step": 24520 }, { "epoch": 31.011402831402833, "grad_norm": 0.029603002592921257, "learning_rate": 4.095524095524096e-06, "loss": 0.6032, "step": 24530 }, { "epoch": 31.011660231660233, "grad_norm": 0.0008603780879639089, "learning_rate": 4.092664092664093e-06, "loss": 0.6519, "step": 24540 }, { "epoch": 31.011917631917633, "grad_norm": 0.012208040803670883, "learning_rate": 4.08980408980409e-06, "loss": 0.7892, "step": 24550 }, { "epoch": 31.012175032175033, "grad_norm": 0.00032583274878561497, "learning_rate": 4.0869440869440875e-06, "loss": 0.002, "step": 24560 }, { "epoch": 31.012432432432433, "grad_norm": 0.00026643762248568237, "learning_rate": 4.084084084084085e-06, "loss": 0.0021, "step": 24570 }, { "epoch": 31.012689832689833, "grad_norm": 0.002847940195351839, "learning_rate": 4.081224081224081e-06, "loss": 0.9015, "step": 24580 }, { "epoch": 31.012947232947234, "grad_norm": 0.016298290342092514, "learning_rate": 4.078364078364079e-06, "loss": 0.0007, "step": 24590 }, { "epoch": 31.013204633204634, "grad_norm": 0.04232637211680412, "learning_rate": 4.075504075504076e-06, "loss": 0.3838, "step": 24600 }, { "epoch": 31.013462033462034, "grad_norm": 0.0029244639445096254, "learning_rate": 4.072644072644073e-06, "loss": 0.6542, "step": 24610 }, { "epoch": 31.013719433719434, "grad_norm": 0.07914727926254272, "learning_rate": 4.0697840697840706e-06, "loss": 0.2355, "step": 24620 }, { "epoch": 31.013976833976834, "grad_norm": 0.07038512825965881, "learning_rate": 4.066924066924067e-06, "loss": 0.5107, "step": 24630 }, { "epoch": 31.014234234234234, "grad_norm": 0.00048367600538767874, "learning_rate": 4.064064064064064e-06, "loss": 0.6973, "step": 24640 }, { "epoch": 31.014491634491634, "grad_norm": 0.00034090675762854517, "learning_rate": 4.061204061204062e-06, "loss": 0.1097, "step": 24650 }, { "epoch": 31.014749034749034, "grad_norm": 421.32574462890625, "learning_rate": 4.058344058344059e-06, "loss": 1.0028, "step": 24660 }, { "epoch": 31.015006435006434, "grad_norm": 0.11087636649608612, "learning_rate": 4.0554840554840554e-06, "loss": 0.0003, "step": 24670 }, { "epoch": 31.015263835263834, "grad_norm": 0.05585281923413277, "learning_rate": 4.052624052624053e-06, "loss": 0.6875, "step": 24680 }, { "epoch": 31.015521235521234, "grad_norm": 0.04771759733557701, "learning_rate": 4.04976404976405e-06, "loss": 0.0004, "step": 24690 }, { "epoch": 31.015778635778634, "grad_norm": 0.010960754007101059, "learning_rate": 4.046904046904047e-06, "loss": 1.7557, "step": 24700 }, { "epoch": 31.016036036036034, "grad_norm": 0.17178845405578613, "learning_rate": 4.044044044044044e-06, "loss": 0.5631, "step": 24710 }, { "epoch": 31.016293436293438, "grad_norm": 0.07653289288282394, "learning_rate": 4.041184041184041e-06, "loss": 0.324, "step": 24720 }, { "epoch": 31.016550836550838, "grad_norm": 0.00036575511330738664, "learning_rate": 4.0383240383240385e-06, "loss": 1.0434, "step": 24730 }, { "epoch": 31.016808236808238, "grad_norm": 0.047172967344522476, "learning_rate": 4.035464035464036e-06, "loss": 0.4783, "step": 24740 }, { "epoch": 31.017065637065638, "grad_norm": 8.044768333435059, "learning_rate": 4.032604032604033e-06, "loss": 0.0029, "step": 24750 }, { "epoch": 31.017323037323038, "grad_norm": 0.007932187058031559, "learning_rate": 4.02974402974403e-06, "loss": 0.0013, "step": 24760 }, { "epoch": 31.01758043758044, "grad_norm": 0.02719135954976082, "learning_rate": 4.026884026884027e-06, "loss": 0.6169, "step": 24770 }, { "epoch": 31.01783783783784, "grad_norm": 0.00028610503068193793, "learning_rate": 4.024024024024024e-06, "loss": 0.0003, "step": 24780 }, { "epoch": 31.01809523809524, "grad_norm": 0.021465850993990898, "learning_rate": 4.0211640211640215e-06, "loss": 0.495, "step": 24790 }, { "epoch": 31.01835263835264, "grad_norm": 3.276745319366455, "learning_rate": 4.018304018304018e-06, "loss": 0.0036, "step": 24800 }, { "epoch": 31.01861003861004, "grad_norm": 0.0002894099452532828, "learning_rate": 4.015444015444015e-06, "loss": 0.0085, "step": 24810 }, { "epoch": 31.01886743886744, "grad_norm": 0.00047004703083075583, "learning_rate": 4.012584012584013e-06, "loss": 0.6427, "step": 24820 }, { "epoch": 31.01912483912484, "grad_norm": 0.005355236120522022, "learning_rate": 4.00972400972401e-06, "loss": 0.0008, "step": 24830 }, { "epoch": 31.01938223938224, "grad_norm": 9.087779998779297, "learning_rate": 4.006864006864007e-06, "loss": 0.4508, "step": 24840 }, { "epoch": 31.01963963963964, "grad_norm": 0.08665086328983307, "learning_rate": 4.004004004004005e-06, "loss": 1.1483, "step": 24850 }, { "epoch": 31.01989703989704, "grad_norm": 0.0019484072690829635, "learning_rate": 4.001144001144002e-06, "loss": 1.7687, "step": 24860 }, { "epoch": 31.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.6157187819480896, "eval_runtime": 13.4004, "eval_samples_per_second": 3.433, "eval_steps_per_second": 3.433, "step": 24864 }, { "epoch": 32.00015444015444, "grad_norm": 0.020324936136603355, "learning_rate": 3.998283998283999e-06, "loss": 0.7948, "step": 24870 }, { "epoch": 32.00041184041184, "grad_norm": 0.0005573392263613641, "learning_rate": 3.995423995423996e-06, "loss": 0.0005, "step": 24880 }, { "epoch": 32.00066924066924, "grad_norm": 0.022215578705072403, "learning_rate": 3.992563992563993e-06, "loss": 0.0014, "step": 24890 }, { "epoch": 32.00092664092664, "grad_norm": 0.2555524408817291, "learning_rate": 3.98970398970399e-06, "loss": 0.4709, "step": 24900 }, { "epoch": 32.00118404118404, "grad_norm": 0.0002522377180866897, "learning_rate": 3.986843986843988e-06, "loss": 0.5217, "step": 24910 }, { "epoch": 32.00144144144144, "grad_norm": 0.0034697242081165314, "learning_rate": 3.983983983983984e-06, "loss": 1.5856, "step": 24920 }, { "epoch": 32.00169884169884, "grad_norm": 0.0024689610581845045, "learning_rate": 3.9811239811239814e-06, "loss": 0.0005, "step": 24930 }, { "epoch": 32.00195624195624, "grad_norm": 0.00033844856079667807, "learning_rate": 3.978263978263979e-06, "loss": 0.0468, "step": 24940 }, { "epoch": 32.00221364221364, "grad_norm": 1.361294150352478, "learning_rate": 3.975403975403976e-06, "loss": 0.3848, "step": 24950 }, { "epoch": 32.00247104247104, "grad_norm": 0.0002234355779364705, "learning_rate": 3.9725439725439725e-06, "loss": 0.001, "step": 24960 }, { "epoch": 32.00272844272844, "grad_norm": 105.61128997802734, "learning_rate": 3.96968396968397e-06, "loss": 0.5425, "step": 24970 }, { "epoch": 32.002985842985844, "grad_norm": 0.0008264643256552517, "learning_rate": 3.966823966823967e-06, "loss": 0.0005, "step": 24980 }, { "epoch": 32.00324324324324, "grad_norm": 0.0004085392283741385, "learning_rate": 3.9639639639639645e-06, "loss": 0.8064, "step": 24990 }, { "epoch": 32.003500643500644, "grad_norm": 0.009747138246893883, "learning_rate": 3.961103961103962e-06, "loss": 0.2604, "step": 25000 }, { "epoch": 32.00375804375804, "grad_norm": 0.002869596006348729, "learning_rate": 3.958243958243958e-06, "loss": 0.716, "step": 25010 }, { "epoch": 32.004015444015444, "grad_norm": 0.008356648497283459, "learning_rate": 3.955383955383956e-06, "loss": 0.6306, "step": 25020 }, { "epoch": 32.00427284427285, "grad_norm": 0.0006081182509660721, "learning_rate": 3.952523952523953e-06, "loss": 0.0037, "step": 25030 }, { "epoch": 32.004530244530244, "grad_norm": 0.0005686648073606193, "learning_rate": 3.94966394966395e-06, "loss": 0.0006, "step": 25040 }, { "epoch": 32.00478764478765, "grad_norm": 0.0034270593896508217, "learning_rate": 3.946803946803947e-06, "loss": 0.0019, "step": 25050 }, { "epoch": 32.005045045045044, "grad_norm": 0.023503242060542107, "learning_rate": 3.943943943943944e-06, "loss": 0.0014, "step": 25060 }, { "epoch": 32.00530244530245, "grad_norm": 59.9887580871582, "learning_rate": 3.941083941083941e-06, "loss": 1.1883, "step": 25070 }, { "epoch": 32.005559845559844, "grad_norm": 0.11845483630895615, "learning_rate": 3.938223938223939e-06, "loss": 0.001, "step": 25080 }, { "epoch": 32.00581724581725, "grad_norm": 0.020497610792517662, "learning_rate": 3.935363935363936e-06, "loss": 0.3378, "step": 25090 }, { "epoch": 32.006074646074644, "grad_norm": 0.007013235706835985, "learning_rate": 3.932503932503932e-06, "loss": 1.4059, "step": 25100 }, { "epoch": 32.00633204633205, "grad_norm": 0.02268589660525322, "learning_rate": 3.92964392964393e-06, "loss": 0.7253, "step": 25110 }, { "epoch": 32.006589446589444, "grad_norm": 0.00040192139567807317, "learning_rate": 3.926783926783927e-06, "loss": 1.0919, "step": 25120 }, { "epoch": 32.00684684684685, "grad_norm": 0.00022598865325562656, "learning_rate": 3.923923923923924e-06, "loss": 0.0539, "step": 25130 }, { "epoch": 32.007104247104245, "grad_norm": 0.029369860887527466, "learning_rate": 3.921063921063921e-06, "loss": 0.0015, "step": 25140 }, { "epoch": 32.00736164736165, "grad_norm": 0.0013951669679954648, "learning_rate": 3.918203918203918e-06, "loss": 0.003, "step": 25150 }, { "epoch": 32.007619047619045, "grad_norm": 253.09658813476562, "learning_rate": 3.9153439153439155e-06, "loss": 0.536, "step": 25160 }, { "epoch": 32.00787644787645, "grad_norm": 155.09249877929688, "learning_rate": 3.912483912483913e-06, "loss": 0.9932, "step": 25170 }, { "epoch": 32.008133848133845, "grad_norm": 0.008935080841183662, "learning_rate": 3.90962390962391e-06, "loss": 0.5176, "step": 25180 }, { "epoch": 32.00839124839125, "grad_norm": 0.010084914974868298, "learning_rate": 3.9067639067639066e-06, "loss": 0.5228, "step": 25190 }, { "epoch": 32.00864864864865, "grad_norm": 107.02832794189453, "learning_rate": 3.903903903903904e-06, "loss": 1.1572, "step": 25200 }, { "epoch": 32.00890604890605, "grad_norm": 0.013532307930290699, "learning_rate": 3.901043901043902e-06, "loss": 0.5614, "step": 25210 }, { "epoch": 32.00916344916345, "grad_norm": 739.5260620117188, "learning_rate": 3.8981838981838985e-06, "loss": 0.4832, "step": 25220 }, { "epoch": 32.00942084942085, "grad_norm": 113.25529479980469, "learning_rate": 3.895323895323896e-06, "loss": 1.1142, "step": 25230 }, { "epoch": 32.00967824967825, "grad_norm": 0.4678283929824829, "learning_rate": 3.892463892463893e-06, "loss": 0.5052, "step": 25240 }, { "epoch": 32.00993564993565, "grad_norm": 229.15992736816406, "learning_rate": 3.8896038896038905e-06, "loss": 0.5591, "step": 25250 }, { "epoch": 32.01019305019305, "grad_norm": 135.36276245117188, "learning_rate": 3.886743886743887e-06, "loss": 0.8544, "step": 25260 }, { "epoch": 32.01045045045045, "grad_norm": 98.59178161621094, "learning_rate": 3.883883883883884e-06, "loss": 1.0966, "step": 25270 }, { "epoch": 32.01070785070785, "grad_norm": 56.150657653808594, "learning_rate": 3.8810238810238816e-06, "loss": 0.2389, "step": 25280 }, { "epoch": 32.01096525096525, "grad_norm": 0.33948537707328796, "learning_rate": 3.878163878163879e-06, "loss": 0.2457, "step": 25290 }, { "epoch": 32.01122265122265, "grad_norm": 0.014132946729660034, "learning_rate": 3.875303875303875e-06, "loss": 0.001, "step": 25300 }, { "epoch": 32.01148005148005, "grad_norm": 0.06854890286922455, "learning_rate": 3.872443872443873e-06, "loss": 0.0011, "step": 25310 }, { "epoch": 32.01173745173745, "grad_norm": 0.43976348638534546, "learning_rate": 3.86958386958387e-06, "loss": 0.648, "step": 25320 }, { "epoch": 32.01199485199485, "grad_norm": 0.00041257721022702754, "learning_rate": 3.866723866723867e-06, "loss": 0.001, "step": 25330 }, { "epoch": 32.01225225225225, "grad_norm": 0.00022966216783970594, "learning_rate": 3.863863863863865e-06, "loss": 0.4409, "step": 25340 }, { "epoch": 32.01250965250965, "grad_norm": 0.000402516481699422, "learning_rate": 3.861003861003861e-06, "loss": 0.0005, "step": 25350 }, { "epoch": 32.01276705276705, "grad_norm": 0.010474207811057568, "learning_rate": 3.858143858143858e-06, "loss": 0.579, "step": 25360 }, { "epoch": 32.01302445302445, "grad_norm": 0.21792848408222198, "learning_rate": 3.855283855283856e-06, "loss": 0.8084, "step": 25370 }, { "epoch": 32.01328185328185, "grad_norm": 0.009377431124448776, "learning_rate": 3.852423852423853e-06, "loss": 0.4596, "step": 25380 }, { "epoch": 32.01353925353926, "grad_norm": 0.01633077673614025, "learning_rate": 3.8495638495638495e-06, "loss": 0.0007, "step": 25390 }, { "epoch": 32.01379665379665, "grad_norm": 0.10646045953035355, "learning_rate": 3.846703846703847e-06, "loss": 0.3559, "step": 25400 }, { "epoch": 32.01405405405406, "grad_norm": 0.03003956936299801, "learning_rate": 3.843843843843844e-06, "loss": 0.0277, "step": 25410 }, { "epoch": 32.01431145431145, "grad_norm": 0.0002730604028329253, "learning_rate": 3.8409838409838415e-06, "loss": 0.1572, "step": 25420 }, { "epoch": 32.01456885456886, "grad_norm": 0.0002827415300998837, "learning_rate": 3.838123838123839e-06, "loss": 0.006, "step": 25430 }, { "epoch": 32.014826254826254, "grad_norm": 0.01532386802136898, "learning_rate": 3.835263835263835e-06, "loss": 0.0006, "step": 25440 }, { "epoch": 32.01508365508366, "grad_norm": 0.0004180877876933664, "learning_rate": 3.8324038324038326e-06, "loss": 0.0003, "step": 25450 }, { "epoch": 32.015341055341054, "grad_norm": 0.00066907680593431, "learning_rate": 3.82954382954383e-06, "loss": 0.4843, "step": 25460 }, { "epoch": 32.01559845559846, "grad_norm": 0.00021530227968469262, "learning_rate": 3.826683826683827e-06, "loss": 2.1196, "step": 25470 }, { "epoch": 32.015855855855854, "grad_norm": 0.05011071264743805, "learning_rate": 3.823823823823824e-06, "loss": 0.0141, "step": 25480 }, { "epoch": 32.01611325611326, "grad_norm": 0.005272657610476017, "learning_rate": 3.820963820963821e-06, "loss": 0.0011, "step": 25490 }, { "epoch": 32.016370656370654, "grad_norm": 0.08962148427963257, "learning_rate": 3.818103818103818e-06, "loss": 0.0014, "step": 25500 }, { "epoch": 32.01662805662806, "grad_norm": 0.09578029066324234, "learning_rate": 3.815243815243816e-06, "loss": 0.675, "step": 25510 }, { "epoch": 32.016885456885454, "grad_norm": 0.04941503703594208, "learning_rate": 3.8123838123838125e-06, "loss": 0.0019, "step": 25520 }, { "epoch": 32.01714285714286, "grad_norm": 0.3917921185493469, "learning_rate": 3.80952380952381e-06, "loss": 0.0006, "step": 25530 }, { "epoch": 32.017400257400254, "grad_norm": 0.053205542266368866, "learning_rate": 3.8066638066638067e-06, "loss": 0.0003, "step": 25540 }, { "epoch": 32.01765765765766, "grad_norm": 0.0002905249421019107, "learning_rate": 3.803803803803804e-06, "loss": 0.8974, "step": 25550 }, { "epoch": 32.017915057915054, "grad_norm": 105.43701934814453, "learning_rate": 3.800943800943801e-06, "loss": 1.1082, "step": 25560 }, { "epoch": 32.01817245817246, "grad_norm": 0.18681477010250092, "learning_rate": 3.7980837980837982e-06, "loss": 0.0003, "step": 25570 }, { "epoch": 32.01842985842986, "grad_norm": 0.0005921496194787323, "learning_rate": 3.795223795223796e-06, "loss": 0.0322, "step": 25580 }, { "epoch": 32.01868725868726, "grad_norm": 0.0002967441687360406, "learning_rate": 3.792363792363793e-06, "loss": 0.0008, "step": 25590 }, { "epoch": 32.01894465894466, "grad_norm": 0.03639024868607521, "learning_rate": 3.78950378950379e-06, "loss": 0.1673, "step": 25600 }, { "epoch": 32.01920205920206, "grad_norm": 0.00020979787223041058, "learning_rate": 3.786643786643787e-06, "loss": 0.6608, "step": 25610 }, { "epoch": 32.01945945945946, "grad_norm": 0.0004491186118684709, "learning_rate": 3.7837837837837844e-06, "loss": 0.0006, "step": 25620 }, { "epoch": 32.01971685971686, "grad_norm": 0.14153051376342773, "learning_rate": 3.7809237809237813e-06, "loss": 0.4424, "step": 25630 }, { "epoch": 32.01997425997426, "grad_norm": 0.1150650754570961, "learning_rate": 3.7780637780637786e-06, "loss": 0.0036, "step": 25640 }, { "epoch": 32.02, "eval_accuracy": 0.8478260869565217, "eval_loss": 1.0988526344299316, "eval_runtime": 13.5282, "eval_samples_per_second": 3.4, "eval_steps_per_second": 3.4, "step": 25641 }, { "epoch": 33.00023166023166, "grad_norm": 0.0002022027620114386, "learning_rate": 3.7752037752037755e-06, "loss": 0.0001, "step": 25650 }, { "epoch": 33.00048906048906, "grad_norm": 0.6226126551628113, "learning_rate": 3.772343772343773e-06, "loss": 0.6352, "step": 25660 }, { "epoch": 33.00074646074646, "grad_norm": 0.00021000955894123763, "learning_rate": 3.76948376948377e-06, "loss": 0.0012, "step": 25670 }, { "epoch": 33.00100386100386, "grad_norm": 0.0005035571521148086, "learning_rate": 3.766623766623767e-06, "loss": 0.0003, "step": 25680 }, { "epoch": 33.00126126126126, "grad_norm": 0.0008679556776769459, "learning_rate": 3.7637637637637643e-06, "loss": 0.0001, "step": 25690 }, { "epoch": 33.00151866151866, "grad_norm": 0.000201143222511746, "learning_rate": 3.7609037609037612e-06, "loss": 0.0049, "step": 25700 }, { "epoch": 33.00177606177606, "grad_norm": 0.00031288861646316946, "learning_rate": 3.7580437580437585e-06, "loss": 0.0017, "step": 25710 }, { "epoch": 33.00203346203346, "grad_norm": 0.0003889285144396126, "learning_rate": 3.7551837551837554e-06, "loss": 0.0616, "step": 25720 }, { "epoch": 33.00229086229086, "grad_norm": 0.034657277166843414, "learning_rate": 3.7523237523237528e-06, "loss": 0.7658, "step": 25730 }, { "epoch": 33.00254826254826, "grad_norm": 0.0001886966492747888, "learning_rate": 3.7494637494637496e-06, "loss": 0.0004, "step": 25740 }, { "epoch": 33.00280566280566, "grad_norm": 0.007791679818183184, "learning_rate": 3.746603746603747e-06, "loss": 0.0003, "step": 25750 }, { "epoch": 33.00306306306306, "grad_norm": 0.013618108816444874, "learning_rate": 3.743743743743744e-06, "loss": 0.2169, "step": 25760 }, { "epoch": 33.00332046332046, "grad_norm": 0.00634233420714736, "learning_rate": 3.740883740883741e-06, "loss": 0.0002, "step": 25770 }, { "epoch": 33.00357786357787, "grad_norm": 0.0011721225455403328, "learning_rate": 3.7380237380237385e-06, "loss": 0.4377, "step": 25780 }, { "epoch": 33.00383526383526, "grad_norm": 0.0002262504567625001, "learning_rate": 3.7351637351637354e-06, "loss": 0.0301, "step": 25790 }, { "epoch": 33.00409266409267, "grad_norm": 0.00018854282097890973, "learning_rate": 3.7323037323037327e-06, "loss": 0.8319, "step": 25800 }, { "epoch": 33.004350064350064, "grad_norm": 0.13399730622768402, "learning_rate": 3.7294437294437296e-06, "loss": 0.092, "step": 25810 }, { "epoch": 33.00460746460747, "grad_norm": 0.02621803991496563, "learning_rate": 3.726583726583727e-06, "loss": 0.4453, "step": 25820 }, { "epoch": 33.004864864864864, "grad_norm": 2.0014894008636475, "learning_rate": 3.723723723723724e-06, "loss": 0.1793, "step": 25830 }, { "epoch": 33.00512226512227, "grad_norm": 214.54891967773438, "learning_rate": 3.720863720863721e-06, "loss": 0.0317, "step": 25840 }, { "epoch": 33.005379665379664, "grad_norm": 0.005550840869545937, "learning_rate": 3.718003718003718e-06, "loss": 0.7145, "step": 25850 }, { "epoch": 33.00563706563707, "grad_norm": 0.007037818897515535, "learning_rate": 3.7151437151437153e-06, "loss": 0.7924, "step": 25860 }, { "epoch": 33.005894465894464, "grad_norm": 0.009896584786474705, "learning_rate": 3.7122837122837122e-06, "loss": 0.4399, "step": 25870 }, { "epoch": 33.00615186615187, "grad_norm": 0.17795123159885406, "learning_rate": 3.7094237094237095e-06, "loss": 1.1506, "step": 25880 }, { "epoch": 33.006409266409264, "grad_norm": 0.0003295901115052402, "learning_rate": 3.706563706563707e-06, "loss": 0.7586, "step": 25890 }, { "epoch": 33.00666666666667, "grad_norm": 0.006462703924626112, "learning_rate": 3.7037037037037037e-06, "loss": 0.0015, "step": 25900 }, { "epoch": 33.006924066924064, "grad_norm": 0.0007809298695065081, "learning_rate": 3.700843700843701e-06, "loss": 0.0002, "step": 25910 }, { "epoch": 33.00718146718147, "grad_norm": 0.10808756947517395, "learning_rate": 3.697983697983698e-06, "loss": 1.0891, "step": 25920 }, { "epoch": 33.007438867438864, "grad_norm": 0.0005664303316734731, "learning_rate": 3.6951236951236953e-06, "loss": 1.0756, "step": 25930 }, { "epoch": 33.00769626769627, "grad_norm": 0.39256948232650757, "learning_rate": 3.692263692263692e-06, "loss": 0.6975, "step": 25940 }, { "epoch": 33.007953667953664, "grad_norm": 0.05600004643201828, "learning_rate": 3.68940368940369e-06, "loss": 0.0004, "step": 25950 }, { "epoch": 33.00821106821107, "grad_norm": 0.0007221155683510005, "learning_rate": 3.6865436865436872e-06, "loss": 1.3454, "step": 25960 }, { "epoch": 33.00846846846847, "grad_norm": 0.0249467883259058, "learning_rate": 3.683683683683684e-06, "loss": 0.0002, "step": 25970 }, { "epoch": 33.00872586872587, "grad_norm": 0.01132212020456791, "learning_rate": 3.6808236808236814e-06, "loss": 0.8555, "step": 25980 }, { "epoch": 33.00898326898327, "grad_norm": 0.20580242574214935, "learning_rate": 3.6779636779636783e-06, "loss": 0.0009, "step": 25990 }, { "epoch": 33.00924066924067, "grad_norm": 0.004588236566632986, "learning_rate": 3.6751036751036756e-06, "loss": 0.0003, "step": 26000 }, { "epoch": 33.00949806949807, "grad_norm": 51.92314529418945, "learning_rate": 3.6722436722436725e-06, "loss": 0.768, "step": 26010 }, { "epoch": 33.00975546975547, "grad_norm": 0.023511553183197975, "learning_rate": 3.66938366938367e-06, "loss": 0.1939, "step": 26020 }, { "epoch": 33.01001287001287, "grad_norm": 0.00020286561630200595, "learning_rate": 3.666523666523667e-06, "loss": 1.453, "step": 26030 }, { "epoch": 33.01027027027027, "grad_norm": 0.03150779381394386, "learning_rate": 3.663663663663664e-06, "loss": 0.5429, "step": 26040 }, { "epoch": 33.01052767052767, "grad_norm": 1292.7117919921875, "learning_rate": 3.6608036608036614e-06, "loss": 0.4714, "step": 26050 }, { "epoch": 33.01078507078507, "grad_norm": 0.0039928508922457695, "learning_rate": 3.6579436579436583e-06, "loss": 0.0025, "step": 26060 }, { "epoch": 33.01104247104247, "grad_norm": 0.004026371985673904, "learning_rate": 3.6550836550836556e-06, "loss": 0.2678, "step": 26070 }, { "epoch": 33.01129987129987, "grad_norm": 0.001805619103834033, "learning_rate": 3.6522236522236525e-06, "loss": 0.0078, "step": 26080 }, { "epoch": 33.01155727155727, "grad_norm": 0.0019118794007226825, "learning_rate": 3.6493636493636498e-06, "loss": 0.5989, "step": 26090 }, { "epoch": 33.01181467181467, "grad_norm": 0.003230776870623231, "learning_rate": 3.6465036465036467e-06, "loss": 0.0005, "step": 26100 }, { "epoch": 33.01207207207207, "grad_norm": 0.00032252943492494524, "learning_rate": 3.643643643643644e-06, "loss": 0.0008, "step": 26110 }, { "epoch": 33.01232947232947, "grad_norm": 0.0008710320107638836, "learning_rate": 3.6407836407836413e-06, "loss": 0.4259, "step": 26120 }, { "epoch": 33.01258687258687, "grad_norm": 0.014103787951171398, "learning_rate": 3.637923637923638e-06, "loss": 0.001, "step": 26130 }, { "epoch": 33.012844272844276, "grad_norm": 0.06376767158508301, "learning_rate": 3.6350636350636355e-06, "loss": 0.0009, "step": 26140 }, { "epoch": 33.01310167310167, "grad_norm": 0.0013607190921902657, "learning_rate": 3.6322036322036324e-06, "loss": 0.066, "step": 26150 }, { "epoch": 33.01335907335908, "grad_norm": 0.00027021009009331465, "learning_rate": 3.6293436293436297e-06, "loss": 0.0029, "step": 26160 }, { "epoch": 33.01361647361647, "grad_norm": 0.01180395856499672, "learning_rate": 3.6264836264836266e-06, "loss": 0.0004, "step": 26170 }, { "epoch": 33.01387387387388, "grad_norm": 0.04038818180561066, "learning_rate": 3.623623623623624e-06, "loss": 0.0004, "step": 26180 }, { "epoch": 33.01413127413127, "grad_norm": 0.016394222155213356, "learning_rate": 3.620763620763621e-06, "loss": 0.7459, "step": 26190 }, { "epoch": 33.01438867438868, "grad_norm": 577.7449340820312, "learning_rate": 3.617903617903618e-06, "loss": 0.2242, "step": 26200 }, { "epoch": 33.01464607464607, "grad_norm": 0.015523134730756283, "learning_rate": 3.615043615043615e-06, "loss": 0.0001, "step": 26210 }, { "epoch": 33.01490347490348, "grad_norm": 0.02036185935139656, "learning_rate": 3.6121836121836124e-06, "loss": 1.3635, "step": 26220 }, { "epoch": 33.01516087516087, "grad_norm": 45.18238830566406, "learning_rate": 3.6093236093236097e-06, "loss": 2.0981, "step": 26230 }, { "epoch": 33.01541827541828, "grad_norm": 0.09267007559537888, "learning_rate": 3.6064636064636066e-06, "loss": 0.639, "step": 26240 }, { "epoch": 33.01567567567567, "grad_norm": 1.0800601243972778, "learning_rate": 3.603603603603604e-06, "loss": 0.0284, "step": 26250 }, { "epoch": 33.01593307593308, "grad_norm": 0.0005480324034579098, "learning_rate": 3.6007436007436008e-06, "loss": 0.9278, "step": 26260 }, { "epoch": 33.016190476190474, "grad_norm": 0.27580541372299194, "learning_rate": 3.597883597883598e-06, "loss": 0.6449, "step": 26270 }, { "epoch": 33.01644787644788, "grad_norm": 0.0772259458899498, "learning_rate": 3.595023595023595e-06, "loss": 0.0012, "step": 26280 }, { "epoch": 33.016705276705274, "grad_norm": 0.036952629685401917, "learning_rate": 3.5921635921635923e-06, "loss": 0.0406, "step": 26290 }, { "epoch": 33.01696267696268, "grad_norm": 0.0002245551295345649, "learning_rate": 3.589303589303589e-06, "loss": 0.001, "step": 26300 }, { "epoch": 33.017220077220074, "grad_norm": 3.1717610359191895, "learning_rate": 3.5864435864435865e-06, "loss": 0.001, "step": 26310 }, { "epoch": 33.01747747747748, "grad_norm": 0.0382111519575119, "learning_rate": 3.5835835835835834e-06, "loss": 0.4692, "step": 26320 }, { "epoch": 33.01773487773488, "grad_norm": 0.0640905350446701, "learning_rate": 3.580723580723581e-06, "loss": 0.5739, "step": 26330 }, { "epoch": 33.01799227799228, "grad_norm": 0.00023219177091959864, "learning_rate": 3.5778635778635785e-06, "loss": 0.9233, "step": 26340 }, { "epoch": 33.01824967824968, "grad_norm": 0.040953151881694794, "learning_rate": 3.5750035750035753e-06, "loss": 0.0011, "step": 26350 }, { "epoch": 33.01850707850708, "grad_norm": 16.874282836914062, "learning_rate": 3.5721435721435727e-06, "loss": 0.6596, "step": 26360 }, { "epoch": 33.01876447876448, "grad_norm": 0.0005724704824388027, "learning_rate": 3.56928356928357e-06, "loss": 0.0016, "step": 26370 }, { "epoch": 33.01902187902188, "grad_norm": 401.7337341308594, "learning_rate": 3.566423566423567e-06, "loss": 0.4095, "step": 26380 }, { "epoch": 33.01927927927928, "grad_norm": 0.0027081759180873632, "learning_rate": 3.563563563563564e-06, "loss": 1.1057, "step": 26390 }, { "epoch": 33.01953667953668, "grad_norm": 0.11730697751045227, "learning_rate": 3.560703560703561e-06, "loss": 1.2431, "step": 26400 }, { "epoch": 33.01979407979408, "grad_norm": 0.0015030332142487168, "learning_rate": 3.5578435578435584e-06, "loss": 0.6257, "step": 26410 }, { "epoch": 33.02, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.9700731039047241, "eval_runtime": 13.4331, "eval_samples_per_second": 3.424, "eval_steps_per_second": 3.424, "step": 26418 }, { "epoch": 34.00005148005148, "grad_norm": 0.21256451308727264, "learning_rate": 3.5549835549835553e-06, "loss": 1.1551, "step": 26420 }, { "epoch": 34.00030888030888, "grad_norm": 0.005612839478999376, "learning_rate": 3.5521235521235526e-06, "loss": 0.5618, "step": 26430 }, { "epoch": 34.00056628056628, "grad_norm": 0.0003517004370223731, "learning_rate": 3.5492635492635495e-06, "loss": 0.0014, "step": 26440 }, { "epoch": 34.00082368082368, "grad_norm": 0.01810491643846035, "learning_rate": 3.546403546403547e-06, "loss": 0.6831, "step": 26450 }, { "epoch": 34.00108108108108, "grad_norm": 0.00031709542963653803, "learning_rate": 3.5435435435435437e-06, "loss": 0.0011, "step": 26460 }, { "epoch": 34.00133848133848, "grad_norm": 0.0002846124698407948, "learning_rate": 3.540683540683541e-06, "loss": 0.4127, "step": 26470 }, { "epoch": 34.00159588159588, "grad_norm": 0.00024031595967244357, "learning_rate": 3.5378235378235383e-06, "loss": 0.0572, "step": 26480 }, { "epoch": 34.00185328185328, "grad_norm": 42.76933288574219, "learning_rate": 3.5349635349635352e-06, "loss": 0.0024, "step": 26490 }, { "epoch": 34.00211068211068, "grad_norm": 0.061154335737228394, "learning_rate": 3.5321035321035326e-06, "loss": 0.0007, "step": 26500 }, { "epoch": 34.00236808236808, "grad_norm": 0.0024257488548755646, "learning_rate": 3.5292435292435294e-06, "loss": 0.0002, "step": 26510 }, { "epoch": 34.00262548262548, "grad_norm": 0.0003811272617895156, "learning_rate": 3.5263835263835268e-06, "loss": 0.0004, "step": 26520 }, { "epoch": 34.002882882882886, "grad_norm": 0.0008752444991841912, "learning_rate": 3.5235235235235237e-06, "loss": 0.4638, "step": 26530 }, { "epoch": 34.00314028314028, "grad_norm": 0.001001368393190205, "learning_rate": 3.520663520663521e-06, "loss": 0.6171, "step": 26540 }, { "epoch": 34.00339768339769, "grad_norm": 0.13201755285263062, "learning_rate": 3.517803517803518e-06, "loss": 1.412, "step": 26550 }, { "epoch": 34.00365508365508, "grad_norm": 0.00029380101477727294, "learning_rate": 3.514943514943515e-06, "loss": 0.0001, "step": 26560 }, { "epoch": 34.00391248391249, "grad_norm": 0.07130458950996399, "learning_rate": 3.512083512083512e-06, "loss": 0.4946, "step": 26570 }, { "epoch": 34.00416988416988, "grad_norm": 0.025920139625668526, "learning_rate": 3.5092235092235094e-06, "loss": 0.8401, "step": 26580 }, { "epoch": 34.00442728442729, "grad_norm": 97.6776351928711, "learning_rate": 3.5063635063635067e-06, "loss": 0.6201, "step": 26590 }, { "epoch": 34.00468468468468, "grad_norm": 0.0005312726716510952, "learning_rate": 3.5035035035035036e-06, "loss": 0.0005, "step": 26600 }, { "epoch": 34.00494208494209, "grad_norm": 0.012904675677418709, "learning_rate": 3.500643500643501e-06, "loss": 0.0123, "step": 26610 }, { "epoch": 34.00519948519948, "grad_norm": 0.006391404662281275, "learning_rate": 3.497783497783498e-06, "loss": 0.0015, "step": 26620 }, { "epoch": 34.00545688545689, "grad_norm": 0.009654326364398003, "learning_rate": 3.494923494923495e-06, "loss": 0.0014, "step": 26630 }, { "epoch": 34.005714285714284, "grad_norm": 0.002458939328789711, "learning_rate": 3.492063492063492e-06, "loss": 0.5645, "step": 26640 }, { "epoch": 34.00597168597169, "grad_norm": 0.05094308778643608, "learning_rate": 3.4892034892034893e-06, "loss": 0.0003, "step": 26650 }, { "epoch": 34.006229086229084, "grad_norm": 0.01629532128572464, "learning_rate": 3.4863434863434862e-06, "loss": 0.0007, "step": 26660 }, { "epoch": 34.00648648648649, "grad_norm": 0.001868433435447514, "learning_rate": 3.4834834834834835e-06, "loss": 0.001, "step": 26670 }, { "epoch": 34.006743886743884, "grad_norm": 0.024310791864991188, "learning_rate": 3.480623480623481e-06, "loss": 0.0005, "step": 26680 }, { "epoch": 34.00700128700129, "grad_norm": 0.0002190342784160748, "learning_rate": 3.4777634777634777e-06, "loss": 0.7477, "step": 26690 }, { "epoch": 34.007258687258684, "grad_norm": 0.00019727973267436028, "learning_rate": 3.4749034749034755e-06, "loss": 0.7275, "step": 26700 }, { "epoch": 34.00751608751609, "grad_norm": 90.7525634765625, "learning_rate": 3.4720434720434724e-06, "loss": 0.8832, "step": 26710 }, { "epoch": 34.00777348777349, "grad_norm": 476.1946716308594, "learning_rate": 3.4691834691834697e-06, "loss": 0.8877, "step": 26720 }, { "epoch": 34.00803088803089, "grad_norm": 0.012554511427879333, "learning_rate": 3.466323466323467e-06, "loss": 0.0015, "step": 26730 }, { "epoch": 34.00828828828829, "grad_norm": 0.17711642384529114, "learning_rate": 3.463463463463464e-06, "loss": 0.6558, "step": 26740 }, { "epoch": 34.00854568854569, "grad_norm": 0.00020986588788218796, "learning_rate": 3.4606034606034612e-06, "loss": 0.0043, "step": 26750 }, { "epoch": 34.00880308880309, "grad_norm": 0.11533048748970032, "learning_rate": 3.457743457743458e-06, "loss": 0.0004, "step": 26760 }, { "epoch": 34.00906048906049, "grad_norm": 0.0015373900532722473, "learning_rate": 3.4548834548834554e-06, "loss": 0.6428, "step": 26770 }, { "epoch": 34.00931788931789, "grad_norm": 0.0005574403912760317, "learning_rate": 3.4520234520234523e-06, "loss": 0.0005, "step": 26780 }, { "epoch": 34.00957528957529, "grad_norm": 0.006771887186914682, "learning_rate": 3.4491634491634496e-06, "loss": 0.0007, "step": 26790 }, { "epoch": 34.00983268983269, "grad_norm": 0.018115438520908356, "learning_rate": 3.4463034463034465e-06, "loss": 0.4471, "step": 26800 }, { "epoch": 34.01009009009009, "grad_norm": 0.0823565125465393, "learning_rate": 3.443443443443444e-06, "loss": 0.5876, "step": 26810 }, { "epoch": 34.01034749034749, "grad_norm": 0.01303765270859003, "learning_rate": 3.440583440583441e-06, "loss": 1.1356, "step": 26820 }, { "epoch": 34.01060489060489, "grad_norm": 0.0010142725659534335, "learning_rate": 3.437723437723438e-06, "loss": 0.0068, "step": 26830 }, { "epoch": 34.01086229086229, "grad_norm": 0.09757779538631439, "learning_rate": 3.4348634348634354e-06, "loss": 1.2388, "step": 26840 }, { "epoch": 34.01111969111969, "grad_norm": 0.053004637360572815, "learning_rate": 3.4320034320034323e-06, "loss": 0.0015, "step": 26850 }, { "epoch": 34.01137709137709, "grad_norm": 0.11356706917285919, "learning_rate": 3.4291434291434296e-06, "loss": 0.004, "step": 26860 }, { "epoch": 34.01163449163449, "grad_norm": 0.10561950504779816, "learning_rate": 3.4262834262834265e-06, "loss": 0.0004, "step": 26870 }, { "epoch": 34.01189189189189, "grad_norm": 0.0003318323288112879, "learning_rate": 3.423423423423424e-06, "loss": 0.0005, "step": 26880 }, { "epoch": 34.01214929214929, "grad_norm": 0.0002169922081520781, "learning_rate": 3.4205634205634207e-06, "loss": 0.0003, "step": 26890 }, { "epoch": 34.01240669240669, "grad_norm": 0.07491030544042587, "learning_rate": 3.417703417703418e-06, "loss": 0.0004, "step": 26900 }, { "epoch": 34.012664092664096, "grad_norm": 0.6812867522239685, "learning_rate": 3.414843414843415e-06, "loss": 0.0472, "step": 26910 }, { "epoch": 34.01292149292149, "grad_norm": 0.08314365893602371, "learning_rate": 3.411983411983412e-06, "loss": 0.0003, "step": 26920 }, { "epoch": 34.013178893178896, "grad_norm": 0.05190544202923775, "learning_rate": 3.4091234091234095e-06, "loss": 0.0892, "step": 26930 }, { "epoch": 34.01343629343629, "grad_norm": 0.03272266685962677, "learning_rate": 3.4062634062634064e-06, "loss": 0.0001, "step": 26940 }, { "epoch": 34.013693693693696, "grad_norm": 0.039729151874780655, "learning_rate": 3.4034034034034037e-06, "loss": 0.6702, "step": 26950 }, { "epoch": 34.01395109395109, "grad_norm": 0.00022988698037806898, "learning_rate": 3.4005434005434006e-06, "loss": 1.3983, "step": 26960 }, { "epoch": 34.014208494208496, "grad_norm": 0.005083046387881041, "learning_rate": 3.397683397683398e-06, "loss": 0.0006, "step": 26970 }, { "epoch": 34.01446589446589, "grad_norm": 271.5005187988281, "learning_rate": 3.394823394823395e-06, "loss": 1.3513, "step": 26980 }, { "epoch": 34.0147232947233, "grad_norm": 109.3269271850586, "learning_rate": 3.391963391963392e-06, "loss": 0.5311, "step": 26990 }, { "epoch": 34.01498069498069, "grad_norm": 0.0002787598641589284, "learning_rate": 3.389103389103389e-06, "loss": 0.021, "step": 27000 }, { "epoch": 34.0152380952381, "grad_norm": 0.00025051116244867444, "learning_rate": 3.3862433862433864e-06, "loss": 0.0021, "step": 27010 }, { "epoch": 34.01549549549549, "grad_norm": 0.0006493125110864639, "learning_rate": 3.3833833833833833e-06, "loss": 0.4824, "step": 27020 }, { "epoch": 34.0157528957529, "grad_norm": 0.0007675174274481833, "learning_rate": 3.3805233805233806e-06, "loss": 0.1143, "step": 27030 }, { "epoch": 34.01601029601029, "grad_norm": 0.1199033185839653, "learning_rate": 3.377663377663378e-06, "loss": 0.0004, "step": 27040 }, { "epoch": 34.0162676962677, "grad_norm": 0.0002853220503311604, "learning_rate": 3.3748033748033748e-06, "loss": 0.0003, "step": 27050 }, { "epoch": 34.01652509652509, "grad_norm": 0.08917862176895142, "learning_rate": 3.371943371943372e-06, "loss": 0.0004, "step": 27060 }, { "epoch": 34.0167824967825, "grad_norm": 0.0002305887028342113, "learning_rate": 3.369083369083369e-06, "loss": 0.6585, "step": 27070 }, { "epoch": 34.01703989703989, "grad_norm": 0.04280995950102806, "learning_rate": 3.3662233662233667e-06, "loss": 0.2125, "step": 27080 }, { "epoch": 34.0172972972973, "grad_norm": 0.005567069165408611, "learning_rate": 3.363363363363364e-06, "loss": 0.5481, "step": 27090 }, { "epoch": 34.0175546975547, "grad_norm": 0.017813654616475105, "learning_rate": 3.360503360503361e-06, "loss": 0.0007, "step": 27100 }, { "epoch": 34.0178120978121, "grad_norm": 92.60106658935547, "learning_rate": 3.3576433576433583e-06, "loss": 0.7287, "step": 27110 }, { "epoch": 34.0180694980695, "grad_norm": 0.10312855243682861, "learning_rate": 3.354783354783355e-06, "loss": 0.001, "step": 27120 }, { "epoch": 34.0183268983269, "grad_norm": 0.05467154458165169, "learning_rate": 3.3519233519233525e-06, "loss": 0.001, "step": 27130 }, { "epoch": 34.0185842985843, "grad_norm": 0.10436591506004333, "learning_rate": 3.3490633490633494e-06, "loss": 1.2953, "step": 27140 }, { "epoch": 34.0188416988417, "grad_norm": 0.06830277293920517, "learning_rate": 3.3462033462033467e-06, "loss": 0.0005, "step": 27150 }, { "epoch": 34.0190990990991, "grad_norm": 0.029437022283673286, "learning_rate": 3.3433433433433436e-06, "loss": 0.0002, "step": 27160 }, { "epoch": 34.0193564993565, "grad_norm": 2062.808837890625, "learning_rate": 3.340483340483341e-06, "loss": 0.4116, "step": 27170 }, { "epoch": 34.0196138996139, "grad_norm": 0.00020808493718504906, "learning_rate": 3.337623337623338e-06, "loss": 0.081, "step": 27180 }, { "epoch": 34.0198712998713, "grad_norm": 0.00027362650143913925, "learning_rate": 3.334763334763335e-06, "loss": 0.8758, "step": 27190 }, { "epoch": 34.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.7715871930122375, "eval_runtime": 13.3796, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "step": 27195 }, { "epoch": 35.0001287001287, "grad_norm": 0.00017348758410662413, "learning_rate": 3.3319033319033324e-06, "loss": 1.0786, "step": 27200 }, { "epoch": 35.0003861003861, "grad_norm": 0.0002206278295489028, "learning_rate": 3.3290433290433293e-06, "loss": 0.6193, "step": 27210 }, { "epoch": 35.0006435006435, "grad_norm": 0.0069132959470152855, "learning_rate": 3.3261833261833266e-06, "loss": 0.5806, "step": 27220 }, { "epoch": 35.0009009009009, "grad_norm": 0.3105623424053192, "learning_rate": 3.3233233233233235e-06, "loss": 0.4066, "step": 27230 }, { "epoch": 35.0011583011583, "grad_norm": 0.00029745243955403566, "learning_rate": 3.320463320463321e-06, "loss": 0.0074, "step": 27240 }, { "epoch": 35.0014157014157, "grad_norm": 0.005316603928804398, "learning_rate": 3.3176033176033177e-06, "loss": 0.1438, "step": 27250 }, { "epoch": 35.0016731016731, "grad_norm": 0.00018269273277837783, "learning_rate": 3.314743314743315e-06, "loss": 0.0002, "step": 27260 }, { "epoch": 35.0019305019305, "grad_norm": 0.0015236226608976722, "learning_rate": 3.311883311883312e-06, "loss": 0.5008, "step": 27270 }, { "epoch": 35.0021879021879, "grad_norm": 0.021271970123052597, "learning_rate": 3.3090233090233092e-06, "loss": 0.0003, "step": 27280 }, { "epoch": 35.0024453024453, "grad_norm": 0.0007113626343198121, "learning_rate": 3.3061633061633066e-06, "loss": 1.2656, "step": 27290 }, { "epoch": 35.002702702702706, "grad_norm": 0.3437110185623169, "learning_rate": 3.3033033033033035e-06, "loss": 0.0008, "step": 27300 }, { "epoch": 35.0029601029601, "grad_norm": 0.042603738605976105, "learning_rate": 3.3004433004433008e-06, "loss": 0.7157, "step": 27310 }, { "epoch": 35.003217503217506, "grad_norm": 0.0068983961828053, "learning_rate": 3.2975832975832977e-06, "loss": 0.5931, "step": 27320 }, { "epoch": 35.0034749034749, "grad_norm": 0.023130234330892563, "learning_rate": 3.294723294723295e-06, "loss": 0.0004, "step": 27330 }, { "epoch": 35.003732303732306, "grad_norm": 0.0014327826211228967, "learning_rate": 3.291863291863292e-06, "loss": 0.4843, "step": 27340 }, { "epoch": 35.0039897039897, "grad_norm": 0.0004222426505293697, "learning_rate": 3.289003289003289e-06, "loss": 0.5337, "step": 27350 }, { "epoch": 35.004247104247106, "grad_norm": 0.0003112400881946087, "learning_rate": 3.286143286143286e-06, "loss": 0.6144, "step": 27360 }, { "epoch": 35.0045045045045, "grad_norm": 0.00023315486032515764, "learning_rate": 3.2832832832832834e-06, "loss": 0.3117, "step": 27370 }, { "epoch": 35.00476190476191, "grad_norm": 0.01298606488853693, "learning_rate": 3.2804232804232807e-06, "loss": 0.2838, "step": 27380 }, { "epoch": 35.0050193050193, "grad_norm": 161.7285919189453, "learning_rate": 3.2775632775632776e-06, "loss": 0.5173, "step": 27390 }, { "epoch": 35.00527670527671, "grad_norm": 0.0003556629817467183, "learning_rate": 3.274703274703275e-06, "loss": 0.6745, "step": 27400 }, { "epoch": 35.0055341055341, "grad_norm": 0.003990973811596632, "learning_rate": 3.271843271843272e-06, "loss": 0.0003, "step": 27410 }, { "epoch": 35.00579150579151, "grad_norm": 0.017456263303756714, "learning_rate": 3.268983268983269e-06, "loss": 0.0003, "step": 27420 }, { "epoch": 35.0060489060489, "grad_norm": 377.7240905761719, "learning_rate": 3.266123266123266e-06, "loss": 0.0187, "step": 27430 }, { "epoch": 35.00630630630631, "grad_norm": 0.051734283566474915, "learning_rate": 3.2632632632632633e-06, "loss": 0.0003, "step": 27440 }, { "epoch": 35.0065637065637, "grad_norm": 0.053064629435539246, "learning_rate": 3.260403260403261e-06, "loss": 0.0024, "step": 27450 }, { "epoch": 35.00682110682111, "grad_norm": 0.16284862160682678, "learning_rate": 3.257543257543258e-06, "loss": 0.9852, "step": 27460 }, { "epoch": 35.0070785070785, "grad_norm": 0.0042984443716704845, "learning_rate": 3.2546832546832553e-06, "loss": 0.3026, "step": 27470 }, { "epoch": 35.00733590733591, "grad_norm": 0.00021199326147325337, "learning_rate": 3.251823251823252e-06, "loss": 0.3458, "step": 27480 }, { "epoch": 35.00759330759331, "grad_norm": 1034.568115234375, "learning_rate": 3.2489632489632495e-06, "loss": 0.381, "step": 27490 }, { "epoch": 35.00785070785071, "grad_norm": 0.12477569282054901, "learning_rate": 3.2461032461032464e-06, "loss": 0.0004, "step": 27500 }, { "epoch": 35.00810810810811, "grad_norm": 0.042156290262937546, "learning_rate": 3.2432432432432437e-06, "loss": 0.8273, "step": 27510 }, { "epoch": 35.00836550836551, "grad_norm": 0.003120782785117626, "learning_rate": 3.240383240383241e-06, "loss": 0.0007, "step": 27520 }, { "epoch": 35.00862290862291, "grad_norm": 131.19398498535156, "learning_rate": 3.237523237523238e-06, "loss": 0.5689, "step": 27530 }, { "epoch": 35.00888030888031, "grad_norm": 0.00033146803616546094, "learning_rate": 3.2346632346632352e-06, "loss": 0.0002, "step": 27540 }, { "epoch": 35.00913770913771, "grad_norm": 152.272216796875, "learning_rate": 3.231803231803232e-06, "loss": 0.6175, "step": 27550 }, { "epoch": 35.00939510939511, "grad_norm": 0.016422213986516, "learning_rate": 3.2289432289432294e-06, "loss": 0.6991, "step": 27560 }, { "epoch": 35.00965250965251, "grad_norm": 0.02891531027853489, "learning_rate": 3.2260832260832263e-06, "loss": 0.0145, "step": 27570 }, { "epoch": 35.00990990990991, "grad_norm": 0.034770023077726364, "learning_rate": 3.2232232232232236e-06, "loss": 1.3164, "step": 27580 }, { "epoch": 35.01016731016731, "grad_norm": 0.2806760370731354, "learning_rate": 3.2203632203632205e-06, "loss": 0.1784, "step": 27590 }, { "epoch": 35.01042471042471, "grad_norm": 58.18006896972656, "learning_rate": 3.217503217503218e-06, "loss": 1.1186, "step": 27600 }, { "epoch": 35.01068211068211, "grad_norm": 0.025718385353684425, "learning_rate": 3.2146432146432147e-06, "loss": 0.0003, "step": 27610 }, { "epoch": 35.01093951093951, "grad_norm": 0.0030768548604100943, "learning_rate": 3.211783211783212e-06, "loss": 0.0002, "step": 27620 }, { "epoch": 35.01119691119691, "grad_norm": 0.010804054327309132, "learning_rate": 3.2089232089232094e-06, "loss": 0.7885, "step": 27630 }, { "epoch": 35.01145431145431, "grad_norm": 250.21238708496094, "learning_rate": 3.2060632060632063e-06, "loss": 0.4832, "step": 27640 }, { "epoch": 35.01171171171171, "grad_norm": 57.603599548339844, "learning_rate": 3.2032032032032036e-06, "loss": 1.1185, "step": 27650 }, { "epoch": 35.011969111969115, "grad_norm": 0.0002042093692580238, "learning_rate": 3.2003432003432005e-06, "loss": 0.6971, "step": 27660 }, { "epoch": 35.01222651222651, "grad_norm": 584.781982421875, "learning_rate": 3.197483197483198e-06, "loss": 0.0292, "step": 27670 }, { "epoch": 35.012483912483916, "grad_norm": 0.11098294705152512, "learning_rate": 3.1946231946231947e-06, "loss": 0.0022, "step": 27680 }, { "epoch": 35.01274131274131, "grad_norm": 0.0507424995303154, "learning_rate": 3.191763191763192e-06, "loss": 0.0008, "step": 27690 }, { "epoch": 35.012998712998716, "grad_norm": 0.00076050974894315, "learning_rate": 3.188903188903189e-06, "loss": 0.5183, "step": 27700 }, { "epoch": 35.01325611325611, "grad_norm": 0.00023733789566904306, "learning_rate": 3.1860431860431862e-06, "loss": 0.0005, "step": 27710 }, { "epoch": 35.013513513513516, "grad_norm": 0.0004732540692202747, "learning_rate": 3.183183183183183e-06, "loss": 0.0005, "step": 27720 }, { "epoch": 35.01377091377091, "grad_norm": 0.0001804508501663804, "learning_rate": 3.1803231803231804e-06, "loss": 0.0007, "step": 27730 }, { "epoch": 35.014028314028316, "grad_norm": 0.1139734536409378, "learning_rate": 3.1774631774631777e-06, "loss": 0.0011, "step": 27740 }, { "epoch": 35.01428571428571, "grad_norm": 0.15720994770526886, "learning_rate": 3.1746031746031746e-06, "loss": 0.0006, "step": 27750 }, { "epoch": 35.014543114543116, "grad_norm": 0.005728594027459621, "learning_rate": 3.171743171743172e-06, "loss": 0.2534, "step": 27760 }, { "epoch": 35.01480051480051, "grad_norm": 0.00016303425945807248, "learning_rate": 3.168883168883169e-06, "loss": 0.4451, "step": 27770 }, { "epoch": 35.015057915057916, "grad_norm": 0.07668960839509964, "learning_rate": 3.166023166023166e-06, "loss": 0.0004, "step": 27780 }, { "epoch": 35.01531531531531, "grad_norm": 0.0008509852923452854, "learning_rate": 3.163163163163163e-06, "loss": 1.3581, "step": 27790 }, { "epoch": 35.015572715572716, "grad_norm": 0.3970625400543213, "learning_rate": 3.1603031603031604e-06, "loss": 0.0006, "step": 27800 }, { "epoch": 35.01583011583011, "grad_norm": 0.49919551610946655, "learning_rate": 3.1574431574431573e-06, "loss": 0.0007, "step": 27810 }, { "epoch": 35.01608751608752, "grad_norm": 0.4793255627155304, "learning_rate": 3.154583154583155e-06, "loss": 0.0008, "step": 27820 }, { "epoch": 35.01634491634491, "grad_norm": 0.12254327535629272, "learning_rate": 3.1517231517231523e-06, "loss": 0.4944, "step": 27830 }, { "epoch": 35.01660231660232, "grad_norm": 0.051711589097976685, "learning_rate": 3.148863148863149e-06, "loss": 0.0002, "step": 27840 }, { "epoch": 35.01685971685972, "grad_norm": 0.00018205131345894188, "learning_rate": 3.1460031460031465e-06, "loss": 0.0002, "step": 27850 }, { "epoch": 35.01711711711712, "grad_norm": 0.00019165266712661833, "learning_rate": 3.1431431431431434e-06, "loss": 0.8208, "step": 27860 }, { "epoch": 35.01737451737452, "grad_norm": 0.0013323579914867878, "learning_rate": 3.1402831402831407e-06, "loss": 0.0122, "step": 27870 }, { "epoch": 35.01763191763192, "grad_norm": 0.00027685629902407527, "learning_rate": 3.137423137423138e-06, "loss": 0.0122, "step": 27880 }, { "epoch": 35.01788931788932, "grad_norm": 0.01839580573141575, "learning_rate": 3.134563134563135e-06, "loss": 0.6481, "step": 27890 }, { "epoch": 35.01814671814672, "grad_norm": 0.0037593143060803413, "learning_rate": 3.1317031317031323e-06, "loss": 0.0001, "step": 27900 }, { "epoch": 35.01840411840412, "grad_norm": 1.110424518585205, "learning_rate": 3.128843128843129e-06, "loss": 0.5803, "step": 27910 }, { "epoch": 35.01866151866152, "grad_norm": 0.0064668068662285805, "learning_rate": 3.1259831259831265e-06, "loss": 0.0005, "step": 27920 }, { "epoch": 35.01891891891892, "grad_norm": 0.00647960789501667, "learning_rate": 3.1231231231231234e-06, "loss": 0.0038, "step": 27930 }, { "epoch": 35.01917631917632, "grad_norm": 0.03522304818034172, "learning_rate": 3.1202631202631207e-06, "loss": 0.0729, "step": 27940 }, { "epoch": 35.01943371943372, "grad_norm": 1.5073069334030151, "learning_rate": 3.1174031174031176e-06, "loss": 0.6523, "step": 27950 }, { "epoch": 35.01969111969112, "grad_norm": 114.54082489013672, "learning_rate": 3.114543114543115e-06, "loss": 1.3528, "step": 27960 }, { "epoch": 35.01994851994852, "grad_norm": 0.0004169368767179549, "learning_rate": 3.1116831116831118e-06, "loss": 0.5316, "step": 27970 }, { "epoch": 35.02, "eval_accuracy": 0.8478260869565217, "eval_loss": 0.917574405670166, "eval_runtime": 14.3651, "eval_samples_per_second": 3.202, "eval_steps_per_second": 3.202, "step": 27972 }, { "epoch": 36.00020592020592, "grad_norm": 0.00026314210845157504, "learning_rate": 3.108823108823109e-06, "loss": 0.7594, "step": 27980 }, { "epoch": 36.00046332046332, "grad_norm": 0.06673434376716614, "learning_rate": 3.1059631059631064e-06, "loss": 0.0005, "step": 27990 }, { "epoch": 36.00072072072072, "grad_norm": 0.0002622131141833961, "learning_rate": 3.1031031031031033e-06, "loss": 0.5207, "step": 28000 }, { "epoch": 36.00097812097812, "grad_norm": 0.007425097282975912, "learning_rate": 3.1002431002431006e-06, "loss": 0.1283, "step": 28010 }, { "epoch": 36.00123552123552, "grad_norm": 678.264404296875, "learning_rate": 3.0973830973830975e-06, "loss": 0.2884, "step": 28020 }, { "epoch": 36.00149292149292, "grad_norm": 0.00021443287550937384, "learning_rate": 3.094523094523095e-06, "loss": 0.0003, "step": 28030 }, { "epoch": 36.00175032175032, "grad_norm": 0.02657965198159218, "learning_rate": 3.0916630916630917e-06, "loss": 0.0007, "step": 28040 }, { "epoch": 36.002007722007725, "grad_norm": 0.08193910866975784, "learning_rate": 3.088803088803089e-06, "loss": 0.0443, "step": 28050 }, { "epoch": 36.00226512226512, "grad_norm": 0.13849781453609467, "learning_rate": 3.085943085943086e-06, "loss": 0.0004, "step": 28060 }, { "epoch": 36.002522522522526, "grad_norm": 0.002490939339622855, "learning_rate": 3.0830830830830832e-06, "loss": 0.001, "step": 28070 }, { "epoch": 36.00277992277992, "grad_norm": 0.020132167264819145, "learning_rate": 3.0802230802230806e-06, "loss": 0.0011, "step": 28080 }, { "epoch": 36.003037323037326, "grad_norm": 0.0001936969201778993, "learning_rate": 3.0773630773630775e-06, "loss": 0.6241, "step": 28090 }, { "epoch": 36.00329472329472, "grad_norm": 0.00044216442620381713, "learning_rate": 3.0745030745030748e-06, "loss": 0.4918, "step": 28100 }, { "epoch": 36.003552123552126, "grad_norm": 0.0017083561979234219, "learning_rate": 3.0716430716430717e-06, "loss": 0.1778, "step": 28110 }, { "epoch": 36.00380952380952, "grad_norm": 0.01577197201550007, "learning_rate": 3.068783068783069e-06, "loss": 0.0003, "step": 28120 }, { "epoch": 36.004066924066926, "grad_norm": 0.03330492228269577, "learning_rate": 3.065923065923066e-06, "loss": 0.0003, "step": 28130 }, { "epoch": 36.00432432432432, "grad_norm": 0.009137704968452454, "learning_rate": 3.063063063063063e-06, "loss": 0.5943, "step": 28140 }, { "epoch": 36.004581724581726, "grad_norm": 1.6320407390594482, "learning_rate": 3.06020306020306e-06, "loss": 1.329, "step": 28150 }, { "epoch": 36.00483912483912, "grad_norm": 0.001672191545367241, "learning_rate": 3.0573430573430574e-06, "loss": 0.8598, "step": 28160 }, { "epoch": 36.005096525096526, "grad_norm": 0.0024032602086663246, "learning_rate": 3.0544830544830543e-06, "loss": 0.001, "step": 28170 }, { "epoch": 36.00535392535392, "grad_norm": 720.0140380859375, "learning_rate": 3.0516230516230516e-06, "loss": 0.0433, "step": 28180 }, { "epoch": 36.005611325611326, "grad_norm": 0.01528639905154705, "learning_rate": 3.048763048763049e-06, "loss": 0.1401, "step": 28190 }, { "epoch": 36.00586872586872, "grad_norm": 0.12506289780139923, "learning_rate": 3.0459030459030462e-06, "loss": 0.0004, "step": 28200 }, { "epoch": 36.00612612612613, "grad_norm": 0.012107270769774914, "learning_rate": 3.0430430430430436e-06, "loss": 0.5241, "step": 28210 }, { "epoch": 36.00638352638352, "grad_norm": 0.003177333390340209, "learning_rate": 3.040183040183041e-06, "loss": 0.7474, "step": 28220 }, { "epoch": 36.00664092664093, "grad_norm": 0.00022184972476679832, "learning_rate": 3.0373230373230378e-06, "loss": 0.143, "step": 28230 }, { "epoch": 36.00689832689833, "grad_norm": 0.022349726408720016, "learning_rate": 3.034463034463035e-06, "loss": 0.6655, "step": 28240 }, { "epoch": 36.00715572715573, "grad_norm": 0.0002947472967207432, "learning_rate": 3.031603031603032e-06, "loss": 0.0025, "step": 28250 }, { "epoch": 36.00741312741313, "grad_norm": 0.0007760194130241871, "learning_rate": 3.0287430287430293e-06, "loss": 0.0004, "step": 28260 }, { "epoch": 36.00767052767053, "grad_norm": 116.30671691894531, "learning_rate": 3.025883025883026e-06, "loss": 0.7956, "step": 28270 }, { "epoch": 36.00792792792793, "grad_norm": 0.00033408516901545227, "learning_rate": 3.0230230230230235e-06, "loss": 0.0, "step": 28280 }, { "epoch": 36.00818532818533, "grad_norm": 0.0354166217148304, "learning_rate": 3.0201630201630204e-06, "loss": 0.0003, "step": 28290 }, { "epoch": 36.00844272844273, "grad_norm": 0.0003061169118154794, "learning_rate": 3.0173030173030177e-06, "loss": 0.0001, "step": 28300 }, { "epoch": 36.00870012870013, "grad_norm": 0.003866319777444005, "learning_rate": 3.0144430144430146e-06, "loss": 1.8578, "step": 28310 }, { "epoch": 36.00895752895753, "grad_norm": 0.0002617123245727271, "learning_rate": 3.011583011583012e-06, "loss": 0.6685, "step": 28320 }, { "epoch": 36.00921492921493, "grad_norm": 0.00019824574701488018, "learning_rate": 3.0087230087230092e-06, "loss": 0.0003, "step": 28330 }, { "epoch": 36.00947232947233, "grad_norm": 0.00023168852203525603, "learning_rate": 3.005863005863006e-06, "loss": 1.0579, "step": 28340 }, { "epoch": 36.00972972972973, "grad_norm": 0.11857693642377853, "learning_rate": 3.0030030030030034e-06, "loss": 0.0007, "step": 28350 }, { "epoch": 36.00998712998713, "grad_norm": 0.039131905883550644, "learning_rate": 3.0001430001430003e-06, "loss": 0.0009, "step": 28360 }, { "epoch": 36.01024453024453, "grad_norm": 0.009820118546485901, "learning_rate": 2.9972829972829977e-06, "loss": 0.8717, "step": 28370 }, { "epoch": 36.01050193050193, "grad_norm": 0.00021176527661737055, "learning_rate": 2.9944229944229945e-06, "loss": 0.0004, "step": 28380 }, { "epoch": 36.01075933075933, "grad_norm": 0.18563033640384674, "learning_rate": 2.991562991562992e-06, "loss": 1.3191, "step": 28390 }, { "epoch": 36.01101673101673, "grad_norm": 0.00015481845184694976, "learning_rate": 2.9887029887029888e-06, "loss": 1.5741, "step": 28400 }, { "epoch": 36.01127413127413, "grad_norm": 0.0009244285756722093, "learning_rate": 2.985842985842986e-06, "loss": 0.0011, "step": 28410 }, { "epoch": 36.01153153153153, "grad_norm": 0.08380212634801865, "learning_rate": 2.982982982982983e-06, "loss": 0.0014, "step": 28420 }, { "epoch": 36.011788931788935, "grad_norm": 0.01747632957994938, "learning_rate": 2.9801229801229803e-06, "loss": 0.036, "step": 28430 }, { "epoch": 36.01204633204633, "grad_norm": 143.46397399902344, "learning_rate": 2.9772629772629776e-06, "loss": 0.7545, "step": 28440 }, { "epoch": 36.012303732303735, "grad_norm": 0.00015246524708345532, "learning_rate": 2.9744029744029745e-06, "loss": 0.0012, "step": 28450 }, { "epoch": 36.01256113256113, "grad_norm": 0.2742994725704193, "learning_rate": 2.971542971542972e-06, "loss": 0.0007, "step": 28460 }, { "epoch": 36.012818532818535, "grad_norm": 92.01673126220703, "learning_rate": 2.9686829686829687e-06, "loss": 1.2388, "step": 28470 }, { "epoch": 36.01307593307593, "grad_norm": 0.010131720453500748, "learning_rate": 2.965822965822966e-06, "loss": 0.0005, "step": 28480 }, { "epoch": 36.013333333333335, "grad_norm": 156.0325164794922, "learning_rate": 2.962962962962963e-06, "loss": 1.373, "step": 28490 }, { "epoch": 36.01359073359073, "grad_norm": 0.016434205695986748, "learning_rate": 2.9601029601029602e-06, "loss": 0.0017, "step": 28500 }, { "epoch": 36.013848133848136, "grad_norm": 0.0648985505104065, "learning_rate": 2.957242957242957e-06, "loss": 0.6322, "step": 28510 }, { "epoch": 36.01410553410553, "grad_norm": 0.18576735258102417, "learning_rate": 2.9543829543829544e-06, "loss": 0.0007, "step": 28520 }, { "epoch": 36.014362934362936, "grad_norm": 0.014449382200837135, "learning_rate": 2.9515229515229517e-06, "loss": 1.0034, "step": 28530 }, { "epoch": 36.01462033462033, "grad_norm": 0.0007821593899279833, "learning_rate": 2.9486629486629486e-06, "loss": 0.5917, "step": 28540 }, { "epoch": 36.014877734877736, "grad_norm": 0.00015635178715456277, "learning_rate": 2.945802945802946e-06, "loss": 1.1249, "step": 28550 }, { "epoch": 36.01513513513513, "grad_norm": 0.10193686932325363, "learning_rate": 2.942942942942943e-06, "loss": 0.9982, "step": 28560 }, { "epoch": 36.015392535392536, "grad_norm": 0.07530353218317032, "learning_rate": 2.9400829400829406e-06, "loss": 0.0005, "step": 28570 }, { "epoch": 36.01564993564993, "grad_norm": 0.026423171162605286, "learning_rate": 2.937222937222938e-06, "loss": 0.0012, "step": 28580 }, { "epoch": 36.015907335907336, "grad_norm": 0.0017025470733642578, "learning_rate": 2.934362934362935e-06, "loss": 0.0004, "step": 28590 }, { "epoch": 36.01616473616474, "grad_norm": 0.00017218876746483147, "learning_rate": 2.931502931502932e-06, "loss": 0.0006, "step": 28600 }, { "epoch": 36.016422136422136, "grad_norm": 0.00020879317889921367, "learning_rate": 2.928642928642929e-06, "loss": 0.0002, "step": 28610 }, { "epoch": 36.01667953667954, "grad_norm": 0.00023135027731768787, "learning_rate": 2.9257829257829263e-06, "loss": 0.5356, "step": 28620 }, { "epoch": 36.016936936936936, "grad_norm": 0.0003467734786681831, "learning_rate": 2.9229229229229232e-06, "loss": 0.0003, "step": 28630 }, { "epoch": 36.01719433719434, "grad_norm": 0.0012130647664889693, "learning_rate": 2.9200629200629205e-06, "loss": 0.6772, "step": 28640 }, { "epoch": 36.01745173745174, "grad_norm": 0.00030612791306339204, "learning_rate": 2.9172029172029174e-06, "loss": 0.0006, "step": 28650 }, { "epoch": 36.01770913770914, "grad_norm": 0.023205522447824478, "learning_rate": 2.9143429143429147e-06, "loss": 0.0006, "step": 28660 }, { "epoch": 36.01796653796654, "grad_norm": 0.017860963940620422, "learning_rate": 2.9114829114829116e-06, "loss": 0.303, "step": 28670 }, { "epoch": 36.01822393822394, "grad_norm": 0.037889543920755386, "learning_rate": 2.908622908622909e-06, "loss": 0.9838, "step": 28680 }, { "epoch": 36.01848133848134, "grad_norm": 0.061780329793691635, "learning_rate": 2.9057629057629063e-06, "loss": 0.0001, "step": 28690 }, { "epoch": 36.01873873873874, "grad_norm": 0.0001773426338331774, "learning_rate": 2.902902902902903e-06, "loss": 0.0002, "step": 28700 }, { "epoch": 36.01899613899614, "grad_norm": 0.09331512451171875, "learning_rate": 2.9000429000429005e-06, "loss": 0.0009, "step": 28710 }, { "epoch": 36.01925353925354, "grad_norm": 0.00026550382608547807, "learning_rate": 2.8971828971828974e-06, "loss": 0.0003, "step": 28720 }, { "epoch": 36.01951093951094, "grad_norm": 0.01748647913336754, "learning_rate": 2.8943228943228947e-06, "loss": 0.0004, "step": 28730 }, { "epoch": 36.01976833976834, "grad_norm": 0.0793762281537056, "learning_rate": 2.8914628914628916e-06, "loss": 0.0003, "step": 28740 }, { "epoch": 36.02, "eval_accuracy": 0.8478260869565217, "eval_loss": 1.074432373046875, "eval_runtime": 13.3897, "eval_samples_per_second": 3.435, "eval_steps_per_second": 3.435, "step": 28749 }, { "epoch": 37.00002574002574, "grad_norm": 0.017261644825339317, "learning_rate": 2.888602888602889e-06, "loss": 0.3016, "step": 28750 }, { "epoch": 37.00028314028314, "grad_norm": 0.00018201515194959939, "learning_rate": 2.8857428857428858e-06, "loss": 0.0005, "step": 28760 }, { "epoch": 37.00054054054054, "grad_norm": 0.09265059977769852, "learning_rate": 2.882882882882883e-06, "loss": 0.0004, "step": 28770 }, { "epoch": 37.00079794079794, "grad_norm": 0.0002512499631848186, "learning_rate": 2.8800228800228804e-06, "loss": 0.6045, "step": 28780 }, { "epoch": 37.00105534105534, "grad_norm": 0.021960824728012085, "learning_rate": 2.8771628771628773e-06, "loss": 0.4089, "step": 28790 }, { "epoch": 37.00131274131274, "grad_norm": 0.00020900735398754478, "learning_rate": 2.8743028743028746e-06, "loss": 0.5131, "step": 28800 }, { "epoch": 37.00157014157014, "grad_norm": 0.00446329265832901, "learning_rate": 2.8714428714428715e-06, "loss": 0.0002, "step": 28810 }, { "epoch": 37.001827541827545, "grad_norm": 0.009537389501929283, "learning_rate": 2.868582868582869e-06, "loss": 0.0019, "step": 28820 }, { "epoch": 37.00208494208494, "grad_norm": 0.05497797206044197, "learning_rate": 2.8657228657228657e-06, "loss": 0.005, "step": 28830 }, { "epoch": 37.002342342342345, "grad_norm": 0.00030364299891516566, "learning_rate": 2.862862862862863e-06, "loss": 0.6848, "step": 28840 }, { "epoch": 37.00259974259974, "grad_norm": 0.004607612267136574, "learning_rate": 2.86000286000286e-06, "loss": 0.8919, "step": 28850 }, { "epoch": 37.002857142857145, "grad_norm": 0.002171023515984416, "learning_rate": 2.8571428571428573e-06, "loss": 0.5662, "step": 28860 }, { "epoch": 37.00311454311454, "grad_norm": 0.00719066895544529, "learning_rate": 2.854282854282854e-06, "loss": 0.0003, "step": 28870 }, { "epoch": 37.003371943371945, "grad_norm": 0.08742506057024002, "learning_rate": 2.8514228514228515e-06, "loss": 0.7729, "step": 28880 }, { "epoch": 37.00362934362934, "grad_norm": 0.004545649979263544, "learning_rate": 2.8485628485628488e-06, "loss": 0.0001, "step": 28890 }, { "epoch": 37.003886743886746, "grad_norm": 0.00015914670075289905, "learning_rate": 2.8457028457028457e-06, "loss": 0.7993, "step": 28900 }, { "epoch": 37.00414414414414, "grad_norm": 0.052335429936647415, "learning_rate": 2.842842842842843e-06, "loss": 1.0677, "step": 28910 }, { "epoch": 37.004401544401546, "grad_norm": 0.00018764731066767126, "learning_rate": 2.83998283998284e-06, "loss": 0.0002, "step": 28920 }, { "epoch": 37.00465894465894, "grad_norm": 0.00018724115216173232, "learning_rate": 2.837122837122837e-06, "loss": 0.2418, "step": 28930 }, { "epoch": 37.004916344916346, "grad_norm": 0.004428797867149115, "learning_rate": 2.834262834262834e-06, "loss": 0.0005, "step": 28940 }, { "epoch": 37.00517374517374, "grad_norm": 0.004312903620302677, "learning_rate": 2.831402831402832e-06, "loss": 0.7043, "step": 28950 }, { "epoch": 37.005431145431146, "grad_norm": 465.94049072265625, "learning_rate": 2.828542828542829e-06, "loss": 0.0731, "step": 28960 }, { "epoch": 37.00568854568854, "grad_norm": 0.021510232239961624, "learning_rate": 2.825682825682826e-06, "loss": 0.0002, "step": 28970 }, { "epoch": 37.005945945945946, "grad_norm": 0.00018029431521426886, "learning_rate": 2.8228228228228234e-06, "loss": 0.0026, "step": 28980 }, { "epoch": 37.00620334620335, "grad_norm": 0.0017462585819885135, "learning_rate": 2.8199628199628202e-06, "loss": 0.2045, "step": 28990 }, { "epoch": 37.006460746460746, "grad_norm": 0.00019625762070063502, "learning_rate": 2.8171028171028176e-06, "loss": 0.0001, "step": 29000 }, { "epoch": 37.00671814671815, "grad_norm": 0.024823561310768127, "learning_rate": 2.8142428142428145e-06, "loss": 0.0451, "step": 29010 }, { "epoch": 37.006975546975546, "grad_norm": 0.00015227263793349266, "learning_rate": 2.8113828113828118e-06, "loss": 0.106, "step": 29020 }, { "epoch": 37.00723294723295, "grad_norm": 0.00022190021991264075, "learning_rate": 2.808522808522809e-06, "loss": 0.5192, "step": 29030 }, { "epoch": 37.00749034749035, "grad_norm": 0.00021235458552837372, "learning_rate": 2.805662805662806e-06, "loss": 0.8077, "step": 29040 }, { "epoch": 37.00774774774775, "grad_norm": 0.0744086354970932, "learning_rate": 2.8028028028028033e-06, "loss": 0.7212, "step": 29050 }, { "epoch": 37.00800514800515, "grad_norm": 0.028785334900021553, "learning_rate": 2.7999427999428e-06, "loss": 0.022, "step": 29060 }, { "epoch": 37.00826254826255, "grad_norm": 0.0001730113581288606, "learning_rate": 2.7970827970827975e-06, "loss": 0.007, "step": 29070 }, { "epoch": 37.00851994851995, "grad_norm": 0.00028456852305680513, "learning_rate": 2.7942227942227944e-06, "loss": 1.4105, "step": 29080 }, { "epoch": 37.00877734877735, "grad_norm": 0.2067694216966629, "learning_rate": 2.7913627913627917e-06, "loss": 0.0902, "step": 29090 }, { "epoch": 37.00903474903475, "grad_norm": 4.730393409729004, "learning_rate": 2.7885027885027886e-06, "loss": 0.0014, "step": 29100 }, { "epoch": 37.00929214929215, "grad_norm": 280.6280212402344, "learning_rate": 2.785642785642786e-06, "loss": 0.7313, "step": 29110 }, { "epoch": 37.00954954954955, "grad_norm": 0.0012725815176963806, "learning_rate": 2.782782782782783e-06, "loss": 0.0001, "step": 29120 }, { "epoch": 37.00980694980695, "grad_norm": 0.10952453315258026, "learning_rate": 2.77992277992278e-06, "loss": 0.0004, "step": 29130 }, { "epoch": 37.01006435006435, "grad_norm": 0.674946665763855, "learning_rate": 2.7770627770627775e-06, "loss": 0.3738, "step": 29140 }, { "epoch": 37.01032175032175, "grad_norm": 0.006087814457714558, "learning_rate": 2.7742027742027743e-06, "loss": 0.0005, "step": 29150 }, { "epoch": 37.01057915057915, "grad_norm": 0.008630234748125076, "learning_rate": 2.7713427713427717e-06, "loss": 0.0002, "step": 29160 }, { "epoch": 37.01083655083655, "grad_norm": 0.0001840332115534693, "learning_rate": 2.7684827684827686e-06, "loss": 0.0002, "step": 29170 }, { "epoch": 37.011093951093955, "grad_norm": 0.0036154910922050476, "learning_rate": 2.765622765622766e-06, "loss": 0.0005, "step": 29180 }, { "epoch": 37.01135135135135, "grad_norm": 0.02959679253399372, "learning_rate": 2.7627627627627628e-06, "loss": 0.6275, "step": 29190 }, { "epoch": 37.011608751608755, "grad_norm": 0.01243556383997202, "learning_rate": 2.75990275990276e-06, "loss": 0.0002, "step": 29200 }, { "epoch": 37.01186615186615, "grad_norm": 0.07791236788034439, "learning_rate": 2.757042757042757e-06, "loss": 0.8283, "step": 29210 }, { "epoch": 37.012123552123555, "grad_norm": 0.012300877831876278, "learning_rate": 2.7541827541827543e-06, "loss": 0.0005, "step": 29220 }, { "epoch": 37.01238095238095, "grad_norm": 0.0001772117830114439, "learning_rate": 2.7513227513227516e-06, "loss": 0.2855, "step": 29230 }, { "epoch": 37.012638352638355, "grad_norm": 0.010969296097755432, "learning_rate": 2.7484627484627485e-06, "loss": 0.0034, "step": 29240 }, { "epoch": 37.01289575289575, "grad_norm": 0.00019608347793109715, "learning_rate": 2.745602745602746e-06, "loss": 0.7902, "step": 29250 }, { "epoch": 37.013153153153155, "grad_norm": 0.020125016570091248, "learning_rate": 2.7427427427427427e-06, "loss": 0.0003, "step": 29260 }, { "epoch": 37.01341055341055, "grad_norm": 0.0045450483448803425, "learning_rate": 2.73988273988274e-06, "loss": 0.0001, "step": 29270 }, { "epoch": 37.013667953667955, "grad_norm": 0.0002109996130457148, "learning_rate": 2.737022737022737e-06, "loss": 0.0001, "step": 29280 }, { "epoch": 37.01392535392535, "grad_norm": 0.00018298572103958577, "learning_rate": 2.7341627341627342e-06, "loss": 1.0783, "step": 29290 }, { "epoch": 37.014182754182755, "grad_norm": 0.08364742249250412, "learning_rate": 2.731302731302731e-06, "loss": 0.3974, "step": 29300 }, { "epoch": 37.01444015444015, "grad_norm": 0.032810915261507034, "learning_rate": 2.7284427284427284e-06, "loss": 0.2355, "step": 29310 }, { "epoch": 37.014697554697555, "grad_norm": 0.30951929092407227, "learning_rate": 2.725582725582726e-06, "loss": 0.001, "step": 29320 }, { "epoch": 37.01495495495495, "grad_norm": 0.0006139618926681578, "learning_rate": 2.722722722722723e-06, "loss": 0.0841, "step": 29330 }, { "epoch": 37.015212355212356, "grad_norm": 0.034589651972055435, "learning_rate": 2.7198627198627204e-06, "loss": 0.0009, "step": 29340 }, { "epoch": 37.01546975546975, "grad_norm": 0.0001762336032697931, "learning_rate": 2.7170027170027173e-06, "loss": 0.001, "step": 29350 }, { "epoch": 37.015727155727156, "grad_norm": 0.0002292945428052917, "learning_rate": 2.7141427141427146e-06, "loss": 0.1161, "step": 29360 }, { "epoch": 37.01598455598456, "grad_norm": 0.0370081327855587, "learning_rate": 2.7112827112827115e-06, "loss": 0.0004, "step": 29370 }, { "epoch": 37.016241956241956, "grad_norm": 0.04158321022987366, "learning_rate": 2.708422708422709e-06, "loss": 0.0004, "step": 29380 }, { "epoch": 37.01649935649936, "grad_norm": 0.004836615175008774, "learning_rate": 2.705562705562706e-06, "loss": 0.7461, "step": 29390 }, { "epoch": 37.016756756756756, "grad_norm": 0.025562850758433342, "learning_rate": 2.702702702702703e-06, "loss": 0.5474, "step": 29400 }, { "epoch": 37.01701415701416, "grad_norm": 0.00021090406517032534, "learning_rate": 2.6998426998427003e-06, "loss": 0.0001, "step": 29410 }, { "epoch": 37.017271557271556, "grad_norm": 0.00023956765653565526, "learning_rate": 2.6969826969826972e-06, "loss": 0.1954, "step": 29420 }, { "epoch": 37.01752895752896, "grad_norm": 0.0024973272811621428, "learning_rate": 2.6941226941226945e-06, "loss": 0.0784, "step": 29430 }, { "epoch": 37.017786357786356, "grad_norm": 0.03849514201283455, "learning_rate": 2.6912626912626914e-06, "loss": 0.6281, "step": 29440 }, { "epoch": 37.01804375804376, "grad_norm": 0.0002072292409138754, "learning_rate": 2.6884026884026887e-06, "loss": 1.933, "step": 29450 }, { "epoch": 37.018301158301156, "grad_norm": 0.0017715301364660263, "learning_rate": 2.6855426855426856e-06, "loss": 0.0, "step": 29460 }, { "epoch": 37.01855855855856, "grad_norm": 0.00020758021855726838, "learning_rate": 2.682682682682683e-06, "loss": 0.8772, "step": 29470 }, { "epoch": 37.018815958815956, "grad_norm": 0.036694519221782684, "learning_rate": 2.6798226798226803e-06, "loss": 0.0022, "step": 29480 }, { "epoch": 37.01907335907336, "grad_norm": 0.5028038024902344, "learning_rate": 2.676962676962677e-06, "loss": 0.0005, "step": 29490 }, { "epoch": 37.01933075933076, "grad_norm": 0.08280939608812332, "learning_rate": 2.6741026741026745e-06, "loss": 0.7947, "step": 29500 }, { "epoch": 37.01958815958816, "grad_norm": 0.031048625707626343, "learning_rate": 2.6712426712426714e-06, "loss": 0.0004, "step": 29510 }, { "epoch": 37.01984555984556, "grad_norm": 0.08677981793880463, "learning_rate": 2.6683826683826687e-06, "loss": 0.0005, "step": 29520 }, { "epoch": 37.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.9154806733131409, "eval_runtime": 13.414, "eval_samples_per_second": 3.429, "eval_steps_per_second": 3.429, "step": 29526 }, { "epoch": 38.00010296010296, "grad_norm": 57.652462005615234, "learning_rate": 2.6655226655226656e-06, "loss": 2.1915, "step": 29530 }, { "epoch": 38.00036036036036, "grad_norm": 3.105980396270752, "learning_rate": 2.662662662662663e-06, "loss": 0.0005, "step": 29540 }, { "epoch": 38.00061776061776, "grad_norm": 0.3790825307369232, "learning_rate": 2.65980265980266e-06, "loss": 0.0005, "step": 29550 }, { "epoch": 38.00087516087516, "grad_norm": 0.0002181073505198583, "learning_rate": 2.656942656942657e-06, "loss": 0.3792, "step": 29560 }, { "epoch": 38.001132561132565, "grad_norm": 0.0001607909653102979, "learning_rate": 2.654082654082654e-06, "loss": 0.0001, "step": 29570 }, { "epoch": 38.00138996138996, "grad_norm": 0.10881076753139496, "learning_rate": 2.6512226512226513e-06, "loss": 0.7041, "step": 29580 }, { "epoch": 38.001647361647365, "grad_norm": 615.3811645507812, "learning_rate": 2.6483626483626486e-06, "loss": 0.0372, "step": 29590 }, { "epoch": 38.00190476190476, "grad_norm": 0.00022765577887184918, "learning_rate": 2.6455026455026455e-06, "loss": 0.0003, "step": 29600 }, { "epoch": 38.002162162162165, "grad_norm": 0.0011044471757486463, "learning_rate": 2.642642642642643e-06, "loss": 0.0001, "step": 29610 }, { "epoch": 38.00241956241956, "grad_norm": 0.43733513355255127, "learning_rate": 2.6397826397826397e-06, "loss": 0.0011, "step": 29620 }, { "epoch": 38.002676962676965, "grad_norm": 0.00021193055727053434, "learning_rate": 2.636922636922637e-06, "loss": 0.8158, "step": 29630 }, { "epoch": 38.00293436293436, "grad_norm": 0.00447743758559227, "learning_rate": 2.634062634062634e-06, "loss": 0.5663, "step": 29640 }, { "epoch": 38.003191763191765, "grad_norm": 576.5663452148438, "learning_rate": 2.6312026312026313e-06, "loss": 0.4441, "step": 29650 }, { "epoch": 38.00344916344916, "grad_norm": 0.006028820760548115, "learning_rate": 2.628342628342628e-06, "loss": 0.0002, "step": 29660 }, { "epoch": 38.003706563706565, "grad_norm": 0.01456479262560606, "learning_rate": 2.6254826254826255e-06, "loss": 0.3368, "step": 29670 }, { "epoch": 38.00396396396396, "grad_norm": 0.006685736123472452, "learning_rate": 2.6226226226226224e-06, "loss": 0.0004, "step": 29680 }, { "epoch": 38.004221364221365, "grad_norm": 0.016805648803710938, "learning_rate": 2.61976261976262e-06, "loss": 0.1144, "step": 29690 }, { "epoch": 38.00447876447876, "grad_norm": 0.00028289741021580994, "learning_rate": 2.6169026169026174e-06, "loss": 0.0001, "step": 29700 }, { "epoch": 38.004736164736165, "grad_norm": 0.00025284860748797655, "learning_rate": 2.6140426140426143e-06, "loss": 0.4801, "step": 29710 }, { "epoch": 38.00499356499356, "grad_norm": 0.0012302598915994167, "learning_rate": 2.6111826111826116e-06, "loss": 0.6057, "step": 29720 }, { "epoch": 38.005250965250966, "grad_norm": 0.04306511580944061, "learning_rate": 2.608322608322609e-06, "loss": 0.4271, "step": 29730 }, { "epoch": 38.00550836550836, "grad_norm": 0.1301436573266983, "learning_rate": 2.605462605462606e-06, "loss": 0.0005, "step": 29740 }, { "epoch": 38.005765765765766, "grad_norm": 0.14988690614700317, "learning_rate": 2.602602602602603e-06, "loss": 0.0001, "step": 29750 }, { "epoch": 38.00602316602317, "grad_norm": 0.002918446669355035, "learning_rate": 2.5997425997426e-06, "loss": 0.0005, "step": 29760 }, { "epoch": 38.006280566280566, "grad_norm": 0.005147375166416168, "learning_rate": 2.5968825968825974e-06, "loss": 0.7509, "step": 29770 }, { "epoch": 38.00653796653797, "grad_norm": 0.00018102384638041258, "learning_rate": 2.5940225940225943e-06, "loss": 0.6546, "step": 29780 }, { "epoch": 38.006795366795366, "grad_norm": 0.03137935698032379, "learning_rate": 2.5911625911625916e-06, "loss": 0.5553, "step": 29790 }, { "epoch": 38.00705276705277, "grad_norm": 0.005157016683369875, "learning_rate": 2.5883025883025885e-06, "loss": 0.0019, "step": 29800 }, { "epoch": 38.007310167310166, "grad_norm": 0.00020223679894115776, "learning_rate": 2.5854425854425858e-06, "loss": 0.0002, "step": 29810 }, { "epoch": 38.00756756756757, "grad_norm": 0.10321886092424393, "learning_rate": 2.5825825825825827e-06, "loss": 0.0002, "step": 29820 }, { "epoch": 38.007824967824966, "grad_norm": 0.0003532360424287617, "learning_rate": 2.57972257972258e-06, "loss": 0.0004, "step": 29830 }, { "epoch": 38.00808236808237, "grad_norm": 0.0043226853013038635, "learning_rate": 2.5768625768625773e-06, "loss": 0.4117, "step": 29840 }, { "epoch": 38.008339768339766, "grad_norm": 0.017380457371473312, "learning_rate": 2.574002574002574e-06, "loss": 0.0001, "step": 29850 }, { "epoch": 38.00859716859717, "grad_norm": 0.0010595513740554452, "learning_rate": 2.5711425711425715e-06, "loss": 0.0008, "step": 29860 }, { "epoch": 38.00885456885457, "grad_norm": 0.00034468816011212766, "learning_rate": 2.5682825682825684e-06, "loss": 0.5743, "step": 29870 }, { "epoch": 38.00911196911197, "grad_norm": 0.0005547988112084568, "learning_rate": 2.5654225654225657e-06, "loss": 0.0002, "step": 29880 }, { "epoch": 38.00936936936937, "grad_norm": 0.00622868025675416, "learning_rate": 2.5625625625625626e-06, "loss": 0.0002, "step": 29890 }, { "epoch": 38.00962676962677, "grad_norm": 0.01443453598767519, "learning_rate": 2.55970255970256e-06, "loss": 0.0008, "step": 29900 }, { "epoch": 38.00988416988417, "grad_norm": 0.00020728138042613864, "learning_rate": 2.556842556842557e-06, "loss": 0.0002, "step": 29910 }, { "epoch": 38.01014157014157, "grad_norm": 504.9588928222656, "learning_rate": 2.553982553982554e-06, "loss": 0.4423, "step": 29920 }, { "epoch": 38.01039897039897, "grad_norm": 0.012988753616809845, "learning_rate": 2.5511225511225515e-06, "loss": 0.0001, "step": 29930 }, { "epoch": 38.01065637065637, "grad_norm": 0.0057409703731536865, "learning_rate": 2.5482625482625484e-06, "loss": 0.0052, "step": 29940 }, { "epoch": 38.010913770913774, "grad_norm": 0.00012208484986331314, "learning_rate": 2.5454025454025457e-06, "loss": 0.001, "step": 29950 }, { "epoch": 38.01117117117117, "grad_norm": 0.02826821058988571, "learning_rate": 2.5425425425425426e-06, "loss": 0.0002, "step": 29960 }, { "epoch": 38.011428571428574, "grad_norm": 352.06964111328125, "learning_rate": 2.53968253968254e-06, "loss": 1.1301, "step": 29970 }, { "epoch": 38.01168597168597, "grad_norm": 0.0001997912477236241, "learning_rate": 2.5368225368225368e-06, "loss": 0.0001, "step": 29980 }, { "epoch": 38.011943371943374, "grad_norm": 0.00014716804435011, "learning_rate": 2.533962533962534e-06, "loss": 1.1997, "step": 29990 }, { "epoch": 38.01220077220077, "grad_norm": 2.6033856868743896, "learning_rate": 2.531102531102531e-06, "loss": 0.0005, "step": 30000 }, { "epoch": 38.012458172458174, "grad_norm": 0.00014160611317493021, "learning_rate": 2.5282425282425283e-06, "loss": 0.1244, "step": 30010 }, { "epoch": 38.01271557271557, "grad_norm": 0.0004008954856544733, "learning_rate": 2.525382525382525e-06, "loss": 0.0001, "step": 30020 }, { "epoch": 38.012972972972975, "grad_norm": 0.0023208346683532, "learning_rate": 2.5225225225225225e-06, "loss": 0.0044, "step": 30030 }, { "epoch": 38.01323037323037, "grad_norm": 0.003921369090676308, "learning_rate": 2.51966251966252e-06, "loss": 0.0006, "step": 30040 }, { "epoch": 38.013487773487775, "grad_norm": 0.005466674454510212, "learning_rate": 2.5168025168025167e-06, "loss": 0.0001, "step": 30050 }, { "epoch": 38.01374517374517, "grad_norm": 0.00016683118883520365, "learning_rate": 2.513942513942514e-06, "loss": 0.0001, "step": 30060 }, { "epoch": 38.014002574002575, "grad_norm": 0.00019211266771890223, "learning_rate": 2.5110825110825113e-06, "loss": 0.0003, "step": 30070 }, { "epoch": 38.01425997425997, "grad_norm": 0.00022057870228309184, "learning_rate": 2.5082225082225087e-06, "loss": 0.0004, "step": 30080 }, { "epoch": 38.014517374517375, "grad_norm": 0.00017979362746700644, "learning_rate": 2.505362505362506e-06, "loss": 0.0006, "step": 30090 }, { "epoch": 38.01477477477477, "grad_norm": 0.00035356191801838577, "learning_rate": 2.502502502502503e-06, "loss": 0.0001, "step": 30100 }, { "epoch": 38.015032175032175, "grad_norm": 0.010907981544733047, "learning_rate": 2.4996424996424998e-06, "loss": 0.1697, "step": 30110 }, { "epoch": 38.01528957528958, "grad_norm": 0.03007809817790985, "learning_rate": 2.496782496782497e-06, "loss": 0.0002, "step": 30120 }, { "epoch": 38.015546975546975, "grad_norm": 0.001186450943350792, "learning_rate": 2.493922493922494e-06, "loss": 0.0283, "step": 30130 }, { "epoch": 38.01580437580438, "grad_norm": 0.021413983777165413, "learning_rate": 2.4910624910624913e-06, "loss": 0.0001, "step": 30140 }, { "epoch": 38.016061776061775, "grad_norm": 0.00018151052063331008, "learning_rate": 2.488202488202488e-06, "loss": 0.768, "step": 30150 }, { "epoch": 38.01631917631918, "grad_norm": 0.00312127941288054, "learning_rate": 2.4853424853424855e-06, "loss": 0.0005, "step": 30160 }, { "epoch": 38.016576576576576, "grad_norm": 0.0001102391179301776, "learning_rate": 2.482482482482483e-06, "loss": 0.0001, "step": 30170 }, { "epoch": 38.01683397683398, "grad_norm": 0.0002471956831868738, "learning_rate": 2.47962247962248e-06, "loss": 0.0007, "step": 30180 }, { "epoch": 38.017091377091376, "grad_norm": 0.0004663833533413708, "learning_rate": 2.476762476762477e-06, "loss": 0.0129, "step": 30190 }, { "epoch": 38.01734877734878, "grad_norm": 0.00016829062951728702, "learning_rate": 2.4739024739024743e-06, "loss": 0.3392, "step": 30200 }, { "epoch": 38.017606177606176, "grad_norm": 0.003199836239218712, "learning_rate": 2.4710424710424712e-06, "loss": 0.7917, "step": 30210 }, { "epoch": 38.01786357786358, "grad_norm": 0.0024653258733451366, "learning_rate": 2.4681824681824685e-06, "loss": 0.4412, "step": 30220 }, { "epoch": 38.018120978120976, "grad_norm": 0.01302412524819374, "learning_rate": 2.4653224653224654e-06, "loss": 0.0155, "step": 30230 }, { "epoch": 38.01837837837838, "grad_norm": 0.00023153946676757187, "learning_rate": 2.4624624624624628e-06, "loss": 0.0001, "step": 30240 }, { "epoch": 38.018635778635776, "grad_norm": 0.010636151768267155, "learning_rate": 2.4596024596024596e-06, "loss": 0.0001, "step": 30250 }, { "epoch": 38.01889317889318, "grad_norm": 0.019402464851737022, "learning_rate": 2.456742456742457e-06, "loss": 0.8413, "step": 30260 }, { "epoch": 38.019150579150576, "grad_norm": 0.00025949705741368234, "learning_rate": 2.453882453882454e-06, "loss": 0.0014, "step": 30270 }, { "epoch": 38.01940797940798, "grad_norm": 0.014147725887596607, "learning_rate": 2.451022451022451e-06, "loss": 0.0003, "step": 30280 }, { "epoch": 38.019665379665376, "grad_norm": 0.00033936629188247025, "learning_rate": 2.4481624481624485e-06, "loss": 0.0024, "step": 30290 }, { "epoch": 38.01992277992278, "grad_norm": 0.00021008138719480485, "learning_rate": 2.4453024453024454e-06, "loss": 0.2137, "step": 30300 }, { "epoch": 38.02, "eval_accuracy": 0.9130434782608695, "eval_loss": 0.5413678884506226, "eval_runtime": 13.3642, "eval_samples_per_second": 3.442, "eval_steps_per_second": 3.442, "step": 30303 }, { "epoch": 39.00018018018018, "grad_norm": 0.047418005764484406, "learning_rate": 2.4424424424424427e-06, "loss": 0.9208, "step": 30310 }, { "epoch": 39.00043758043758, "grad_norm": 0.010265672579407692, "learning_rate": 2.4395824395824396e-06, "loss": 0.0001, "step": 30320 }, { "epoch": 39.00069498069498, "grad_norm": 0.0024453650694340467, "learning_rate": 2.436722436722437e-06, "loss": 0.0001, "step": 30330 }, { "epoch": 39.000952380952384, "grad_norm": 0.007709293160587549, "learning_rate": 2.433862433862434e-06, "loss": 0.0023, "step": 30340 }, { "epoch": 39.00120978120978, "grad_norm": 0.03284775838255882, "learning_rate": 2.431002431002431e-06, "loss": 0.0002, "step": 30350 }, { "epoch": 39.001467181467184, "grad_norm": 0.00015525065828114748, "learning_rate": 2.4281424281424284e-06, "loss": 0.0152, "step": 30360 }, { "epoch": 39.00172458172458, "grad_norm": 0.0003922375326510519, "learning_rate": 2.4252824252824258e-06, "loss": 1.1854, "step": 30370 }, { "epoch": 39.001981981981984, "grad_norm": 0.0003774458891712129, "learning_rate": 2.4224224224224226e-06, "loss": 0.0861, "step": 30380 }, { "epoch": 39.00223938223938, "grad_norm": 0.00016582466196268797, "learning_rate": 2.41956241956242e-06, "loss": 0.0001, "step": 30390 }, { "epoch": 39.002496782496785, "grad_norm": 0.00012327576405368745, "learning_rate": 2.416702416702417e-06, "loss": 0.0001, "step": 30400 }, { "epoch": 39.00275418275418, "grad_norm": 0.06711563467979431, "learning_rate": 2.413842413842414e-06, "loss": 0.0002, "step": 30410 }, { "epoch": 39.003011583011585, "grad_norm": 0.06605685502290726, "learning_rate": 2.410982410982411e-06, "loss": 0.0006, "step": 30420 }, { "epoch": 39.00326898326898, "grad_norm": 0.00011760670895455405, "learning_rate": 2.4081224081224084e-06, "loss": 1.4446, "step": 30430 }, { "epoch": 39.003526383526385, "grad_norm": 0.015367030166089535, "learning_rate": 2.4052624052624053e-06, "loss": 0.731, "step": 30440 }, { "epoch": 39.00378378378378, "grad_norm": 612.0079345703125, "learning_rate": 2.4024024024024026e-06, "loss": 0.4344, "step": 30450 }, { "epoch": 39.004041184041185, "grad_norm": 0.16250044107437134, "learning_rate": 2.3995423995424e-06, "loss": 0.0002, "step": 30460 }, { "epoch": 39.00429858429858, "grad_norm": 0.00013442897761706263, "learning_rate": 2.396682396682397e-06, "loss": 0.0035, "step": 30470 }, { "epoch": 39.004555984555985, "grad_norm": 0.03660891577601433, "learning_rate": 2.393822393822394e-06, "loss": 0.0001, "step": 30480 }, { "epoch": 39.00481338481338, "grad_norm": 0.01504859421402216, "learning_rate": 2.390962390962391e-06, "loss": 0.0011, "step": 30490 }, { "epoch": 39.005070785070785, "grad_norm": 0.6826884150505066, "learning_rate": 2.3881023881023883e-06, "loss": 0.0003, "step": 30500 }, { "epoch": 39.00532818532819, "grad_norm": 0.0002525031450204551, "learning_rate": 2.3852423852423852e-06, "loss": 0.0001, "step": 30510 }, { "epoch": 39.005585585585585, "grad_norm": 0.00015134667046368122, "learning_rate": 2.3823823823823825e-06, "loss": 0.0003, "step": 30520 }, { "epoch": 39.00584298584299, "grad_norm": 0.045858077704906464, "learning_rate": 2.37952237952238e-06, "loss": 0.0002, "step": 30530 }, { "epoch": 39.006100386100385, "grad_norm": 0.000713690766133368, "learning_rate": 2.376662376662377e-06, "loss": 0.0001, "step": 30540 }, { "epoch": 39.00635778635779, "grad_norm": 0.00014348112745210528, "learning_rate": 2.373802373802374e-06, "loss": 0.0206, "step": 30550 }, { "epoch": 39.006615186615186, "grad_norm": 0.017485586926341057, "learning_rate": 2.3709423709423714e-06, "loss": 0.5571, "step": 30560 }, { "epoch": 39.00687258687259, "grad_norm": 0.0076943435706198215, "learning_rate": 2.3680823680823683e-06, "loss": 0.0036, "step": 30570 }, { "epoch": 39.007129987129986, "grad_norm": 0.006811995059251785, "learning_rate": 2.3652223652223656e-06, "loss": 0.0001, "step": 30580 }, { "epoch": 39.00738738738739, "grad_norm": 0.00021085226035211235, "learning_rate": 2.3623623623623625e-06, "loss": 0.0002, "step": 30590 }, { "epoch": 39.007644787644786, "grad_norm": 0.00011505138536449522, "learning_rate": 2.35950235950236e-06, "loss": 0.0006, "step": 30600 }, { "epoch": 39.00790218790219, "grad_norm": 0.0032912762835621834, "learning_rate": 2.3566423566423567e-06, "loss": 0.0001, "step": 30610 }, { "epoch": 39.008159588159586, "grad_norm": 0.00032590367482043803, "learning_rate": 2.353782353782354e-06, "loss": 0.0005, "step": 30620 }, { "epoch": 39.00841698841699, "grad_norm": 0.10024252533912659, "learning_rate": 2.3509223509223513e-06, "loss": 0.0002, "step": 30630 }, { "epoch": 39.008674388674386, "grad_norm": 0.0001177530357381329, "learning_rate": 2.348062348062348e-06, "loss": 0.8669, "step": 30640 }, { "epoch": 39.00893178893179, "grad_norm": 0.009781216271221638, "learning_rate": 2.3452023452023455e-06, "loss": 0.0001, "step": 30650 }, { "epoch": 39.009189189189186, "grad_norm": 0.0025387846399098635, "learning_rate": 2.3423423423423424e-06, "loss": 0.0011, "step": 30660 }, { "epoch": 39.00944658944659, "grad_norm": 0.00022111479484010488, "learning_rate": 2.3394823394823397e-06, "loss": 0.0857, "step": 30670 }, { "epoch": 39.009703989703986, "grad_norm": 0.004516247659921646, "learning_rate": 2.3366223366223366e-06, "loss": 0.0012, "step": 30680 }, { "epoch": 39.00996138996139, "grad_norm": 0.00013777939602732658, "learning_rate": 2.333762333762334e-06, "loss": 0.6783, "step": 30690 }, { "epoch": 39.010218790218794, "grad_norm": 0.00011778510815929621, "learning_rate": 2.330902330902331e-06, "loss": 0.9963, "step": 30700 }, { "epoch": 39.01047619047619, "grad_norm": 0.010461336001753807, "learning_rate": 2.328042328042328e-06, "loss": 0.7411, "step": 30710 }, { "epoch": 39.010733590733594, "grad_norm": 0.016033122316002846, "learning_rate": 2.3251823251823255e-06, "loss": 0.6801, "step": 30720 }, { "epoch": 39.01099099099099, "grad_norm": 0.003915534354746342, "learning_rate": 2.3223223223223228e-06, "loss": 0.603, "step": 30730 }, { "epoch": 39.011248391248394, "grad_norm": 0.02247695066034794, "learning_rate": 2.3194623194623197e-06, "loss": 0.033, "step": 30740 }, { "epoch": 39.01150579150579, "grad_norm": 0.008587141521275043, "learning_rate": 2.316602316602317e-06, "loss": 0.6416, "step": 30750 }, { "epoch": 39.011763191763194, "grad_norm": 0.0012964047491550446, "learning_rate": 2.313742313742314e-06, "loss": 0.0001, "step": 30760 }, { "epoch": 39.01202059202059, "grad_norm": 0.007788754999637604, "learning_rate": 2.310882310882311e-06, "loss": 0.0002, "step": 30770 }, { "epoch": 39.012277992277994, "grad_norm": 0.00040538437315262854, "learning_rate": 2.308022308022308e-06, "loss": 0.7998, "step": 30780 }, { "epoch": 39.01253539253539, "grad_norm": 0.00013059898628853261, "learning_rate": 2.3051623051623054e-06, "loss": 0.6301, "step": 30790 }, { "epoch": 39.012792792792794, "grad_norm": 0.010551397688686848, "learning_rate": 2.3023023023023023e-06, "loss": 0.0051, "step": 30800 }, { "epoch": 39.01305019305019, "grad_norm": 0.00023208599304780364, "learning_rate": 2.2994422994422996e-06, "loss": 0.6185, "step": 30810 }, { "epoch": 39.013307593307594, "grad_norm": 0.003290843451395631, "learning_rate": 2.296582296582297e-06, "loss": 0.0001, "step": 30820 }, { "epoch": 39.01356499356499, "grad_norm": 0.005354827735573053, "learning_rate": 2.293722293722294e-06, "loss": 0.0002, "step": 30830 }, { "epoch": 39.013822393822394, "grad_norm": 0.001084979739971459, "learning_rate": 2.290862290862291e-06, "loss": 0.0002, "step": 30840 }, { "epoch": 39.01407979407979, "grad_norm": 0.07646601647138596, "learning_rate": 2.288002288002288e-06, "loss": 0.0003, "step": 30850 }, { "epoch": 39.014337194337195, "grad_norm": 0.004698839038610458, "learning_rate": 2.2851422851422854e-06, "loss": 0.214, "step": 30860 }, { "epoch": 39.01459459459459, "grad_norm": 0.0005614709225483239, "learning_rate": 2.2822822822822822e-06, "loss": 0.0001, "step": 30870 }, { "epoch": 39.014851994851995, "grad_norm": 0.00012098588194930926, "learning_rate": 2.2794222794222796e-06, "loss": 0.0002, "step": 30880 }, { "epoch": 39.0151093951094, "grad_norm": 0.13583892583847046, "learning_rate": 2.2765622765622765e-06, "loss": 0.0007, "step": 30890 }, { "epoch": 39.015366795366795, "grad_norm": 0.00043689829180948436, "learning_rate": 2.2737022737022738e-06, "loss": 0.0003, "step": 30900 }, { "epoch": 39.0156241956242, "grad_norm": 0.002140444004908204, "learning_rate": 2.270842270842271e-06, "loss": 1.9341, "step": 30910 }, { "epoch": 39.015881595881595, "grad_norm": 0.04902840033173561, "learning_rate": 2.2679822679822684e-06, "loss": 0.0004, "step": 30920 }, { "epoch": 39.016138996139, "grad_norm": 0.00013759516878053546, "learning_rate": 2.2651222651222653e-06, "loss": 0.4478, "step": 30930 }, { "epoch": 39.016396396396395, "grad_norm": 158.75863647460938, "learning_rate": 2.2622622622622626e-06, "loss": 1.3205, "step": 30940 }, { "epoch": 39.0166537966538, "grad_norm": 0.49175313115119934, "learning_rate": 2.2594022594022595e-06, "loss": 0.0002, "step": 30950 }, { "epoch": 39.016911196911195, "grad_norm": 0.036261558532714844, "learning_rate": 2.256542256542257e-06, "loss": 0.0001, "step": 30960 }, { "epoch": 39.0171685971686, "grad_norm": 0.00017356501484755427, "learning_rate": 2.2536822536822537e-06, "loss": 0.5837, "step": 30970 }, { "epoch": 39.017425997425995, "grad_norm": 0.000755143875721842, "learning_rate": 2.250822250822251e-06, "loss": 0.0003, "step": 30980 }, { "epoch": 39.0176833976834, "grad_norm": 0.0005234266864135861, "learning_rate": 2.2479622479622483e-06, "loss": 0.0002, "step": 30990 }, { "epoch": 39.017940797940796, "grad_norm": 0.0005713241989724338, "learning_rate": 2.2451022451022452e-06, "loss": 0.0001, "step": 31000 }, { "epoch": 39.0181981981982, "grad_norm": 0.005339688155800104, "learning_rate": 2.2422422422422426e-06, "loss": 0.5243, "step": 31010 }, { "epoch": 39.018455598455596, "grad_norm": 0.00022389739751815796, "learning_rate": 2.2393822393822394e-06, "loss": 0.0005, "step": 31020 }, { "epoch": 39.018712998713, "grad_norm": 0.00026733646518550813, "learning_rate": 2.2365222365222368e-06, "loss": 0.0002, "step": 31030 }, { "epoch": 39.018970398970396, "grad_norm": 0.009177249856293201, "learning_rate": 2.2336622336622337e-06, "loss": 0.2119, "step": 31040 }, { "epoch": 39.0192277992278, "grad_norm": 0.12196987867355347, "learning_rate": 2.230802230802231e-06, "loss": 0.0001, "step": 31050 }, { "epoch": 39.0194851994852, "grad_norm": 0.021783724427223206, "learning_rate": 2.227942227942228e-06, "loss": 0.691, "step": 31060 }, { "epoch": 39.0197425997426, "grad_norm": 0.007660671602934599, "learning_rate": 2.225082225082225e-06, "loss": 0.0002, "step": 31070 }, { "epoch": 39.02, "grad_norm": 0.0010431857081130147, "learning_rate": 2.222222222222222e-06, "loss": 0.0001, "step": 31080 }, { "epoch": 39.02, "eval_accuracy": 0.8260869565217391, "eval_loss": 0.9474976658821106, "eval_runtime": 13.4713, "eval_samples_per_second": 3.415, "eval_steps_per_second": 3.415, "step": 31080 }, { "epoch": 40.000257400257404, "grad_norm": 1.181172251701355, "learning_rate": 2.2193622193622194e-06, "loss": 0.0002, "step": 31090 }, { "epoch": 40.0005148005148, "grad_norm": 318.863037109375, "learning_rate": 2.2165022165022167e-06, "loss": 0.7819, "step": 31100 }, { "epoch": 40.000772200772204, "grad_norm": 0.00014746197848580778, "learning_rate": 2.213642213642214e-06, "loss": 0.0002, "step": 31110 }, { "epoch": 40.0010296010296, "grad_norm": 144.7231903076172, "learning_rate": 2.210782210782211e-06, "loss": 0.7112, "step": 31120 }, { "epoch": 40.001287001287004, "grad_norm": 0.00015773468476254493, "learning_rate": 2.2079222079222082e-06, "loss": 0.2482, "step": 31130 }, { "epoch": 40.0015444015444, "grad_norm": 0.0001816485309973359, "learning_rate": 2.205062205062205e-06, "loss": 1.637, "step": 31140 }, { "epoch": 40.001801801801804, "grad_norm": 0.034062862396240234, "learning_rate": 2.2022022022022024e-06, "loss": 0.5285, "step": 31150 }, { "epoch": 40.0020592020592, "grad_norm": 0.0027663600631058216, "learning_rate": 2.1993421993421998e-06, "loss": 0.6391, "step": 31160 }, { "epoch": 40.002316602316604, "grad_norm": 0.010885432362556458, "learning_rate": 2.1964821964821967e-06, "loss": 0.0001, "step": 31170 }, { "epoch": 40.002574002574, "grad_norm": 0.04000730440020561, "learning_rate": 2.193622193622194e-06, "loss": 0.0004, "step": 31180 }, { "epoch": 40.002831402831404, "grad_norm": 0.004812656436115503, "learning_rate": 2.190762190762191e-06, "loss": 0.0017, "step": 31190 }, { "epoch": 40.0030888030888, "grad_norm": 0.06480229645967484, "learning_rate": 2.187902187902188e-06, "loss": 0.3643, "step": 31200 }, { "epoch": 40.003346203346204, "grad_norm": 0.003973096143454313, "learning_rate": 2.185042185042185e-06, "loss": 0.0004, "step": 31210 }, { "epoch": 40.0036036036036, "grad_norm": 0.0010743311140686274, "learning_rate": 2.1821821821821824e-06, "loss": 0.0001, "step": 31220 }, { "epoch": 40.003861003861005, "grad_norm": 0.01999141089618206, "learning_rate": 2.1793221793221793e-06, "loss": 0.6939, "step": 31230 }, { "epoch": 40.0041184041184, "grad_norm": 0.0030548074282705784, "learning_rate": 2.1764621764621766e-06, "loss": 0.8081, "step": 31240 }, { "epoch": 40.004375804375805, "grad_norm": 0.004580581560730934, "learning_rate": 2.1736021736021735e-06, "loss": 0.0003, "step": 31250 }, { "epoch": 40.0046332046332, "grad_norm": 0.6698066592216492, "learning_rate": 2.170742170742171e-06, "loss": 0.0003, "step": 31260 }, { "epoch": 40.004890604890605, "grad_norm": 0.00941177923232317, "learning_rate": 2.167882167882168e-06, "loss": 0.0002, "step": 31270 }, { "epoch": 40.00514800514801, "grad_norm": 0.0010183549020439386, "learning_rate": 2.1650221650221654e-06, "loss": 0.0001, "step": 31280 }, { "epoch": 40.005405405405405, "grad_norm": 8.488154411315918, "learning_rate": 2.1621621621621623e-06, "loss": 0.0007, "step": 31290 }, { "epoch": 40.00566280566281, "grad_norm": 0.0022410727106034756, "learning_rate": 2.1593021593021596e-06, "loss": 0.0209, "step": 31300 }, { "epoch": 40.005920205920205, "grad_norm": 0.034624092280864716, "learning_rate": 2.1564421564421565e-06, "loss": 0.0003, "step": 31310 }, { "epoch": 40.00617760617761, "grad_norm": 0.01160801388323307, "learning_rate": 2.153582153582154e-06, "loss": 0.5756, "step": 31320 }, { "epoch": 40.006435006435005, "grad_norm": 0.00036862597335129976, "learning_rate": 2.150722150722151e-06, "loss": 0.0001, "step": 31330 }, { "epoch": 40.00669240669241, "grad_norm": 0.00013573635078500956, "learning_rate": 2.147862147862148e-06, "loss": 0.9687, "step": 31340 }, { "epoch": 40.006949806949805, "grad_norm": 0.0024660457856953144, "learning_rate": 2.1450021450021454e-06, "loss": 0.0002, "step": 31350 }, { "epoch": 40.00720720720721, "grad_norm": 0.0038319462910294533, "learning_rate": 2.1421421421421423e-06, "loss": 0.0001, "step": 31360 }, { "epoch": 40.007464607464605, "grad_norm": 0.005298302508890629, "learning_rate": 2.1392821392821396e-06, "loss": 0.0001, "step": 31370 }, { "epoch": 40.00772200772201, "grad_norm": 0.00020501506514847279, "learning_rate": 2.1364221364221365e-06, "loss": 0.0001, "step": 31380 }, { "epoch": 40.007979407979406, "grad_norm": 0.003177186707034707, "learning_rate": 2.133562133562134e-06, "loss": 0.0004, "step": 31390 }, { "epoch": 40.00823680823681, "grad_norm": 0.0029437614139169455, "learning_rate": 2.1307021307021307e-06, "loss": 0.0002, "step": 31400 }, { "epoch": 40.008494208494206, "grad_norm": 0.00016003473137971014, "learning_rate": 2.127842127842128e-06, "loss": 0.0005, "step": 31410 }, { "epoch": 40.00875160875161, "grad_norm": 3.366640090942383, "learning_rate": 2.124982124982125e-06, "loss": 0.0017, "step": 31420 }, { "epoch": 40.009009009009006, "grad_norm": 0.00017837573250290006, "learning_rate": 2.1221221221221222e-06, "loss": 0.0, "step": 31430 }, { "epoch": 40.00926640926641, "grad_norm": 0.00028988305712118745, "learning_rate": 2.1192621192621195e-06, "loss": 0.0001, "step": 31440 }, { "epoch": 40.00952380952381, "grad_norm": 0.06588441133499146, "learning_rate": 2.1164021164021164e-06, "loss": 0.755, "step": 31450 }, { "epoch": 40.00978120978121, "grad_norm": 0.00015987700317054987, "learning_rate": 2.1135421135421137e-06, "loss": 0.2524, "step": 31460 }, { "epoch": 40.01003861003861, "grad_norm": 0.002687298459932208, "learning_rate": 2.110682110682111e-06, "loss": 0.001, "step": 31470 }, { "epoch": 40.01029601029601, "grad_norm": 0.016389530152082443, "learning_rate": 2.107822107822108e-06, "loss": 0.6573, "step": 31480 }, { "epoch": 40.01055341055341, "grad_norm": 0.0076734754256904125, "learning_rate": 2.1049621049621053e-06, "loss": 0.7998, "step": 31490 }, { "epoch": 40.01081081081081, "grad_norm": 0.3663455843925476, "learning_rate": 2.102102102102102e-06, "loss": 0.0005, "step": 31500 }, { "epoch": 40.01106821106821, "grad_norm": 0.0001318360009463504, "learning_rate": 2.0992420992420995e-06, "loss": 0.0001, "step": 31510 }, { "epoch": 40.01132561132561, "grad_norm": 0.5110870003700256, "learning_rate": 2.096382096382097e-06, "loss": 0.0766, "step": 31520 }, { "epoch": 40.011583011583014, "grad_norm": 0.00019474855798762292, "learning_rate": 2.0935220935220937e-06, "loss": 0.0004, "step": 31530 }, { "epoch": 40.01184041184041, "grad_norm": 0.04191095754504204, "learning_rate": 2.090662090662091e-06, "loss": 0.2068, "step": 31540 }, { "epoch": 40.012097812097814, "grad_norm": 0.08732067048549652, "learning_rate": 2.087802087802088e-06, "loss": 0.0001, "step": 31550 }, { "epoch": 40.01235521235521, "grad_norm": 0.00027026841416954994, "learning_rate": 2.084942084942085e-06, "loss": 0.0035, "step": 31560 }, { "epoch": 40.012612612612614, "grad_norm": 0.0018748992588371038, "learning_rate": 2.082082082082082e-06, "loss": 0.4388, "step": 31570 }, { "epoch": 40.01287001287001, "grad_norm": 0.00020655177650041878, "learning_rate": 2.0792220792220794e-06, "loss": 0.0002, "step": 31580 }, { "epoch": 40.013127413127414, "grad_norm": 0.0026093849446624517, "learning_rate": 2.0763620763620763e-06, "loss": 0.6074, "step": 31590 }, { "epoch": 40.01338481338481, "grad_norm": 0.0001526171836303547, "learning_rate": 2.0735020735020736e-06, "loss": 0.6795, "step": 31600 }, { "epoch": 40.013642213642214, "grad_norm": 0.0050857714377343655, "learning_rate": 2.070642070642071e-06, "loss": 0.0106, "step": 31610 }, { "epoch": 40.01389961389961, "grad_norm": 0.17710165679454803, "learning_rate": 2.067782067782068e-06, "loss": 0.0004, "step": 31620 }, { "epoch": 40.014157014157014, "grad_norm": 0.23658306896686554, "learning_rate": 2.064922064922065e-06, "loss": 0.0094, "step": 31630 }, { "epoch": 40.01441441441442, "grad_norm": 0.00034343061270192266, "learning_rate": 2.062062062062062e-06, "loss": 0.0001, "step": 31640 }, { "epoch": 40.014671814671814, "grad_norm": 0.00010646691225701943, "learning_rate": 2.0592020592020594e-06, "loss": 0.0001, "step": 31650 }, { "epoch": 40.01492921492922, "grad_norm": 0.0020296790171414614, "learning_rate": 2.0563420563420567e-06, "loss": 1.1483, "step": 31660 }, { "epoch": 40.015186615186614, "grad_norm": 0.00015698590141255409, "learning_rate": 2.0534820534820536e-06, "loss": 0.0, "step": 31670 }, { "epoch": 40.01544401544402, "grad_norm": 0.00016668056196067482, "learning_rate": 2.050622050622051e-06, "loss": 0.0001, "step": 31680 }, { "epoch": 40.015701415701415, "grad_norm": 0.0005182051681913435, "learning_rate": 2.047762047762048e-06, "loss": 0.0001, "step": 31690 }, { "epoch": 40.01595881595882, "grad_norm": 0.2584684491157532, "learning_rate": 2.044902044902045e-06, "loss": 0.0003, "step": 31700 }, { "epoch": 40.016216216216215, "grad_norm": 0.18375691771507263, "learning_rate": 2.0420420420420424e-06, "loss": 0.0001, "step": 31710 }, { "epoch": 40.01647361647362, "grad_norm": 0.00018776406068354845, "learning_rate": 2.0391820391820393e-06, "loss": 0.0, "step": 31720 }, { "epoch": 40.016731016731015, "grad_norm": 0.017002640292048454, "learning_rate": 2.0363220363220366e-06, "loss": 0.0001, "step": 31730 }, { "epoch": 40.01698841698842, "grad_norm": 0.0015852090436965227, "learning_rate": 2.0334620334620335e-06, "loss": 0.0001, "step": 31740 }, { "epoch": 40.017245817245815, "grad_norm": 0.0003951498947571963, "learning_rate": 2.030602030602031e-06, "loss": 0.0003, "step": 31750 }, { "epoch": 40.01750321750322, "grad_norm": 0.00012422552390489727, "learning_rate": 2.0277420277420277e-06, "loss": 1.2037, "step": 31760 }, { "epoch": 40.017760617760615, "grad_norm": 0.019571000710129738, "learning_rate": 2.024882024882025e-06, "loss": 0.0001, "step": 31770 }, { "epoch": 40.01801801801802, "grad_norm": 0.00016542627417948097, "learning_rate": 2.022022022022022e-06, "loss": 0.0102, "step": 31780 }, { "epoch": 40.018275418275415, "grad_norm": 0.029966507107019424, "learning_rate": 2.0191620191620192e-06, "loss": 0.6281, "step": 31790 }, { "epoch": 40.01853281853282, "grad_norm": 0.00015886305482126772, "learning_rate": 2.0163020163020166e-06, "loss": 0.0002, "step": 31800 }, { "epoch": 40.018790218790215, "grad_norm": 0.005060768220573664, "learning_rate": 2.0134420134420135e-06, "loss": 0.0001, "step": 31810 }, { "epoch": 40.01904761904762, "grad_norm": 0.0025004101917147636, "learning_rate": 2.0105820105820108e-06, "loss": 0.2131, "step": 31820 }, { "epoch": 40.01930501930502, "grad_norm": 0.000141930824611336, "learning_rate": 2.0077220077220077e-06, "loss": 0.5586, "step": 31830 }, { "epoch": 40.01956241956242, "grad_norm": 0.1792692095041275, "learning_rate": 2.004862004862005e-06, "loss": 0.0003, "step": 31840 }, { "epoch": 40.01981981981982, "grad_norm": 0.002483024261891842, "learning_rate": 2.0020020020020023e-06, "loss": 0.0022, "step": 31850 }, { "epoch": 40.02, "eval_accuracy": 0.8478260869565217, "eval_loss": 1.057834267616272, "eval_runtime": 13.3792, "eval_samples_per_second": 3.438, "eval_steps_per_second": 3.438, "step": 31857 }, { "epoch": 41.00007722007722, "grad_norm": 0.0013759952271357179, "learning_rate": 1.9991419991419996e-06, "loss": 0.7711, "step": 31860 }, { "epoch": 41.00033462033462, "grad_norm": 0.0012656886829063296, "learning_rate": 1.9962819962819965e-06, "loss": 0.0, "step": 31870 }, { "epoch": 41.00059202059202, "grad_norm": 0.012028630822896957, "learning_rate": 1.993421993421994e-06, "loss": 0.8055, "step": 31880 }, { "epoch": 41.00084942084942, "grad_norm": 0.0012869435595348477, "learning_rate": 1.9905619905619907e-06, "loss": 0.0002, "step": 31890 }, { "epoch": 41.00110682110682, "grad_norm": 0.0013365877093747258, "learning_rate": 1.987701987701988e-06, "loss": 0.0001, "step": 31900 }, { "epoch": 41.00136422136422, "grad_norm": 0.05966697260737419, "learning_rate": 1.984841984841985e-06, "loss": 0.0009, "step": 31910 }, { "epoch": 41.001621621621624, "grad_norm": 548.8873901367188, "learning_rate": 1.9819819819819822e-06, "loss": 0.6607, "step": 31920 }, { "epoch": 41.00187902187902, "grad_norm": 0.00041702756425365806, "learning_rate": 1.979121979121979e-06, "loss": 0.5822, "step": 31930 }, { "epoch": 41.002136422136424, "grad_norm": 0.0010571606690064073, "learning_rate": 1.9762619762619764e-06, "loss": 0.0001, "step": 31940 }, { "epoch": 41.00239382239382, "grad_norm": 0.0010254288790747523, "learning_rate": 1.9734019734019733e-06, "loss": 0.0024, "step": 31950 }, { "epoch": 41.002651222651224, "grad_norm": 0.0024843502324074507, "learning_rate": 1.9705419705419707e-06, "loss": 0.0002, "step": 31960 }, { "epoch": 41.00290862290862, "grad_norm": 0.0002254961582366377, "learning_rate": 1.967681967681968e-06, "loss": 1.3888, "step": 31970 }, { "epoch": 41.003166023166024, "grad_norm": 0.0001984317641472444, "learning_rate": 1.964821964821965e-06, "loss": 0.0087, "step": 31980 }, { "epoch": 41.00342342342342, "grad_norm": 0.00011000720405718312, "learning_rate": 1.961961961961962e-06, "loss": 0.0045, "step": 31990 }, { "epoch": 41.003680823680824, "grad_norm": 0.015376497060060501, "learning_rate": 1.959101959101959e-06, "loss": 0.0006, "step": 32000 }, { "epoch": 41.00393822393822, "grad_norm": 0.0003337947418913245, "learning_rate": 1.9562419562419564e-06, "loss": 0.0001, "step": 32010 }, { "epoch": 41.004195624195624, "grad_norm": 0.00041247939225286245, "learning_rate": 1.9533819533819533e-06, "loss": 0.0001, "step": 32020 }, { "epoch": 41.00445302445303, "grad_norm": 0.001562226563692093, "learning_rate": 1.950521950521951e-06, "loss": 0.0003, "step": 32030 }, { "epoch": 41.004710424710424, "grad_norm": 0.00014134070079308003, "learning_rate": 1.947661947661948e-06, "loss": 0.0, "step": 32040 }, { "epoch": 41.00496782496783, "grad_norm": 0.00018032017396762967, "learning_rate": 1.9448019448019452e-06, "loss": 0.0001, "step": 32050 }, { "epoch": 41.005225225225225, "grad_norm": 0.009866469539701939, "learning_rate": 1.941941941941942e-06, "loss": 0.0002, "step": 32060 }, { "epoch": 41.00548262548263, "grad_norm": 0.0001611222542123869, "learning_rate": 1.9390819390819394e-06, "loss": 0.0004, "step": 32070 }, { "epoch": 41.005740025740025, "grad_norm": 0.0015964763006195426, "learning_rate": 1.9362219362219363e-06, "loss": 0.0001, "step": 32080 }, { "epoch": 41.00599742599743, "grad_norm": 0.0021054623648524284, "learning_rate": 1.9333619333619337e-06, "loss": 0.7093, "step": 32090 }, { "epoch": 41.006254826254825, "grad_norm": 0.00017562608991283923, "learning_rate": 1.9305019305019305e-06, "loss": 0.0001, "step": 32100 }, { "epoch": 41.00651222651223, "grad_norm": 0.00044666044414043427, "learning_rate": 1.927641927641928e-06, "loss": 0.0, "step": 32110 }, { "epoch": 41.006769626769625, "grad_norm": 0.0006799431866966188, "learning_rate": 1.9247819247819248e-06, "loss": 0.0001, "step": 32120 }, { "epoch": 41.00702702702703, "grad_norm": 0.0002899241226259619, "learning_rate": 1.921921921921922e-06, "loss": 0.0001, "step": 32130 }, { "epoch": 41.007284427284425, "grad_norm": 0.0003254143812227994, "learning_rate": 1.9190619190619194e-06, "loss": 0.0001, "step": 32140 }, { "epoch": 41.00754182754183, "grad_norm": 9.479661093791947e-05, "learning_rate": 1.9162019162019163e-06, "loss": 0.0, "step": 32150 }, { "epoch": 41.007799227799225, "grad_norm": 0.0019586803391575813, "learning_rate": 1.9133419133419136e-06, "loss": 0.0001, "step": 32160 }, { "epoch": 41.00805662805663, "grad_norm": 0.0001963915565283969, "learning_rate": 1.9104819104819105e-06, "loss": 0.0001, "step": 32170 }, { "epoch": 41.008314028314025, "grad_norm": 0.00012030902871629223, "learning_rate": 1.907621907621908e-06, "loss": 0.0217, "step": 32180 }, { "epoch": 41.00857142857143, "grad_norm": 0.00992583204060793, "learning_rate": 1.904761904761905e-06, "loss": 0.0048, "step": 32190 }, { "epoch": 41.008828828828825, "grad_norm": 0.0002358455676585436, "learning_rate": 1.901901901901902e-06, "loss": 0.8052, "step": 32200 }, { "epoch": 41.00908622908623, "grad_norm": 0.0581081286072731, "learning_rate": 1.8990418990418991e-06, "loss": 0.0001, "step": 32210 }, { "epoch": 41.00934362934363, "grad_norm": 2753.812255859375, "learning_rate": 1.8961818961818964e-06, "loss": 0.975, "step": 32220 }, { "epoch": 41.00960102960103, "grad_norm": 0.001163011766038835, "learning_rate": 1.8933218933218935e-06, "loss": 0.0001, "step": 32230 }, { "epoch": 41.00985842985843, "grad_norm": 0.04853722080588341, "learning_rate": 1.8904618904618906e-06, "loss": 0.5376, "step": 32240 }, { "epoch": 41.01011583011583, "grad_norm": 0.00030269287526607513, "learning_rate": 1.8876018876018877e-06, "loss": 0.0001, "step": 32250 }, { "epoch": 41.01037323037323, "grad_norm": 0.00014615173859056085, "learning_rate": 1.884741884741885e-06, "loss": 0.0001, "step": 32260 }, { "epoch": 41.01063063063063, "grad_norm": 0.00015686519327573478, "learning_rate": 1.8818818818818822e-06, "loss": 0.0001, "step": 32270 }, { "epoch": 41.01088803088803, "grad_norm": 0.00044415026786737144, "learning_rate": 1.8790218790218793e-06, "loss": 0.0001, "step": 32280 }, { "epoch": 41.01114543114543, "grad_norm": 0.00014384774840436876, "learning_rate": 1.8761618761618764e-06, "loss": 0.0001, "step": 32290 }, { "epoch": 41.01140283140283, "grad_norm": 0.00019766957848332822, "learning_rate": 1.8733018733018735e-06, "loss": 0.0008, "step": 32300 }, { "epoch": 41.01166023166023, "grad_norm": 0.00011445001291576773, "learning_rate": 1.8704418704418706e-06, "loss": 0.0002, "step": 32310 }, { "epoch": 41.01191763191763, "grad_norm": 0.003054665867239237, "learning_rate": 1.8675818675818677e-06, "loss": 0.0004, "step": 32320 }, { "epoch": 41.01217503217503, "grad_norm": 0.00013076022150926292, "learning_rate": 1.8647218647218648e-06, "loss": 0.0669, "step": 32330 }, { "epoch": 41.01243243243243, "grad_norm": 0.0016582345124334097, "learning_rate": 1.861861861861862e-06, "loss": 0.9401, "step": 32340 }, { "epoch": 41.01268983268983, "grad_norm": 0.001524147461168468, "learning_rate": 1.859001859001859e-06, "loss": 0.0002, "step": 32350 }, { "epoch": 41.01294723294723, "grad_norm": 0.007222165819257498, "learning_rate": 1.8561418561418561e-06, "loss": 0.0002, "step": 32360 }, { "epoch": 41.01320463320463, "grad_norm": 0.00016654320643283427, "learning_rate": 1.8532818532818534e-06, "loss": 0.0, "step": 32370 }, { "epoch": 41.013462033462034, "grad_norm": 0.0013375479029491544, "learning_rate": 1.8504218504218505e-06, "loss": 0.2057, "step": 32380 }, { "epoch": 41.01371943371943, "grad_norm": 0.010129381902515888, "learning_rate": 1.8475618475618476e-06, "loss": 0.856, "step": 32390 }, { "epoch": 41.013976833976834, "grad_norm": 0.00015381610137410462, "learning_rate": 1.844701844701845e-06, "loss": 0.0002, "step": 32400 }, { "epoch": 41.01423423423424, "grad_norm": 0.00014513394853565842, "learning_rate": 1.841841841841842e-06, "loss": 0.0001, "step": 32410 }, { "epoch": 41.014491634491634, "grad_norm": 0.00014010151789989322, "learning_rate": 1.8389818389818392e-06, "loss": 0.6838, "step": 32420 }, { "epoch": 41.01474903474904, "grad_norm": 0.0005243502091616392, "learning_rate": 1.8361218361218363e-06, "loss": 0.8736, "step": 32430 }, { "epoch": 41.015006435006434, "grad_norm": 0.004362246487289667, "learning_rate": 1.8332618332618336e-06, "loss": 0.144, "step": 32440 }, { "epoch": 41.01526383526384, "grad_norm": 0.003542808350175619, "learning_rate": 1.8304018304018307e-06, "loss": 0.2341, "step": 32450 }, { "epoch": 41.015521235521234, "grad_norm": 0.00015553591947536916, "learning_rate": 1.8275418275418278e-06, "loss": 0.0001, "step": 32460 }, { "epoch": 41.01577863577864, "grad_norm": 0.00012182680075056851, "learning_rate": 1.8246818246818249e-06, "loss": 0.519, "step": 32470 }, { "epoch": 41.016036036036034, "grad_norm": 0.03936436027288437, "learning_rate": 1.821821821821822e-06, "loss": 0.0025, "step": 32480 }, { "epoch": 41.01629343629344, "grad_norm": 0.0007767226779833436, "learning_rate": 1.818961818961819e-06, "loss": 0.0, "step": 32490 }, { "epoch": 41.016550836550834, "grad_norm": 0.0001796115393517539, "learning_rate": 1.8161018161018162e-06, "loss": 0.0001, "step": 32500 }, { "epoch": 41.01680823680824, "grad_norm": 88.20077514648438, "learning_rate": 1.8132418132418133e-06, "loss": 1.2879, "step": 32510 }, { "epoch": 41.017065637065635, "grad_norm": 0.006687271874397993, "learning_rate": 1.8103818103818104e-06, "loss": 0.0003, "step": 32520 }, { "epoch": 41.01732303732304, "grad_norm": 0.00014735499280504882, "learning_rate": 1.8075218075218075e-06, "loss": 0.5289, "step": 32530 }, { "epoch": 41.017580437580435, "grad_norm": 0.00012671224249061197, "learning_rate": 1.8046618046618048e-06, "loss": 0.6025, "step": 32540 }, { "epoch": 41.01783783783784, "grad_norm": 0.00012978336599189788, "learning_rate": 1.801801801801802e-06, "loss": 0.0004, "step": 32550 }, { "epoch": 41.018095238095235, "grad_norm": 0.0001393378188367933, "learning_rate": 1.798941798941799e-06, "loss": 0.7477, "step": 32560 }, { "epoch": 41.01835263835264, "grad_norm": 0.00018257521151099354, "learning_rate": 1.7960817960817961e-06, "loss": 0.7311, "step": 32570 }, { "epoch": 41.01861003861004, "grad_norm": 0.00029136036755517125, "learning_rate": 1.7932217932217933e-06, "loss": 0.0002, "step": 32580 }, { "epoch": 41.01886743886744, "grad_norm": 1.4105013608932495, "learning_rate": 1.7903617903617906e-06, "loss": 0.0005, "step": 32590 }, { "epoch": 41.01912483912484, "grad_norm": 0.00852140597999096, "learning_rate": 1.7875017875017877e-06, "loss": 0.2117, "step": 32600 }, { "epoch": 41.01938223938224, "grad_norm": 0.004909228533506393, "learning_rate": 1.784641784641785e-06, "loss": 0.0001, "step": 32610 }, { "epoch": 41.01963963963964, "grad_norm": 0.00011149414058309048, "learning_rate": 1.781781781781782e-06, "loss": 0.0001, "step": 32620 }, { "epoch": 41.01989703989704, "grad_norm": 0.00012031041114823893, "learning_rate": 1.7789217789217792e-06, "loss": 0.0002, "step": 32630 }, { "epoch": 41.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.9567879438400269, "eval_runtime": 13.3891, "eval_samples_per_second": 3.436, "eval_steps_per_second": 3.436, "step": 32634 }, { "epoch": 42.00015444015444, "grad_norm": 0.00020218401914462447, "learning_rate": 1.7760617760617763e-06, "loss": 0.0728, "step": 32640 }, { "epoch": 42.00041184041184, "grad_norm": 0.00011965364683419466, "learning_rate": 1.7732017732017734e-06, "loss": 0.0005, "step": 32650 }, { "epoch": 42.00066924066924, "grad_norm": 0.00013755726104136556, "learning_rate": 1.7703417703417705e-06, "loss": 0.0001, "step": 32660 }, { "epoch": 42.00092664092664, "grad_norm": 0.003897932590916753, "learning_rate": 1.7674817674817676e-06, "loss": 0.5112, "step": 32670 }, { "epoch": 42.00118404118404, "grad_norm": 0.048069994896650314, "learning_rate": 1.7646217646217647e-06, "loss": 0.0002, "step": 32680 }, { "epoch": 42.00144144144144, "grad_norm": 0.00015302524843718857, "learning_rate": 1.7617617617617618e-06, "loss": 0.0002, "step": 32690 }, { "epoch": 42.00169884169884, "grad_norm": 0.0002129245549440384, "learning_rate": 1.758901758901759e-06, "loss": 0.0001, "step": 32700 }, { "epoch": 42.00195624195624, "grad_norm": 0.004387426655739546, "learning_rate": 1.756041756041756e-06, "loss": 0.0001, "step": 32710 }, { "epoch": 42.00221364221364, "grad_norm": 0.01488508377224207, "learning_rate": 1.7531817531817534e-06, "loss": 0.0001, "step": 32720 }, { "epoch": 42.00247104247104, "grad_norm": 0.00032001244835555553, "learning_rate": 1.7503217503217505e-06, "loss": 0.005, "step": 32730 }, { "epoch": 42.00272844272844, "grad_norm": 0.002208408433943987, "learning_rate": 1.7474617474617476e-06, "loss": 0.5717, "step": 32740 }, { "epoch": 42.002985842985844, "grad_norm": 0.00012605212396010756, "learning_rate": 1.7446017446017447e-06, "loss": 0.0, "step": 32750 }, { "epoch": 42.00324324324324, "grad_norm": 0.0014979569241404533, "learning_rate": 1.7417417417417418e-06, "loss": 0.0, "step": 32760 }, { "epoch": 42.003500643500644, "grad_norm": 0.003774981014430523, "learning_rate": 1.7388817388817389e-06, "loss": 0.0001, "step": 32770 }, { "epoch": 42.00375804375804, "grad_norm": 0.0028361384756863117, "learning_rate": 1.7360217360217362e-06, "loss": 0.7856, "step": 32780 }, { "epoch": 42.004015444015444, "grad_norm": 0.0033012754283845425, "learning_rate": 1.7331617331617335e-06, "loss": 0.0003, "step": 32790 }, { "epoch": 42.00427284427285, "grad_norm": 0.009487603791058064, "learning_rate": 1.7303017303017306e-06, "loss": 0.0003, "step": 32800 }, { "epoch": 42.004530244530244, "grad_norm": 0.00019623312982730567, "learning_rate": 1.7274417274417277e-06, "loss": 0.0001, "step": 32810 }, { "epoch": 42.00478764478765, "grad_norm": 0.00011614287359407172, "learning_rate": 1.7245817245817248e-06, "loss": 0.6143, "step": 32820 }, { "epoch": 42.005045045045044, "grad_norm": 0.0002354454481974244, "learning_rate": 1.721721721721722e-06, "loss": 0.6405, "step": 32830 }, { "epoch": 42.00530244530245, "grad_norm": 0.03378787264227867, "learning_rate": 1.718861718861719e-06, "loss": 0.0001, "step": 32840 }, { "epoch": 42.005559845559844, "grad_norm": 0.00012451726070139557, "learning_rate": 1.7160017160017161e-06, "loss": 0.0002, "step": 32850 }, { "epoch": 42.00581724581725, "grad_norm": 0.0002813052851706743, "learning_rate": 1.7131417131417132e-06, "loss": 0.0002, "step": 32860 }, { "epoch": 42.006074646074644, "grad_norm": 0.0054261996410787106, "learning_rate": 1.7102817102817103e-06, "loss": 0.0001, "step": 32870 }, { "epoch": 42.00633204633205, "grad_norm": 0.00012816753587685525, "learning_rate": 1.7074217074217074e-06, "loss": 0.0002, "step": 32880 }, { "epoch": 42.006589446589444, "grad_norm": 0.0003143631329294294, "learning_rate": 1.7045617045617048e-06, "loss": 0.0042, "step": 32890 }, { "epoch": 42.00684684684685, "grad_norm": 0.00046908739022910595, "learning_rate": 1.7017017017017019e-06, "loss": 0.781, "step": 32900 }, { "epoch": 42.007104247104245, "grad_norm": 0.03740377351641655, "learning_rate": 1.698841698841699e-06, "loss": 0.0001, "step": 32910 }, { "epoch": 42.00736164736165, "grad_norm": 0.0009596379823051393, "learning_rate": 1.695981695981696e-06, "loss": 0.0001, "step": 32920 }, { "epoch": 42.007619047619045, "grad_norm": 0.012743496336042881, "learning_rate": 1.6931216931216932e-06, "loss": 0.0001, "step": 32930 }, { "epoch": 42.00787644787645, "grad_norm": 0.002485014731064439, "learning_rate": 1.6902616902616903e-06, "loss": 0.6629, "step": 32940 }, { "epoch": 42.008133848133845, "grad_norm": 0.0100320503115654, "learning_rate": 1.6874016874016874e-06, "loss": 0.0003, "step": 32950 }, { "epoch": 42.00839124839125, "grad_norm": 0.010429958812892437, "learning_rate": 1.6845416845416845e-06, "loss": 0.0003, "step": 32960 }, { "epoch": 42.00864864864865, "grad_norm": 0.01115471962839365, "learning_rate": 1.681681681681682e-06, "loss": 0.0003, "step": 32970 }, { "epoch": 42.00890604890605, "grad_norm": 144.6180877685547, "learning_rate": 1.6788216788216791e-06, "loss": 0.7403, "step": 32980 }, { "epoch": 42.00916344916345, "grad_norm": 1396.6766357421875, "learning_rate": 1.6759616759616762e-06, "loss": 0.6216, "step": 32990 }, { "epoch": 42.00942084942085, "grad_norm": 0.017528045922517776, "learning_rate": 1.6731016731016733e-06, "loss": 0.0001, "step": 33000 }, { "epoch": 42.00967824967825, "grad_norm": 0.0125763900578022, "learning_rate": 1.6702416702416704e-06, "loss": 0.0001, "step": 33010 }, { "epoch": 42.00993564993565, "grad_norm": 0.0016611794708296657, "learning_rate": 1.6673816673816675e-06, "loss": 0.0, "step": 33020 }, { "epoch": 42.01019305019305, "grad_norm": 0.0032114211935549974, "learning_rate": 1.6645216645216646e-06, "loss": 0.808, "step": 33030 }, { "epoch": 42.01045045045045, "grad_norm": 0.04180116951465607, "learning_rate": 1.6616616616616618e-06, "loss": 0.0006, "step": 33040 }, { "epoch": 42.01070785070785, "grad_norm": 0.32980695366859436, "learning_rate": 1.6588016588016589e-06, "loss": 0.0002, "step": 33050 }, { "epoch": 42.01096525096525, "grad_norm": 0.008240937255322933, "learning_rate": 1.655941655941656e-06, "loss": 0.6066, "step": 33060 }, { "epoch": 42.01122265122265, "grad_norm": 0.00017696853319648653, "learning_rate": 1.6530816530816533e-06, "loss": 0.0, "step": 33070 }, { "epoch": 42.01148005148005, "grad_norm": 0.00016973470337688923, "learning_rate": 1.6502216502216504e-06, "loss": 0.0001, "step": 33080 }, { "epoch": 42.01173745173745, "grad_norm": 0.0002044615539489314, "learning_rate": 1.6473616473616475e-06, "loss": 0.0001, "step": 33090 }, { "epoch": 42.01199485199485, "grad_norm": 0.008063897490501404, "learning_rate": 1.6445016445016446e-06, "loss": 0.0001, "step": 33100 }, { "epoch": 42.01225225225225, "grad_norm": 0.00011532429925864562, "learning_rate": 1.6416416416416417e-06, "loss": 0.0001, "step": 33110 }, { "epoch": 42.01250965250965, "grad_norm": 0.054353538900613785, "learning_rate": 1.6387816387816388e-06, "loss": 0.0002, "step": 33120 }, { "epoch": 42.01276705276705, "grad_norm": 0.00019760456052608788, "learning_rate": 1.635921635921636e-06, "loss": 0.0001, "step": 33130 }, { "epoch": 42.01302445302445, "grad_norm": 0.052227336913347244, "learning_rate": 1.633061633061633e-06, "loss": 0.0002, "step": 33140 }, { "epoch": 42.01328185328185, "grad_norm": 0.08783034235239029, "learning_rate": 1.6302016302016305e-06, "loss": 0.4019, "step": 33150 }, { "epoch": 42.01353925353926, "grad_norm": 0.007176580838859081, "learning_rate": 1.6273416273416276e-06, "loss": 0.0001, "step": 33160 }, { "epoch": 42.01379665379665, "grad_norm": 0.14134147763252258, "learning_rate": 1.6244816244816247e-06, "loss": 0.0002, "step": 33170 }, { "epoch": 42.01405405405406, "grad_norm": 0.004731915425509214, "learning_rate": 1.6216216216216219e-06, "loss": 0.0001, "step": 33180 }, { "epoch": 42.01431145431145, "grad_norm": 0.0035558564122766256, "learning_rate": 1.618761618761619e-06, "loss": 0.7162, "step": 33190 }, { "epoch": 42.01456885456886, "grad_norm": 0.015174226835370064, "learning_rate": 1.615901615901616e-06, "loss": 0.0001, "step": 33200 }, { "epoch": 42.014826254826254, "grad_norm": 0.010268859565258026, "learning_rate": 1.6130416130416132e-06, "loss": 0.0002, "step": 33210 }, { "epoch": 42.01508365508366, "grad_norm": 0.00010535326146055013, "learning_rate": 1.6101816101816103e-06, "loss": 0.0005, "step": 33220 }, { "epoch": 42.015341055341054, "grad_norm": 0.00044961340609006584, "learning_rate": 1.6073216073216074e-06, "loss": 0.0001, "step": 33230 }, { "epoch": 42.01559845559846, "grad_norm": 0.00024218950420618057, "learning_rate": 1.6044616044616047e-06, "loss": 0.0001, "step": 33240 }, { "epoch": 42.015855855855854, "grad_norm": 0.013476267457008362, "learning_rate": 1.6016016016016018e-06, "loss": 0.0022, "step": 33250 }, { "epoch": 42.01611325611326, "grad_norm": 0.01417740061879158, "learning_rate": 1.598741598741599e-06, "loss": 0.0005, "step": 33260 }, { "epoch": 42.016370656370654, "grad_norm": 0.00013865696382708848, "learning_rate": 1.595881595881596e-06, "loss": 0.0001, "step": 33270 }, { "epoch": 42.01662805662806, "grad_norm": 0.009123004972934723, "learning_rate": 1.5930215930215931e-06, "loss": 0.0001, "step": 33280 }, { "epoch": 42.016885456885454, "grad_norm": 0.0426633283495903, "learning_rate": 1.5901615901615902e-06, "loss": 0.0001, "step": 33290 }, { "epoch": 42.01714285714286, "grad_norm": 0.00010858638415811583, "learning_rate": 1.5873015873015873e-06, "loss": 0.0001, "step": 33300 }, { "epoch": 42.017400257400254, "grad_norm": 0.01740678958594799, "learning_rate": 1.5844415844415844e-06, "loss": 0.6447, "step": 33310 }, { "epoch": 42.01765765765766, "grad_norm": 0.0025633275508880615, "learning_rate": 1.5815815815815815e-06, "loss": 0.738, "step": 33320 }, { "epoch": 42.017915057915054, "grad_norm": 0.06856297701597214, "learning_rate": 1.5787215787215786e-06, "loss": 0.0003, "step": 33330 }, { "epoch": 42.01817245817246, "grad_norm": 0.006150628440082073, "learning_rate": 1.5758615758615762e-06, "loss": 0.0001, "step": 33340 }, { "epoch": 42.01842985842986, "grad_norm": 0.0021094612311571836, "learning_rate": 1.5730015730015733e-06, "loss": 0.3257, "step": 33350 }, { "epoch": 42.01868725868726, "grad_norm": 0.003045695135369897, "learning_rate": 1.5701415701415704e-06, "loss": 0.0001, "step": 33360 }, { "epoch": 42.01894465894466, "grad_norm": 0.002286852104589343, "learning_rate": 1.5672815672815675e-06, "loss": 0.6679, "step": 33370 }, { "epoch": 42.01920205920206, "grad_norm": 0.002063462510704994, "learning_rate": 1.5644215644215646e-06, "loss": 0.0001, "step": 33380 }, { "epoch": 42.01945945945946, "grad_norm": 0.00013045952073298395, "learning_rate": 1.5615615615615617e-06, "loss": 0.6961, "step": 33390 }, { "epoch": 42.01971685971686, "grad_norm": 0.0018174780998378992, "learning_rate": 1.5587015587015588e-06, "loss": 0.1587, "step": 33400 }, { "epoch": 42.01997425997426, "grad_norm": 0.0012939365115016699, "learning_rate": 1.5558415558415559e-06, "loss": 0.0004, "step": 33410 }, { "epoch": 42.02, "eval_accuracy": 0.8695652173913043, "eval_loss": 0.9224266409873962, "eval_runtime": 13.4044, "eval_samples_per_second": 3.432, "eval_steps_per_second": 3.432, "step": 33411 }, { "epoch": 43.00023166023166, "grad_norm": 0.01454122643917799, "learning_rate": 1.5529815529815532e-06, "loss": 0.0002, "step": 33420 }, { "epoch": 43.00048906048906, "grad_norm": 0.011734708212316036, "learning_rate": 1.5501215501215503e-06, "loss": 0.0004, "step": 33430 }, { "epoch": 43.00074646074646, "grad_norm": 0.0002132106601493433, "learning_rate": 1.5472615472615474e-06, "loss": 0.6844, "step": 33440 }, { "epoch": 43.00100386100386, "grad_norm": 0.0011723071802407503, "learning_rate": 1.5444015444015445e-06, "loss": 0.0023, "step": 33450 }, { "epoch": 43.00126126126126, "grad_norm": 0.006746041122823954, "learning_rate": 1.5415415415415416e-06, "loss": 0.0006, "step": 33460 }, { "epoch": 43.00151866151866, "grad_norm": 0.0007373033440671861, "learning_rate": 1.5386815386815387e-06, "loss": 1.6536, "step": 33470 }, { "epoch": 43.00177606177606, "grad_norm": 0.00580536900088191, "learning_rate": 1.5358215358215358e-06, "loss": 0.0003, "step": 33480 }, { "epoch": 43.00203346203346, "grad_norm": 0.00019677856471389532, "learning_rate": 1.532961532961533e-06, "loss": 0.0002, "step": 33490 }, { "epoch": 43.00229086229086, "grad_norm": 0.006432530004531145, "learning_rate": 1.53010153010153e-06, "loss": 0.7492, "step": 33500 }, { "epoch": 43.00254826254826, "grad_norm": 0.003972894046455622, "learning_rate": 1.5272415272415271e-06, "loss": 0.0002, "step": 33510 }, { "epoch": 43.00280566280566, "grad_norm": 0.004245332442224026, "learning_rate": 1.5243815243815245e-06, "loss": 0.0001, "step": 33520 }, { "epoch": 43.00306306306306, "grad_norm": 0.06526504456996918, "learning_rate": 1.5215215215215218e-06, "loss": 0.0001, "step": 33530 }, { "epoch": 43.00332046332046, "grad_norm": 0.004649861715734005, "learning_rate": 1.5186615186615189e-06, "loss": 0.0002, "step": 33540 }, { "epoch": 43.00357786357787, "grad_norm": 9.96380767901428e-05, "learning_rate": 1.515801515801516e-06, "loss": 0.0005, "step": 33550 }, { "epoch": 43.00383526383526, "grad_norm": 0.005374431610107422, "learning_rate": 1.512941512941513e-06, "loss": 0.0002, "step": 33560 }, { "epoch": 43.00409266409267, "grad_norm": 0.00018989373347721994, "learning_rate": 1.5100815100815102e-06, "loss": 0.0, "step": 33570 }, { "epoch": 43.004350064350064, "grad_norm": 0.008197961375117302, "learning_rate": 1.5072215072215073e-06, "loss": 0.0001, "step": 33580 }, { "epoch": 43.00460746460747, "grad_norm": 0.012032076716423035, "learning_rate": 1.5043615043615046e-06, "loss": 0.0001, "step": 33590 }, { "epoch": 43.004864864864864, "grad_norm": 0.00011511553020682186, "learning_rate": 1.5015015015015017e-06, "loss": 0.0005, "step": 33600 }, { "epoch": 43.00512226512227, "grad_norm": 0.00012108176451874897, "learning_rate": 1.4986414986414988e-06, "loss": 0.0001, "step": 33610 }, { "epoch": 43.005379665379664, "grad_norm": 0.00024196848971769214, "learning_rate": 1.495781495781496e-06, "loss": 0.1988, "step": 33620 }, { "epoch": 43.00563706563707, "grad_norm": 9.511480311630294e-05, "learning_rate": 1.492921492921493e-06, "loss": 0.0001, "step": 33630 }, { "epoch": 43.005894465894464, "grad_norm": 0.0001018472685245797, "learning_rate": 1.4900614900614901e-06, "loss": 0.34, "step": 33640 }, { "epoch": 43.00615186615187, "grad_norm": 217.52088928222656, "learning_rate": 1.4872014872014872e-06, "loss": 0.5665, "step": 33650 }, { "epoch": 43.006409266409264, "grad_norm": 0.02129535749554634, "learning_rate": 1.4843414843414843e-06, "loss": 0.0062, "step": 33660 }, { "epoch": 43.00666666666667, "grad_norm": 0.049556683748960495, "learning_rate": 1.4814814814814815e-06, "loss": 0.0007, "step": 33670 }, { "epoch": 43.006924066924064, "grad_norm": 0.0052298568189144135, "learning_rate": 1.4786214786214786e-06, "loss": 0.0003, "step": 33680 }, { "epoch": 43.00718146718147, "grad_norm": 0.041593704372644424, "learning_rate": 1.4757614757614759e-06, "loss": 0.0003, "step": 33690 }, { "epoch": 43.007438867438864, "grad_norm": 0.00040065948269329965, "learning_rate": 1.472901472901473e-06, "loss": 0.3137, "step": 33700 }, { "epoch": 43.00769626769627, "grad_norm": 0.030675746500492096, "learning_rate": 1.4700414700414703e-06, "loss": 0.0003, "step": 33710 }, { "epoch": 43.007953667953664, "grad_norm": 0.0022851689718663692, "learning_rate": 1.4671814671814674e-06, "loss": 0.0, "step": 33720 }, { "epoch": 43.00821106821107, "grad_norm": 101.7066650390625, "learning_rate": 1.4643214643214645e-06, "loss": 1.5302, "step": 33730 }, { "epoch": 43.00846846846847, "grad_norm": 0.00015403941506519914, "learning_rate": 1.4614614614614616e-06, "loss": 0.0001, "step": 33740 }, { "epoch": 43.00872586872587, "grad_norm": 0.0009902218589559197, "learning_rate": 1.4586014586014587e-06, "loss": 0.0001, "step": 33750 }, { "epoch": 43.00898326898327, "grad_norm": 0.0025801286101341248, "learning_rate": 1.4557414557414558e-06, "loss": 0.0002, "step": 33760 }, { "epoch": 43.00924066924067, "grad_norm": 0.00010533163003856316, "learning_rate": 1.4528814528814531e-06, "loss": 0.0003, "step": 33770 }, { "epoch": 43.00949806949807, "grad_norm": 39.131736755371094, "learning_rate": 1.4500214500214502e-06, "loss": 0.0072, "step": 33780 }, { "epoch": 43.00975546975547, "grad_norm": 0.00011560120765352622, "learning_rate": 1.4471614471614473e-06, "loss": 0.0001, "step": 33790 }, { "epoch": 43.01001287001287, "grad_norm": 0.014448067173361778, "learning_rate": 1.4443014443014444e-06, "loss": 0.0001, "step": 33800 }, { "epoch": 43.01027027027027, "grad_norm": 0.05728612467646599, "learning_rate": 1.4414414414414416e-06, "loss": 0.0005, "step": 33810 }, { "epoch": 43.01052767052767, "grad_norm": 0.0005739001790061593, "learning_rate": 1.4385814385814387e-06, "loss": 0.0001, "step": 33820 }, { "epoch": 43.01078507078507, "grad_norm": 0.00013156767818145454, "learning_rate": 1.4357214357214358e-06, "loss": 0.0001, "step": 33830 }, { "epoch": 43.01104247104247, "grad_norm": 0.0010646074078977108, "learning_rate": 1.4328614328614329e-06, "loss": 0.0001, "step": 33840 }, { "epoch": 43.01129987129987, "grad_norm": 0.00016523963131476194, "learning_rate": 1.43000143000143e-06, "loss": 0.0001, "step": 33850 }, { "epoch": 43.01155727155727, "grad_norm": 0.01101844571530819, "learning_rate": 1.427141427141427e-06, "loss": 1.569, "step": 33860 }, { "epoch": 43.01181467181467, "grad_norm": 9.992035484174266e-05, "learning_rate": 1.4242814242814244e-06, "loss": 0.0001, "step": 33870 }, { "epoch": 43.01207207207207, "grad_norm": 0.0001401646004524082, "learning_rate": 1.4214214214214215e-06, "loss": 0.0004, "step": 33880 }, { "epoch": 43.01232947232947, "grad_norm": 0.027710143476724625, "learning_rate": 1.4185614185614186e-06, "loss": 0.0002, "step": 33890 }, { "epoch": 43.01258687258687, "grad_norm": 0.002572421682998538, "learning_rate": 1.415701415701416e-06, "loss": 0.002, "step": 33900 }, { "epoch": 43.012844272844276, "grad_norm": 0.002672908827662468, "learning_rate": 1.412841412841413e-06, "loss": 0.4745, "step": 33910 }, { "epoch": 43.01310167310167, "grad_norm": 0.00011232582619413733, "learning_rate": 1.4099814099814101e-06, "loss": 0.4356, "step": 33920 }, { "epoch": 43.01335907335908, "grad_norm": 0.010794050060212612, "learning_rate": 1.4071214071214072e-06, "loss": 0.0004, "step": 33930 }, { "epoch": 43.01361647361647, "grad_norm": 0.021403295919299126, "learning_rate": 1.4042614042614045e-06, "loss": 0.0001, "step": 33940 }, { "epoch": 43.01387387387388, "grad_norm": 0.003955175634473562, "learning_rate": 1.4014014014014016e-06, "loss": 0.0003, "step": 33950 }, { "epoch": 43.01413127413127, "grad_norm": 0.0001854293659562245, "learning_rate": 1.3985413985413988e-06, "loss": 0.0001, "step": 33960 }, { "epoch": 43.01438867438868, "grad_norm": 0.004186289384961128, "learning_rate": 1.3956813956813959e-06, "loss": 0.0, "step": 33970 }, { "epoch": 43.01464607464607, "grad_norm": 0.003534914692863822, "learning_rate": 1.392821392821393e-06, "loss": 0.0002, "step": 33980 }, { "epoch": 43.01490347490348, "grad_norm": 0.004942872561514378, "learning_rate": 1.38996138996139e-06, "loss": 0.0001, "step": 33990 }, { "epoch": 43.01516087516087, "grad_norm": 0.020380662754178047, "learning_rate": 1.3871013871013872e-06, "loss": 0.6943, "step": 34000 }, { "epoch": 43.01541827541828, "grad_norm": 0.013380438089370728, "learning_rate": 1.3842413842413843e-06, "loss": 0.0002, "step": 34010 }, { "epoch": 43.01567567567567, "grad_norm": 0.00011852156603708863, "learning_rate": 1.3813813813813814e-06, "loss": 0.0001, "step": 34020 }, { "epoch": 43.01593307593308, "grad_norm": 0.02771720103919506, "learning_rate": 1.3785213785213785e-06, "loss": 0.0558, "step": 34030 }, { "epoch": 43.016190476190474, "grad_norm": 0.004810509271919727, "learning_rate": 1.3756613756613758e-06, "loss": 0.0002, "step": 34040 }, { "epoch": 43.01644787644788, "grad_norm": 0.001095824409276247, "learning_rate": 1.372801372801373e-06, "loss": 0.0982, "step": 34050 }, { "epoch": 43.016705276705274, "grad_norm": 0.028301896527409554, "learning_rate": 1.36994136994137e-06, "loss": 0.0057, "step": 34060 }, { "epoch": 43.01696267696268, "grad_norm": 8.951701602200046e-05, "learning_rate": 1.3670813670813671e-06, "loss": 0.0, "step": 34070 }, { "epoch": 43.017220077220074, "grad_norm": 0.00012243095261510462, "learning_rate": 1.3642213642213642e-06, "loss": 0.0019, "step": 34080 }, { "epoch": 43.01747747747748, "grad_norm": 0.08369873464107513, "learning_rate": 1.3613613613613615e-06, "loss": 0.0001, "step": 34090 }, { "epoch": 43.01773487773488, "grad_norm": 0.12311464548110962, "learning_rate": 1.3585013585013586e-06, "loss": 0.0035, "step": 34100 }, { "epoch": 43.01799227799228, "grad_norm": 0.0011633503017947078, "learning_rate": 1.3556413556413557e-06, "loss": 0.0001, "step": 34110 }, { "epoch": 43.01824967824968, "grad_norm": 0.2869292199611664, "learning_rate": 1.352781352781353e-06, "loss": 0.6535, "step": 34120 }, { "epoch": 43.01850707850708, "grad_norm": 0.0009136784356087446, "learning_rate": 1.3499213499213502e-06, "loss": 0.0001, "step": 34130 }, { "epoch": 43.01876447876448, "grad_norm": 0.017915774136781693, "learning_rate": 1.3470613470613473e-06, "loss": 0.7692, "step": 34140 }, { "epoch": 43.01902187902188, "grad_norm": 0.013292906805872917, "learning_rate": 1.3442013442013444e-06, "loss": 0.1451, "step": 34150 }, { "epoch": 43.01927927927928, "grad_norm": 0.49672406911849976, "learning_rate": 1.3413413413413415e-06, "loss": 0.0002, "step": 34160 }, { "epoch": 43.01953667953668, "grad_norm": 0.018338019028306007, "learning_rate": 1.3384813384813386e-06, "loss": 0.3841, "step": 34170 }, { "epoch": 43.01979407979408, "grad_norm": 0.0016882263589650393, "learning_rate": 1.3356213356213357e-06, "loss": 0.0, "step": 34180 }, { "epoch": 43.02, "eval_accuracy": 0.8913043478260869, "eval_loss": 0.8144087791442871, "eval_runtime": 13.4412, "eval_samples_per_second": 3.422, "eval_steps_per_second": 3.422, "step": 34188 }, { "epoch": 44.00005148005148, "grad_norm": 0.00010920155182247981, "learning_rate": 1.3327613327613328e-06, "loss": 0.0001, "step": 34190 }, { "epoch": 44.00030888030888, "grad_norm": 0.00023170172062236816, "learning_rate": 1.32990132990133e-06, "loss": 0.0001, "step": 34200 }, { "epoch": 44.00056628056628, "grad_norm": 0.00012434182281140238, "learning_rate": 1.327041327041327e-06, "loss": 0.0, "step": 34210 }, { "epoch": 44.00082368082368, "grad_norm": 0.0024254657328128815, "learning_rate": 1.3241813241813243e-06, "loss": 0.0001, "step": 34220 }, { "epoch": 44.00108108108108, "grad_norm": 0.001370239071547985, "learning_rate": 1.3213213213213214e-06, "loss": 0.0002, "step": 34230 }, { "epoch": 44.00133848133848, "grad_norm": 0.0040776995010674, "learning_rate": 1.3184613184613185e-06, "loss": 0.0001, "step": 34240 }, { "epoch": 44.00159588159588, "grad_norm": 0.0010753278620541096, "learning_rate": 1.3156013156013156e-06, "loss": 0.0021, "step": 34250 }, { "epoch": 44.00185328185328, "grad_norm": 0.00011391542648198083, "learning_rate": 1.3127413127413127e-06, "loss": 0.0004, "step": 34260 }, { "epoch": 44.00211068211068, "grad_norm": 0.00021202664356678724, "learning_rate": 1.30988130988131e-06, "loss": 0.3768, "step": 34270 }, { "epoch": 44.00236808236808, "grad_norm": 0.0013385345228016376, "learning_rate": 1.3070213070213072e-06, "loss": 0.0, "step": 34280 }, { "epoch": 44.00262548262548, "grad_norm": 9.8246535344515e-05, "learning_rate": 1.3041613041613045e-06, "loss": 0.0001, "step": 34290 }, { "epoch": 44.002882882882886, "grad_norm": 0.00012314043124206364, "learning_rate": 1.3013013013013016e-06, "loss": 0.0, "step": 34300 }, { "epoch": 44.00314028314028, "grad_norm": 0.00023305923969019204, "learning_rate": 1.2984412984412987e-06, "loss": 0.0001, "step": 34310 }, { "epoch": 44.00339768339769, "grad_norm": 0.006541033275425434, "learning_rate": 1.2955812955812958e-06, "loss": 0.0002, "step": 34320 }, { "epoch": 44.00365508365508, "grad_norm": 0.008542906492948532, "learning_rate": 1.2927212927212929e-06, "loss": 0.0002, "step": 34330 }, { "epoch": 44.00391248391249, "grad_norm": 0.00016316522669512779, "learning_rate": 1.28986128986129e-06, "loss": 0.5992, "step": 34340 }, { "epoch": 44.00416988416988, "grad_norm": 1539.5146484375, "learning_rate": 1.287001287001287e-06, "loss": 0.1944, "step": 34350 }, { "epoch": 44.00442728442729, "grad_norm": 0.0014140707207843661, "learning_rate": 1.2841412841412842e-06, "loss": 0.0, "step": 34360 }, { "epoch": 44.00468468468468, "grad_norm": 0.00015174942382145673, "learning_rate": 1.2812812812812813e-06, "loss": 0.0, "step": 34370 }, { "epoch": 44.00494208494209, "grad_norm": 0.026701737195253372, "learning_rate": 1.2784212784212784e-06, "loss": 0.0, "step": 34380 }, { "epoch": 44.00519948519948, "grad_norm": 0.00013119878713041544, "learning_rate": 1.2755612755612757e-06, "loss": 0.0001, "step": 34390 }, { "epoch": 44.00545688545689, "grad_norm": 0.00010385591303929687, "learning_rate": 1.2727012727012728e-06, "loss": 0.0, "step": 34400 }, { "epoch": 44.005714285714284, "grad_norm": 0.0004606852598953992, "learning_rate": 1.26984126984127e-06, "loss": 0.0, "step": 34410 }, { "epoch": 44.00597168597169, "grad_norm": 0.002633896190673113, "learning_rate": 1.266981266981267e-06, "loss": 0.0001, "step": 34420 }, { "epoch": 44.006229086229084, "grad_norm": 0.001557589159347117, "learning_rate": 1.2641212641212641e-06, "loss": 0.0, "step": 34430 }, { "epoch": 44.00648648648649, "grad_norm": 0.03303082287311554, "learning_rate": 1.2612612612612613e-06, "loss": 0.0023, "step": 34440 }, { "epoch": 44.006743886743884, "grad_norm": 0.00011313861614326015, "learning_rate": 1.2584012584012584e-06, "loss": 0.0001, "step": 34450 }, { "epoch": 44.00700128700129, "grad_norm": 0.02742510288953781, "learning_rate": 1.2555412555412557e-06, "loss": 0.0001, "step": 34460 }, { "epoch": 44.007258687258684, "grad_norm": 0.0010636849328875542, "learning_rate": 1.252681252681253e-06, "loss": 0.0, "step": 34470 }, { "epoch": 44.00751608751609, "grad_norm": 9.581966878613457e-05, "learning_rate": 1.2498212498212499e-06, "loss": 0.0, "step": 34480 }, { "epoch": 44.00777348777349, "grad_norm": 1947.2777099609375, "learning_rate": 1.246961246961247e-06, "loss": 0.3187, "step": 34490 }, { "epoch": 44.00803088803089, "grad_norm": 0.00014478244702331722, "learning_rate": 1.244101244101244e-06, "loss": 0.6529, "step": 34500 }, { "epoch": 44.00828828828829, "grad_norm": 0.0012537414440885186, "learning_rate": 1.2412412412412414e-06, "loss": 0.0097, "step": 34510 }, { "epoch": 44.00854568854569, "grad_norm": 0.00011815100879175588, "learning_rate": 1.2383812383812385e-06, "loss": 0.4375, "step": 34520 }, { "epoch": 44.00880308880309, "grad_norm": 0.00031987254624255, "learning_rate": 1.2355212355212356e-06, "loss": 0.0001, "step": 34530 }, { "epoch": 44.00906048906049, "grad_norm": 0.0006125512300059199, "learning_rate": 1.2326612326612327e-06, "loss": 0.0002, "step": 34540 }, { "epoch": 44.00931788931789, "grad_norm": 8.829426224110648e-05, "learning_rate": 1.2298012298012298e-06, "loss": 0.6567, "step": 34550 }, { "epoch": 44.00957528957529, "grad_norm": 0.0020937060471624136, "learning_rate": 1.226941226941227e-06, "loss": 0.0001, "step": 34560 }, { "epoch": 44.00983268983269, "grad_norm": 0.0015685156686231494, "learning_rate": 1.2240812240812242e-06, "loss": 0.0, "step": 34570 }, { "epoch": 44.01009009009009, "grad_norm": 0.0005102535942569375, "learning_rate": 1.2212212212212213e-06, "loss": 1.1215, "step": 34580 }, { "epoch": 44.01034749034749, "grad_norm": 0.0015307324938476086, "learning_rate": 1.2183612183612185e-06, "loss": 0.0, "step": 34590 }, { "epoch": 44.01060489060489, "grad_norm": 0.00014208511856850237, "learning_rate": 1.2155012155012156e-06, "loss": 0.0038, "step": 34600 }, { "epoch": 44.01086229086229, "grad_norm": 0.04187508672475815, "learning_rate": 1.2126412126412129e-06, "loss": 0.6378, "step": 34610 }, { "epoch": 44.01111969111969, "grad_norm": 0.0010099312057718635, "learning_rate": 1.20978120978121e-06, "loss": 0.0, "step": 34620 }, { "epoch": 44.01137709137709, "grad_norm": 0.00010222556011285633, "learning_rate": 1.206921206921207e-06, "loss": 0.0, "step": 34630 }, { "epoch": 44.01163449163449, "grad_norm": 0.01367103774100542, "learning_rate": 1.2040612040612042e-06, "loss": 0.0, "step": 34640 }, { "epoch": 44.01189189189189, "grad_norm": 0.0009156797314062715, "learning_rate": 1.2012012012012013e-06, "loss": 0.0001, "step": 34650 }, { "epoch": 44.01214929214929, "grad_norm": 0.020239707082509995, "learning_rate": 1.1983411983411984e-06, "loss": 0.0, "step": 34660 }, { "epoch": 44.01240669240669, "grad_norm": 0.00010953854507533833, "learning_rate": 1.1954811954811955e-06, "loss": 0.0006, "step": 34670 }, { "epoch": 44.012664092664096, "grad_norm": 0.00010805286729009822, "learning_rate": 1.1926211926211926e-06, "loss": 0.6203, "step": 34680 }, { "epoch": 44.01292149292149, "grad_norm": 3.057513952255249, "learning_rate": 1.18976118976119e-06, "loss": 0.0004, "step": 34690 }, { "epoch": 44.013178893178896, "grad_norm": 40.827030181884766, "learning_rate": 1.186901186901187e-06, "loss": 0.0015, "step": 34700 }, { "epoch": 44.01343629343629, "grad_norm": 0.03450491279363632, "learning_rate": 1.1840411840411841e-06, "loss": 0.0001, "step": 34710 }, { "epoch": 44.013693693693696, "grad_norm": 0.00013904368097428232, "learning_rate": 1.1811811811811812e-06, "loss": 0.0, "step": 34720 }, { "epoch": 44.01395109395109, "grad_norm": 7.998216460691765e-05, "learning_rate": 1.1783211783211783e-06, "loss": 0.1652, "step": 34730 }, { "epoch": 44.014208494208496, "grad_norm": 0.00040033727418631315, "learning_rate": 1.1754611754611757e-06, "loss": 0.0001, "step": 34740 }, { "epoch": 44.01446589446589, "grad_norm": 0.00010231852502329275, "learning_rate": 1.1726011726011728e-06, "loss": 0.0005, "step": 34750 }, { "epoch": 44.0147232947233, "grad_norm": 0.0009837534744292498, "learning_rate": 1.1697411697411699e-06, "loss": 0.7573, "step": 34760 }, { "epoch": 44.01498069498069, "grad_norm": 0.003812544047832489, "learning_rate": 1.166881166881167e-06, "loss": 0.0025, "step": 34770 }, { "epoch": 44.0152380952381, "grad_norm": 0.0010106575209647417, "learning_rate": 1.164021164021164e-06, "loss": 0.0003, "step": 34780 }, { "epoch": 44.01549549549549, "grad_norm": 8.687885565450415e-05, "learning_rate": 1.1611611611611614e-06, "loss": 0.0001, "step": 34790 }, { "epoch": 44.0157528957529, "grad_norm": 0.0001750994415488094, "learning_rate": 1.1583011583011585e-06, "loss": 0.0001, "step": 34800 }, { "epoch": 44.01601029601029, "grad_norm": 0.0008143062004819512, "learning_rate": 1.1554411554411556e-06, "loss": 0.0001, "step": 34810 }, { "epoch": 44.0162676962677, "grad_norm": 0.0009398268884979188, "learning_rate": 1.1525811525811527e-06, "loss": 0.0001, "step": 34820 }, { "epoch": 44.01652509652509, "grad_norm": 0.0001232112990692258, "learning_rate": 1.1497211497211498e-06, "loss": 0.0001, "step": 34830 }, { "epoch": 44.0167824967825, "grad_norm": 0.006615647114813328, "learning_rate": 1.146861146861147e-06, "loss": 0.0567, "step": 34840 }, { "epoch": 44.01703989703989, "grad_norm": 0.0021121406462043524, "learning_rate": 1.144001144001144e-06, "loss": 0.0003, "step": 34850 }, { "epoch": 44.0172972972973, "grad_norm": 9.626035898691043e-05, "learning_rate": 1.1411411411411411e-06, "loss": 0.0001, "step": 34860 }, { "epoch": 44.0175546975547, "grad_norm": 0.00023816150496713817, "learning_rate": 1.1382811382811382e-06, "loss": 0.0001, "step": 34870 }, { "epoch": 44.0178120978121, "grad_norm": 0.00011487925803521648, "learning_rate": 1.1354211354211355e-06, "loss": 0.0002, "step": 34880 }, { "epoch": 44.0180694980695, "grad_norm": 0.009908772073686123, "learning_rate": 1.1325611325611326e-06, "loss": 1.6454, "step": 34890 }, { "epoch": 44.0183268983269, "grad_norm": 0.004898641724139452, "learning_rate": 1.1297011297011298e-06, "loss": 0.6973, "step": 34900 }, { "epoch": 44.0185842985843, "grad_norm": 9.289468289352953e-05, "learning_rate": 1.1268411268411269e-06, "loss": 0.0001, "step": 34910 }, { "epoch": 44.0188416988417, "grad_norm": 0.006554802879691124, "learning_rate": 1.1239811239811242e-06, "loss": 0.7225, "step": 34920 }, { "epoch": 44.0190990990991, "grad_norm": 0.002455679466947913, "learning_rate": 1.1211211211211213e-06, "loss": 0.0004, "step": 34930 }, { "epoch": 44.0193564993565, "grad_norm": 0.07329325377941132, "learning_rate": 1.1182611182611184e-06, "loss": 0.0002, "step": 34940 }, { "epoch": 44.0196138996139, "grad_norm": 0.00023317259910982102, "learning_rate": 1.1154011154011155e-06, "loss": 0.0001, "step": 34950 }, { "epoch": 44.0198712998713, "grad_norm": 0.04382844269275665, "learning_rate": 1.1125411125411126e-06, "loss": 0.0001, "step": 34960 }, { "epoch": 44.02, "eval_accuracy": 0.8913043478260869, "eval_loss": 0.9010905623435974, "eval_runtime": 13.3897, "eval_samples_per_second": 3.435, "eval_steps_per_second": 3.435, "step": 34965 }, { "epoch": 45.0001287001287, "grad_norm": 0.005866614170372486, "learning_rate": 1.1096811096811097e-06, "loss": 0.0001, "step": 34970 }, { "epoch": 45.0003861003861, "grad_norm": 0.00011965092562604696, "learning_rate": 1.106821106821107e-06, "loss": 0.1422, "step": 34980 }, { "epoch": 45.0006435006435, "grad_norm": 0.0015373146161437035, "learning_rate": 1.1039611039611041e-06, "loss": 0.0014, "step": 34990 }, { "epoch": 45.0009009009009, "grad_norm": 0.02903032675385475, "learning_rate": 1.1011011011011012e-06, "loss": 0.2738, "step": 35000 }, { "epoch": 45.0011583011583, "grad_norm": 0.00014115635713096708, "learning_rate": 1.0982410982410983e-06, "loss": 0.0451, "step": 35010 }, { "epoch": 45.0014157014157, "grad_norm": 0.00010962053056573495, "learning_rate": 1.0953810953810954e-06, "loss": 0.0, "step": 35020 }, { "epoch": 45.0016731016731, "grad_norm": 0.0038448001723736525, "learning_rate": 1.0925210925210925e-06, "loss": 0.0001, "step": 35030 }, { "epoch": 45.0019305019305, "grad_norm": 0.00015873707889113575, "learning_rate": 1.0896610896610896e-06, "loss": 0.0001, "step": 35040 }, { "epoch": 45.0021879021879, "grad_norm": 0.003100020345300436, "learning_rate": 1.0868010868010867e-06, "loss": 0.0, "step": 35050 }, { "epoch": 45.0024453024453, "grad_norm": 0.00020110583864152431, "learning_rate": 1.083941083941084e-06, "loss": 0.0, "step": 35060 }, { "epoch": 45.002702702702706, "grad_norm": 0.00012975893332622945, "learning_rate": 1.0810810810810812e-06, "loss": 0.0001, "step": 35070 }, { "epoch": 45.0029601029601, "grad_norm": 9.7593292593956e-05, "learning_rate": 1.0782210782210783e-06, "loss": 0.0006, "step": 35080 }, { "epoch": 45.003217503217506, "grad_norm": 0.02327113039791584, "learning_rate": 1.0753610753610756e-06, "loss": 0.728, "step": 35090 }, { "epoch": 45.0034749034749, "grad_norm": 0.03272142633795738, "learning_rate": 1.0725010725010727e-06, "loss": 0.0001, "step": 35100 }, { "epoch": 45.003732303732306, "grad_norm": 0.026449372991919518, "learning_rate": 1.0696410696410698e-06, "loss": 0.0001, "step": 35110 }, { "epoch": 45.0039897039897, "grad_norm": 0.0006191537249833345, "learning_rate": 1.066781066781067e-06, "loss": 0.0829, "step": 35120 }, { "epoch": 45.004247104247106, "grad_norm": 9.959912858903408e-05, "learning_rate": 1.063921063921064e-06, "loss": 0.0001, "step": 35130 }, { "epoch": 45.0045045045045, "grad_norm": 0.18045932054519653, "learning_rate": 1.0610610610610611e-06, "loss": 0.0001, "step": 35140 }, { "epoch": 45.00476190476191, "grad_norm": 0.0007838630117475986, "learning_rate": 1.0582010582010582e-06, "loss": 0.0, "step": 35150 }, { "epoch": 45.0050193050193, "grad_norm": 0.06128101423382759, "learning_rate": 1.0553410553410555e-06, "loss": 0.0001, "step": 35160 }, { "epoch": 45.00527670527671, "grad_norm": 0.018773818388581276, "learning_rate": 1.0524810524810526e-06, "loss": 0.0002, "step": 35170 }, { "epoch": 45.0055341055341, "grad_norm": 0.01248914934694767, "learning_rate": 1.0496210496210497e-06, "loss": 0.0005, "step": 35180 }, { "epoch": 45.00579150579151, "grad_norm": 0.0021200559567660093, "learning_rate": 1.0467610467610468e-06, "loss": 0.0, "step": 35190 }, { "epoch": 45.0060489060489, "grad_norm": 0.0001151402248069644, "learning_rate": 1.043901043901044e-06, "loss": 0.0002, "step": 35200 }, { "epoch": 45.00630630630631, "grad_norm": 0.00010334582475479692, "learning_rate": 1.041041041041041e-06, "loss": 0.0001, "step": 35210 }, { "epoch": 45.0065637065637, "grad_norm": 0.00017368808039464056, "learning_rate": 1.0381810381810382e-06, "loss": 0.7997, "step": 35220 }, { "epoch": 45.00682110682111, "grad_norm": 0.0025292395148426294, "learning_rate": 1.0353210353210355e-06, "loss": 1.6189, "step": 35230 }, { "epoch": 45.0070785070785, "grad_norm": 0.0001754663826432079, "learning_rate": 1.0324610324610326e-06, "loss": 0.0001, "step": 35240 }, { "epoch": 45.00733590733591, "grad_norm": 0.14910262823104858, "learning_rate": 1.0296010296010297e-06, "loss": 0.0002, "step": 35250 }, { "epoch": 45.00759330759331, "grad_norm": 0.00010118578211404383, "learning_rate": 1.0267410267410268e-06, "loss": 0.0594, "step": 35260 }, { "epoch": 45.00785070785071, "grad_norm": 0.001283423276618123, "learning_rate": 1.023881023881024e-06, "loss": 0.0001, "step": 35270 }, { "epoch": 45.00810810810811, "grad_norm": 0.0028013254050165415, "learning_rate": 1.0210210210210212e-06, "loss": 0.0, "step": 35280 }, { "epoch": 45.00836550836551, "grad_norm": 0.0008561990689486265, "learning_rate": 1.0181610181610183e-06, "loss": 0.0001, "step": 35290 }, { "epoch": 45.00862290862291, "grad_norm": 0.00010187633597524837, "learning_rate": 1.0153010153010154e-06, "loss": 0.0, "step": 35300 }, { "epoch": 45.00888030888031, "grad_norm": 0.008484124206006527, "learning_rate": 1.0124410124410125e-06, "loss": 0.8, "step": 35310 }, { "epoch": 45.00913770913771, "grad_norm": 0.0002765395911410451, "learning_rate": 1.0095810095810096e-06, "loss": 0.0, "step": 35320 }, { "epoch": 45.00939510939511, "grad_norm": 3.5977704524993896, "learning_rate": 1.0067210067210067e-06, "loss": 0.0006, "step": 35330 }, { "epoch": 45.00965250965251, "grad_norm": 0.0013253793586045504, "learning_rate": 1.0038610038610038e-06, "loss": 0.0525, "step": 35340 }, { "epoch": 45.00990990990991, "grad_norm": 0.002857430372387171, "learning_rate": 1.0010010010010011e-06, "loss": 0.0001, "step": 35350 }, { "epoch": 45.01016731016731, "grad_norm": 0.006012418307363987, "learning_rate": 9.981409981409983e-07, "loss": 0.0, "step": 35360 }, { "epoch": 45.01042471042471, "grad_norm": 0.046247564256191254, "learning_rate": 9.952809952809954e-07, "loss": 0.0004, "step": 35370 }, { "epoch": 45.01068211068211, "grad_norm": 0.0015133422566577792, "learning_rate": 9.924209924209925e-07, "loss": 0.0, "step": 35380 }, { "epoch": 45.01093951093951, "grad_norm": 0.0030911520589143038, "learning_rate": 9.895609895609896e-07, "loss": 0.0002, "step": 35390 }, { "epoch": 45.01119691119691, "grad_norm": 0.00036034543882124126, "learning_rate": 9.867009867009867e-07, "loss": 0.7626, "step": 35400 }, { "epoch": 45.01145431145431, "grad_norm": 0.0031866899225860834, "learning_rate": 9.83840983840984e-07, "loss": 0.0, "step": 35410 }, { "epoch": 45.01171171171171, "grad_norm": 0.009315420873463154, "learning_rate": 9.80980980980981e-07, "loss": 0.0001, "step": 35420 }, { "epoch": 45.011969111969115, "grad_norm": 0.00187424395699054, "learning_rate": 9.781209781209782e-07, "loss": 0.8233, "step": 35430 }, { "epoch": 45.01222651222651, "grad_norm": 0.007069937419146299, "learning_rate": 9.752609752609755e-07, "loss": 0.7884, "step": 35440 }, { "epoch": 45.012483912483916, "grad_norm": 0.00011888705921592191, "learning_rate": 9.724009724009726e-07, "loss": 0.0006, "step": 35450 }, { "epoch": 45.01274131274131, "grad_norm": 0.00011121502029709518, "learning_rate": 9.695409695409697e-07, "loss": 0.0017, "step": 35460 }, { "epoch": 45.012998712998716, "grad_norm": 0.00048588740173727274, "learning_rate": 9.666809666809668e-07, "loss": 0.0017, "step": 35470 }, { "epoch": 45.01325611325611, "grad_norm": 0.0035627170000225306, "learning_rate": 9.63820963820964e-07, "loss": 0.0001, "step": 35480 }, { "epoch": 45.013513513513516, "grad_norm": 0.0016204885905608535, "learning_rate": 9.60960960960961e-07, "loss": 0.0, "step": 35490 }, { "epoch": 45.01377091377091, "grad_norm": 1.0253617763519287, "learning_rate": 9.581009581009581e-07, "loss": 0.0004, "step": 35500 }, { "epoch": 45.014028314028316, "grad_norm": 0.7940694093704224, "learning_rate": 9.552409552409552e-07, "loss": 0.0002, "step": 35510 }, { "epoch": 45.01428571428571, "grad_norm": 8.757525210967287e-05, "learning_rate": 9.523809523809525e-07, "loss": 0.0, "step": 35520 }, { "epoch": 45.014543114543116, "grad_norm": 0.014474915340542793, "learning_rate": 9.495209495209496e-07, "loss": 0.0159, "step": 35530 }, { "epoch": 45.01480051480051, "grad_norm": 0.01842646673321724, "learning_rate": 9.466609466609468e-07, "loss": 0.0001, "step": 35540 }, { "epoch": 45.015057915057916, "grad_norm": 0.004527392331510782, "learning_rate": 9.438009438009439e-07, "loss": 0.0001, "step": 35550 }, { "epoch": 45.01531531531531, "grad_norm": 0.00041223526932299137, "learning_rate": 9.409409409409411e-07, "loss": 0.0, "step": 35560 }, { "epoch": 45.015572715572716, "grad_norm": 0.000709443585947156, "learning_rate": 9.380809380809382e-07, "loss": 0.0001, "step": 35570 }, { "epoch": 45.01583011583011, "grad_norm": 0.00010205066064372659, "learning_rate": 9.352209352209353e-07, "loss": 1.0062, "step": 35580 }, { "epoch": 45.01608751608752, "grad_norm": 0.0038576810620725155, "learning_rate": 9.323609323609324e-07, "loss": 0.0, "step": 35590 }, { "epoch": 45.01634491634491, "grad_norm": 0.0001053248270181939, "learning_rate": 9.295009295009295e-07, "loss": 0.3508, "step": 35600 }, { "epoch": 45.01660231660232, "grad_norm": 0.006075258832424879, "learning_rate": 9.266409266409267e-07, "loss": 0.0001, "step": 35610 }, { "epoch": 45.01685971685972, "grad_norm": 0.00010978049976984039, "learning_rate": 9.237809237809238e-07, "loss": 0.0001, "step": 35620 }, { "epoch": 45.01711711711712, "grad_norm": 0.0025920916814357042, "learning_rate": 9.20920920920921e-07, "loss": 0.0002, "step": 35630 }, { "epoch": 45.01737451737452, "grad_norm": 0.0029896306805312634, "learning_rate": 9.180609180609181e-07, "loss": 0.0, "step": 35640 }, { "epoch": 45.01763191763192, "grad_norm": 0.004719553980976343, "learning_rate": 9.152009152009153e-07, "loss": 0.0, "step": 35650 }, { "epoch": 45.01788931788932, "grad_norm": 0.01105885673314333, "learning_rate": 9.123409123409124e-07, "loss": 0.013, "step": 35660 }, { "epoch": 45.01814671814672, "grad_norm": 344.09503173828125, "learning_rate": 9.094809094809096e-07, "loss": 0.5786, "step": 35670 }, { "epoch": 45.01840411840412, "grad_norm": 0.00010508803825359792, "learning_rate": 9.066209066209067e-07, "loss": 0.0, "step": 35680 }, { "epoch": 45.01866151866152, "grad_norm": 0.0030339634977281094, "learning_rate": 9.037609037609038e-07, "loss": 0.0001, "step": 35690 }, { "epoch": 45.01891891891892, "grad_norm": 0.08380759507417679, "learning_rate": 9.00900900900901e-07, "loss": 0.0001, "step": 35700 }, { "epoch": 45.01917631917632, "grad_norm": 50.496097564697266, "learning_rate": 8.980408980408981e-07, "loss": 0.0095, "step": 35710 }, { "epoch": 45.01943371943372, "grad_norm": 0.00014803580415900797, "learning_rate": 8.951808951808953e-07, "loss": 0.0001, "step": 35720 }, { "epoch": 45.01969111969112, "grad_norm": 9.634919842937961e-05, "learning_rate": 8.923208923208925e-07, "loss": 0.0001, "step": 35730 }, { "epoch": 45.01994851994852, "grad_norm": 909.0189208984375, "learning_rate": 8.894608894608896e-07, "loss": 0.6661, "step": 35740 }, { "epoch": 45.02, "eval_accuracy": 0.8478260869565217, "eval_loss": 1.0514214038848877, "eval_runtime": 13.4683, "eval_samples_per_second": 3.415, "eval_steps_per_second": 3.415, "step": 35742 }, { "epoch": 46.00020592020592, "grad_norm": 0.007054031360894442, "learning_rate": 8.866008866008867e-07, "loss": 0.0001, "step": 35750 }, { "epoch": 46.00046332046332, "grad_norm": 0.004835424479097128, "learning_rate": 8.837408837408838e-07, "loss": 0.0002, "step": 35760 }, { "epoch": 46.00072072072072, "grad_norm": 0.00011531000927789137, "learning_rate": 8.808808808808809e-07, "loss": 0.0, "step": 35770 }, { "epoch": 46.00097812097812, "grad_norm": 0.001735526486299932, "learning_rate": 8.78020878020878e-07, "loss": 0.0002, "step": 35780 }, { "epoch": 46.00123552123552, "grad_norm": 0.0001596819784026593, "learning_rate": 8.751608751608752e-07, "loss": 0.0, "step": 35790 }, { "epoch": 46.00149292149292, "grad_norm": 0.0007862728089094162, "learning_rate": 8.723008723008723e-07, "loss": 0.0001, "step": 35800 }, { "epoch": 46.00175032175032, "grad_norm": 0.00195333338342607, "learning_rate": 8.694408694408694e-07, "loss": 0.4595, "step": 35810 }, { "epoch": 46.002007722007725, "grad_norm": 0.03994870185852051, "learning_rate": 8.665808665808668e-07, "loss": 0.0002, "step": 35820 }, { "epoch": 46.00226512226512, "grad_norm": 0.012519776821136475, "learning_rate": 8.637208637208639e-07, "loss": 0.0, "step": 35830 }, { "epoch": 46.002522522522526, "grad_norm": 0.004235388711094856, "learning_rate": 8.60860860860861e-07, "loss": 0.0, "step": 35840 }, { "epoch": 46.00277992277992, "grad_norm": 0.00017162068979814649, "learning_rate": 8.580008580008581e-07, "loss": 0.4289, "step": 35850 }, { "epoch": 46.003037323037326, "grad_norm": 0.00011935268412344158, "learning_rate": 8.551408551408552e-07, "loss": 0.0002, "step": 35860 }, { "epoch": 46.00329472329472, "grad_norm": 0.00021309407020453364, "learning_rate": 8.522808522808524e-07, "loss": 0.0001, "step": 35870 }, { "epoch": 46.003552123552126, "grad_norm": 0.03302145376801491, "learning_rate": 8.494208494208495e-07, "loss": 0.0002, "step": 35880 }, { "epoch": 46.00380952380952, "grad_norm": 0.00010466259845998138, "learning_rate": 8.465608465608466e-07, "loss": 0.0001, "step": 35890 }, { "epoch": 46.004066924066926, "grad_norm": 0.0013877922901883721, "learning_rate": 8.437008437008437e-07, "loss": 0.0001, "step": 35900 }, { "epoch": 46.00432432432432, "grad_norm": 0.006681135855615139, "learning_rate": 8.40840840840841e-07, "loss": 0.0001, "step": 35910 }, { "epoch": 46.004581724581726, "grad_norm": 0.00010747831402113661, "learning_rate": 8.379808379808381e-07, "loss": 0.0, "step": 35920 }, { "epoch": 46.00483912483912, "grad_norm": 0.002587482100352645, "learning_rate": 8.351208351208352e-07, "loss": 0.0, "step": 35930 }, { "epoch": 46.005096525096526, "grad_norm": 0.00039881718112155795, "learning_rate": 8.322608322608323e-07, "loss": 0.495, "step": 35940 }, { "epoch": 46.00535392535392, "grad_norm": 8.408023131778464e-05, "learning_rate": 8.294008294008294e-07, "loss": 0.0, "step": 35950 }, { "epoch": 46.005611325611326, "grad_norm": 0.007310986518859863, "learning_rate": 8.265408265408266e-07, "loss": 0.0003, "step": 35960 }, { "epoch": 46.00586872586872, "grad_norm": 0.0002170134976040572, "learning_rate": 8.236808236808237e-07, "loss": 0.8696, "step": 35970 }, { "epoch": 46.00612612612613, "grad_norm": 0.007625481579452753, "learning_rate": 8.208208208208208e-07, "loss": 0.0001, "step": 35980 }, { "epoch": 46.00638352638352, "grad_norm": 8.718504977878183e-05, "learning_rate": 8.17960817960818e-07, "loss": 0.0001, "step": 35990 }, { "epoch": 46.00664092664093, "grad_norm": 0.00394862936809659, "learning_rate": 8.151008151008153e-07, "loss": 0.0002, "step": 36000 }, { "epoch": 46.00689832689833, "grad_norm": 9.449994831811637e-05, "learning_rate": 8.122408122408124e-07, "loss": 0.2956, "step": 36010 }, { "epoch": 46.00715572715573, "grad_norm": 0.004683338571339846, "learning_rate": 8.093808093808095e-07, "loss": 0.0001, "step": 36020 }, { "epoch": 46.00741312741313, "grad_norm": 0.00012272816093172878, "learning_rate": 8.065208065208066e-07, "loss": 0.0003, "step": 36030 }, { "epoch": 46.00767052767053, "grad_norm": 0.061585329473018646, "learning_rate": 8.036608036608037e-07, "loss": 0.0, "step": 36040 }, { "epoch": 46.00792792792793, "grad_norm": 6465.0048828125, "learning_rate": 8.008008008008009e-07, "loss": 0.3987, "step": 36050 }, { "epoch": 46.00818532818533, "grad_norm": 0.0032067273277789354, "learning_rate": 7.97940797940798e-07, "loss": 0.0, "step": 36060 }, { "epoch": 46.00844272844273, "grad_norm": 0.004086290951818228, "learning_rate": 7.950807950807951e-07, "loss": 0.9153, "step": 36070 }, { "epoch": 46.00870012870013, "grad_norm": 0.24981355667114258, "learning_rate": 7.922207922207922e-07, "loss": 0.0001, "step": 36080 }, { "epoch": 46.00895752895753, "grad_norm": 0.027362490072846413, "learning_rate": 7.893607893607893e-07, "loss": 0.0001, "step": 36090 }, { "epoch": 46.00921492921493, "grad_norm": 0.008485764265060425, "learning_rate": 7.865007865007866e-07, "loss": 0.187, "step": 36100 }, { "epoch": 46.00947232947233, "grad_norm": 0.00039889125037007034, "learning_rate": 7.836407836407837e-07, "loss": 0.0001, "step": 36110 }, { "epoch": 46.00972972972973, "grad_norm": 0.10861273854970932, "learning_rate": 7.807807807807808e-07, "loss": 0.0002, "step": 36120 }, { "epoch": 46.00998712998713, "grad_norm": 0.003896931419149041, "learning_rate": 7.779207779207779e-07, "loss": 0.0, "step": 36130 }, { "epoch": 46.01024453024453, "grad_norm": 0.001219980069436133, "learning_rate": 7.750607750607752e-07, "loss": 0.0001, "step": 36140 }, { "epoch": 46.01050193050193, "grad_norm": 0.00012802051787730306, "learning_rate": 7.722007722007723e-07, "loss": 0.0001, "step": 36150 }, { "epoch": 46.01075933075933, "grad_norm": 0.001733460114337504, "learning_rate": 7.693407693407694e-07, "loss": 0.0, "step": 36160 }, { "epoch": 46.01101673101673, "grad_norm": 0.01749817654490471, "learning_rate": 7.664807664807665e-07, "loss": 0.0002, "step": 36170 }, { "epoch": 46.01127413127413, "grad_norm": 0.02637387067079544, "learning_rate": 7.636207636207636e-07, "loss": 0.0001, "step": 36180 }, { "epoch": 46.01153153153153, "grad_norm": 0.0005767493858002126, "learning_rate": 7.607607607607609e-07, "loss": 0.0001, "step": 36190 }, { "epoch": 46.011788931788935, "grad_norm": 0.00018126668874174356, "learning_rate": 7.57900757900758e-07, "loss": 0.003, "step": 36200 }, { "epoch": 46.01204633204633, "grad_norm": 0.00014345711679197848, "learning_rate": 7.550407550407551e-07, "loss": 0.0001, "step": 36210 }, { "epoch": 46.012303732303735, "grad_norm": 0.009981050156056881, "learning_rate": 7.521807521807523e-07, "loss": 0.0001, "step": 36220 }, { "epoch": 46.01256113256113, "grad_norm": 0.00011738213652279228, "learning_rate": 7.493207493207494e-07, "loss": 0.0, "step": 36230 }, { "epoch": 46.012818532818535, "grad_norm": 1049.6097412109375, "learning_rate": 7.464607464607465e-07, "loss": 1.1861, "step": 36240 }, { "epoch": 46.01307593307593, "grad_norm": 187.1683349609375, "learning_rate": 7.436007436007436e-07, "loss": 0.787, "step": 36250 }, { "epoch": 46.013333333333335, "grad_norm": 0.0012502912431955338, "learning_rate": 7.407407407407407e-07, "loss": 0.0389, "step": 36260 }, { "epoch": 46.01359073359073, "grad_norm": 0.00012157708260929212, "learning_rate": 7.378807378807379e-07, "loss": 0.0001, "step": 36270 }, { "epoch": 46.013848133848136, "grad_norm": 0.002692732261493802, "learning_rate": 7.350207350207351e-07, "loss": 0.0001, "step": 36280 }, { "epoch": 46.01410553410553, "grad_norm": 0.0001023231161525473, "learning_rate": 7.321607321607323e-07, "loss": 0.9306, "step": 36290 }, { "epoch": 46.014362934362936, "grad_norm": 0.10785011947154999, "learning_rate": 7.293007293007294e-07, "loss": 0.0001, "step": 36300 }, { "epoch": 46.01462033462033, "grad_norm": 0.004624065011739731, "learning_rate": 7.264407264407266e-07, "loss": 0.0001, "step": 36310 }, { "epoch": 46.014877734877736, "grad_norm": 0.0320514477789402, "learning_rate": 7.235807235807237e-07, "loss": 0.6644, "step": 36320 }, { "epoch": 46.01513513513513, "grad_norm": 0.07943771779537201, "learning_rate": 7.207207207207208e-07, "loss": 0.0001, "step": 36330 }, { "epoch": 46.015392535392536, "grad_norm": 0.0013853806303814054, "learning_rate": 7.178607178607179e-07, "loss": 0.1372, "step": 36340 }, { "epoch": 46.01564993564993, "grad_norm": 0.00339730572886765, "learning_rate": 7.15000715000715e-07, "loss": 0.0, "step": 36350 }, { "epoch": 46.015907335907336, "grad_norm": 0.00013834710989613086, "learning_rate": 7.121407121407122e-07, "loss": 0.0002, "step": 36360 }, { "epoch": 46.01616473616474, "grad_norm": 0.01571325585246086, "learning_rate": 7.092807092807093e-07, "loss": 0.0001, "step": 36370 }, { "epoch": 46.016422136422136, "grad_norm": 0.00020861165830865502, "learning_rate": 7.064207064207065e-07, "loss": 0.0002, "step": 36380 }, { "epoch": 46.01667953667954, "grad_norm": 30.054359436035156, "learning_rate": 7.035607035607036e-07, "loss": 0.7005, "step": 36390 }, { "epoch": 46.016936936936936, "grad_norm": 9.627743565943092e-05, "learning_rate": 7.007007007007008e-07, "loss": 0.0007, "step": 36400 }, { "epoch": 46.01719433719434, "grad_norm": 0.005623129662126303, "learning_rate": 6.978406978406979e-07, "loss": 0.0, "step": 36410 }, { "epoch": 46.01745173745174, "grad_norm": 0.04780065640807152, "learning_rate": 6.94980694980695e-07, "loss": 0.0001, "step": 36420 }, { "epoch": 46.01770913770914, "grad_norm": 0.00010322517482563853, "learning_rate": 6.921206921206921e-07, "loss": 0.0002, "step": 36430 }, { "epoch": 46.01796653796654, "grad_norm": 0.0031095435842871666, "learning_rate": 6.892606892606892e-07, "loss": 0.7759, "step": 36440 }, { "epoch": 46.01822393822394, "grad_norm": 0.00010584430856397375, "learning_rate": 6.864006864006865e-07, "loss": 0.0, "step": 36450 }, { "epoch": 46.01848133848134, "grad_norm": 0.00017411337466910481, "learning_rate": 6.835406835406836e-07, "loss": 0.0, "step": 36460 }, { "epoch": 46.01873873873874, "grad_norm": 0.008927185088396072, "learning_rate": 6.806806806806808e-07, "loss": 0.0001, "step": 36470 }, { "epoch": 46.01899613899614, "grad_norm": 0.0015496767591685057, "learning_rate": 6.778206778206779e-07, "loss": 0.0001, "step": 36480 }, { "epoch": 46.01925353925354, "grad_norm": 0.0566856823861599, "learning_rate": 6.749606749606751e-07, "loss": 0.2111, "step": 36490 }, { "epoch": 46.01951093951094, "grad_norm": 0.00018851790810003877, "learning_rate": 6.721006721006722e-07, "loss": 0.0005, "step": 36500 }, { "epoch": 46.01976833976834, "grad_norm": 0.005153292324393988, "learning_rate": 6.692406692406693e-07, "loss": 0.0001, "step": 36510 }, { "epoch": 46.02, "eval_accuracy": 0.8913043478260869, "eval_loss": 0.8273398280143738, "eval_runtime": 13.4638, "eval_samples_per_second": 3.417, "eval_steps_per_second": 3.417, "step": 36519 }, { "epoch": 47.00002574002574, "grad_norm": 0.00952563900500536, "learning_rate": 6.663806663806664e-07, "loss": 0.1634, "step": 36520 }, { "epoch": 47.00028314028314, "grad_norm": 0.05393436178565025, "learning_rate": 6.635206635206635e-07, "loss": 0.0001, "step": 36530 }, { "epoch": 47.00054054054054, "grad_norm": 0.07490912079811096, "learning_rate": 6.606606606606607e-07, "loss": 0.0001, "step": 36540 }, { "epoch": 47.00079794079794, "grad_norm": 8.754934242460877e-05, "learning_rate": 6.578006578006578e-07, "loss": 0.0, "step": 36550 }, { "epoch": 47.00105534105534, "grad_norm": 0.003464779816567898, "learning_rate": 6.54940654940655e-07, "loss": 0.0001, "step": 36560 }, { "epoch": 47.00131274131274, "grad_norm": 0.00010122011735802516, "learning_rate": 6.520806520806522e-07, "loss": 0.0002, "step": 36570 }, { "epoch": 47.00157014157014, "grad_norm": 199.99314880371094, "learning_rate": 6.492206492206493e-07, "loss": 0.7026, "step": 36580 }, { "epoch": 47.001827541827545, "grad_norm": 0.0013138225767761469, "learning_rate": 6.463606463606464e-07, "loss": 0.0, "step": 36590 }, { "epoch": 47.00208494208494, "grad_norm": 0.00027888649492524564, "learning_rate": 6.435006435006435e-07, "loss": 0.0002, "step": 36600 }, { "epoch": 47.002342342342345, "grad_norm": 380.2559814453125, "learning_rate": 6.406406406406407e-07, "loss": 0.7617, "step": 36610 }, { "epoch": 47.00259974259974, "grad_norm": 0.0002311421267222613, "learning_rate": 6.377806377806379e-07, "loss": 0.0, "step": 36620 }, { "epoch": 47.002857142857145, "grad_norm": 0.002484462922438979, "learning_rate": 6.34920634920635e-07, "loss": 0.0001, "step": 36630 }, { "epoch": 47.00311454311454, "grad_norm": 0.3317424952983856, "learning_rate": 6.320606320606321e-07, "loss": 0.0001, "step": 36640 }, { "epoch": 47.003371943371945, "grad_norm": 0.0013046682579442859, "learning_rate": 6.292006292006292e-07, "loss": 0.0, "step": 36650 }, { "epoch": 47.00362934362934, "grad_norm": 0.0008944895234890282, "learning_rate": 6.263406263406265e-07, "loss": 0.4445, "step": 36660 }, { "epoch": 47.003886743886746, "grad_norm": 0.01090352050960064, "learning_rate": 6.234806234806235e-07, "loss": 0.0001, "step": 36670 }, { "epoch": 47.00414414414414, "grad_norm": 0.00010999095684383065, "learning_rate": 6.206206206206207e-07, "loss": 0.0001, "step": 36680 }, { "epoch": 47.004401544401546, "grad_norm": 0.0012661719229072332, "learning_rate": 6.177606177606178e-07, "loss": 0.0002, "step": 36690 }, { "epoch": 47.00465894465894, "grad_norm": 0.003985162358731031, "learning_rate": 6.149006149006149e-07, "loss": 0.0, "step": 36700 }, { "epoch": 47.004916344916346, "grad_norm": 0.00013246751041151583, "learning_rate": 6.120406120406121e-07, "loss": 0.0035, "step": 36710 }, { "epoch": 47.00517374517374, "grad_norm": 0.027062317356467247, "learning_rate": 6.091806091806092e-07, "loss": 0.5117, "step": 36720 }, { "epoch": 47.005431145431146, "grad_norm": 0.004527709446847439, "learning_rate": 6.063206063206064e-07, "loss": 0.001, "step": 36730 }, { "epoch": 47.00568854568854, "grad_norm": 0.00012451779912225902, "learning_rate": 6.034606034606035e-07, "loss": 0.0001, "step": 36740 }, { "epoch": 47.005945945945946, "grad_norm": 0.08242174237966537, "learning_rate": 6.006006006006006e-07, "loss": 0.0001, "step": 36750 }, { "epoch": 47.00620334620335, "grad_norm": 0.0005578892887569964, "learning_rate": 5.977405977405978e-07, "loss": 0.0001, "step": 36760 }, { "epoch": 47.006460746460746, "grad_norm": 0.0012626610696315765, "learning_rate": 5.94880594880595e-07, "loss": 0.298, "step": 36770 }, { "epoch": 47.00671814671815, "grad_norm": 0.00010453144204802811, "learning_rate": 5.920205920205921e-07, "loss": 0.0001, "step": 36780 }, { "epoch": 47.006975546975546, "grad_norm": 0.000996722374111414, "learning_rate": 5.891605891605892e-07, "loss": 0.0, "step": 36790 }, { "epoch": 47.00723294723295, "grad_norm": 9.595556184649467e-05, "learning_rate": 5.863005863005864e-07, "loss": 0.0027, "step": 36800 }, { "epoch": 47.00749034749035, "grad_norm": 1.0915254354476929, "learning_rate": 5.834405834405835e-07, "loss": 0.0003, "step": 36810 }, { "epoch": 47.00774774774775, "grad_norm": 0.00010088778799399734, "learning_rate": 5.805805805805807e-07, "loss": 0.3989, "step": 36820 }, { "epoch": 47.00800514800515, "grad_norm": 0.00810331478714943, "learning_rate": 5.777205777205778e-07, "loss": 0.0001, "step": 36830 }, { "epoch": 47.00826254826255, "grad_norm": 7.67952369642444e-05, "learning_rate": 5.748605748605749e-07, "loss": 0.0001, "step": 36840 }, { "epoch": 47.00851994851995, "grad_norm": 0.0010103165404871106, "learning_rate": 5.72000572000572e-07, "loss": 0.0, "step": 36850 }, { "epoch": 47.00877734877735, "grad_norm": 0.007856501266360283, "learning_rate": 5.691405691405691e-07, "loss": 0.0, "step": 36860 }, { "epoch": 47.00903474903475, "grad_norm": 0.0008229284430854023, "learning_rate": 5.662805662805663e-07, "loss": 0.0001, "step": 36870 }, { "epoch": 47.00929214929215, "grad_norm": 0.0014737971359863877, "learning_rate": 5.634205634205634e-07, "loss": 0.6789, "step": 36880 }, { "epoch": 47.00954954954955, "grad_norm": 0.10162188857793808, "learning_rate": 5.605605605605606e-07, "loss": 0.0001, "step": 36890 }, { "epoch": 47.00980694980695, "grad_norm": 0.00013945061073172837, "learning_rate": 5.577005577005577e-07, "loss": 0.5182, "step": 36900 }, { "epoch": 47.01006435006435, "grad_norm": 0.0001227227330673486, "learning_rate": 5.548405548405548e-07, "loss": 0.0, "step": 36910 }, { "epoch": 47.01032175032175, "grad_norm": 9.417844557901844e-05, "learning_rate": 5.519805519805521e-07, "loss": 0.0151, "step": 36920 }, { "epoch": 47.01057915057915, "grad_norm": 0.020008182153105736, "learning_rate": 5.491205491205492e-07, "loss": 0.0005, "step": 36930 }, { "epoch": 47.01083655083655, "grad_norm": 0.00020941771799698472, "learning_rate": 5.462605462605463e-07, "loss": 0.0013, "step": 36940 }, { "epoch": 47.011093951093955, "grad_norm": 0.05506380647420883, "learning_rate": 5.434005434005434e-07, "loss": 0.0001, "step": 36950 }, { "epoch": 47.01135135135135, "grad_norm": 0.0007317409035749733, "learning_rate": 5.405405405405406e-07, "loss": 0.0003, "step": 36960 }, { "epoch": 47.011608751608755, "grad_norm": 0.005294485948979855, "learning_rate": 5.376805376805378e-07, "loss": 0.116, "step": 36970 }, { "epoch": 47.01186615186615, "grad_norm": 9.708470315672457e-05, "learning_rate": 5.348205348205349e-07, "loss": 0.0001, "step": 36980 }, { "epoch": 47.012123552123555, "grad_norm": 0.0071139405481517315, "learning_rate": 5.31960531960532e-07, "loss": 0.0, "step": 36990 }, { "epoch": 47.01238095238095, "grad_norm": 0.002544434741139412, "learning_rate": 5.291005291005291e-07, "loss": 0.0, "step": 37000 }, { "epoch": 47.012638352638355, "grad_norm": 0.06576728820800781, "learning_rate": 5.262405262405263e-07, "loss": 0.0001, "step": 37010 }, { "epoch": 47.01289575289575, "grad_norm": 7.955427281558514e-05, "learning_rate": 5.233805233805234e-07, "loss": 0.0001, "step": 37020 }, { "epoch": 47.013153153153155, "grad_norm": 9.721639071358368e-05, "learning_rate": 5.205205205205205e-07, "loss": 0.0, "step": 37030 }, { "epoch": 47.01341055341055, "grad_norm": 0.049452438950538635, "learning_rate": 5.176605176605177e-07, "loss": 0.0, "step": 37040 }, { "epoch": 47.013667953667955, "grad_norm": 0.00011968828039243817, "learning_rate": 5.148005148005148e-07, "loss": 0.0001, "step": 37050 }, { "epoch": 47.01392535392535, "grad_norm": 0.0015481059672310948, "learning_rate": 5.11940511940512e-07, "loss": 0.0, "step": 37060 }, { "epoch": 47.014182754182755, "grad_norm": 0.0013427004450932145, "learning_rate": 5.090805090805092e-07, "loss": 0.9195, "step": 37070 }, { "epoch": 47.01444015444015, "grad_norm": 0.001583964447490871, "learning_rate": 5.062205062205063e-07, "loss": 0.0001, "step": 37080 }, { "epoch": 47.014697554697555, "grad_norm": 0.014870808459818363, "learning_rate": 5.033605033605034e-07, "loss": 0.735, "step": 37090 }, { "epoch": 47.01495495495495, "grad_norm": 9.150934783974662e-05, "learning_rate": 5.005005005005006e-07, "loss": 0.0001, "step": 37100 }, { "epoch": 47.015212355212356, "grad_norm": 0.009138006716966629, "learning_rate": 4.976404976404977e-07, "loss": 0.0, "step": 37110 }, { "epoch": 47.01546975546975, "grad_norm": 0.00016449633403681219, "learning_rate": 4.947804947804948e-07, "loss": 0.0, "step": 37120 }, { "epoch": 47.015727155727156, "grad_norm": 0.001054841559380293, "learning_rate": 4.91920491920492e-07, "loss": 0.0405, "step": 37130 }, { "epoch": 47.01598455598456, "grad_norm": 0.00013857490557711571, "learning_rate": 4.890604890604891e-07, "loss": 0.0001, "step": 37140 }, { "epoch": 47.016241956241956, "grad_norm": 0.0011062290286645293, "learning_rate": 4.862004862004863e-07, "loss": 0.0001, "step": 37150 }, { "epoch": 47.01649935649936, "grad_norm": 0.0005065145669505, "learning_rate": 4.833404833404834e-07, "loss": 1.0933, "step": 37160 }, { "epoch": 47.016756756756756, "grad_norm": 0.029881233349442482, "learning_rate": 4.804804804804805e-07, "loss": 0.0, "step": 37170 }, { "epoch": 47.01701415701416, "grad_norm": 0.009099447168409824, "learning_rate": 4.776204776204776e-07, "loss": 0.0002, "step": 37180 }, { "epoch": 47.017271557271556, "grad_norm": 0.003578891744837165, "learning_rate": 4.747604747604748e-07, "loss": 0.7147, "step": 37190 }, { "epoch": 47.01752895752896, "grad_norm": 0.0010232250206172466, "learning_rate": 4.7190047190047194e-07, "loss": 0.0001, "step": 37200 }, { "epoch": 47.017786357786356, "grad_norm": 0.00027414108626544476, "learning_rate": 4.690404690404691e-07, "loss": 0.0002, "step": 37210 }, { "epoch": 47.01804375804376, "grad_norm": 0.07519324123859406, "learning_rate": 4.661804661804662e-07, "loss": 0.0, "step": 37220 }, { "epoch": 47.018301158301156, "grad_norm": 0.00012404580775182694, "learning_rate": 4.6332046332046336e-07, "loss": 0.0001, "step": 37230 }, { "epoch": 47.01855855855856, "grad_norm": 0.0008826501434668899, "learning_rate": 4.604604604604605e-07, "loss": 0.0, "step": 37240 }, { "epoch": 47.018815958815956, "grad_norm": 0.0012050550431013107, "learning_rate": 4.5760045760045767e-07, "loss": 0.0001, "step": 37250 }, { "epoch": 47.01907335907336, "grad_norm": 6.927676440682262e-05, "learning_rate": 4.547404547404548e-07, "loss": 0.7524, "step": 37260 }, { "epoch": 47.01933075933076, "grad_norm": 0.00022927786631044, "learning_rate": 4.518804518804519e-07, "loss": 0.2072, "step": 37270 }, { "epoch": 47.01958815958816, "grad_norm": 0.0018792460905387998, "learning_rate": 4.4902044902044904e-07, "loss": 0.0001, "step": 37280 }, { "epoch": 47.01984555984556, "grad_norm": 0.003901133546605706, "learning_rate": 4.4616044616044625e-07, "loss": 0.0, "step": 37290 }, { "epoch": 47.02, "eval_accuracy": 0.8913043478260869, "eval_loss": 0.7096506357192993, "eval_runtime": 13.4018, "eval_samples_per_second": 3.432, "eval_steps_per_second": 3.432, "step": 37296 }, { "epoch": 48.00010296010296, "grad_norm": 0.0013056639581918716, "learning_rate": 4.4330044330044335e-07, "loss": 0.5536, "step": 37300 }, { "epoch": 48.00036036036036, "grad_norm": 8.839034853735939e-05, "learning_rate": 4.4044044044044046e-07, "loss": 0.0001, "step": 37310 }, { "epoch": 48.00061776061776, "grad_norm": 0.02358631044626236, "learning_rate": 4.375804375804376e-07, "loss": 0.0001, "step": 37320 }, { "epoch": 48.00087516087516, "grad_norm": 0.00010084093810291961, "learning_rate": 4.347204347204347e-07, "loss": 0.7232, "step": 37330 }, { "epoch": 48.001132561132565, "grad_norm": 0.0037173698656260967, "learning_rate": 4.3186043186043193e-07, "loss": 0.0001, "step": 37340 }, { "epoch": 48.00138996138996, "grad_norm": 0.0001251946232514456, "learning_rate": 4.2900042900042903e-07, "loss": 0.0001, "step": 37350 }, { "epoch": 48.001647361647365, "grad_norm": 0.01967187225818634, "learning_rate": 4.261404261404262e-07, "loss": 0.0001, "step": 37360 }, { "epoch": 48.00190476190476, "grad_norm": 0.010023032315075397, "learning_rate": 4.232804232804233e-07, "loss": 0.0, "step": 37370 }, { "epoch": 48.002162162162165, "grad_norm": 0.06681618839502335, "learning_rate": 4.204204204204205e-07, "loss": 0.0008, "step": 37380 }, { "epoch": 48.00241956241956, "grad_norm": 0.021821511909365654, "learning_rate": 4.175604175604176e-07, "loss": 0.0002, "step": 37390 }, { "epoch": 48.002676962676965, "grad_norm": 0.0001355089625576511, "learning_rate": 4.147004147004147e-07, "loss": 0.5061, "step": 37400 }, { "epoch": 48.00293436293436, "grad_norm": 0.038224369287490845, "learning_rate": 4.1184041184041187e-07, "loss": 0.668, "step": 37410 }, { "epoch": 48.003191763191765, "grad_norm": 0.00025807946803979576, "learning_rate": 4.08980408980409e-07, "loss": 0.0, "step": 37420 }, { "epoch": 48.00344916344916, "grad_norm": 0.00012352765770629048, "learning_rate": 4.061204061204062e-07, "loss": 0.0001, "step": 37430 }, { "epoch": 48.003706563706565, "grad_norm": 0.021487237885594368, "learning_rate": 4.032604032604033e-07, "loss": 0.0001, "step": 37440 }, { "epoch": 48.00396396396396, "grad_norm": 1613.104736328125, "learning_rate": 4.0040040040040045e-07, "loss": 0.2208, "step": 37450 }, { "epoch": 48.004221364221365, "grad_norm": 0.05000975728034973, "learning_rate": 3.9754039754039755e-07, "loss": 0.0001, "step": 37460 }, { "epoch": 48.00447876447876, "grad_norm": 0.0008557880646549165, "learning_rate": 3.9468039468039466e-07, "loss": 0.0001, "step": 37470 }, { "epoch": 48.004736164736165, "grad_norm": 0.0027655167505145073, "learning_rate": 3.9182039182039187e-07, "loss": 0.0001, "step": 37480 }, { "epoch": 48.00499356499356, "grad_norm": 0.11204931885004044, "learning_rate": 3.8896038896038897e-07, "loss": 0.0002, "step": 37490 }, { "epoch": 48.005250965250966, "grad_norm": 0.08630715310573578, "learning_rate": 3.8610038610038613e-07, "loss": 0.0004, "step": 37500 }, { "epoch": 48.00550836550836, "grad_norm": 0.0018811404006555676, "learning_rate": 3.8324038324038323e-07, "loss": 0.0, "step": 37510 }, { "epoch": 48.005765765765766, "grad_norm": 0.0012397312093526125, "learning_rate": 3.8038038038038044e-07, "loss": 0.0, "step": 37520 }, { "epoch": 48.00602316602317, "grad_norm": 0.002020134124904871, "learning_rate": 3.7752037752037755e-07, "loss": 0.0001, "step": 37530 }, { "epoch": 48.006280566280566, "grad_norm": 0.029110634699463844, "learning_rate": 3.746603746603747e-07, "loss": 0.0001, "step": 37540 }, { "epoch": 48.00653796653797, "grad_norm": 0.0019776870030909777, "learning_rate": 3.718003718003718e-07, "loss": 0.0003, "step": 37550 }, { "epoch": 48.006795366795366, "grad_norm": 0.07283729314804077, "learning_rate": 3.6894036894036897e-07, "loss": 0.0003, "step": 37560 }, { "epoch": 48.00705276705277, "grad_norm": 0.00014232056855689734, "learning_rate": 3.660803660803661e-07, "loss": 0.0193, "step": 37570 }, { "epoch": 48.007310167310166, "grad_norm": 0.17029236257076263, "learning_rate": 3.632203632203633e-07, "loss": 0.0001, "step": 37580 }, { "epoch": 48.00756756756757, "grad_norm": 0.02278299257159233, "learning_rate": 3.603603603603604e-07, "loss": 0.0001, "step": 37590 }, { "epoch": 48.007824967824966, "grad_norm": 0.00011213924881303683, "learning_rate": 3.575003575003575e-07, "loss": 0.0001, "step": 37600 }, { "epoch": 48.00808236808237, "grad_norm": 0.0008133391384035349, "learning_rate": 3.5464035464035465e-07, "loss": 0.0301, "step": 37610 }, { "epoch": 48.008339768339766, "grad_norm": 0.0009331071632914245, "learning_rate": 3.517803517803518e-07, "loss": 0.0, "step": 37620 }, { "epoch": 48.00859716859717, "grad_norm": 8.731058915145695e-05, "learning_rate": 3.4892034892034896e-07, "loss": 0.0002, "step": 37630 }, { "epoch": 48.00885456885457, "grad_norm": 0.011118177324533463, "learning_rate": 3.4606034606034607e-07, "loss": 0.0, "step": 37640 }, { "epoch": 48.00911196911197, "grad_norm": 0.00018614377768244594, "learning_rate": 3.4320034320034323e-07, "loss": 0.0, "step": 37650 }, { "epoch": 48.00936936936937, "grad_norm": 8.32139776321128e-05, "learning_rate": 3.403403403403404e-07, "loss": 0.0, "step": 37660 }, { "epoch": 48.00962676962677, "grad_norm": 0.021617701277136803, "learning_rate": 3.3748033748033754e-07, "loss": 0.0001, "step": 37670 }, { "epoch": 48.00988416988417, "grad_norm": 0.029266489669680595, "learning_rate": 3.3462033462033465e-07, "loss": 0.0002, "step": 37680 }, { "epoch": 48.01014157014157, "grad_norm": 0.0012129413662478328, "learning_rate": 3.3176033176033175e-07, "loss": 0.0006, "step": 37690 }, { "epoch": 48.01039897039897, "grad_norm": 0.005568423308432102, "learning_rate": 3.289003289003289e-07, "loss": 0.0, "step": 37700 }, { "epoch": 48.01065637065637, "grad_norm": 0.0016812310786917806, "learning_rate": 3.260403260403261e-07, "loss": 0.1846, "step": 37710 }, { "epoch": 48.010913770913774, "grad_norm": 0.0011652051471173763, "learning_rate": 3.231803231803232e-07, "loss": 0.0002, "step": 37720 }, { "epoch": 48.01117117117117, "grad_norm": 8.414344483753666e-05, "learning_rate": 3.2032032032032033e-07, "loss": 0.0001, "step": 37730 }, { "epoch": 48.011428571428574, "grad_norm": 0.0011205601040273905, "learning_rate": 3.174603174603175e-07, "loss": 0.0, "step": 37740 }, { "epoch": 48.01168597168597, "grad_norm": 0.0004462198994588107, "learning_rate": 3.146003146003146e-07, "loss": 0.0001, "step": 37750 }, { "epoch": 48.011943371943374, "grad_norm": 0.0015702313976362348, "learning_rate": 3.1174031174031175e-07, "loss": 0.0, "step": 37760 }, { "epoch": 48.01220077220077, "grad_norm": 0.00011221476597711444, "learning_rate": 3.088803088803089e-07, "loss": 0.8953, "step": 37770 }, { "epoch": 48.012458172458174, "grad_norm": 0.0006436722469516098, "learning_rate": 3.0602030602030606e-07, "loss": 0.0009, "step": 37780 }, { "epoch": 48.01271557271557, "grad_norm": 3.6860032081604004, "learning_rate": 3.031603031603032e-07, "loss": 0.4928, "step": 37790 }, { "epoch": 48.012972972972975, "grad_norm": 0.0011715868022292852, "learning_rate": 3.003003003003003e-07, "loss": 0.0, "step": 37800 }, { "epoch": 48.01323037323037, "grad_norm": 339.0137634277344, "learning_rate": 2.974402974402975e-07, "loss": 0.7127, "step": 37810 }, { "epoch": 48.013487773487775, "grad_norm": 0.00025066794478334486, "learning_rate": 2.945802945802946e-07, "loss": 0.0, "step": 37820 }, { "epoch": 48.01374517374517, "grad_norm": 0.0003706521529238671, "learning_rate": 2.9172029172029174e-07, "loss": 0.0, "step": 37830 }, { "epoch": 48.014002574002575, "grad_norm": 0.00023929473536554724, "learning_rate": 2.888602888602889e-07, "loss": 0.0, "step": 37840 }, { "epoch": 48.01425997425997, "grad_norm": 0.0029671115335077047, "learning_rate": 2.86000286000286e-07, "loss": 0.0001, "step": 37850 }, { "epoch": 48.014517374517375, "grad_norm": 0.03844938799738884, "learning_rate": 2.8314028314028316e-07, "loss": 0.0001, "step": 37860 }, { "epoch": 48.01477477477477, "grad_norm": 3.0009055137634277, "learning_rate": 2.802802802802803e-07, "loss": 0.0005, "step": 37870 }, { "epoch": 48.015032175032175, "grad_norm": 0.028874320909380913, "learning_rate": 2.774202774202774e-07, "loss": 0.0001, "step": 37880 }, { "epoch": 48.01528957528958, "grad_norm": 0.011996056884527206, "learning_rate": 2.745602745602746e-07, "loss": 0.0, "step": 37890 }, { "epoch": 48.015546975546975, "grad_norm": 0.0015611989656463265, "learning_rate": 2.717002717002717e-07, "loss": 0.0096, "step": 37900 }, { "epoch": 48.01580437580438, "grad_norm": 0.0020811811555176973, "learning_rate": 2.688402688402689e-07, "loss": 0.9104, "step": 37910 }, { "epoch": 48.016061776061775, "grad_norm": 0.007127484772354364, "learning_rate": 2.65980265980266e-07, "loss": 0.0001, "step": 37920 }, { "epoch": 48.01631917631918, "grad_norm": 0.0019268447067588568, "learning_rate": 2.6312026312026316e-07, "loss": 0.0002, "step": 37930 }, { "epoch": 48.016576576576576, "grad_norm": 9.653266897657886e-05, "learning_rate": 2.6026026026026026e-07, "loss": 0.0, "step": 37940 }, { "epoch": 48.01683397683398, "grad_norm": 0.00013772105739917606, "learning_rate": 2.574002574002574e-07, "loss": 0.0001, "step": 37950 }, { "epoch": 48.017091377091376, "grad_norm": 0.0012763019185513258, "learning_rate": 2.545402545402546e-07, "loss": 0.0, "step": 37960 }, { "epoch": 48.01734877734878, "grad_norm": 0.0019373123068362474, "learning_rate": 2.516802516802517e-07, "loss": 0.0, "step": 37970 }, { "epoch": 48.017606177606176, "grad_norm": 0.0032342688646167517, "learning_rate": 2.4882024882024884e-07, "loss": 0.0, "step": 37980 }, { "epoch": 48.01786357786358, "grad_norm": 0.008433103561401367, "learning_rate": 2.45960245960246e-07, "loss": 0.0001, "step": 37990 }, { "epoch": 48.018120978120976, "grad_norm": 8.630425145383924e-05, "learning_rate": 2.4310024310024315e-07, "loss": 0.0, "step": 38000 }, { "epoch": 48.01837837837838, "grad_norm": 0.00015035143587738276, "learning_rate": 2.4024024024024026e-07, "loss": 0.966, "step": 38010 }, { "epoch": 48.018635778635776, "grad_norm": 0.0001188807946164161, "learning_rate": 2.373802373802374e-07, "loss": 0.0001, "step": 38020 }, { "epoch": 48.01889317889318, "grad_norm": 8.999750571092591e-05, "learning_rate": 2.3452023452023455e-07, "loss": 0.0, "step": 38030 }, { "epoch": 48.019150579150576, "grad_norm": 0.008205074816942215, "learning_rate": 2.3166023166023168e-07, "loss": 0.8337, "step": 38040 }, { "epoch": 48.01940797940798, "grad_norm": 3213.668701171875, "learning_rate": 2.2880022880022884e-07, "loss": 0.1913, "step": 38050 }, { "epoch": 48.019665379665376, "grad_norm": 0.00011710776743711904, "learning_rate": 2.2594022594022594e-07, "loss": 0.0001, "step": 38060 }, { "epoch": 48.01992277992278, "grad_norm": 8.933376375352964e-05, "learning_rate": 2.2308022308022312e-07, "loss": 0.6658, "step": 38070 }, { "epoch": 48.02, "eval_accuracy": 0.8913043478260869, "eval_loss": 0.831529438495636, "eval_runtime": 13.4096, "eval_samples_per_second": 3.43, "eval_steps_per_second": 3.43, "step": 38073 }, { "epoch": 49.00018018018018, "grad_norm": 0.07369162142276764, "learning_rate": 2.2022022022022023e-07, "loss": 0.0, "step": 38080 }, { "epoch": 49.00043758043758, "grad_norm": 0.5429767370223999, "learning_rate": 2.1736021736021736e-07, "loss": 0.6264, "step": 38090 }, { "epoch": 49.00069498069498, "grad_norm": 0.00045454761129803956, "learning_rate": 2.1450021450021452e-07, "loss": 0.0005, "step": 38100 }, { "epoch": 49.000952380952384, "grad_norm": 0.0005570737412199378, "learning_rate": 2.1164021164021165e-07, "loss": 0.0002, "step": 38110 }, { "epoch": 49.00120978120978, "grad_norm": 9.545540524413809e-05, "learning_rate": 2.087802087802088e-07, "loss": 0.0001, "step": 38120 }, { "epoch": 49.001467181467184, "grad_norm": 0.0019436365691944957, "learning_rate": 2.0592020592020594e-07, "loss": 0.0, "step": 38130 }, { "epoch": 49.00172458172458, "grad_norm": 0.020056238397955894, "learning_rate": 2.030602030602031e-07, "loss": 0.0001, "step": 38140 }, { "epoch": 49.001981981981984, "grad_norm": 0.020614050328731537, "learning_rate": 2.0020020020020022e-07, "loss": 0.8467, "step": 38150 }, { "epoch": 49.00223938223938, "grad_norm": 0.0006698184879496694, "learning_rate": 1.9734019734019733e-07, "loss": 0.0, "step": 38160 }, { "epoch": 49.002496782496785, "grad_norm": 0.0013321598526090384, "learning_rate": 1.9448019448019449e-07, "loss": 0.4252, "step": 38170 }, { "epoch": 49.00275418275418, "grad_norm": 0.03110680542886257, "learning_rate": 1.9162019162019162e-07, "loss": 0.0, "step": 38180 }, { "epoch": 49.003011583011585, "grad_norm": 0.0001595134090166539, "learning_rate": 1.8876018876018877e-07, "loss": 0.0001, "step": 38190 }, { "epoch": 49.00326898326898, "grad_norm": 0.002772995037958026, "learning_rate": 1.859001859001859e-07, "loss": 0.0, "step": 38200 }, { "epoch": 49.003526383526385, "grad_norm": 0.00313870538957417, "learning_rate": 1.8304018304018306e-07, "loss": 0.0001, "step": 38210 }, { "epoch": 49.00378378378378, "grad_norm": 0.00930901151150465, "learning_rate": 1.801801801801802e-07, "loss": 0.0002, "step": 38220 }, { "epoch": 49.004041184041185, "grad_norm": 0.00011457037180662155, "learning_rate": 1.7732017732017732e-07, "loss": 0.0001, "step": 38230 }, { "epoch": 49.00429858429858, "grad_norm": 0.012397563084959984, "learning_rate": 1.7446017446017448e-07, "loss": 0.0001, "step": 38240 }, { "epoch": 49.004555984555985, "grad_norm": 0.011724770069122314, "learning_rate": 1.7160017160017161e-07, "loss": 0.0001, "step": 38250 }, { "epoch": 49.00481338481338, "grad_norm": 7.894120790297166e-05, "learning_rate": 1.6874016874016877e-07, "loss": 0.0001, "step": 38260 }, { "epoch": 49.005070785070785, "grad_norm": 0.004039763938635588, "learning_rate": 1.6588016588016588e-07, "loss": 0.0, "step": 38270 }, { "epoch": 49.00532818532819, "grad_norm": 0.0477052703499794, "learning_rate": 1.6302016302016306e-07, "loss": 0.0001, "step": 38280 }, { "epoch": 49.005585585585585, "grad_norm": 0.010268225334584713, "learning_rate": 1.6016016016016016e-07, "loss": 0.0001, "step": 38290 }, { "epoch": 49.00584298584299, "grad_norm": 0.03354515880346298, "learning_rate": 1.573001573001573e-07, "loss": 0.0001, "step": 38300 }, { "epoch": 49.006100386100385, "grad_norm": 0.00013072919682599604, "learning_rate": 1.5444015444015445e-07, "loss": 0.0, "step": 38310 }, { "epoch": 49.00635778635779, "grad_norm": 0.00982525385916233, "learning_rate": 1.515801515801516e-07, "loss": 1.4921, "step": 38320 }, { "epoch": 49.006615186615186, "grad_norm": 0.01039874367415905, "learning_rate": 1.4872014872014874e-07, "loss": 0.0, "step": 38330 }, { "epoch": 49.00687258687259, "grad_norm": 0.0014984187437221408, "learning_rate": 1.4586014586014587e-07, "loss": 0.0, "step": 38340 }, { "epoch": 49.007129987129986, "grad_norm": 0.00010380911407992244, "learning_rate": 1.43000143000143e-07, "loss": 0.0, "step": 38350 }, { "epoch": 49.00738738738739, "grad_norm": 0.00010410812683403492, "learning_rate": 1.4014014014014016e-07, "loss": 0.0, "step": 38360 }, { "epoch": 49.007644787644786, "grad_norm": 9.377163951285183e-05, "learning_rate": 1.372801372801373e-07, "loss": 0.0, "step": 38370 }, { "epoch": 49.00790218790219, "grad_norm": 0.0019170086598023772, "learning_rate": 1.3442013442013445e-07, "loss": 0.0001, "step": 38380 }, { "epoch": 49.008159588159586, "grad_norm": 0.00012893872917629778, "learning_rate": 1.3156013156013158e-07, "loss": 1.0042, "step": 38390 }, { "epoch": 49.00841698841699, "grad_norm": 7.869133696658537e-05, "learning_rate": 1.287001287001287e-07, "loss": 0.0008, "step": 38400 }, { "epoch": 49.008674388674386, "grad_norm": 0.00012004271411569789, "learning_rate": 1.2584012584012584e-07, "loss": 0.0, "step": 38410 }, { "epoch": 49.00893178893179, "grad_norm": 0.0014769094996154308, "learning_rate": 1.22980122980123e-07, "loss": 0.0009, "step": 38420 }, { "epoch": 49.009189189189186, "grad_norm": 0.003862373996526003, "learning_rate": 1.2012012012012013e-07, "loss": 0.0001, "step": 38430 }, { "epoch": 49.00944658944659, "grad_norm": 0.0006913309334777296, "learning_rate": 1.1726011726011727e-07, "loss": 0.0003, "step": 38440 }, { "epoch": 49.009703989703986, "grad_norm": 0.0021493281237781048, "learning_rate": 1.1440011440011442e-07, "loss": 0.0004, "step": 38450 }, { "epoch": 49.00996138996139, "grad_norm": 0.00039128007483668625, "learning_rate": 1.1154011154011156e-07, "loss": 0.0001, "step": 38460 }, { "epoch": 49.010218790218794, "grad_norm": 0.003142331726849079, "learning_rate": 1.0868010868010868e-07, "loss": 0.0002, "step": 38470 }, { "epoch": 49.01047619047619, "grad_norm": 0.0017112774075940251, "learning_rate": 1.0582010582010582e-07, "loss": 0.0, "step": 38480 }, { "epoch": 49.010733590733594, "grad_norm": 0.0016064579831436276, "learning_rate": 1.0296010296010297e-07, "loss": 0.0, "step": 38490 }, { "epoch": 49.01099099099099, "grad_norm": 8.536849054507911e-05, "learning_rate": 1.0010010010010011e-07, "loss": 0.851, "step": 38500 }, { "epoch": 49.011248391248394, "grad_norm": 0.000131205641082488, "learning_rate": 9.724009724009724e-08, "loss": 0.5012, "step": 38510 }, { "epoch": 49.01150579150579, "grad_norm": 0.00011044665006920695, "learning_rate": 9.438009438009439e-08, "loss": 0.0, "step": 38520 }, { "epoch": 49.011763191763194, "grad_norm": 0.0001505005348008126, "learning_rate": 9.152009152009153e-08, "loss": 0.0, "step": 38530 }, { "epoch": 49.01202059202059, "grad_norm": 0.0006344806170091033, "learning_rate": 8.866008866008866e-08, "loss": 0.0001, "step": 38540 }, { "epoch": 49.012277992277994, "grad_norm": 0.01434845756739378, "learning_rate": 8.580008580008581e-08, "loss": 0.6324, "step": 38550 }, { "epoch": 49.01253539253539, "grad_norm": 0.0017600515857338905, "learning_rate": 8.294008294008294e-08, "loss": 0.0009, "step": 38560 }, { "epoch": 49.012792792792794, "grad_norm": 0.0006000173161737621, "learning_rate": 8.008008008008008e-08, "loss": 0.0, "step": 38570 }, { "epoch": 49.01305019305019, "grad_norm": 0.0001577949442435056, "learning_rate": 7.722007722007723e-08, "loss": 0.0001, "step": 38580 }, { "epoch": 49.013307593307594, "grad_norm": 0.0021368926391005516, "learning_rate": 7.436007436007437e-08, "loss": 0.0002, "step": 38590 }, { "epoch": 49.01356499356499, "grad_norm": 0.02230561152100563, "learning_rate": 7.15000715000715e-08, "loss": 0.0001, "step": 38600 }, { "epoch": 49.013822393822394, "grad_norm": 0.00016711508214939386, "learning_rate": 6.864006864006865e-08, "loss": 0.8394, "step": 38610 }, { "epoch": 49.01407979407979, "grad_norm": 0.004358669742941856, "learning_rate": 6.578006578006579e-08, "loss": 0.0003, "step": 38620 }, { "epoch": 49.014337194337195, "grad_norm": 0.0006673623574897647, "learning_rate": 6.292006292006292e-08, "loss": 0.0, "step": 38630 }, { "epoch": 49.01459459459459, "grad_norm": 605.3671264648438, "learning_rate": 6.006006006006006e-08, "loss": 0.089, "step": 38640 }, { "epoch": 49.014851994851995, "grad_norm": 0.0001498550409451127, "learning_rate": 5.720005720005721e-08, "loss": 0.65, "step": 38650 }, { "epoch": 49.0151093951094, "grad_norm": 0.0008860075031407177, "learning_rate": 5.434005434005434e-08, "loss": 0.0001, "step": 38660 }, { "epoch": 49.015366795366795, "grad_norm": 0.0030720012728124857, "learning_rate": 5.1480051480051484e-08, "loss": 0.0, "step": 38670 }, { "epoch": 49.0156241956242, "grad_norm": 0.0020494323689490557, "learning_rate": 4.862004862004862e-08, "loss": 0.0001, "step": 38680 }, { "epoch": 49.015881595881595, "grad_norm": 0.0007796235731802881, "learning_rate": 4.5760045760045766e-08, "loss": 0.0001, "step": 38690 }, { "epoch": 49.016138996139, "grad_norm": 0.000528481206856668, "learning_rate": 4.2900042900042903e-08, "loss": 0.0002, "step": 38700 }, { "epoch": 49.016396396396395, "grad_norm": 0.0076874056831002235, "learning_rate": 4.004004004004004e-08, "loss": 0.0001, "step": 38710 }, { "epoch": 49.0166537966538, "grad_norm": 0.00017405615653842688, "learning_rate": 3.7180037180037185e-08, "loss": 0.7798, "step": 38720 }, { "epoch": 49.016911196911195, "grad_norm": 0.002475382061675191, "learning_rate": 3.432003432003432e-08, "loss": 0.0, "step": 38730 }, { "epoch": 49.0171685971686, "grad_norm": 0.0001674262312008068, "learning_rate": 3.146003146003146e-08, "loss": 0.7042, "step": 38740 }, { "epoch": 49.017425997425995, "grad_norm": 0.0012000445276498795, "learning_rate": 2.8600028600028604e-08, "loss": 0.0001, "step": 38750 }, { "epoch": 49.0176833976834, "grad_norm": 0.00012928430805914104, "learning_rate": 2.5740025740025742e-08, "loss": 0.0, "step": 38760 }, { "epoch": 49.017940797940796, "grad_norm": 0.0012572959531098604, "learning_rate": 2.2880022880022883e-08, "loss": 0.0001, "step": 38770 }, { "epoch": 49.0181981981982, "grad_norm": 0.00019746992620639503, "learning_rate": 2.002002002002002e-08, "loss": 0.0018, "step": 38780 }, { "epoch": 49.018455598455596, "grad_norm": 0.020095746964216232, "learning_rate": 1.716001716001716e-08, "loss": 0.0001, "step": 38790 }, { "epoch": 49.018712998713, "grad_norm": 9.711348684504628e-05, "learning_rate": 1.4300014300014302e-08, "loss": 0.0002, "step": 38800 }, { "epoch": 49.018970398970396, "grad_norm": 0.009967109188437462, "learning_rate": 1.1440011440011441e-08, "loss": 0.0007, "step": 38810 }, { "epoch": 49.0192277992278, "grad_norm": 0.0005083610885776579, "learning_rate": 8.58000858000858e-09, "loss": 0.0001, "step": 38820 }, { "epoch": 49.0194851994852, "grad_norm": 0.00010245986777590588, "learning_rate": 5.720005720005721e-09, "loss": 0.0001, "step": 38830 }, { "epoch": 49.0197425997426, "grad_norm": 0.014355776831507683, "learning_rate": 2.8600028600028604e-09, "loss": 0.0001, "step": 38840 }, { "epoch": 49.02, "grad_norm": 0.00027233664877712727, "learning_rate": 0.0, "loss": 0.0, "step": 38850 }, { "epoch": 49.02, "eval_accuracy": 0.8913043478260869, "eval_loss": 0.857639491558075, "eval_runtime": 15.9664, "eval_samples_per_second": 2.881, "eval_steps_per_second": 2.881, "step": 38850 }, { "epoch": 49.02, "step": 38850, "total_flos": 1.7059137234927944e+20, "train_loss": 0.625811186066474, "train_runtime": 28909.5385, "train_samples_per_second": 1.344, "train_steps_per_second": 1.344 }, { "epoch": 49.02, "eval_accuracy": 0.9130434782608695, "eval_loss": 0.42057543992996216, "eval_runtime": 13.3636, "eval_samples_per_second": 3.442, "eval_steps_per_second": 3.442, "step": 38850 }, { "epoch": 49.02, "eval_accuracy": 0.9130434782608695, "eval_loss": 0.42057543992996216, "eval_runtime": 13.3586, "eval_samples_per_second": 3.443, "eval_steps_per_second": 3.443, "step": 38850 } ], "logging_steps": 10, "max_steps": 38850, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7059137234927944e+20, "train_batch_size": 1, "trial_name": null, "trial_params": null }