{ "best_metric": 0.9356, "best_model_checkpoint": "checkpoint/vit-large/checkpoint-11970", "epoch": 100.0, "eval_steps": 500, "global_step": 66500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 2.7020020484924316, "learning_rate": 9.998496240601504e-06, "loss": 4.6119, "step": 10 }, { "epoch": 0.03, "grad_norm": 2.892003059387207, "learning_rate": 9.996992481203008e-06, "loss": 4.5669, "step": 20 }, { "epoch": 0.05, "grad_norm": 2.7313311100006104, "learning_rate": 9.995488721804511e-06, "loss": 4.5263, "step": 30 }, { "epoch": 0.06, "grad_norm": 2.659421443939209, "learning_rate": 9.993984962406017e-06, "loss": 4.4729, "step": 40 }, { "epoch": 0.08, "grad_norm": 2.846480131149292, "learning_rate": 9.992481203007518e-06, "loss": 4.3985, "step": 50 }, { "epoch": 0.09, "grad_norm": 3.0880706310272217, "learning_rate": 9.990977443609024e-06, "loss": 4.3247, "step": 60 }, { "epoch": 0.11, "grad_norm": 3.202064037322998, "learning_rate": 9.989473684210527e-06, "loss": 4.233, "step": 70 }, { "epoch": 0.12, "grad_norm": 4.243076324462891, "learning_rate": 9.98796992481203e-06, "loss": 4.1598, "step": 80 }, { "epoch": 0.14, "grad_norm": 3.47965669631958, "learning_rate": 9.986466165413534e-06, "loss": 4.0929, "step": 90 }, { "epoch": 0.15, "grad_norm": 3.431342363357544, "learning_rate": 9.984962406015038e-06, "loss": 3.9735, "step": 100 }, { "epoch": 0.17, "grad_norm": 3.915376901626587, "learning_rate": 9.983458646616541e-06, "loss": 3.9051, "step": 110 }, { "epoch": 0.18, "grad_norm": 3.6235947608947754, "learning_rate": 9.981954887218046e-06, "loss": 3.7795, "step": 120 }, { "epoch": 0.2, "grad_norm": 4.0705485343933105, "learning_rate": 9.98045112781955e-06, "loss": 3.6517, "step": 130 }, { "epoch": 0.21, "grad_norm": 3.916447162628174, "learning_rate": 9.978947368421053e-06, "loss": 3.5197, "step": 140 }, { "epoch": 0.23, "grad_norm": 4.508981227874756, "learning_rate": 9.977443609022557e-06, "loss": 3.4313, "step": 150 }, { "epoch": 0.24, "grad_norm": 4.508138179779053, "learning_rate": 9.97593984962406e-06, "loss": 3.3823, "step": 160 }, { "epoch": 0.26, "grad_norm": 5.037680625915527, "learning_rate": 9.974436090225564e-06, "loss": 3.2748, "step": 170 }, { "epoch": 0.27, "grad_norm": 10.304414749145508, "learning_rate": 9.97293233082707e-06, "loss": 3.1054, "step": 180 }, { "epoch": 0.29, "grad_norm": 5.1311421394348145, "learning_rate": 9.971428571428571e-06, "loss": 3.0703, "step": 190 }, { "epoch": 0.3, "grad_norm": 4.8540120124816895, "learning_rate": 9.969924812030076e-06, "loss": 2.9789, "step": 200 }, { "epoch": 0.32, "grad_norm": 4.647185802459717, "learning_rate": 9.96842105263158e-06, "loss": 2.8568, "step": 210 }, { "epoch": 0.33, "grad_norm": 6.943347454071045, "learning_rate": 9.966917293233083e-06, "loss": 2.8175, "step": 220 }, { "epoch": 0.35, "grad_norm": 17.15534210205078, "learning_rate": 9.965413533834587e-06, "loss": 2.7049, "step": 230 }, { "epoch": 0.36, "grad_norm": 5.327253341674805, "learning_rate": 9.963909774436092e-06, "loss": 2.6185, "step": 240 }, { "epoch": 0.38, "grad_norm": 5.56367301940918, "learning_rate": 9.962406015037594e-06, "loss": 2.4704, "step": 250 }, { "epoch": 0.39, "grad_norm": 6.038745880126953, "learning_rate": 9.960902255639099e-06, "loss": 2.4862, "step": 260 }, { "epoch": 0.41, "grad_norm": 5.661726474761963, "learning_rate": 9.959398496240603e-06, "loss": 2.4405, "step": 270 }, { "epoch": 0.42, "grad_norm": 5.565981864929199, "learning_rate": 9.957894736842106e-06, "loss": 2.3698, "step": 280 }, { "epoch": 0.44, "grad_norm": 7.349733829498291, "learning_rate": 9.95639097744361e-06, "loss": 2.3338, "step": 290 }, { "epoch": 0.45, "grad_norm": 6.526618957519531, "learning_rate": 9.954887218045113e-06, "loss": 2.2445, "step": 300 }, { "epoch": 0.47, "grad_norm": 5.558746337890625, "learning_rate": 9.953383458646617e-06, "loss": 2.202, "step": 310 }, { "epoch": 0.48, "grad_norm": 6.4157633781433105, "learning_rate": 9.951879699248122e-06, "loss": 2.1578, "step": 320 }, { "epoch": 0.5, "grad_norm": 5.635522842407227, "learning_rate": 9.950375939849625e-06, "loss": 2.099, "step": 330 }, { "epoch": 0.51, "grad_norm": 8.216004371643066, "learning_rate": 9.948872180451129e-06, "loss": 2.0472, "step": 340 }, { "epoch": 0.53, "grad_norm": 7.348927021026611, "learning_rate": 9.947368421052632e-06, "loss": 2.0272, "step": 350 }, { "epoch": 0.54, "grad_norm": 6.218992710113525, "learning_rate": 9.945864661654136e-06, "loss": 1.9022, "step": 360 }, { "epoch": 0.56, "grad_norm": 12.379638671875, "learning_rate": 9.94436090225564e-06, "loss": 1.9707, "step": 370 }, { "epoch": 0.57, "grad_norm": 7.454248905181885, "learning_rate": 9.942857142857145e-06, "loss": 1.9612, "step": 380 }, { "epoch": 0.59, "grad_norm": 8.975961685180664, "learning_rate": 9.941353383458647e-06, "loss": 1.8519, "step": 390 }, { "epoch": 0.6, "grad_norm": 11.839798927307129, "learning_rate": 9.939849624060152e-06, "loss": 1.8246, "step": 400 }, { "epoch": 0.62, "grad_norm": 21.542709350585938, "learning_rate": 9.938345864661655e-06, "loss": 1.7591, "step": 410 }, { "epoch": 0.63, "grad_norm": 6.640402793884277, "learning_rate": 9.936842105263159e-06, "loss": 1.7803, "step": 420 }, { "epoch": 0.65, "grad_norm": 7.312070369720459, "learning_rate": 9.935338345864662e-06, "loss": 1.6758, "step": 430 }, { "epoch": 0.66, "grad_norm": 6.633362770080566, "learning_rate": 9.933834586466168e-06, "loss": 1.7095, "step": 440 }, { "epoch": 0.68, "grad_norm": 9.548731803894043, "learning_rate": 9.93233082706767e-06, "loss": 1.6934, "step": 450 }, { "epoch": 0.69, "grad_norm": 9.280405044555664, "learning_rate": 9.930827067669175e-06, "loss": 1.74, "step": 460 }, { "epoch": 0.71, "grad_norm": 4.683008670806885, "learning_rate": 9.929323308270678e-06, "loss": 1.5803, "step": 470 }, { "epoch": 0.72, "grad_norm": 5.887816905975342, "learning_rate": 9.927819548872182e-06, "loss": 1.6039, "step": 480 }, { "epoch": 0.74, "grad_norm": 7.674352645874023, "learning_rate": 9.926315789473685e-06, "loss": 1.6009, "step": 490 }, { "epoch": 0.75, "grad_norm": 5.968412399291992, "learning_rate": 9.924812030075189e-06, "loss": 1.5391, "step": 500 }, { "epoch": 0.77, "grad_norm": 5.869007587432861, "learning_rate": 9.923308270676692e-06, "loss": 1.5226, "step": 510 }, { "epoch": 0.78, "grad_norm": 7.70728874206543, "learning_rate": 9.921804511278196e-06, "loss": 1.6173, "step": 520 }, { "epoch": 0.8, "grad_norm": 7.701499938964844, "learning_rate": 9.920300751879701e-06, "loss": 1.5042, "step": 530 }, { "epoch": 0.81, "grad_norm": 6.880636692047119, "learning_rate": 9.918796992481203e-06, "loss": 1.456, "step": 540 }, { "epoch": 0.83, "grad_norm": 7.021149635314941, "learning_rate": 9.917293233082708e-06, "loss": 1.5698, "step": 550 }, { "epoch": 0.84, "grad_norm": 7.1271138191223145, "learning_rate": 9.915789473684211e-06, "loss": 1.3838, "step": 560 }, { "epoch": 0.86, "grad_norm": 7.8424482345581055, "learning_rate": 9.914285714285715e-06, "loss": 1.3997, "step": 570 }, { "epoch": 0.87, "grad_norm": 8.099345207214355, "learning_rate": 9.912781954887218e-06, "loss": 1.4359, "step": 580 }, { "epoch": 0.89, "grad_norm": 7.261110782623291, "learning_rate": 9.911278195488722e-06, "loss": 1.454, "step": 590 }, { "epoch": 0.9, "grad_norm": 6.30597448348999, "learning_rate": 9.909774436090226e-06, "loss": 1.3907, "step": 600 }, { "epoch": 0.92, "grad_norm": 7.466990947723389, "learning_rate": 9.90827067669173e-06, "loss": 1.329, "step": 610 }, { "epoch": 0.93, "grad_norm": 7.6469316482543945, "learning_rate": 9.906766917293234e-06, "loss": 1.2848, "step": 620 }, { "epoch": 0.95, "grad_norm": 7.2480244636535645, "learning_rate": 9.905263157894738e-06, "loss": 1.4246, "step": 630 }, { "epoch": 0.96, "grad_norm": 7.738135814666748, "learning_rate": 9.903759398496241e-06, "loss": 1.3669, "step": 640 }, { "epoch": 0.98, "grad_norm": 8.298168182373047, "learning_rate": 9.902255639097745e-06, "loss": 1.2883, "step": 650 }, { "epoch": 0.99, "grad_norm": 6.747694969177246, "learning_rate": 9.900751879699248e-06, "loss": 1.2884, "step": 660 }, { "epoch": 1.0, "eval_accuracy": 0.8834, "eval_loss": 0.8751662373542786, "eval_runtime": 85.9745, "eval_samples_per_second": 116.314, "eval_steps_per_second": 0.465, "step": 665 }, { "epoch": 1.01, "grad_norm": 7.908563137054443, "learning_rate": 9.899248120300754e-06, "loss": 1.2429, "step": 670 }, { "epoch": 1.02, "grad_norm": 7.439415454864502, "learning_rate": 9.897744360902255e-06, "loss": 1.2716, "step": 680 }, { "epoch": 1.04, "grad_norm": 7.2306599617004395, "learning_rate": 9.89624060150376e-06, "loss": 1.1676, "step": 690 }, { "epoch": 1.05, "grad_norm": 6.287716388702393, "learning_rate": 9.894736842105264e-06, "loss": 1.2212, "step": 700 }, { "epoch": 1.07, "grad_norm": 10.363208770751953, "learning_rate": 9.893233082706768e-06, "loss": 1.259, "step": 710 }, { "epoch": 1.08, "grad_norm": 6.556034564971924, "learning_rate": 9.891729323308271e-06, "loss": 1.1383, "step": 720 }, { "epoch": 1.1, "grad_norm": 5.640949726104736, "learning_rate": 9.890225563909776e-06, "loss": 1.2454, "step": 730 }, { "epoch": 1.11, "grad_norm": 7.140594959259033, "learning_rate": 9.888721804511278e-06, "loss": 1.1363, "step": 740 }, { "epoch": 1.13, "grad_norm": 7.780942440032959, "learning_rate": 9.887218045112783e-06, "loss": 1.1806, "step": 750 }, { "epoch": 1.14, "grad_norm": 6.301698684692383, "learning_rate": 9.885714285714287e-06, "loss": 1.1416, "step": 760 }, { "epoch": 1.16, "grad_norm": 7.256317138671875, "learning_rate": 9.88421052631579e-06, "loss": 1.1947, "step": 770 }, { "epoch": 1.17, "grad_norm": 9.818164825439453, "learning_rate": 9.882706766917294e-06, "loss": 1.1865, "step": 780 }, { "epoch": 1.19, "grad_norm": 11.159587860107422, "learning_rate": 9.881203007518797e-06, "loss": 1.0911, "step": 790 }, { "epoch": 1.2, "grad_norm": 12.432153701782227, "learning_rate": 9.879699248120301e-06, "loss": 1.1486, "step": 800 }, { "epoch": 1.22, "grad_norm": 9.031283378601074, "learning_rate": 9.878195488721806e-06, "loss": 1.0838, "step": 810 }, { "epoch": 1.23, "grad_norm": 7.850508689880371, "learning_rate": 9.87669172932331e-06, "loss": 1.0953, "step": 820 }, { "epoch": 1.25, "grad_norm": 6.112914562225342, "learning_rate": 9.875187969924813e-06, "loss": 1.1159, "step": 830 }, { "epoch": 1.26, "grad_norm": 6.945899486541748, "learning_rate": 9.873684210526317e-06, "loss": 0.9613, "step": 840 }, { "epoch": 1.28, "grad_norm": 7.430254936218262, "learning_rate": 9.87218045112782e-06, "loss": 1.086, "step": 850 }, { "epoch": 1.29, "grad_norm": 6.00394868850708, "learning_rate": 9.870676691729324e-06, "loss": 1.0873, "step": 860 }, { "epoch": 1.31, "grad_norm": 5.864083290100098, "learning_rate": 9.869172932330829e-06, "loss": 1.055, "step": 870 }, { "epoch": 1.32, "grad_norm": 8.37553882598877, "learning_rate": 9.86766917293233e-06, "loss": 1.0894, "step": 880 }, { "epoch": 1.34, "grad_norm": 8.302824974060059, "learning_rate": 9.866165413533836e-06, "loss": 1.1037, "step": 890 }, { "epoch": 1.35, "grad_norm": 6.968749046325684, "learning_rate": 9.86466165413534e-06, "loss": 1.0071, "step": 900 }, { "epoch": 1.37, "grad_norm": 5.924696922302246, "learning_rate": 9.863157894736843e-06, "loss": 1.0498, "step": 910 }, { "epoch": 1.38, "grad_norm": 7.433680534362793, "learning_rate": 9.861654135338347e-06, "loss": 1.0456, "step": 920 }, { "epoch": 1.4, "grad_norm": 7.614802360534668, "learning_rate": 9.86015037593985e-06, "loss": 0.9659, "step": 930 }, { "epoch": 1.41, "grad_norm": 9.205560684204102, "learning_rate": 9.858646616541354e-06, "loss": 0.9692, "step": 940 }, { "epoch": 1.43, "grad_norm": 6.741930961608887, "learning_rate": 9.857142857142859e-06, "loss": 1.0157, "step": 950 }, { "epoch": 1.44, "grad_norm": 8.176901817321777, "learning_rate": 9.855639097744362e-06, "loss": 1.0014, "step": 960 }, { "epoch": 1.46, "grad_norm": 5.335792541503906, "learning_rate": 9.854135338345866e-06, "loss": 0.9361, "step": 970 }, { "epoch": 1.47, "grad_norm": 6.488396644592285, "learning_rate": 9.85263157894737e-06, "loss": 1.0858, "step": 980 }, { "epoch": 1.49, "grad_norm": 6.725528240203857, "learning_rate": 9.851127819548873e-06, "loss": 1.0027, "step": 990 }, { "epoch": 1.5, "grad_norm": 10.740559577941895, "learning_rate": 9.849624060150376e-06, "loss": 1.0121, "step": 1000 }, { "epoch": 1.52, "grad_norm": 8.048620223999023, "learning_rate": 9.84812030075188e-06, "loss": 0.967, "step": 1010 }, { "epoch": 1.53, "grad_norm": 7.225861072540283, "learning_rate": 9.846616541353383e-06, "loss": 1.0329, "step": 1020 }, { "epoch": 1.55, "grad_norm": 7.12366247177124, "learning_rate": 9.845112781954887e-06, "loss": 1.0117, "step": 1030 }, { "epoch": 1.56, "grad_norm": 7.5486273765563965, "learning_rate": 9.843609022556392e-06, "loss": 0.969, "step": 1040 }, { "epoch": 1.58, "grad_norm": 9.591785430908203, "learning_rate": 9.842105263157896e-06, "loss": 0.9415, "step": 1050 }, { "epoch": 1.59, "grad_norm": 7.203570365905762, "learning_rate": 9.8406015037594e-06, "loss": 1.0409, "step": 1060 }, { "epoch": 1.61, "grad_norm": 9.412242889404297, "learning_rate": 9.839097744360903e-06, "loss": 1.0352, "step": 1070 }, { "epoch": 1.62, "grad_norm": 9.688934326171875, "learning_rate": 9.837593984962406e-06, "loss": 1.0524, "step": 1080 }, { "epoch": 1.64, "grad_norm": 6.9523844718933105, "learning_rate": 9.83609022556391e-06, "loss": 0.9546, "step": 1090 }, { "epoch": 1.65, "grad_norm": 8.400866508483887, "learning_rate": 9.834586466165415e-06, "loss": 0.9361, "step": 1100 }, { "epoch": 1.67, "grad_norm": 8.09070110321045, "learning_rate": 9.833082706766917e-06, "loss": 0.9797, "step": 1110 }, { "epoch": 1.68, "grad_norm": 6.927423000335693, "learning_rate": 9.831578947368422e-06, "loss": 0.9439, "step": 1120 }, { "epoch": 1.7, "grad_norm": 7.291294574737549, "learning_rate": 9.830075187969926e-06, "loss": 0.9753, "step": 1130 }, { "epoch": 1.71, "grad_norm": 7.919008731842041, "learning_rate": 9.828571428571429e-06, "loss": 0.9697, "step": 1140 }, { "epoch": 1.73, "grad_norm": 8.660476684570312, "learning_rate": 9.827067669172933e-06, "loss": 0.8882, "step": 1150 }, { "epoch": 1.74, "grad_norm": 8.102679252624512, "learning_rate": 9.825563909774438e-06, "loss": 0.8959, "step": 1160 }, { "epoch": 1.76, "grad_norm": 5.902896404266357, "learning_rate": 9.82406015037594e-06, "loss": 0.887, "step": 1170 }, { "epoch": 1.77, "grad_norm": 6.6904778480529785, "learning_rate": 9.822556390977445e-06, "loss": 0.9352, "step": 1180 }, { "epoch": 1.79, "grad_norm": 6.770270824432373, "learning_rate": 9.821052631578948e-06, "loss": 0.8916, "step": 1190 }, { "epoch": 1.8, "grad_norm": 8.353099822998047, "learning_rate": 9.819548872180452e-06, "loss": 0.9523, "step": 1200 }, { "epoch": 1.82, "grad_norm": 6.385773658752441, "learning_rate": 9.818045112781955e-06, "loss": 0.9175, "step": 1210 }, { "epoch": 1.83, "grad_norm": 13.28996467590332, "learning_rate": 9.816541353383459e-06, "loss": 0.9375, "step": 1220 }, { "epoch": 1.85, "grad_norm": 9.252169609069824, "learning_rate": 9.815037593984962e-06, "loss": 1.0589, "step": 1230 }, { "epoch": 1.86, "grad_norm": 6.009567737579346, "learning_rate": 9.813533834586468e-06, "loss": 0.9608, "step": 1240 }, { "epoch": 1.88, "grad_norm": 4.863635063171387, "learning_rate": 9.812030075187971e-06, "loss": 0.9724, "step": 1250 }, { "epoch": 1.89, "grad_norm": 10.548372268676758, "learning_rate": 9.810526315789475e-06, "loss": 0.9651, "step": 1260 }, { "epoch": 1.91, "grad_norm": 8.277862548828125, "learning_rate": 9.809022556390978e-06, "loss": 0.8868, "step": 1270 }, { "epoch": 1.92, "grad_norm": 6.657036304473877, "learning_rate": 9.807518796992482e-06, "loss": 0.9133, "step": 1280 }, { "epoch": 1.94, "grad_norm": 7.065949440002441, "learning_rate": 9.806015037593985e-06, "loss": 0.9602, "step": 1290 }, { "epoch": 1.95, "grad_norm": 9.187036514282227, "learning_rate": 9.80451127819549e-06, "loss": 0.9626, "step": 1300 }, { "epoch": 1.97, "grad_norm": 7.014963150024414, "learning_rate": 9.803007518796992e-06, "loss": 0.8653, "step": 1310 }, { "epoch": 1.98, "grad_norm": 8.665754318237305, "learning_rate": 9.801503759398498e-06, "loss": 0.8795, "step": 1320 }, { "epoch": 2.0, "grad_norm": 8.92686939239502, "learning_rate": 9.800000000000001e-06, "loss": 0.7958, "step": 1330 }, { "epoch": 2.0, "eval_accuracy": 0.9142, "eval_loss": 0.4723776876926422, "eval_runtime": 84.6568, "eval_samples_per_second": 118.124, "eval_steps_per_second": 0.472, "step": 1330 }, { "epoch": 2.02, "grad_norm": 6.881080627441406, "learning_rate": 9.798496240601505e-06, "loss": 0.9416, "step": 1340 }, { "epoch": 2.03, "grad_norm": 5.449582099914551, "learning_rate": 9.796992481203008e-06, "loss": 0.8897, "step": 1350 }, { "epoch": 2.05, "grad_norm": 6.535237789154053, "learning_rate": 9.795488721804513e-06, "loss": 0.8241, "step": 1360 }, { "epoch": 2.06, "grad_norm": 7.858671188354492, "learning_rate": 9.793984962406015e-06, "loss": 0.7834, "step": 1370 }, { "epoch": 2.08, "grad_norm": 6.7106852531433105, "learning_rate": 9.79248120300752e-06, "loss": 0.836, "step": 1380 }, { "epoch": 2.09, "grad_norm": 6.440729141235352, "learning_rate": 9.790977443609024e-06, "loss": 0.775, "step": 1390 }, { "epoch": 2.11, "grad_norm": 8.935519218444824, "learning_rate": 9.789473684210527e-06, "loss": 0.7826, "step": 1400 }, { "epoch": 2.12, "grad_norm": 7.244870662689209, "learning_rate": 9.787969924812031e-06, "loss": 0.8917, "step": 1410 }, { "epoch": 2.14, "grad_norm": 9.125387191772461, "learning_rate": 9.786466165413534e-06, "loss": 0.8167, "step": 1420 }, { "epoch": 2.15, "grad_norm": 6.8634114265441895, "learning_rate": 9.784962406015038e-06, "loss": 0.8052, "step": 1430 }, { "epoch": 2.17, "grad_norm": 7.519056797027588, "learning_rate": 9.783458646616543e-06, "loss": 0.8204, "step": 1440 }, { "epoch": 2.18, "grad_norm": 7.861953258514404, "learning_rate": 9.781954887218047e-06, "loss": 0.8404, "step": 1450 }, { "epoch": 2.2, "grad_norm": 10.940001487731934, "learning_rate": 9.78045112781955e-06, "loss": 0.7934, "step": 1460 }, { "epoch": 2.21, "grad_norm": 5.963690757751465, "learning_rate": 9.778947368421054e-06, "loss": 0.7549, "step": 1470 }, { "epoch": 2.23, "grad_norm": 6.734865665435791, "learning_rate": 9.777443609022557e-06, "loss": 0.8112, "step": 1480 }, { "epoch": 2.24, "grad_norm": 7.971401214599609, "learning_rate": 9.77593984962406e-06, "loss": 0.8323, "step": 1490 }, { "epoch": 2.26, "grad_norm": 9.728713989257812, "learning_rate": 9.774436090225564e-06, "loss": 0.7441, "step": 1500 }, { "epoch": 2.27, "grad_norm": 9.506553649902344, "learning_rate": 9.772932330827068e-06, "loss": 0.8018, "step": 1510 }, { "epoch": 2.29, "grad_norm": 7.3224663734436035, "learning_rate": 9.771428571428571e-06, "loss": 0.7869, "step": 1520 }, { "epoch": 2.3, "grad_norm": 7.251104831695557, "learning_rate": 9.769924812030077e-06, "loss": 0.8472, "step": 1530 }, { "epoch": 2.32, "grad_norm": 7.3015055656433105, "learning_rate": 9.76842105263158e-06, "loss": 0.7224, "step": 1540 }, { "epoch": 2.33, "grad_norm": 9.096901893615723, "learning_rate": 9.766917293233084e-06, "loss": 0.8195, "step": 1550 }, { "epoch": 2.35, "grad_norm": 7.366261959075928, "learning_rate": 9.765413533834587e-06, "loss": 0.8322, "step": 1560 }, { "epoch": 2.36, "grad_norm": 9.955854415893555, "learning_rate": 9.76390977443609e-06, "loss": 0.8315, "step": 1570 }, { "epoch": 2.38, "grad_norm": 6.852784633636475, "learning_rate": 9.762406015037594e-06, "loss": 0.7113, "step": 1580 }, { "epoch": 2.39, "grad_norm": 9.9766263961792, "learning_rate": 9.7609022556391e-06, "loss": 0.8024, "step": 1590 }, { "epoch": 2.41, "grad_norm": 9.095175743103027, "learning_rate": 9.759398496240601e-06, "loss": 0.8774, "step": 1600 }, { "epoch": 2.42, "grad_norm": 5.913175106048584, "learning_rate": 9.757894736842106e-06, "loss": 0.7608, "step": 1610 }, { "epoch": 2.44, "grad_norm": 9.544361114501953, "learning_rate": 9.75639097744361e-06, "loss": 0.7984, "step": 1620 }, { "epoch": 2.45, "grad_norm": 6.991225242614746, "learning_rate": 9.754887218045113e-06, "loss": 0.7942, "step": 1630 }, { "epoch": 2.47, "grad_norm": 7.531531810760498, "learning_rate": 9.753383458646617e-06, "loss": 0.8005, "step": 1640 }, { "epoch": 2.48, "grad_norm": 5.945763111114502, "learning_rate": 9.751879699248122e-06, "loss": 0.7673, "step": 1650 }, { "epoch": 2.5, "grad_norm": 8.382121086120605, "learning_rate": 9.750375939849624e-06, "loss": 0.7966, "step": 1660 }, { "epoch": 2.51, "grad_norm": 5.387685775756836, "learning_rate": 9.74887218045113e-06, "loss": 0.7892, "step": 1670 }, { "epoch": 2.53, "grad_norm": 7.867427349090576, "learning_rate": 9.747368421052633e-06, "loss": 0.8002, "step": 1680 }, { "epoch": 2.54, "grad_norm": 7.549880027770996, "learning_rate": 9.745864661654136e-06, "loss": 0.8879, "step": 1690 }, { "epoch": 2.56, "grad_norm": 7.67978572845459, "learning_rate": 9.74436090225564e-06, "loss": 0.7849, "step": 1700 }, { "epoch": 2.57, "grad_norm": 8.076873779296875, "learning_rate": 9.742857142857143e-06, "loss": 0.6503, "step": 1710 }, { "epoch": 2.59, "grad_norm": 10.748533248901367, "learning_rate": 9.741353383458647e-06, "loss": 0.7868, "step": 1720 }, { "epoch": 2.6, "grad_norm": 8.428750991821289, "learning_rate": 9.739849624060152e-06, "loss": 0.8195, "step": 1730 }, { "epoch": 2.62, "grad_norm": 7.678562164306641, "learning_rate": 9.738345864661655e-06, "loss": 0.8428, "step": 1740 }, { "epoch": 2.63, "grad_norm": 7.171645164489746, "learning_rate": 9.736842105263159e-06, "loss": 0.809, "step": 1750 }, { "epoch": 2.65, "grad_norm": 7.041049003601074, "learning_rate": 9.735338345864663e-06, "loss": 0.7417, "step": 1760 }, { "epoch": 2.66, "grad_norm": 9.66743278503418, "learning_rate": 9.733834586466166e-06, "loss": 0.7952, "step": 1770 }, { "epoch": 2.68, "grad_norm": 9.864920616149902, "learning_rate": 9.73233082706767e-06, "loss": 0.8427, "step": 1780 }, { "epoch": 2.69, "grad_norm": 10.242929458618164, "learning_rate": 9.730827067669175e-06, "loss": 0.7599, "step": 1790 }, { "epoch": 2.71, "grad_norm": 8.138999938964844, "learning_rate": 9.729323308270677e-06, "loss": 0.7517, "step": 1800 }, { "epoch": 2.72, "grad_norm": 7.668764114379883, "learning_rate": 9.727819548872182e-06, "loss": 0.7766, "step": 1810 }, { "epoch": 2.74, "grad_norm": 6.978646278381348, "learning_rate": 9.726315789473685e-06, "loss": 0.7323, "step": 1820 }, { "epoch": 2.75, "grad_norm": 8.794787406921387, "learning_rate": 9.724812030075189e-06, "loss": 0.8644, "step": 1830 }, { "epoch": 2.77, "grad_norm": 10.154306411743164, "learning_rate": 9.723308270676692e-06, "loss": 0.8235, "step": 1840 }, { "epoch": 2.78, "grad_norm": 9.513362884521484, "learning_rate": 9.721804511278196e-06, "loss": 0.792, "step": 1850 }, { "epoch": 2.8, "grad_norm": 4.891651630401611, "learning_rate": 9.7203007518797e-06, "loss": 0.7343, "step": 1860 }, { "epoch": 2.81, "grad_norm": 6.595260143280029, "learning_rate": 9.718796992481205e-06, "loss": 0.6451, "step": 1870 }, { "epoch": 2.83, "grad_norm": 11.115670204162598, "learning_rate": 9.717293233082708e-06, "loss": 0.7841, "step": 1880 }, { "epoch": 2.84, "grad_norm": 7.82785701751709, "learning_rate": 9.715789473684212e-06, "loss": 0.8396, "step": 1890 }, { "epoch": 2.86, "grad_norm": 5.41937780380249, "learning_rate": 9.714285714285715e-06, "loss": 0.772, "step": 1900 }, { "epoch": 2.87, "grad_norm": 8.092954635620117, "learning_rate": 9.712781954887219e-06, "loss": 0.7124, "step": 1910 }, { "epoch": 2.89, "grad_norm": 4.913546562194824, "learning_rate": 9.711278195488722e-06, "loss": 0.7824, "step": 1920 }, { "epoch": 2.9, "grad_norm": 6.090660572052002, "learning_rate": 9.709774436090227e-06, "loss": 0.7911, "step": 1930 }, { "epoch": 2.92, "grad_norm": 5.547027111053467, "learning_rate": 9.70827067669173e-06, "loss": 0.7818, "step": 1940 }, { "epoch": 2.93, "grad_norm": 8.583475112915039, "learning_rate": 9.706766917293234e-06, "loss": 0.7272, "step": 1950 }, { "epoch": 2.95, "grad_norm": 8.129578590393066, "learning_rate": 9.705263157894738e-06, "loss": 0.7352, "step": 1960 }, { "epoch": 2.96, "grad_norm": 9.513014793395996, "learning_rate": 9.703759398496242e-06, "loss": 0.8126, "step": 1970 }, { "epoch": 2.98, "grad_norm": 5.819597244262695, "learning_rate": 9.702255639097745e-06, "loss": 0.7599, "step": 1980 }, { "epoch": 2.99, "grad_norm": 7.391184329986572, "learning_rate": 9.700751879699249e-06, "loss": 0.743, "step": 1990 }, { "epoch": 3.0, "eval_accuracy": 0.9207, "eval_loss": 0.3750178813934326, "eval_runtime": 84.5377, "eval_samples_per_second": 118.29, "eval_steps_per_second": 0.473, "step": 1995 }, { "epoch": 3.01, "grad_norm": 6.347775459289551, "learning_rate": 9.699248120300752e-06, "loss": 0.6685, "step": 2000 }, { "epoch": 3.02, "grad_norm": 6.109332084655762, "learning_rate": 9.697744360902256e-06, "loss": 0.7398, "step": 2010 }, { "epoch": 3.04, "grad_norm": 4.770040512084961, "learning_rate": 9.69624060150376e-06, "loss": 0.7587, "step": 2020 }, { "epoch": 3.05, "grad_norm": 7.322962760925293, "learning_rate": 9.694736842105263e-06, "loss": 0.7917, "step": 2030 }, { "epoch": 3.07, "grad_norm": 9.097600936889648, "learning_rate": 9.693233082706768e-06, "loss": 0.6967, "step": 2040 }, { "epoch": 3.08, "grad_norm": 10.507075309753418, "learning_rate": 9.691729323308271e-06, "loss": 0.7229, "step": 2050 }, { "epoch": 3.1, "grad_norm": 6.249164581298828, "learning_rate": 9.690225563909775e-06, "loss": 0.6491, "step": 2060 }, { "epoch": 3.11, "grad_norm": 7.525278568267822, "learning_rate": 9.688721804511278e-06, "loss": 0.6476, "step": 2070 }, { "epoch": 3.13, "grad_norm": 9.564391136169434, "learning_rate": 9.687218045112784e-06, "loss": 0.7133, "step": 2080 }, { "epoch": 3.14, "grad_norm": 6.4053955078125, "learning_rate": 9.685714285714285e-06, "loss": 0.8012, "step": 2090 }, { "epoch": 3.16, "grad_norm": 6.6741251945495605, "learning_rate": 9.68421052631579e-06, "loss": 0.7507, "step": 2100 }, { "epoch": 3.17, "grad_norm": 12.398811340332031, "learning_rate": 9.682706766917294e-06, "loss": 0.7138, "step": 2110 }, { "epoch": 3.19, "grad_norm": 8.505881309509277, "learning_rate": 9.681203007518798e-06, "loss": 0.7136, "step": 2120 }, { "epoch": 3.2, "grad_norm": 5.51025915145874, "learning_rate": 9.679699248120301e-06, "loss": 0.6037, "step": 2130 }, { "epoch": 3.22, "grad_norm": 6.39398193359375, "learning_rate": 9.678195488721805e-06, "loss": 0.6799, "step": 2140 }, { "epoch": 3.23, "grad_norm": 6.508944511413574, "learning_rate": 9.676691729323308e-06, "loss": 0.6587, "step": 2150 }, { "epoch": 3.25, "grad_norm": 7.027959823608398, "learning_rate": 9.675187969924813e-06, "loss": 0.7096, "step": 2160 }, { "epoch": 3.26, "grad_norm": 8.690469741821289, "learning_rate": 9.673684210526317e-06, "loss": 0.6169, "step": 2170 }, { "epoch": 3.28, "grad_norm": 9.489269256591797, "learning_rate": 9.67218045112782e-06, "loss": 0.6643, "step": 2180 }, { "epoch": 3.29, "grad_norm": 12.934528350830078, "learning_rate": 9.670676691729324e-06, "loss": 0.7485, "step": 2190 }, { "epoch": 3.31, "grad_norm": 9.072815895080566, "learning_rate": 9.669172932330828e-06, "loss": 0.704, "step": 2200 }, { "epoch": 3.32, "grad_norm": 7.934593200683594, "learning_rate": 9.667669172932331e-06, "loss": 0.7134, "step": 2210 }, { "epoch": 3.34, "grad_norm": 6.846796989440918, "learning_rate": 9.666165413533836e-06, "loss": 0.7372, "step": 2220 }, { "epoch": 3.35, "grad_norm": 6.8362250328063965, "learning_rate": 9.664661654135338e-06, "loss": 0.6854, "step": 2230 }, { "epoch": 3.37, "grad_norm": 8.184903144836426, "learning_rate": 9.663157894736843e-06, "loss": 0.6662, "step": 2240 }, { "epoch": 3.38, "grad_norm": 8.972626686096191, "learning_rate": 9.661654135338347e-06, "loss": 0.7895, "step": 2250 }, { "epoch": 3.4, "grad_norm": 6.524502754211426, "learning_rate": 9.66015037593985e-06, "loss": 0.8453, "step": 2260 }, { "epoch": 3.41, "grad_norm": 6.215096950531006, "learning_rate": 9.658646616541354e-06, "loss": 0.716, "step": 2270 }, { "epoch": 3.43, "grad_norm": 6.9516401290893555, "learning_rate": 9.657142857142859e-06, "loss": 0.7453, "step": 2280 }, { "epoch": 3.44, "grad_norm": 9.131119728088379, "learning_rate": 9.655639097744361e-06, "loss": 0.6474, "step": 2290 }, { "epoch": 3.46, "grad_norm": 7.063914775848389, "learning_rate": 9.654135338345866e-06, "loss": 0.6535, "step": 2300 }, { "epoch": 3.47, "grad_norm": 9.410021781921387, "learning_rate": 9.65263157894737e-06, "loss": 0.708, "step": 2310 }, { "epoch": 3.49, "grad_norm": 7.179042816162109, "learning_rate": 9.651127819548873e-06, "loss": 0.7098, "step": 2320 }, { "epoch": 3.5, "grad_norm": 8.097248077392578, "learning_rate": 9.649624060150377e-06, "loss": 0.7052, "step": 2330 }, { "epoch": 3.52, "grad_norm": 4.258429050445557, "learning_rate": 9.64812030075188e-06, "loss": 0.6803, "step": 2340 }, { "epoch": 3.53, "grad_norm": 7.451633930206299, "learning_rate": 9.646616541353384e-06, "loss": 0.6569, "step": 2350 }, { "epoch": 3.55, "grad_norm": 9.551535606384277, "learning_rate": 9.645112781954889e-06, "loss": 0.6411, "step": 2360 }, { "epoch": 3.56, "grad_norm": 6.456385612487793, "learning_rate": 9.643609022556392e-06, "loss": 0.5938, "step": 2370 }, { "epoch": 3.58, "grad_norm": 5.947078704833984, "learning_rate": 9.642105263157896e-06, "loss": 0.6175, "step": 2380 }, { "epoch": 3.59, "grad_norm": 5.279054164886475, "learning_rate": 9.6406015037594e-06, "loss": 0.6788, "step": 2390 }, { "epoch": 3.61, "grad_norm": 7.593316555023193, "learning_rate": 9.639097744360903e-06, "loss": 0.6255, "step": 2400 }, { "epoch": 3.62, "grad_norm": 7.478080749511719, "learning_rate": 9.637593984962407e-06, "loss": 0.7908, "step": 2410 }, { "epoch": 3.64, "grad_norm": 9.64027214050293, "learning_rate": 9.636090225563912e-06, "loss": 0.7241, "step": 2420 }, { "epoch": 3.65, "grad_norm": 14.006696701049805, "learning_rate": 9.634586466165414e-06, "loss": 0.686, "step": 2430 }, { "epoch": 3.67, "grad_norm": 5.0339789390563965, "learning_rate": 9.633082706766919e-06, "loss": 0.7082, "step": 2440 }, { "epoch": 3.68, "grad_norm": 8.148447036743164, "learning_rate": 9.631578947368422e-06, "loss": 0.6859, "step": 2450 }, { "epoch": 3.7, "grad_norm": 7.614720344543457, "learning_rate": 9.630075187969926e-06, "loss": 0.7117, "step": 2460 }, { "epoch": 3.71, "grad_norm": 9.017003059387207, "learning_rate": 9.62857142857143e-06, "loss": 0.6505, "step": 2470 }, { "epoch": 3.73, "grad_norm": 6.4466986656188965, "learning_rate": 9.627067669172933e-06, "loss": 0.71, "step": 2480 }, { "epoch": 3.74, "grad_norm": 7.973327159881592, "learning_rate": 9.625563909774436e-06, "loss": 0.6466, "step": 2490 }, { "epoch": 3.76, "grad_norm": 6.712606906890869, "learning_rate": 9.62406015037594e-06, "loss": 0.6767, "step": 2500 }, { "epoch": 3.77, "grad_norm": 8.149372100830078, "learning_rate": 9.622556390977445e-06, "loss": 0.783, "step": 2510 }, { "epoch": 3.79, "grad_norm": 8.645270347595215, "learning_rate": 9.621052631578947e-06, "loss": 0.6978, "step": 2520 }, { "epoch": 3.8, "grad_norm": 9.347142219543457, "learning_rate": 9.619548872180452e-06, "loss": 0.583, "step": 2530 }, { "epoch": 3.82, "grad_norm": 7.905392169952393, "learning_rate": 9.618045112781956e-06, "loss": 0.6884, "step": 2540 }, { "epoch": 3.83, "grad_norm": 8.783331871032715, "learning_rate": 9.61654135338346e-06, "loss": 0.691, "step": 2550 }, { "epoch": 3.85, "grad_norm": 8.456209182739258, "learning_rate": 9.615037593984963e-06, "loss": 0.7281, "step": 2560 }, { "epoch": 3.86, "grad_norm": 6.667693138122559, "learning_rate": 9.613533834586468e-06, "loss": 0.6912, "step": 2570 }, { "epoch": 3.88, "grad_norm": 8.541569709777832, "learning_rate": 9.61203007518797e-06, "loss": 0.6853, "step": 2580 }, { "epoch": 3.89, "grad_norm": 4.732927322387695, "learning_rate": 9.610526315789475e-06, "loss": 0.6647, "step": 2590 }, { "epoch": 3.91, "grad_norm": 7.604156017303467, "learning_rate": 9.609022556390978e-06, "loss": 0.7526, "step": 2600 }, { "epoch": 3.92, "grad_norm": 8.218050956726074, "learning_rate": 9.607518796992482e-06, "loss": 0.6828, "step": 2610 }, { "epoch": 3.94, "grad_norm": 5.613206386566162, "learning_rate": 9.606015037593985e-06, "loss": 0.6964, "step": 2620 }, { "epoch": 3.95, "grad_norm": 9.644120216369629, "learning_rate": 9.604511278195489e-06, "loss": 0.6912, "step": 2630 }, { "epoch": 3.97, "grad_norm": 8.14504337310791, "learning_rate": 9.603007518796993e-06, "loss": 0.7527, "step": 2640 }, { "epoch": 3.98, "grad_norm": 6.1560468673706055, "learning_rate": 9.601503759398498e-06, "loss": 0.6145, "step": 2650 }, { "epoch": 4.0, "grad_norm": 20.564706802368164, "learning_rate": 9.600000000000001e-06, "loss": 0.6935, "step": 2660 }, { "epoch": 4.0, "eval_accuracy": 0.9236, "eval_loss": 0.319810152053833, "eval_runtime": 84.5508, "eval_samples_per_second": 118.272, "eval_steps_per_second": 0.473, "step": 2660 }, { "epoch": 4.02, "grad_norm": 9.448854446411133, "learning_rate": 9.598496240601505e-06, "loss": 0.631, "step": 2670 }, { "epoch": 4.03, "grad_norm": 10.194000244140625, "learning_rate": 9.596992481203008e-06, "loss": 0.6046, "step": 2680 }, { "epoch": 4.05, "grad_norm": 8.277205467224121, "learning_rate": 9.595488721804512e-06, "loss": 0.5853, "step": 2690 }, { "epoch": 4.06, "grad_norm": 7.616865158081055, "learning_rate": 9.593984962406015e-06, "loss": 0.6168, "step": 2700 }, { "epoch": 4.08, "grad_norm": 5.158208847045898, "learning_rate": 9.59248120300752e-06, "loss": 0.5893, "step": 2710 }, { "epoch": 4.09, "grad_norm": 10.609253883361816, "learning_rate": 9.590977443609022e-06, "loss": 0.5819, "step": 2720 }, { "epoch": 4.11, "grad_norm": 7.288332462310791, "learning_rate": 9.589473684210528e-06, "loss": 0.6384, "step": 2730 }, { "epoch": 4.12, "grad_norm": 6.625866889953613, "learning_rate": 9.587969924812031e-06, "loss": 0.6841, "step": 2740 }, { "epoch": 4.14, "grad_norm": 8.38702392578125, "learning_rate": 9.586466165413535e-06, "loss": 0.5815, "step": 2750 }, { "epoch": 4.15, "grad_norm": 6.58852481842041, "learning_rate": 9.584962406015038e-06, "loss": 0.7091, "step": 2760 }, { "epoch": 4.17, "grad_norm": 5.776881217956543, "learning_rate": 9.583458646616542e-06, "loss": 0.6426, "step": 2770 }, { "epoch": 4.18, "grad_norm": 7.806540489196777, "learning_rate": 9.581954887218045e-06, "loss": 0.6459, "step": 2780 }, { "epoch": 4.2, "grad_norm": 7.378940582275391, "learning_rate": 9.58045112781955e-06, "loss": 0.6553, "step": 2790 }, { "epoch": 4.21, "grad_norm": 8.37366008758545, "learning_rate": 9.578947368421054e-06, "loss": 0.6024, "step": 2800 }, { "epoch": 4.23, "grad_norm": 5.783264636993408, "learning_rate": 9.577443609022557e-06, "loss": 0.5959, "step": 2810 }, { "epoch": 4.24, "grad_norm": 6.4687676429748535, "learning_rate": 9.575939849624061e-06, "loss": 0.5669, "step": 2820 }, { "epoch": 4.26, "grad_norm": 10.575803756713867, "learning_rate": 9.574436090225564e-06, "loss": 0.6461, "step": 2830 }, { "epoch": 4.27, "grad_norm": 9.703124046325684, "learning_rate": 9.572932330827068e-06, "loss": 0.5784, "step": 2840 }, { "epoch": 4.29, "grad_norm": 9.454757690429688, "learning_rate": 9.571428571428573e-06, "loss": 0.7381, "step": 2850 }, { "epoch": 4.3, "grad_norm": 7.034806728363037, "learning_rate": 9.569924812030075e-06, "loss": 0.6277, "step": 2860 }, { "epoch": 4.32, "grad_norm": 10.1060791015625, "learning_rate": 9.56842105263158e-06, "loss": 0.6973, "step": 2870 }, { "epoch": 4.33, "grad_norm": 7.225138187408447, "learning_rate": 9.566917293233084e-06, "loss": 0.587, "step": 2880 }, { "epoch": 4.35, "grad_norm": 6.221525192260742, "learning_rate": 9.565413533834587e-06, "loss": 0.6006, "step": 2890 }, { "epoch": 4.36, "grad_norm": 6.329552173614502, "learning_rate": 9.56390977443609e-06, "loss": 0.6474, "step": 2900 }, { "epoch": 4.38, "grad_norm": 8.411649703979492, "learning_rate": 9.562406015037596e-06, "loss": 0.5376, "step": 2910 }, { "epoch": 4.39, "grad_norm": 8.27790355682373, "learning_rate": 9.560902255639098e-06, "loss": 0.6324, "step": 2920 }, { "epoch": 4.41, "grad_norm": 6.995235443115234, "learning_rate": 9.559398496240603e-06, "loss": 0.6178, "step": 2930 }, { "epoch": 4.42, "grad_norm": 8.169748306274414, "learning_rate": 9.557894736842107e-06, "loss": 0.5546, "step": 2940 }, { "epoch": 4.44, "grad_norm": 7.832982063293457, "learning_rate": 9.55639097744361e-06, "loss": 0.5873, "step": 2950 }, { "epoch": 4.45, "grad_norm": 7.024545192718506, "learning_rate": 9.554887218045114e-06, "loss": 0.5919, "step": 2960 }, { "epoch": 4.47, "grad_norm": 8.610920906066895, "learning_rate": 9.553383458646617e-06, "loss": 0.6324, "step": 2970 }, { "epoch": 4.48, "grad_norm": 8.49885368347168, "learning_rate": 9.55187969924812e-06, "loss": 0.6392, "step": 2980 }, { "epoch": 4.5, "grad_norm": 6.013737678527832, "learning_rate": 9.550375939849624e-06, "loss": 0.6267, "step": 2990 }, { "epoch": 4.51, "grad_norm": 9.457529067993164, "learning_rate": 9.54887218045113e-06, "loss": 0.633, "step": 3000 }, { "epoch": 4.53, "grad_norm": 7.126248359680176, "learning_rate": 9.547368421052631e-06, "loss": 0.5527, "step": 3010 }, { "epoch": 4.54, "grad_norm": 8.481447219848633, "learning_rate": 9.545864661654136e-06, "loss": 0.7163, "step": 3020 }, { "epoch": 4.56, "grad_norm": 10.300518035888672, "learning_rate": 9.54436090225564e-06, "loss": 0.5921, "step": 3030 }, { "epoch": 4.57, "grad_norm": 8.265804290771484, "learning_rate": 9.542857142857143e-06, "loss": 0.5952, "step": 3040 }, { "epoch": 4.59, "grad_norm": 5.028606414794922, "learning_rate": 9.541353383458647e-06, "loss": 0.6269, "step": 3050 }, { "epoch": 4.6, "grad_norm": 8.997878074645996, "learning_rate": 9.53984962406015e-06, "loss": 0.5761, "step": 3060 }, { "epoch": 4.62, "grad_norm": 9.585675239562988, "learning_rate": 9.538345864661654e-06, "loss": 0.5851, "step": 3070 }, { "epoch": 4.63, "grad_norm": 5.83755350112915, "learning_rate": 9.53684210526316e-06, "loss": 0.6105, "step": 3080 }, { "epoch": 4.65, "grad_norm": 5.208207607269287, "learning_rate": 9.535338345864663e-06, "loss": 0.5786, "step": 3090 }, { "epoch": 4.66, "grad_norm": 9.895461082458496, "learning_rate": 9.533834586466166e-06, "loss": 0.623, "step": 3100 }, { "epoch": 4.68, "grad_norm": 8.958138465881348, "learning_rate": 9.53233082706767e-06, "loss": 0.5708, "step": 3110 }, { "epoch": 4.69, "grad_norm": 10.452126502990723, "learning_rate": 9.530827067669173e-06, "loss": 0.5694, "step": 3120 }, { "epoch": 4.71, "grad_norm": 7.20021915435791, "learning_rate": 9.529323308270677e-06, "loss": 0.6267, "step": 3130 }, { "epoch": 4.72, "grad_norm": 7.995909690856934, "learning_rate": 9.527819548872182e-06, "loss": 0.6989, "step": 3140 }, { "epoch": 4.74, "grad_norm": 6.9314985275268555, "learning_rate": 9.526315789473684e-06, "loss": 0.6093, "step": 3150 }, { "epoch": 4.75, "grad_norm": 10.158616065979004, "learning_rate": 9.524812030075189e-06, "loss": 0.5696, "step": 3160 }, { "epoch": 4.77, "grad_norm": 7.637181758880615, "learning_rate": 9.523308270676693e-06, "loss": 0.7494, "step": 3170 }, { "epoch": 4.78, "grad_norm": 7.443474769592285, "learning_rate": 9.521804511278196e-06, "loss": 0.7636, "step": 3180 }, { "epoch": 4.8, "grad_norm": 6.130582809448242, "learning_rate": 9.5203007518797e-06, "loss": 0.7397, "step": 3190 }, { "epoch": 4.81, "grad_norm": 8.699774742126465, "learning_rate": 9.518796992481205e-06, "loss": 0.5461, "step": 3200 }, { "epoch": 4.83, "grad_norm": 8.060851097106934, "learning_rate": 9.517293233082707e-06, "loss": 0.5424, "step": 3210 }, { "epoch": 4.84, "grad_norm": 6.084632396697998, "learning_rate": 9.515789473684212e-06, "loss": 0.6181, "step": 3220 }, { "epoch": 4.86, "grad_norm": 8.804571151733398, "learning_rate": 9.514285714285715e-06, "loss": 0.6696, "step": 3230 }, { "epoch": 4.87, "grad_norm": 8.552626609802246, "learning_rate": 9.512781954887219e-06, "loss": 0.6237, "step": 3240 }, { "epoch": 4.89, "grad_norm": 8.930567741394043, "learning_rate": 9.511278195488722e-06, "loss": 0.6427, "step": 3250 }, { "epoch": 4.9, "grad_norm": 8.916244506835938, "learning_rate": 9.509774436090226e-06, "loss": 0.608, "step": 3260 }, { "epoch": 4.92, "grad_norm": 12.679169654846191, "learning_rate": 9.50827067669173e-06, "loss": 0.622, "step": 3270 }, { "epoch": 4.93, "grad_norm": 9.21071720123291, "learning_rate": 9.506766917293235e-06, "loss": 0.6153, "step": 3280 }, { "epoch": 4.95, "grad_norm": 8.040297508239746, "learning_rate": 9.505263157894738e-06, "loss": 0.654, "step": 3290 }, { "epoch": 4.96, "grad_norm": 6.395382404327393, "learning_rate": 9.503759398496242e-06, "loss": 0.6729, "step": 3300 }, { "epoch": 4.98, "grad_norm": 8.437057495117188, "learning_rate": 9.502255639097745e-06, "loss": 0.6457, "step": 3310 }, { "epoch": 4.99, "grad_norm": 7.987279415130615, "learning_rate": 9.500751879699249e-06, "loss": 0.6159, "step": 3320 }, { "epoch": 5.0, "eval_accuracy": 0.9289, "eval_loss": 0.2945062816143036, "eval_runtime": 84.9367, "eval_samples_per_second": 117.735, "eval_steps_per_second": 0.471, "step": 3325 }, { "epoch": 5.01, "grad_norm": 6.12150239944458, "learning_rate": 9.499248120300752e-06, "loss": 0.4871, "step": 3330 }, { "epoch": 5.02, "grad_norm": 8.20666217803955, "learning_rate": 9.497744360902257e-06, "loss": 0.6011, "step": 3340 }, { "epoch": 5.04, "grad_norm": 8.818642616271973, "learning_rate": 9.49624060150376e-06, "loss": 0.5681, "step": 3350 }, { "epoch": 5.05, "grad_norm": 5.606151103973389, "learning_rate": 9.494736842105265e-06, "loss": 0.5494, "step": 3360 }, { "epoch": 5.07, "grad_norm": 6.230663299560547, "learning_rate": 9.493233082706768e-06, "loss": 0.6138, "step": 3370 }, { "epoch": 5.08, "grad_norm": 6.923035621643066, "learning_rate": 9.491729323308272e-06, "loss": 0.6398, "step": 3380 }, { "epoch": 5.1, "grad_norm": 8.464035034179688, "learning_rate": 9.490225563909775e-06, "loss": 0.533, "step": 3390 }, { "epoch": 5.11, "grad_norm": 5.440852165222168, "learning_rate": 9.488721804511279e-06, "loss": 0.5901, "step": 3400 }, { "epoch": 5.13, "grad_norm": 6.880829334259033, "learning_rate": 9.487218045112782e-06, "loss": 0.5699, "step": 3410 }, { "epoch": 5.14, "grad_norm": 6.773617267608643, "learning_rate": 9.485714285714287e-06, "loss": 0.5164, "step": 3420 }, { "epoch": 5.16, "grad_norm": 6.794729232788086, "learning_rate": 9.484210526315791e-06, "loss": 0.4939, "step": 3430 }, { "epoch": 5.17, "grad_norm": 8.347722053527832, "learning_rate": 9.482706766917294e-06, "loss": 0.6138, "step": 3440 }, { "epoch": 5.19, "grad_norm": 6.279055595397949, "learning_rate": 9.481203007518798e-06, "loss": 0.4792, "step": 3450 }, { "epoch": 5.2, "grad_norm": 7.93798828125, "learning_rate": 9.479699248120301e-06, "loss": 0.5945, "step": 3460 }, { "epoch": 5.22, "grad_norm": 6.767178535461426, "learning_rate": 9.478195488721805e-06, "loss": 0.5878, "step": 3470 }, { "epoch": 5.23, "grad_norm": 6.87293004989624, "learning_rate": 9.476691729323308e-06, "loss": 0.566, "step": 3480 }, { "epoch": 5.25, "grad_norm": 2.407437562942505, "learning_rate": 9.475187969924814e-06, "loss": 0.5014, "step": 3490 }, { "epoch": 5.26, "grad_norm": 8.233712196350098, "learning_rate": 9.473684210526315e-06, "loss": 0.5879, "step": 3500 }, { "epoch": 5.28, "grad_norm": 7.905375003814697, "learning_rate": 9.47218045112782e-06, "loss": 0.6127, "step": 3510 }, { "epoch": 5.29, "grad_norm": 5.8037238121032715, "learning_rate": 9.470676691729324e-06, "loss": 0.6048, "step": 3520 }, { "epoch": 5.31, "grad_norm": 9.2665433883667, "learning_rate": 9.469172932330828e-06, "loss": 0.6233, "step": 3530 }, { "epoch": 5.32, "grad_norm": 5.650614261627197, "learning_rate": 9.467669172932331e-06, "loss": 0.5703, "step": 3540 }, { "epoch": 5.34, "grad_norm": 5.246155738830566, "learning_rate": 9.466165413533835e-06, "loss": 0.5108, "step": 3550 }, { "epoch": 5.35, "grad_norm": 8.701322555541992, "learning_rate": 9.464661654135338e-06, "loss": 0.5783, "step": 3560 }, { "epoch": 5.37, "grad_norm": 5.870892524719238, "learning_rate": 9.463157894736844e-06, "loss": 0.5546, "step": 3570 }, { "epoch": 5.38, "grad_norm": 8.061163902282715, "learning_rate": 9.461654135338347e-06, "loss": 0.5973, "step": 3580 }, { "epoch": 5.4, "grad_norm": 4.166900157928467, "learning_rate": 9.46015037593985e-06, "loss": 0.6042, "step": 3590 }, { "epoch": 5.41, "grad_norm": 5.8524346351623535, "learning_rate": 9.458646616541354e-06, "loss": 0.5307, "step": 3600 }, { "epoch": 5.43, "grad_norm": 7.229081153869629, "learning_rate": 9.457142857142858e-06, "loss": 0.6533, "step": 3610 }, { "epoch": 5.44, "grad_norm": 5.403026580810547, "learning_rate": 9.455639097744361e-06, "loss": 0.5059, "step": 3620 }, { "epoch": 5.46, "grad_norm": 7.53814697265625, "learning_rate": 9.454135338345866e-06, "loss": 0.6576, "step": 3630 }, { "epoch": 5.47, "grad_norm": 8.08530330657959, "learning_rate": 9.452631578947368e-06, "loss": 0.5881, "step": 3640 }, { "epoch": 5.49, "grad_norm": 7.80808687210083, "learning_rate": 9.451127819548873e-06, "loss": 0.5725, "step": 3650 }, { "epoch": 5.5, "grad_norm": 4.368475914001465, "learning_rate": 9.449624060150377e-06, "loss": 0.5538, "step": 3660 }, { "epoch": 5.52, "grad_norm": 7.806415557861328, "learning_rate": 9.44812030075188e-06, "loss": 0.6739, "step": 3670 }, { "epoch": 5.53, "grad_norm": 8.047362327575684, "learning_rate": 9.446616541353384e-06, "loss": 0.5229, "step": 3680 }, { "epoch": 5.55, "grad_norm": 5.89243221282959, "learning_rate": 9.445112781954887e-06, "loss": 0.599, "step": 3690 }, { "epoch": 5.56, "grad_norm": 6.812804222106934, "learning_rate": 9.443609022556391e-06, "loss": 0.6213, "step": 3700 }, { "epoch": 5.58, "grad_norm": 7.177376747131348, "learning_rate": 9.442105263157896e-06, "loss": 0.5568, "step": 3710 }, { "epoch": 5.59, "grad_norm": 4.684938907623291, "learning_rate": 9.4406015037594e-06, "loss": 0.5266, "step": 3720 }, { "epoch": 5.61, "grad_norm": 9.440715789794922, "learning_rate": 9.439097744360903e-06, "loss": 0.6431, "step": 3730 }, { "epoch": 5.62, "grad_norm": 6.564364910125732, "learning_rate": 9.437593984962407e-06, "loss": 0.4702, "step": 3740 }, { "epoch": 5.64, "grad_norm": 7.499642372131348, "learning_rate": 9.43609022556391e-06, "loss": 0.5249, "step": 3750 }, { "epoch": 5.65, "grad_norm": 5.27653169631958, "learning_rate": 9.434586466165414e-06, "loss": 0.4517, "step": 3760 }, { "epoch": 5.67, "grad_norm": 4.881272792816162, "learning_rate": 9.433082706766919e-06, "loss": 0.4682, "step": 3770 }, { "epoch": 5.68, "grad_norm": 9.005290985107422, "learning_rate": 9.43157894736842e-06, "loss": 0.5642, "step": 3780 }, { "epoch": 5.7, "grad_norm": 6.421182155609131, "learning_rate": 9.430075187969926e-06, "loss": 0.4855, "step": 3790 }, { "epoch": 5.71, "grad_norm": 7.035130023956299, "learning_rate": 9.42857142857143e-06, "loss": 0.6362, "step": 3800 }, { "epoch": 5.73, "grad_norm": 5.77438497543335, "learning_rate": 9.427067669172933e-06, "loss": 0.5876, "step": 3810 }, { "epoch": 5.74, "grad_norm": 8.328043937683105, "learning_rate": 9.425563909774437e-06, "loss": 0.4768, "step": 3820 }, { "epoch": 5.76, "grad_norm": 5.7907586097717285, "learning_rate": 9.424060150375942e-06, "loss": 0.5941, "step": 3830 }, { "epoch": 5.77, "grad_norm": 8.730267524719238, "learning_rate": 9.422556390977444e-06, "loss": 0.6426, "step": 3840 }, { "epoch": 5.79, "grad_norm": 8.710532188415527, "learning_rate": 9.421052631578949e-06, "loss": 0.5909, "step": 3850 }, { "epoch": 5.8, "grad_norm": 8.74202823638916, "learning_rate": 9.419548872180452e-06, "loss": 0.5735, "step": 3860 }, { "epoch": 5.82, "grad_norm": 7.489967346191406, "learning_rate": 9.418045112781956e-06, "loss": 0.6072, "step": 3870 }, { "epoch": 5.83, "grad_norm": 6.942547798156738, "learning_rate": 9.41654135338346e-06, "loss": 0.5513, "step": 3880 }, { "epoch": 5.85, "grad_norm": 5.517817497253418, "learning_rate": 9.415037593984963e-06, "loss": 0.5317, "step": 3890 }, { "epoch": 5.86, "grad_norm": 6.26224946975708, "learning_rate": 9.413533834586466e-06, "loss": 0.6014, "step": 3900 }, { "epoch": 5.88, "grad_norm": 4.848892688751221, "learning_rate": 9.412030075187972e-06, "loss": 0.5628, "step": 3910 }, { "epoch": 5.89, "grad_norm": 7.279343128204346, "learning_rate": 9.410526315789475e-06, "loss": 0.631, "step": 3920 }, { "epoch": 5.91, "grad_norm": 5.791496753692627, "learning_rate": 9.409022556390979e-06, "loss": 0.5312, "step": 3930 }, { "epoch": 5.92, "grad_norm": 4.935235977172852, "learning_rate": 9.407518796992482e-06, "loss": 0.4694, "step": 3940 }, { "epoch": 5.94, "grad_norm": 5.741876125335693, "learning_rate": 9.406015037593986e-06, "loss": 0.6073, "step": 3950 }, { "epoch": 5.95, "grad_norm": 5.398350715637207, "learning_rate": 9.40451127819549e-06, "loss": 0.6009, "step": 3960 }, { "epoch": 5.97, "grad_norm": 6.093377590179443, "learning_rate": 9.403007518796994e-06, "loss": 0.5845, "step": 3970 }, { "epoch": 5.98, "grad_norm": 8.6488676071167, "learning_rate": 9.401503759398496e-06, "loss": 0.5932, "step": 3980 }, { "epoch": 6.0, "grad_norm": 3.0173494815826416, "learning_rate": 9.4e-06, "loss": 0.4423, "step": 3990 }, { "epoch": 6.0, "eval_accuracy": 0.925, "eval_loss": 0.2876322865486145, "eval_runtime": 85.2695, "eval_samples_per_second": 117.275, "eval_steps_per_second": 0.469, "step": 3990 }, { "epoch": 6.02, "grad_norm": 4.103921890258789, "learning_rate": 9.398496240601505e-06, "loss": 0.5346, "step": 4000 }, { "epoch": 6.03, "grad_norm": 4.625704288482666, "learning_rate": 9.396992481203009e-06, "loss": 0.5522, "step": 4010 }, { "epoch": 6.05, "grad_norm": 7.251491546630859, "learning_rate": 9.395488721804512e-06, "loss": 0.6501, "step": 4020 }, { "epoch": 6.06, "grad_norm": 8.962389945983887, "learning_rate": 9.393984962406016e-06, "loss": 0.5402, "step": 4030 }, { "epoch": 6.08, "grad_norm": 6.207771301269531, "learning_rate": 9.392481203007519e-06, "loss": 0.5545, "step": 4040 }, { "epoch": 6.09, "grad_norm": 5.253688812255859, "learning_rate": 9.390977443609023e-06, "loss": 0.5084, "step": 4050 }, { "epoch": 6.11, "grad_norm": 7.277046203613281, "learning_rate": 9.389473684210528e-06, "loss": 0.5115, "step": 4060 }, { "epoch": 6.12, "grad_norm": 5.671750068664551, "learning_rate": 9.38796992481203e-06, "loss": 0.5508, "step": 4070 }, { "epoch": 6.14, "grad_norm": 3.9672038555145264, "learning_rate": 9.386466165413535e-06, "loss": 0.494, "step": 4080 }, { "epoch": 6.15, "grad_norm": 6.129919052124023, "learning_rate": 9.384962406015038e-06, "loss": 0.5898, "step": 4090 }, { "epoch": 6.17, "grad_norm": 6.198451519012451, "learning_rate": 9.383458646616542e-06, "loss": 0.5378, "step": 4100 }, { "epoch": 6.18, "grad_norm": 9.286908149719238, "learning_rate": 9.381954887218045e-06, "loss": 0.5656, "step": 4110 }, { "epoch": 6.2, "grad_norm": 6.862420082092285, "learning_rate": 9.38045112781955e-06, "loss": 0.5457, "step": 4120 }, { "epoch": 6.21, "grad_norm": 5.948605537414551, "learning_rate": 9.378947368421052e-06, "loss": 0.5564, "step": 4130 }, { "epoch": 6.23, "grad_norm": 7.1652116775512695, "learning_rate": 9.377443609022558e-06, "loss": 0.5624, "step": 4140 }, { "epoch": 6.24, "grad_norm": 7.091752052307129, "learning_rate": 9.375939849624061e-06, "loss": 0.532, "step": 4150 }, { "epoch": 6.26, "grad_norm": 4.2119903564453125, "learning_rate": 9.374436090225565e-06, "loss": 0.385, "step": 4160 }, { "epoch": 6.27, "grad_norm": 9.477155685424805, "learning_rate": 9.372932330827068e-06, "loss": 0.5019, "step": 4170 }, { "epoch": 6.29, "grad_norm": 7.294814109802246, "learning_rate": 9.371428571428572e-06, "loss": 0.556, "step": 4180 }, { "epoch": 6.3, "grad_norm": 8.124314308166504, "learning_rate": 9.369924812030075e-06, "loss": 0.548, "step": 4190 }, { "epoch": 6.32, "grad_norm": 6.1076483726501465, "learning_rate": 9.36842105263158e-06, "loss": 0.5228, "step": 4200 }, { "epoch": 6.33, "grad_norm": 7.360411643981934, "learning_rate": 9.366917293233084e-06, "loss": 0.6335, "step": 4210 }, { "epoch": 6.35, "grad_norm": 6.000509738922119, "learning_rate": 9.365413533834588e-06, "loss": 0.5727, "step": 4220 }, { "epoch": 6.36, "grad_norm": 8.538400650024414, "learning_rate": 9.363909774436091e-06, "loss": 0.5135, "step": 4230 }, { "epoch": 6.38, "grad_norm": 6.543038845062256, "learning_rate": 9.362406015037595e-06, "loss": 0.4859, "step": 4240 }, { "epoch": 6.39, "grad_norm": 7.515405178070068, "learning_rate": 9.360902255639098e-06, "loss": 0.6172, "step": 4250 }, { "epoch": 6.41, "grad_norm": 4.859574317932129, "learning_rate": 9.359398496240603e-06, "loss": 0.407, "step": 4260 }, { "epoch": 6.42, "grad_norm": 5.684931755065918, "learning_rate": 9.357894736842105e-06, "loss": 0.5748, "step": 4270 }, { "epoch": 6.44, "grad_norm": 5.063128471374512, "learning_rate": 9.35639097744361e-06, "loss": 0.5363, "step": 4280 }, { "epoch": 6.45, "grad_norm": 9.34011173248291, "learning_rate": 9.354887218045114e-06, "loss": 0.5072, "step": 4290 }, { "epoch": 6.47, "grad_norm": 6.302648544311523, "learning_rate": 9.353383458646617e-06, "loss": 0.4891, "step": 4300 }, { "epoch": 6.48, "grad_norm": 6.268799781799316, "learning_rate": 9.351879699248121e-06, "loss": 0.6215, "step": 4310 }, { "epoch": 6.5, "grad_norm": 5.54179573059082, "learning_rate": 9.350375939849624e-06, "loss": 0.5827, "step": 4320 }, { "epoch": 6.51, "grad_norm": 6.042153835296631, "learning_rate": 9.348872180451128e-06, "loss": 0.5205, "step": 4330 }, { "epoch": 6.53, "grad_norm": 7.558413982391357, "learning_rate": 9.347368421052633e-06, "loss": 0.5357, "step": 4340 }, { "epoch": 6.54, "grad_norm": 7.838019847869873, "learning_rate": 9.345864661654137e-06, "loss": 0.5719, "step": 4350 }, { "epoch": 6.56, "grad_norm": 10.056818008422852, "learning_rate": 9.34436090225564e-06, "loss": 0.542, "step": 4360 }, { "epoch": 6.57, "grad_norm": 7.325047492980957, "learning_rate": 9.342857142857144e-06, "loss": 0.5564, "step": 4370 }, { "epoch": 6.59, "grad_norm": 8.13595199584961, "learning_rate": 9.341353383458647e-06, "loss": 0.5106, "step": 4380 }, { "epoch": 6.6, "grad_norm": 7.225549221038818, "learning_rate": 9.33984962406015e-06, "loss": 0.5714, "step": 4390 }, { "epoch": 6.62, "grad_norm": 9.103632926940918, "learning_rate": 9.338345864661656e-06, "loss": 0.559, "step": 4400 }, { "epoch": 6.63, "grad_norm": 5.579386234283447, "learning_rate": 9.336842105263158e-06, "loss": 0.5789, "step": 4410 }, { "epoch": 6.65, "grad_norm": 9.875541687011719, "learning_rate": 9.335338345864663e-06, "loss": 0.5435, "step": 4420 }, { "epoch": 6.66, "grad_norm": 10.33945083618164, "learning_rate": 9.333834586466166e-06, "loss": 0.4678, "step": 4430 }, { "epoch": 6.68, "grad_norm": 8.180964469909668, "learning_rate": 9.33233082706767e-06, "loss": 0.5476, "step": 4440 }, { "epoch": 6.69, "grad_norm": 8.327938079833984, "learning_rate": 9.330827067669174e-06, "loss": 0.5605, "step": 4450 }, { "epoch": 6.71, "grad_norm": 6.9362311363220215, "learning_rate": 9.329323308270679e-06, "loss": 0.6099, "step": 4460 }, { "epoch": 6.72, "grad_norm": 4.627447128295898, "learning_rate": 9.32781954887218e-06, "loss": 0.4521, "step": 4470 }, { "epoch": 6.74, "grad_norm": 5.713562488555908, "learning_rate": 9.326315789473684e-06, "loss": 0.4532, "step": 4480 }, { "epoch": 6.75, "grad_norm": 6.893897533416748, "learning_rate": 9.32481203007519e-06, "loss": 0.5298, "step": 4490 }, { "epoch": 6.77, "grad_norm": 6.630578994750977, "learning_rate": 9.323308270676693e-06, "loss": 0.5226, "step": 4500 }, { "epoch": 6.78, "grad_norm": 7.725119113922119, "learning_rate": 9.321804511278196e-06, "loss": 0.4581, "step": 4510 }, { "epoch": 6.8, "grad_norm": 8.243720054626465, "learning_rate": 9.3203007518797e-06, "loss": 0.4821, "step": 4520 }, { "epoch": 6.81, "grad_norm": 7.254865646362305, "learning_rate": 9.318796992481203e-06, "loss": 0.4623, "step": 4530 }, { "epoch": 6.83, "grad_norm": 6.510406494140625, "learning_rate": 9.317293233082707e-06, "loss": 0.5073, "step": 4540 }, { "epoch": 6.84, "grad_norm": 5.531012058258057, "learning_rate": 9.315789473684212e-06, "loss": 0.4842, "step": 4550 }, { "epoch": 6.86, "grad_norm": 9.50185489654541, "learning_rate": 9.314285714285714e-06, "loss": 0.5485, "step": 4560 }, { "epoch": 6.87, "grad_norm": 5.591551780700684, "learning_rate": 9.312781954887219e-06, "loss": 0.5397, "step": 4570 }, { "epoch": 6.89, "grad_norm": 3.7708208560943604, "learning_rate": 9.311278195488723e-06, "loss": 0.4746, "step": 4580 }, { "epoch": 6.9, "grad_norm": 5.826446533203125, "learning_rate": 9.309774436090226e-06, "loss": 0.503, "step": 4590 }, { "epoch": 6.92, "grad_norm": 14.129280090332031, "learning_rate": 9.30827067669173e-06, "loss": 0.5729, "step": 4600 }, { "epoch": 6.93, "grad_norm": 5.19706392288208, "learning_rate": 9.306766917293233e-06, "loss": 0.4967, "step": 4610 }, { "epoch": 6.95, "grad_norm": 6.513811111450195, "learning_rate": 9.305263157894737e-06, "loss": 0.587, "step": 4620 }, { "epoch": 6.96, "grad_norm": 7.2199506759643555, "learning_rate": 9.303759398496242e-06, "loss": 0.5733, "step": 4630 }, { "epoch": 6.98, "grad_norm": 6.173489570617676, "learning_rate": 9.302255639097745e-06, "loss": 0.6391, "step": 4640 }, { "epoch": 6.99, "grad_norm": 4.977587699890137, "learning_rate": 9.300751879699249e-06, "loss": 0.5506, "step": 4650 }, { "epoch": 7.0, "eval_accuracy": 0.9302, "eval_loss": 0.2616922855377197, "eval_runtime": 85.0513, "eval_samples_per_second": 117.576, "eval_steps_per_second": 0.47, "step": 4655 }, { "epoch": 7.01, "grad_norm": 7.2303266525268555, "learning_rate": 9.299248120300753e-06, "loss": 0.4889, "step": 4660 }, { "epoch": 7.02, "grad_norm": 5.8191914558410645, "learning_rate": 9.297744360902256e-06, "loss": 0.6557, "step": 4670 }, { "epoch": 7.04, "grad_norm": 7.453029632568359, "learning_rate": 9.29624060150376e-06, "loss": 0.5975, "step": 4680 }, { "epoch": 7.05, "grad_norm": 9.434555053710938, "learning_rate": 9.294736842105265e-06, "loss": 0.4769, "step": 4690 }, { "epoch": 7.07, "grad_norm": 9.096846580505371, "learning_rate": 9.293233082706767e-06, "loss": 0.5521, "step": 4700 }, { "epoch": 7.08, "grad_norm": 6.146598815917969, "learning_rate": 9.291729323308272e-06, "loss": 0.5191, "step": 4710 }, { "epoch": 7.1, "grad_norm": 5.138683319091797, "learning_rate": 9.290225563909775e-06, "loss": 0.4883, "step": 4720 }, { "epoch": 7.11, "grad_norm": 5.840444564819336, "learning_rate": 9.288721804511279e-06, "loss": 0.4901, "step": 4730 }, { "epoch": 7.13, "grad_norm": 5.589585781097412, "learning_rate": 9.287218045112782e-06, "loss": 0.4574, "step": 4740 }, { "epoch": 7.14, "grad_norm": 7.447097301483154, "learning_rate": 9.285714285714288e-06, "loss": 0.4672, "step": 4750 }, { "epoch": 7.16, "grad_norm": 6.8820295333862305, "learning_rate": 9.28421052631579e-06, "loss": 0.5641, "step": 4760 }, { "epoch": 7.17, "grad_norm": 5.46907901763916, "learning_rate": 9.282706766917295e-06, "loss": 0.4948, "step": 4770 }, { "epoch": 7.19, "grad_norm": 5.4233527183532715, "learning_rate": 9.281203007518798e-06, "loss": 0.5507, "step": 4780 }, { "epoch": 7.2, "grad_norm": 6.316089153289795, "learning_rate": 9.279699248120302e-06, "loss": 0.5791, "step": 4790 }, { "epoch": 7.22, "grad_norm": 3.7618801593780518, "learning_rate": 9.278195488721805e-06, "loss": 0.4846, "step": 4800 }, { "epoch": 7.23, "grad_norm": 6.426711082458496, "learning_rate": 9.276691729323309e-06, "loss": 0.4886, "step": 4810 }, { "epoch": 7.25, "grad_norm": 6.98826265335083, "learning_rate": 9.275187969924812e-06, "loss": 0.4627, "step": 4820 }, { "epoch": 7.26, "grad_norm": 6.147061824798584, "learning_rate": 9.273684210526317e-06, "loss": 0.4577, "step": 4830 }, { "epoch": 7.28, "grad_norm": 7.308942794799805, "learning_rate": 9.272180451127821e-06, "loss": 0.4893, "step": 4840 }, { "epoch": 7.29, "grad_norm": 8.406046867370605, "learning_rate": 9.270676691729324e-06, "loss": 0.4968, "step": 4850 }, { "epoch": 7.31, "grad_norm": 4.631737232208252, "learning_rate": 9.269172932330828e-06, "loss": 0.4654, "step": 4860 }, { "epoch": 7.32, "grad_norm": 3.802255868911743, "learning_rate": 9.267669172932331e-06, "loss": 0.5018, "step": 4870 }, { "epoch": 7.34, "grad_norm": 7.958065986633301, "learning_rate": 9.266165413533835e-06, "loss": 0.5358, "step": 4880 }, { "epoch": 7.35, "grad_norm": 4.825588703155518, "learning_rate": 9.26466165413534e-06, "loss": 0.5201, "step": 4890 }, { "epoch": 7.37, "grad_norm": 4.964457035064697, "learning_rate": 9.263157894736842e-06, "loss": 0.4819, "step": 4900 }, { "epoch": 7.38, "grad_norm": 11.642394065856934, "learning_rate": 9.261654135338347e-06, "loss": 0.5989, "step": 4910 }, { "epoch": 7.4, "grad_norm": 9.31828498840332, "learning_rate": 9.26015037593985e-06, "loss": 0.5455, "step": 4920 }, { "epoch": 7.41, "grad_norm": 7.8129963874816895, "learning_rate": 9.258646616541354e-06, "loss": 0.5542, "step": 4930 }, { "epoch": 7.43, "grad_norm": 7.043788909912109, "learning_rate": 9.257142857142858e-06, "loss": 0.5153, "step": 4940 }, { "epoch": 7.44, "grad_norm": 8.311758995056152, "learning_rate": 9.255639097744363e-06, "loss": 0.4802, "step": 4950 }, { "epoch": 7.46, "grad_norm": 10.970717430114746, "learning_rate": 9.254135338345865e-06, "loss": 0.4495, "step": 4960 }, { "epoch": 7.47, "grad_norm": 5.547107219696045, "learning_rate": 9.252631578947368e-06, "loss": 0.4824, "step": 4970 }, { "epoch": 7.49, "grad_norm": 5.658668518066406, "learning_rate": 9.251127819548874e-06, "loss": 0.5722, "step": 4980 }, { "epoch": 7.5, "grad_norm": 4.896615982055664, "learning_rate": 9.249624060150375e-06, "loss": 0.4936, "step": 4990 }, { "epoch": 7.52, "grad_norm": 7.777392864227295, "learning_rate": 9.24812030075188e-06, "loss": 0.5078, "step": 5000 }, { "epoch": 7.53, "grad_norm": 11.22333812713623, "learning_rate": 9.246616541353384e-06, "loss": 0.575, "step": 5010 }, { "epoch": 7.55, "grad_norm": 6.031052589416504, "learning_rate": 9.245112781954888e-06, "loss": 0.4742, "step": 5020 }, { "epoch": 7.56, "grad_norm": 11.427336692810059, "learning_rate": 9.243609022556391e-06, "loss": 0.5519, "step": 5030 }, { "epoch": 7.58, "grad_norm": 6.76407527923584, "learning_rate": 9.242105263157896e-06, "loss": 0.5202, "step": 5040 }, { "epoch": 7.59, "grad_norm": 7.091256618499756, "learning_rate": 9.240601503759398e-06, "loss": 0.5255, "step": 5050 }, { "epoch": 7.61, "grad_norm": 6.818326473236084, "learning_rate": 9.239097744360903e-06, "loss": 0.442, "step": 5060 }, { "epoch": 7.62, "grad_norm": 7.494906902313232, "learning_rate": 9.237593984962407e-06, "loss": 0.5126, "step": 5070 }, { "epoch": 7.64, "grad_norm": 5.982577800750732, "learning_rate": 9.23609022556391e-06, "loss": 0.5056, "step": 5080 }, { "epoch": 7.65, "grad_norm": 4.815781116485596, "learning_rate": 9.234586466165414e-06, "loss": 0.4896, "step": 5090 }, { "epoch": 7.67, "grad_norm": 10.674721717834473, "learning_rate": 9.233082706766918e-06, "loss": 0.5209, "step": 5100 }, { "epoch": 7.68, "grad_norm": 5.937568187713623, "learning_rate": 9.231578947368421e-06, "loss": 0.4772, "step": 5110 }, { "epoch": 7.7, "grad_norm": 5.146367073059082, "learning_rate": 9.230075187969926e-06, "loss": 0.5032, "step": 5120 }, { "epoch": 7.71, "grad_norm": 8.03272819519043, "learning_rate": 9.22857142857143e-06, "loss": 0.5072, "step": 5130 }, { "epoch": 7.73, "grad_norm": 5.361180782318115, "learning_rate": 9.227067669172933e-06, "loss": 0.5057, "step": 5140 }, { "epoch": 7.74, "grad_norm": 5.487973213195801, "learning_rate": 9.225563909774437e-06, "loss": 0.5253, "step": 5150 }, { "epoch": 7.76, "grad_norm": 6.845251560211182, "learning_rate": 9.22406015037594e-06, "loss": 0.5436, "step": 5160 }, { "epoch": 7.77, "grad_norm": 4.931974411010742, "learning_rate": 9.222556390977444e-06, "loss": 0.4227, "step": 5170 }, { "epoch": 7.79, "grad_norm": 7.382147312164307, "learning_rate": 9.221052631578949e-06, "loss": 0.5022, "step": 5180 }, { "epoch": 7.8, "grad_norm": 8.380685806274414, "learning_rate": 9.219548872180451e-06, "loss": 0.5259, "step": 5190 }, { "epoch": 7.82, "grad_norm": 6.625802993774414, "learning_rate": 9.218045112781956e-06, "loss": 0.5213, "step": 5200 }, { "epoch": 7.83, "grad_norm": 6.7276692390441895, "learning_rate": 9.21654135338346e-06, "loss": 0.4831, "step": 5210 }, { "epoch": 7.85, "grad_norm": 5.930064678192139, "learning_rate": 9.215037593984963e-06, "loss": 0.4278, "step": 5220 }, { "epoch": 7.86, "grad_norm": 5.958808422088623, "learning_rate": 9.213533834586467e-06, "loss": 0.5039, "step": 5230 }, { "epoch": 7.88, "grad_norm": 8.592114448547363, "learning_rate": 9.21203007518797e-06, "loss": 0.48, "step": 5240 }, { "epoch": 7.89, "grad_norm": 7.286666393280029, "learning_rate": 9.210526315789474e-06, "loss": 0.5513, "step": 5250 }, { "epoch": 7.91, "grad_norm": 3.8542234897613525, "learning_rate": 9.209022556390979e-06, "loss": 0.4452, "step": 5260 }, { "epoch": 7.92, "grad_norm": 8.812358856201172, "learning_rate": 9.207518796992482e-06, "loss": 0.4796, "step": 5270 }, { "epoch": 7.94, "grad_norm": 7.729457378387451, "learning_rate": 9.206015037593986e-06, "loss": 0.5283, "step": 5280 }, { "epoch": 7.95, "grad_norm": 5.370766639709473, "learning_rate": 9.20451127819549e-06, "loss": 0.557, "step": 5290 }, { "epoch": 7.97, "grad_norm": 7.00390625, "learning_rate": 9.203007518796993e-06, "loss": 0.5266, "step": 5300 }, { "epoch": 7.98, "grad_norm": 5.945902347564697, "learning_rate": 9.201503759398496e-06, "loss": 0.505, "step": 5310 }, { "epoch": 8.0, "grad_norm": 39.0330924987793, "learning_rate": 9.200000000000002e-06, "loss": 0.5673, "step": 5320 }, { "epoch": 8.0, "eval_accuracy": 0.9324, "eval_loss": 0.2575855553150177, "eval_runtime": 85.1441, "eval_samples_per_second": 117.448, "eval_steps_per_second": 0.47, "step": 5320 }, { "epoch": 8.02, "grad_norm": 7.23183012008667, "learning_rate": 9.198496240601504e-06, "loss": 0.4209, "step": 5330 }, { "epoch": 8.03, "grad_norm": 5.357458591461182, "learning_rate": 9.196992481203009e-06, "loss": 0.4701, "step": 5340 }, { "epoch": 8.05, "grad_norm": 9.471860885620117, "learning_rate": 9.195488721804512e-06, "loss": 0.4924, "step": 5350 }, { "epoch": 8.06, "grad_norm": 7.7437214851379395, "learning_rate": 9.193984962406016e-06, "loss": 0.4997, "step": 5360 }, { "epoch": 8.08, "grad_norm": 6.697991371154785, "learning_rate": 9.19248120300752e-06, "loss": 0.4365, "step": 5370 }, { "epoch": 8.09, "grad_norm": 9.78630542755127, "learning_rate": 9.190977443609025e-06, "loss": 0.5572, "step": 5380 }, { "epoch": 8.11, "grad_norm": 6.197582244873047, "learning_rate": 9.189473684210526e-06, "loss": 0.4581, "step": 5390 }, { "epoch": 8.12, "grad_norm": 5.565506458282471, "learning_rate": 9.187969924812032e-06, "loss": 0.5219, "step": 5400 }, { "epoch": 8.14, "grad_norm": 5.3856353759765625, "learning_rate": 9.186466165413535e-06, "loss": 0.5744, "step": 5410 }, { "epoch": 8.15, "grad_norm": 6.119697093963623, "learning_rate": 9.184962406015039e-06, "loss": 0.3964, "step": 5420 }, { "epoch": 8.17, "grad_norm": 5.344573497772217, "learning_rate": 9.183458646616542e-06, "loss": 0.4816, "step": 5430 }, { "epoch": 8.18, "grad_norm": 5.624027729034424, "learning_rate": 9.181954887218046e-06, "loss": 0.4341, "step": 5440 }, { "epoch": 8.2, "grad_norm": 4.847854137420654, "learning_rate": 9.180451127819549e-06, "loss": 0.5471, "step": 5450 }, { "epoch": 8.21, "grad_norm": 6.051102638244629, "learning_rate": 9.178947368421053e-06, "loss": 0.5593, "step": 5460 }, { "epoch": 8.23, "grad_norm": 8.345075607299805, "learning_rate": 9.177443609022558e-06, "loss": 0.4132, "step": 5470 }, { "epoch": 8.24, "grad_norm": 5.369195938110352, "learning_rate": 9.17593984962406e-06, "loss": 0.4844, "step": 5480 }, { "epoch": 8.26, "grad_norm": 6.0165228843688965, "learning_rate": 9.174436090225565e-06, "loss": 0.4712, "step": 5490 }, { "epoch": 8.27, "grad_norm": 4.54939079284668, "learning_rate": 9.172932330827068e-06, "loss": 0.4956, "step": 5500 }, { "epoch": 8.29, "grad_norm": 5.021441459655762, "learning_rate": 9.171428571428572e-06, "loss": 0.4428, "step": 5510 }, { "epoch": 8.3, "grad_norm": 7.0452446937561035, "learning_rate": 9.169924812030075e-06, "loss": 0.5036, "step": 5520 }, { "epoch": 8.32, "grad_norm": 8.039512634277344, "learning_rate": 9.168421052631579e-06, "loss": 0.478, "step": 5530 }, { "epoch": 8.33, "grad_norm": 6.608015537261963, "learning_rate": 9.166917293233083e-06, "loss": 0.4338, "step": 5540 }, { "epoch": 8.35, "grad_norm": 6.659971714019775, "learning_rate": 9.165413533834588e-06, "loss": 0.5033, "step": 5550 }, { "epoch": 8.36, "grad_norm": 5.663388252258301, "learning_rate": 9.163909774436091e-06, "loss": 0.4779, "step": 5560 }, { "epoch": 8.38, "grad_norm": 5.978389739990234, "learning_rate": 9.162406015037595e-06, "loss": 0.4615, "step": 5570 }, { "epoch": 8.39, "grad_norm": 5.943080425262451, "learning_rate": 9.160902255639098e-06, "loss": 0.4282, "step": 5580 }, { "epoch": 8.41, "grad_norm": 3.245058536529541, "learning_rate": 9.159398496240602e-06, "loss": 0.4423, "step": 5590 }, { "epoch": 8.42, "grad_norm": 5.73254919052124, "learning_rate": 9.157894736842105e-06, "loss": 0.4777, "step": 5600 }, { "epoch": 8.44, "grad_norm": 6.487976551055908, "learning_rate": 9.15639097744361e-06, "loss": 0.5132, "step": 5610 }, { "epoch": 8.45, "grad_norm": 4.1268463134765625, "learning_rate": 9.154887218045112e-06, "loss": 0.3942, "step": 5620 }, { "epoch": 8.47, "grad_norm": 7.593535900115967, "learning_rate": 9.153383458646618e-06, "loss": 0.5109, "step": 5630 }, { "epoch": 8.48, "grad_norm": 4.127936840057373, "learning_rate": 9.151879699248121e-06, "loss": 0.4554, "step": 5640 }, { "epoch": 8.5, "grad_norm": 12.721508026123047, "learning_rate": 9.150375939849625e-06, "loss": 0.5204, "step": 5650 }, { "epoch": 8.51, "grad_norm": 7.618612289428711, "learning_rate": 9.148872180451128e-06, "loss": 0.5095, "step": 5660 }, { "epoch": 8.53, "grad_norm": 5.089692115783691, "learning_rate": 9.147368421052633e-06, "loss": 0.478, "step": 5670 }, { "epoch": 8.54, "grad_norm": 6.890159606933594, "learning_rate": 9.145864661654135e-06, "loss": 0.4634, "step": 5680 }, { "epoch": 8.56, "grad_norm": 9.275102615356445, "learning_rate": 9.14436090225564e-06, "loss": 0.528, "step": 5690 }, { "epoch": 8.57, "grad_norm": 4.839653015136719, "learning_rate": 9.142857142857144e-06, "loss": 0.4558, "step": 5700 }, { "epoch": 8.59, "grad_norm": 7.7605791091918945, "learning_rate": 9.141353383458647e-06, "loss": 0.5086, "step": 5710 }, { "epoch": 8.6, "grad_norm": 6.753016948699951, "learning_rate": 9.139849624060151e-06, "loss": 0.4953, "step": 5720 }, { "epoch": 8.62, "grad_norm": 10.297369003295898, "learning_rate": 9.138345864661654e-06, "loss": 0.454, "step": 5730 }, { "epoch": 8.63, "grad_norm": 5.704922676086426, "learning_rate": 9.136842105263158e-06, "loss": 0.4504, "step": 5740 }, { "epoch": 8.65, "grad_norm": 5.4977030754089355, "learning_rate": 9.135338345864663e-06, "loss": 0.5203, "step": 5750 }, { "epoch": 8.66, "grad_norm": 7.531189918518066, "learning_rate": 9.133834586466167e-06, "loss": 0.4534, "step": 5760 }, { "epoch": 8.68, "grad_norm": 6.660569667816162, "learning_rate": 9.13233082706767e-06, "loss": 0.577, "step": 5770 }, { "epoch": 8.69, "grad_norm": 6.752721309661865, "learning_rate": 9.130827067669174e-06, "loss": 0.4574, "step": 5780 }, { "epoch": 8.71, "grad_norm": 7.526786804199219, "learning_rate": 9.129323308270677e-06, "loss": 0.4511, "step": 5790 }, { "epoch": 8.72, "grad_norm": 6.5963239669799805, "learning_rate": 9.12781954887218e-06, "loss": 0.4917, "step": 5800 }, { "epoch": 8.74, "grad_norm": 4.203681468963623, "learning_rate": 9.126315789473686e-06, "loss": 0.5218, "step": 5810 }, { "epoch": 8.75, "grad_norm": 4.367255210876465, "learning_rate": 9.124812030075188e-06, "loss": 0.4162, "step": 5820 }, { "epoch": 8.77, "grad_norm": 3.8670730590820312, "learning_rate": 9.123308270676693e-06, "loss": 0.3913, "step": 5830 }, { "epoch": 8.78, "grad_norm": 9.634267807006836, "learning_rate": 9.121804511278197e-06, "loss": 0.5062, "step": 5840 }, { "epoch": 8.8, "grad_norm": 2.509295701980591, "learning_rate": 9.1203007518797e-06, "loss": 0.4422, "step": 5850 }, { "epoch": 8.81, "grad_norm": 7.6244659423828125, "learning_rate": 9.118796992481204e-06, "loss": 0.4566, "step": 5860 }, { "epoch": 8.83, "grad_norm": 6.837118625640869, "learning_rate": 9.117293233082709e-06, "loss": 0.5095, "step": 5870 }, { "epoch": 8.84, "grad_norm": 4.819979667663574, "learning_rate": 9.11578947368421e-06, "loss": 0.4471, "step": 5880 }, { "epoch": 8.86, "grad_norm": 6.350512504577637, "learning_rate": 9.114285714285716e-06, "loss": 0.4751, "step": 5890 }, { "epoch": 8.87, "grad_norm": 3.4793074131011963, "learning_rate": 9.11278195488722e-06, "loss": 0.4089, "step": 5900 }, { "epoch": 8.89, "grad_norm": 5.1062774658203125, "learning_rate": 9.111278195488723e-06, "loss": 0.5624, "step": 5910 }, { "epoch": 8.9, "grad_norm": 8.126543998718262, "learning_rate": 9.109774436090226e-06, "loss": 0.5146, "step": 5920 }, { "epoch": 8.92, "grad_norm": 7.661808967590332, "learning_rate": 9.10827067669173e-06, "loss": 0.4602, "step": 5930 }, { "epoch": 8.93, "grad_norm": 6.820888996124268, "learning_rate": 9.106766917293233e-06, "loss": 0.4827, "step": 5940 }, { "epoch": 8.95, "grad_norm": 24.169485092163086, "learning_rate": 9.105263157894739e-06, "loss": 0.5302, "step": 5950 }, { "epoch": 8.96, "grad_norm": 5.068043231964111, "learning_rate": 9.103759398496242e-06, "loss": 0.5453, "step": 5960 }, { "epoch": 8.98, "grad_norm": 5.819450378417969, "learning_rate": 9.102255639097744e-06, "loss": 0.4811, "step": 5970 }, { "epoch": 8.99, "grad_norm": 4.129781723022461, "learning_rate": 9.10075187969925e-06, "loss": 0.4613, "step": 5980 }, { "epoch": 9.0, "eval_accuracy": 0.9311, "eval_loss": 0.25862327218055725, "eval_runtime": 84.8621, "eval_samples_per_second": 117.838, "eval_steps_per_second": 0.471, "step": 5985 }, { "epoch": 9.01, "grad_norm": 6.844424247741699, "learning_rate": 9.099248120300753e-06, "loss": 0.4154, "step": 5990 }, { "epoch": 9.02, "grad_norm": 5.792689323425293, "learning_rate": 9.097744360902256e-06, "loss": 0.4345, "step": 6000 }, { "epoch": 9.04, "grad_norm": 5.300471305847168, "learning_rate": 9.09624060150376e-06, "loss": 0.3986, "step": 6010 }, { "epoch": 9.05, "grad_norm": 11.9564208984375, "learning_rate": 9.094736842105263e-06, "loss": 0.4255, "step": 6020 }, { "epoch": 9.07, "grad_norm": 6.798022270202637, "learning_rate": 9.093233082706767e-06, "loss": 0.4296, "step": 6030 }, { "epoch": 9.08, "grad_norm": 7.78212308883667, "learning_rate": 9.091729323308272e-06, "loss": 0.4373, "step": 6040 }, { "epoch": 9.1, "grad_norm": 6.719583988189697, "learning_rate": 9.090225563909776e-06, "loss": 0.4943, "step": 6050 }, { "epoch": 9.11, "grad_norm": 8.298834800720215, "learning_rate": 9.088721804511279e-06, "loss": 0.3937, "step": 6060 }, { "epoch": 9.13, "grad_norm": 4.731727600097656, "learning_rate": 9.087218045112783e-06, "loss": 0.4711, "step": 6070 }, { "epoch": 9.14, "grad_norm": 6.207810878753662, "learning_rate": 9.085714285714286e-06, "loss": 0.4565, "step": 6080 }, { "epoch": 9.16, "grad_norm": 6.939966678619385, "learning_rate": 9.08421052631579e-06, "loss": 0.5276, "step": 6090 }, { "epoch": 9.17, "grad_norm": 9.00831127166748, "learning_rate": 9.082706766917295e-06, "loss": 0.5558, "step": 6100 }, { "epoch": 9.19, "grad_norm": 4.730199813842773, "learning_rate": 9.081203007518797e-06, "loss": 0.4826, "step": 6110 }, { "epoch": 9.2, "grad_norm": 4.198337078094482, "learning_rate": 9.079699248120302e-06, "loss": 0.3978, "step": 6120 }, { "epoch": 9.22, "grad_norm": 5.722704887390137, "learning_rate": 9.078195488721805e-06, "loss": 0.4622, "step": 6130 }, { "epoch": 9.23, "grad_norm": 8.497228622436523, "learning_rate": 9.076691729323309e-06, "loss": 0.4726, "step": 6140 }, { "epoch": 9.25, "grad_norm": 7.046009063720703, "learning_rate": 9.075187969924812e-06, "loss": 0.527, "step": 6150 }, { "epoch": 9.26, "grad_norm": 7.972896099090576, "learning_rate": 9.073684210526316e-06, "loss": 0.3072, "step": 6160 }, { "epoch": 9.28, "grad_norm": 8.850788116455078, "learning_rate": 9.07218045112782e-06, "loss": 0.4753, "step": 6170 }, { "epoch": 9.29, "grad_norm": 6.064061641693115, "learning_rate": 9.070676691729325e-06, "loss": 0.4009, "step": 6180 }, { "epoch": 9.31, "grad_norm": 6.12713098526001, "learning_rate": 9.069172932330828e-06, "loss": 0.4786, "step": 6190 }, { "epoch": 9.32, "grad_norm": 8.643204689025879, "learning_rate": 9.067669172932332e-06, "loss": 0.5134, "step": 6200 }, { "epoch": 9.34, "grad_norm": 7.257277488708496, "learning_rate": 9.066165413533835e-06, "loss": 0.4474, "step": 6210 }, { "epoch": 9.35, "grad_norm": 7.62333869934082, "learning_rate": 9.064661654135339e-06, "loss": 0.4579, "step": 6220 }, { "epoch": 9.37, "grad_norm": 9.346735954284668, "learning_rate": 9.063157894736842e-06, "loss": 0.4714, "step": 6230 }, { "epoch": 9.38, "grad_norm": 3.8007750511169434, "learning_rate": 9.061654135338347e-06, "loss": 0.4334, "step": 6240 }, { "epoch": 9.4, "grad_norm": 6.266302108764648, "learning_rate": 9.06015037593985e-06, "loss": 0.4704, "step": 6250 }, { "epoch": 9.41, "grad_norm": 6.959786891937256, "learning_rate": 9.058646616541355e-06, "loss": 0.5353, "step": 6260 }, { "epoch": 9.43, "grad_norm": 6.572616100311279, "learning_rate": 9.057142857142858e-06, "loss": 0.4629, "step": 6270 }, { "epoch": 9.44, "grad_norm": 5.961916446685791, "learning_rate": 9.055639097744362e-06, "loss": 0.5119, "step": 6280 }, { "epoch": 9.46, "grad_norm": 6.547915935516357, "learning_rate": 9.054135338345865e-06, "loss": 0.4576, "step": 6290 }, { "epoch": 9.47, "grad_norm": 6.359402179718018, "learning_rate": 9.05263157894737e-06, "loss": 0.3692, "step": 6300 }, { "epoch": 9.49, "grad_norm": 7.048614501953125, "learning_rate": 9.051127819548872e-06, "loss": 0.3911, "step": 6310 }, { "epoch": 9.5, "grad_norm": 5.198198318481445, "learning_rate": 9.049624060150377e-06, "loss": 0.475, "step": 6320 }, { "epoch": 9.52, "grad_norm": 5.797221660614014, "learning_rate": 9.04812030075188e-06, "loss": 0.3771, "step": 6330 }, { "epoch": 9.53, "grad_norm": 5.751585483551025, "learning_rate": 9.046616541353384e-06, "loss": 0.497, "step": 6340 }, { "epoch": 9.55, "grad_norm": 9.54306697845459, "learning_rate": 9.045112781954888e-06, "loss": 0.3976, "step": 6350 }, { "epoch": 9.56, "grad_norm": 8.968032836914062, "learning_rate": 9.043609022556391e-06, "loss": 0.4252, "step": 6360 }, { "epoch": 9.58, "grad_norm": 13.218304634094238, "learning_rate": 9.042105263157895e-06, "loss": 0.5191, "step": 6370 }, { "epoch": 9.59, "grad_norm": 4.405686855316162, "learning_rate": 9.0406015037594e-06, "loss": 0.4149, "step": 6380 }, { "epoch": 9.61, "grad_norm": 4.863158702850342, "learning_rate": 9.039097744360904e-06, "loss": 0.4875, "step": 6390 }, { "epoch": 9.62, "grad_norm": 6.247385501861572, "learning_rate": 9.037593984962407e-06, "loss": 0.4106, "step": 6400 }, { "epoch": 9.64, "grad_norm": 6.554888725280762, "learning_rate": 9.03609022556391e-06, "loss": 0.4645, "step": 6410 }, { "epoch": 9.65, "grad_norm": 7.249465465545654, "learning_rate": 9.034586466165414e-06, "loss": 0.4291, "step": 6420 }, { "epoch": 9.67, "grad_norm": 6.810882568359375, "learning_rate": 9.033082706766918e-06, "loss": 0.4637, "step": 6430 }, { "epoch": 9.68, "grad_norm": 2.65733003616333, "learning_rate": 9.031578947368423e-06, "loss": 0.4585, "step": 6440 }, { "epoch": 9.7, "grad_norm": 8.63343334197998, "learning_rate": 9.030075187969925e-06, "loss": 0.5187, "step": 6450 }, { "epoch": 9.71, "grad_norm": 5.558303356170654, "learning_rate": 9.028571428571428e-06, "loss": 0.4819, "step": 6460 }, { "epoch": 9.73, "grad_norm": 8.711833000183105, "learning_rate": 9.027067669172933e-06, "loss": 0.4585, "step": 6470 }, { "epoch": 9.74, "grad_norm": 7.882017135620117, "learning_rate": 9.025563909774437e-06, "loss": 0.5154, "step": 6480 }, { "epoch": 9.76, "grad_norm": 5.849830150604248, "learning_rate": 9.02406015037594e-06, "loss": 0.5093, "step": 6490 }, { "epoch": 9.77, "grad_norm": 7.546263217926025, "learning_rate": 9.022556390977444e-06, "loss": 0.3969, "step": 6500 }, { "epoch": 9.79, "grad_norm": 7.114614963531494, "learning_rate": 9.021052631578948e-06, "loss": 0.4485, "step": 6510 }, { "epoch": 9.8, "grad_norm": 7.6169209480285645, "learning_rate": 9.019548872180451e-06, "loss": 0.425, "step": 6520 }, { "epoch": 9.82, "grad_norm": 5.7843403816223145, "learning_rate": 9.018045112781956e-06, "loss": 0.4171, "step": 6530 }, { "epoch": 9.83, "grad_norm": 6.0503082275390625, "learning_rate": 9.016541353383458e-06, "loss": 0.483, "step": 6540 }, { "epoch": 9.85, "grad_norm": 7.677584648132324, "learning_rate": 9.015037593984963e-06, "loss": 0.4747, "step": 6550 }, { "epoch": 9.86, "grad_norm": 5.793139934539795, "learning_rate": 9.013533834586467e-06, "loss": 0.3621, "step": 6560 }, { "epoch": 9.88, "grad_norm": 6.399969577789307, "learning_rate": 9.01203007518797e-06, "loss": 0.4373, "step": 6570 }, { "epoch": 9.89, "grad_norm": 10.296338081359863, "learning_rate": 9.010526315789474e-06, "loss": 0.417, "step": 6580 }, { "epoch": 9.91, "grad_norm": 6.193917274475098, "learning_rate": 9.009022556390979e-06, "loss": 0.4419, "step": 6590 }, { "epoch": 9.92, "grad_norm": 3.921016216278076, "learning_rate": 9.007518796992481e-06, "loss": 0.3981, "step": 6600 }, { "epoch": 9.94, "grad_norm": 6.30132532119751, "learning_rate": 9.006015037593986e-06, "loss": 0.4699, "step": 6610 }, { "epoch": 9.95, "grad_norm": 8.901771545410156, "learning_rate": 9.00451127819549e-06, "loss": 0.4308, "step": 6620 }, { "epoch": 9.97, "grad_norm": 5.031552314758301, "learning_rate": 9.003007518796993e-06, "loss": 0.48, "step": 6630 }, { "epoch": 9.98, "grad_norm": 5.636510372161865, "learning_rate": 9.001503759398497e-06, "loss": 0.403, "step": 6640 }, { "epoch": 10.0, "grad_norm": 3.5294342041015625, "learning_rate": 9e-06, "loss": 0.4179, "step": 6650 }, { "epoch": 10.0, "eval_accuracy": 0.9285, "eval_loss": 0.2555387318134308, "eval_runtime": 84.7099, "eval_samples_per_second": 118.05, "eval_steps_per_second": 0.472, "step": 6650 }, { "epoch": 10.02, "grad_norm": 6.522907257080078, "learning_rate": 8.998496240601504e-06, "loss": 0.4525, "step": 6660 }, { "epoch": 10.03, "grad_norm": 6.142210006713867, "learning_rate": 8.996992481203009e-06, "loss": 0.3998, "step": 6670 }, { "epoch": 10.05, "grad_norm": 7.781100749969482, "learning_rate": 8.995488721804512e-06, "loss": 0.4122, "step": 6680 }, { "epoch": 10.06, "grad_norm": 5.448252201080322, "learning_rate": 8.993984962406016e-06, "loss": 0.461, "step": 6690 }, { "epoch": 10.08, "grad_norm": 7.063671588897705, "learning_rate": 8.99248120300752e-06, "loss": 0.432, "step": 6700 }, { "epoch": 10.09, "grad_norm": 6.696626663208008, "learning_rate": 8.990977443609023e-06, "loss": 0.4208, "step": 6710 }, { "epoch": 10.11, "grad_norm": 6.5666656494140625, "learning_rate": 8.989473684210527e-06, "loss": 0.4527, "step": 6720 }, { "epoch": 10.12, "grad_norm": 8.801324844360352, "learning_rate": 8.987969924812032e-06, "loss": 0.442, "step": 6730 }, { "epoch": 10.14, "grad_norm": 6.743152141571045, "learning_rate": 8.986466165413534e-06, "loss": 0.4424, "step": 6740 }, { "epoch": 10.15, "grad_norm": 5.408703327178955, "learning_rate": 8.984962406015039e-06, "loss": 0.4548, "step": 6750 }, { "epoch": 10.17, "grad_norm": 8.466784477233887, "learning_rate": 8.983458646616542e-06, "loss": 0.4355, "step": 6760 }, { "epoch": 10.18, "grad_norm": 5.309767723083496, "learning_rate": 8.981954887218046e-06, "loss": 0.4023, "step": 6770 }, { "epoch": 10.2, "grad_norm": 3.3604421615600586, "learning_rate": 8.98045112781955e-06, "loss": 0.4367, "step": 6780 }, { "epoch": 10.21, "grad_norm": 6.275347709655762, "learning_rate": 8.978947368421055e-06, "loss": 0.4764, "step": 6790 }, { "epoch": 10.23, "grad_norm": 6.770579814910889, "learning_rate": 8.977443609022556e-06, "loss": 0.4414, "step": 6800 }, { "epoch": 10.24, "grad_norm": 8.56733512878418, "learning_rate": 8.975939849624062e-06, "loss": 0.4426, "step": 6810 }, { "epoch": 10.26, "grad_norm": 6.006712436676025, "learning_rate": 8.974436090225565e-06, "loss": 0.3702, "step": 6820 }, { "epoch": 10.27, "grad_norm": 4.649052143096924, "learning_rate": 8.972932330827069e-06, "loss": 0.5371, "step": 6830 }, { "epoch": 10.29, "grad_norm": 9.080769538879395, "learning_rate": 8.971428571428572e-06, "loss": 0.4866, "step": 6840 }, { "epoch": 10.3, "grad_norm": 5.778624534606934, "learning_rate": 8.969924812030076e-06, "loss": 0.381, "step": 6850 }, { "epoch": 10.32, "grad_norm": 7.814187049865723, "learning_rate": 8.96842105263158e-06, "loss": 0.5162, "step": 6860 }, { "epoch": 10.33, "grad_norm": 5.049838542938232, "learning_rate": 8.966917293233084e-06, "loss": 0.4879, "step": 6870 }, { "epoch": 10.35, "grad_norm": 8.096096992492676, "learning_rate": 8.965413533834588e-06, "loss": 0.4726, "step": 6880 }, { "epoch": 10.36, "grad_norm": 7.028320789337158, "learning_rate": 8.963909774436091e-06, "loss": 0.4424, "step": 6890 }, { "epoch": 10.38, "grad_norm": 4.826821804046631, "learning_rate": 8.962406015037595e-06, "loss": 0.4552, "step": 6900 }, { "epoch": 10.39, "grad_norm": 8.392495155334473, "learning_rate": 8.960902255639098e-06, "loss": 0.4378, "step": 6910 }, { "epoch": 10.41, "grad_norm": 4.868290424346924, "learning_rate": 8.959398496240602e-06, "loss": 0.4151, "step": 6920 }, { "epoch": 10.42, "grad_norm": 6.117234230041504, "learning_rate": 8.957894736842107e-06, "loss": 0.5149, "step": 6930 }, { "epoch": 10.44, "grad_norm": 9.33238697052002, "learning_rate": 8.956390977443609e-06, "loss": 0.3984, "step": 6940 }, { "epoch": 10.45, "grad_norm": 9.559886932373047, "learning_rate": 8.954887218045113e-06, "loss": 0.4171, "step": 6950 }, { "epoch": 10.47, "grad_norm": 4.344634056091309, "learning_rate": 8.953383458646618e-06, "loss": 0.4419, "step": 6960 }, { "epoch": 10.48, "grad_norm": 5.508487701416016, "learning_rate": 8.951879699248121e-06, "loss": 0.4605, "step": 6970 }, { "epoch": 10.5, "grad_norm": 5.529686450958252, "learning_rate": 8.950375939849625e-06, "loss": 0.4004, "step": 6980 }, { "epoch": 10.51, "grad_norm": 5.424170970916748, "learning_rate": 8.948872180451128e-06, "loss": 0.4351, "step": 6990 }, { "epoch": 10.53, "grad_norm": 6.121506690979004, "learning_rate": 8.947368421052632e-06, "loss": 0.4198, "step": 7000 }, { "epoch": 10.54, "grad_norm": 4.664872169494629, "learning_rate": 8.945864661654135e-06, "loss": 0.3917, "step": 7010 }, { "epoch": 10.56, "grad_norm": 5.378602027893066, "learning_rate": 8.94436090225564e-06, "loss": 0.47, "step": 7020 }, { "epoch": 10.57, "grad_norm": 8.281057357788086, "learning_rate": 8.942857142857142e-06, "loss": 0.3779, "step": 7030 }, { "epoch": 10.59, "grad_norm": 5.378328800201416, "learning_rate": 8.941353383458648e-06, "loss": 0.4878, "step": 7040 }, { "epoch": 10.6, "grad_norm": 4.809008598327637, "learning_rate": 8.939849624060151e-06, "loss": 0.4409, "step": 7050 }, { "epoch": 10.62, "grad_norm": 6.703794002532959, "learning_rate": 8.938345864661655e-06, "loss": 0.45, "step": 7060 }, { "epoch": 10.63, "grad_norm": 10.097111701965332, "learning_rate": 8.936842105263158e-06, "loss": 0.4442, "step": 7070 }, { "epoch": 10.65, "grad_norm": 5.404522895812988, "learning_rate": 8.935338345864662e-06, "loss": 0.4289, "step": 7080 }, { "epoch": 10.66, "grad_norm": 2.983161449432373, "learning_rate": 8.933834586466165e-06, "loss": 0.4072, "step": 7090 }, { "epoch": 10.68, "grad_norm": 6.501340389251709, "learning_rate": 8.93233082706767e-06, "loss": 0.4344, "step": 7100 }, { "epoch": 10.69, "grad_norm": 7.439212322235107, "learning_rate": 8.930827067669174e-06, "loss": 0.4208, "step": 7110 }, { "epoch": 10.71, "grad_norm": 6.9180192947387695, "learning_rate": 8.929323308270677e-06, "loss": 0.4901, "step": 7120 }, { "epoch": 10.72, "grad_norm": 4.9598212242126465, "learning_rate": 8.927819548872181e-06, "loss": 0.4551, "step": 7130 }, { "epoch": 10.74, "grad_norm": 7.020519256591797, "learning_rate": 8.926315789473685e-06, "loss": 0.4469, "step": 7140 }, { "epoch": 10.75, "grad_norm": 6.747496604919434, "learning_rate": 8.924812030075188e-06, "loss": 0.3626, "step": 7150 }, { "epoch": 10.77, "grad_norm": 2.869495153427124, "learning_rate": 8.923308270676693e-06, "loss": 0.3794, "step": 7160 }, { "epoch": 10.78, "grad_norm": 7.156761169433594, "learning_rate": 8.921804511278195e-06, "loss": 0.4909, "step": 7170 }, { "epoch": 10.8, "grad_norm": 9.461006164550781, "learning_rate": 8.9203007518797e-06, "loss": 0.4487, "step": 7180 }, { "epoch": 10.81, "grad_norm": 5.75421142578125, "learning_rate": 8.918796992481204e-06, "loss": 0.4953, "step": 7190 }, { "epoch": 10.83, "grad_norm": 4.186371326446533, "learning_rate": 8.917293233082707e-06, "loss": 0.3788, "step": 7200 }, { "epoch": 10.84, "grad_norm": 6.402685165405273, "learning_rate": 8.915789473684211e-06, "loss": 0.5009, "step": 7210 }, { "epoch": 10.86, "grad_norm": 10.709757804870605, "learning_rate": 8.914285714285716e-06, "loss": 0.5308, "step": 7220 }, { "epoch": 10.87, "grad_norm": 8.926152229309082, "learning_rate": 8.912781954887218e-06, "loss": 0.4461, "step": 7230 }, { "epoch": 10.89, "grad_norm": 6.41901969909668, "learning_rate": 8.911278195488723e-06, "loss": 0.4188, "step": 7240 }, { "epoch": 10.9, "grad_norm": 4.931794166564941, "learning_rate": 8.909774436090227e-06, "loss": 0.4089, "step": 7250 }, { "epoch": 10.92, "grad_norm": 7.75593376159668, "learning_rate": 8.90827067669173e-06, "loss": 0.5261, "step": 7260 }, { "epoch": 10.93, "grad_norm": 9.013036727905273, "learning_rate": 8.906766917293234e-06, "loss": 0.3274, "step": 7270 }, { "epoch": 10.95, "grad_norm": 6.653579235076904, "learning_rate": 8.905263157894737e-06, "loss": 0.4646, "step": 7280 }, { "epoch": 10.96, "grad_norm": 5.304203987121582, "learning_rate": 8.90375939849624e-06, "loss": 0.4399, "step": 7290 }, { "epoch": 10.98, "grad_norm": 40.1646842956543, "learning_rate": 8.902255639097746e-06, "loss": 0.3007, "step": 7300 }, { "epoch": 10.99, "grad_norm": 5.538785934448242, "learning_rate": 8.90075187969925e-06, "loss": 0.4438, "step": 7310 }, { "epoch": 11.0, "eval_accuracy": 0.9316, "eval_loss": 0.25541195273399353, "eval_runtime": 84.8107, "eval_samples_per_second": 117.91, "eval_steps_per_second": 0.472, "step": 7315 }, { "epoch": 11.01, "grad_norm": 8.498943328857422, "learning_rate": 8.899248120300753e-06, "loss": 0.4009, "step": 7320 }, { "epoch": 11.02, "grad_norm": 6.2147040367126465, "learning_rate": 8.897744360902256e-06, "loss": 0.4283, "step": 7330 }, { "epoch": 11.04, "grad_norm": 5.028774261474609, "learning_rate": 8.89624060150376e-06, "loss": 0.4145, "step": 7340 }, { "epoch": 11.05, "grad_norm": 7.040588855743408, "learning_rate": 8.894736842105264e-06, "loss": 0.3753, "step": 7350 }, { "epoch": 11.07, "grad_norm": 4.658559322357178, "learning_rate": 8.893233082706769e-06, "loss": 0.4139, "step": 7360 }, { "epoch": 11.08, "grad_norm": 7.867548942565918, "learning_rate": 8.89172932330827e-06, "loss": 0.4854, "step": 7370 }, { "epoch": 11.1, "grad_norm": 8.354945182800293, "learning_rate": 8.890225563909776e-06, "loss": 0.4186, "step": 7380 }, { "epoch": 11.11, "grad_norm": 6.198273658752441, "learning_rate": 8.88872180451128e-06, "loss": 0.4486, "step": 7390 }, { "epoch": 11.13, "grad_norm": 7.401607990264893, "learning_rate": 8.887218045112783e-06, "loss": 0.4015, "step": 7400 }, { "epoch": 11.14, "grad_norm": 5.412950038909912, "learning_rate": 8.885714285714286e-06, "loss": 0.3654, "step": 7410 }, { "epoch": 11.16, "grad_norm": 3.7357654571533203, "learning_rate": 8.884210526315792e-06, "loss": 0.4165, "step": 7420 }, { "epoch": 11.17, "grad_norm": 7.468185901641846, "learning_rate": 8.882706766917293e-06, "loss": 0.3241, "step": 7430 }, { "epoch": 11.19, "grad_norm": 5.967494487762451, "learning_rate": 8.881203007518799e-06, "loss": 0.4364, "step": 7440 }, { "epoch": 11.2, "grad_norm": 8.94781494140625, "learning_rate": 8.879699248120302e-06, "loss": 0.4927, "step": 7450 }, { "epoch": 11.22, "grad_norm": 7.6195969581604, "learning_rate": 8.878195488721804e-06, "loss": 0.3722, "step": 7460 }, { "epoch": 11.23, "grad_norm": 9.522473335266113, "learning_rate": 8.876691729323309e-06, "loss": 0.3974, "step": 7470 }, { "epoch": 11.25, "grad_norm": 9.590860366821289, "learning_rate": 8.875187969924813e-06, "loss": 0.3842, "step": 7480 }, { "epoch": 11.26, "grad_norm": 6.479350566864014, "learning_rate": 8.873684210526316e-06, "loss": 0.4135, "step": 7490 }, { "epoch": 11.28, "grad_norm": 8.100231170654297, "learning_rate": 8.87218045112782e-06, "loss": 0.4262, "step": 7500 }, { "epoch": 11.29, "grad_norm": 9.401702880859375, "learning_rate": 8.870676691729325e-06, "loss": 0.3899, "step": 7510 }, { "epoch": 11.31, "grad_norm": 7.8885626792907715, "learning_rate": 8.869172932330827e-06, "loss": 0.4738, "step": 7520 }, { "epoch": 11.32, "grad_norm": 7.2377753257751465, "learning_rate": 8.867669172932332e-06, "loss": 0.3686, "step": 7530 }, { "epoch": 11.34, "grad_norm": 5.0235209465026855, "learning_rate": 8.866165413533835e-06, "loss": 0.3939, "step": 7540 }, { "epoch": 11.35, "grad_norm": 6.832250595092773, "learning_rate": 8.864661654135339e-06, "loss": 0.4485, "step": 7550 }, { "epoch": 11.37, "grad_norm": 8.186062812805176, "learning_rate": 8.863157894736842e-06, "loss": 0.4242, "step": 7560 }, { "epoch": 11.38, "grad_norm": 5.467780113220215, "learning_rate": 8.861654135338346e-06, "loss": 0.4599, "step": 7570 }, { "epoch": 11.4, "grad_norm": 6.155720233917236, "learning_rate": 8.86015037593985e-06, "loss": 0.5285, "step": 7580 }, { "epoch": 11.41, "grad_norm": 6.44677734375, "learning_rate": 8.858646616541355e-06, "loss": 0.4931, "step": 7590 }, { "epoch": 11.43, "grad_norm": 15.308818817138672, "learning_rate": 8.857142857142858e-06, "loss": 0.3899, "step": 7600 }, { "epoch": 11.44, "grad_norm": 6.691050052642822, "learning_rate": 8.855639097744362e-06, "loss": 0.4313, "step": 7610 }, { "epoch": 11.46, "grad_norm": 5.215397357940674, "learning_rate": 8.854135338345865e-06, "loss": 0.3528, "step": 7620 }, { "epoch": 11.47, "grad_norm": 7.355811595916748, "learning_rate": 8.852631578947369e-06, "loss": 0.4402, "step": 7630 }, { "epoch": 11.49, "grad_norm": 4.864825248718262, "learning_rate": 8.851127819548872e-06, "loss": 0.3485, "step": 7640 }, { "epoch": 11.5, "grad_norm": 7.4907755851745605, "learning_rate": 8.849624060150378e-06, "loss": 0.4522, "step": 7650 }, { "epoch": 11.52, "grad_norm": 6.480433464050293, "learning_rate": 8.84812030075188e-06, "loss": 0.4655, "step": 7660 }, { "epoch": 11.53, "grad_norm": 5.072092056274414, "learning_rate": 8.846616541353385e-06, "loss": 0.3735, "step": 7670 }, { "epoch": 11.55, "grad_norm": 10.207109451293945, "learning_rate": 8.845112781954888e-06, "loss": 0.3884, "step": 7680 }, { "epoch": 11.56, "grad_norm": 5.795559883117676, "learning_rate": 8.843609022556392e-06, "loss": 0.4115, "step": 7690 }, { "epoch": 11.58, "grad_norm": 7.781355381011963, "learning_rate": 8.842105263157895e-06, "loss": 0.4617, "step": 7700 }, { "epoch": 11.59, "grad_norm": 6.770030975341797, "learning_rate": 8.8406015037594e-06, "loss": 0.4218, "step": 7710 }, { "epoch": 11.61, "grad_norm": 7.052707672119141, "learning_rate": 8.839097744360902e-06, "loss": 0.3897, "step": 7720 }, { "epoch": 11.62, "grad_norm": 8.882899284362793, "learning_rate": 8.837593984962407e-06, "loss": 0.4546, "step": 7730 }, { "epoch": 11.64, "grad_norm": 7.858944892883301, "learning_rate": 8.836090225563911e-06, "loss": 0.4287, "step": 7740 }, { "epoch": 11.65, "grad_norm": 6.74614953994751, "learning_rate": 8.834586466165414e-06, "loss": 0.3326, "step": 7750 }, { "epoch": 11.67, "grad_norm": 8.970141410827637, "learning_rate": 8.833082706766918e-06, "loss": 0.4863, "step": 7760 }, { "epoch": 11.68, "grad_norm": 6.568352699279785, "learning_rate": 8.831578947368421e-06, "loss": 0.4248, "step": 7770 }, { "epoch": 11.7, "grad_norm": 6.05830717086792, "learning_rate": 8.830075187969925e-06, "loss": 0.4829, "step": 7780 }, { "epoch": 11.71, "grad_norm": 7.666469097137451, "learning_rate": 8.82857142857143e-06, "loss": 0.5319, "step": 7790 }, { "epoch": 11.73, "grad_norm": 5.955508708953857, "learning_rate": 8.827067669172934e-06, "loss": 0.4309, "step": 7800 }, { "epoch": 11.74, "grad_norm": 6.9883270263671875, "learning_rate": 8.825563909774437e-06, "loss": 0.4157, "step": 7810 }, { "epoch": 11.76, "grad_norm": 6.703571319580078, "learning_rate": 8.82406015037594e-06, "loss": 0.3219, "step": 7820 }, { "epoch": 11.77, "grad_norm": 7.131542682647705, "learning_rate": 8.822556390977444e-06, "loss": 0.4492, "step": 7830 }, { "epoch": 11.79, "grad_norm": 5.014946460723877, "learning_rate": 8.821052631578948e-06, "loss": 0.4189, "step": 7840 }, { "epoch": 11.8, "grad_norm": 4.254874229431152, "learning_rate": 8.819548872180453e-06, "loss": 0.484, "step": 7850 }, { "epoch": 11.82, "grad_norm": 4.319407939910889, "learning_rate": 8.818045112781955e-06, "loss": 0.3861, "step": 7860 }, { "epoch": 11.83, "grad_norm": 7.9686408042907715, "learning_rate": 8.81654135338346e-06, "loss": 0.4264, "step": 7870 }, { "epoch": 11.85, "grad_norm": 5.5855326652526855, "learning_rate": 8.815037593984964e-06, "loss": 0.4532, "step": 7880 }, { "epoch": 11.86, "grad_norm": 6.914451599121094, "learning_rate": 8.813533834586467e-06, "loss": 0.4355, "step": 7890 }, { "epoch": 11.88, "grad_norm": 7.542539596557617, "learning_rate": 8.81203007518797e-06, "loss": 0.3671, "step": 7900 }, { "epoch": 11.89, "grad_norm": 7.947263717651367, "learning_rate": 8.810526315789474e-06, "loss": 0.373, "step": 7910 }, { "epoch": 11.91, "grad_norm": 7.884321689605713, "learning_rate": 8.809022556390978e-06, "loss": 0.4827, "step": 7920 }, { "epoch": 11.92, "grad_norm": 5.361155986785889, "learning_rate": 8.807518796992483e-06, "loss": 0.4485, "step": 7930 }, { "epoch": 11.94, "grad_norm": 7.507490158081055, "learning_rate": 8.806015037593986e-06, "loss": 0.4446, "step": 7940 }, { "epoch": 11.95, "grad_norm": 7.053649425506592, "learning_rate": 8.804511278195488e-06, "loss": 0.4112, "step": 7950 }, { "epoch": 11.97, "grad_norm": 8.394134521484375, "learning_rate": 8.803007518796993e-06, "loss": 0.4221, "step": 7960 }, { "epoch": 11.98, "grad_norm": 9.852388381958008, "learning_rate": 8.801503759398497e-06, "loss": 0.4178, "step": 7970 }, { "epoch": 12.0, "grad_norm": 17.5406551361084, "learning_rate": 8.8e-06, "loss": 0.4869, "step": 7980 }, { "epoch": 12.0, "eval_accuracy": 0.9298, "eval_loss": 0.2563527822494507, "eval_runtime": 84.9156, "eval_samples_per_second": 117.764, "eval_steps_per_second": 0.471, "step": 7980 }, { "epoch": 12.02, "grad_norm": 4.651547908782959, "learning_rate": 8.798496240601504e-06, "loss": 0.4307, "step": 7990 }, { "epoch": 12.03, "grad_norm": 5.47507905960083, "learning_rate": 8.796992481203007e-06, "loss": 0.3957, "step": 8000 }, { "epoch": 12.05, "grad_norm": 7.309173583984375, "learning_rate": 8.795488721804511e-06, "loss": 0.3848, "step": 8010 }, { "epoch": 12.06, "grad_norm": 5.4073591232299805, "learning_rate": 8.793984962406016e-06, "loss": 0.347, "step": 8020 }, { "epoch": 12.08, "grad_norm": 9.495542526245117, "learning_rate": 8.79248120300752e-06, "loss": 0.3963, "step": 8030 }, { "epoch": 12.09, "grad_norm": 7.175304412841797, "learning_rate": 8.790977443609023e-06, "loss": 0.4028, "step": 8040 }, { "epoch": 12.11, "grad_norm": 5.7672624588012695, "learning_rate": 8.789473684210527e-06, "loss": 0.4336, "step": 8050 }, { "epoch": 12.12, "grad_norm": 5.373271942138672, "learning_rate": 8.78796992481203e-06, "loss": 0.4214, "step": 8060 }, { "epoch": 12.14, "grad_norm": 7.81503963470459, "learning_rate": 8.786466165413534e-06, "loss": 0.3362, "step": 8070 }, { "epoch": 12.15, "grad_norm": 5.352240085601807, "learning_rate": 8.784962406015039e-06, "loss": 0.438, "step": 8080 }, { "epoch": 12.17, "grad_norm": 4.825592994689941, "learning_rate": 8.783458646616541e-06, "loss": 0.3996, "step": 8090 }, { "epoch": 12.18, "grad_norm": 4.875209808349609, "learning_rate": 8.781954887218046e-06, "loss": 0.4056, "step": 8100 }, { "epoch": 12.2, "grad_norm": 6.405061721801758, "learning_rate": 8.78045112781955e-06, "loss": 0.404, "step": 8110 }, { "epoch": 12.21, "grad_norm": 5.762337684631348, "learning_rate": 8.778947368421053e-06, "loss": 0.3609, "step": 8120 }, { "epoch": 12.23, "grad_norm": 8.700191497802734, "learning_rate": 8.777443609022557e-06, "loss": 0.4316, "step": 8130 }, { "epoch": 12.24, "grad_norm": 5.509273052215576, "learning_rate": 8.775939849624062e-06, "loss": 0.3814, "step": 8140 }, { "epoch": 12.26, "grad_norm": 6.949098587036133, "learning_rate": 8.774436090225564e-06, "loss": 0.3588, "step": 8150 }, { "epoch": 12.27, "grad_norm": 5.564908981323242, "learning_rate": 8.772932330827069e-06, "loss": 0.353, "step": 8160 }, { "epoch": 12.29, "grad_norm": 6.935297012329102, "learning_rate": 8.771428571428572e-06, "loss": 0.4212, "step": 8170 }, { "epoch": 12.3, "grad_norm": 4.811358451843262, "learning_rate": 8.769924812030076e-06, "loss": 0.3865, "step": 8180 }, { "epoch": 12.32, "grad_norm": 6.4804368019104, "learning_rate": 8.76842105263158e-06, "loss": 0.3656, "step": 8190 }, { "epoch": 12.33, "grad_norm": 3.9236013889312744, "learning_rate": 8.766917293233083e-06, "loss": 0.4885, "step": 8200 }, { "epoch": 12.35, "grad_norm": 7.50891637802124, "learning_rate": 8.765413533834586e-06, "loss": 0.3962, "step": 8210 }, { "epoch": 12.36, "grad_norm": 4.313982963562012, "learning_rate": 8.763909774436092e-06, "loss": 0.4023, "step": 8220 }, { "epoch": 12.38, "grad_norm": 4.385167121887207, "learning_rate": 8.762406015037595e-06, "loss": 0.4841, "step": 8230 }, { "epoch": 12.39, "grad_norm": 5.977277755737305, "learning_rate": 8.760902255639099e-06, "loss": 0.399, "step": 8240 }, { "epoch": 12.41, "grad_norm": 8.858118057250977, "learning_rate": 8.759398496240602e-06, "loss": 0.451, "step": 8250 }, { "epoch": 12.42, "grad_norm": 6.294662952423096, "learning_rate": 8.757894736842106e-06, "loss": 0.416, "step": 8260 }, { "epoch": 12.44, "grad_norm": 4.536668300628662, "learning_rate": 8.75639097744361e-06, "loss": 0.407, "step": 8270 }, { "epoch": 12.45, "grad_norm": 5.644812107086182, "learning_rate": 8.754887218045114e-06, "loss": 0.3685, "step": 8280 }, { "epoch": 12.47, "grad_norm": 5.488842010498047, "learning_rate": 8.753383458646616e-06, "loss": 0.4136, "step": 8290 }, { "epoch": 12.48, "grad_norm": 4.548142910003662, "learning_rate": 8.751879699248122e-06, "loss": 0.4502, "step": 8300 }, { "epoch": 12.5, "grad_norm": 3.41457200050354, "learning_rate": 8.750375939849625e-06, "loss": 0.3598, "step": 8310 }, { "epoch": 12.51, "grad_norm": 6.259812831878662, "learning_rate": 8.748872180451129e-06, "loss": 0.3843, "step": 8320 }, { "epoch": 12.53, "grad_norm": 5.301551342010498, "learning_rate": 8.747368421052632e-06, "loss": 0.4038, "step": 8330 }, { "epoch": 12.54, "grad_norm": 14.684255599975586, "learning_rate": 8.745864661654137e-06, "loss": 0.4115, "step": 8340 }, { "epoch": 12.56, "grad_norm": 6.711531162261963, "learning_rate": 8.744360902255639e-06, "loss": 0.4112, "step": 8350 }, { "epoch": 12.57, "grad_norm": 8.990388870239258, "learning_rate": 8.742857142857144e-06, "loss": 0.3547, "step": 8360 }, { "epoch": 12.59, "grad_norm": 4.513948440551758, "learning_rate": 8.741353383458648e-06, "loss": 0.3776, "step": 8370 }, { "epoch": 12.6, "grad_norm": 6.088433742523193, "learning_rate": 8.739849624060151e-06, "loss": 0.4116, "step": 8380 }, { "epoch": 12.62, "grad_norm": 7.882970809936523, "learning_rate": 8.738345864661655e-06, "loss": 0.3712, "step": 8390 }, { "epoch": 12.63, "grad_norm": 6.829627990722656, "learning_rate": 8.736842105263158e-06, "loss": 0.3842, "step": 8400 }, { "epoch": 12.65, "grad_norm": 6.185722351074219, "learning_rate": 8.735338345864662e-06, "loss": 0.424, "step": 8410 }, { "epoch": 12.66, "grad_norm": 4.945958137512207, "learning_rate": 8.733834586466167e-06, "loss": 0.377, "step": 8420 }, { "epoch": 12.68, "grad_norm": 6.356648921966553, "learning_rate": 8.73233082706767e-06, "loss": 0.4256, "step": 8430 }, { "epoch": 12.69, "grad_norm": 6.276622295379639, "learning_rate": 8.730827067669172e-06, "loss": 0.3733, "step": 8440 }, { "epoch": 12.71, "grad_norm": 7.50572395324707, "learning_rate": 8.729323308270678e-06, "loss": 0.4407, "step": 8450 }, { "epoch": 12.72, "grad_norm": 7.089003086090088, "learning_rate": 8.727819548872181e-06, "loss": 0.3948, "step": 8460 }, { "epoch": 12.74, "grad_norm": 6.90725564956665, "learning_rate": 8.726315789473685e-06, "loss": 0.4511, "step": 8470 }, { "epoch": 12.75, "grad_norm": 4.369374752044678, "learning_rate": 8.724812030075188e-06, "loss": 0.3559, "step": 8480 }, { "epoch": 12.77, "grad_norm": 2.895493507385254, "learning_rate": 8.723308270676692e-06, "loss": 0.349, "step": 8490 }, { "epoch": 12.78, "grad_norm": 8.638984680175781, "learning_rate": 8.721804511278195e-06, "loss": 0.3406, "step": 8500 }, { "epoch": 12.8, "grad_norm": 7.664207458496094, "learning_rate": 8.7203007518797e-06, "loss": 0.3619, "step": 8510 }, { "epoch": 12.81, "grad_norm": 4.544347286224365, "learning_rate": 8.718796992481204e-06, "loss": 0.3109, "step": 8520 }, { "epoch": 12.83, "grad_norm": 6.640614032745361, "learning_rate": 8.717293233082708e-06, "loss": 0.4116, "step": 8530 }, { "epoch": 12.84, "grad_norm": 7.840051174163818, "learning_rate": 8.715789473684211e-06, "loss": 0.4027, "step": 8540 }, { "epoch": 12.86, "grad_norm": 10.355204582214355, "learning_rate": 8.714285714285715e-06, "loss": 0.4013, "step": 8550 }, { "epoch": 12.87, "grad_norm": 7.472030162811279, "learning_rate": 8.712781954887218e-06, "loss": 0.4119, "step": 8560 }, { "epoch": 12.89, "grad_norm": 7.9360246658325195, "learning_rate": 8.711278195488723e-06, "loss": 0.3472, "step": 8570 }, { "epoch": 12.9, "grad_norm": 5.889431953430176, "learning_rate": 8.709774436090225e-06, "loss": 0.4009, "step": 8580 }, { "epoch": 12.92, "grad_norm": 5.548401355743408, "learning_rate": 8.70827067669173e-06, "loss": 0.4261, "step": 8590 }, { "epoch": 12.93, "grad_norm": 5.590747833251953, "learning_rate": 8.706766917293234e-06, "loss": 0.4437, "step": 8600 }, { "epoch": 12.95, "grad_norm": 6.401696681976318, "learning_rate": 8.705263157894737e-06, "loss": 0.3746, "step": 8610 }, { "epoch": 12.96, "grad_norm": 9.315383911132812, "learning_rate": 8.703759398496241e-06, "loss": 0.3803, "step": 8620 }, { "epoch": 12.98, "grad_norm": 3.9589388370513916, "learning_rate": 8.702255639097746e-06, "loss": 0.4448, "step": 8630 }, { "epoch": 12.99, "grad_norm": 4.445014953613281, "learning_rate": 8.700751879699248e-06, "loss": 0.4289, "step": 8640 }, { "epoch": 13.0, "eval_accuracy": 0.9288, "eval_loss": 0.2712935507297516, "eval_runtime": 84.8607, "eval_samples_per_second": 117.84, "eval_steps_per_second": 0.471, "step": 8645 }, { "epoch": 13.01, "grad_norm": 5.444362163543701, "learning_rate": 8.699248120300753e-06, "loss": 0.3994, "step": 8650 }, { "epoch": 13.02, "grad_norm": 5.8952178955078125, "learning_rate": 8.697744360902257e-06, "loss": 0.3819, "step": 8660 }, { "epoch": 13.04, "grad_norm": 5.363025188446045, "learning_rate": 8.69624060150376e-06, "loss": 0.4251, "step": 8670 }, { "epoch": 13.05, "grad_norm": 6.1266961097717285, "learning_rate": 8.694736842105264e-06, "loss": 0.4236, "step": 8680 }, { "epoch": 13.07, "grad_norm": 6.096094131469727, "learning_rate": 8.693233082706767e-06, "loss": 0.4411, "step": 8690 }, { "epoch": 13.08, "grad_norm": 6.0483293533325195, "learning_rate": 8.69172932330827e-06, "loss": 0.3538, "step": 8700 }, { "epoch": 13.1, "grad_norm": 8.619955062866211, "learning_rate": 8.690225563909776e-06, "loss": 0.4698, "step": 8710 }, { "epoch": 13.11, "grad_norm": 5.028072834014893, "learning_rate": 8.68872180451128e-06, "loss": 0.3883, "step": 8720 }, { "epoch": 13.13, "grad_norm": 7.43666934967041, "learning_rate": 8.687218045112783e-06, "loss": 0.3552, "step": 8730 }, { "epoch": 13.14, "grad_norm": 9.520151138305664, "learning_rate": 8.685714285714287e-06, "loss": 0.4079, "step": 8740 }, { "epoch": 13.16, "grad_norm": 7.852067947387695, "learning_rate": 8.68421052631579e-06, "loss": 0.3607, "step": 8750 }, { "epoch": 13.17, "grad_norm": 5.92877721786499, "learning_rate": 8.682706766917294e-06, "loss": 0.3739, "step": 8760 }, { "epoch": 13.19, "grad_norm": 4.25166130065918, "learning_rate": 8.681203007518799e-06, "loss": 0.4621, "step": 8770 }, { "epoch": 13.2, "grad_norm": 7.073912143707275, "learning_rate": 8.6796992481203e-06, "loss": 0.4465, "step": 8780 }, { "epoch": 13.22, "grad_norm": 7.39524507522583, "learning_rate": 8.678195488721806e-06, "loss": 0.4303, "step": 8790 }, { "epoch": 13.23, "grad_norm": 6.938388824462891, "learning_rate": 8.67669172932331e-06, "loss": 0.3535, "step": 8800 }, { "epoch": 13.25, "grad_norm": 5.0067524909973145, "learning_rate": 8.675187969924813e-06, "loss": 0.4399, "step": 8810 }, { "epoch": 13.26, "grad_norm": 6.340808391571045, "learning_rate": 8.673684210526316e-06, "loss": 0.4199, "step": 8820 }, { "epoch": 13.28, "grad_norm": 4.246801853179932, "learning_rate": 8.67218045112782e-06, "loss": 0.4039, "step": 8830 }, { "epoch": 13.29, "grad_norm": 4.85552453994751, "learning_rate": 8.670676691729323e-06, "loss": 0.3753, "step": 8840 }, { "epoch": 13.31, "grad_norm": 6.020550727844238, "learning_rate": 8.669172932330829e-06, "loss": 0.4039, "step": 8850 }, { "epoch": 13.32, "grad_norm": 3.4875411987304688, "learning_rate": 8.667669172932332e-06, "loss": 0.3829, "step": 8860 }, { "epoch": 13.34, "grad_norm": 6.239095211029053, "learning_rate": 8.666165413533836e-06, "loss": 0.3511, "step": 8870 }, { "epoch": 13.35, "grad_norm": 4.244966983795166, "learning_rate": 8.66466165413534e-06, "loss": 0.4268, "step": 8880 }, { "epoch": 13.37, "grad_norm": 12.684317588806152, "learning_rate": 8.663157894736843e-06, "loss": 0.3471, "step": 8890 }, { "epoch": 13.38, "grad_norm": 8.664961814880371, "learning_rate": 8.661654135338346e-06, "loss": 0.3822, "step": 8900 }, { "epoch": 13.4, "grad_norm": 5.7766804695129395, "learning_rate": 8.660150375939851e-06, "loss": 0.4199, "step": 8910 }, { "epoch": 13.41, "grad_norm": 4.019351959228516, "learning_rate": 8.658646616541353e-06, "loss": 0.4137, "step": 8920 }, { "epoch": 13.43, "grad_norm": 6.156152248382568, "learning_rate": 8.657142857142858e-06, "loss": 0.4606, "step": 8930 }, { "epoch": 13.44, "grad_norm": 5.74890661239624, "learning_rate": 8.655639097744362e-06, "loss": 0.3221, "step": 8940 }, { "epoch": 13.46, "grad_norm": 6.321985721588135, "learning_rate": 8.654135338345866e-06, "loss": 0.415, "step": 8950 }, { "epoch": 13.47, "grad_norm": 8.88508129119873, "learning_rate": 8.652631578947369e-06, "loss": 0.3714, "step": 8960 }, { "epoch": 13.49, "grad_norm": 6.999327659606934, "learning_rate": 8.651127819548873e-06, "loss": 0.3576, "step": 8970 }, { "epoch": 13.5, "grad_norm": 7.313613414764404, "learning_rate": 8.649624060150376e-06, "loss": 0.4096, "step": 8980 }, { "epoch": 13.52, "grad_norm": 6.363276958465576, "learning_rate": 8.64812030075188e-06, "loss": 0.3356, "step": 8990 }, { "epoch": 13.53, "grad_norm": 7.81085729598999, "learning_rate": 8.646616541353385e-06, "loss": 0.4216, "step": 9000 }, { "epoch": 13.55, "grad_norm": 8.093158721923828, "learning_rate": 8.645112781954887e-06, "loss": 0.5105, "step": 9010 }, { "epoch": 13.56, "grad_norm": 3.801630735397339, "learning_rate": 8.643609022556392e-06, "loss": 0.439, "step": 9020 }, { "epoch": 13.58, "grad_norm": 5.564939975738525, "learning_rate": 8.642105263157895e-06, "loss": 0.3854, "step": 9030 }, { "epoch": 13.59, "grad_norm": 9.847439765930176, "learning_rate": 8.640601503759399e-06, "loss": 0.4034, "step": 9040 }, { "epoch": 13.61, "grad_norm": 9.21834659576416, "learning_rate": 8.639097744360902e-06, "loss": 0.4448, "step": 9050 }, { "epoch": 13.62, "grad_norm": 4.98524808883667, "learning_rate": 8.637593984962408e-06, "loss": 0.3646, "step": 9060 }, { "epoch": 13.64, "grad_norm": 6.707414150238037, "learning_rate": 8.63609022556391e-06, "loss": 0.3618, "step": 9070 }, { "epoch": 13.65, "grad_norm": 5.5840840339660645, "learning_rate": 8.634586466165415e-06, "loss": 0.3628, "step": 9080 }, { "epoch": 13.67, "grad_norm": 4.939608097076416, "learning_rate": 8.633082706766918e-06, "loss": 0.3785, "step": 9090 }, { "epoch": 13.68, "grad_norm": 7.449197769165039, "learning_rate": 8.631578947368422e-06, "loss": 0.4354, "step": 9100 }, { "epoch": 13.7, "grad_norm": 9.470358848571777, "learning_rate": 8.630075187969925e-06, "loss": 0.4075, "step": 9110 }, { "epoch": 13.71, "grad_norm": 7.6183085441589355, "learning_rate": 8.628571428571429e-06, "loss": 0.4067, "step": 9120 }, { "epoch": 13.73, "grad_norm": 3.0916943550109863, "learning_rate": 8.627067669172932e-06, "loss": 0.365, "step": 9130 }, { "epoch": 13.74, "grad_norm": 4.251070499420166, "learning_rate": 8.625563909774437e-06, "loss": 0.4255, "step": 9140 }, { "epoch": 13.76, "grad_norm": 6.8059282302856445, "learning_rate": 8.624060150375941e-06, "loss": 0.32, "step": 9150 }, { "epoch": 13.77, "grad_norm": 7.302189826965332, "learning_rate": 8.622556390977444e-06, "loss": 0.4131, "step": 9160 }, { "epoch": 13.79, "grad_norm": 6.402463436126709, "learning_rate": 8.621052631578948e-06, "loss": 0.3848, "step": 9170 }, { "epoch": 13.8, "grad_norm": 4.343325138092041, "learning_rate": 8.619548872180452e-06, "loss": 0.4014, "step": 9180 }, { "epoch": 13.82, "grad_norm": 9.013459205627441, "learning_rate": 8.618045112781955e-06, "loss": 0.3739, "step": 9190 }, { "epoch": 13.83, "grad_norm": 7.037381172180176, "learning_rate": 8.61654135338346e-06, "loss": 0.4189, "step": 9200 }, { "epoch": 13.85, "grad_norm": 4.7024760246276855, "learning_rate": 8.615037593984962e-06, "loss": 0.38, "step": 9210 }, { "epoch": 13.86, "grad_norm": 4.808414936065674, "learning_rate": 8.613533834586467e-06, "loss": 0.4414, "step": 9220 }, { "epoch": 13.88, "grad_norm": 8.237750053405762, "learning_rate": 8.61203007518797e-06, "loss": 0.4215, "step": 9230 }, { "epoch": 13.89, "grad_norm": 7.862570285797119, "learning_rate": 8.610526315789474e-06, "loss": 0.4727, "step": 9240 }, { "epoch": 13.91, "grad_norm": 7.045783519744873, "learning_rate": 8.609022556390978e-06, "loss": 0.4109, "step": 9250 }, { "epoch": 13.92, "grad_norm": 5.3544135093688965, "learning_rate": 8.607518796992483e-06, "loss": 0.3824, "step": 9260 }, { "epoch": 13.94, "grad_norm": 14.21022891998291, "learning_rate": 8.606015037593985e-06, "loss": 0.3659, "step": 9270 }, { "epoch": 13.95, "grad_norm": 7.408153533935547, "learning_rate": 8.60451127819549e-06, "loss": 0.3508, "step": 9280 }, { "epoch": 13.97, "grad_norm": 3.206442356109619, "learning_rate": 8.603007518796994e-06, "loss": 0.4451, "step": 9290 }, { "epoch": 13.98, "grad_norm": 4.974185466766357, "learning_rate": 8.601503759398497e-06, "loss": 0.3888, "step": 9300 }, { "epoch": 14.0, "grad_norm": 15.39065170288086, "learning_rate": 8.6e-06, "loss": 0.4003, "step": 9310 }, { "epoch": 14.0, "eval_accuracy": 0.932, "eval_loss": 0.2616865336894989, "eval_runtime": 84.8808, "eval_samples_per_second": 117.812, "eval_steps_per_second": 0.471, "step": 9310 }, { "epoch": 14.02, "grad_norm": 5.1268534660339355, "learning_rate": 8.598496240601504e-06, "loss": 0.4195, "step": 9320 }, { "epoch": 14.03, "grad_norm": 6.874084949493408, "learning_rate": 8.596992481203008e-06, "loss": 0.3901, "step": 9330 }, { "epoch": 14.05, "grad_norm": 11.405204772949219, "learning_rate": 8.595488721804513e-06, "loss": 0.3234, "step": 9340 }, { "epoch": 14.06, "grad_norm": 4.844882965087891, "learning_rate": 8.593984962406016e-06, "loss": 0.3437, "step": 9350 }, { "epoch": 14.08, "grad_norm": 7.187948226928711, "learning_rate": 8.59248120300752e-06, "loss": 0.3895, "step": 9360 }, { "epoch": 14.09, "grad_norm": 3.7594106197357178, "learning_rate": 8.590977443609023e-06, "loss": 0.3329, "step": 9370 }, { "epoch": 14.11, "grad_norm": 4.245199203491211, "learning_rate": 8.589473684210527e-06, "loss": 0.3644, "step": 9380 }, { "epoch": 14.12, "grad_norm": 6.302145004272461, "learning_rate": 8.58796992481203e-06, "loss": 0.4615, "step": 9390 }, { "epoch": 14.14, "grad_norm": 6.26497220993042, "learning_rate": 8.586466165413536e-06, "loss": 0.3983, "step": 9400 }, { "epoch": 14.15, "grad_norm": 7.315799236297607, "learning_rate": 8.584962406015038e-06, "loss": 0.3474, "step": 9410 }, { "epoch": 14.17, "grad_norm": 8.584407806396484, "learning_rate": 8.583458646616543e-06, "loss": 0.3858, "step": 9420 }, { "epoch": 14.18, "grad_norm": 6.192986488342285, "learning_rate": 8.581954887218046e-06, "loss": 0.4653, "step": 9430 }, { "epoch": 14.2, "grad_norm": 6.261072635650635, "learning_rate": 8.58045112781955e-06, "loss": 0.3686, "step": 9440 }, { "epoch": 14.21, "grad_norm": 6.7162017822265625, "learning_rate": 8.578947368421053e-06, "loss": 0.4143, "step": 9450 }, { "epoch": 14.23, "grad_norm": 5.550053119659424, "learning_rate": 8.577443609022557e-06, "loss": 0.4262, "step": 9460 }, { "epoch": 14.24, "grad_norm": 6.601341247558594, "learning_rate": 8.57593984962406e-06, "loss": 0.3615, "step": 9470 }, { "epoch": 14.26, "grad_norm": 6.859097957611084, "learning_rate": 8.574436090225564e-06, "loss": 0.3584, "step": 9480 }, { "epoch": 14.27, "grad_norm": 3.824615478515625, "learning_rate": 8.572932330827069e-06, "loss": 0.3941, "step": 9490 }, { "epoch": 14.29, "grad_norm": 6.923836708068848, "learning_rate": 8.571428571428571e-06, "loss": 0.4016, "step": 9500 }, { "epoch": 14.3, "grad_norm": 6.395806789398193, "learning_rate": 8.569924812030076e-06, "loss": 0.368, "step": 9510 }, { "epoch": 14.32, "grad_norm": 6.522418022155762, "learning_rate": 8.56842105263158e-06, "loss": 0.377, "step": 9520 }, { "epoch": 14.33, "grad_norm": 7.502889633178711, "learning_rate": 8.566917293233083e-06, "loss": 0.4179, "step": 9530 }, { "epoch": 14.35, "grad_norm": 6.025669574737549, "learning_rate": 8.565413533834587e-06, "loss": 0.3395, "step": 9540 }, { "epoch": 14.36, "grad_norm": 7.751435279846191, "learning_rate": 8.563909774436092e-06, "loss": 0.3531, "step": 9550 }, { "epoch": 14.38, "grad_norm": 6.964672088623047, "learning_rate": 8.562406015037594e-06, "loss": 0.3905, "step": 9560 }, { "epoch": 14.39, "grad_norm": 2.502666473388672, "learning_rate": 8.560902255639099e-06, "loss": 0.3172, "step": 9570 }, { "epoch": 14.41, "grad_norm": 7.133659839630127, "learning_rate": 8.559398496240602e-06, "loss": 0.442, "step": 9580 }, { "epoch": 14.42, "grad_norm": 4.32753324508667, "learning_rate": 8.557894736842106e-06, "loss": 0.3458, "step": 9590 }, { "epoch": 14.44, "grad_norm": 6.268803119659424, "learning_rate": 8.55639097744361e-06, "loss": 0.3431, "step": 9600 }, { "epoch": 14.45, "grad_norm": 7.018800735473633, "learning_rate": 8.554887218045113e-06, "loss": 0.4369, "step": 9610 }, { "epoch": 14.47, "grad_norm": 7.435917377471924, "learning_rate": 8.553383458646617e-06, "loss": 0.3861, "step": 9620 }, { "epoch": 14.48, "grad_norm": 7.8388752937316895, "learning_rate": 8.551879699248122e-06, "loss": 0.4239, "step": 9630 }, { "epoch": 14.5, "grad_norm": 7.880455493927002, "learning_rate": 8.550375939849625e-06, "loss": 0.3829, "step": 9640 }, { "epoch": 14.51, "grad_norm": 5.568830490112305, "learning_rate": 8.548872180451129e-06, "loss": 0.4011, "step": 9650 }, { "epoch": 14.53, "grad_norm": 5.6487274169921875, "learning_rate": 8.547368421052632e-06, "loss": 0.4091, "step": 9660 }, { "epoch": 14.54, "grad_norm": 4.51718807220459, "learning_rate": 8.545864661654136e-06, "loss": 0.4087, "step": 9670 }, { "epoch": 14.56, "grad_norm": 7.874798774719238, "learning_rate": 8.54436090225564e-06, "loss": 0.3494, "step": 9680 }, { "epoch": 14.57, "grad_norm": 4.497681617736816, "learning_rate": 8.542857142857145e-06, "loss": 0.3783, "step": 9690 }, { "epoch": 14.59, "grad_norm": 5.411101818084717, "learning_rate": 8.541353383458646e-06, "loss": 0.3395, "step": 9700 }, { "epoch": 14.6, "grad_norm": 5.714541435241699, "learning_rate": 8.539849624060152e-06, "loss": 0.4507, "step": 9710 }, { "epoch": 14.62, "grad_norm": 7.042336940765381, "learning_rate": 8.538345864661655e-06, "loss": 0.3363, "step": 9720 }, { "epoch": 14.63, "grad_norm": 6.70949649810791, "learning_rate": 8.536842105263159e-06, "loss": 0.3559, "step": 9730 }, { "epoch": 14.65, "grad_norm": 5.84644889831543, "learning_rate": 8.535338345864662e-06, "loss": 0.3936, "step": 9740 }, { "epoch": 14.66, "grad_norm": 5.070087432861328, "learning_rate": 8.533834586466166e-06, "loss": 0.4014, "step": 9750 }, { "epoch": 14.68, "grad_norm": 5.353463172912598, "learning_rate": 8.53233082706767e-06, "loss": 0.4091, "step": 9760 }, { "epoch": 14.69, "grad_norm": 9.118497848510742, "learning_rate": 8.530827067669174e-06, "loss": 0.3922, "step": 9770 }, { "epoch": 14.71, "grad_norm": 7.667191982269287, "learning_rate": 8.529323308270678e-06, "loss": 0.4087, "step": 9780 }, { "epoch": 14.72, "grad_norm": 7.210267066955566, "learning_rate": 8.527819548872181e-06, "loss": 0.3458, "step": 9790 }, { "epoch": 14.74, "grad_norm": 5.242373466491699, "learning_rate": 8.526315789473685e-06, "loss": 0.3293, "step": 9800 }, { "epoch": 14.75, "grad_norm": 7.6933393478393555, "learning_rate": 8.524812030075188e-06, "loss": 0.4406, "step": 9810 }, { "epoch": 14.77, "grad_norm": 5.179628372192383, "learning_rate": 8.523308270676692e-06, "loss": 0.3827, "step": 9820 }, { "epoch": 14.78, "grad_norm": 9.525907516479492, "learning_rate": 8.521804511278197e-06, "loss": 0.4095, "step": 9830 }, { "epoch": 14.8, "grad_norm": 6.132147789001465, "learning_rate": 8.520300751879699e-06, "loss": 0.3778, "step": 9840 }, { "epoch": 14.81, "grad_norm": 8.04976749420166, "learning_rate": 8.518796992481204e-06, "loss": 0.3996, "step": 9850 }, { "epoch": 14.83, "grad_norm": 9.131913185119629, "learning_rate": 8.517293233082708e-06, "loss": 0.4103, "step": 9860 }, { "epoch": 14.84, "grad_norm": 5.724211692810059, "learning_rate": 8.515789473684211e-06, "loss": 0.4166, "step": 9870 }, { "epoch": 14.86, "grad_norm": 4.537842750549316, "learning_rate": 8.514285714285715e-06, "loss": 0.4357, "step": 9880 }, { "epoch": 14.87, "grad_norm": 9.75554370880127, "learning_rate": 8.51278195488722e-06, "loss": 0.351, "step": 9890 }, { "epoch": 14.89, "grad_norm": 5.427340030670166, "learning_rate": 8.511278195488722e-06, "loss": 0.3488, "step": 9900 }, { "epoch": 14.9, "grad_norm": 4.465277671813965, "learning_rate": 8.509774436090227e-06, "loss": 0.32, "step": 9910 }, { "epoch": 14.92, "grad_norm": 4.331689834594727, "learning_rate": 8.50827067669173e-06, "loss": 0.4442, "step": 9920 }, { "epoch": 14.93, "grad_norm": 5.798705577850342, "learning_rate": 8.506766917293232e-06, "loss": 0.4003, "step": 9930 }, { "epoch": 14.95, "grad_norm": 9.353456497192383, "learning_rate": 8.505263157894738e-06, "loss": 0.3969, "step": 9940 }, { "epoch": 14.96, "grad_norm": 7.088143825531006, "learning_rate": 8.503759398496241e-06, "loss": 0.3749, "step": 9950 }, { "epoch": 14.98, "grad_norm": 6.825297832489014, "learning_rate": 8.502255639097745e-06, "loss": 0.3532, "step": 9960 }, { "epoch": 14.99, "grad_norm": 3.3977503776550293, "learning_rate": 8.500751879699248e-06, "loss": 0.3227, "step": 9970 }, { "epoch": 15.0, "eval_accuracy": 0.9335, "eval_loss": 0.2566547989845276, "eval_runtime": 84.4818, "eval_samples_per_second": 118.369, "eval_steps_per_second": 0.473, "step": 9975 }, { "epoch": 15.01, "grad_norm": 3.752779483795166, "learning_rate": 8.499248120300753e-06, "loss": 0.2741, "step": 9980 }, { "epoch": 15.02, "grad_norm": 5.626667499542236, "learning_rate": 8.497744360902255e-06, "loss": 0.3584, "step": 9990 }, { "epoch": 15.04, "grad_norm": 5.8750834465026855, "learning_rate": 8.49624060150376e-06, "loss": 0.3563, "step": 10000 }, { "epoch": 15.05, "grad_norm": 6.653073787689209, "learning_rate": 8.494736842105264e-06, "loss": 0.3808, "step": 10010 }, { "epoch": 15.07, "grad_norm": 6.017663955688477, "learning_rate": 8.493233082706767e-06, "loss": 0.3165, "step": 10020 }, { "epoch": 15.08, "grad_norm": 4.427550792694092, "learning_rate": 8.491729323308271e-06, "loss": 0.4417, "step": 10030 }, { "epoch": 15.1, "grad_norm": 7.968047142028809, "learning_rate": 8.490225563909775e-06, "loss": 0.3807, "step": 10040 }, { "epoch": 15.11, "grad_norm": 6.646710395812988, "learning_rate": 8.488721804511278e-06, "loss": 0.3014, "step": 10050 }, { "epoch": 15.13, "grad_norm": 5.180790424346924, "learning_rate": 8.487218045112783e-06, "loss": 0.3737, "step": 10060 }, { "epoch": 15.14, "grad_norm": 8.142125129699707, "learning_rate": 8.485714285714287e-06, "loss": 0.3953, "step": 10070 }, { "epoch": 15.16, "grad_norm": 5.258510589599609, "learning_rate": 8.48421052631579e-06, "loss": 0.2742, "step": 10080 }, { "epoch": 15.17, "grad_norm": 7.299388408660889, "learning_rate": 8.482706766917294e-06, "loss": 0.388, "step": 10090 }, { "epoch": 15.19, "grad_norm": 9.05027961730957, "learning_rate": 8.481203007518797e-06, "loss": 0.3364, "step": 10100 }, { "epoch": 15.2, "grad_norm": 8.713417053222656, "learning_rate": 8.4796992481203e-06, "loss": 0.2847, "step": 10110 }, { "epoch": 15.22, "grad_norm": 4.598002910614014, "learning_rate": 8.478195488721806e-06, "loss": 0.3536, "step": 10120 }, { "epoch": 15.23, "grad_norm": 8.883098602294922, "learning_rate": 8.476691729323308e-06, "loss": 0.3917, "step": 10130 }, { "epoch": 15.25, "grad_norm": 9.343592643737793, "learning_rate": 8.475187969924813e-06, "loss": 0.3912, "step": 10140 }, { "epoch": 15.26, "grad_norm": 11.456267356872559, "learning_rate": 8.473684210526317e-06, "loss": 0.3171, "step": 10150 }, { "epoch": 15.28, "grad_norm": 7.874906539916992, "learning_rate": 8.47218045112782e-06, "loss": 0.4443, "step": 10160 }, { "epoch": 15.29, "grad_norm": 7.280247211456299, "learning_rate": 8.470676691729324e-06, "loss": 0.4121, "step": 10170 }, { "epoch": 15.31, "grad_norm": 7.839987754821777, "learning_rate": 8.469172932330829e-06, "loss": 0.4284, "step": 10180 }, { "epoch": 15.32, "grad_norm": 5.39335298538208, "learning_rate": 8.46766917293233e-06, "loss": 0.4311, "step": 10190 }, { "epoch": 15.34, "grad_norm": 8.08191204071045, "learning_rate": 8.466165413533836e-06, "loss": 0.3789, "step": 10200 }, { "epoch": 15.35, "grad_norm": 3.493443250656128, "learning_rate": 8.46466165413534e-06, "loss": 0.3526, "step": 10210 }, { "epoch": 15.37, "grad_norm": 7.41270637512207, "learning_rate": 8.463157894736843e-06, "loss": 0.4283, "step": 10220 }, { "epoch": 15.38, "grad_norm": 6.4891486167907715, "learning_rate": 8.461654135338346e-06, "loss": 0.298, "step": 10230 }, { "epoch": 15.4, "grad_norm": 6.028573989868164, "learning_rate": 8.46015037593985e-06, "loss": 0.3996, "step": 10240 }, { "epoch": 15.41, "grad_norm": 3.365438938140869, "learning_rate": 8.458646616541353e-06, "loss": 0.334, "step": 10250 }, { "epoch": 15.43, "grad_norm": 5.2097015380859375, "learning_rate": 8.457142857142859e-06, "loss": 0.3555, "step": 10260 }, { "epoch": 15.44, "grad_norm": 4.656721591949463, "learning_rate": 8.455639097744362e-06, "loss": 0.398, "step": 10270 }, { "epoch": 15.46, "grad_norm": 11.353671073913574, "learning_rate": 8.454135338345866e-06, "loss": 0.3341, "step": 10280 }, { "epoch": 15.47, "grad_norm": 6.971073627471924, "learning_rate": 8.45263157894737e-06, "loss": 0.3569, "step": 10290 }, { "epoch": 15.49, "grad_norm": 5.809013843536377, "learning_rate": 8.451127819548873e-06, "loss": 0.3642, "step": 10300 }, { "epoch": 15.5, "grad_norm": 7.3322834968566895, "learning_rate": 8.449624060150376e-06, "loss": 0.3814, "step": 10310 }, { "epoch": 15.52, "grad_norm": 9.322681427001953, "learning_rate": 8.448120300751882e-06, "loss": 0.3455, "step": 10320 }, { "epoch": 15.53, "grad_norm": 7.197205066680908, "learning_rate": 8.446616541353383e-06, "loss": 0.3652, "step": 10330 }, { "epoch": 15.55, "grad_norm": 7.1502766609191895, "learning_rate": 8.445112781954889e-06, "loss": 0.3882, "step": 10340 }, { "epoch": 15.56, "grad_norm": 4.826005935668945, "learning_rate": 8.443609022556392e-06, "loss": 0.3878, "step": 10350 }, { "epoch": 15.58, "grad_norm": 8.432343482971191, "learning_rate": 8.442105263157896e-06, "loss": 0.3778, "step": 10360 }, { "epoch": 15.59, "grad_norm": 6.166329383850098, "learning_rate": 8.440601503759399e-06, "loss": 0.3675, "step": 10370 }, { "epoch": 15.61, "grad_norm": 9.84304428100586, "learning_rate": 8.439097744360903e-06, "loss": 0.385, "step": 10380 }, { "epoch": 15.62, "grad_norm": 4.937039375305176, "learning_rate": 8.437593984962406e-06, "loss": 0.3558, "step": 10390 }, { "epoch": 15.64, "grad_norm": 5.817636966705322, "learning_rate": 8.436090225563911e-06, "loss": 0.3605, "step": 10400 }, { "epoch": 15.65, "grad_norm": 6.982740879058838, "learning_rate": 8.434586466165415e-06, "loss": 0.2966, "step": 10410 }, { "epoch": 15.67, "grad_norm": 5.2945098876953125, "learning_rate": 8.433082706766918e-06, "loss": 0.3279, "step": 10420 }, { "epoch": 15.68, "grad_norm": 7.526950359344482, "learning_rate": 8.431578947368422e-06, "loss": 0.414, "step": 10430 }, { "epoch": 15.7, "grad_norm": 8.325518608093262, "learning_rate": 8.430075187969925e-06, "loss": 0.314, "step": 10440 }, { "epoch": 15.71, "grad_norm": 2.876897096633911, "learning_rate": 8.428571428571429e-06, "loss": 0.354, "step": 10450 }, { "epoch": 15.73, "grad_norm": 6.776325702667236, "learning_rate": 8.427067669172932e-06, "loss": 0.3462, "step": 10460 }, { "epoch": 15.74, "grad_norm": 8.158499717712402, "learning_rate": 8.425563909774438e-06, "loss": 0.3307, "step": 10470 }, { "epoch": 15.76, "grad_norm": 10.01845645904541, "learning_rate": 8.42406015037594e-06, "loss": 0.3196, "step": 10480 }, { "epoch": 15.77, "grad_norm": 4.778624534606934, "learning_rate": 8.422556390977445e-06, "loss": 0.3412, "step": 10490 }, { "epoch": 15.79, "grad_norm": 5.8504157066345215, "learning_rate": 8.421052631578948e-06, "loss": 0.4183, "step": 10500 }, { "epoch": 15.8, "grad_norm": 5.351130962371826, "learning_rate": 8.419548872180452e-06, "loss": 0.2639, "step": 10510 }, { "epoch": 15.82, "grad_norm": 7.211291313171387, "learning_rate": 8.418045112781955e-06, "loss": 0.3021, "step": 10520 }, { "epoch": 15.83, "grad_norm": 6.899810791015625, "learning_rate": 8.416541353383459e-06, "loss": 0.4069, "step": 10530 }, { "epoch": 15.85, "grad_norm": 4.64746618270874, "learning_rate": 8.415037593984962e-06, "loss": 0.2798, "step": 10540 }, { "epoch": 15.86, "grad_norm": 9.508644104003906, "learning_rate": 8.413533834586468e-06, "loss": 0.3791, "step": 10550 }, { "epoch": 15.88, "grad_norm": 6.956771373748779, "learning_rate": 8.412030075187971e-06, "loss": 0.4188, "step": 10560 }, { "epoch": 15.89, "grad_norm": 6.4203667640686035, "learning_rate": 8.410526315789475e-06, "loss": 0.3749, "step": 10570 }, { "epoch": 15.91, "grad_norm": 8.310030937194824, "learning_rate": 8.409022556390978e-06, "loss": 0.4605, "step": 10580 }, { "epoch": 15.92, "grad_norm": 8.788355827331543, "learning_rate": 8.407518796992482e-06, "loss": 0.3811, "step": 10590 }, { "epoch": 15.94, "grad_norm": 5.931136131286621, "learning_rate": 8.406015037593985e-06, "loss": 0.4129, "step": 10600 }, { "epoch": 15.95, "grad_norm": 7.846260070800781, "learning_rate": 8.40451127819549e-06, "loss": 0.3889, "step": 10610 }, { "epoch": 15.97, "grad_norm": 6.834481239318848, "learning_rate": 8.403007518796992e-06, "loss": 0.4303, "step": 10620 }, { "epoch": 15.98, "grad_norm": 3.6619720458984375, "learning_rate": 8.401503759398497e-06, "loss": 0.3483, "step": 10630 }, { "epoch": 16.0, "grad_norm": 12.63433837890625, "learning_rate": 8.400000000000001e-06, "loss": 0.386, "step": 10640 }, { "epoch": 16.0, "eval_accuracy": 0.931, "eval_loss": 0.25709524750709534, "eval_runtime": 84.9959, "eval_samples_per_second": 117.653, "eval_steps_per_second": 0.471, "step": 10640 }, { "epoch": 16.02, "grad_norm": 7.7511796951293945, "learning_rate": 8.398496240601504e-06, "loss": 0.3916, "step": 10650 }, { "epoch": 16.03, "grad_norm": 4.640677452087402, "learning_rate": 8.396992481203008e-06, "loss": 0.395, "step": 10660 }, { "epoch": 16.05, "grad_norm": 7.270589828491211, "learning_rate": 8.395488721804511e-06, "loss": 0.4299, "step": 10670 }, { "epoch": 16.06, "grad_norm": 5.698379993438721, "learning_rate": 8.393984962406015e-06, "loss": 0.3792, "step": 10680 }, { "epoch": 16.08, "grad_norm": 29.69732093811035, "learning_rate": 8.39248120300752e-06, "loss": 0.3564, "step": 10690 }, { "epoch": 16.09, "grad_norm": 5.11942720413208, "learning_rate": 8.390977443609024e-06, "loss": 0.3284, "step": 10700 }, { "epoch": 16.11, "grad_norm": 6.748551368713379, "learning_rate": 8.389473684210527e-06, "loss": 0.3582, "step": 10710 }, { "epoch": 16.12, "grad_norm": 6.748464584350586, "learning_rate": 8.38796992481203e-06, "loss": 0.3707, "step": 10720 }, { "epoch": 16.14, "grad_norm": 6.992805480957031, "learning_rate": 8.386466165413534e-06, "loss": 0.4388, "step": 10730 }, { "epoch": 16.15, "grad_norm": 4.689752578735352, "learning_rate": 8.384962406015038e-06, "loss": 0.3394, "step": 10740 }, { "epoch": 16.17, "grad_norm": 7.47608757019043, "learning_rate": 8.383458646616543e-06, "loss": 0.3662, "step": 10750 }, { "epoch": 16.18, "grad_norm": 8.161937713623047, "learning_rate": 8.381954887218045e-06, "loss": 0.3729, "step": 10760 }, { "epoch": 16.2, "grad_norm": 4.649080276489258, "learning_rate": 8.38045112781955e-06, "loss": 0.4012, "step": 10770 }, { "epoch": 16.21, "grad_norm": 5.3081512451171875, "learning_rate": 8.378947368421054e-06, "loss": 0.3275, "step": 10780 }, { "epoch": 16.23, "grad_norm": 8.424674034118652, "learning_rate": 8.377443609022557e-06, "loss": 0.3169, "step": 10790 }, { "epoch": 16.24, "grad_norm": 7.213728427886963, "learning_rate": 8.37593984962406e-06, "loss": 0.4158, "step": 10800 }, { "epoch": 16.26, "grad_norm": 4.228058815002441, "learning_rate": 8.374436090225566e-06, "loss": 0.2967, "step": 10810 }, { "epoch": 16.27, "grad_norm": 9.091151237487793, "learning_rate": 8.372932330827068e-06, "loss": 0.3631, "step": 10820 }, { "epoch": 16.29, "grad_norm": 7.325952053070068, "learning_rate": 8.371428571428573e-06, "loss": 0.4045, "step": 10830 }, { "epoch": 16.3, "grad_norm": 8.557323455810547, "learning_rate": 8.369924812030076e-06, "loss": 0.3657, "step": 10840 }, { "epoch": 16.32, "grad_norm": 8.98796558380127, "learning_rate": 8.36842105263158e-06, "loss": 0.3812, "step": 10850 }, { "epoch": 16.33, "grad_norm": 14.74909496307373, "learning_rate": 8.366917293233083e-06, "loss": 0.3966, "step": 10860 }, { "epoch": 16.35, "grad_norm": 6.872273921966553, "learning_rate": 8.365413533834587e-06, "loss": 0.3797, "step": 10870 }, { "epoch": 16.36, "grad_norm": 8.392032623291016, "learning_rate": 8.36390977443609e-06, "loss": 0.3864, "step": 10880 }, { "epoch": 16.38, "grad_norm": 8.61768627166748, "learning_rate": 8.362406015037596e-06, "loss": 0.3434, "step": 10890 }, { "epoch": 16.39, "grad_norm": 6.622392177581787, "learning_rate": 8.3609022556391e-06, "loss": 0.3751, "step": 10900 }, { "epoch": 16.41, "grad_norm": 7.108027458190918, "learning_rate": 8.359398496240603e-06, "loss": 0.3542, "step": 10910 }, { "epoch": 16.42, "grad_norm": 7.145939826965332, "learning_rate": 8.357894736842106e-06, "loss": 0.3586, "step": 10920 }, { "epoch": 16.44, "grad_norm": 16.715761184692383, "learning_rate": 8.35639097744361e-06, "loss": 0.4365, "step": 10930 }, { "epoch": 16.45, "grad_norm": 3.3526134490966797, "learning_rate": 8.354887218045113e-06, "loss": 0.38, "step": 10940 }, { "epoch": 16.47, "grad_norm": 4.425145149230957, "learning_rate": 8.353383458646617e-06, "loss": 0.2944, "step": 10950 }, { "epoch": 16.48, "grad_norm": 4.0147552490234375, "learning_rate": 8.35187969924812e-06, "loss": 0.3836, "step": 10960 }, { "epoch": 16.5, "grad_norm": 6.398830413818359, "learning_rate": 8.350375939849624e-06, "loss": 0.3402, "step": 10970 }, { "epoch": 16.51, "grad_norm": 5.556189060211182, "learning_rate": 8.348872180451129e-06, "loss": 0.3728, "step": 10980 }, { "epoch": 16.53, "grad_norm": 4.928891658782959, "learning_rate": 8.347368421052633e-06, "loss": 0.2993, "step": 10990 }, { "epoch": 16.54, "grad_norm": 8.429544448852539, "learning_rate": 8.345864661654136e-06, "loss": 0.3896, "step": 11000 }, { "epoch": 16.56, "grad_norm": 9.285510063171387, "learning_rate": 8.34436090225564e-06, "loss": 0.3428, "step": 11010 }, { "epoch": 16.57, "grad_norm": 5.118491172790527, "learning_rate": 8.342857142857143e-06, "loss": 0.3583, "step": 11020 }, { "epoch": 16.59, "grad_norm": 9.391587257385254, "learning_rate": 8.341353383458647e-06, "loss": 0.312, "step": 11030 }, { "epoch": 16.6, "grad_norm": 22.18227767944336, "learning_rate": 8.339849624060152e-06, "loss": 0.3549, "step": 11040 }, { "epoch": 16.62, "grad_norm": 5.834057331085205, "learning_rate": 8.338345864661654e-06, "loss": 0.4242, "step": 11050 }, { "epoch": 16.63, "grad_norm": 5.136647701263428, "learning_rate": 8.336842105263159e-06, "loss": 0.3623, "step": 11060 }, { "epoch": 16.65, "grad_norm": 5.481499195098877, "learning_rate": 8.335338345864662e-06, "loss": 0.3745, "step": 11070 }, { "epoch": 16.66, "grad_norm": 3.6383814811706543, "learning_rate": 8.333834586466166e-06, "loss": 0.3526, "step": 11080 }, { "epoch": 16.68, "grad_norm": 4.198364734649658, "learning_rate": 8.33233082706767e-06, "loss": 0.3247, "step": 11090 }, { "epoch": 16.69, "grad_norm": 6.066871166229248, "learning_rate": 8.330827067669175e-06, "loss": 0.346, "step": 11100 }, { "epoch": 16.71, "grad_norm": 4.822031497955322, "learning_rate": 8.329323308270676e-06, "loss": 0.3421, "step": 11110 }, { "epoch": 16.72, "grad_norm": 8.0927152633667, "learning_rate": 8.327819548872182e-06, "loss": 0.3212, "step": 11120 }, { "epoch": 16.74, "grad_norm": 5.737279415130615, "learning_rate": 8.326315789473685e-06, "loss": 0.338, "step": 11130 }, { "epoch": 16.75, "grad_norm": 6.6013994216918945, "learning_rate": 8.324812030075189e-06, "loss": 0.4221, "step": 11140 }, { "epoch": 16.77, "grad_norm": 3.22804856300354, "learning_rate": 8.323308270676692e-06, "loss": 0.2824, "step": 11150 }, { "epoch": 16.78, "grad_norm": 7.42767333984375, "learning_rate": 8.321804511278196e-06, "loss": 0.3654, "step": 11160 }, { "epoch": 16.8, "grad_norm": 6.393558979034424, "learning_rate": 8.3203007518797e-06, "loss": 0.3939, "step": 11170 }, { "epoch": 16.81, "grad_norm": 6.1491241455078125, "learning_rate": 8.318796992481204e-06, "loss": 0.3454, "step": 11180 }, { "epoch": 16.83, "grad_norm": 4.844079971313477, "learning_rate": 8.317293233082708e-06, "loss": 0.3744, "step": 11190 }, { "epoch": 16.84, "grad_norm": 7.577675819396973, "learning_rate": 8.315789473684212e-06, "loss": 0.4259, "step": 11200 }, { "epoch": 16.86, "grad_norm": 5.073234558105469, "learning_rate": 8.314285714285715e-06, "loss": 0.343, "step": 11210 }, { "epoch": 16.87, "grad_norm": 4.934657096862793, "learning_rate": 8.312781954887219e-06, "loss": 0.3834, "step": 11220 }, { "epoch": 16.89, "grad_norm": 4.744530200958252, "learning_rate": 8.311278195488722e-06, "loss": 0.2937, "step": 11230 }, { "epoch": 16.9, "grad_norm": 7.569250583648682, "learning_rate": 8.309774436090227e-06, "loss": 0.3719, "step": 11240 }, { "epoch": 16.92, "grad_norm": 7.076653480529785, "learning_rate": 8.308270676691729e-06, "loss": 0.3489, "step": 11250 }, { "epoch": 16.93, "grad_norm": 7.1391520500183105, "learning_rate": 8.306766917293234e-06, "loss": 0.3325, "step": 11260 }, { "epoch": 16.95, "grad_norm": 4.94738245010376, "learning_rate": 8.305263157894738e-06, "loss": 0.3421, "step": 11270 }, { "epoch": 16.96, "grad_norm": 6.052053451538086, "learning_rate": 8.303759398496241e-06, "loss": 0.4267, "step": 11280 }, { "epoch": 16.98, "grad_norm": 6.822144985198975, "learning_rate": 8.302255639097745e-06, "loss": 0.4932, "step": 11290 }, { "epoch": 16.99, "grad_norm": 7.0719218254089355, "learning_rate": 8.300751879699248e-06, "loss": 0.3688, "step": 11300 }, { "epoch": 17.0, "eval_accuracy": 0.9346, "eval_loss": 0.25758126378059387, "eval_runtime": 84.7562, "eval_samples_per_second": 117.986, "eval_steps_per_second": 0.472, "step": 11305 }, { "epoch": 17.01, "grad_norm": 4.741504669189453, "learning_rate": 8.299248120300752e-06, "loss": 0.3837, "step": 11310 }, { "epoch": 17.02, "grad_norm": 6.912674427032471, "learning_rate": 8.297744360902257e-06, "loss": 0.2938, "step": 11320 }, { "epoch": 17.04, "grad_norm": 5.881788730621338, "learning_rate": 8.29624060150376e-06, "loss": 0.3984, "step": 11330 }, { "epoch": 17.05, "grad_norm": 5.705871105194092, "learning_rate": 8.294736842105264e-06, "loss": 0.3793, "step": 11340 }, { "epoch": 17.07, "grad_norm": 5.036585330963135, "learning_rate": 8.293233082706768e-06, "loss": 0.365, "step": 11350 }, { "epoch": 17.08, "grad_norm": 5.040714740753174, "learning_rate": 8.291729323308271e-06, "loss": 0.3662, "step": 11360 }, { "epoch": 17.1, "grad_norm": 7.933087348937988, "learning_rate": 8.290225563909775e-06, "loss": 0.3204, "step": 11370 }, { "epoch": 17.11, "grad_norm": 8.096324920654297, "learning_rate": 8.28872180451128e-06, "loss": 0.3383, "step": 11380 }, { "epoch": 17.13, "grad_norm": 5.693844318389893, "learning_rate": 8.287218045112782e-06, "loss": 0.3455, "step": 11390 }, { "epoch": 17.14, "grad_norm": 5.274537086486816, "learning_rate": 8.285714285714287e-06, "loss": 0.3695, "step": 11400 }, { "epoch": 17.16, "grad_norm": 5.249573230743408, "learning_rate": 8.28421052631579e-06, "loss": 0.3801, "step": 11410 }, { "epoch": 17.17, "grad_norm": 6.644190311431885, "learning_rate": 8.282706766917294e-06, "loss": 0.3355, "step": 11420 }, { "epoch": 17.19, "grad_norm": 8.79143238067627, "learning_rate": 8.281203007518798e-06, "loss": 0.3857, "step": 11430 }, { "epoch": 17.2, "grad_norm": 4.0490851402282715, "learning_rate": 8.279699248120301e-06, "loss": 0.3224, "step": 11440 }, { "epoch": 17.22, "grad_norm": 8.35059928894043, "learning_rate": 8.278195488721805e-06, "loss": 0.372, "step": 11450 }, { "epoch": 17.23, "grad_norm": 5.733313083648682, "learning_rate": 8.276691729323308e-06, "loss": 0.3511, "step": 11460 }, { "epoch": 17.25, "grad_norm": 4.974298477172852, "learning_rate": 8.275187969924813e-06, "loss": 0.332, "step": 11470 }, { "epoch": 17.26, "grad_norm": 6.859165191650391, "learning_rate": 8.273684210526317e-06, "loss": 0.3053, "step": 11480 }, { "epoch": 17.28, "grad_norm": 7.4038591384887695, "learning_rate": 8.27218045112782e-06, "loss": 0.3419, "step": 11490 }, { "epoch": 17.29, "grad_norm": 4.045393943786621, "learning_rate": 8.270676691729324e-06, "loss": 0.3656, "step": 11500 }, { "epoch": 17.31, "grad_norm": 2.965898036956787, "learning_rate": 8.269172932330827e-06, "loss": 0.4222, "step": 11510 }, { "epoch": 17.32, "grad_norm": 6.6445746421813965, "learning_rate": 8.267669172932331e-06, "loss": 0.4351, "step": 11520 }, { "epoch": 17.34, "grad_norm": 11.220673561096191, "learning_rate": 8.266165413533836e-06, "loss": 0.3738, "step": 11530 }, { "epoch": 17.35, "grad_norm": 3.7990376949310303, "learning_rate": 8.264661654135338e-06, "loss": 0.2779, "step": 11540 }, { "epoch": 17.37, "grad_norm": 6.193857669830322, "learning_rate": 8.263157894736843e-06, "loss": 0.3864, "step": 11550 }, { "epoch": 17.38, "grad_norm": 7.089908123016357, "learning_rate": 8.261654135338347e-06, "loss": 0.3243, "step": 11560 }, { "epoch": 17.4, "grad_norm": 10.148313522338867, "learning_rate": 8.26015037593985e-06, "loss": 0.3848, "step": 11570 }, { "epoch": 17.41, "grad_norm": 7.47261905670166, "learning_rate": 8.258646616541354e-06, "loss": 0.3958, "step": 11580 }, { "epoch": 17.43, "grad_norm": 8.237654685974121, "learning_rate": 8.257142857142857e-06, "loss": 0.4104, "step": 11590 }, { "epoch": 17.44, "grad_norm": 7.028960227966309, "learning_rate": 8.25563909774436e-06, "loss": 0.3515, "step": 11600 }, { "epoch": 17.46, "grad_norm": 6.804955959320068, "learning_rate": 8.254135338345866e-06, "loss": 0.3555, "step": 11610 }, { "epoch": 17.47, "grad_norm": 8.740710258483887, "learning_rate": 8.25263157894737e-06, "loss": 0.328, "step": 11620 }, { "epoch": 17.49, "grad_norm": 6.383413314819336, "learning_rate": 8.251127819548873e-06, "loss": 0.3352, "step": 11630 }, { "epoch": 17.5, "grad_norm": 8.289705276489258, "learning_rate": 8.249624060150377e-06, "loss": 0.4067, "step": 11640 }, { "epoch": 17.52, "grad_norm": 6.0566911697387695, "learning_rate": 8.24812030075188e-06, "loss": 0.3556, "step": 11650 }, { "epoch": 17.53, "grad_norm": 9.813027381896973, "learning_rate": 8.246616541353384e-06, "loss": 0.3856, "step": 11660 }, { "epoch": 17.55, "grad_norm": 6.0970988273620605, "learning_rate": 8.245112781954889e-06, "loss": 0.396, "step": 11670 }, { "epoch": 17.56, "grad_norm": 4.837037086486816, "learning_rate": 8.24360902255639e-06, "loss": 0.3194, "step": 11680 }, { "epoch": 17.58, "grad_norm": 10.178328514099121, "learning_rate": 8.242105263157896e-06, "loss": 0.4108, "step": 11690 }, { "epoch": 17.59, "grad_norm": 3.696746826171875, "learning_rate": 8.2406015037594e-06, "loss": 0.3198, "step": 11700 }, { "epoch": 17.61, "grad_norm": 6.4773993492126465, "learning_rate": 8.239097744360903e-06, "loss": 0.2889, "step": 11710 }, { "epoch": 17.62, "grad_norm": 9.036526679992676, "learning_rate": 8.237593984962406e-06, "loss": 0.4616, "step": 11720 }, { "epoch": 17.64, "grad_norm": 5.1061320304870605, "learning_rate": 8.236090225563912e-06, "loss": 0.3941, "step": 11730 }, { "epoch": 17.65, "grad_norm": 5.17496919631958, "learning_rate": 8.234586466165413e-06, "loss": 0.3883, "step": 11740 }, { "epoch": 17.67, "grad_norm": 4.007594585418701, "learning_rate": 8.233082706766919e-06, "loss": 0.3097, "step": 11750 }, { "epoch": 17.68, "grad_norm": 3.903956174850464, "learning_rate": 8.231578947368422e-06, "loss": 0.3473, "step": 11760 }, { "epoch": 17.7, "grad_norm": 9.270066261291504, "learning_rate": 8.230075187969926e-06, "loss": 0.3255, "step": 11770 }, { "epoch": 17.71, "grad_norm": 4.118042469024658, "learning_rate": 8.22857142857143e-06, "loss": 0.335, "step": 11780 }, { "epoch": 17.73, "grad_norm": 5.715611457824707, "learning_rate": 8.227067669172933e-06, "loss": 0.422, "step": 11790 }, { "epoch": 17.74, "grad_norm": 5.848507881164551, "learning_rate": 8.225563909774436e-06, "loss": 0.3465, "step": 11800 }, { "epoch": 17.76, "grad_norm": 5.273082733154297, "learning_rate": 8.224060150375941e-06, "loss": 0.3479, "step": 11810 }, { "epoch": 17.77, "grad_norm": 4.500287055969238, "learning_rate": 8.222556390977445e-06, "loss": 0.3813, "step": 11820 }, { "epoch": 17.79, "grad_norm": 7.676726341247559, "learning_rate": 8.221052631578948e-06, "loss": 0.3973, "step": 11830 }, { "epoch": 17.8, "grad_norm": 6.1550211906433105, "learning_rate": 8.219548872180452e-06, "loss": 0.3209, "step": 11840 }, { "epoch": 17.82, "grad_norm": 3.796853542327881, "learning_rate": 8.218045112781955e-06, "loss": 0.3241, "step": 11850 }, { "epoch": 17.83, "grad_norm": 7.188779354095459, "learning_rate": 8.216541353383459e-06, "loss": 0.3923, "step": 11860 }, { "epoch": 17.85, "grad_norm": 5.088048934936523, "learning_rate": 8.215037593984964e-06, "loss": 0.3528, "step": 11870 }, { "epoch": 17.86, "grad_norm": 6.49263334274292, "learning_rate": 8.213533834586466e-06, "loss": 0.3724, "step": 11880 }, { "epoch": 17.88, "grad_norm": 8.370095252990723, "learning_rate": 8.212030075187971e-06, "loss": 0.3697, "step": 11890 }, { "epoch": 17.89, "grad_norm": 7.578341007232666, "learning_rate": 8.210526315789475e-06, "loss": 0.3317, "step": 11900 }, { "epoch": 17.91, "grad_norm": 5.1709723472595215, "learning_rate": 8.209022556390978e-06, "loss": 0.3304, "step": 11910 }, { "epoch": 17.92, "grad_norm": 4.586398124694824, "learning_rate": 8.207518796992482e-06, "loss": 0.3942, "step": 11920 }, { "epoch": 17.94, "grad_norm": 3.6240298748016357, "learning_rate": 8.206015037593985e-06, "loss": 0.2839, "step": 11930 }, { "epoch": 17.95, "grad_norm": 7.672499179840088, "learning_rate": 8.204511278195489e-06, "loss": 0.3539, "step": 11940 }, { "epoch": 17.97, "grad_norm": 5.807362079620361, "learning_rate": 8.203007518796992e-06, "loss": 0.3006, "step": 11950 }, { "epoch": 17.98, "grad_norm": 5.875560283660889, "learning_rate": 8.201503759398498e-06, "loss": 0.3857, "step": 11960 }, { "epoch": 18.0, "grad_norm": 47.7349967956543, "learning_rate": 8.2e-06, "loss": 0.3985, "step": 11970 }, { "epoch": 18.0, "eval_accuracy": 0.9356, "eval_loss": 0.25322866439819336, "eval_runtime": 84.6682, "eval_samples_per_second": 118.108, "eval_steps_per_second": 0.472, "step": 11970 }, { "epoch": 18.02, "grad_norm": 5.787768363952637, "learning_rate": 8.198496240601505e-06, "loss": 0.4037, "step": 11980 }, { "epoch": 18.03, "grad_norm": 9.899946212768555, "learning_rate": 8.196992481203008e-06, "loss": 0.3305, "step": 11990 }, { "epoch": 18.05, "grad_norm": 7.430878639221191, "learning_rate": 8.195488721804512e-06, "loss": 0.3072, "step": 12000 }, { "epoch": 18.06, "grad_norm": 4.635775566101074, "learning_rate": 8.193984962406015e-06, "loss": 0.3824, "step": 12010 }, { "epoch": 18.08, "grad_norm": 11.533434867858887, "learning_rate": 8.19248120300752e-06, "loss": 0.3494, "step": 12020 }, { "epoch": 18.09, "grad_norm": 9.492911338806152, "learning_rate": 8.190977443609022e-06, "loss": 0.2719, "step": 12030 }, { "epoch": 18.11, "grad_norm": 4.291653156280518, "learning_rate": 8.189473684210527e-06, "loss": 0.3766, "step": 12040 }, { "epoch": 18.12, "grad_norm": 6.437026023864746, "learning_rate": 8.187969924812031e-06, "loss": 0.3757, "step": 12050 }, { "epoch": 18.14, "grad_norm": 6.730251789093018, "learning_rate": 8.186466165413534e-06, "loss": 0.3426, "step": 12060 }, { "epoch": 18.15, "grad_norm": 4.403579235076904, "learning_rate": 8.184962406015038e-06, "loss": 0.4402, "step": 12070 }, { "epoch": 18.17, "grad_norm": 4.053240776062012, "learning_rate": 8.183458646616542e-06, "loss": 0.4017, "step": 12080 }, { "epoch": 18.18, "grad_norm": 9.355215072631836, "learning_rate": 8.181954887218045e-06, "loss": 0.3034, "step": 12090 }, { "epoch": 18.2, "grad_norm": 6.016571521759033, "learning_rate": 8.18045112781955e-06, "loss": 0.2966, "step": 12100 }, { "epoch": 18.21, "grad_norm": 3.7155065536499023, "learning_rate": 8.178947368421054e-06, "loss": 0.325, "step": 12110 }, { "epoch": 18.23, "grad_norm": 4.735385417938232, "learning_rate": 8.177443609022557e-06, "loss": 0.3916, "step": 12120 }, { "epoch": 18.24, "grad_norm": 5.728586673736572, "learning_rate": 8.17593984962406e-06, "loss": 0.3307, "step": 12130 }, { "epoch": 18.26, "grad_norm": 3.7535057067871094, "learning_rate": 8.174436090225564e-06, "loss": 0.2576, "step": 12140 }, { "epoch": 18.27, "grad_norm": 7.422114849090576, "learning_rate": 8.172932330827068e-06, "loss": 0.405, "step": 12150 }, { "epoch": 18.29, "grad_norm": 7.051729679107666, "learning_rate": 8.171428571428573e-06, "loss": 0.3504, "step": 12160 }, { "epoch": 18.3, "grad_norm": 5.593104362487793, "learning_rate": 8.169924812030075e-06, "loss": 0.3071, "step": 12170 }, { "epoch": 18.32, "grad_norm": 6.640134811401367, "learning_rate": 8.16842105263158e-06, "loss": 0.38, "step": 12180 }, { "epoch": 18.33, "grad_norm": 7.922089099884033, "learning_rate": 8.166917293233084e-06, "loss": 0.3383, "step": 12190 }, { "epoch": 18.35, "grad_norm": 6.431313514709473, "learning_rate": 8.165413533834587e-06, "loss": 0.2918, "step": 12200 }, { "epoch": 18.36, "grad_norm": 5.122470855712891, "learning_rate": 8.16390977443609e-06, "loss": 0.3129, "step": 12210 }, { "epoch": 18.38, "grad_norm": 5.576056957244873, "learning_rate": 8.162406015037594e-06, "loss": 0.3316, "step": 12220 }, { "epoch": 18.39, "grad_norm": 5.235702037811279, "learning_rate": 8.160902255639098e-06, "loss": 0.2955, "step": 12230 }, { "epoch": 18.41, "grad_norm": 5.954382419586182, "learning_rate": 8.159398496240603e-06, "loss": 0.3256, "step": 12240 }, { "epoch": 18.42, "grad_norm": 5.545419216156006, "learning_rate": 8.157894736842106e-06, "loss": 0.358, "step": 12250 }, { "epoch": 18.44, "grad_norm": 12.001044273376465, "learning_rate": 8.15639097744361e-06, "loss": 0.3185, "step": 12260 }, { "epoch": 18.45, "grad_norm": 10.437068939208984, "learning_rate": 8.154887218045113e-06, "loss": 0.3234, "step": 12270 }, { "epoch": 18.47, "grad_norm": 5.405700206756592, "learning_rate": 8.153383458646617e-06, "loss": 0.251, "step": 12280 }, { "epoch": 18.48, "grad_norm": 6.865713119506836, "learning_rate": 8.15187969924812e-06, "loss": 0.4126, "step": 12290 }, { "epoch": 18.5, "grad_norm": 6.4221038818359375, "learning_rate": 8.150375939849626e-06, "loss": 0.3338, "step": 12300 }, { "epoch": 18.51, "grad_norm": 5.260381698608398, "learning_rate": 8.148872180451128e-06, "loss": 0.3482, "step": 12310 }, { "epoch": 18.53, "grad_norm": 8.89275074005127, "learning_rate": 8.147368421052633e-06, "loss": 0.3684, "step": 12320 }, { "epoch": 18.54, "grad_norm": 4.887394428253174, "learning_rate": 8.145864661654136e-06, "loss": 0.3673, "step": 12330 }, { "epoch": 18.56, "grad_norm": 3.8077456951141357, "learning_rate": 8.14436090225564e-06, "loss": 0.3478, "step": 12340 }, { "epoch": 18.57, "grad_norm": 5.575742721557617, "learning_rate": 8.142857142857143e-06, "loss": 0.332, "step": 12350 }, { "epoch": 18.59, "grad_norm": 6.4698333740234375, "learning_rate": 8.141353383458649e-06, "loss": 0.4422, "step": 12360 }, { "epoch": 18.6, "grad_norm": 5.90009880065918, "learning_rate": 8.13984962406015e-06, "loss": 0.4025, "step": 12370 }, { "epoch": 18.62, "grad_norm": 7.366026401519775, "learning_rate": 8.138345864661656e-06, "loss": 0.3364, "step": 12380 }, { "epoch": 18.63, "grad_norm": 5.042256832122803, "learning_rate": 8.136842105263159e-06, "loss": 0.2686, "step": 12390 }, { "epoch": 18.65, "grad_norm": 5.483407020568848, "learning_rate": 8.135338345864663e-06, "loss": 0.2765, "step": 12400 }, { "epoch": 18.66, "grad_norm": 8.821917533874512, "learning_rate": 8.133834586466166e-06, "loss": 0.3899, "step": 12410 }, { "epoch": 18.68, "grad_norm": 5.6711297035217285, "learning_rate": 8.13233082706767e-06, "loss": 0.4293, "step": 12420 }, { "epoch": 18.69, "grad_norm": 6.44101619720459, "learning_rate": 8.130827067669173e-06, "loss": 0.3191, "step": 12430 }, { "epoch": 18.71, "grad_norm": 4.042926788330078, "learning_rate": 8.129323308270677e-06, "loss": 0.3414, "step": 12440 }, { "epoch": 18.72, "grad_norm": 7.297868728637695, "learning_rate": 8.127819548872182e-06, "loss": 0.2944, "step": 12450 }, { "epoch": 18.74, "grad_norm": 7.283233642578125, "learning_rate": 8.126315789473684e-06, "loss": 0.3262, "step": 12460 }, { "epoch": 18.75, "grad_norm": 7.256216526031494, "learning_rate": 8.124812030075189e-06, "loss": 0.3714, "step": 12470 }, { "epoch": 18.77, "grad_norm": 5.120934009552002, "learning_rate": 8.123308270676692e-06, "loss": 0.3963, "step": 12480 }, { "epoch": 18.78, "grad_norm": 5.541153907775879, "learning_rate": 8.121804511278196e-06, "loss": 0.3602, "step": 12490 }, { "epoch": 18.8, "grad_norm": 5.731190204620361, "learning_rate": 8.1203007518797e-06, "loss": 0.4258, "step": 12500 }, { "epoch": 18.81, "grad_norm": 5.054348468780518, "learning_rate": 8.118796992481203e-06, "loss": 0.2603, "step": 12510 }, { "epoch": 18.83, "grad_norm": 4.947210311889648, "learning_rate": 8.117293233082707e-06, "loss": 0.332, "step": 12520 }, { "epoch": 18.84, "grad_norm": 7.019644737243652, "learning_rate": 8.115789473684212e-06, "loss": 0.3752, "step": 12530 }, { "epoch": 18.86, "grad_norm": 6.0113325119018555, "learning_rate": 8.114285714285715e-06, "loss": 0.2853, "step": 12540 }, { "epoch": 18.87, "grad_norm": 10.12833309173584, "learning_rate": 8.112781954887219e-06, "loss": 0.2721, "step": 12550 }, { "epoch": 18.89, "grad_norm": 3.987274408340454, "learning_rate": 8.111278195488722e-06, "loss": 0.3351, "step": 12560 }, { "epoch": 18.9, "grad_norm": 6.421701431274414, "learning_rate": 8.109774436090226e-06, "loss": 0.3202, "step": 12570 }, { "epoch": 18.92, "grad_norm": 6.776229381561279, "learning_rate": 8.10827067669173e-06, "loss": 0.3123, "step": 12580 }, { "epoch": 18.93, "grad_norm": 8.584750175476074, "learning_rate": 8.106766917293235e-06, "loss": 0.4226, "step": 12590 }, { "epoch": 18.95, "grad_norm": 4.636366844177246, "learning_rate": 8.105263157894736e-06, "loss": 0.4647, "step": 12600 }, { "epoch": 18.96, "grad_norm": 5.144723415374756, "learning_rate": 8.103759398496242e-06, "loss": 0.287, "step": 12610 }, { "epoch": 18.98, "grad_norm": 7.949281692504883, "learning_rate": 8.102255639097745e-06, "loss": 0.3361, "step": 12620 }, { "epoch": 18.99, "grad_norm": 6.716331958770752, "learning_rate": 8.100751879699249e-06, "loss": 0.3213, "step": 12630 }, { "epoch": 19.0, "eval_accuracy": 0.9321, "eval_loss": 0.27284207940101624, "eval_runtime": 84.4705, "eval_samples_per_second": 118.384, "eval_steps_per_second": 0.474, "step": 12635 }, { "epoch": 19.01, "grad_norm": 5.840856075286865, "learning_rate": 8.099248120300752e-06, "loss": 0.3154, "step": 12640 }, { "epoch": 19.02, "grad_norm": 6.531993389129639, "learning_rate": 8.097744360902257e-06, "loss": 0.3244, "step": 12650 }, { "epoch": 19.04, "grad_norm": 8.1339111328125, "learning_rate": 8.09624060150376e-06, "loss": 0.3372, "step": 12660 }, { "epoch": 19.05, "grad_norm": 5.9135518074035645, "learning_rate": 8.094736842105264e-06, "loss": 0.3514, "step": 12670 }, { "epoch": 19.07, "grad_norm": 4.900998592376709, "learning_rate": 8.093233082706768e-06, "loss": 0.3248, "step": 12680 }, { "epoch": 19.08, "grad_norm": 6.988156795501709, "learning_rate": 8.091729323308271e-06, "loss": 0.3665, "step": 12690 }, { "epoch": 19.1, "grad_norm": 6.229936599731445, "learning_rate": 8.090225563909775e-06, "loss": 0.2717, "step": 12700 }, { "epoch": 19.11, "grad_norm": 7.66952657699585, "learning_rate": 8.088721804511278e-06, "loss": 0.4116, "step": 12710 }, { "epoch": 19.13, "grad_norm": 5.332541465759277, "learning_rate": 8.087218045112782e-06, "loss": 0.2748, "step": 12720 }, { "epoch": 19.14, "grad_norm": 6.3354268074035645, "learning_rate": 8.085714285714287e-06, "loss": 0.3508, "step": 12730 }, { "epoch": 19.16, "grad_norm": 2.598149538040161, "learning_rate": 8.08421052631579e-06, "loss": 0.2887, "step": 12740 }, { "epoch": 19.17, "grad_norm": 3.951981544494629, "learning_rate": 8.082706766917294e-06, "loss": 0.3201, "step": 12750 }, { "epoch": 19.19, "grad_norm": 6.856266021728516, "learning_rate": 8.081203007518798e-06, "loss": 0.3015, "step": 12760 }, { "epoch": 19.2, "grad_norm": 8.871770858764648, "learning_rate": 8.079699248120301e-06, "loss": 0.4151, "step": 12770 }, { "epoch": 19.22, "grad_norm": 7.119663238525391, "learning_rate": 8.078195488721805e-06, "loss": 0.2968, "step": 12780 }, { "epoch": 19.23, "grad_norm": 5.254119396209717, "learning_rate": 8.07669172932331e-06, "loss": 0.2823, "step": 12790 }, { "epoch": 19.25, "grad_norm": 8.958704948425293, "learning_rate": 8.075187969924812e-06, "loss": 0.3453, "step": 12800 }, { "epoch": 19.26, "grad_norm": 8.45573616027832, "learning_rate": 8.073684210526317e-06, "loss": 0.3827, "step": 12810 }, { "epoch": 19.28, "grad_norm": 3.3245856761932373, "learning_rate": 8.07218045112782e-06, "loss": 0.3515, "step": 12820 }, { "epoch": 19.29, "grad_norm": 4.837510585784912, "learning_rate": 8.070676691729324e-06, "loss": 0.3397, "step": 12830 }, { "epoch": 19.31, "grad_norm": 3.3870489597320557, "learning_rate": 8.069172932330828e-06, "loss": 0.3151, "step": 12840 }, { "epoch": 19.32, "grad_norm": 6.1325201988220215, "learning_rate": 8.067669172932333e-06, "loss": 0.3491, "step": 12850 }, { "epoch": 19.34, "grad_norm": 6.618387699127197, "learning_rate": 8.066165413533835e-06, "loss": 0.3685, "step": 12860 }, { "epoch": 19.35, "grad_norm": 6.2998223304748535, "learning_rate": 8.06466165413534e-06, "loss": 0.4341, "step": 12870 }, { "epoch": 19.37, "grad_norm": 5.405755043029785, "learning_rate": 8.063157894736843e-06, "loss": 0.3056, "step": 12880 }, { "epoch": 19.38, "grad_norm": 5.74697732925415, "learning_rate": 8.061654135338347e-06, "loss": 0.4015, "step": 12890 }, { "epoch": 19.4, "grad_norm": 4.222403526306152, "learning_rate": 8.06015037593985e-06, "loss": 0.2818, "step": 12900 }, { "epoch": 19.41, "grad_norm": 11.104449272155762, "learning_rate": 8.058646616541354e-06, "loss": 0.3528, "step": 12910 }, { "epoch": 19.43, "grad_norm": 5.580349445343018, "learning_rate": 8.057142857142857e-06, "loss": 0.3511, "step": 12920 }, { "epoch": 19.44, "grad_norm": 4.909841060638428, "learning_rate": 8.055639097744361e-06, "loss": 0.2696, "step": 12930 }, { "epoch": 19.46, "grad_norm": 4.32835054397583, "learning_rate": 8.054135338345866e-06, "loss": 0.2714, "step": 12940 }, { "epoch": 19.47, "grad_norm": 4.742302894592285, "learning_rate": 8.052631578947368e-06, "loss": 0.3427, "step": 12950 }, { "epoch": 19.49, "grad_norm": 6.302316665649414, "learning_rate": 8.051127819548873e-06, "loss": 0.406, "step": 12960 }, { "epoch": 19.5, "grad_norm": 6.898422718048096, "learning_rate": 8.049624060150377e-06, "loss": 0.3048, "step": 12970 }, { "epoch": 19.52, "grad_norm": 9.071293830871582, "learning_rate": 8.04812030075188e-06, "loss": 0.3603, "step": 12980 }, { "epoch": 19.53, "grad_norm": 5.120814323425293, "learning_rate": 8.046616541353384e-06, "loss": 0.3555, "step": 12990 }, { "epoch": 19.55, "grad_norm": 8.293437957763672, "learning_rate": 8.045112781954887e-06, "loss": 0.3061, "step": 13000 }, { "epoch": 19.56, "grad_norm": 6.051889419555664, "learning_rate": 8.04360902255639e-06, "loss": 0.3308, "step": 13010 }, { "epoch": 19.58, "grad_norm": 6.6684889793396, "learning_rate": 8.042105263157896e-06, "loss": 0.3497, "step": 13020 }, { "epoch": 19.59, "grad_norm": 6.767400741577148, "learning_rate": 8.0406015037594e-06, "loss": 0.3037, "step": 13030 }, { "epoch": 19.61, "grad_norm": 6.514301300048828, "learning_rate": 8.039097744360903e-06, "loss": 0.266, "step": 13040 }, { "epoch": 19.62, "grad_norm": 4.410576820373535, "learning_rate": 8.037593984962407e-06, "loss": 0.3086, "step": 13050 }, { "epoch": 19.64, "grad_norm": 8.338872909545898, "learning_rate": 8.03609022556391e-06, "loss": 0.3505, "step": 13060 }, { "epoch": 19.65, "grad_norm": 7.640175819396973, "learning_rate": 8.034586466165414e-06, "loss": 0.3573, "step": 13070 }, { "epoch": 19.67, "grad_norm": 5.462586402893066, "learning_rate": 8.033082706766919e-06, "loss": 0.3053, "step": 13080 }, { "epoch": 19.68, "grad_norm": 4.294394016265869, "learning_rate": 8.03157894736842e-06, "loss": 0.3741, "step": 13090 }, { "epoch": 19.7, "grad_norm": 4.442080497741699, "learning_rate": 8.030075187969926e-06, "loss": 0.2855, "step": 13100 }, { "epoch": 19.71, "grad_norm": 3.8873209953308105, "learning_rate": 8.02857142857143e-06, "loss": 0.2914, "step": 13110 }, { "epoch": 19.73, "grad_norm": 9.044189453125, "learning_rate": 8.027067669172933e-06, "loss": 0.3345, "step": 13120 }, { "epoch": 19.74, "grad_norm": 9.596213340759277, "learning_rate": 8.025563909774436e-06, "loss": 0.3249, "step": 13130 }, { "epoch": 19.76, "grad_norm": 6.889251708984375, "learning_rate": 8.02406015037594e-06, "loss": 0.3135, "step": 13140 }, { "epoch": 19.77, "grad_norm": 6.394194602966309, "learning_rate": 8.022556390977443e-06, "loss": 0.2785, "step": 13150 }, { "epoch": 19.79, "grad_norm": 8.294524192810059, "learning_rate": 8.021052631578949e-06, "loss": 0.3854, "step": 13160 }, { "epoch": 19.8, "grad_norm": 9.557373046875, "learning_rate": 8.019548872180452e-06, "loss": 0.3031, "step": 13170 }, { "epoch": 19.82, "grad_norm": 5.370851516723633, "learning_rate": 8.018045112781956e-06, "loss": 0.3836, "step": 13180 }, { "epoch": 19.83, "grad_norm": 7.893476963043213, "learning_rate": 8.01654135338346e-06, "loss": 0.4143, "step": 13190 }, { "epoch": 19.85, "grad_norm": 6.872874736785889, "learning_rate": 8.015037593984963e-06, "loss": 0.3131, "step": 13200 }, { "epoch": 19.86, "grad_norm": 8.026068687438965, "learning_rate": 8.013533834586466e-06, "loss": 0.3558, "step": 13210 }, { "epoch": 19.88, "grad_norm": 5.648526668548584, "learning_rate": 8.012030075187971e-06, "loss": 0.3377, "step": 13220 }, { "epoch": 19.89, "grad_norm": 9.711199760437012, "learning_rate": 8.010526315789473e-06, "loss": 0.3421, "step": 13230 }, { "epoch": 19.91, "grad_norm": 5.787407875061035, "learning_rate": 8.009022556390979e-06, "loss": 0.3919, "step": 13240 }, { "epoch": 19.92, "grad_norm": 6.500585079193115, "learning_rate": 8.007518796992482e-06, "loss": 0.3383, "step": 13250 }, { "epoch": 19.94, "grad_norm": 6.25896692276001, "learning_rate": 8.006015037593986e-06, "loss": 0.3031, "step": 13260 }, { "epoch": 19.95, "grad_norm": 4.565229415893555, "learning_rate": 8.004511278195489e-06, "loss": 0.379, "step": 13270 }, { "epoch": 19.97, "grad_norm": 5.765014171600342, "learning_rate": 8.003007518796994e-06, "loss": 0.2949, "step": 13280 }, { "epoch": 19.98, "grad_norm": 5.820449352264404, "learning_rate": 8.001503759398496e-06, "loss": 0.2912, "step": 13290 }, { "epoch": 20.0, "grad_norm": 0.44211289286613464, "learning_rate": 8.000000000000001e-06, "loss": 0.3046, "step": 13300 }, { "epoch": 20.0, "eval_accuracy": 0.9334, "eval_loss": 0.27018824219703674, "eval_runtime": 84.9396, "eval_samples_per_second": 117.731, "eval_steps_per_second": 0.471, "step": 13300 }, { "epoch": 20.02, "grad_norm": 6.310322284698486, "learning_rate": 7.998496240601505e-06, "loss": 0.3525, "step": 13310 }, { "epoch": 20.03, "grad_norm": 4.756250381469727, "learning_rate": 7.996992481203008e-06, "loss": 0.2493, "step": 13320 }, { "epoch": 20.05, "grad_norm": 6.389369487762451, "learning_rate": 7.995488721804512e-06, "loss": 0.3425, "step": 13330 }, { "epoch": 20.06, "grad_norm": 6.042266368865967, "learning_rate": 7.993984962406015e-06, "loss": 0.2894, "step": 13340 }, { "epoch": 20.08, "grad_norm": 5.721156597137451, "learning_rate": 7.992481203007519e-06, "loss": 0.3251, "step": 13350 }, { "epoch": 20.09, "grad_norm": 8.065733909606934, "learning_rate": 7.990977443609024e-06, "loss": 0.2898, "step": 13360 }, { "epoch": 20.11, "grad_norm": 5.018655300140381, "learning_rate": 7.989473684210528e-06, "loss": 0.3525, "step": 13370 }, { "epoch": 20.12, "grad_norm": 7.109757900238037, "learning_rate": 7.987969924812031e-06, "loss": 0.2977, "step": 13380 }, { "epoch": 20.14, "grad_norm": 5.0374274253845215, "learning_rate": 7.986466165413535e-06, "loss": 0.2893, "step": 13390 }, { "epoch": 20.15, "grad_norm": 6.046887397766113, "learning_rate": 7.984962406015038e-06, "loss": 0.2753, "step": 13400 }, { "epoch": 20.17, "grad_norm": 4.841386795043945, "learning_rate": 7.983458646616542e-06, "loss": 0.3502, "step": 13410 }, { "epoch": 20.18, "grad_norm": 6.6681036949157715, "learning_rate": 7.981954887218045e-06, "loss": 0.3064, "step": 13420 }, { "epoch": 20.2, "grad_norm": 7.210595607757568, "learning_rate": 7.980451127819549e-06, "loss": 0.3887, "step": 13430 }, { "epoch": 20.21, "grad_norm": 7.28894567489624, "learning_rate": 7.978947368421052e-06, "loss": 0.3476, "step": 13440 }, { "epoch": 20.23, "grad_norm": 14.942082405090332, "learning_rate": 7.977443609022558e-06, "loss": 0.3151, "step": 13450 }, { "epoch": 20.24, "grad_norm": 6.0876054763793945, "learning_rate": 7.975939849624061e-06, "loss": 0.3542, "step": 13460 }, { "epoch": 20.26, "grad_norm": 8.294320106506348, "learning_rate": 7.974436090225565e-06, "loss": 0.3312, "step": 13470 }, { "epoch": 20.27, "grad_norm": 5.090022087097168, "learning_rate": 7.972932330827068e-06, "loss": 0.323, "step": 13480 }, { "epoch": 20.29, "grad_norm": 2.9050402641296387, "learning_rate": 7.971428571428572e-06, "loss": 0.2753, "step": 13490 }, { "epoch": 20.3, "grad_norm": 7.342869758605957, "learning_rate": 7.969924812030075e-06, "loss": 0.2639, "step": 13500 }, { "epoch": 20.32, "grad_norm": 5.894117832183838, "learning_rate": 7.96842105263158e-06, "loss": 0.2893, "step": 13510 }, { "epoch": 20.33, "grad_norm": 5.4692792892456055, "learning_rate": 7.966917293233082e-06, "loss": 0.3384, "step": 13520 }, { "epoch": 20.35, "grad_norm": 7.577072620391846, "learning_rate": 7.965413533834587e-06, "loss": 0.3865, "step": 13530 }, { "epoch": 20.36, "grad_norm": 22.644380569458008, "learning_rate": 7.963909774436091e-06, "loss": 0.3332, "step": 13540 }, { "epoch": 20.38, "grad_norm": 6.491385459899902, "learning_rate": 7.962406015037594e-06, "loss": 0.3662, "step": 13550 }, { "epoch": 20.39, "grad_norm": 5.722530841827393, "learning_rate": 7.960902255639098e-06, "loss": 0.3452, "step": 13560 }, { "epoch": 20.41, "grad_norm": 5.905157566070557, "learning_rate": 7.959398496240603e-06, "loss": 0.3424, "step": 13570 }, { "epoch": 20.42, "grad_norm": 8.227418899536133, "learning_rate": 7.957894736842105e-06, "loss": 0.3635, "step": 13580 }, { "epoch": 20.44, "grad_norm": 3.507075309753418, "learning_rate": 7.95639097744361e-06, "loss": 0.2959, "step": 13590 }, { "epoch": 20.45, "grad_norm": 10.020979881286621, "learning_rate": 7.954887218045114e-06, "loss": 0.325, "step": 13600 }, { "epoch": 20.47, "grad_norm": 6.452096939086914, "learning_rate": 7.953383458646617e-06, "loss": 0.2817, "step": 13610 }, { "epoch": 20.48, "grad_norm": 5.931590557098389, "learning_rate": 7.95187969924812e-06, "loss": 0.3138, "step": 13620 }, { "epoch": 20.5, "grad_norm": 11.746089935302734, "learning_rate": 7.950375939849624e-06, "loss": 0.3263, "step": 13630 }, { "epoch": 20.51, "grad_norm": 5.600671291351318, "learning_rate": 7.948872180451128e-06, "loss": 0.2635, "step": 13640 }, { "epoch": 20.53, "grad_norm": 4.570649147033691, "learning_rate": 7.947368421052633e-06, "loss": 0.3372, "step": 13650 }, { "epoch": 20.54, "grad_norm": 10.411113739013672, "learning_rate": 7.945864661654136e-06, "loss": 0.3403, "step": 13660 }, { "epoch": 20.56, "grad_norm": 7.8483710289001465, "learning_rate": 7.94436090225564e-06, "loss": 0.4401, "step": 13670 }, { "epoch": 20.57, "grad_norm": 6.16978645324707, "learning_rate": 7.942857142857144e-06, "loss": 0.2905, "step": 13680 }, { "epoch": 20.59, "grad_norm": 5.427683353424072, "learning_rate": 7.941353383458647e-06, "loss": 0.3853, "step": 13690 }, { "epoch": 20.6, "grad_norm": 10.262079238891602, "learning_rate": 7.93984962406015e-06, "loss": 0.3126, "step": 13700 }, { "epoch": 20.62, "grad_norm": 2.678656578063965, "learning_rate": 7.938345864661656e-06, "loss": 0.2864, "step": 13710 }, { "epoch": 20.63, "grad_norm": 4.784215450286865, "learning_rate": 7.936842105263158e-06, "loss": 0.2532, "step": 13720 }, { "epoch": 20.65, "grad_norm": 6.748628616333008, "learning_rate": 7.935338345864663e-06, "loss": 0.3338, "step": 13730 }, { "epoch": 20.66, "grad_norm": 6.877127647399902, "learning_rate": 7.933834586466166e-06, "loss": 0.2662, "step": 13740 }, { "epoch": 20.68, "grad_norm": 7.904510974884033, "learning_rate": 7.93233082706767e-06, "loss": 0.3532, "step": 13750 }, { "epoch": 20.69, "grad_norm": 3.242637872695923, "learning_rate": 7.930827067669173e-06, "loss": 0.3012, "step": 13760 }, { "epoch": 20.71, "grad_norm": 3.0852160453796387, "learning_rate": 7.929323308270679e-06, "loss": 0.2948, "step": 13770 }, { "epoch": 20.72, "grad_norm": 10.306611061096191, "learning_rate": 7.92781954887218e-06, "loss": 0.2837, "step": 13780 }, { "epoch": 20.74, "grad_norm": 3.293861150741577, "learning_rate": 7.926315789473686e-06, "loss": 0.3023, "step": 13790 }, { "epoch": 20.75, "grad_norm": 6.937739849090576, "learning_rate": 7.924812030075189e-06, "loss": 0.3147, "step": 13800 }, { "epoch": 20.77, "grad_norm": 4.617713928222656, "learning_rate": 7.923308270676693e-06, "loss": 0.3503, "step": 13810 }, { "epoch": 20.78, "grad_norm": 4.726008892059326, "learning_rate": 7.921804511278196e-06, "loss": 0.3526, "step": 13820 }, { "epoch": 20.8, "grad_norm": 6.7416558265686035, "learning_rate": 7.9203007518797e-06, "loss": 0.2887, "step": 13830 }, { "epoch": 20.81, "grad_norm": 6.563324451446533, "learning_rate": 7.918796992481203e-06, "loss": 0.2874, "step": 13840 }, { "epoch": 20.83, "grad_norm": 5.6465840339660645, "learning_rate": 7.917293233082708e-06, "loss": 0.3598, "step": 13850 }, { "epoch": 20.84, "grad_norm": 5.625797748565674, "learning_rate": 7.915789473684212e-06, "loss": 0.3195, "step": 13860 }, { "epoch": 20.86, "grad_norm": 7.804876327514648, "learning_rate": 7.914285714285715e-06, "loss": 0.3369, "step": 13870 }, { "epoch": 20.87, "grad_norm": 8.359640121459961, "learning_rate": 7.912781954887219e-06, "loss": 0.3332, "step": 13880 }, { "epoch": 20.89, "grad_norm": 5.867996692657471, "learning_rate": 7.911278195488723e-06, "loss": 0.4172, "step": 13890 }, { "epoch": 20.9, "grad_norm": 5.789632320404053, "learning_rate": 7.909774436090226e-06, "loss": 0.3705, "step": 13900 }, { "epoch": 20.92, "grad_norm": 4.992484092712402, "learning_rate": 7.90827067669173e-06, "loss": 0.3287, "step": 13910 }, { "epoch": 20.93, "grad_norm": 8.009930610656738, "learning_rate": 7.906766917293233e-06, "loss": 0.3065, "step": 13920 }, { "epoch": 20.95, "grad_norm": 5.598669528961182, "learning_rate": 7.905263157894737e-06, "loss": 0.3115, "step": 13930 }, { "epoch": 20.96, "grad_norm": 5.0005364418029785, "learning_rate": 7.903759398496242e-06, "loss": 0.3635, "step": 13940 }, { "epoch": 20.98, "grad_norm": 6.9587788581848145, "learning_rate": 7.902255639097745e-06, "loss": 0.3386, "step": 13950 }, { "epoch": 20.99, "grad_norm": 6.115482330322266, "learning_rate": 7.900751879699249e-06, "loss": 0.3676, "step": 13960 }, { "epoch": 21.0, "eval_accuracy": 0.9319, "eval_loss": 0.2700176537036896, "eval_runtime": 84.5019, "eval_samples_per_second": 118.341, "eval_steps_per_second": 0.473, "step": 13965 }, { "epoch": 21.01, "grad_norm": 3.4582622051239014, "learning_rate": 7.899248120300752e-06, "loss": 0.6324, "step": 13970 }, { "epoch": 21.02, "grad_norm": 5.3081512451171875, "learning_rate": 7.897744360902256e-06, "loss": 0.3913, "step": 13980 }, { "epoch": 21.04, "grad_norm": 7.172489166259766, "learning_rate": 7.89624060150376e-06, "loss": 0.3863, "step": 13990 }, { "epoch": 21.05, "grad_norm": 6.308826446533203, "learning_rate": 7.894736842105265e-06, "loss": 0.3214, "step": 14000 }, { "epoch": 21.07, "grad_norm": 5.6395111083984375, "learning_rate": 7.893233082706766e-06, "loss": 0.2801, "step": 14010 }, { "epoch": 21.08, "grad_norm": 2.734475612640381, "learning_rate": 7.891729323308272e-06, "loss": 0.3212, "step": 14020 }, { "epoch": 21.1, "grad_norm": 4.720375061035156, "learning_rate": 7.890225563909775e-06, "loss": 0.3271, "step": 14030 }, { "epoch": 21.11, "grad_norm": 3.2389378547668457, "learning_rate": 7.888721804511279e-06, "loss": 0.3118, "step": 14040 }, { "epoch": 21.13, "grad_norm": 3.7250821590423584, "learning_rate": 7.887218045112782e-06, "loss": 0.337, "step": 14050 }, { "epoch": 21.14, "grad_norm": 8.08519458770752, "learning_rate": 7.885714285714286e-06, "loss": 0.3126, "step": 14060 }, { "epoch": 21.16, "grad_norm": 5.902373313903809, "learning_rate": 7.88421052631579e-06, "loss": 0.3088, "step": 14070 }, { "epoch": 21.17, "grad_norm": 4.858761787414551, "learning_rate": 7.882706766917294e-06, "loss": 0.3258, "step": 14080 }, { "epoch": 21.19, "grad_norm": 6.120957374572754, "learning_rate": 7.881203007518798e-06, "loss": 0.2718, "step": 14090 }, { "epoch": 21.2, "grad_norm": 3.7284491062164307, "learning_rate": 7.879699248120301e-06, "loss": 0.3521, "step": 14100 }, { "epoch": 21.22, "grad_norm": 6.68389892578125, "learning_rate": 7.878195488721805e-06, "loss": 0.345, "step": 14110 }, { "epoch": 21.23, "grad_norm": 6.401340484619141, "learning_rate": 7.876691729323309e-06, "loss": 0.302, "step": 14120 }, { "epoch": 21.25, "grad_norm": 3.490999698638916, "learning_rate": 7.875187969924812e-06, "loss": 0.3658, "step": 14130 }, { "epoch": 21.26, "grad_norm": 7.066733360290527, "learning_rate": 7.873684210526317e-06, "loss": 0.3089, "step": 14140 }, { "epoch": 21.28, "grad_norm": 5.916280269622803, "learning_rate": 7.872180451127819e-06, "loss": 0.2996, "step": 14150 }, { "epoch": 21.29, "grad_norm": 5.348523139953613, "learning_rate": 7.870676691729324e-06, "loss": 0.3389, "step": 14160 }, { "epoch": 21.31, "grad_norm": 5.067401885986328, "learning_rate": 7.869172932330828e-06, "loss": 0.334, "step": 14170 }, { "epoch": 21.32, "grad_norm": 4.67525053024292, "learning_rate": 7.867669172932331e-06, "loss": 0.3098, "step": 14180 }, { "epoch": 21.34, "grad_norm": 7.282690525054932, "learning_rate": 7.866165413533835e-06, "loss": 0.3779, "step": 14190 }, { "epoch": 21.35, "grad_norm": 6.01992654800415, "learning_rate": 7.86466165413534e-06, "loss": 0.2547, "step": 14200 }, { "epoch": 21.37, "grad_norm": 3.902848720550537, "learning_rate": 7.863157894736842e-06, "loss": 0.2964, "step": 14210 }, { "epoch": 21.38, "grad_norm": 5.350156784057617, "learning_rate": 7.861654135338347e-06, "loss": 0.325, "step": 14220 }, { "epoch": 21.4, "grad_norm": 6.831077575683594, "learning_rate": 7.86015037593985e-06, "loss": 0.3629, "step": 14230 }, { "epoch": 21.41, "grad_norm": 6.829043865203857, "learning_rate": 7.858646616541354e-06, "loss": 0.364, "step": 14240 }, { "epoch": 21.43, "grad_norm": 4.683376789093018, "learning_rate": 7.857142857142858e-06, "loss": 0.3197, "step": 14250 }, { "epoch": 21.44, "grad_norm": 5.582406520843506, "learning_rate": 7.855639097744361e-06, "loss": 0.2911, "step": 14260 }, { "epoch": 21.46, "grad_norm": 3.7415030002593994, "learning_rate": 7.854135338345865e-06, "loss": 0.321, "step": 14270 }, { "epoch": 21.47, "grad_norm": 5.881400108337402, "learning_rate": 7.85263157894737e-06, "loss": 0.2524, "step": 14280 }, { "epoch": 21.49, "grad_norm": 8.578994750976562, "learning_rate": 7.851127819548873e-06, "loss": 0.2918, "step": 14290 }, { "epoch": 21.5, "grad_norm": 8.9381742477417, "learning_rate": 7.849624060150377e-06, "loss": 0.4393, "step": 14300 }, { "epoch": 21.52, "grad_norm": 4.9871602058410645, "learning_rate": 7.84812030075188e-06, "loss": 0.3624, "step": 14310 }, { "epoch": 21.53, "grad_norm": 6.881855487823486, "learning_rate": 7.846616541353384e-06, "loss": 0.3561, "step": 14320 }, { "epoch": 21.55, "grad_norm": 4.248598575592041, "learning_rate": 7.845112781954888e-06, "loss": 0.2925, "step": 14330 }, { "epoch": 21.56, "grad_norm": 3.864591121673584, "learning_rate": 7.843609022556393e-06, "loss": 0.327, "step": 14340 }, { "epoch": 21.58, "grad_norm": 6.71173095703125, "learning_rate": 7.842105263157895e-06, "loss": 0.3397, "step": 14350 }, { "epoch": 21.59, "grad_norm": 3.9306325912475586, "learning_rate": 7.8406015037594e-06, "loss": 0.2788, "step": 14360 }, { "epoch": 21.61, "grad_norm": 5.838838577270508, "learning_rate": 7.839097744360903e-06, "loss": 0.3116, "step": 14370 }, { "epoch": 21.62, "grad_norm": 13.80683708190918, "learning_rate": 7.837593984962407e-06, "loss": 0.2882, "step": 14380 }, { "epoch": 21.64, "grad_norm": 5.687681198120117, "learning_rate": 7.83609022556391e-06, "loss": 0.3521, "step": 14390 }, { "epoch": 21.65, "grad_norm": 6.886200904846191, "learning_rate": 7.834586466165414e-06, "loss": 0.3544, "step": 14400 }, { "epoch": 21.67, "grad_norm": 7.935529708862305, "learning_rate": 7.833082706766917e-06, "loss": 0.3214, "step": 14410 }, { "epoch": 21.68, "grad_norm": 8.129573822021484, "learning_rate": 7.831578947368421e-06, "loss": 0.3341, "step": 14420 }, { "epoch": 21.7, "grad_norm": 6.135876178741455, "learning_rate": 7.830075187969926e-06, "loss": 0.2829, "step": 14430 }, { "epoch": 21.71, "grad_norm": 8.596725463867188, "learning_rate": 7.828571428571428e-06, "loss": 0.3856, "step": 14440 }, { "epoch": 21.73, "grad_norm": 3.492475748062134, "learning_rate": 7.827067669172933e-06, "loss": 0.4355, "step": 14450 }, { "epoch": 21.74, "grad_norm": 8.090415000915527, "learning_rate": 7.825563909774437e-06, "loss": 0.2981, "step": 14460 }, { "epoch": 21.76, "grad_norm": 6.346333026885986, "learning_rate": 7.82406015037594e-06, "loss": 0.3715, "step": 14470 }, { "epoch": 21.77, "grad_norm": 4.496158123016357, "learning_rate": 7.822556390977444e-06, "loss": 0.3275, "step": 14480 }, { "epoch": 21.79, "grad_norm": 5.410872459411621, "learning_rate": 7.821052631578949e-06, "loss": 0.3401, "step": 14490 }, { "epoch": 21.8, "grad_norm": 6.038545608520508, "learning_rate": 7.81954887218045e-06, "loss": 0.374, "step": 14500 }, { "epoch": 21.82, "grad_norm": 6.986004829406738, "learning_rate": 7.818045112781956e-06, "loss": 0.3294, "step": 14510 }, { "epoch": 21.83, "grad_norm": 6.893804550170898, "learning_rate": 7.81654135338346e-06, "loss": 0.3189, "step": 14520 }, { "epoch": 21.85, "grad_norm": 3.2566752433776855, "learning_rate": 7.815037593984963e-06, "loss": 0.3837, "step": 14530 }, { "epoch": 21.86, "grad_norm": 18.923295974731445, "learning_rate": 7.813533834586466e-06, "loss": 0.3319, "step": 14540 }, { "epoch": 21.88, "grad_norm": 4.527941703796387, "learning_rate": 7.81203007518797e-06, "loss": 0.2917, "step": 14550 }, { "epoch": 21.89, "grad_norm": 4.466981410980225, "learning_rate": 7.810526315789474e-06, "loss": 0.3239, "step": 14560 }, { "epoch": 21.91, "grad_norm": 7.884292125701904, "learning_rate": 7.809022556390979e-06, "loss": 0.3068, "step": 14570 }, { "epoch": 21.92, "grad_norm": 5.265153408050537, "learning_rate": 7.807518796992482e-06, "loss": 0.3472, "step": 14580 }, { "epoch": 21.94, "grad_norm": 5.18987512588501, "learning_rate": 7.806015037593986e-06, "loss": 0.3452, "step": 14590 }, { "epoch": 21.95, "grad_norm": 2.9694736003875732, "learning_rate": 7.80451127819549e-06, "loss": 0.3544, "step": 14600 }, { "epoch": 21.97, "grad_norm": 5.018842697143555, "learning_rate": 7.803007518796993e-06, "loss": 0.2905, "step": 14610 }, { "epoch": 21.98, "grad_norm": 7.495065212249756, "learning_rate": 7.801503759398496e-06, "loss": 0.3675, "step": 14620 }, { "epoch": 22.0, "grad_norm": 4.347672939300537, "learning_rate": 7.800000000000002e-06, "loss": 0.3329, "step": 14630 }, { "epoch": 22.0, "eval_accuracy": 0.9333, "eval_loss": 0.272014319896698, "eval_runtime": 84.1987, "eval_samples_per_second": 118.767, "eval_steps_per_second": 0.475, "step": 14630 }, { "epoch": 22.02, "grad_norm": 6.415873050689697, "learning_rate": 7.798496240601503e-06, "loss": 0.3666, "step": 14640 }, { "epoch": 22.03, "grad_norm": 3.398313522338867, "learning_rate": 7.796992481203009e-06, "loss": 0.2848, "step": 14650 }, { "epoch": 22.05, "grad_norm": 6.848686218261719, "learning_rate": 7.795488721804512e-06, "loss": 0.3241, "step": 14660 }, { "epoch": 22.06, "grad_norm": 4.208133697509766, "learning_rate": 7.793984962406016e-06, "loss": 0.3478, "step": 14670 }, { "epoch": 22.08, "grad_norm": 7.1864213943481445, "learning_rate": 7.792481203007519e-06, "loss": 0.3515, "step": 14680 }, { "epoch": 22.09, "grad_norm": 6.161532878875732, "learning_rate": 7.790977443609024e-06, "loss": 0.3254, "step": 14690 }, { "epoch": 22.11, "grad_norm": 5.770120143890381, "learning_rate": 7.789473684210526e-06, "loss": 0.3253, "step": 14700 }, { "epoch": 22.12, "grad_norm": 6.7618021965026855, "learning_rate": 7.787969924812031e-06, "loss": 0.2816, "step": 14710 }, { "epoch": 22.14, "grad_norm": 8.137096405029297, "learning_rate": 7.786466165413535e-06, "loss": 0.3144, "step": 14720 }, { "epoch": 22.15, "grad_norm": 10.209376335144043, "learning_rate": 7.784962406015038e-06, "loss": 0.3009, "step": 14730 }, { "epoch": 22.17, "grad_norm": 4.7329792976379395, "learning_rate": 7.783458646616542e-06, "loss": 0.2923, "step": 14740 }, { "epoch": 22.18, "grad_norm": 7.629988670349121, "learning_rate": 7.781954887218045e-06, "loss": 0.2598, "step": 14750 }, { "epoch": 22.2, "grad_norm": 3.4664206504821777, "learning_rate": 7.780451127819549e-06, "loss": 0.2701, "step": 14760 }, { "epoch": 22.21, "grad_norm": 7.403353691101074, "learning_rate": 7.778947368421054e-06, "loss": 0.2668, "step": 14770 }, { "epoch": 22.23, "grad_norm": 7.961404800415039, "learning_rate": 7.777443609022558e-06, "loss": 0.2531, "step": 14780 }, { "epoch": 22.24, "grad_norm": 6.001272201538086, "learning_rate": 7.775939849624061e-06, "loss": 0.3847, "step": 14790 }, { "epoch": 22.26, "grad_norm": 9.084465026855469, "learning_rate": 7.774436090225565e-06, "loss": 0.2646, "step": 14800 }, { "epoch": 22.27, "grad_norm": 7.2393012046813965, "learning_rate": 7.772932330827068e-06, "loss": 0.3203, "step": 14810 }, { "epoch": 22.29, "grad_norm": 8.683479309082031, "learning_rate": 7.771428571428572e-06, "loss": 0.285, "step": 14820 }, { "epoch": 22.3, "grad_norm": 5.802537441253662, "learning_rate": 7.769924812030077e-06, "loss": 0.3638, "step": 14830 }, { "epoch": 22.32, "grad_norm": 2.3165462017059326, "learning_rate": 7.768421052631579e-06, "loss": 0.3051, "step": 14840 }, { "epoch": 22.33, "grad_norm": 7.184248924255371, "learning_rate": 7.766917293233084e-06, "loss": 0.3008, "step": 14850 }, { "epoch": 22.35, "grad_norm": 4.384416580200195, "learning_rate": 7.765413533834588e-06, "loss": 0.3827, "step": 14860 }, { "epoch": 22.36, "grad_norm": 4.507965087890625, "learning_rate": 7.763909774436091e-06, "loss": 0.2811, "step": 14870 }, { "epoch": 22.38, "grad_norm": 6.374339580535889, "learning_rate": 7.762406015037595e-06, "loss": 0.2874, "step": 14880 }, { "epoch": 22.39, "grad_norm": 6.437895774841309, "learning_rate": 7.760902255639098e-06, "loss": 0.3101, "step": 14890 }, { "epoch": 22.41, "grad_norm": 7.283292770385742, "learning_rate": 7.759398496240602e-06, "loss": 0.3169, "step": 14900 }, { "epoch": 22.42, "grad_norm": 7.374083042144775, "learning_rate": 7.757894736842105e-06, "loss": 0.334, "step": 14910 }, { "epoch": 22.44, "grad_norm": 8.147022247314453, "learning_rate": 7.75639097744361e-06, "loss": 0.382, "step": 14920 }, { "epoch": 22.45, "grad_norm": 4.416728496551514, "learning_rate": 7.754887218045112e-06, "loss": 0.329, "step": 14930 }, { "epoch": 22.47, "grad_norm": 4.376534938812256, "learning_rate": 7.753383458646617e-06, "loss": 0.4032, "step": 14940 }, { "epoch": 22.48, "grad_norm": 9.247624397277832, "learning_rate": 7.751879699248121e-06, "loss": 0.3748, "step": 14950 }, { "epoch": 22.5, "grad_norm": 6.153301239013672, "learning_rate": 7.750375939849624e-06, "loss": 0.295, "step": 14960 }, { "epoch": 22.51, "grad_norm": 9.856022834777832, "learning_rate": 7.748872180451128e-06, "loss": 0.3033, "step": 14970 }, { "epoch": 22.53, "grad_norm": 6.767205715179443, "learning_rate": 7.747368421052631e-06, "loss": 0.3249, "step": 14980 }, { "epoch": 22.54, "grad_norm": 7.620680809020996, "learning_rate": 7.745864661654135e-06, "loss": 0.3775, "step": 14990 }, { "epoch": 22.56, "grad_norm": 4.250925064086914, "learning_rate": 7.74436090225564e-06, "loss": 0.3464, "step": 15000 }, { "epoch": 22.57, "grad_norm": 5.965568542480469, "learning_rate": 7.742857142857144e-06, "loss": 0.287, "step": 15010 }, { "epoch": 22.59, "grad_norm": 7.024303913116455, "learning_rate": 7.741353383458647e-06, "loss": 0.3145, "step": 15020 }, { "epoch": 22.6, "grad_norm": 5.691739559173584, "learning_rate": 7.73984962406015e-06, "loss": 0.2593, "step": 15030 }, { "epoch": 22.62, "grad_norm": 4.950546741485596, "learning_rate": 7.738345864661654e-06, "loss": 0.2312, "step": 15040 }, { "epoch": 22.63, "grad_norm": 8.352819442749023, "learning_rate": 7.736842105263158e-06, "loss": 0.3138, "step": 15050 }, { "epoch": 22.65, "grad_norm": 7.059927940368652, "learning_rate": 7.735338345864663e-06, "loss": 0.3281, "step": 15060 }, { "epoch": 22.66, "grad_norm": 8.698114395141602, "learning_rate": 7.733834586466165e-06, "loss": 0.3882, "step": 15070 }, { "epoch": 22.68, "grad_norm": 9.284893989562988, "learning_rate": 7.73233082706767e-06, "loss": 0.3517, "step": 15080 }, { "epoch": 22.69, "grad_norm": 10.998963356018066, "learning_rate": 7.730827067669174e-06, "loss": 0.3258, "step": 15090 }, { "epoch": 22.71, "grad_norm": 6.2202277183532715, "learning_rate": 7.729323308270677e-06, "loss": 0.3219, "step": 15100 }, { "epoch": 22.72, "grad_norm": 5.044315814971924, "learning_rate": 7.72781954887218e-06, "loss": 0.3523, "step": 15110 }, { "epoch": 22.74, "grad_norm": 7.376669883728027, "learning_rate": 7.726315789473686e-06, "loss": 0.3644, "step": 15120 }, { "epoch": 22.75, "grad_norm": 4.171261787414551, "learning_rate": 7.724812030075188e-06, "loss": 0.356, "step": 15130 }, { "epoch": 22.77, "grad_norm": 4.358337879180908, "learning_rate": 7.723308270676693e-06, "loss": 0.3316, "step": 15140 }, { "epoch": 22.78, "grad_norm": 7.347938060760498, "learning_rate": 7.721804511278196e-06, "loss": 0.3134, "step": 15150 }, { "epoch": 22.8, "grad_norm": 6.51662540435791, "learning_rate": 7.7203007518797e-06, "loss": 0.3304, "step": 15160 }, { "epoch": 22.81, "grad_norm": 4.554670810699463, "learning_rate": 7.718796992481203e-06, "loss": 0.3568, "step": 15170 }, { "epoch": 22.83, "grad_norm": 5.587891101837158, "learning_rate": 7.717293233082707e-06, "loss": 0.2347, "step": 15180 }, { "epoch": 22.84, "grad_norm": 4.167472839355469, "learning_rate": 7.71578947368421e-06, "loss": 0.2926, "step": 15190 }, { "epoch": 22.86, "grad_norm": 7.967960834503174, "learning_rate": 7.714285714285716e-06, "loss": 0.3413, "step": 15200 }, { "epoch": 22.87, "grad_norm": 4.611883640289307, "learning_rate": 7.71278195488722e-06, "loss": 0.328, "step": 15210 }, { "epoch": 22.89, "grad_norm": 6.755552291870117, "learning_rate": 7.711278195488723e-06, "loss": 0.3418, "step": 15220 }, { "epoch": 22.9, "grad_norm": 7.961472034454346, "learning_rate": 7.709774436090226e-06, "loss": 0.3717, "step": 15230 }, { "epoch": 22.92, "grad_norm": 6.6936187744140625, "learning_rate": 7.70827067669173e-06, "loss": 0.318, "step": 15240 }, { "epoch": 22.93, "grad_norm": 5.8512444496154785, "learning_rate": 7.706766917293233e-06, "loss": 0.3481, "step": 15250 }, { "epoch": 22.95, "grad_norm": 6.354591369628906, "learning_rate": 7.705263157894738e-06, "loss": 0.3909, "step": 15260 }, { "epoch": 22.96, "grad_norm": 5.42380428314209, "learning_rate": 7.70375939849624e-06, "loss": 0.34, "step": 15270 }, { "epoch": 22.98, "grad_norm": 9.55280876159668, "learning_rate": 7.702255639097746e-06, "loss": 0.3557, "step": 15280 }, { "epoch": 22.99, "grad_norm": 5.095829963684082, "learning_rate": 7.700751879699249e-06, "loss": 0.4089, "step": 15290 }, { "epoch": 23.0, "eval_accuracy": 0.9325, "eval_loss": 0.27643856406211853, "eval_runtime": 84.9045, "eval_samples_per_second": 117.779, "eval_steps_per_second": 0.471, "step": 15295 }, { "epoch": 23.01, "grad_norm": 5.0213518142700195, "learning_rate": 7.699248120300753e-06, "loss": 0.3049, "step": 15300 }, { "epoch": 23.02, "grad_norm": 7.806211948394775, "learning_rate": 7.697744360902256e-06, "loss": 0.3641, "step": 15310 }, { "epoch": 23.04, "grad_norm": 3.328399896621704, "learning_rate": 7.696240601503761e-06, "loss": 0.3035, "step": 15320 }, { "epoch": 23.05, "grad_norm": 4.0139875411987305, "learning_rate": 7.694736842105263e-06, "loss": 0.3657, "step": 15330 }, { "epoch": 23.07, "grad_norm": 5.296818256378174, "learning_rate": 7.693233082706768e-06, "loss": 0.3067, "step": 15340 }, { "epoch": 23.08, "grad_norm": 6.957002639770508, "learning_rate": 7.691729323308272e-06, "loss": 0.3746, "step": 15350 }, { "epoch": 23.1, "grad_norm": 5.104499816894531, "learning_rate": 7.690225563909775e-06, "loss": 0.3475, "step": 15360 }, { "epoch": 23.11, "grad_norm": 4.252979755401611, "learning_rate": 7.688721804511279e-06, "loss": 0.3913, "step": 15370 }, { "epoch": 23.13, "grad_norm": 6.049491882324219, "learning_rate": 7.687218045112782e-06, "loss": 0.2903, "step": 15380 }, { "epoch": 23.14, "grad_norm": 5.293207168579102, "learning_rate": 7.685714285714286e-06, "loss": 0.2508, "step": 15390 }, { "epoch": 23.16, "grad_norm": 5.338700771331787, "learning_rate": 7.68421052631579e-06, "loss": 0.2695, "step": 15400 }, { "epoch": 23.17, "grad_norm": 3.670703172683716, "learning_rate": 7.682706766917295e-06, "loss": 0.3534, "step": 15410 }, { "epoch": 23.19, "grad_norm": 5.252980709075928, "learning_rate": 7.681203007518796e-06, "loss": 0.3555, "step": 15420 }, { "epoch": 23.2, "grad_norm": 8.209909439086914, "learning_rate": 7.679699248120302e-06, "loss": 0.3796, "step": 15430 }, { "epoch": 23.22, "grad_norm": 4.704070091247559, "learning_rate": 7.678195488721805e-06, "loss": 0.3024, "step": 15440 }, { "epoch": 23.23, "grad_norm": 5.453511714935303, "learning_rate": 7.676691729323309e-06, "loss": 0.2894, "step": 15450 }, { "epoch": 23.25, "grad_norm": 7.019674777984619, "learning_rate": 7.675187969924812e-06, "loss": 0.3011, "step": 15460 }, { "epoch": 23.26, "grad_norm": 16.94940185546875, "learning_rate": 7.673684210526316e-06, "loss": 0.3526, "step": 15470 }, { "epoch": 23.28, "grad_norm": 5.086287498474121, "learning_rate": 7.67218045112782e-06, "loss": 0.3219, "step": 15480 }, { "epoch": 23.29, "grad_norm": 3.3719613552093506, "learning_rate": 7.670676691729325e-06, "loss": 0.2706, "step": 15490 }, { "epoch": 23.31, "grad_norm": 6.172971248626709, "learning_rate": 7.669172932330828e-06, "loss": 0.2633, "step": 15500 }, { "epoch": 23.32, "grad_norm": 6.304495811462402, "learning_rate": 7.667669172932332e-06, "loss": 0.3932, "step": 15510 }, { "epoch": 23.34, "grad_norm": 7.454652309417725, "learning_rate": 7.666165413533835e-06, "loss": 0.3278, "step": 15520 }, { "epoch": 23.35, "grad_norm": 4.993924617767334, "learning_rate": 7.664661654135339e-06, "loss": 0.3158, "step": 15530 }, { "epoch": 23.37, "grad_norm": 8.269258499145508, "learning_rate": 7.663157894736842e-06, "loss": 0.2965, "step": 15540 }, { "epoch": 23.38, "grad_norm": 9.472188949584961, "learning_rate": 7.661654135338347e-06, "loss": 0.3321, "step": 15550 }, { "epoch": 23.4, "grad_norm": 4.879781246185303, "learning_rate": 7.66015037593985e-06, "loss": 0.3012, "step": 15560 }, { "epoch": 23.41, "grad_norm": 3.718254566192627, "learning_rate": 7.658646616541354e-06, "loss": 0.3498, "step": 15570 }, { "epoch": 23.43, "grad_norm": 3.531419038772583, "learning_rate": 7.657142857142858e-06, "loss": 0.2622, "step": 15580 }, { "epoch": 23.44, "grad_norm": 4.4930949211120605, "learning_rate": 7.655639097744361e-06, "loss": 0.2778, "step": 15590 }, { "epoch": 23.46, "grad_norm": 4.008451461791992, "learning_rate": 7.654135338345865e-06, "loss": 0.3077, "step": 15600 }, { "epoch": 23.47, "grad_norm": 6.081947326660156, "learning_rate": 7.65263157894737e-06, "loss": 0.2566, "step": 15610 }, { "epoch": 23.49, "grad_norm": 3.446821689605713, "learning_rate": 7.651127819548872e-06, "loss": 0.3096, "step": 15620 }, { "epoch": 23.5, "grad_norm": 10.16897201538086, "learning_rate": 7.649624060150377e-06, "loss": 0.3254, "step": 15630 }, { "epoch": 23.52, "grad_norm": 1.691789150238037, "learning_rate": 7.64812030075188e-06, "loss": 0.3094, "step": 15640 }, { "epoch": 23.53, "grad_norm": 4.911680698394775, "learning_rate": 7.646616541353384e-06, "loss": 0.2983, "step": 15650 }, { "epoch": 23.55, "grad_norm": 6.379064559936523, "learning_rate": 7.645112781954888e-06, "loss": 0.2172, "step": 15660 }, { "epoch": 23.56, "grad_norm": 4.125355243682861, "learning_rate": 7.643609022556391e-06, "loss": 0.2838, "step": 15670 }, { "epoch": 23.58, "grad_norm": 6.153583526611328, "learning_rate": 7.642105263157895e-06, "loss": 0.3016, "step": 15680 }, { "epoch": 23.59, "grad_norm": 7.581343173980713, "learning_rate": 7.6406015037594e-06, "loss": 0.3684, "step": 15690 }, { "epoch": 23.61, "grad_norm": 8.388538360595703, "learning_rate": 7.639097744360904e-06, "loss": 0.3272, "step": 15700 }, { "epoch": 23.62, "grad_norm": 7.5239362716674805, "learning_rate": 7.637593984962407e-06, "loss": 0.3148, "step": 15710 }, { "epoch": 23.64, "grad_norm": 7.033330917358398, "learning_rate": 7.63609022556391e-06, "loss": 0.3008, "step": 15720 }, { "epoch": 23.65, "grad_norm": 5.851361274719238, "learning_rate": 7.634586466165414e-06, "loss": 0.3205, "step": 15730 }, { "epoch": 23.67, "grad_norm": 2.9884681701660156, "learning_rate": 7.633082706766918e-06, "loss": 0.2622, "step": 15740 }, { "epoch": 23.68, "grad_norm": 7.287815570831299, "learning_rate": 7.631578947368423e-06, "loss": 0.3984, "step": 15750 }, { "epoch": 23.7, "grad_norm": 4.209038734436035, "learning_rate": 7.630075187969925e-06, "loss": 0.2849, "step": 15760 }, { "epoch": 23.71, "grad_norm": 3.485328197479248, "learning_rate": 7.62857142857143e-06, "loss": 0.3395, "step": 15770 }, { "epoch": 23.73, "grad_norm": 4.945652484893799, "learning_rate": 7.6270676691729325e-06, "loss": 0.3045, "step": 15780 }, { "epoch": 23.74, "grad_norm": 9.309234619140625, "learning_rate": 7.625563909774437e-06, "loss": 0.3023, "step": 15790 }, { "epoch": 23.76, "grad_norm": 4.634711265563965, "learning_rate": 7.62406015037594e-06, "loss": 0.371, "step": 15800 }, { "epoch": 23.77, "grad_norm": 5.632133483886719, "learning_rate": 7.622556390977445e-06, "loss": 0.3813, "step": 15810 }, { "epoch": 23.79, "grad_norm": 3.4830102920532227, "learning_rate": 7.621052631578948e-06, "loss": 0.2663, "step": 15820 }, { "epoch": 23.8, "grad_norm": 4.266145706176758, "learning_rate": 7.619548872180453e-06, "loss": 0.3295, "step": 15830 }, { "epoch": 23.82, "grad_norm": 6.466432571411133, "learning_rate": 7.618045112781955e-06, "loss": 0.3109, "step": 15840 }, { "epoch": 23.83, "grad_norm": 7.617573261260986, "learning_rate": 7.61654135338346e-06, "loss": 0.3228, "step": 15850 }, { "epoch": 23.85, "grad_norm": 6.581226825714111, "learning_rate": 7.615037593984963e-06, "loss": 0.293, "step": 15860 }, { "epoch": 23.86, "grad_norm": 3.765594720840454, "learning_rate": 7.6135338345864676e-06, "loss": 0.2756, "step": 15870 }, { "epoch": 23.88, "grad_norm": 5.244023323059082, "learning_rate": 7.61203007518797e-06, "loss": 0.3237, "step": 15880 }, { "epoch": 23.89, "grad_norm": 4.341951370239258, "learning_rate": 7.610526315789474e-06, "loss": 0.3115, "step": 15890 }, { "epoch": 23.91, "grad_norm": 7.612969875335693, "learning_rate": 7.609022556390978e-06, "loss": 0.3097, "step": 15900 }, { "epoch": 23.92, "grad_norm": 4.730962753295898, "learning_rate": 7.607518796992482e-06, "loss": 0.381, "step": 15910 }, { "epoch": 23.94, "grad_norm": 6.929048538208008, "learning_rate": 7.606015037593986e-06, "loss": 0.4364, "step": 15920 }, { "epoch": 23.95, "grad_norm": 8.269315719604492, "learning_rate": 7.604511278195489e-06, "loss": 0.3312, "step": 15930 }, { "epoch": 23.97, "grad_norm": 5.69378662109375, "learning_rate": 7.603007518796993e-06, "loss": 0.3105, "step": 15940 }, { "epoch": 23.98, "grad_norm": 2.1282405853271484, "learning_rate": 7.6015037593984966e-06, "loss": 0.3473, "step": 15950 }, { "epoch": 24.0, "grad_norm": 64.98310089111328, "learning_rate": 7.600000000000001e-06, "loss": 0.3196, "step": 15960 }, { "epoch": 24.0, "eval_accuracy": 0.9305, "eval_loss": 0.27353373169898987, "eval_runtime": 84.6965, "eval_samples_per_second": 118.069, "eval_steps_per_second": 0.472, "step": 15960 }, { "epoch": 24.02, "grad_norm": 6.7349653244018555, "learning_rate": 7.598496240601504e-06, "loss": 0.2695, "step": 15970 }, { "epoch": 24.03, "grad_norm": 3.1804494857788086, "learning_rate": 7.596992481203008e-06, "loss": 0.2994, "step": 15980 }, { "epoch": 24.05, "grad_norm": 5.326109886169434, "learning_rate": 7.5954887218045115e-06, "loss": 0.3565, "step": 15990 }, { "epoch": 24.06, "grad_norm": 5.608499526977539, "learning_rate": 7.593984962406016e-06, "loss": 0.2904, "step": 16000 }, { "epoch": 24.08, "grad_norm": 3.598764181137085, "learning_rate": 7.592481203007519e-06, "loss": 0.2959, "step": 16010 }, { "epoch": 24.09, "grad_norm": 11.522496223449707, "learning_rate": 7.590977443609024e-06, "loss": 0.2509, "step": 16020 }, { "epoch": 24.11, "grad_norm": 5.142250061035156, "learning_rate": 7.589473684210526e-06, "loss": 0.2768, "step": 16030 }, { "epoch": 24.12, "grad_norm": 4.173882484436035, "learning_rate": 7.587969924812031e-06, "loss": 0.3249, "step": 16040 }, { "epoch": 24.14, "grad_norm": 5.356581211090088, "learning_rate": 7.586466165413534e-06, "loss": 0.342, "step": 16050 }, { "epoch": 24.15, "grad_norm": 5.133426666259766, "learning_rate": 7.584962406015039e-06, "loss": 0.372, "step": 16060 }, { "epoch": 24.17, "grad_norm": 6.920117378234863, "learning_rate": 7.583458646616541e-06, "loss": 0.2469, "step": 16070 }, { "epoch": 24.18, "grad_norm": 5.305706024169922, "learning_rate": 7.581954887218046e-06, "loss": 0.3355, "step": 16080 }, { "epoch": 24.2, "grad_norm": 5.647949695587158, "learning_rate": 7.580451127819549e-06, "loss": 0.3394, "step": 16090 }, { "epoch": 24.21, "grad_norm": 8.31951904296875, "learning_rate": 7.578947368421054e-06, "loss": 0.2804, "step": 16100 }, { "epoch": 24.23, "grad_norm": 7.295516490936279, "learning_rate": 7.577443609022557e-06, "loss": 0.2522, "step": 16110 }, { "epoch": 24.24, "grad_norm": 4.9929022789001465, "learning_rate": 7.575939849624061e-06, "loss": 0.2747, "step": 16120 }, { "epoch": 24.26, "grad_norm": 4.699337482452393, "learning_rate": 7.574436090225564e-06, "loss": 0.2968, "step": 16130 }, { "epoch": 24.27, "grad_norm": 8.267194747924805, "learning_rate": 7.5729323308270685e-06, "loss": 0.396, "step": 16140 }, { "epoch": 24.29, "grad_norm": 5.007888317108154, "learning_rate": 7.571428571428572e-06, "loss": 0.2286, "step": 16150 }, { "epoch": 24.3, "grad_norm": 4.469249725341797, "learning_rate": 7.569924812030076e-06, "loss": 0.2907, "step": 16160 }, { "epoch": 24.32, "grad_norm": 5.973073482513428, "learning_rate": 7.568421052631579e-06, "loss": 0.2583, "step": 16170 }, { "epoch": 24.33, "grad_norm": 7.025624752044678, "learning_rate": 7.5669172932330834e-06, "loss": 0.332, "step": 16180 }, { "epoch": 24.35, "grad_norm": 6.508031368255615, "learning_rate": 7.565413533834587e-06, "loss": 0.3216, "step": 16190 }, { "epoch": 24.36, "grad_norm": 5.830172538757324, "learning_rate": 7.563909774436091e-06, "loss": 0.2935, "step": 16200 }, { "epoch": 24.38, "grad_norm": 8.18553638458252, "learning_rate": 7.562406015037595e-06, "loss": 0.309, "step": 16210 }, { "epoch": 24.39, "grad_norm": 19.962448120117188, "learning_rate": 7.560902255639098e-06, "loss": 0.3435, "step": 16220 }, { "epoch": 24.41, "grad_norm": 4.706577301025391, "learning_rate": 7.559398496240602e-06, "loss": 0.3436, "step": 16230 }, { "epoch": 24.42, "grad_norm": 5.416121959686279, "learning_rate": 7.557894736842106e-06, "loss": 0.3253, "step": 16240 }, { "epoch": 24.44, "grad_norm": 7.063791275024414, "learning_rate": 7.55639097744361e-06, "loss": 0.3762, "step": 16250 }, { "epoch": 24.45, "grad_norm": 4.680507659912109, "learning_rate": 7.554887218045114e-06, "loss": 0.2883, "step": 16260 }, { "epoch": 24.47, "grad_norm": 8.150341987609863, "learning_rate": 7.553383458646617e-06, "loss": 0.3714, "step": 16270 }, { "epoch": 24.48, "grad_norm": 3.5707345008850098, "learning_rate": 7.551879699248121e-06, "loss": 0.2659, "step": 16280 }, { "epoch": 24.5, "grad_norm": 7.740113735198975, "learning_rate": 7.550375939849625e-06, "loss": 0.3134, "step": 16290 }, { "epoch": 24.51, "grad_norm": 28.682573318481445, "learning_rate": 7.548872180451129e-06, "loss": 0.3294, "step": 16300 }, { "epoch": 24.53, "grad_norm": 5.2670793533325195, "learning_rate": 7.547368421052632e-06, "loss": 0.272, "step": 16310 }, { "epoch": 24.54, "grad_norm": 6.7343339920043945, "learning_rate": 7.545864661654136e-06, "loss": 0.2836, "step": 16320 }, { "epoch": 24.56, "grad_norm": 5.1457672119140625, "learning_rate": 7.54436090225564e-06, "loss": 0.222, "step": 16330 }, { "epoch": 24.57, "grad_norm": 5.616130828857422, "learning_rate": 7.542857142857144e-06, "loss": 0.2684, "step": 16340 }, { "epoch": 24.59, "grad_norm": 6.380970478057861, "learning_rate": 7.5413533834586475e-06, "loss": 0.3899, "step": 16350 }, { "epoch": 24.6, "grad_norm": 9.467977523803711, "learning_rate": 7.539849624060152e-06, "loss": 0.2699, "step": 16360 }, { "epoch": 24.62, "grad_norm": 5.1410136222839355, "learning_rate": 7.5383458646616545e-06, "loss": 0.2859, "step": 16370 }, { "epoch": 24.63, "grad_norm": 7.608031749725342, "learning_rate": 7.536842105263158e-06, "loss": 0.342, "step": 16380 }, { "epoch": 24.65, "grad_norm": 3.234297513961792, "learning_rate": 7.535338345864662e-06, "loss": 0.3229, "step": 16390 }, { "epoch": 24.66, "grad_norm": 5.070233345031738, "learning_rate": 7.533834586466165e-06, "loss": 0.3494, "step": 16400 }, { "epoch": 24.68, "grad_norm": 5.208074569702148, "learning_rate": 7.5323308270676694e-06, "loss": 0.3288, "step": 16410 }, { "epoch": 24.69, "grad_norm": 3.5855026245117188, "learning_rate": 7.530827067669173e-06, "loss": 0.3241, "step": 16420 }, { "epoch": 24.71, "grad_norm": 5.344216823577881, "learning_rate": 7.529323308270677e-06, "loss": 0.2997, "step": 16430 }, { "epoch": 24.72, "grad_norm": 7.309630393981934, "learning_rate": 7.527819548872181e-06, "loss": 0.304, "step": 16440 }, { "epoch": 24.74, "grad_norm": 4.991174697875977, "learning_rate": 7.526315789473685e-06, "loss": 0.3107, "step": 16450 }, { "epoch": 24.75, "grad_norm": 6.519332408905029, "learning_rate": 7.524812030075188e-06, "loss": 0.2857, "step": 16460 }, { "epoch": 24.77, "grad_norm": 6.599658012390137, "learning_rate": 7.523308270676692e-06, "loss": 0.3334, "step": 16470 }, { "epoch": 24.78, "grad_norm": 6.142018795013428, "learning_rate": 7.521804511278196e-06, "loss": 0.3121, "step": 16480 }, { "epoch": 24.8, "grad_norm": 5.483290195465088, "learning_rate": 7.5203007518797e-06, "loss": 0.3155, "step": 16490 }, { "epoch": 24.81, "grad_norm": 10.318331718444824, "learning_rate": 7.518796992481203e-06, "loss": 0.3272, "step": 16500 }, { "epoch": 24.83, "grad_norm": 4.9072771072387695, "learning_rate": 7.517293233082707e-06, "loss": 0.3807, "step": 16510 }, { "epoch": 24.84, "grad_norm": 5.348799228668213, "learning_rate": 7.515789473684211e-06, "loss": 0.3662, "step": 16520 }, { "epoch": 24.86, "grad_norm": 4.338939189910889, "learning_rate": 7.514285714285715e-06, "loss": 0.3022, "step": 16530 }, { "epoch": 24.87, "grad_norm": 7.069118499755859, "learning_rate": 7.512781954887219e-06, "loss": 0.3111, "step": 16540 }, { "epoch": 24.89, "grad_norm": 7.8967790603637695, "learning_rate": 7.511278195488723e-06, "loss": 0.2683, "step": 16550 }, { "epoch": 24.9, "grad_norm": 6.323410987854004, "learning_rate": 7.509774436090226e-06, "loss": 0.3274, "step": 16560 }, { "epoch": 24.92, "grad_norm": 6.77821683883667, "learning_rate": 7.50827067669173e-06, "loss": 0.2803, "step": 16570 }, { "epoch": 24.93, "grad_norm": 6.189555644989014, "learning_rate": 7.5067669172932335e-06, "loss": 0.3104, "step": 16580 }, { "epoch": 24.95, "grad_norm": 3.698765277862549, "learning_rate": 7.505263157894738e-06, "loss": 0.3238, "step": 16590 }, { "epoch": 24.96, "grad_norm": 5.7407355308532715, "learning_rate": 7.5037593984962405e-06, "loss": 0.3003, "step": 16600 }, { "epoch": 24.98, "grad_norm": 4.214737892150879, "learning_rate": 7.502255639097745e-06, "loss": 0.3016, "step": 16610 }, { "epoch": 24.99, "grad_norm": 6.830847263336182, "learning_rate": 7.5007518796992484e-06, "loss": 0.2982, "step": 16620 }, { "epoch": 25.0, "eval_accuracy": 0.9312, "eval_loss": 0.27709507942199707, "eval_runtime": 84.7229, "eval_samples_per_second": 118.032, "eval_steps_per_second": 0.472, "step": 16625 }, { "epoch": 25.01, "grad_norm": 8.3620023727417, "learning_rate": 7.499248120300753e-06, "loss": 0.2671, "step": 16630 }, { "epoch": 25.02, "grad_norm": 6.188384532928467, "learning_rate": 7.497744360902256e-06, "loss": 0.3315, "step": 16640 }, { "epoch": 25.04, "grad_norm": 4.457509994506836, "learning_rate": 7.496240601503761e-06, "loss": 0.2999, "step": 16650 }, { "epoch": 25.05, "grad_norm": 3.5126192569732666, "learning_rate": 7.494736842105263e-06, "loss": 0.2304, "step": 16660 }, { "epoch": 25.07, "grad_norm": 8.772309303283691, "learning_rate": 7.493233082706768e-06, "loss": 0.3678, "step": 16670 }, { "epoch": 25.08, "grad_norm": 6.1839189529418945, "learning_rate": 7.491729323308271e-06, "loss": 0.3029, "step": 16680 }, { "epoch": 25.1, "grad_norm": 3.8670859336853027, "learning_rate": 7.490225563909776e-06, "loss": 0.2816, "step": 16690 }, { "epoch": 25.11, "grad_norm": 6.036872863769531, "learning_rate": 7.488721804511278e-06, "loss": 0.3167, "step": 16700 }, { "epoch": 25.13, "grad_norm": 5.029440879821777, "learning_rate": 7.487218045112783e-06, "loss": 0.3886, "step": 16710 }, { "epoch": 25.14, "grad_norm": 6.317779541015625, "learning_rate": 7.485714285714286e-06, "loss": 0.3374, "step": 16720 }, { "epoch": 25.16, "grad_norm": 4.726473808288574, "learning_rate": 7.4842105263157905e-06, "loss": 0.2748, "step": 16730 }, { "epoch": 25.17, "grad_norm": 7.509809494018555, "learning_rate": 7.482706766917294e-06, "loss": 0.3124, "step": 16740 }, { "epoch": 25.19, "grad_norm": 6.474162578582764, "learning_rate": 7.481203007518798e-06, "loss": 0.3249, "step": 16750 }, { "epoch": 25.2, "grad_norm": 5.663277626037598, "learning_rate": 7.479699248120301e-06, "loss": 0.3175, "step": 16760 }, { "epoch": 25.22, "grad_norm": 5.958733558654785, "learning_rate": 7.4781954887218055e-06, "loss": 0.2671, "step": 16770 }, { "epoch": 25.23, "grad_norm": 3.8807878494262695, "learning_rate": 7.476691729323309e-06, "loss": 0.2347, "step": 16780 }, { "epoch": 25.25, "grad_norm": 8.403759956359863, "learning_rate": 7.475187969924813e-06, "loss": 0.2834, "step": 16790 }, { "epoch": 25.26, "grad_norm": 4.284714698791504, "learning_rate": 7.473684210526316e-06, "loss": 0.2763, "step": 16800 }, { "epoch": 25.28, "grad_norm": 5.796482563018799, "learning_rate": 7.47218045112782e-06, "loss": 0.3044, "step": 16810 }, { "epoch": 25.29, "grad_norm": 4.833478927612305, "learning_rate": 7.470676691729324e-06, "loss": 0.3536, "step": 16820 }, { "epoch": 25.31, "grad_norm": 6.935545921325684, "learning_rate": 7.469172932330828e-06, "loss": 0.3231, "step": 16830 }, { "epoch": 25.32, "grad_norm": 7.3656229972839355, "learning_rate": 7.467669172932332e-06, "loss": 0.2747, "step": 16840 }, { "epoch": 25.34, "grad_norm": 6.294661998748779, "learning_rate": 7.466165413533836e-06, "loss": 0.3172, "step": 16850 }, { "epoch": 25.35, "grad_norm": 5.849190711975098, "learning_rate": 7.464661654135339e-06, "loss": 0.307, "step": 16860 }, { "epoch": 25.37, "grad_norm": 3.9488296508789062, "learning_rate": 7.463157894736843e-06, "loss": 0.2835, "step": 16870 }, { "epoch": 25.38, "grad_norm": 5.17800760269165, "learning_rate": 7.461654135338347e-06, "loss": 0.3037, "step": 16880 }, { "epoch": 25.4, "grad_norm": 5.4995436668396, "learning_rate": 7.460150375939849e-06, "loss": 0.3726, "step": 16890 }, { "epoch": 25.41, "grad_norm": 5.913205623626709, "learning_rate": 7.458646616541354e-06, "loss": 0.244, "step": 16900 }, { "epoch": 25.43, "grad_norm": 7.40120267868042, "learning_rate": 7.457142857142857e-06, "loss": 0.3658, "step": 16910 }, { "epoch": 25.44, "grad_norm": 6.155134677886963, "learning_rate": 7.455639097744362e-06, "loss": 0.2988, "step": 16920 }, { "epoch": 25.46, "grad_norm": 7.154820919036865, "learning_rate": 7.454135338345865e-06, "loss": 0.2828, "step": 16930 }, { "epoch": 25.47, "grad_norm": 15.181897163391113, "learning_rate": 7.4526315789473695e-06, "loss": 0.2919, "step": 16940 }, { "epoch": 25.49, "grad_norm": 15.616399765014648, "learning_rate": 7.451127819548872e-06, "loss": 0.2602, "step": 16950 }, { "epoch": 25.5, "grad_norm": 3.783785104751587, "learning_rate": 7.4496240601503765e-06, "loss": 0.2663, "step": 16960 }, { "epoch": 25.52, "grad_norm": 3.7820167541503906, "learning_rate": 7.44812030075188e-06, "loss": 0.3069, "step": 16970 }, { "epoch": 25.53, "grad_norm": 22.630653381347656, "learning_rate": 7.4466165413533844e-06, "loss": 0.3701, "step": 16980 }, { "epoch": 25.55, "grad_norm": 7.006454944610596, "learning_rate": 7.445112781954887e-06, "loss": 0.2762, "step": 16990 }, { "epoch": 25.56, "grad_norm": 3.897531747817993, "learning_rate": 7.4436090225563915e-06, "loss": 0.2871, "step": 17000 }, { "epoch": 25.58, "grad_norm": 9.737010955810547, "learning_rate": 7.442105263157895e-06, "loss": 0.3115, "step": 17010 }, { "epoch": 25.59, "grad_norm": 9.285514831542969, "learning_rate": 7.440601503759399e-06, "loss": 0.3061, "step": 17020 }, { "epoch": 25.61, "grad_norm": 6.54390811920166, "learning_rate": 7.439097744360903e-06, "loss": 0.4048, "step": 17030 }, { "epoch": 25.62, "grad_norm": 7.407090663909912, "learning_rate": 7.437593984962406e-06, "loss": 0.335, "step": 17040 }, { "epoch": 25.64, "grad_norm": 5.179807186126709, "learning_rate": 7.43609022556391e-06, "loss": 0.3313, "step": 17050 }, { "epoch": 25.65, "grad_norm": 3.106466770172119, "learning_rate": 7.434586466165414e-06, "loss": 0.3702, "step": 17060 }, { "epoch": 25.67, "grad_norm": 4.030908584594727, "learning_rate": 7.433082706766918e-06, "loss": 0.262, "step": 17070 }, { "epoch": 25.68, "grad_norm": 5.041976451873779, "learning_rate": 7.431578947368422e-06, "loss": 0.3317, "step": 17080 }, { "epoch": 25.7, "grad_norm": 7.29601526260376, "learning_rate": 7.430075187969925e-06, "loss": 0.295, "step": 17090 }, { "epoch": 25.71, "grad_norm": 6.026291370391846, "learning_rate": 7.428571428571429e-06, "loss": 0.3131, "step": 17100 }, { "epoch": 25.73, "grad_norm": 6.111357688903809, "learning_rate": 7.427067669172933e-06, "loss": 0.2887, "step": 17110 }, { "epoch": 25.74, "grad_norm": 6.1451520919799805, "learning_rate": 7.425563909774437e-06, "loss": 0.2974, "step": 17120 }, { "epoch": 25.76, "grad_norm": 4.193663120269775, "learning_rate": 7.424060150375941e-06, "loss": 0.2106, "step": 17130 }, { "epoch": 25.77, "grad_norm": 5.288703441619873, "learning_rate": 7.422556390977444e-06, "loss": 0.2647, "step": 17140 }, { "epoch": 25.79, "grad_norm": 9.568791389465332, "learning_rate": 7.421052631578948e-06, "loss": 0.383, "step": 17150 }, { "epoch": 25.8, "grad_norm": 4.507370471954346, "learning_rate": 7.419548872180452e-06, "loss": 0.3109, "step": 17160 }, { "epoch": 25.82, "grad_norm": 4.298795223236084, "learning_rate": 7.4180451127819555e-06, "loss": 0.3037, "step": 17170 }, { "epoch": 25.83, "grad_norm": 7.694872856140137, "learning_rate": 7.41654135338346e-06, "loss": 0.353, "step": 17180 }, { "epoch": 25.85, "grad_norm": 4.656931400299072, "learning_rate": 7.4150375939849626e-06, "loss": 0.3047, "step": 17190 }, { "epoch": 25.86, "grad_norm": 6.448072910308838, "learning_rate": 7.413533834586467e-06, "loss": 0.3562, "step": 17200 }, { "epoch": 25.88, "grad_norm": 7.515552043914795, "learning_rate": 7.4120300751879705e-06, "loss": 0.3391, "step": 17210 }, { "epoch": 25.89, "grad_norm": 9.326400756835938, "learning_rate": 7.410526315789475e-06, "loss": 0.2554, "step": 17220 }, { "epoch": 25.91, "grad_norm": 4.4977498054504395, "learning_rate": 7.4090225563909775e-06, "loss": 0.3115, "step": 17230 }, { "epoch": 25.92, "grad_norm": 5.859455585479736, "learning_rate": 7.407518796992482e-06, "loss": 0.2923, "step": 17240 }, { "epoch": 25.94, "grad_norm": 3.0939574241638184, "learning_rate": 7.406015037593985e-06, "loss": 0.3387, "step": 17250 }, { "epoch": 25.95, "grad_norm": 6.690404415130615, "learning_rate": 7.40451127819549e-06, "loss": 0.2812, "step": 17260 }, { "epoch": 25.97, "grad_norm": 3.768836736679077, "learning_rate": 7.403007518796993e-06, "loss": 0.2778, "step": 17270 }, { "epoch": 25.98, "grad_norm": 4.476847171783447, "learning_rate": 7.401503759398498e-06, "loss": 0.3684, "step": 17280 }, { "epoch": 26.0, "grad_norm": 0.6643197536468506, "learning_rate": 7.4e-06, "loss": 0.1884, "step": 17290 }, { "epoch": 26.0, "eval_accuracy": 0.9304, "eval_loss": 0.2943102717399597, "eval_runtime": 84.682, "eval_samples_per_second": 118.089, "eval_steps_per_second": 0.472, "step": 17290 }, { "epoch": 26.02, "grad_norm": 6.030354976654053, "learning_rate": 7.398496240601505e-06, "loss": 0.2949, "step": 17300 }, { "epoch": 26.03, "grad_norm": 6.9479193687438965, "learning_rate": 7.396992481203008e-06, "loss": 0.3083, "step": 17310 }, { "epoch": 26.05, "grad_norm": 5.633476257324219, "learning_rate": 7.3954887218045126e-06, "loss": 0.2906, "step": 17320 }, { "epoch": 26.06, "grad_norm": 4.715734481811523, "learning_rate": 7.393984962406015e-06, "loss": 0.3034, "step": 17330 }, { "epoch": 26.08, "grad_norm": 5.452293395996094, "learning_rate": 7.39248120300752e-06, "loss": 0.3242, "step": 17340 }, { "epoch": 26.09, "grad_norm": 5.496037006378174, "learning_rate": 7.390977443609023e-06, "loss": 0.2861, "step": 17350 }, { "epoch": 26.11, "grad_norm": 8.563288688659668, "learning_rate": 7.3894736842105275e-06, "loss": 0.3451, "step": 17360 }, { "epoch": 26.12, "grad_norm": 3.828183889389038, "learning_rate": 7.387969924812031e-06, "loss": 0.3346, "step": 17370 }, { "epoch": 26.14, "grad_norm": 6.062718391418457, "learning_rate": 7.386466165413534e-06, "loss": 0.2778, "step": 17380 }, { "epoch": 26.15, "grad_norm": 5.245468616485596, "learning_rate": 7.384962406015038e-06, "loss": 0.2623, "step": 17390 }, { "epoch": 26.17, "grad_norm": 5.32465934753418, "learning_rate": 7.3834586466165416e-06, "loss": 0.2771, "step": 17400 }, { "epoch": 26.18, "grad_norm": 6.1053009033203125, "learning_rate": 7.381954887218046e-06, "loss": 0.2738, "step": 17410 }, { "epoch": 26.2, "grad_norm": 4.224456310272217, "learning_rate": 7.380451127819549e-06, "loss": 0.2576, "step": 17420 }, { "epoch": 26.21, "grad_norm": 5.59401798248291, "learning_rate": 7.378947368421053e-06, "loss": 0.2733, "step": 17430 }, { "epoch": 26.23, "grad_norm": 9.04089641571045, "learning_rate": 7.3774436090225565e-06, "loss": 0.294, "step": 17440 }, { "epoch": 26.24, "grad_norm": 5.4782233238220215, "learning_rate": 7.375939849624061e-06, "loss": 0.2776, "step": 17450 }, { "epoch": 26.26, "grad_norm": 3.4573464393615723, "learning_rate": 7.374436090225564e-06, "loss": 0.2913, "step": 17460 }, { "epoch": 26.27, "grad_norm": 4.618410587310791, "learning_rate": 7.372932330827069e-06, "loss": 0.3398, "step": 17470 }, { "epoch": 26.29, "grad_norm": 3.339372396469116, "learning_rate": 7.371428571428571e-06, "loss": 0.2932, "step": 17480 }, { "epoch": 26.3, "grad_norm": 4.173111438751221, "learning_rate": 7.369924812030076e-06, "loss": 0.3159, "step": 17490 }, { "epoch": 26.32, "grad_norm": 5.077332973480225, "learning_rate": 7.368421052631579e-06, "loss": 0.2747, "step": 17500 }, { "epoch": 26.33, "grad_norm": 3.467737913131714, "learning_rate": 7.366917293233084e-06, "loss": 0.3015, "step": 17510 }, { "epoch": 26.35, "grad_norm": 4.046982288360596, "learning_rate": 7.365413533834586e-06, "loss": 0.2847, "step": 17520 }, { "epoch": 26.36, "grad_norm": 5.868749141693115, "learning_rate": 7.363909774436091e-06, "loss": 0.2743, "step": 17530 }, { "epoch": 26.38, "grad_norm": 4.371850967407227, "learning_rate": 7.362406015037594e-06, "loss": 0.2951, "step": 17540 }, { "epoch": 26.39, "grad_norm": 5.016296863555908, "learning_rate": 7.3609022556390986e-06, "loss": 0.3517, "step": 17550 }, { "epoch": 26.41, "grad_norm": 4.3615946769714355, "learning_rate": 7.359398496240602e-06, "loss": 0.4145, "step": 17560 }, { "epoch": 26.42, "grad_norm": 5.725963115692139, "learning_rate": 7.3578947368421065e-06, "loss": 0.2518, "step": 17570 }, { "epoch": 26.44, "grad_norm": 9.736555099487305, "learning_rate": 7.356390977443609e-06, "loss": 0.3154, "step": 17580 }, { "epoch": 26.45, "grad_norm": 7.777997016906738, "learning_rate": 7.3548872180451135e-06, "loss": 0.325, "step": 17590 }, { "epoch": 26.47, "grad_norm": 4.7586894035339355, "learning_rate": 7.353383458646617e-06, "loss": 0.3155, "step": 17600 }, { "epoch": 26.48, "grad_norm": 5.5998430252075195, "learning_rate": 7.351879699248121e-06, "loss": 0.3358, "step": 17610 }, { "epoch": 26.5, "grad_norm": 6.661489486694336, "learning_rate": 7.350375939849624e-06, "loss": 0.2973, "step": 17620 }, { "epoch": 26.51, "grad_norm": 8.244695663452148, "learning_rate": 7.348872180451128e-06, "loss": 0.2739, "step": 17630 }, { "epoch": 26.53, "grad_norm": 7.346087455749512, "learning_rate": 7.347368421052632e-06, "loss": 0.3281, "step": 17640 }, { "epoch": 26.54, "grad_norm": 8.243110656738281, "learning_rate": 7.345864661654136e-06, "loss": 0.2994, "step": 17650 }, { "epoch": 26.56, "grad_norm": 3.7893612384796143, "learning_rate": 7.34436090225564e-06, "loss": 0.3569, "step": 17660 }, { "epoch": 26.57, "grad_norm": 4.714302062988281, "learning_rate": 7.342857142857144e-06, "loss": 0.2856, "step": 17670 }, { "epoch": 26.59, "grad_norm": 7.72084903717041, "learning_rate": 7.341353383458647e-06, "loss": 0.2402, "step": 17680 }, { "epoch": 26.6, "grad_norm": 4.239202499389648, "learning_rate": 7.339849624060151e-06, "loss": 0.2312, "step": 17690 }, { "epoch": 26.62, "grad_norm": 6.118954658508301, "learning_rate": 7.338345864661655e-06, "loss": 0.3843, "step": 17700 }, { "epoch": 26.63, "grad_norm": 6.067955017089844, "learning_rate": 7.336842105263159e-06, "loss": 0.3087, "step": 17710 }, { "epoch": 26.65, "grad_norm": 6.024227619171143, "learning_rate": 7.335338345864662e-06, "loss": 0.2224, "step": 17720 }, { "epoch": 26.66, "grad_norm": 3.350494146347046, "learning_rate": 7.333834586466166e-06, "loss": 0.2977, "step": 17730 }, { "epoch": 26.68, "grad_norm": 5.893447399139404, "learning_rate": 7.33233082706767e-06, "loss": 0.2636, "step": 17740 }, { "epoch": 26.69, "grad_norm": 5.77102518081665, "learning_rate": 7.330827067669174e-06, "loss": 0.2412, "step": 17750 }, { "epoch": 26.71, "grad_norm": 4.75433349609375, "learning_rate": 7.3293233082706776e-06, "loss": 0.3262, "step": 17760 }, { "epoch": 26.72, "grad_norm": 9.549809455871582, "learning_rate": 7.327819548872182e-06, "loss": 0.2673, "step": 17770 }, { "epoch": 26.74, "grad_norm": 7.770995616912842, "learning_rate": 7.326315789473685e-06, "loss": 0.3227, "step": 17780 }, { "epoch": 26.75, "grad_norm": 2.771538257598877, "learning_rate": 7.324812030075189e-06, "loss": 0.2894, "step": 17790 }, { "epoch": 26.77, "grad_norm": 5.751589298248291, "learning_rate": 7.3233082706766925e-06, "loss": 0.281, "step": 17800 }, { "epoch": 26.78, "grad_norm": 6.880566596984863, "learning_rate": 7.321804511278197e-06, "loss": 0.3191, "step": 17810 }, { "epoch": 26.8, "grad_norm": 5.7594218254089355, "learning_rate": 7.3203007518796995e-06, "loss": 0.2659, "step": 17820 }, { "epoch": 26.81, "grad_norm": 3.8072729110717773, "learning_rate": 7.318796992481204e-06, "loss": 0.2825, "step": 17830 }, { "epoch": 26.83, "grad_norm": 7.311132431030273, "learning_rate": 7.317293233082707e-06, "loss": 0.3758, "step": 17840 }, { "epoch": 26.84, "grad_norm": 6.675231456756592, "learning_rate": 7.315789473684212e-06, "loss": 0.2856, "step": 17850 }, { "epoch": 26.86, "grad_norm": 9.13115119934082, "learning_rate": 7.314285714285715e-06, "loss": 0.266, "step": 17860 }, { "epoch": 26.87, "grad_norm": 5.860391616821289, "learning_rate": 7.312781954887218e-06, "loss": 0.3354, "step": 17870 }, { "epoch": 26.89, "grad_norm": 5.685858249664307, "learning_rate": 7.311278195488722e-06, "loss": 0.3235, "step": 17880 }, { "epoch": 26.9, "grad_norm": 6.518139839172363, "learning_rate": 7.309774436090226e-06, "loss": 0.313, "step": 17890 }, { "epoch": 26.92, "grad_norm": 14.772072792053223, "learning_rate": 7.30827067669173e-06, "loss": 0.3201, "step": 17900 }, { "epoch": 26.93, "grad_norm": 4.325756072998047, "learning_rate": 7.306766917293233e-06, "loss": 0.2681, "step": 17910 }, { "epoch": 26.95, "grad_norm": 4.291186809539795, "learning_rate": 7.305263157894737e-06, "loss": 0.3831, "step": 17920 }, { "epoch": 26.96, "grad_norm": 8.093550682067871, "learning_rate": 7.303759398496241e-06, "loss": 0.2855, "step": 17930 }, { "epoch": 26.98, "grad_norm": 4.962594985961914, "learning_rate": 7.302255639097745e-06, "loss": 0.2369, "step": 17940 }, { "epoch": 26.99, "grad_norm": 5.591581344604492, "learning_rate": 7.300751879699249e-06, "loss": 0.3624, "step": 17950 }, { "epoch": 27.0, "eval_accuracy": 0.9316, "eval_loss": 0.2865539491176605, "eval_runtime": 84.8448, "eval_samples_per_second": 117.862, "eval_steps_per_second": 0.471, "step": 17955 }, { "epoch": 27.01, "grad_norm": 4.0642547607421875, "learning_rate": 7.299248120300752e-06, "loss": 0.2308, "step": 17960 }, { "epoch": 27.02, "grad_norm": 5.927849292755127, "learning_rate": 7.297744360902256e-06, "loss": 0.2986, "step": 17970 }, { "epoch": 27.04, "grad_norm": 6.369534492492676, "learning_rate": 7.29624060150376e-06, "loss": 0.2482, "step": 17980 }, { "epoch": 27.05, "grad_norm": 4.340782165527344, "learning_rate": 7.2947368421052636e-06, "loss": 0.2583, "step": 17990 }, { "epoch": 27.07, "grad_norm": 4.993666648864746, "learning_rate": 7.293233082706768e-06, "loss": 0.237, "step": 18000 }, { "epoch": 27.08, "grad_norm": 5.864019870758057, "learning_rate": 7.291729323308271e-06, "loss": 0.3072, "step": 18010 }, { "epoch": 27.1, "grad_norm": 6.993382930755615, "learning_rate": 7.290225563909775e-06, "loss": 0.2972, "step": 18020 }, { "epoch": 27.11, "grad_norm": 5.060710430145264, "learning_rate": 7.2887218045112785e-06, "loss": 0.269, "step": 18030 }, { "epoch": 27.13, "grad_norm": 3.3099091053009033, "learning_rate": 7.287218045112783e-06, "loss": 0.3175, "step": 18040 }, { "epoch": 27.14, "grad_norm": 5.775263786315918, "learning_rate": 7.285714285714286e-06, "loss": 0.2844, "step": 18050 }, { "epoch": 27.16, "grad_norm": 5.468781471252441, "learning_rate": 7.28421052631579e-06, "loss": 0.2273, "step": 18060 }, { "epoch": 27.17, "grad_norm": 6.351372241973877, "learning_rate": 7.282706766917293e-06, "loss": 0.3525, "step": 18070 }, { "epoch": 27.19, "grad_norm": 7.59521484375, "learning_rate": 7.281203007518798e-06, "loss": 0.3068, "step": 18080 }, { "epoch": 27.2, "grad_norm": 3.0641674995422363, "learning_rate": 7.279699248120301e-06, "loss": 0.2865, "step": 18090 }, { "epoch": 27.22, "grad_norm": 6.9775238037109375, "learning_rate": 7.278195488721806e-06, "loss": 0.3022, "step": 18100 }, { "epoch": 27.23, "grad_norm": 4.060088634490967, "learning_rate": 7.276691729323308e-06, "loss": 0.3148, "step": 18110 }, { "epoch": 27.25, "grad_norm": 7.767538070678711, "learning_rate": 7.275187969924813e-06, "loss": 0.229, "step": 18120 }, { "epoch": 27.26, "grad_norm": 4.900406360626221, "learning_rate": 7.273684210526316e-06, "loss": 0.2672, "step": 18130 }, { "epoch": 27.28, "grad_norm": 6.7180962562561035, "learning_rate": 7.272180451127821e-06, "loss": 0.24, "step": 18140 }, { "epoch": 27.29, "grad_norm": 6.126708507537842, "learning_rate": 7.270676691729323e-06, "loss": 0.284, "step": 18150 }, { "epoch": 27.31, "grad_norm": 3.9110794067382812, "learning_rate": 7.269172932330828e-06, "loss": 0.2719, "step": 18160 }, { "epoch": 27.32, "grad_norm": 6.212155342102051, "learning_rate": 7.267669172932331e-06, "loss": 0.3151, "step": 18170 }, { "epoch": 27.34, "grad_norm": 3.0043139457702637, "learning_rate": 7.2661654135338355e-06, "loss": 0.2592, "step": 18180 }, { "epoch": 27.35, "grad_norm": 3.2263989448547363, "learning_rate": 7.264661654135339e-06, "loss": 0.3309, "step": 18190 }, { "epoch": 27.37, "grad_norm": 4.7933197021484375, "learning_rate": 7.263157894736843e-06, "loss": 0.3262, "step": 18200 }, { "epoch": 27.38, "grad_norm": 2.227823495864868, "learning_rate": 7.261654135338346e-06, "loss": 0.2645, "step": 18210 }, { "epoch": 27.4, "grad_norm": 3.900000810623169, "learning_rate": 7.2601503759398504e-06, "loss": 0.3359, "step": 18220 }, { "epoch": 27.41, "grad_norm": 3.5033364295959473, "learning_rate": 7.258646616541354e-06, "loss": 0.3467, "step": 18230 }, { "epoch": 27.43, "grad_norm": 6.635146617889404, "learning_rate": 7.257142857142858e-06, "loss": 0.3276, "step": 18240 }, { "epoch": 27.44, "grad_norm": 4.950628757476807, "learning_rate": 7.255639097744361e-06, "loss": 0.2459, "step": 18250 }, { "epoch": 27.46, "grad_norm": 8.33364486694336, "learning_rate": 7.254135338345865e-06, "loss": 0.3154, "step": 18260 }, { "epoch": 27.47, "grad_norm": 6.362850666046143, "learning_rate": 7.252631578947369e-06, "loss": 0.3349, "step": 18270 }, { "epoch": 27.49, "grad_norm": 2.358121395111084, "learning_rate": 7.251127819548873e-06, "loss": 0.247, "step": 18280 }, { "epoch": 27.5, "grad_norm": 7.640157699584961, "learning_rate": 7.249624060150377e-06, "loss": 0.259, "step": 18290 }, { "epoch": 27.52, "grad_norm": 4.780099868774414, "learning_rate": 7.248120300751881e-06, "loss": 0.2818, "step": 18300 }, { "epoch": 27.53, "grad_norm": 4.6383256912231445, "learning_rate": 7.246616541353384e-06, "loss": 0.2534, "step": 18310 }, { "epoch": 27.55, "grad_norm": 4.298056125640869, "learning_rate": 7.245112781954888e-06, "loss": 0.2586, "step": 18320 }, { "epoch": 27.56, "grad_norm": 7.920955181121826, "learning_rate": 7.243609022556392e-06, "loss": 0.2583, "step": 18330 }, { "epoch": 27.58, "grad_norm": 2.8801300525665283, "learning_rate": 7.242105263157896e-06, "loss": 0.2478, "step": 18340 }, { "epoch": 27.59, "grad_norm": 5.475898265838623, "learning_rate": 7.240601503759399e-06, "loss": 0.3193, "step": 18350 }, { "epoch": 27.61, "grad_norm": 26.629573822021484, "learning_rate": 7.239097744360903e-06, "loss": 0.2725, "step": 18360 }, { "epoch": 27.62, "grad_norm": 5.835658073425293, "learning_rate": 7.237593984962407e-06, "loss": 0.274, "step": 18370 }, { "epoch": 27.64, "grad_norm": 7.282139301300049, "learning_rate": 7.23609022556391e-06, "loss": 0.288, "step": 18380 }, { "epoch": 27.65, "grad_norm": 11.25385856628418, "learning_rate": 7.2345864661654145e-06, "loss": 0.2611, "step": 18390 }, { "epoch": 27.67, "grad_norm": 6.181314945220947, "learning_rate": 7.233082706766917e-06, "loss": 0.3109, "step": 18400 }, { "epoch": 27.68, "grad_norm": 5.535643100738525, "learning_rate": 7.2315789473684215e-06, "loss": 0.2892, "step": 18410 }, { "epoch": 27.7, "grad_norm": 7.809751510620117, "learning_rate": 7.230075187969925e-06, "loss": 0.3626, "step": 18420 }, { "epoch": 27.71, "grad_norm": 5.477492332458496, "learning_rate": 7.2285714285714294e-06, "loss": 0.3423, "step": 18430 }, { "epoch": 27.73, "grad_norm": 5.311155796051025, "learning_rate": 7.227067669172932e-06, "loss": 0.2859, "step": 18440 }, { "epoch": 27.74, "grad_norm": 6.58491325378418, "learning_rate": 7.2255639097744365e-06, "loss": 0.3323, "step": 18450 }, { "epoch": 27.76, "grad_norm": 10.060500144958496, "learning_rate": 7.22406015037594e-06, "loss": 0.2803, "step": 18460 }, { "epoch": 27.77, "grad_norm": 3.332087278366089, "learning_rate": 7.222556390977444e-06, "loss": 0.3011, "step": 18470 }, { "epoch": 27.79, "grad_norm": 3.494025945663452, "learning_rate": 7.221052631578948e-06, "loss": 0.2756, "step": 18480 }, { "epoch": 27.8, "grad_norm": 6.302395820617676, "learning_rate": 7.219548872180452e-06, "loss": 0.2454, "step": 18490 }, { "epoch": 27.82, "grad_norm": 4.8681793212890625, "learning_rate": 7.218045112781955e-06, "loss": 0.349, "step": 18500 }, { "epoch": 27.83, "grad_norm": 5.359793663024902, "learning_rate": 7.216541353383459e-06, "loss": 0.3316, "step": 18510 }, { "epoch": 27.85, "grad_norm": 7.901219367980957, "learning_rate": 7.215037593984963e-06, "loss": 0.255, "step": 18520 }, { "epoch": 27.86, "grad_norm": 6.270127773284912, "learning_rate": 7.213533834586467e-06, "loss": 0.3856, "step": 18530 }, { "epoch": 27.88, "grad_norm": 3.3462986946105957, "learning_rate": 7.21203007518797e-06, "loss": 0.2656, "step": 18540 }, { "epoch": 27.89, "grad_norm": 5.023731708526611, "learning_rate": 7.210526315789474e-06, "loss": 0.3367, "step": 18550 }, { "epoch": 27.91, "grad_norm": 7.475311756134033, "learning_rate": 7.209022556390978e-06, "loss": 0.3652, "step": 18560 }, { "epoch": 27.92, "grad_norm": 5.593987464904785, "learning_rate": 7.207518796992482e-06, "loss": 0.2514, "step": 18570 }, { "epoch": 27.94, "grad_norm": 52.27437973022461, "learning_rate": 7.206015037593986e-06, "loss": 0.3338, "step": 18580 }, { "epoch": 27.95, "grad_norm": 6.0754499435424805, "learning_rate": 7.20451127819549e-06, "loss": 0.2033, "step": 18590 }, { "epoch": 27.97, "grad_norm": 13.490704536437988, "learning_rate": 7.203007518796993e-06, "loss": 0.328, "step": 18600 }, { "epoch": 27.98, "grad_norm": 8.996991157531738, "learning_rate": 7.201503759398497e-06, "loss": 0.3082, "step": 18610 }, { "epoch": 28.0, "grad_norm": 1.09871506690979, "learning_rate": 7.2000000000000005e-06, "loss": 0.2957, "step": 18620 }, { "epoch": 28.0, "eval_accuracy": 0.932, "eval_loss": 0.2707752585411072, "eval_runtime": 84.4192, "eval_samples_per_second": 118.456, "eval_steps_per_second": 0.474, "step": 18620 }, { "epoch": 28.02, "grad_norm": 4.388790130615234, "learning_rate": 7.198496240601505e-06, "loss": 0.2982, "step": 18630 }, { "epoch": 28.03, "grad_norm": 11.981380462646484, "learning_rate": 7.1969924812030076e-06, "loss": 0.2254, "step": 18640 }, { "epoch": 28.05, "grad_norm": 5.74471378326416, "learning_rate": 7.195488721804512e-06, "loss": 0.2806, "step": 18650 }, { "epoch": 28.06, "grad_norm": 6.635989189147949, "learning_rate": 7.1939849624060154e-06, "loss": 0.3081, "step": 18660 }, { "epoch": 28.08, "grad_norm": 8.036993026733398, "learning_rate": 7.19248120300752e-06, "loss": 0.2684, "step": 18670 }, { "epoch": 28.09, "grad_norm": 5.605617523193359, "learning_rate": 7.190977443609023e-06, "loss": 0.3037, "step": 18680 }, { "epoch": 28.11, "grad_norm": 5.046873569488525, "learning_rate": 7.189473684210527e-06, "loss": 0.2489, "step": 18690 }, { "epoch": 28.12, "grad_norm": 7.5858154296875, "learning_rate": 7.18796992481203e-06, "loss": 0.2868, "step": 18700 }, { "epoch": 28.14, "grad_norm": 5.2180399894714355, "learning_rate": 7.186466165413535e-06, "loss": 0.282, "step": 18710 }, { "epoch": 28.15, "grad_norm": 5.189599990844727, "learning_rate": 7.184962406015038e-06, "loss": 0.3185, "step": 18720 }, { "epoch": 28.17, "grad_norm": 3.309946298599243, "learning_rate": 7.183458646616543e-06, "loss": 0.3342, "step": 18730 }, { "epoch": 28.18, "grad_norm": 4.891626834869385, "learning_rate": 7.181954887218045e-06, "loss": 0.2972, "step": 18740 }, { "epoch": 28.2, "grad_norm": 4.263134002685547, "learning_rate": 7.18045112781955e-06, "loss": 0.2384, "step": 18750 }, { "epoch": 28.21, "grad_norm": 4.385184288024902, "learning_rate": 7.178947368421053e-06, "loss": 0.2699, "step": 18760 }, { "epoch": 28.23, "grad_norm": 10.802248001098633, "learning_rate": 7.1774436090225575e-06, "loss": 0.3162, "step": 18770 }, { "epoch": 28.24, "grad_norm": 4.073564052581787, "learning_rate": 7.175939849624061e-06, "loss": 0.2886, "step": 18780 }, { "epoch": 28.26, "grad_norm": 4.669020175933838, "learning_rate": 7.174436090225565e-06, "loss": 0.3056, "step": 18790 }, { "epoch": 28.27, "grad_norm": 7.317276477813721, "learning_rate": 7.172932330827068e-06, "loss": 0.3272, "step": 18800 }, { "epoch": 28.29, "grad_norm": 2.6165945529937744, "learning_rate": 7.1714285714285725e-06, "loss": 0.2903, "step": 18810 }, { "epoch": 28.3, "grad_norm": 4.007689476013184, "learning_rate": 7.169924812030076e-06, "loss": 0.3343, "step": 18820 }, { "epoch": 28.32, "grad_norm": 4.415470123291016, "learning_rate": 7.16842105263158e-06, "loss": 0.2953, "step": 18830 }, { "epoch": 28.33, "grad_norm": 3.293377637863159, "learning_rate": 7.166917293233083e-06, "loss": 0.2722, "step": 18840 }, { "epoch": 28.35, "grad_norm": 5.47039270401001, "learning_rate": 7.165413533834587e-06, "loss": 0.2798, "step": 18850 }, { "epoch": 28.36, "grad_norm": 5.256561756134033, "learning_rate": 7.163909774436091e-06, "loss": 0.2649, "step": 18860 }, { "epoch": 28.38, "grad_norm": 3.793679714202881, "learning_rate": 7.1624060150375944e-06, "loss": 0.2919, "step": 18870 }, { "epoch": 28.39, "grad_norm": 3.6800131797790527, "learning_rate": 7.160902255639098e-06, "loss": 0.2941, "step": 18880 }, { "epoch": 28.41, "grad_norm": 6.140032768249512, "learning_rate": 7.1593984962406015e-06, "loss": 0.307, "step": 18890 }, { "epoch": 28.42, "grad_norm": 5.978692531585693, "learning_rate": 7.157894736842106e-06, "loss": 0.2973, "step": 18900 }, { "epoch": 28.44, "grad_norm": 6.257145404815674, "learning_rate": 7.156390977443609e-06, "loss": 0.3043, "step": 18910 }, { "epoch": 28.45, "grad_norm": 6.124258518218994, "learning_rate": 7.154887218045114e-06, "loss": 0.2877, "step": 18920 }, { "epoch": 28.47, "grad_norm": 4.110513687133789, "learning_rate": 7.153383458646616e-06, "loss": 0.2723, "step": 18930 }, { "epoch": 28.48, "grad_norm": 8.390369415283203, "learning_rate": 7.151879699248121e-06, "loss": 0.2854, "step": 18940 }, { "epoch": 28.5, "grad_norm": 4.641889572143555, "learning_rate": 7.150375939849624e-06, "loss": 0.2325, "step": 18950 }, { "epoch": 28.51, "grad_norm": 9.678013801574707, "learning_rate": 7.148872180451129e-06, "loss": 0.2325, "step": 18960 }, { "epoch": 28.53, "grad_norm": 4.375498294830322, "learning_rate": 7.147368421052631e-06, "loss": 0.2814, "step": 18970 }, { "epoch": 28.54, "grad_norm": 11.331188201904297, "learning_rate": 7.145864661654136e-06, "loss": 0.2659, "step": 18980 }, { "epoch": 28.56, "grad_norm": 4.929275035858154, "learning_rate": 7.144360902255639e-06, "loss": 0.3207, "step": 18990 }, { "epoch": 28.57, "grad_norm": 6.941195487976074, "learning_rate": 7.1428571428571436e-06, "loss": 0.2677, "step": 19000 }, { "epoch": 28.59, "grad_norm": 7.25919771194458, "learning_rate": 7.141353383458647e-06, "loss": 0.265, "step": 19010 }, { "epoch": 28.6, "grad_norm": 4.009887218475342, "learning_rate": 7.1398496240601514e-06, "loss": 0.289, "step": 19020 }, { "epoch": 28.62, "grad_norm": 7.65335750579834, "learning_rate": 7.138345864661654e-06, "loss": 0.2977, "step": 19030 }, { "epoch": 28.63, "grad_norm": 7.200645923614502, "learning_rate": 7.1368421052631585e-06, "loss": 0.3497, "step": 19040 }, { "epoch": 28.65, "grad_norm": 4.218509674072266, "learning_rate": 7.135338345864662e-06, "loss": 0.2505, "step": 19050 }, { "epoch": 28.66, "grad_norm": 5.385000228881836, "learning_rate": 7.133834586466166e-06, "loss": 0.2603, "step": 19060 }, { "epoch": 28.68, "grad_norm": 6.183909893035889, "learning_rate": 7.132330827067669e-06, "loss": 0.3193, "step": 19070 }, { "epoch": 28.69, "grad_norm": 7.682650089263916, "learning_rate": 7.130827067669173e-06, "loss": 0.3485, "step": 19080 }, { "epoch": 28.71, "grad_norm": 6.737081527709961, "learning_rate": 7.129323308270677e-06, "loss": 0.4199, "step": 19090 }, { "epoch": 28.72, "grad_norm": 6.185298442840576, "learning_rate": 7.127819548872181e-06, "loss": 0.3144, "step": 19100 }, { "epoch": 28.74, "grad_norm": 3.423515558242798, "learning_rate": 7.126315789473685e-06, "loss": 0.2768, "step": 19110 }, { "epoch": 28.75, "grad_norm": 7.198062896728516, "learning_rate": 7.124812030075189e-06, "loss": 0.3282, "step": 19120 }, { "epoch": 28.77, "grad_norm": 5.3529372215271, "learning_rate": 7.123308270676692e-06, "loss": 0.2467, "step": 19130 }, { "epoch": 28.78, "grad_norm": 7.850579261779785, "learning_rate": 7.121804511278196e-06, "loss": 0.3204, "step": 19140 }, { "epoch": 28.8, "grad_norm": 8.140227317810059, "learning_rate": 7.1203007518797e-06, "loss": 0.3087, "step": 19150 }, { "epoch": 28.81, "grad_norm": 6.9892778396606445, "learning_rate": 7.118796992481204e-06, "loss": 0.3148, "step": 19160 }, { "epoch": 28.83, "grad_norm": 6.189218521118164, "learning_rate": 7.117293233082707e-06, "loss": 0.2779, "step": 19170 }, { "epoch": 28.84, "grad_norm": 4.910006999969482, "learning_rate": 7.115789473684211e-06, "loss": 0.2469, "step": 19180 }, { "epoch": 28.86, "grad_norm": 6.6792449951171875, "learning_rate": 7.114285714285715e-06, "loss": 0.3604, "step": 19190 }, { "epoch": 28.87, "grad_norm": 8.408853530883789, "learning_rate": 7.112781954887219e-06, "loss": 0.3493, "step": 19200 }, { "epoch": 28.89, "grad_norm": 5.114132404327393, "learning_rate": 7.1112781954887225e-06, "loss": 0.2874, "step": 19210 }, { "epoch": 28.9, "grad_norm": 7.833899021148682, "learning_rate": 7.109774436090227e-06, "loss": 0.2365, "step": 19220 }, { "epoch": 28.92, "grad_norm": 3.8841164112091064, "learning_rate": 7.10827067669173e-06, "loss": 0.2937, "step": 19230 }, { "epoch": 28.93, "grad_norm": 11.486865997314453, "learning_rate": 7.106766917293234e-06, "loss": 0.2851, "step": 19240 }, { "epoch": 28.95, "grad_norm": 9.058024406433105, "learning_rate": 7.1052631578947375e-06, "loss": 0.2317, "step": 19250 }, { "epoch": 28.96, "grad_norm": 9.031519889831543, "learning_rate": 7.103759398496242e-06, "loss": 0.4064, "step": 19260 }, { "epoch": 28.98, "grad_norm": 5.929012298583984, "learning_rate": 7.1022556390977445e-06, "loss": 0.2661, "step": 19270 }, { "epoch": 28.99, "grad_norm": 2.968751907348633, "learning_rate": 7.100751879699249e-06, "loss": 0.3013, "step": 19280 }, { "epoch": 29.0, "eval_accuracy": 0.932, "eval_loss": 0.28813642263412476, "eval_runtime": 84.4456, "eval_samples_per_second": 118.419, "eval_steps_per_second": 0.474, "step": 19285 }, { "epoch": 29.01, "grad_norm": 5.468835830688477, "learning_rate": 7.099248120300752e-06, "loss": 0.2397, "step": 19290 }, { "epoch": 29.02, "grad_norm": 7.077803134918213, "learning_rate": 7.097744360902257e-06, "loss": 0.3068, "step": 19300 }, { "epoch": 29.04, "grad_norm": 5.543747425079346, "learning_rate": 7.09624060150376e-06, "loss": 0.2652, "step": 19310 }, { "epoch": 29.05, "grad_norm": 7.105257034301758, "learning_rate": 7.094736842105265e-06, "loss": 0.3016, "step": 19320 }, { "epoch": 29.07, "grad_norm": 7.57663106918335, "learning_rate": 7.093233082706767e-06, "loss": 0.3195, "step": 19330 }, { "epoch": 29.08, "grad_norm": 3.0557851791381836, "learning_rate": 7.091729323308272e-06, "loss": 0.2686, "step": 19340 }, { "epoch": 29.1, "grad_norm": 6.048924446105957, "learning_rate": 7.090225563909775e-06, "loss": 0.2936, "step": 19350 }, { "epoch": 29.11, "grad_norm": 5.691340446472168, "learning_rate": 7.088721804511278e-06, "loss": 0.2832, "step": 19360 }, { "epoch": 29.13, "grad_norm": 0.4570540189743042, "learning_rate": 7.087218045112782e-06, "loss": 0.2204, "step": 19370 }, { "epoch": 29.14, "grad_norm": 7.902672290802002, "learning_rate": 7.085714285714286e-06, "loss": 0.2519, "step": 19380 }, { "epoch": 29.16, "grad_norm": 6.772716045379639, "learning_rate": 7.08421052631579e-06, "loss": 0.3449, "step": 19390 }, { "epoch": 29.17, "grad_norm": 5.602826118469238, "learning_rate": 7.082706766917294e-06, "loss": 0.2434, "step": 19400 }, { "epoch": 29.19, "grad_norm": 8.90966510772705, "learning_rate": 7.081203007518798e-06, "loss": 0.3094, "step": 19410 }, { "epoch": 29.2, "grad_norm": 4.287824630737305, "learning_rate": 7.079699248120301e-06, "loss": 0.2961, "step": 19420 }, { "epoch": 29.22, "grad_norm": 5.467939853668213, "learning_rate": 7.078195488721805e-06, "loss": 0.2937, "step": 19430 }, { "epoch": 29.23, "grad_norm": 4.856724739074707, "learning_rate": 7.0766917293233086e-06, "loss": 0.3074, "step": 19440 }, { "epoch": 29.25, "grad_norm": 4.881340980529785, "learning_rate": 7.075187969924813e-06, "loss": 0.3334, "step": 19450 }, { "epoch": 29.26, "grad_norm": 6.680235862731934, "learning_rate": 7.073684210526316e-06, "loss": 0.2404, "step": 19460 }, { "epoch": 29.28, "grad_norm": 3.575651168823242, "learning_rate": 7.07218045112782e-06, "loss": 0.2544, "step": 19470 }, { "epoch": 29.29, "grad_norm": 3.3242225646972656, "learning_rate": 7.0706766917293235e-06, "loss": 0.2732, "step": 19480 }, { "epoch": 29.31, "grad_norm": 3.5619566440582275, "learning_rate": 7.069172932330828e-06, "loss": 0.3299, "step": 19490 }, { "epoch": 29.32, "grad_norm": 6.326242923736572, "learning_rate": 7.067669172932331e-06, "loss": 0.3078, "step": 19500 }, { "epoch": 29.34, "grad_norm": 3.983919620513916, "learning_rate": 7.066165413533836e-06, "loss": 0.2805, "step": 19510 }, { "epoch": 29.35, "grad_norm": 7.784421443939209, "learning_rate": 7.064661654135338e-06, "loss": 0.3146, "step": 19520 }, { "epoch": 29.37, "grad_norm": 7.162094593048096, "learning_rate": 7.063157894736843e-06, "loss": 0.2942, "step": 19530 }, { "epoch": 29.38, "grad_norm": 4.964542388916016, "learning_rate": 7.061654135338346e-06, "loss": 0.3356, "step": 19540 }, { "epoch": 29.4, "grad_norm": 6.946933269500732, "learning_rate": 7.060150375939851e-06, "loss": 0.312, "step": 19550 }, { "epoch": 29.41, "grad_norm": 6.954648494720459, "learning_rate": 7.058646616541353e-06, "loss": 0.3188, "step": 19560 }, { "epoch": 29.43, "grad_norm": 6.528570175170898, "learning_rate": 7.057142857142858e-06, "loss": 0.2966, "step": 19570 }, { "epoch": 29.44, "grad_norm": 5.95659875869751, "learning_rate": 7.055639097744361e-06, "loss": 0.3398, "step": 19580 }, { "epoch": 29.46, "grad_norm": 2.7002809047698975, "learning_rate": 7.054135338345866e-06, "loss": 0.2737, "step": 19590 }, { "epoch": 29.47, "grad_norm": 4.33442497253418, "learning_rate": 7.052631578947369e-06, "loss": 0.3002, "step": 19600 }, { "epoch": 29.49, "grad_norm": 6.4437713623046875, "learning_rate": 7.051127819548873e-06, "loss": 0.28, "step": 19610 }, { "epoch": 29.5, "grad_norm": 3.9194564819335938, "learning_rate": 7.049624060150376e-06, "loss": 0.3069, "step": 19620 }, { "epoch": 29.52, "grad_norm": 5.887040615081787, "learning_rate": 7.0481203007518805e-06, "loss": 0.2634, "step": 19630 }, { "epoch": 29.53, "grad_norm": 4.538432598114014, "learning_rate": 7.046616541353384e-06, "loss": 0.2684, "step": 19640 }, { "epoch": 29.55, "grad_norm": 7.666621208190918, "learning_rate": 7.045112781954888e-06, "loss": 0.2344, "step": 19650 }, { "epoch": 29.56, "grad_norm": 5.944088935852051, "learning_rate": 7.043609022556391e-06, "loss": 0.2358, "step": 19660 }, { "epoch": 29.58, "grad_norm": 4.063729286193848, "learning_rate": 7.0421052631578954e-06, "loss": 0.2532, "step": 19670 }, { "epoch": 29.59, "grad_norm": 3.9652276039123535, "learning_rate": 7.040601503759399e-06, "loss": 0.3471, "step": 19680 }, { "epoch": 29.61, "grad_norm": 5.847537040710449, "learning_rate": 7.039097744360903e-06, "loss": 0.2433, "step": 19690 }, { "epoch": 29.62, "grad_norm": 4.780336380004883, "learning_rate": 7.037593984962407e-06, "loss": 0.3358, "step": 19700 }, { "epoch": 29.64, "grad_norm": 3.0303971767425537, "learning_rate": 7.03609022556391e-06, "loss": 0.2168, "step": 19710 }, { "epoch": 29.65, "grad_norm": 3.0998427867889404, "learning_rate": 7.034586466165414e-06, "loss": 0.2995, "step": 19720 }, { "epoch": 29.67, "grad_norm": 2.832582473754883, "learning_rate": 7.033082706766918e-06, "loss": 0.2413, "step": 19730 }, { "epoch": 29.68, "grad_norm": 7.660305023193359, "learning_rate": 7.031578947368422e-06, "loss": 0.2962, "step": 19740 }, { "epoch": 29.7, "grad_norm": 5.547909259796143, "learning_rate": 7.030075187969926e-06, "loss": 0.3422, "step": 19750 }, { "epoch": 29.71, "grad_norm": 8.43567180633545, "learning_rate": 7.028571428571429e-06, "loss": 0.3488, "step": 19760 }, { "epoch": 29.73, "grad_norm": 8.269464492797852, "learning_rate": 7.027067669172933e-06, "loss": 0.2961, "step": 19770 }, { "epoch": 29.74, "grad_norm": 10.270258903503418, "learning_rate": 7.025563909774437e-06, "loss": 0.2522, "step": 19780 }, { "epoch": 29.76, "grad_norm": 5.3482537269592285, "learning_rate": 7.024060150375941e-06, "loss": 0.2841, "step": 19790 }, { "epoch": 29.77, "grad_norm": 5.868763446807861, "learning_rate": 7.022556390977444e-06, "loss": 0.2728, "step": 19800 }, { "epoch": 29.79, "grad_norm": 12.772025108337402, "learning_rate": 7.021052631578948e-06, "loss": 0.2202, "step": 19810 }, { "epoch": 29.8, "grad_norm": 9.140650749206543, "learning_rate": 7.019548872180452e-06, "loss": 0.3589, "step": 19820 }, { "epoch": 29.82, "grad_norm": 5.013607978820801, "learning_rate": 7.018045112781956e-06, "loss": 0.2229, "step": 19830 }, { "epoch": 29.83, "grad_norm": 8.760464668273926, "learning_rate": 7.0165413533834595e-06, "loss": 0.3337, "step": 19840 }, { "epoch": 29.85, "grad_norm": 7.938315391540527, "learning_rate": 7.015037593984964e-06, "loss": 0.3052, "step": 19850 }, { "epoch": 29.86, "grad_norm": 7.728393077850342, "learning_rate": 7.0135338345864665e-06, "loss": 0.2861, "step": 19860 }, { "epoch": 29.88, "grad_norm": 6.810389995574951, "learning_rate": 7.01203007518797e-06, "loss": 0.3451, "step": 19870 }, { "epoch": 29.89, "grad_norm": 3.8391294479370117, "learning_rate": 7.010526315789474e-06, "loss": 0.2302, "step": 19880 }, { "epoch": 29.91, "grad_norm": 8.415011405944824, "learning_rate": 7.009022556390977e-06, "loss": 0.3055, "step": 19890 }, { "epoch": 29.92, "grad_norm": 3.6795806884765625, "learning_rate": 7.0075187969924815e-06, "loss": 0.3007, "step": 19900 }, { "epoch": 29.94, "grad_norm": 6.91939640045166, "learning_rate": 7.006015037593985e-06, "loss": 0.296, "step": 19910 }, { "epoch": 29.95, "grad_norm": 4.2215681076049805, "learning_rate": 7.004511278195489e-06, "loss": 0.26, "step": 19920 }, { "epoch": 29.97, "grad_norm": 3.9634735584259033, "learning_rate": 7.003007518796993e-06, "loss": 0.2929, "step": 19930 }, { "epoch": 29.98, "grad_norm": 4.356061935424805, "learning_rate": 7.001503759398497e-06, "loss": 0.2259, "step": 19940 }, { "epoch": 30.0, "grad_norm": 1.2935045957565308, "learning_rate": 7e-06, "loss": 0.2811, "step": 19950 }, { "epoch": 30.0, "eval_accuracy": 0.9304, "eval_loss": 0.2939961552619934, "eval_runtime": 85.2279, "eval_samples_per_second": 117.333, "eval_steps_per_second": 0.469, "step": 19950 }, { "epoch": 30.02, "grad_norm": 4.659907817840576, "learning_rate": 6.998496240601504e-06, "loss": 0.2815, "step": 19960 }, { "epoch": 30.03, "grad_norm": 7.678098201751709, "learning_rate": 6.996992481203008e-06, "loss": 0.2224, "step": 19970 }, { "epoch": 30.05, "grad_norm": 3.063901662826538, "learning_rate": 6.995488721804512e-06, "loss": 0.2971, "step": 19980 }, { "epoch": 30.06, "grad_norm": 5.704662322998047, "learning_rate": 6.993984962406015e-06, "loss": 0.276, "step": 19990 }, { "epoch": 30.08, "grad_norm": 5.150294780731201, "learning_rate": 6.992481203007519e-06, "loss": 0.3514, "step": 20000 }, { "epoch": 30.09, "grad_norm": 6.354630947113037, "learning_rate": 6.990977443609023e-06, "loss": 0.2391, "step": 20010 }, { "epoch": 30.11, "grad_norm": 3.2037034034729004, "learning_rate": 6.989473684210527e-06, "loss": 0.2754, "step": 20020 }, { "epoch": 30.12, "grad_norm": 9.49938678741455, "learning_rate": 6.987969924812031e-06, "loss": 0.3042, "step": 20030 }, { "epoch": 30.14, "grad_norm": 7.142296314239502, "learning_rate": 6.986466165413535e-06, "loss": 0.2314, "step": 20040 }, { "epoch": 30.15, "grad_norm": 7.547910690307617, "learning_rate": 6.984962406015038e-06, "loss": 0.2545, "step": 20050 }, { "epoch": 30.17, "grad_norm": 3.509197235107422, "learning_rate": 6.983458646616542e-06, "loss": 0.2241, "step": 20060 }, { "epoch": 30.18, "grad_norm": 5.678248882293701, "learning_rate": 6.9819548872180455e-06, "loss": 0.2485, "step": 20070 }, { "epoch": 30.2, "grad_norm": 6.966419696807861, "learning_rate": 6.98045112781955e-06, "loss": 0.2342, "step": 20080 }, { "epoch": 30.21, "grad_norm": 5.888791084289551, "learning_rate": 6.9789473684210525e-06, "loss": 0.2433, "step": 20090 }, { "epoch": 30.23, "grad_norm": 6.074886322021484, "learning_rate": 6.977443609022557e-06, "loss": 0.3245, "step": 20100 }, { "epoch": 30.24, "grad_norm": 8.158697128295898, "learning_rate": 6.9759398496240604e-06, "loss": 0.2656, "step": 20110 }, { "epoch": 30.26, "grad_norm": 4.1151909828186035, "learning_rate": 6.974436090225565e-06, "loss": 0.2862, "step": 20120 }, { "epoch": 30.27, "grad_norm": 5.835092067718506, "learning_rate": 6.972932330827068e-06, "loss": 0.3154, "step": 20130 }, { "epoch": 30.29, "grad_norm": 7.01699161529541, "learning_rate": 6.971428571428573e-06, "loss": 0.3146, "step": 20140 }, { "epoch": 30.3, "grad_norm": 4.701117515563965, "learning_rate": 6.969924812030075e-06, "loss": 0.2476, "step": 20150 }, { "epoch": 30.32, "grad_norm": 7.054307460784912, "learning_rate": 6.96842105263158e-06, "loss": 0.2467, "step": 20160 }, { "epoch": 30.33, "grad_norm": 6.54818868637085, "learning_rate": 6.966917293233083e-06, "loss": 0.2588, "step": 20170 }, { "epoch": 30.35, "grad_norm": 4.79921817779541, "learning_rate": 6.965413533834588e-06, "loss": 0.2452, "step": 20180 }, { "epoch": 30.36, "grad_norm": 7.635383129119873, "learning_rate": 6.96390977443609e-06, "loss": 0.2971, "step": 20190 }, { "epoch": 30.38, "grad_norm": 6.174952507019043, "learning_rate": 6.962406015037595e-06, "loss": 0.2438, "step": 20200 }, { "epoch": 30.39, "grad_norm": 8.053534507751465, "learning_rate": 6.960902255639098e-06, "loss": 0.3147, "step": 20210 }, { "epoch": 30.41, "grad_norm": 5.930545806884766, "learning_rate": 6.9593984962406025e-06, "loss": 0.3246, "step": 20220 }, { "epoch": 30.42, "grad_norm": 5.0360493659973145, "learning_rate": 6.957894736842106e-06, "loss": 0.3028, "step": 20230 }, { "epoch": 30.44, "grad_norm": 5.536324977874756, "learning_rate": 6.95639097744361e-06, "loss": 0.2762, "step": 20240 }, { "epoch": 30.45, "grad_norm": 3.6296212673187256, "learning_rate": 6.954887218045113e-06, "loss": 0.2799, "step": 20250 }, { "epoch": 30.47, "grad_norm": 5.655179977416992, "learning_rate": 6.9533834586466175e-06, "loss": 0.3003, "step": 20260 }, { "epoch": 30.48, "grad_norm": 5.260562419891357, "learning_rate": 6.951879699248121e-06, "loss": 0.2221, "step": 20270 }, { "epoch": 30.5, "grad_norm": 4.750830173492432, "learning_rate": 6.950375939849625e-06, "loss": 0.2439, "step": 20280 }, { "epoch": 30.51, "grad_norm": 6.058375835418701, "learning_rate": 6.948872180451128e-06, "loss": 0.2716, "step": 20290 }, { "epoch": 30.53, "grad_norm": 4.154491901397705, "learning_rate": 6.947368421052632e-06, "loss": 0.2311, "step": 20300 }, { "epoch": 30.54, "grad_norm": 4.645699977874756, "learning_rate": 6.945864661654136e-06, "loss": 0.3293, "step": 20310 }, { "epoch": 30.56, "grad_norm": 4.276103973388672, "learning_rate": 6.94436090225564e-06, "loss": 0.2236, "step": 20320 }, { "epoch": 30.57, "grad_norm": 5.338918685913086, "learning_rate": 6.942857142857144e-06, "loss": 0.3162, "step": 20330 }, { "epoch": 30.59, "grad_norm": 4.5537285804748535, "learning_rate": 6.941353383458648e-06, "loss": 0.2801, "step": 20340 }, { "epoch": 30.6, "grad_norm": 4.682258605957031, "learning_rate": 6.939849624060151e-06, "loss": 0.2674, "step": 20350 }, { "epoch": 30.62, "grad_norm": 5.651388168334961, "learning_rate": 6.938345864661654e-06, "loss": 0.3353, "step": 20360 }, { "epoch": 30.63, "grad_norm": 4.2494797706604, "learning_rate": 6.936842105263159e-06, "loss": 0.3508, "step": 20370 }, { "epoch": 30.65, "grad_norm": 5.229772090911865, "learning_rate": 6.935338345864661e-06, "loss": 0.2578, "step": 20380 }, { "epoch": 30.66, "grad_norm": 1.8404473066329956, "learning_rate": 6.933834586466166e-06, "loss": 0.2398, "step": 20390 }, { "epoch": 30.68, "grad_norm": 3.122826337814331, "learning_rate": 6.932330827067669e-06, "loss": 0.2732, "step": 20400 }, { "epoch": 30.69, "grad_norm": 3.0387120246887207, "learning_rate": 6.930827067669174e-06, "loss": 0.2722, "step": 20410 }, { "epoch": 30.71, "grad_norm": 3.9582316875457764, "learning_rate": 6.929323308270677e-06, "loss": 0.2502, "step": 20420 }, { "epoch": 30.72, "grad_norm": 4.217288494110107, "learning_rate": 6.9278195488721815e-06, "loss": 0.2763, "step": 20430 }, { "epoch": 30.74, "grad_norm": 5.593020915985107, "learning_rate": 6.926315789473684e-06, "loss": 0.3234, "step": 20440 }, { "epoch": 30.75, "grad_norm": 6.763814926147461, "learning_rate": 6.9248120300751886e-06, "loss": 0.3275, "step": 20450 }, { "epoch": 30.77, "grad_norm": 4.887606620788574, "learning_rate": 6.923308270676692e-06, "loss": 0.266, "step": 20460 }, { "epoch": 30.78, "grad_norm": 2.9521090984344482, "learning_rate": 6.9218045112781964e-06, "loss": 0.2542, "step": 20470 }, { "epoch": 30.8, "grad_norm": 6.818195819854736, "learning_rate": 6.920300751879699e-06, "loss": 0.3018, "step": 20480 }, { "epoch": 30.81, "grad_norm": 9.507380485534668, "learning_rate": 6.9187969924812035e-06, "loss": 0.2462, "step": 20490 }, { "epoch": 30.83, "grad_norm": 6.438077926635742, "learning_rate": 6.917293233082707e-06, "loss": 0.2951, "step": 20500 }, { "epoch": 30.84, "grad_norm": 2.8363983631134033, "learning_rate": 6.915789473684211e-06, "loss": 0.3104, "step": 20510 }, { "epoch": 30.86, "grad_norm": 4.733820915222168, "learning_rate": 6.914285714285715e-06, "loss": 0.2756, "step": 20520 }, { "epoch": 30.87, "grad_norm": 6.546677112579346, "learning_rate": 6.912781954887218e-06, "loss": 0.3449, "step": 20530 }, { "epoch": 30.89, "grad_norm": 7.878147125244141, "learning_rate": 6.911278195488722e-06, "loss": 0.2393, "step": 20540 }, { "epoch": 30.9, "grad_norm": 4.541097164154053, "learning_rate": 6.909774436090226e-06, "loss": 0.3316, "step": 20550 }, { "epoch": 30.92, "grad_norm": 6.208898544311523, "learning_rate": 6.90827067669173e-06, "loss": 0.3414, "step": 20560 }, { "epoch": 30.93, "grad_norm": 5.853148460388184, "learning_rate": 6.906766917293234e-06, "loss": 0.356, "step": 20570 }, { "epoch": 30.95, "grad_norm": 5.502560615539551, "learning_rate": 6.905263157894737e-06, "loss": 0.2357, "step": 20580 }, { "epoch": 30.96, "grad_norm": 3.3541431427001953, "learning_rate": 6.903759398496241e-06, "loss": 0.2527, "step": 20590 }, { "epoch": 30.98, "grad_norm": 20.790668487548828, "learning_rate": 6.902255639097745e-06, "loss": 0.3085, "step": 20600 }, { "epoch": 30.99, "grad_norm": 6.3316216468811035, "learning_rate": 6.900751879699249e-06, "loss": 0.2031, "step": 20610 }, { "epoch": 31.0, "eval_accuracy": 0.9335, "eval_loss": 0.2801915109157562, "eval_runtime": 84.9003, "eval_samples_per_second": 117.785, "eval_steps_per_second": 0.471, "step": 20615 }, { "epoch": 31.01, "grad_norm": 7.1735148429870605, "learning_rate": 6.899248120300753e-06, "loss": 0.2616, "step": 20620 }, { "epoch": 31.02, "grad_norm": 3.6667120456695557, "learning_rate": 6.897744360902256e-06, "loss": 0.354, "step": 20630 }, { "epoch": 31.04, "grad_norm": 5.409661293029785, "learning_rate": 6.89624060150376e-06, "loss": 0.323, "step": 20640 }, { "epoch": 31.05, "grad_norm": 4.91942834854126, "learning_rate": 6.894736842105264e-06, "loss": 0.3082, "step": 20650 }, { "epoch": 31.07, "grad_norm": 7.898626804351807, "learning_rate": 6.8932330827067675e-06, "loss": 0.2154, "step": 20660 }, { "epoch": 31.08, "grad_norm": 5.627191543579102, "learning_rate": 6.891729323308272e-06, "loss": 0.3336, "step": 20670 }, { "epoch": 31.1, "grad_norm": 6.512294769287109, "learning_rate": 6.8902255639097746e-06, "loss": 0.2925, "step": 20680 }, { "epoch": 31.11, "grad_norm": 5.055330276489258, "learning_rate": 6.888721804511279e-06, "loss": 0.195, "step": 20690 }, { "epoch": 31.13, "grad_norm": 4.006707191467285, "learning_rate": 6.8872180451127825e-06, "loss": 0.2374, "step": 20700 }, { "epoch": 31.14, "grad_norm": 6.585967063903809, "learning_rate": 6.885714285714287e-06, "loss": 0.2938, "step": 20710 }, { "epoch": 31.16, "grad_norm": 7.993644714355469, "learning_rate": 6.8842105263157895e-06, "loss": 0.2862, "step": 20720 }, { "epoch": 31.17, "grad_norm": 6.300648212432861, "learning_rate": 6.882706766917294e-06, "loss": 0.3122, "step": 20730 }, { "epoch": 31.19, "grad_norm": 6.135032653808594, "learning_rate": 6.881203007518797e-06, "loss": 0.2494, "step": 20740 }, { "epoch": 31.2, "grad_norm": 3.280155658721924, "learning_rate": 6.879699248120302e-06, "loss": 0.3043, "step": 20750 }, { "epoch": 31.22, "grad_norm": 6.118671417236328, "learning_rate": 6.878195488721805e-06, "loss": 0.2779, "step": 20760 }, { "epoch": 31.23, "grad_norm": 8.142518043518066, "learning_rate": 6.87669172932331e-06, "loss": 0.3617, "step": 20770 }, { "epoch": 31.25, "grad_norm": 5.192366123199463, "learning_rate": 6.875187969924812e-06, "loss": 0.3242, "step": 20780 }, { "epoch": 31.26, "grad_norm": 5.72282075881958, "learning_rate": 6.873684210526317e-06, "loss": 0.2499, "step": 20790 }, { "epoch": 31.28, "grad_norm": 6.699811935424805, "learning_rate": 6.87218045112782e-06, "loss": 0.2588, "step": 20800 }, { "epoch": 31.29, "grad_norm": 5.5423173904418945, "learning_rate": 6.8706766917293246e-06, "loss": 0.3162, "step": 20810 }, { "epoch": 31.31, "grad_norm": 6.422053813934326, "learning_rate": 6.869172932330827e-06, "loss": 0.2665, "step": 20820 }, { "epoch": 31.32, "grad_norm": 6.014066219329834, "learning_rate": 6.867669172932332e-06, "loss": 0.2328, "step": 20830 }, { "epoch": 31.34, "grad_norm": 13.063108444213867, "learning_rate": 6.866165413533835e-06, "loss": 0.2635, "step": 20840 }, { "epoch": 31.35, "grad_norm": 5.6524882316589355, "learning_rate": 6.864661654135339e-06, "loss": 0.3334, "step": 20850 }, { "epoch": 31.37, "grad_norm": 4.057956218719482, "learning_rate": 6.863157894736843e-06, "loss": 0.3186, "step": 20860 }, { "epoch": 31.38, "grad_norm": 4.220489025115967, "learning_rate": 6.861654135338346e-06, "loss": 0.2551, "step": 20870 }, { "epoch": 31.4, "grad_norm": 7.07074499130249, "learning_rate": 6.86015037593985e-06, "loss": 0.2236, "step": 20880 }, { "epoch": 31.41, "grad_norm": 4.640635967254639, "learning_rate": 6.8586466165413536e-06, "loss": 0.2972, "step": 20890 }, { "epoch": 31.43, "grad_norm": 11.102641105651855, "learning_rate": 6.857142857142858e-06, "loss": 0.2724, "step": 20900 }, { "epoch": 31.44, "grad_norm": 8.299273490905762, "learning_rate": 6.855639097744361e-06, "loss": 0.267, "step": 20910 }, { "epoch": 31.46, "grad_norm": 6.167851448059082, "learning_rate": 6.854135338345865e-06, "loss": 0.2695, "step": 20920 }, { "epoch": 31.47, "grad_norm": 6.481257438659668, "learning_rate": 6.8526315789473685e-06, "loss": 0.2886, "step": 20930 }, { "epoch": 31.49, "grad_norm": 6.720365047454834, "learning_rate": 6.851127819548873e-06, "loss": 0.2833, "step": 20940 }, { "epoch": 31.5, "grad_norm": 6.032981872558594, "learning_rate": 6.849624060150376e-06, "loss": 0.2235, "step": 20950 }, { "epoch": 31.52, "grad_norm": 7.538634300231934, "learning_rate": 6.848120300751881e-06, "loss": 0.261, "step": 20960 }, { "epoch": 31.53, "grad_norm": 3.033374071121216, "learning_rate": 6.846616541353383e-06, "loss": 0.2572, "step": 20970 }, { "epoch": 31.55, "grad_norm": 4.783783435821533, "learning_rate": 6.845112781954888e-06, "loss": 0.268, "step": 20980 }, { "epoch": 31.56, "grad_norm": 5.95822811126709, "learning_rate": 6.843609022556391e-06, "loss": 0.265, "step": 20990 }, { "epoch": 31.58, "grad_norm": 8.217907905578613, "learning_rate": 6.842105263157896e-06, "loss": 0.3141, "step": 21000 }, { "epoch": 31.59, "grad_norm": 9.597149848937988, "learning_rate": 6.840601503759398e-06, "loss": 0.2925, "step": 21010 }, { "epoch": 31.61, "grad_norm": 7.296209812164307, "learning_rate": 6.839097744360903e-06, "loss": 0.3045, "step": 21020 }, { "epoch": 31.62, "grad_norm": 5.84061336517334, "learning_rate": 6.837593984962406e-06, "loss": 0.3293, "step": 21030 }, { "epoch": 31.64, "grad_norm": 4.444825172424316, "learning_rate": 6.8360902255639106e-06, "loss": 0.2902, "step": 21040 }, { "epoch": 31.65, "grad_norm": 12.595419883728027, "learning_rate": 6.834586466165414e-06, "loss": 0.253, "step": 21050 }, { "epoch": 31.67, "grad_norm": 8.716811180114746, "learning_rate": 6.8330827067669185e-06, "loss": 0.2794, "step": 21060 }, { "epoch": 31.68, "grad_norm": 6.067722320556641, "learning_rate": 6.831578947368421e-06, "loss": 0.2841, "step": 21070 }, { "epoch": 31.7, "grad_norm": 4.765297889709473, "learning_rate": 6.8300751879699255e-06, "loss": 0.2732, "step": 21080 }, { "epoch": 31.71, "grad_norm": 7.085923671722412, "learning_rate": 6.828571428571429e-06, "loss": 0.2453, "step": 21090 }, { "epoch": 31.73, "grad_norm": 4.44352912902832, "learning_rate": 6.827067669172933e-06, "loss": 0.3153, "step": 21100 }, { "epoch": 31.74, "grad_norm": 6.792245864868164, "learning_rate": 6.825563909774436e-06, "loss": 0.2952, "step": 21110 }, { "epoch": 31.76, "grad_norm": 4.471166133880615, "learning_rate": 6.82406015037594e-06, "loss": 0.282, "step": 21120 }, { "epoch": 31.77, "grad_norm": 57.37477493286133, "learning_rate": 6.822556390977444e-06, "loss": 0.2059, "step": 21130 }, { "epoch": 31.79, "grad_norm": 4.992650985717773, "learning_rate": 6.821052631578948e-06, "loss": 0.2746, "step": 21140 }, { "epoch": 31.8, "grad_norm": 4.329148292541504, "learning_rate": 6.819548872180452e-06, "loss": 0.2674, "step": 21150 }, { "epoch": 31.82, "grad_norm": 4.691008567810059, "learning_rate": 6.818045112781956e-06, "loss": 0.3004, "step": 21160 }, { "epoch": 31.83, "grad_norm": 7.85280704498291, "learning_rate": 6.816541353383459e-06, "loss": 0.2769, "step": 21170 }, { "epoch": 31.85, "grad_norm": 7.473185062408447, "learning_rate": 6.815037593984963e-06, "loss": 0.3217, "step": 21180 }, { "epoch": 31.86, "grad_norm": 5.13551664352417, "learning_rate": 6.813533834586467e-06, "loss": 0.2425, "step": 21190 }, { "epoch": 31.88, "grad_norm": 4.801725387573242, "learning_rate": 6.812030075187971e-06, "loss": 0.3008, "step": 21200 }, { "epoch": 31.89, "grad_norm": 6.320078372955322, "learning_rate": 6.810526315789474e-06, "loss": 0.2616, "step": 21210 }, { "epoch": 31.91, "grad_norm": 7.002920150756836, "learning_rate": 6.809022556390978e-06, "loss": 0.2647, "step": 21220 }, { "epoch": 31.92, "grad_norm": 7.263726711273193, "learning_rate": 6.807518796992482e-06, "loss": 0.3434, "step": 21230 }, { "epoch": 31.94, "grad_norm": 6.2287139892578125, "learning_rate": 6.806015037593986e-06, "loss": 0.2775, "step": 21240 }, { "epoch": 31.95, "grad_norm": 3.209961175918579, "learning_rate": 6.8045112781954896e-06, "loss": 0.2689, "step": 21250 }, { "epoch": 31.97, "grad_norm": 5.191007614135742, "learning_rate": 6.803007518796994e-06, "loss": 0.2824, "step": 21260 }, { "epoch": 31.98, "grad_norm": 7.311644077301025, "learning_rate": 6.801503759398497e-06, "loss": 0.3207, "step": 21270 }, { "epoch": 32.0, "grad_norm": 33.7657356262207, "learning_rate": 6.800000000000001e-06, "loss": 0.3268, "step": 21280 }, { "epoch": 32.0, "eval_accuracy": 0.9312, "eval_loss": 0.2803204655647278, "eval_runtime": 84.7157, "eval_samples_per_second": 118.042, "eval_steps_per_second": 0.472, "step": 21280 }, { "epoch": 32.02, "grad_norm": 8.19914722442627, "learning_rate": 6.7984962406015045e-06, "loss": 0.3492, "step": 21290 }, { "epoch": 32.03, "grad_norm": 5.234302043914795, "learning_rate": 6.796992481203009e-06, "loss": 0.2722, "step": 21300 }, { "epoch": 32.05, "grad_norm": 5.019562244415283, "learning_rate": 6.7954887218045115e-06, "loss": 0.245, "step": 21310 }, { "epoch": 32.06, "grad_norm": 4.851130962371826, "learning_rate": 6.793984962406016e-06, "loss": 0.3073, "step": 21320 }, { "epoch": 32.08, "grad_norm": 4.4718241691589355, "learning_rate": 6.792481203007519e-06, "loss": 0.2929, "step": 21330 }, { "epoch": 32.09, "grad_norm": 4.94941520690918, "learning_rate": 6.790977443609023e-06, "loss": 0.2941, "step": 21340 }, { "epoch": 32.11, "grad_norm": 5.517939567565918, "learning_rate": 6.789473684210527e-06, "loss": 0.2528, "step": 21350 }, { "epoch": 32.12, "grad_norm": 6.751891136169434, "learning_rate": 6.78796992481203e-06, "loss": 0.3063, "step": 21360 }, { "epoch": 32.14, "grad_norm": 6.197683811187744, "learning_rate": 6.786466165413534e-06, "loss": 0.2686, "step": 21370 }, { "epoch": 32.15, "grad_norm": 5.212826728820801, "learning_rate": 6.784962406015038e-06, "loss": 0.2153, "step": 21380 }, { "epoch": 32.17, "grad_norm": 6.147881984710693, "learning_rate": 6.783458646616542e-06, "loss": 0.3603, "step": 21390 }, { "epoch": 32.18, "grad_norm": 6.068096160888672, "learning_rate": 6.781954887218045e-06, "loss": 0.3144, "step": 21400 }, { "epoch": 32.2, "grad_norm": 6.553431987762451, "learning_rate": 6.780451127819549e-06, "loss": 0.2902, "step": 21410 }, { "epoch": 32.21, "grad_norm": 6.4035515785217285, "learning_rate": 6.778947368421053e-06, "loss": 0.2156, "step": 21420 }, { "epoch": 32.23, "grad_norm": 4.220142841339111, "learning_rate": 6.777443609022557e-06, "loss": 0.2905, "step": 21430 }, { "epoch": 32.24, "grad_norm": 4.17386531829834, "learning_rate": 6.775939849624061e-06, "loss": 0.3002, "step": 21440 }, { "epoch": 32.26, "grad_norm": 3.2023849487304688, "learning_rate": 6.774436090225564e-06, "loss": 0.3396, "step": 21450 }, { "epoch": 32.27, "grad_norm": 2.8168628215789795, "learning_rate": 6.772932330827068e-06, "loss": 0.248, "step": 21460 }, { "epoch": 32.29, "grad_norm": 4.326303482055664, "learning_rate": 6.771428571428572e-06, "loss": 0.2821, "step": 21470 }, { "epoch": 32.3, "grad_norm": 3.3244757652282715, "learning_rate": 6.769924812030076e-06, "loss": 0.2907, "step": 21480 }, { "epoch": 32.32, "grad_norm": 5.797328948974609, "learning_rate": 6.76842105263158e-06, "loss": 0.2961, "step": 21490 }, { "epoch": 32.33, "grad_norm": 3.571841239929199, "learning_rate": 6.766917293233083e-06, "loss": 0.1967, "step": 21500 }, { "epoch": 32.35, "grad_norm": 5.261970520019531, "learning_rate": 6.765413533834587e-06, "loss": 0.241, "step": 21510 }, { "epoch": 32.36, "grad_norm": 6.464574813842773, "learning_rate": 6.7639097744360905e-06, "loss": 0.2727, "step": 21520 }, { "epoch": 32.38, "grad_norm": 4.337974548339844, "learning_rate": 6.762406015037595e-06, "loss": 0.2508, "step": 21530 }, { "epoch": 32.39, "grad_norm": 3.5019898414611816, "learning_rate": 6.760902255639098e-06, "loss": 0.2173, "step": 21540 }, { "epoch": 32.41, "grad_norm": 7.147461891174316, "learning_rate": 6.759398496240602e-06, "loss": 0.3561, "step": 21550 }, { "epoch": 32.42, "grad_norm": 5.347845077514648, "learning_rate": 6.7578947368421054e-06, "loss": 0.2572, "step": 21560 }, { "epoch": 32.44, "grad_norm": 3.541206121444702, "learning_rate": 6.75639097744361e-06, "loss": 0.266, "step": 21570 }, { "epoch": 32.45, "grad_norm": 4.880330562591553, "learning_rate": 6.754887218045113e-06, "loss": 0.2968, "step": 21580 }, { "epoch": 32.47, "grad_norm": 6.621898174285889, "learning_rate": 6.753383458646618e-06, "loss": 0.1805, "step": 21590 }, { "epoch": 32.48, "grad_norm": 7.193774700164795, "learning_rate": 6.75187969924812e-06, "loss": 0.2282, "step": 21600 }, { "epoch": 32.5, "grad_norm": 5.0164408683776855, "learning_rate": 6.750375939849625e-06, "loss": 0.3077, "step": 21610 }, { "epoch": 32.51, "grad_norm": 8.061211585998535, "learning_rate": 6.748872180451128e-06, "loss": 0.2758, "step": 21620 }, { "epoch": 32.53, "grad_norm": 6.3904852867126465, "learning_rate": 6.747368421052633e-06, "loss": 0.2499, "step": 21630 }, { "epoch": 32.54, "grad_norm": 8.771563529968262, "learning_rate": 6.745864661654135e-06, "loss": 0.3302, "step": 21640 }, { "epoch": 32.56, "grad_norm": 7.61922550201416, "learning_rate": 6.74436090225564e-06, "loss": 0.2691, "step": 21650 }, { "epoch": 32.57, "grad_norm": 7.187370300292969, "learning_rate": 6.742857142857143e-06, "loss": 0.3071, "step": 21660 }, { "epoch": 32.59, "grad_norm": 3.4237334728240967, "learning_rate": 6.7413533834586475e-06, "loss": 0.296, "step": 21670 }, { "epoch": 32.6, "grad_norm": 4.828934192657471, "learning_rate": 6.739849624060151e-06, "loss": 0.3158, "step": 21680 }, { "epoch": 32.62, "grad_norm": 5.043696880340576, "learning_rate": 6.738345864661655e-06, "loss": 0.2656, "step": 21690 }, { "epoch": 32.63, "grad_norm": 6.4835004806518555, "learning_rate": 6.736842105263158e-06, "loss": 0.2796, "step": 21700 }, { "epoch": 32.65, "grad_norm": 2.3352560997009277, "learning_rate": 6.7353383458646624e-06, "loss": 0.294, "step": 21710 }, { "epoch": 32.66, "grad_norm": 7.649822235107422, "learning_rate": 6.733834586466166e-06, "loss": 0.2857, "step": 21720 }, { "epoch": 32.68, "grad_norm": 5.9862565994262695, "learning_rate": 6.73233082706767e-06, "loss": 0.2401, "step": 21730 }, { "epoch": 32.69, "grad_norm": 5.723448276519775, "learning_rate": 6.730827067669173e-06, "loss": 0.3426, "step": 21740 }, { "epoch": 32.71, "grad_norm": 2.8246076107025146, "learning_rate": 6.729323308270677e-06, "loss": 0.2431, "step": 21750 }, { "epoch": 32.72, "grad_norm": 7.292664527893066, "learning_rate": 6.727819548872181e-06, "loss": 0.2624, "step": 21760 }, { "epoch": 32.74, "grad_norm": 4.674510478973389, "learning_rate": 6.726315789473685e-06, "loss": 0.3025, "step": 21770 }, { "epoch": 32.75, "grad_norm": 4.479643821716309, "learning_rate": 6.724812030075189e-06, "loss": 0.2175, "step": 21780 }, { "epoch": 32.77, "grad_norm": 4.182172775268555, "learning_rate": 6.723308270676693e-06, "loss": 0.3039, "step": 21790 }, { "epoch": 32.78, "grad_norm": 5.148652076721191, "learning_rate": 6.721804511278196e-06, "loss": 0.2735, "step": 21800 }, { "epoch": 32.8, "grad_norm": 4.162076950073242, "learning_rate": 6.7203007518797e-06, "loss": 0.2652, "step": 21810 }, { "epoch": 32.81, "grad_norm": 5.020504474639893, "learning_rate": 6.718796992481204e-06, "loss": 0.3151, "step": 21820 }, { "epoch": 32.83, "grad_norm": 3.3114843368530273, "learning_rate": 6.717293233082708e-06, "loss": 0.3088, "step": 21830 }, { "epoch": 32.84, "grad_norm": 7.768105983734131, "learning_rate": 6.715789473684211e-06, "loss": 0.2574, "step": 21840 }, { "epoch": 32.86, "grad_norm": 5.251821517944336, "learning_rate": 6.714285714285714e-06, "loss": 0.2308, "step": 21850 }, { "epoch": 32.87, "grad_norm": 3.617433786392212, "learning_rate": 6.712781954887219e-06, "loss": 0.2609, "step": 21860 }, { "epoch": 32.89, "grad_norm": 7.9688615798950195, "learning_rate": 6.711278195488722e-06, "loss": 0.2844, "step": 21870 }, { "epoch": 32.9, "grad_norm": 3.2969768047332764, "learning_rate": 6.7097744360902265e-06, "loss": 0.2347, "step": 21880 }, { "epoch": 32.92, "grad_norm": 7.453803539276123, "learning_rate": 6.708270676691729e-06, "loss": 0.2808, "step": 21890 }, { "epoch": 32.93, "grad_norm": 4.9906744956970215, "learning_rate": 6.7067669172932335e-06, "loss": 0.2362, "step": 21900 }, { "epoch": 32.95, "grad_norm": 3.7684195041656494, "learning_rate": 6.705263157894737e-06, "loss": 0.2816, "step": 21910 }, { "epoch": 32.96, "grad_norm": 4.246355056762695, "learning_rate": 6.7037593984962414e-06, "loss": 0.2608, "step": 21920 }, { "epoch": 32.98, "grad_norm": 10.584210395812988, "learning_rate": 6.702255639097744e-06, "loss": 0.3571, "step": 21930 }, { "epoch": 32.99, "grad_norm": 7.687515735626221, "learning_rate": 6.7007518796992485e-06, "loss": 0.218, "step": 21940 }, { "epoch": 33.0, "eval_accuracy": 0.9307, "eval_loss": 0.28834185004234314, "eval_runtime": 84.8678, "eval_samples_per_second": 117.83, "eval_steps_per_second": 0.471, "step": 21945 }, { "epoch": 33.01, "grad_norm": 2.6484274864196777, "learning_rate": 6.699248120300752e-06, "loss": 0.2635, "step": 21950 }, { "epoch": 33.02, "grad_norm": 6.136809825897217, "learning_rate": 6.697744360902256e-06, "loss": 0.2438, "step": 21960 }, { "epoch": 33.04, "grad_norm": 2.4499707221984863, "learning_rate": 6.69624060150376e-06, "loss": 0.2448, "step": 21970 }, { "epoch": 33.05, "grad_norm": 8.61922550201416, "learning_rate": 6.694736842105264e-06, "loss": 0.2895, "step": 21980 }, { "epoch": 33.07, "grad_norm": 4.760676383972168, "learning_rate": 6.693233082706767e-06, "loss": 0.2138, "step": 21990 }, { "epoch": 33.08, "grad_norm": 4.3553853034973145, "learning_rate": 6.691729323308271e-06, "loss": 0.3182, "step": 22000 }, { "epoch": 33.1, "grad_norm": 6.192696571350098, "learning_rate": 6.690225563909775e-06, "loss": 0.2446, "step": 22010 }, { "epoch": 33.11, "grad_norm": 8.466830253601074, "learning_rate": 6.688721804511279e-06, "loss": 0.3202, "step": 22020 }, { "epoch": 33.13, "grad_norm": 5.688916206359863, "learning_rate": 6.687218045112782e-06, "loss": 0.3136, "step": 22030 }, { "epoch": 33.14, "grad_norm": 7.089521408081055, "learning_rate": 6.685714285714286e-06, "loss": 0.3003, "step": 22040 }, { "epoch": 33.16, "grad_norm": 5.240951061248779, "learning_rate": 6.68421052631579e-06, "loss": 0.2772, "step": 22050 }, { "epoch": 33.17, "grad_norm": 4.90128755569458, "learning_rate": 6.682706766917294e-06, "loss": 0.2949, "step": 22060 }, { "epoch": 33.19, "grad_norm": 1.9220607280731201, "learning_rate": 6.681203007518798e-06, "loss": 0.2716, "step": 22070 }, { "epoch": 33.2, "grad_norm": 11.761991500854492, "learning_rate": 6.679699248120302e-06, "loss": 0.2519, "step": 22080 }, { "epoch": 33.22, "grad_norm": 3.534703016281128, "learning_rate": 6.678195488721805e-06, "loss": 0.2398, "step": 22090 }, { "epoch": 33.23, "grad_norm": 5.2441887855529785, "learning_rate": 6.676691729323309e-06, "loss": 0.2638, "step": 22100 }, { "epoch": 33.25, "grad_norm": 5.488763809204102, "learning_rate": 6.6751879699248125e-06, "loss": 0.2381, "step": 22110 }, { "epoch": 33.26, "grad_norm": 4.989120006561279, "learning_rate": 6.673684210526317e-06, "loss": 0.2582, "step": 22120 }, { "epoch": 33.28, "grad_norm": 5.641976833343506, "learning_rate": 6.6721804511278196e-06, "loss": 0.2608, "step": 22130 }, { "epoch": 33.29, "grad_norm": 5.41449499130249, "learning_rate": 6.670676691729324e-06, "loss": 0.3076, "step": 22140 }, { "epoch": 33.31, "grad_norm": 5.977665424346924, "learning_rate": 6.6691729323308274e-06, "loss": 0.2189, "step": 22150 }, { "epoch": 33.32, "grad_norm": 6.3149213790893555, "learning_rate": 6.667669172932332e-06, "loss": 0.3281, "step": 22160 }, { "epoch": 33.34, "grad_norm": 2.910731077194214, "learning_rate": 6.666165413533835e-06, "loss": 0.2563, "step": 22170 }, { "epoch": 33.35, "grad_norm": 5.496181011199951, "learning_rate": 6.664661654135339e-06, "loss": 0.2087, "step": 22180 }, { "epoch": 33.37, "grad_norm": 4.933781623840332, "learning_rate": 6.663157894736842e-06, "loss": 0.2249, "step": 22190 }, { "epoch": 33.38, "grad_norm": 6.685660362243652, "learning_rate": 6.661654135338347e-06, "loss": 0.3751, "step": 22200 }, { "epoch": 33.4, "grad_norm": 3.7350828647613525, "learning_rate": 6.66015037593985e-06, "loss": 0.2126, "step": 22210 }, { "epoch": 33.41, "grad_norm": 7.765669822692871, "learning_rate": 6.658646616541355e-06, "loss": 0.3009, "step": 22220 }, { "epoch": 33.43, "grad_norm": 5.381826877593994, "learning_rate": 6.657142857142857e-06, "loss": 0.3344, "step": 22230 }, { "epoch": 33.44, "grad_norm": 3.3078157901763916, "learning_rate": 6.655639097744362e-06, "loss": 0.3077, "step": 22240 }, { "epoch": 33.46, "grad_norm": 3.342693328857422, "learning_rate": 6.654135338345865e-06, "loss": 0.2235, "step": 22250 }, { "epoch": 33.47, "grad_norm": 5.725019454956055, "learning_rate": 6.6526315789473695e-06, "loss": 0.3253, "step": 22260 }, { "epoch": 33.49, "grad_norm": 7.685015678405762, "learning_rate": 6.651127819548873e-06, "loss": 0.2343, "step": 22270 }, { "epoch": 33.5, "grad_norm": 3.291060447692871, "learning_rate": 6.649624060150377e-06, "loss": 0.2907, "step": 22280 }, { "epoch": 33.52, "grad_norm": 5.01746940612793, "learning_rate": 6.64812030075188e-06, "loss": 0.2124, "step": 22290 }, { "epoch": 33.53, "grad_norm": 8.099120140075684, "learning_rate": 6.6466165413533845e-06, "loss": 0.2862, "step": 22300 }, { "epoch": 33.55, "grad_norm": 6.425881862640381, "learning_rate": 6.645112781954888e-06, "loss": 0.216, "step": 22310 }, { "epoch": 33.56, "grad_norm": 6.16859769821167, "learning_rate": 6.643609022556392e-06, "loss": 0.3214, "step": 22320 }, { "epoch": 33.58, "grad_norm": 4.426242828369141, "learning_rate": 6.642105263157895e-06, "loss": 0.2471, "step": 22330 }, { "epoch": 33.59, "grad_norm": 4.199456691741943, "learning_rate": 6.6406015037593985e-06, "loss": 0.2696, "step": 22340 }, { "epoch": 33.61, "grad_norm": 4.16933536529541, "learning_rate": 6.639097744360903e-06, "loss": 0.1926, "step": 22350 }, { "epoch": 33.62, "grad_norm": 7.852532863616943, "learning_rate": 6.6375939849624064e-06, "loss": 0.2983, "step": 22360 }, { "epoch": 33.64, "grad_norm": 5.3104119300842285, "learning_rate": 6.63609022556391e-06, "loss": 0.2412, "step": 22370 }, { "epoch": 33.65, "grad_norm": 4.502700328826904, "learning_rate": 6.6345864661654135e-06, "loss": 0.3023, "step": 22380 }, { "epoch": 33.67, "grad_norm": 4.96920108795166, "learning_rate": 6.633082706766918e-06, "loss": 0.2634, "step": 22390 }, { "epoch": 33.68, "grad_norm": 7.31613826751709, "learning_rate": 6.631578947368421e-06, "loss": 0.3551, "step": 22400 }, { "epoch": 33.7, "grad_norm": 4.635395526885986, "learning_rate": 6.630075187969926e-06, "loss": 0.2552, "step": 22410 }, { "epoch": 33.71, "grad_norm": 7.699193477630615, "learning_rate": 6.628571428571428e-06, "loss": 0.2955, "step": 22420 }, { "epoch": 33.73, "grad_norm": 5.194084644317627, "learning_rate": 6.627067669172933e-06, "loss": 0.2743, "step": 22430 }, { "epoch": 33.74, "grad_norm": 9.023811340332031, "learning_rate": 6.625563909774436e-06, "loss": 0.2817, "step": 22440 }, { "epoch": 33.76, "grad_norm": 4.306658744812012, "learning_rate": 6.624060150375941e-06, "loss": 0.3008, "step": 22450 }, { "epoch": 33.77, "grad_norm": 2.301297187805176, "learning_rate": 6.622556390977443e-06, "loss": 0.2354, "step": 22460 }, { "epoch": 33.79, "grad_norm": 4.28262186050415, "learning_rate": 6.621052631578948e-06, "loss": 0.2656, "step": 22470 }, { "epoch": 33.8, "grad_norm": 6.531603813171387, "learning_rate": 6.619548872180451e-06, "loss": 0.3753, "step": 22480 }, { "epoch": 33.82, "grad_norm": 8.290949821472168, "learning_rate": 6.6180451127819556e-06, "loss": 0.2842, "step": 22490 }, { "epoch": 33.83, "grad_norm": 3.5719752311706543, "learning_rate": 6.616541353383459e-06, "loss": 0.2617, "step": 22500 }, { "epoch": 33.85, "grad_norm": 6.209147930145264, "learning_rate": 6.6150375939849635e-06, "loss": 0.3299, "step": 22510 }, { "epoch": 33.86, "grad_norm": 3.604736566543579, "learning_rate": 6.613533834586466e-06, "loss": 0.1786, "step": 22520 }, { "epoch": 33.88, "grad_norm": 8.47739028930664, "learning_rate": 6.6120300751879705e-06, "loss": 0.3399, "step": 22530 }, { "epoch": 33.89, "grad_norm": 4.806243896484375, "learning_rate": 6.610526315789474e-06, "loss": 0.2474, "step": 22540 }, { "epoch": 33.91, "grad_norm": 4.551697731018066, "learning_rate": 6.609022556390978e-06, "loss": 0.2451, "step": 22550 }, { "epoch": 33.92, "grad_norm": 4.965244293212891, "learning_rate": 6.607518796992481e-06, "loss": 0.2971, "step": 22560 }, { "epoch": 33.94, "grad_norm": 4.888105392456055, "learning_rate": 6.606015037593985e-06, "loss": 0.3368, "step": 22570 }, { "epoch": 33.95, "grad_norm": 7.72111701965332, "learning_rate": 6.604511278195489e-06, "loss": 0.2713, "step": 22580 }, { "epoch": 33.97, "grad_norm": 4.930686950683594, "learning_rate": 6.603007518796993e-06, "loss": 0.2445, "step": 22590 }, { "epoch": 33.98, "grad_norm": 6.420777797698975, "learning_rate": 6.601503759398497e-06, "loss": 0.2183, "step": 22600 }, { "epoch": 34.0, "grad_norm": 0.24922990798950195, "learning_rate": 6.600000000000001e-06, "loss": 0.217, "step": 22610 }, { "epoch": 34.0, "eval_accuracy": 0.9356, "eval_loss": 0.2865731716156006, "eval_runtime": 84.2017, "eval_samples_per_second": 118.763, "eval_steps_per_second": 0.475, "step": 22610 }, { "epoch": 34.02, "grad_norm": 5.616089820861816, "learning_rate": 6.598496240601504e-06, "loss": 0.3432, "step": 22620 }, { "epoch": 34.03, "grad_norm": 1.8118317127227783, "learning_rate": 6.596992481203008e-06, "loss": 0.2664, "step": 22630 }, { "epoch": 34.05, "grad_norm": 5.037443161010742, "learning_rate": 6.595488721804512e-06, "loss": 0.27, "step": 22640 }, { "epoch": 34.06, "grad_norm": 3.1154747009277344, "learning_rate": 6.593984962406016e-06, "loss": 0.2335, "step": 22650 }, { "epoch": 34.08, "grad_norm": 5.144960880279541, "learning_rate": 6.592481203007519e-06, "loss": 0.3204, "step": 22660 }, { "epoch": 34.09, "grad_norm": 5.48844575881958, "learning_rate": 6.590977443609023e-06, "loss": 0.3072, "step": 22670 }, { "epoch": 34.11, "grad_norm": 8.889720916748047, "learning_rate": 6.589473684210527e-06, "loss": 0.2398, "step": 22680 }, { "epoch": 34.12, "grad_norm": 4.211205959320068, "learning_rate": 6.587969924812031e-06, "loss": 0.2425, "step": 22690 }, { "epoch": 34.14, "grad_norm": 6.309680938720703, "learning_rate": 6.5864661654135345e-06, "loss": 0.2828, "step": 22700 }, { "epoch": 34.15, "grad_norm": 5.038494110107422, "learning_rate": 6.584962406015039e-06, "loss": 0.2633, "step": 22710 }, { "epoch": 34.17, "grad_norm": 1.1278971433639526, "learning_rate": 6.583458646616542e-06, "loss": 0.2935, "step": 22720 }, { "epoch": 34.18, "grad_norm": 5.858100891113281, "learning_rate": 6.581954887218046e-06, "loss": 0.2771, "step": 22730 }, { "epoch": 34.2, "grad_norm": 6.1508402824401855, "learning_rate": 6.5804511278195495e-06, "loss": 0.3256, "step": 22740 }, { "epoch": 34.21, "grad_norm": 5.791116237640381, "learning_rate": 6.578947368421054e-06, "loss": 0.2354, "step": 22750 }, { "epoch": 34.23, "grad_norm": 6.087039470672607, "learning_rate": 6.5774436090225565e-06, "loss": 0.2098, "step": 22760 }, { "epoch": 34.24, "grad_norm": 6.835604667663574, "learning_rate": 6.575939849624061e-06, "loss": 0.285, "step": 22770 }, { "epoch": 34.26, "grad_norm": 6.213393688201904, "learning_rate": 6.574436090225564e-06, "loss": 0.2488, "step": 22780 }, { "epoch": 34.27, "grad_norm": 6.763580322265625, "learning_rate": 6.572932330827069e-06, "loss": 0.2043, "step": 22790 }, { "epoch": 34.29, "grad_norm": 5.975349426269531, "learning_rate": 6.571428571428572e-06, "loss": 0.3282, "step": 22800 }, { "epoch": 34.3, "grad_norm": 5.7898406982421875, "learning_rate": 6.569924812030077e-06, "loss": 0.2369, "step": 22810 }, { "epoch": 34.32, "grad_norm": 3.7875592708587646, "learning_rate": 6.568421052631579e-06, "loss": 0.3036, "step": 22820 }, { "epoch": 34.33, "grad_norm": 7.54277229309082, "learning_rate": 6.566917293233083e-06, "loss": 0.2441, "step": 22830 }, { "epoch": 34.35, "grad_norm": 5.216723442077637, "learning_rate": 6.565413533834587e-06, "loss": 0.2537, "step": 22840 }, { "epoch": 34.36, "grad_norm": 5.119565486907959, "learning_rate": 6.56390977443609e-06, "loss": 0.3228, "step": 22850 }, { "epoch": 34.38, "grad_norm": 10.754874229431152, "learning_rate": 6.562406015037594e-06, "loss": 0.1848, "step": 22860 }, { "epoch": 34.39, "grad_norm": 8.560099601745605, "learning_rate": 6.560902255639098e-06, "loss": 0.2648, "step": 22870 }, { "epoch": 34.41, "grad_norm": 6.255738258361816, "learning_rate": 6.559398496240602e-06, "loss": 0.2562, "step": 22880 }, { "epoch": 34.42, "grad_norm": 4.35892391204834, "learning_rate": 6.557894736842106e-06, "loss": 0.2838, "step": 22890 }, { "epoch": 34.44, "grad_norm": 6.469654083251953, "learning_rate": 6.55639097744361e-06, "loss": 0.2885, "step": 22900 }, { "epoch": 34.45, "grad_norm": 7.00548791885376, "learning_rate": 6.554887218045113e-06, "loss": 0.327, "step": 22910 }, { "epoch": 34.47, "grad_norm": 6.044936180114746, "learning_rate": 6.553383458646617e-06, "loss": 0.2582, "step": 22920 }, { "epoch": 34.48, "grad_norm": 6.903324127197266, "learning_rate": 6.5518796992481206e-06, "loss": 0.2673, "step": 22930 }, { "epoch": 34.5, "grad_norm": 4.936728477478027, "learning_rate": 6.550375939849625e-06, "loss": 0.2469, "step": 22940 }, { "epoch": 34.51, "grad_norm": 5.086190223693848, "learning_rate": 6.548872180451128e-06, "loss": 0.2323, "step": 22950 }, { "epoch": 34.53, "grad_norm": 2.871715784072876, "learning_rate": 6.547368421052632e-06, "loss": 0.2158, "step": 22960 }, { "epoch": 34.54, "grad_norm": 4.698546886444092, "learning_rate": 6.5458646616541355e-06, "loss": 0.257, "step": 22970 }, { "epoch": 34.56, "grad_norm": 6.885629177093506, "learning_rate": 6.54436090225564e-06, "loss": 0.2187, "step": 22980 }, { "epoch": 34.57, "grad_norm": 3.128361463546753, "learning_rate": 6.542857142857143e-06, "loss": 0.2876, "step": 22990 }, { "epoch": 34.59, "grad_norm": 3.115068197250366, "learning_rate": 6.541353383458648e-06, "loss": 0.2565, "step": 23000 }, { "epoch": 34.6, "grad_norm": 4.966819763183594, "learning_rate": 6.53984962406015e-06, "loss": 0.3301, "step": 23010 }, { "epoch": 34.62, "grad_norm": 3.8072173595428467, "learning_rate": 6.538345864661655e-06, "loss": 0.2815, "step": 23020 }, { "epoch": 34.63, "grad_norm": 7.346180438995361, "learning_rate": 6.536842105263158e-06, "loss": 0.2456, "step": 23030 }, { "epoch": 34.65, "grad_norm": 5.6631317138671875, "learning_rate": 6.535338345864663e-06, "loss": 0.2629, "step": 23040 }, { "epoch": 34.66, "grad_norm": 4.316861629486084, "learning_rate": 6.533834586466165e-06, "loss": 0.218, "step": 23050 }, { "epoch": 34.68, "grad_norm": 5.202853202819824, "learning_rate": 6.53233082706767e-06, "loss": 0.3226, "step": 23060 }, { "epoch": 34.69, "grad_norm": 5.017209529876709, "learning_rate": 6.530827067669173e-06, "loss": 0.3087, "step": 23070 }, { "epoch": 34.71, "grad_norm": 9.907183647155762, "learning_rate": 6.529323308270678e-06, "loss": 0.2096, "step": 23080 }, { "epoch": 34.72, "grad_norm": 5.446949005126953, "learning_rate": 6.527819548872181e-06, "loss": 0.2725, "step": 23090 }, { "epoch": 34.74, "grad_norm": 5.2780537605285645, "learning_rate": 6.526315789473685e-06, "loss": 0.2429, "step": 23100 }, { "epoch": 34.75, "grad_norm": 3.5522546768188477, "learning_rate": 6.524812030075188e-06, "loss": 0.257, "step": 23110 }, { "epoch": 34.77, "grad_norm": 6.132695198059082, "learning_rate": 6.5233082706766925e-06, "loss": 0.3128, "step": 23120 }, { "epoch": 34.78, "grad_norm": 7.575121879577637, "learning_rate": 6.521804511278196e-06, "loss": 0.2529, "step": 23130 }, { "epoch": 34.8, "grad_norm": 6.6957244873046875, "learning_rate": 6.5203007518797e-06, "loss": 0.2612, "step": 23140 }, { "epoch": 34.81, "grad_norm": 4.827077865600586, "learning_rate": 6.518796992481203e-06, "loss": 0.307, "step": 23150 }, { "epoch": 34.83, "grad_norm": 7.132809638977051, "learning_rate": 6.5172932330827074e-06, "loss": 0.2111, "step": 23160 }, { "epoch": 34.84, "grad_norm": 3.0563974380493164, "learning_rate": 6.515789473684211e-06, "loss": 0.2332, "step": 23170 }, { "epoch": 34.86, "grad_norm": 5.9987592697143555, "learning_rate": 6.514285714285715e-06, "loss": 0.3418, "step": 23180 }, { "epoch": 34.87, "grad_norm": 4.280242443084717, "learning_rate": 6.512781954887219e-06, "loss": 0.2447, "step": 23190 }, { "epoch": 34.89, "grad_norm": 4.552205562591553, "learning_rate": 6.511278195488722e-06, "loss": 0.2697, "step": 23200 }, { "epoch": 34.9, "grad_norm": 5.720810890197754, "learning_rate": 6.509774436090226e-06, "loss": 0.3134, "step": 23210 }, { "epoch": 34.92, "grad_norm": 5.941717147827148, "learning_rate": 6.50827067669173e-06, "loss": 0.2535, "step": 23220 }, { "epoch": 34.93, "grad_norm": 5.756423473358154, "learning_rate": 6.506766917293234e-06, "loss": 0.2717, "step": 23230 }, { "epoch": 34.95, "grad_norm": 6.748666763305664, "learning_rate": 6.505263157894738e-06, "loss": 0.2522, "step": 23240 }, { "epoch": 34.96, "grad_norm": 3.929311990737915, "learning_rate": 6.503759398496241e-06, "loss": 0.3033, "step": 23250 }, { "epoch": 34.98, "grad_norm": 5.266348838806152, "learning_rate": 6.502255639097745e-06, "loss": 0.2428, "step": 23260 }, { "epoch": 34.99, "grad_norm": 6.238527774810791, "learning_rate": 6.500751879699249e-06, "loss": 0.2032, "step": 23270 }, { "epoch": 35.0, "eval_accuracy": 0.9317, "eval_loss": 0.29047271609306335, "eval_runtime": 84.6472, "eval_samples_per_second": 118.137, "eval_steps_per_second": 0.473, "step": 23275 }, { "epoch": 35.01, "grad_norm": 4.792929172515869, "learning_rate": 6.499248120300753e-06, "loss": 0.2373, "step": 23280 }, { "epoch": 35.02, "grad_norm": 5.885370254516602, "learning_rate": 6.497744360902256e-06, "loss": 0.2296, "step": 23290 }, { "epoch": 35.04, "grad_norm": 3.2437541484832764, "learning_rate": 6.49624060150376e-06, "loss": 0.3242, "step": 23300 }, { "epoch": 35.05, "grad_norm": 4.990633964538574, "learning_rate": 6.494736842105264e-06, "loss": 0.3038, "step": 23310 }, { "epoch": 35.07, "grad_norm": 4.416833877563477, "learning_rate": 6.493233082706768e-06, "loss": 0.2001, "step": 23320 }, { "epoch": 35.08, "grad_norm": 8.694164276123047, "learning_rate": 6.4917293233082715e-06, "loss": 0.2442, "step": 23330 }, { "epoch": 35.1, "grad_norm": 9.367162704467773, "learning_rate": 6.490225563909774e-06, "loss": 0.3088, "step": 23340 }, { "epoch": 35.11, "grad_norm": 3.4011213779449463, "learning_rate": 6.4887218045112785e-06, "loss": 0.2324, "step": 23350 }, { "epoch": 35.13, "grad_norm": 5.543578624725342, "learning_rate": 6.487218045112782e-06, "loss": 0.2571, "step": 23360 }, { "epoch": 35.14, "grad_norm": 6.542150974273682, "learning_rate": 6.485714285714286e-06, "loss": 0.233, "step": 23370 }, { "epoch": 35.16, "grad_norm": 3.572125196456909, "learning_rate": 6.484210526315789e-06, "loss": 0.317, "step": 23380 }, { "epoch": 35.17, "grad_norm": 6.729075908660889, "learning_rate": 6.4827067669172935e-06, "loss": 0.1965, "step": 23390 }, { "epoch": 35.19, "grad_norm": 5.777679920196533, "learning_rate": 6.481203007518797e-06, "loss": 0.2543, "step": 23400 }, { "epoch": 35.2, "grad_norm": 4.606062412261963, "learning_rate": 6.479699248120301e-06, "loss": 0.2478, "step": 23410 }, { "epoch": 35.22, "grad_norm": 4.347212791442871, "learning_rate": 6.478195488721805e-06, "loss": 0.2716, "step": 23420 }, { "epoch": 35.23, "grad_norm": 6.0750732421875, "learning_rate": 6.476691729323309e-06, "loss": 0.3095, "step": 23430 }, { "epoch": 35.25, "grad_norm": 3.2727339267730713, "learning_rate": 6.475187969924812e-06, "loss": 0.2904, "step": 23440 }, { "epoch": 35.26, "grad_norm": 10.178025245666504, "learning_rate": 6.473684210526316e-06, "loss": 0.2603, "step": 23450 }, { "epoch": 35.28, "grad_norm": 6.457759380340576, "learning_rate": 6.47218045112782e-06, "loss": 0.3092, "step": 23460 }, { "epoch": 35.29, "grad_norm": 1.357625126838684, "learning_rate": 6.470676691729324e-06, "loss": 0.1796, "step": 23470 }, { "epoch": 35.31, "grad_norm": 6.197760105133057, "learning_rate": 6.469172932330827e-06, "loss": 0.331, "step": 23480 }, { "epoch": 35.32, "grad_norm": 3.364699602127075, "learning_rate": 6.467669172932331e-06, "loss": 0.2559, "step": 23490 }, { "epoch": 35.34, "grad_norm": 6.3956618309021, "learning_rate": 6.466165413533835e-06, "loss": 0.2026, "step": 23500 }, { "epoch": 35.35, "grad_norm": 6.079551696777344, "learning_rate": 6.464661654135339e-06, "loss": 0.2687, "step": 23510 }, { "epoch": 35.37, "grad_norm": 5.305349349975586, "learning_rate": 6.463157894736843e-06, "loss": 0.2628, "step": 23520 }, { "epoch": 35.38, "grad_norm": 3.124565362930298, "learning_rate": 6.461654135338347e-06, "loss": 0.2969, "step": 23530 }, { "epoch": 35.4, "grad_norm": 5.130784511566162, "learning_rate": 6.46015037593985e-06, "loss": 0.2696, "step": 23540 }, { "epoch": 35.41, "grad_norm": 5.65298318862915, "learning_rate": 6.458646616541354e-06, "loss": 0.2035, "step": 23550 }, { "epoch": 35.43, "grad_norm": 5.994067192077637, "learning_rate": 6.4571428571428575e-06, "loss": 0.3008, "step": 23560 }, { "epoch": 35.44, "grad_norm": 4.849883556365967, "learning_rate": 6.455639097744362e-06, "loss": 0.2681, "step": 23570 }, { "epoch": 35.46, "grad_norm": 7.175223350524902, "learning_rate": 6.4541353383458646e-06, "loss": 0.3364, "step": 23580 }, { "epoch": 35.47, "grad_norm": 5.904483318328857, "learning_rate": 6.452631578947369e-06, "loss": 0.2725, "step": 23590 }, { "epoch": 35.49, "grad_norm": 6.1249260902404785, "learning_rate": 6.4511278195488724e-06, "loss": 0.3174, "step": 23600 }, { "epoch": 35.5, "grad_norm": 5.8022332191467285, "learning_rate": 6.449624060150377e-06, "loss": 0.2296, "step": 23610 }, { "epoch": 35.52, "grad_norm": 4.012286186218262, "learning_rate": 6.44812030075188e-06, "loss": 0.2594, "step": 23620 }, { "epoch": 35.53, "grad_norm": 5.536571502685547, "learning_rate": 6.446616541353385e-06, "loss": 0.2567, "step": 23630 }, { "epoch": 35.55, "grad_norm": 5.743587017059326, "learning_rate": 6.445112781954887e-06, "loss": 0.2928, "step": 23640 }, { "epoch": 35.56, "grad_norm": 9.780457496643066, "learning_rate": 6.443609022556392e-06, "loss": 0.2897, "step": 23650 }, { "epoch": 35.58, "grad_norm": 4.584736347198486, "learning_rate": 6.442105263157895e-06, "loss": 0.1786, "step": 23660 }, { "epoch": 35.59, "grad_norm": 1.9408100843429565, "learning_rate": 6.4406015037594e-06, "loss": 0.3392, "step": 23670 }, { "epoch": 35.61, "grad_norm": 6.093573570251465, "learning_rate": 6.439097744360902e-06, "loss": 0.286, "step": 23680 }, { "epoch": 35.62, "grad_norm": 7.220097541809082, "learning_rate": 6.437593984962407e-06, "loss": 0.297, "step": 23690 }, { "epoch": 35.64, "grad_norm": 5.479585647583008, "learning_rate": 6.43609022556391e-06, "loss": 0.269, "step": 23700 }, { "epoch": 35.65, "grad_norm": 6.430034160614014, "learning_rate": 6.4345864661654145e-06, "loss": 0.2266, "step": 23710 }, { "epoch": 35.67, "grad_norm": 2.1765003204345703, "learning_rate": 6.433082706766918e-06, "loss": 0.2206, "step": 23720 }, { "epoch": 35.68, "grad_norm": 4.447315216064453, "learning_rate": 6.431578947368422e-06, "loss": 0.235, "step": 23730 }, { "epoch": 35.7, "grad_norm": 8.542555809020996, "learning_rate": 6.430075187969925e-06, "loss": 0.2793, "step": 23740 }, { "epoch": 35.71, "grad_norm": 7.156379699707031, "learning_rate": 6.4285714285714295e-06, "loss": 0.258, "step": 23750 }, { "epoch": 35.73, "grad_norm": 5.014525413513184, "learning_rate": 6.427067669172933e-06, "loss": 0.2489, "step": 23760 }, { "epoch": 35.74, "grad_norm": 3.7913882732391357, "learning_rate": 6.425563909774437e-06, "loss": 0.2875, "step": 23770 }, { "epoch": 35.76, "grad_norm": 5.546550750732422, "learning_rate": 6.42406015037594e-06, "loss": 0.2528, "step": 23780 }, { "epoch": 35.77, "grad_norm": 5.281984329223633, "learning_rate": 6.422556390977444e-06, "loss": 0.2684, "step": 23790 }, { "epoch": 35.79, "grad_norm": 4.275721073150635, "learning_rate": 6.421052631578948e-06, "loss": 0.2681, "step": 23800 }, { "epoch": 35.8, "grad_norm": 5.552488803863525, "learning_rate": 6.419548872180452e-06, "loss": 0.2607, "step": 23810 }, { "epoch": 35.82, "grad_norm": 4.155028343200684, "learning_rate": 6.418045112781956e-06, "loss": 0.246, "step": 23820 }, { "epoch": 35.83, "grad_norm": 10.015910148620605, "learning_rate": 6.4165413533834585e-06, "loss": 0.3312, "step": 23830 }, { "epoch": 35.85, "grad_norm": 6.853920936584473, "learning_rate": 6.415037593984963e-06, "loss": 0.3212, "step": 23840 }, { "epoch": 35.86, "grad_norm": 7.707338809967041, "learning_rate": 6.413533834586466e-06, "loss": 0.2773, "step": 23850 }, { "epoch": 35.88, "grad_norm": 6.877796649932861, "learning_rate": 6.412030075187971e-06, "loss": 0.306, "step": 23860 }, { "epoch": 35.89, "grad_norm": 6.9555487632751465, "learning_rate": 6.410526315789473e-06, "loss": 0.3005, "step": 23870 }, { "epoch": 35.91, "grad_norm": 5.553525924682617, "learning_rate": 6.409022556390978e-06, "loss": 0.3166, "step": 23880 }, { "epoch": 35.92, "grad_norm": 3.6754038333892822, "learning_rate": 6.407518796992481e-06, "loss": 0.2731, "step": 23890 }, { "epoch": 35.94, "grad_norm": 11.875730514526367, "learning_rate": 6.406015037593986e-06, "loss": 0.3046, "step": 23900 }, { "epoch": 35.95, "grad_norm": 5.181977272033691, "learning_rate": 6.404511278195489e-06, "loss": 0.2291, "step": 23910 }, { "epoch": 35.97, "grad_norm": 4.224656105041504, "learning_rate": 6.4030075187969935e-06, "loss": 0.2335, "step": 23920 }, { "epoch": 35.98, "grad_norm": 3.9968817234039307, "learning_rate": 6.401503759398496e-06, "loss": 0.3294, "step": 23930 }, { "epoch": 36.0, "grad_norm": 0.03734013810753822, "learning_rate": 6.4000000000000006e-06, "loss": 0.2539, "step": 23940 }, { "epoch": 36.0, "eval_accuracy": 0.9313, "eval_loss": 0.28184354305267334, "eval_runtime": 84.4602, "eval_samples_per_second": 118.399, "eval_steps_per_second": 0.474, "step": 23940 }, { "epoch": 36.02, "grad_norm": 4.844832897186279, "learning_rate": 6.398496240601504e-06, "loss": 0.247, "step": 23950 }, { "epoch": 36.03, "grad_norm": 8.93395709991455, "learning_rate": 6.3969924812030084e-06, "loss": 0.2341, "step": 23960 }, { "epoch": 36.05, "grad_norm": 6.799169063568115, "learning_rate": 6.395488721804511e-06, "loss": 0.1804, "step": 23970 }, { "epoch": 36.06, "grad_norm": 6.8090901374816895, "learning_rate": 6.3939849624060155e-06, "loss": 0.3, "step": 23980 }, { "epoch": 36.08, "grad_norm": 4.476424217224121, "learning_rate": 6.392481203007519e-06, "loss": 0.2273, "step": 23990 }, { "epoch": 36.09, "grad_norm": 5.188058853149414, "learning_rate": 6.390977443609023e-06, "loss": 0.3322, "step": 24000 }, { "epoch": 36.11, "grad_norm": 6.895328998565674, "learning_rate": 6.389473684210527e-06, "loss": 0.2552, "step": 24010 }, { "epoch": 36.12, "grad_norm": 6.052617073059082, "learning_rate": 6.38796992481203e-06, "loss": 0.3099, "step": 24020 }, { "epoch": 36.14, "grad_norm": 4.1569366455078125, "learning_rate": 6.386466165413534e-06, "loss": 0.2408, "step": 24030 }, { "epoch": 36.15, "grad_norm": 10.491902351379395, "learning_rate": 6.384962406015038e-06, "loss": 0.3039, "step": 24040 }, { "epoch": 36.17, "grad_norm": 1.8690141439437866, "learning_rate": 6.383458646616542e-06, "loss": 0.2466, "step": 24050 }, { "epoch": 36.18, "grad_norm": 6.9134650230407715, "learning_rate": 6.381954887218046e-06, "loss": 0.2627, "step": 24060 }, { "epoch": 36.2, "grad_norm": 6.333374977111816, "learning_rate": 6.380451127819549e-06, "loss": 0.2438, "step": 24070 }, { "epoch": 36.21, "grad_norm": 5.581048488616943, "learning_rate": 6.378947368421053e-06, "loss": 0.2583, "step": 24080 }, { "epoch": 36.23, "grad_norm": 14.36394214630127, "learning_rate": 6.377443609022557e-06, "loss": 0.29, "step": 24090 }, { "epoch": 36.24, "grad_norm": 3.045477867126465, "learning_rate": 6.375939849624061e-06, "loss": 0.238, "step": 24100 }, { "epoch": 36.26, "grad_norm": 7.110077857971191, "learning_rate": 6.374436090225565e-06, "loss": 0.3271, "step": 24110 }, { "epoch": 36.27, "grad_norm": 5.218198299407959, "learning_rate": 6.372932330827068e-06, "loss": 0.2952, "step": 24120 }, { "epoch": 36.29, "grad_norm": 10.754549026489258, "learning_rate": 6.371428571428572e-06, "loss": 0.2661, "step": 24130 }, { "epoch": 36.3, "grad_norm": 3.623293876647949, "learning_rate": 6.369924812030076e-06, "loss": 0.189, "step": 24140 }, { "epoch": 36.32, "grad_norm": 5.730505466461182, "learning_rate": 6.3684210526315795e-06, "loss": 0.2611, "step": 24150 }, { "epoch": 36.33, "grad_norm": 6.075179100036621, "learning_rate": 6.366917293233084e-06, "loss": 0.3031, "step": 24160 }, { "epoch": 36.35, "grad_norm": 5.460084915161133, "learning_rate": 6.365413533834587e-06, "loss": 0.2325, "step": 24170 }, { "epoch": 36.36, "grad_norm": 5.883102893829346, "learning_rate": 6.363909774436091e-06, "loss": 0.3088, "step": 24180 }, { "epoch": 36.38, "grad_norm": 2.4602198600769043, "learning_rate": 6.3624060150375945e-06, "loss": 0.2467, "step": 24190 }, { "epoch": 36.39, "grad_norm": 7.747447490692139, "learning_rate": 6.360902255639099e-06, "loss": 0.2818, "step": 24200 }, { "epoch": 36.41, "grad_norm": 5.207604885101318, "learning_rate": 6.3593984962406015e-06, "loss": 0.2313, "step": 24210 }, { "epoch": 36.42, "grad_norm": 3.6121327877044678, "learning_rate": 6.357894736842106e-06, "loss": 0.2216, "step": 24220 }, { "epoch": 36.44, "grad_norm": 8.649584770202637, "learning_rate": 6.356390977443609e-06, "loss": 0.2514, "step": 24230 }, { "epoch": 36.45, "grad_norm": 8.641491889953613, "learning_rate": 6.354887218045114e-06, "loss": 0.2535, "step": 24240 }, { "epoch": 36.47, "grad_norm": 5.185530662536621, "learning_rate": 6.353383458646617e-06, "loss": 0.1882, "step": 24250 }, { "epoch": 36.48, "grad_norm": 6.481791019439697, "learning_rate": 6.351879699248122e-06, "loss": 0.2954, "step": 24260 }, { "epoch": 36.5, "grad_norm": 3.8771073818206787, "learning_rate": 6.350375939849624e-06, "loss": 0.2594, "step": 24270 }, { "epoch": 36.51, "grad_norm": 3.2521779537200928, "learning_rate": 6.348872180451129e-06, "loss": 0.1955, "step": 24280 }, { "epoch": 36.53, "grad_norm": 4.331993103027344, "learning_rate": 6.347368421052632e-06, "loss": 0.2964, "step": 24290 }, { "epoch": 36.54, "grad_norm": 5.428518295288086, "learning_rate": 6.3458646616541366e-06, "loss": 0.3339, "step": 24300 }, { "epoch": 36.56, "grad_norm": 5.912198543548584, "learning_rate": 6.344360902255639e-06, "loss": 0.2986, "step": 24310 }, { "epoch": 36.57, "grad_norm": 6.707327365875244, "learning_rate": 6.342857142857143e-06, "loss": 0.2508, "step": 24320 }, { "epoch": 36.59, "grad_norm": 5.767425060272217, "learning_rate": 6.341353383458647e-06, "loss": 0.2468, "step": 24330 }, { "epoch": 36.6, "grad_norm": 6.399289608001709, "learning_rate": 6.339849624060151e-06, "loss": 0.2553, "step": 24340 }, { "epoch": 36.62, "grad_norm": 4.582364082336426, "learning_rate": 6.338345864661655e-06, "loss": 0.2534, "step": 24350 }, { "epoch": 36.63, "grad_norm": 3.3339309692382812, "learning_rate": 6.336842105263158e-06, "loss": 0.2576, "step": 24360 }, { "epoch": 36.65, "grad_norm": 4.761340141296387, "learning_rate": 6.335338345864662e-06, "loss": 0.2606, "step": 24370 }, { "epoch": 36.66, "grad_norm": 4.896874904632568, "learning_rate": 6.3338345864661656e-06, "loss": 0.2359, "step": 24380 }, { "epoch": 36.68, "grad_norm": 8.0689697265625, "learning_rate": 6.33233082706767e-06, "loss": 0.2927, "step": 24390 }, { "epoch": 36.69, "grad_norm": 7.064751148223877, "learning_rate": 6.330827067669173e-06, "loss": 0.2416, "step": 24400 }, { "epoch": 36.71, "grad_norm": 4.654138565063477, "learning_rate": 6.329323308270677e-06, "loss": 0.2719, "step": 24410 }, { "epoch": 36.72, "grad_norm": 3.553248167037964, "learning_rate": 6.3278195488721805e-06, "loss": 0.2545, "step": 24420 }, { "epoch": 36.74, "grad_norm": 4.331087589263916, "learning_rate": 6.326315789473685e-06, "loss": 0.2351, "step": 24430 }, { "epoch": 36.75, "grad_norm": 5.628582954406738, "learning_rate": 6.324812030075188e-06, "loss": 0.3051, "step": 24440 }, { "epoch": 36.77, "grad_norm": 6.611728191375732, "learning_rate": 6.323308270676693e-06, "loss": 0.24, "step": 24450 }, { "epoch": 36.78, "grad_norm": 5.588087558746338, "learning_rate": 6.321804511278195e-06, "loss": 0.2702, "step": 24460 }, { "epoch": 36.8, "grad_norm": 2.716883420944214, "learning_rate": 6.3203007518797e-06, "loss": 0.2452, "step": 24470 }, { "epoch": 36.81, "grad_norm": 3.0164976119995117, "learning_rate": 6.318796992481203e-06, "loss": 0.2998, "step": 24480 }, { "epoch": 36.83, "grad_norm": 9.347999572753906, "learning_rate": 6.317293233082708e-06, "loss": 0.2446, "step": 24490 }, { "epoch": 36.84, "grad_norm": 13.030348777770996, "learning_rate": 6.31578947368421e-06, "loss": 0.2017, "step": 24500 }, { "epoch": 36.86, "grad_norm": 4.36257791519165, "learning_rate": 6.314285714285715e-06, "loss": 0.331, "step": 24510 }, { "epoch": 36.87, "grad_norm": 6.986863136291504, "learning_rate": 6.312781954887218e-06, "loss": 0.289, "step": 24520 }, { "epoch": 36.89, "grad_norm": 2.9986495971679688, "learning_rate": 6.311278195488723e-06, "loss": 0.2632, "step": 24530 }, { "epoch": 36.9, "grad_norm": 6.963717460632324, "learning_rate": 6.309774436090226e-06, "loss": 0.283, "step": 24540 }, { "epoch": 36.92, "grad_norm": 2.926877975463867, "learning_rate": 6.3082706766917305e-06, "loss": 0.2462, "step": 24550 }, { "epoch": 36.93, "grad_norm": 2.751361131668091, "learning_rate": 6.306766917293233e-06, "loss": 0.2986, "step": 24560 }, { "epoch": 36.95, "grad_norm": 7.566164493560791, "learning_rate": 6.3052631578947375e-06, "loss": 0.2437, "step": 24570 }, { "epoch": 36.96, "grad_norm": 10.196669578552246, "learning_rate": 6.303759398496241e-06, "loss": 0.3045, "step": 24580 }, { "epoch": 36.98, "grad_norm": 3.9259514808654785, "learning_rate": 6.302255639097745e-06, "loss": 0.2523, "step": 24590 }, { "epoch": 36.99, "grad_norm": 3.9328627586364746, "learning_rate": 6.300751879699248e-06, "loss": 0.2104, "step": 24600 }, { "epoch": 37.0, "eval_accuracy": 0.9329, "eval_loss": 0.2907086908817291, "eval_runtime": 84.7416, "eval_samples_per_second": 118.006, "eval_steps_per_second": 0.472, "step": 24605 }, { "epoch": 37.01, "grad_norm": 5.213517189025879, "learning_rate": 6.2992481203007524e-06, "loss": 0.2175, "step": 24610 }, { "epoch": 37.02, "grad_norm": 5.479267120361328, "learning_rate": 6.297744360902256e-06, "loss": 0.2723, "step": 24620 }, { "epoch": 37.04, "grad_norm": 5.910623073577881, "learning_rate": 6.29624060150376e-06, "loss": 0.2277, "step": 24630 }, { "epoch": 37.05, "grad_norm": 3.9250035285949707, "learning_rate": 6.294736842105264e-06, "loss": 0.2364, "step": 24640 }, { "epoch": 37.07, "grad_norm": 5.7687201499938965, "learning_rate": 6.293233082706768e-06, "loss": 0.2434, "step": 24650 }, { "epoch": 37.08, "grad_norm": 6.585549354553223, "learning_rate": 6.291729323308271e-06, "loss": 0.2501, "step": 24660 }, { "epoch": 37.1, "grad_norm": 9.130806922912598, "learning_rate": 6.290225563909775e-06, "loss": 0.3176, "step": 24670 }, { "epoch": 37.11, "grad_norm": 5.826327800750732, "learning_rate": 6.288721804511279e-06, "loss": 0.259, "step": 24680 }, { "epoch": 37.13, "grad_norm": 3.6899287700653076, "learning_rate": 6.287218045112783e-06, "loss": 0.225, "step": 24690 }, { "epoch": 37.14, "grad_norm": 7.567675590515137, "learning_rate": 6.285714285714286e-06, "loss": 0.3511, "step": 24700 }, { "epoch": 37.16, "grad_norm": 4.670961856842041, "learning_rate": 6.28421052631579e-06, "loss": 0.2718, "step": 24710 }, { "epoch": 37.17, "grad_norm": 6.8157057762146, "learning_rate": 6.282706766917294e-06, "loss": 0.2676, "step": 24720 }, { "epoch": 37.19, "grad_norm": 9.021499633789062, "learning_rate": 6.281203007518798e-06, "loss": 0.2615, "step": 24730 }, { "epoch": 37.2, "grad_norm": 5.826773166656494, "learning_rate": 6.2796992481203016e-06, "loss": 0.2652, "step": 24740 }, { "epoch": 37.22, "grad_norm": 6.296904563903809, "learning_rate": 6.278195488721806e-06, "loss": 0.2632, "step": 24750 }, { "epoch": 37.23, "grad_norm": 3.506654739379883, "learning_rate": 6.276691729323309e-06, "loss": 0.2946, "step": 24760 }, { "epoch": 37.25, "grad_norm": 6.164925575256348, "learning_rate": 6.275187969924813e-06, "loss": 0.2214, "step": 24770 }, { "epoch": 37.26, "grad_norm": 5.420670986175537, "learning_rate": 6.2736842105263165e-06, "loss": 0.2709, "step": 24780 }, { "epoch": 37.28, "grad_norm": 5.200439453125, "learning_rate": 6.272180451127821e-06, "loss": 0.2599, "step": 24790 }, { "epoch": 37.29, "grad_norm": 5.116550445556641, "learning_rate": 6.2706766917293235e-06, "loss": 0.2643, "step": 24800 }, { "epoch": 37.31, "grad_norm": 4.102775573730469, "learning_rate": 6.269172932330828e-06, "loss": 0.2768, "step": 24810 }, { "epoch": 37.32, "grad_norm": 5.1978254318237305, "learning_rate": 6.267669172932331e-06, "loss": 0.2132, "step": 24820 }, { "epoch": 37.34, "grad_norm": 7.613659858703613, "learning_rate": 6.266165413533835e-06, "loss": 0.3066, "step": 24830 }, { "epoch": 37.35, "grad_norm": 7.9959917068481445, "learning_rate": 6.264661654135339e-06, "loss": 0.2219, "step": 24840 }, { "epoch": 37.37, "grad_norm": 7.0650811195373535, "learning_rate": 6.263157894736842e-06, "loss": 0.2497, "step": 24850 }, { "epoch": 37.38, "grad_norm": 4.512048721313477, "learning_rate": 6.261654135338346e-06, "loss": 0.243, "step": 24860 }, { "epoch": 37.4, "grad_norm": 3.7339611053466797, "learning_rate": 6.26015037593985e-06, "loss": 0.1984, "step": 24870 }, { "epoch": 37.41, "grad_norm": 5.475858211517334, "learning_rate": 6.258646616541354e-06, "loss": 0.2657, "step": 24880 }, { "epoch": 37.43, "grad_norm": 5.605356693267822, "learning_rate": 6.257142857142857e-06, "loss": 0.2905, "step": 24890 }, { "epoch": 37.44, "grad_norm": 9.095086097717285, "learning_rate": 6.255639097744361e-06, "loss": 0.3472, "step": 24900 }, { "epoch": 37.46, "grad_norm": 6.019258975982666, "learning_rate": 6.254135338345865e-06, "loss": 0.3156, "step": 24910 }, { "epoch": 37.47, "grad_norm": 1.5480716228485107, "learning_rate": 6.252631578947369e-06, "loss": 0.244, "step": 24920 }, { "epoch": 37.49, "grad_norm": 5.178092002868652, "learning_rate": 6.251127819548873e-06, "loss": 0.2058, "step": 24930 }, { "epoch": 37.5, "grad_norm": 5.011317729949951, "learning_rate": 6.249624060150376e-06, "loss": 0.2563, "step": 24940 }, { "epoch": 37.52, "grad_norm": 6.064964771270752, "learning_rate": 6.24812030075188e-06, "loss": 0.2321, "step": 24950 }, { "epoch": 37.53, "grad_norm": 4.71431827545166, "learning_rate": 6.246616541353384e-06, "loss": 0.2923, "step": 24960 }, { "epoch": 37.55, "grad_norm": 7.876862525939941, "learning_rate": 6.245112781954888e-06, "loss": 0.3188, "step": 24970 }, { "epoch": 37.56, "grad_norm": 5.848691940307617, "learning_rate": 6.243609022556392e-06, "loss": 0.2144, "step": 24980 }, { "epoch": 37.58, "grad_norm": 10.56225299835205, "learning_rate": 6.242105263157895e-06, "loss": 0.2761, "step": 24990 }, { "epoch": 37.59, "grad_norm": 4.970668792724609, "learning_rate": 6.240601503759399e-06, "loss": 0.2745, "step": 25000 }, { "epoch": 37.61, "grad_norm": 9.470555305480957, "learning_rate": 6.2390977443609025e-06, "loss": 0.2858, "step": 25010 }, { "epoch": 37.62, "grad_norm": 7.466954708099365, "learning_rate": 6.237593984962407e-06, "loss": 0.2461, "step": 25020 }, { "epoch": 37.64, "grad_norm": 7.034529685974121, "learning_rate": 6.23609022556391e-06, "loss": 0.2675, "step": 25030 }, { "epoch": 37.65, "grad_norm": 11.616925239562988, "learning_rate": 6.234586466165414e-06, "loss": 0.2296, "step": 25040 }, { "epoch": 37.67, "grad_norm": 4.31875467300415, "learning_rate": 6.2330827067669174e-06, "loss": 0.2634, "step": 25050 }, { "epoch": 37.68, "grad_norm": 5.886952877044678, "learning_rate": 6.231578947368422e-06, "loss": 0.2379, "step": 25060 }, { "epoch": 37.7, "grad_norm": 4.664031982421875, "learning_rate": 6.230075187969925e-06, "loss": 0.2202, "step": 25070 }, { "epoch": 37.71, "grad_norm": 4.743470191955566, "learning_rate": 6.22857142857143e-06, "loss": 0.2281, "step": 25080 }, { "epoch": 37.73, "grad_norm": 2.948579788208008, "learning_rate": 6.227067669172932e-06, "loss": 0.225, "step": 25090 }, { "epoch": 37.74, "grad_norm": 4.467003345489502, "learning_rate": 6.225563909774437e-06, "loss": 0.241, "step": 25100 }, { "epoch": 37.76, "grad_norm": 5.358434677124023, "learning_rate": 6.22406015037594e-06, "loss": 0.2336, "step": 25110 }, { "epoch": 37.77, "grad_norm": 3.714233875274658, "learning_rate": 6.222556390977445e-06, "loss": 0.2112, "step": 25120 }, { "epoch": 37.79, "grad_norm": 6.145929336547852, "learning_rate": 6.221052631578947e-06, "loss": 0.3233, "step": 25130 }, { "epoch": 37.8, "grad_norm": 5.789137840270996, "learning_rate": 6.219548872180452e-06, "loss": 0.2396, "step": 25140 }, { "epoch": 37.82, "grad_norm": 7.418402671813965, "learning_rate": 6.218045112781955e-06, "loss": 0.2444, "step": 25150 }, { "epoch": 37.83, "grad_norm": 4.341475486755371, "learning_rate": 6.2165413533834595e-06, "loss": 0.2258, "step": 25160 }, { "epoch": 37.85, "grad_norm": 5.642782211303711, "learning_rate": 6.215037593984963e-06, "loss": 0.2946, "step": 25170 }, { "epoch": 37.86, "grad_norm": 6.9304680824279785, "learning_rate": 6.213533834586467e-06, "loss": 0.2476, "step": 25180 }, { "epoch": 37.88, "grad_norm": 5.448428153991699, "learning_rate": 6.21203007518797e-06, "loss": 0.2455, "step": 25190 }, { "epoch": 37.89, "grad_norm": 5.857348918914795, "learning_rate": 6.2105263157894745e-06, "loss": 0.2786, "step": 25200 }, { "epoch": 37.91, "grad_norm": 3.1548938751220703, "learning_rate": 6.209022556390978e-06, "loss": 0.2123, "step": 25210 }, { "epoch": 37.92, "grad_norm": 5.268825531005859, "learning_rate": 6.207518796992482e-06, "loss": 0.1931, "step": 25220 }, { "epoch": 37.94, "grad_norm": 5.20718240737915, "learning_rate": 6.206015037593985e-06, "loss": 0.2774, "step": 25230 }, { "epoch": 37.95, "grad_norm": 7.931674957275391, "learning_rate": 6.204511278195489e-06, "loss": 0.2652, "step": 25240 }, { "epoch": 37.97, "grad_norm": 4.287258625030518, "learning_rate": 6.203007518796993e-06, "loss": 0.3186, "step": 25250 }, { "epoch": 37.98, "grad_norm": 8.485033988952637, "learning_rate": 6.201503759398497e-06, "loss": 0.2543, "step": 25260 }, { "epoch": 38.0, "grad_norm": 0.33357012271881104, "learning_rate": 6.200000000000001e-06, "loss": 0.264, "step": 25270 }, { "epoch": 38.0, "eval_accuracy": 0.9298, "eval_loss": 0.3029746413230896, "eval_runtime": 84.4594, "eval_samples_per_second": 118.4, "eval_steps_per_second": 0.474, "step": 25270 }, { "epoch": 38.02, "grad_norm": 5.222786903381348, "learning_rate": 6.198496240601505e-06, "loss": 0.2575, "step": 25280 }, { "epoch": 38.03, "grad_norm": 2.386619806289673, "learning_rate": 6.196992481203008e-06, "loss": 0.2419, "step": 25290 }, { "epoch": 38.05, "grad_norm": 3.8002138137817383, "learning_rate": 6.195488721804512e-06, "loss": 0.2632, "step": 25300 }, { "epoch": 38.06, "grad_norm": 4.710455417633057, "learning_rate": 6.193984962406016e-06, "loss": 0.1841, "step": 25310 }, { "epoch": 38.08, "grad_norm": 3.5531041622161865, "learning_rate": 6.192481203007518e-06, "loss": 0.2501, "step": 25320 }, { "epoch": 38.09, "grad_norm": 4.9958062171936035, "learning_rate": 6.190977443609023e-06, "loss": 0.2728, "step": 25330 }, { "epoch": 38.11, "grad_norm": 3.301966428756714, "learning_rate": 6.189473684210526e-06, "loss": 0.2564, "step": 25340 }, { "epoch": 38.12, "grad_norm": 4.437223434448242, "learning_rate": 6.187969924812031e-06, "loss": 0.3406, "step": 25350 }, { "epoch": 38.14, "grad_norm": 3.1734423637390137, "learning_rate": 6.186466165413534e-06, "loss": 0.2296, "step": 25360 }, { "epoch": 38.15, "grad_norm": 8.440972328186035, "learning_rate": 6.1849624060150385e-06, "loss": 0.2089, "step": 25370 }, { "epoch": 38.17, "grad_norm": 3.5104334354400635, "learning_rate": 6.183458646616541e-06, "loss": 0.3243, "step": 25380 }, { "epoch": 38.18, "grad_norm": 5.661349296569824, "learning_rate": 6.1819548872180455e-06, "loss": 0.2984, "step": 25390 }, { "epoch": 38.2, "grad_norm": 4.282464981079102, "learning_rate": 6.180451127819549e-06, "loss": 0.2493, "step": 25400 }, { "epoch": 38.21, "grad_norm": 4.8217926025390625, "learning_rate": 6.1789473684210534e-06, "loss": 0.289, "step": 25410 }, { "epoch": 38.23, "grad_norm": 5.01459264755249, "learning_rate": 6.177443609022556e-06, "loss": 0.3262, "step": 25420 }, { "epoch": 38.24, "grad_norm": 5.024478435516357, "learning_rate": 6.1759398496240605e-06, "loss": 0.297, "step": 25430 }, { "epoch": 38.26, "grad_norm": 9.888729095458984, "learning_rate": 6.174436090225564e-06, "loss": 0.2414, "step": 25440 }, { "epoch": 38.27, "grad_norm": 6.393784046173096, "learning_rate": 6.172932330827068e-06, "loss": 0.2329, "step": 25450 }, { "epoch": 38.29, "grad_norm": 4.191177845001221, "learning_rate": 6.171428571428572e-06, "loss": 0.303, "step": 25460 }, { "epoch": 38.3, "grad_norm": 5.586199760437012, "learning_rate": 6.169924812030076e-06, "loss": 0.2868, "step": 25470 }, { "epoch": 38.32, "grad_norm": 4.708113670349121, "learning_rate": 6.168421052631579e-06, "loss": 0.2418, "step": 25480 }, { "epoch": 38.33, "grad_norm": 8.40892505645752, "learning_rate": 6.166917293233083e-06, "loss": 0.2396, "step": 25490 }, { "epoch": 38.35, "grad_norm": 6.81309175491333, "learning_rate": 6.165413533834587e-06, "loss": 0.2496, "step": 25500 }, { "epoch": 38.36, "grad_norm": 8.350881576538086, "learning_rate": 6.163909774436091e-06, "loss": 0.2002, "step": 25510 }, { "epoch": 38.38, "grad_norm": 3.867704153060913, "learning_rate": 6.162406015037594e-06, "loss": 0.2544, "step": 25520 }, { "epoch": 38.39, "grad_norm": 2.9632339477539062, "learning_rate": 6.160902255639098e-06, "loss": 0.2444, "step": 25530 }, { "epoch": 38.41, "grad_norm": 2.540008544921875, "learning_rate": 6.159398496240602e-06, "loss": 0.1792, "step": 25540 }, { "epoch": 38.42, "grad_norm": 5.204866886138916, "learning_rate": 6.157894736842106e-06, "loss": 0.2785, "step": 25550 }, { "epoch": 38.44, "grad_norm": 5.661520957946777, "learning_rate": 6.15639097744361e-06, "loss": 0.2656, "step": 25560 }, { "epoch": 38.45, "grad_norm": 6.089839458465576, "learning_rate": 6.154887218045114e-06, "loss": 0.2698, "step": 25570 }, { "epoch": 38.47, "grad_norm": 6.78479528427124, "learning_rate": 6.153383458646617e-06, "loss": 0.2754, "step": 25580 }, { "epoch": 38.48, "grad_norm": 5.978816032409668, "learning_rate": 6.151879699248121e-06, "loss": 0.2906, "step": 25590 }, { "epoch": 38.5, "grad_norm": 6.391565799713135, "learning_rate": 6.1503759398496245e-06, "loss": 0.2373, "step": 25600 }, { "epoch": 38.51, "grad_norm": 6.544304847717285, "learning_rate": 6.148872180451129e-06, "loss": 0.256, "step": 25610 }, { "epoch": 38.53, "grad_norm": 3.7540414333343506, "learning_rate": 6.1473684210526316e-06, "loss": 0.2462, "step": 25620 }, { "epoch": 38.54, "grad_norm": 4.831808567047119, "learning_rate": 6.145864661654136e-06, "loss": 0.2307, "step": 25630 }, { "epoch": 38.56, "grad_norm": 3.9876370429992676, "learning_rate": 6.1443609022556395e-06, "loss": 0.182, "step": 25640 }, { "epoch": 38.57, "grad_norm": 4.689981460571289, "learning_rate": 6.142857142857144e-06, "loss": 0.2577, "step": 25650 }, { "epoch": 38.59, "grad_norm": 6.713221073150635, "learning_rate": 6.141353383458647e-06, "loss": 0.2902, "step": 25660 }, { "epoch": 38.6, "grad_norm": 5.03303861618042, "learning_rate": 6.139849624060152e-06, "loss": 0.1913, "step": 25670 }, { "epoch": 38.62, "grad_norm": 6.612110137939453, "learning_rate": 6.138345864661654e-06, "loss": 0.2724, "step": 25680 }, { "epoch": 38.63, "grad_norm": 0.28649356961250305, "learning_rate": 6.136842105263159e-06, "loss": 0.2571, "step": 25690 }, { "epoch": 38.65, "grad_norm": 3.211318016052246, "learning_rate": 6.135338345864662e-06, "loss": 0.2778, "step": 25700 }, { "epoch": 38.66, "grad_norm": 2.4958066940307617, "learning_rate": 6.133834586466167e-06, "loss": 0.2794, "step": 25710 }, { "epoch": 38.68, "grad_norm": 7.281109809875488, "learning_rate": 6.132330827067669e-06, "loss": 0.2804, "step": 25720 }, { "epoch": 38.69, "grad_norm": 5.622795581817627, "learning_rate": 6.130827067669174e-06, "loss": 0.2422, "step": 25730 }, { "epoch": 38.71, "grad_norm": 6.729783058166504, "learning_rate": 6.129323308270677e-06, "loss": 0.252, "step": 25740 }, { "epoch": 38.72, "grad_norm": 8.58928108215332, "learning_rate": 6.1278195488721816e-06, "loss": 0.2239, "step": 25750 }, { "epoch": 38.74, "grad_norm": 4.566402912139893, "learning_rate": 6.126315789473685e-06, "loss": 0.2211, "step": 25760 }, { "epoch": 38.75, "grad_norm": 13.606522560119629, "learning_rate": 6.124812030075189e-06, "loss": 0.2715, "step": 25770 }, { "epoch": 38.77, "grad_norm": 7.567572593688965, "learning_rate": 6.123308270676692e-06, "loss": 0.223, "step": 25780 }, { "epoch": 38.78, "grad_norm": 5.111462116241455, "learning_rate": 6.1218045112781965e-06, "loss": 0.2691, "step": 25790 }, { "epoch": 38.8, "grad_norm": 3.890907049179077, "learning_rate": 6.1203007518797e-06, "loss": 0.2056, "step": 25800 }, { "epoch": 38.81, "grad_norm": 8.117431640625, "learning_rate": 6.118796992481203e-06, "loss": 0.2391, "step": 25810 }, { "epoch": 38.83, "grad_norm": 3.895110845565796, "learning_rate": 6.117293233082707e-06, "loss": 0.2405, "step": 25820 }, { "epoch": 38.84, "grad_norm": 5.332907676696777, "learning_rate": 6.1157894736842106e-06, "loss": 0.2132, "step": 25830 }, { "epoch": 38.86, "grad_norm": 5.767151355743408, "learning_rate": 6.114285714285715e-06, "loss": 0.2321, "step": 25840 }, { "epoch": 38.87, "grad_norm": 6.4265899658203125, "learning_rate": 6.1127819548872184e-06, "loss": 0.265, "step": 25850 }, { "epoch": 38.89, "grad_norm": 5.724833011627197, "learning_rate": 6.111278195488722e-06, "loss": 0.2308, "step": 25860 }, { "epoch": 38.9, "grad_norm": 4.019651412963867, "learning_rate": 6.1097744360902255e-06, "loss": 0.2451, "step": 25870 }, { "epoch": 38.92, "grad_norm": 6.344007968902588, "learning_rate": 6.10827067669173e-06, "loss": 0.2445, "step": 25880 }, { "epoch": 38.93, "grad_norm": 4.17384672164917, "learning_rate": 6.106766917293233e-06, "loss": 0.2344, "step": 25890 }, { "epoch": 38.95, "grad_norm": 7.522851467132568, "learning_rate": 6.105263157894738e-06, "loss": 0.2443, "step": 25900 }, { "epoch": 38.96, "grad_norm": 5.275865077972412, "learning_rate": 6.10375939849624e-06, "loss": 0.2393, "step": 25910 }, { "epoch": 38.98, "grad_norm": 12.723134994506836, "learning_rate": 6.102255639097745e-06, "loss": 0.2157, "step": 25920 }, { "epoch": 38.99, "grad_norm": 6.258028507232666, "learning_rate": 6.100751879699248e-06, "loss": 0.3343, "step": 25930 }, { "epoch": 39.0, "eval_accuracy": 0.9299, "eval_loss": 0.30296048521995544, "eval_runtime": 85.0044, "eval_samples_per_second": 117.641, "eval_steps_per_second": 0.471, "step": 25935 }, { "epoch": 39.01, "grad_norm": 7.032735347747803, "learning_rate": 6.099248120300753e-06, "loss": 0.2688, "step": 25940 }, { "epoch": 39.02, "grad_norm": 4.064879417419434, "learning_rate": 6.097744360902256e-06, "loss": 0.2388, "step": 25950 }, { "epoch": 39.04, "grad_norm": 10.355484008789062, "learning_rate": 6.09624060150376e-06, "loss": 0.236, "step": 25960 }, { "epoch": 39.05, "grad_norm": 5.589707374572754, "learning_rate": 6.094736842105263e-06, "loss": 0.284, "step": 25970 }, { "epoch": 39.07, "grad_norm": 5.585343360900879, "learning_rate": 6.0932330827067676e-06, "loss": 0.2678, "step": 25980 }, { "epoch": 39.08, "grad_norm": 5.8805670738220215, "learning_rate": 6.091729323308271e-06, "loss": 0.2468, "step": 25990 }, { "epoch": 39.1, "grad_norm": 3.8114516735076904, "learning_rate": 6.0902255639097755e-06, "loss": 0.2651, "step": 26000 }, { "epoch": 39.11, "grad_norm": 2.4582362174987793, "learning_rate": 6.088721804511278e-06, "loss": 0.2564, "step": 26010 }, { "epoch": 39.13, "grad_norm": 7.29220724105835, "learning_rate": 6.0872180451127825e-06, "loss": 0.2869, "step": 26020 }, { "epoch": 39.14, "grad_norm": 5.818304061889648, "learning_rate": 6.085714285714286e-06, "loss": 0.1954, "step": 26030 }, { "epoch": 39.16, "grad_norm": 3.6301677227020264, "learning_rate": 6.08421052631579e-06, "loss": 0.2483, "step": 26040 }, { "epoch": 39.17, "grad_norm": 8.798738479614258, "learning_rate": 6.082706766917293e-06, "loss": 0.2372, "step": 26050 }, { "epoch": 39.19, "grad_norm": 4.8981404304504395, "learning_rate": 6.081203007518797e-06, "loss": 0.1533, "step": 26060 }, { "epoch": 39.2, "grad_norm": 3.82854962348938, "learning_rate": 6.079699248120301e-06, "loss": 0.1997, "step": 26070 }, { "epoch": 39.22, "grad_norm": 5.643593788146973, "learning_rate": 6.078195488721805e-06, "loss": 0.195, "step": 26080 }, { "epoch": 39.23, "grad_norm": 5.393759727478027, "learning_rate": 6.076691729323309e-06, "loss": 0.2615, "step": 26090 }, { "epoch": 39.25, "grad_norm": 4.524438381195068, "learning_rate": 6.075187969924813e-06, "loss": 0.2547, "step": 26100 }, { "epoch": 39.26, "grad_norm": 4.402938365936279, "learning_rate": 6.073684210526316e-06, "loss": 0.3235, "step": 26110 }, { "epoch": 39.28, "grad_norm": 4.850139141082764, "learning_rate": 6.07218045112782e-06, "loss": 0.3356, "step": 26120 }, { "epoch": 39.29, "grad_norm": 6.940186023712158, "learning_rate": 6.070676691729324e-06, "loss": 0.2929, "step": 26130 }, { "epoch": 39.31, "grad_norm": 5.906674861907959, "learning_rate": 6.069172932330828e-06, "loss": 0.2722, "step": 26140 }, { "epoch": 39.32, "grad_norm": 6.132667541503906, "learning_rate": 6.067669172932331e-06, "loss": 0.2593, "step": 26150 }, { "epoch": 39.34, "grad_norm": 4.581514835357666, "learning_rate": 6.066165413533835e-06, "loss": 0.2535, "step": 26160 }, { "epoch": 39.35, "grad_norm": 7.306211471557617, "learning_rate": 6.064661654135339e-06, "loss": 0.2108, "step": 26170 }, { "epoch": 39.37, "grad_norm": 5.0160040855407715, "learning_rate": 6.063157894736843e-06, "loss": 0.2747, "step": 26180 }, { "epoch": 39.38, "grad_norm": 6.838048458099365, "learning_rate": 6.0616541353383466e-06, "loss": 0.2339, "step": 26190 }, { "epoch": 39.4, "grad_norm": 7.156051158905029, "learning_rate": 6.060150375939851e-06, "loss": 0.2565, "step": 26200 }, { "epoch": 39.41, "grad_norm": 3.844694137573242, "learning_rate": 6.058646616541354e-06, "loss": 0.2644, "step": 26210 }, { "epoch": 39.43, "grad_norm": 9.594381332397461, "learning_rate": 6.057142857142858e-06, "loss": 0.2381, "step": 26220 }, { "epoch": 39.44, "grad_norm": 6.174018383026123, "learning_rate": 6.0556390977443615e-06, "loss": 0.2974, "step": 26230 }, { "epoch": 39.46, "grad_norm": 4.462780952453613, "learning_rate": 6.054135338345866e-06, "loss": 0.3042, "step": 26240 }, { "epoch": 39.47, "grad_norm": 5.761167049407959, "learning_rate": 6.0526315789473685e-06, "loss": 0.2494, "step": 26250 }, { "epoch": 39.49, "grad_norm": 6.0231733322143555, "learning_rate": 6.051127819548873e-06, "loss": 0.2232, "step": 26260 }, { "epoch": 39.5, "grad_norm": 6.92008113861084, "learning_rate": 6.049624060150376e-06, "loss": 0.2763, "step": 26270 }, { "epoch": 39.52, "grad_norm": 8.816802978515625, "learning_rate": 6.048120300751881e-06, "loss": 0.2386, "step": 26280 }, { "epoch": 39.53, "grad_norm": 9.278828620910645, "learning_rate": 6.046616541353384e-06, "loss": 0.305, "step": 26290 }, { "epoch": 39.55, "grad_norm": 6.65963888168335, "learning_rate": 6.045112781954889e-06, "loss": 0.2616, "step": 26300 }, { "epoch": 39.56, "grad_norm": 6.531272888183594, "learning_rate": 6.043609022556391e-06, "loss": 0.2339, "step": 26310 }, { "epoch": 39.58, "grad_norm": 5.144484043121338, "learning_rate": 6.042105263157895e-06, "loss": 0.3106, "step": 26320 }, { "epoch": 39.59, "grad_norm": 5.5032243728637695, "learning_rate": 6.040601503759399e-06, "loss": 0.2968, "step": 26330 }, { "epoch": 39.61, "grad_norm": 3.399604320526123, "learning_rate": 6.039097744360902e-06, "loss": 0.2153, "step": 26340 }, { "epoch": 39.62, "grad_norm": 6.184195041656494, "learning_rate": 6.037593984962406e-06, "loss": 0.306, "step": 26350 }, { "epoch": 39.64, "grad_norm": 5.967001914978027, "learning_rate": 6.03609022556391e-06, "loss": 0.3201, "step": 26360 }, { "epoch": 39.65, "grad_norm": 3.1237454414367676, "learning_rate": 6.034586466165414e-06, "loss": 0.2411, "step": 26370 }, { "epoch": 39.67, "grad_norm": 5.807510852813721, "learning_rate": 6.033082706766918e-06, "loss": 0.2651, "step": 26380 }, { "epoch": 39.68, "grad_norm": 3.013021230697632, "learning_rate": 6.031578947368422e-06, "loss": 0.2835, "step": 26390 }, { "epoch": 39.7, "grad_norm": 6.245032787322998, "learning_rate": 6.030075187969925e-06, "loss": 0.3153, "step": 26400 }, { "epoch": 39.71, "grad_norm": 2.45760440826416, "learning_rate": 6.028571428571429e-06, "loss": 0.2501, "step": 26410 }, { "epoch": 39.73, "grad_norm": 6.795231342315674, "learning_rate": 6.0270676691729326e-06, "loss": 0.2885, "step": 26420 }, { "epoch": 39.74, "grad_norm": 4.70608377456665, "learning_rate": 6.025563909774437e-06, "loss": 0.2741, "step": 26430 }, { "epoch": 39.76, "grad_norm": 3.3417322635650635, "learning_rate": 6.02406015037594e-06, "loss": 0.31, "step": 26440 }, { "epoch": 39.77, "grad_norm": 4.526313781738281, "learning_rate": 6.022556390977444e-06, "loss": 0.3074, "step": 26450 }, { "epoch": 39.79, "grad_norm": 4.290112495422363, "learning_rate": 6.0210526315789475e-06, "loss": 0.2311, "step": 26460 }, { "epoch": 39.8, "grad_norm": 7.921751499176025, "learning_rate": 6.019548872180452e-06, "loss": 0.2454, "step": 26470 }, { "epoch": 39.82, "grad_norm": 5.810790538787842, "learning_rate": 6.018045112781955e-06, "loss": 0.2412, "step": 26480 }, { "epoch": 39.83, "grad_norm": 7.158168792724609, "learning_rate": 6.01654135338346e-06, "loss": 0.2827, "step": 26490 }, { "epoch": 39.85, "grad_norm": 5.13280725479126, "learning_rate": 6.015037593984962e-06, "loss": 0.2417, "step": 26500 }, { "epoch": 39.86, "grad_norm": 5.112004280090332, "learning_rate": 6.013533834586467e-06, "loss": 0.2607, "step": 26510 }, { "epoch": 39.88, "grad_norm": 3.5853915214538574, "learning_rate": 6.01203007518797e-06, "loss": 0.2105, "step": 26520 }, { "epoch": 39.89, "grad_norm": 7.726737976074219, "learning_rate": 6.010526315789475e-06, "loss": 0.2266, "step": 26530 }, { "epoch": 39.91, "grad_norm": 4.511539936065674, "learning_rate": 6.009022556390977e-06, "loss": 0.2388, "step": 26540 }, { "epoch": 39.92, "grad_norm": 6.07355260848999, "learning_rate": 6.007518796992482e-06, "loss": 0.2129, "step": 26550 }, { "epoch": 39.94, "grad_norm": 4.063088417053223, "learning_rate": 6.006015037593985e-06, "loss": 0.2759, "step": 26560 }, { "epoch": 39.95, "grad_norm": 8.524380683898926, "learning_rate": 6.00451127819549e-06, "loss": 0.2703, "step": 26570 }, { "epoch": 39.97, "grad_norm": 5.095054626464844, "learning_rate": 6.003007518796993e-06, "loss": 0.2418, "step": 26580 }, { "epoch": 39.98, "grad_norm": 4.896398067474365, "learning_rate": 6.001503759398497e-06, "loss": 0.294, "step": 26590 }, { "epoch": 40.0, "grad_norm": 41.88473892211914, "learning_rate": 6e-06, "loss": 0.2252, "step": 26600 }, { "epoch": 40.0, "eval_accuracy": 0.9313, "eval_loss": 0.2959575057029724, "eval_runtime": 84.9779, "eval_samples_per_second": 117.678, "eval_steps_per_second": 0.471, "step": 26600 }, { "epoch": 40.02, "grad_norm": 11.272649765014648, "learning_rate": 5.9984962406015045e-06, "loss": 0.2401, "step": 26610 }, { "epoch": 40.03, "grad_norm": 6.342247486114502, "learning_rate": 5.996992481203008e-06, "loss": 0.1853, "step": 26620 }, { "epoch": 40.05, "grad_norm": 8.127535820007324, "learning_rate": 5.995488721804512e-06, "loss": 0.2091, "step": 26630 }, { "epoch": 40.06, "grad_norm": 7.700329303741455, "learning_rate": 5.993984962406015e-06, "loss": 0.2537, "step": 26640 }, { "epoch": 40.08, "grad_norm": 4.249181747436523, "learning_rate": 5.9924812030075194e-06, "loss": 0.2576, "step": 26650 }, { "epoch": 40.09, "grad_norm": 6.067375659942627, "learning_rate": 5.990977443609023e-06, "loss": 0.2815, "step": 26660 }, { "epoch": 40.11, "grad_norm": 5.851858139038086, "learning_rate": 5.989473684210527e-06, "loss": 0.2741, "step": 26670 }, { "epoch": 40.12, "grad_norm": 2.2680578231811523, "learning_rate": 5.987969924812031e-06, "loss": 0.2571, "step": 26680 }, { "epoch": 40.14, "grad_norm": 6.329833984375, "learning_rate": 5.986466165413534e-06, "loss": 0.1918, "step": 26690 }, { "epoch": 40.15, "grad_norm": 3.3135337829589844, "learning_rate": 5.984962406015038e-06, "loss": 0.2999, "step": 26700 }, { "epoch": 40.17, "grad_norm": 5.360442161560059, "learning_rate": 5.983458646616542e-06, "loss": 0.248, "step": 26710 }, { "epoch": 40.18, "grad_norm": 5.619331359863281, "learning_rate": 5.981954887218046e-06, "loss": 0.2665, "step": 26720 }, { "epoch": 40.2, "grad_norm": 6.200700283050537, "learning_rate": 5.98045112781955e-06, "loss": 0.2415, "step": 26730 }, { "epoch": 40.21, "grad_norm": 6.159794330596924, "learning_rate": 5.978947368421053e-06, "loss": 0.3265, "step": 26740 }, { "epoch": 40.23, "grad_norm": 4.464012622833252, "learning_rate": 5.977443609022557e-06, "loss": 0.2426, "step": 26750 }, { "epoch": 40.24, "grad_norm": 6.896475791931152, "learning_rate": 5.975939849624061e-06, "loss": 0.2842, "step": 26760 }, { "epoch": 40.26, "grad_norm": 5.030261993408203, "learning_rate": 5.974436090225565e-06, "loss": 0.2208, "step": 26770 }, { "epoch": 40.27, "grad_norm": 7.4109392166137695, "learning_rate": 5.972932330827068e-06, "loss": 0.3284, "step": 26780 }, { "epoch": 40.29, "grad_norm": 3.889845132827759, "learning_rate": 5.971428571428572e-06, "loss": 0.2688, "step": 26790 }, { "epoch": 40.3, "grad_norm": 2.1321589946746826, "learning_rate": 5.969924812030076e-06, "loss": 0.1939, "step": 26800 }, { "epoch": 40.32, "grad_norm": 10.229703903198242, "learning_rate": 5.968421052631579e-06, "loss": 0.2851, "step": 26810 }, { "epoch": 40.33, "grad_norm": 2.8729848861694336, "learning_rate": 5.9669172932330835e-06, "loss": 0.2451, "step": 26820 }, { "epoch": 40.35, "grad_norm": 5.045018672943115, "learning_rate": 5.965413533834586e-06, "loss": 0.2655, "step": 26830 }, { "epoch": 40.36, "grad_norm": 6.061784744262695, "learning_rate": 5.9639097744360905e-06, "loss": 0.2106, "step": 26840 }, { "epoch": 40.38, "grad_norm": 4.3670654296875, "learning_rate": 5.962406015037594e-06, "loss": 0.2508, "step": 26850 }, { "epoch": 40.39, "grad_norm": 4.169200897216797, "learning_rate": 5.9609022556390984e-06, "loss": 0.2374, "step": 26860 }, { "epoch": 40.41, "grad_norm": 9.870522499084473, "learning_rate": 5.959398496240601e-06, "loss": 0.2112, "step": 26870 }, { "epoch": 40.42, "grad_norm": 4.387085914611816, "learning_rate": 5.9578947368421055e-06, "loss": 0.2245, "step": 26880 }, { "epoch": 40.44, "grad_norm": 4.728475093841553, "learning_rate": 5.956390977443609e-06, "loss": 0.2252, "step": 26890 }, { "epoch": 40.45, "grad_norm": 3.2639694213867188, "learning_rate": 5.954887218045113e-06, "loss": 0.2637, "step": 26900 }, { "epoch": 40.47, "grad_norm": 3.462743043899536, "learning_rate": 5.953383458646617e-06, "loss": 0.2298, "step": 26910 }, { "epoch": 40.48, "grad_norm": 7.873288631439209, "learning_rate": 5.951879699248121e-06, "loss": 0.2781, "step": 26920 }, { "epoch": 40.5, "grad_norm": 5.6623334884643555, "learning_rate": 5.950375939849624e-06, "loss": 0.3083, "step": 26930 }, { "epoch": 40.51, "grad_norm": 4.990388870239258, "learning_rate": 5.948872180451128e-06, "loss": 0.2423, "step": 26940 }, { "epoch": 40.53, "grad_norm": 4.813294887542725, "learning_rate": 5.947368421052632e-06, "loss": 0.3243, "step": 26950 }, { "epoch": 40.54, "grad_norm": 4.974578857421875, "learning_rate": 5.945864661654136e-06, "loss": 0.2494, "step": 26960 }, { "epoch": 40.56, "grad_norm": 5.326529502868652, "learning_rate": 5.944360902255639e-06, "loss": 0.3272, "step": 26970 }, { "epoch": 40.57, "grad_norm": 6.8446245193481445, "learning_rate": 5.942857142857143e-06, "loss": 0.3175, "step": 26980 }, { "epoch": 40.59, "grad_norm": 7.015409469604492, "learning_rate": 5.941353383458647e-06, "loss": 0.2259, "step": 26990 }, { "epoch": 40.6, "grad_norm": 5.787068843841553, "learning_rate": 5.939849624060151e-06, "loss": 0.2027, "step": 27000 }, { "epoch": 40.62, "grad_norm": 8.761579513549805, "learning_rate": 5.938345864661655e-06, "loss": 0.229, "step": 27010 }, { "epoch": 40.63, "grad_norm": 6.765686988830566, "learning_rate": 5.936842105263159e-06, "loss": 0.2945, "step": 27020 }, { "epoch": 40.65, "grad_norm": 8.93405532836914, "learning_rate": 5.935338345864662e-06, "loss": 0.3073, "step": 27030 }, { "epoch": 40.66, "grad_norm": 6.20574951171875, "learning_rate": 5.933834586466166e-06, "loss": 0.2785, "step": 27040 }, { "epoch": 40.68, "grad_norm": 7.853849411010742, "learning_rate": 5.9323308270676695e-06, "loss": 0.2294, "step": 27050 }, { "epoch": 40.69, "grad_norm": 4.646696090698242, "learning_rate": 5.930827067669174e-06, "loss": 0.2995, "step": 27060 }, { "epoch": 40.71, "grad_norm": 7.266605377197266, "learning_rate": 5.9293233082706766e-06, "loss": 0.259, "step": 27070 }, { "epoch": 40.72, "grad_norm": 6.365235805511475, "learning_rate": 5.927819548872181e-06, "loss": 0.2905, "step": 27080 }, { "epoch": 40.74, "grad_norm": 4.6850457191467285, "learning_rate": 5.9263157894736844e-06, "loss": 0.2313, "step": 27090 }, { "epoch": 40.75, "grad_norm": 2.9510128498077393, "learning_rate": 5.924812030075189e-06, "loss": 0.3019, "step": 27100 }, { "epoch": 40.77, "grad_norm": 6.478631973266602, "learning_rate": 5.923308270676692e-06, "loss": 0.2318, "step": 27110 }, { "epoch": 40.78, "grad_norm": 4.466511249542236, "learning_rate": 5.921804511278197e-06, "loss": 0.2816, "step": 27120 }, { "epoch": 40.8, "grad_norm": 3.0053863525390625, "learning_rate": 5.920300751879699e-06, "loss": 0.2526, "step": 27130 }, { "epoch": 40.81, "grad_norm": 4.911371231079102, "learning_rate": 5.918796992481204e-06, "loss": 0.2546, "step": 27140 }, { "epoch": 40.83, "grad_norm": 7.452986240386963, "learning_rate": 5.917293233082707e-06, "loss": 0.2378, "step": 27150 }, { "epoch": 40.84, "grad_norm": 6.792994499206543, "learning_rate": 5.915789473684212e-06, "loss": 0.3151, "step": 27160 }, { "epoch": 40.86, "grad_norm": 5.100854396820068, "learning_rate": 5.914285714285714e-06, "loss": 0.253, "step": 27170 }, { "epoch": 40.87, "grad_norm": 3.16489577293396, "learning_rate": 5.912781954887219e-06, "loss": 0.2431, "step": 27180 }, { "epoch": 40.89, "grad_norm": 9.801965713500977, "learning_rate": 5.911278195488722e-06, "loss": 0.2013, "step": 27190 }, { "epoch": 40.9, "grad_norm": 3.110069990158081, "learning_rate": 5.9097744360902265e-06, "loss": 0.2555, "step": 27200 }, { "epoch": 40.92, "grad_norm": 4.8249006271362305, "learning_rate": 5.90827067669173e-06, "loss": 0.1823, "step": 27210 }, { "epoch": 40.93, "grad_norm": 2.37221622467041, "learning_rate": 5.9067669172932344e-06, "loss": 0.2177, "step": 27220 }, { "epoch": 40.95, "grad_norm": 4.915449142456055, "learning_rate": 5.905263157894737e-06, "loss": 0.2638, "step": 27230 }, { "epoch": 40.96, "grad_norm": 4.170838832855225, "learning_rate": 5.9037593984962415e-06, "loss": 0.2424, "step": 27240 }, { "epoch": 40.98, "grad_norm": 6.760472297668457, "learning_rate": 5.902255639097745e-06, "loss": 0.237, "step": 27250 }, { "epoch": 40.99, "grad_norm": 4.23274564743042, "learning_rate": 5.900751879699249e-06, "loss": 0.2453, "step": 27260 }, { "epoch": 41.0, "eval_accuracy": 0.9302, "eval_loss": 0.2976926863193512, "eval_runtime": 84.6737, "eval_samples_per_second": 118.1, "eval_steps_per_second": 0.472, "step": 27265 }, { "epoch": 41.01, "grad_norm": 7.621011734008789, "learning_rate": 5.899248120300752e-06, "loss": 0.2292, "step": 27270 }, { "epoch": 41.02, "grad_norm": 3.8886141777038574, "learning_rate": 5.897744360902256e-06, "loss": 0.1851, "step": 27280 }, { "epoch": 41.04, "grad_norm": 5.533069133758545, "learning_rate": 5.89624060150376e-06, "loss": 0.2613, "step": 27290 }, { "epoch": 41.05, "grad_norm": 4.005669116973877, "learning_rate": 5.8947368421052634e-06, "loss": 0.1644, "step": 27300 }, { "epoch": 41.07, "grad_norm": 3.7740938663482666, "learning_rate": 5.893233082706768e-06, "loss": 0.1893, "step": 27310 }, { "epoch": 41.08, "grad_norm": 8.077632904052734, "learning_rate": 5.8917293233082705e-06, "loss": 0.2166, "step": 27320 }, { "epoch": 41.1, "grad_norm": 2.8428125381469727, "learning_rate": 5.890225563909775e-06, "loss": 0.2274, "step": 27330 }, { "epoch": 41.11, "grad_norm": 2.748422861099243, "learning_rate": 5.888721804511278e-06, "loss": 0.1786, "step": 27340 }, { "epoch": 41.13, "grad_norm": 6.080495834350586, "learning_rate": 5.887218045112783e-06, "loss": 0.3044, "step": 27350 }, { "epoch": 41.14, "grad_norm": 6.971614837646484, "learning_rate": 5.885714285714285e-06, "loss": 0.2141, "step": 27360 }, { "epoch": 41.16, "grad_norm": 3.363379716873169, "learning_rate": 5.88421052631579e-06, "loss": 0.3337, "step": 27370 }, { "epoch": 41.17, "grad_norm": 4.116064548492432, "learning_rate": 5.882706766917293e-06, "loss": 0.2026, "step": 27380 }, { "epoch": 41.19, "grad_norm": 2.9727118015289307, "learning_rate": 5.881203007518798e-06, "loss": 0.2342, "step": 27390 }, { "epoch": 41.2, "grad_norm": 4.2843337059021, "learning_rate": 5.879699248120301e-06, "loss": 0.2033, "step": 27400 }, { "epoch": 41.22, "grad_norm": 11.28203010559082, "learning_rate": 5.8781954887218055e-06, "loss": 0.2623, "step": 27410 }, { "epoch": 41.23, "grad_norm": 1.7853152751922607, "learning_rate": 5.876691729323308e-06, "loss": 0.1762, "step": 27420 }, { "epoch": 41.25, "grad_norm": 2.5484893321990967, "learning_rate": 5.8751879699248126e-06, "loss": 0.1423, "step": 27430 }, { "epoch": 41.26, "grad_norm": 5.5011186599731445, "learning_rate": 5.873684210526316e-06, "loss": 0.2804, "step": 27440 }, { "epoch": 41.28, "grad_norm": 4.6855645179748535, "learning_rate": 5.8721804511278204e-06, "loss": 0.2465, "step": 27450 }, { "epoch": 41.29, "grad_norm": 7.020749092102051, "learning_rate": 5.870676691729323e-06, "loss": 0.1858, "step": 27460 }, { "epoch": 41.31, "grad_norm": 3.362569808959961, "learning_rate": 5.8691729323308275e-06, "loss": 0.2039, "step": 27470 }, { "epoch": 41.32, "grad_norm": 7.235138893127441, "learning_rate": 5.867669172932331e-06, "loss": 0.2206, "step": 27480 }, { "epoch": 41.34, "grad_norm": 8.273979187011719, "learning_rate": 5.866165413533835e-06, "loss": 0.2561, "step": 27490 }, { "epoch": 41.35, "grad_norm": 9.396625518798828, "learning_rate": 5.864661654135339e-06, "loss": 0.2517, "step": 27500 }, { "epoch": 41.37, "grad_norm": 6.119128704071045, "learning_rate": 5.863157894736842e-06, "loss": 0.2438, "step": 27510 }, { "epoch": 41.38, "grad_norm": 6.962316989898682, "learning_rate": 5.861654135338346e-06, "loss": 0.267, "step": 27520 }, { "epoch": 41.4, "grad_norm": 2.296393632888794, "learning_rate": 5.86015037593985e-06, "loss": 0.2358, "step": 27530 }, { "epoch": 41.41, "grad_norm": 6.2318830490112305, "learning_rate": 5.858646616541354e-06, "loss": 0.2647, "step": 27540 }, { "epoch": 41.43, "grad_norm": 5.737059116363525, "learning_rate": 5.857142857142858e-06, "loss": 0.2347, "step": 27550 }, { "epoch": 41.44, "grad_norm": 6.697840213775635, "learning_rate": 5.855639097744361e-06, "loss": 0.1956, "step": 27560 }, { "epoch": 41.46, "grad_norm": 4.799352169036865, "learning_rate": 5.854135338345865e-06, "loss": 0.2545, "step": 27570 }, { "epoch": 41.47, "grad_norm": 9.115018844604492, "learning_rate": 5.852631578947369e-06, "loss": 0.2221, "step": 27580 }, { "epoch": 41.49, "grad_norm": 3.4361791610717773, "learning_rate": 5.851127819548873e-06, "loss": 0.2736, "step": 27590 }, { "epoch": 41.5, "grad_norm": 6.11599588394165, "learning_rate": 5.849624060150377e-06, "loss": 0.2768, "step": 27600 }, { "epoch": 41.52, "grad_norm": 17.091064453125, "learning_rate": 5.84812030075188e-06, "loss": 0.2645, "step": 27610 }, { "epoch": 41.53, "grad_norm": 2.6468470096588135, "learning_rate": 5.846616541353384e-06, "loss": 0.2765, "step": 27620 }, { "epoch": 41.55, "grad_norm": 8.783407211303711, "learning_rate": 5.845112781954888e-06, "loss": 0.2277, "step": 27630 }, { "epoch": 41.56, "grad_norm": 9.349631309509277, "learning_rate": 5.8436090225563915e-06, "loss": 0.2197, "step": 27640 }, { "epoch": 41.58, "grad_norm": 5.436890125274658, "learning_rate": 5.842105263157896e-06, "loss": 0.2632, "step": 27650 }, { "epoch": 41.59, "grad_norm": 2.644437313079834, "learning_rate": 5.840601503759399e-06, "loss": 0.2113, "step": 27660 }, { "epoch": 41.61, "grad_norm": 7.14797306060791, "learning_rate": 5.839097744360903e-06, "loss": 0.2395, "step": 27670 }, { "epoch": 41.62, "grad_norm": 5.408485412597656, "learning_rate": 5.8375939849624065e-06, "loss": 0.23, "step": 27680 }, { "epoch": 41.64, "grad_norm": 6.316678524017334, "learning_rate": 5.836090225563911e-06, "loss": 0.2506, "step": 27690 }, { "epoch": 41.65, "grad_norm": 4.856528282165527, "learning_rate": 5.8345864661654135e-06, "loss": 0.2565, "step": 27700 }, { "epoch": 41.67, "grad_norm": 3.731872797012329, "learning_rate": 5.833082706766918e-06, "loss": 0.2733, "step": 27710 }, { "epoch": 41.68, "grad_norm": 10.921002388000488, "learning_rate": 5.831578947368421e-06, "loss": 0.233, "step": 27720 }, { "epoch": 41.7, "grad_norm": 8.051673889160156, "learning_rate": 5.830075187969926e-06, "loss": 0.2357, "step": 27730 }, { "epoch": 41.71, "grad_norm": 5.143133640289307, "learning_rate": 5.828571428571429e-06, "loss": 0.2399, "step": 27740 }, { "epoch": 41.73, "grad_norm": 6.688822269439697, "learning_rate": 5.827067669172934e-06, "loss": 0.2199, "step": 27750 }, { "epoch": 41.74, "grad_norm": 5.77666711807251, "learning_rate": 5.825563909774436e-06, "loss": 0.2227, "step": 27760 }, { "epoch": 41.76, "grad_norm": 5.851940155029297, "learning_rate": 5.824060150375941e-06, "loss": 0.2161, "step": 27770 }, { "epoch": 41.77, "grad_norm": 3.7167811393737793, "learning_rate": 5.822556390977444e-06, "loss": 0.2276, "step": 27780 }, { "epoch": 41.79, "grad_norm": 1.5249907970428467, "learning_rate": 5.8210526315789486e-06, "loss": 0.2857, "step": 27790 }, { "epoch": 41.8, "grad_norm": 3.788996458053589, "learning_rate": 5.819548872180451e-06, "loss": 0.2901, "step": 27800 }, { "epoch": 41.82, "grad_norm": 3.4979562759399414, "learning_rate": 5.818045112781955e-06, "loss": 0.2643, "step": 27810 }, { "epoch": 41.83, "grad_norm": 5.074125289916992, "learning_rate": 5.816541353383459e-06, "loss": 0.2909, "step": 27820 }, { "epoch": 41.85, "grad_norm": 7.354587078094482, "learning_rate": 5.815037593984963e-06, "loss": 0.2756, "step": 27830 }, { "epoch": 41.86, "grad_norm": 5.332225322723389, "learning_rate": 5.813533834586467e-06, "loss": 0.2369, "step": 27840 }, { "epoch": 41.88, "grad_norm": 3.5564475059509277, "learning_rate": 5.81203007518797e-06, "loss": 0.3292, "step": 27850 }, { "epoch": 41.89, "grad_norm": 5.977893829345703, "learning_rate": 5.810526315789474e-06, "loss": 0.2441, "step": 27860 }, { "epoch": 41.91, "grad_norm": 7.151272773742676, "learning_rate": 5.8090225563909776e-06, "loss": 0.2375, "step": 27870 }, { "epoch": 41.92, "grad_norm": 3.628330707550049, "learning_rate": 5.807518796992482e-06, "loss": 0.2102, "step": 27880 }, { "epoch": 41.94, "grad_norm": 6.060537338256836, "learning_rate": 5.806015037593985e-06, "loss": 0.2511, "step": 27890 }, { "epoch": 41.95, "grad_norm": 6.281295299530029, "learning_rate": 5.804511278195489e-06, "loss": 0.2389, "step": 27900 }, { "epoch": 41.97, "grad_norm": 6.898097515106201, "learning_rate": 5.8030075187969925e-06, "loss": 0.282, "step": 27910 }, { "epoch": 41.98, "grad_norm": 4.405340671539307, "learning_rate": 5.801503759398497e-06, "loss": 0.2674, "step": 27920 }, { "epoch": 42.0, "grad_norm": 4.30500602722168, "learning_rate": 5.8e-06, "loss": 0.2467, "step": 27930 }, { "epoch": 42.0, "eval_accuracy": 0.9293, "eval_loss": 0.30341148376464844, "eval_runtime": 84.726, "eval_samples_per_second": 118.028, "eval_steps_per_second": 0.472, "step": 27930 }, { "epoch": 42.02, "grad_norm": 6.07786750793457, "learning_rate": 5.798496240601505e-06, "loss": 0.2712, "step": 27940 }, { "epoch": 42.03, "grad_norm": 6.053528308868408, "learning_rate": 5.796992481203007e-06, "loss": 0.2482, "step": 27950 }, { "epoch": 42.05, "grad_norm": 5.752837657928467, "learning_rate": 5.795488721804512e-06, "loss": 0.2579, "step": 27960 }, { "epoch": 42.06, "grad_norm": 8.296350479125977, "learning_rate": 5.793984962406015e-06, "loss": 0.2481, "step": 27970 }, { "epoch": 42.08, "grad_norm": 4.709738254547119, "learning_rate": 5.79248120300752e-06, "loss": 0.1771, "step": 27980 }, { "epoch": 42.09, "grad_norm": 3.9645566940307617, "learning_rate": 5.790977443609022e-06, "loss": 0.218, "step": 27990 }, { "epoch": 42.11, "grad_norm": 5.714948654174805, "learning_rate": 5.789473684210527e-06, "loss": 0.2734, "step": 28000 }, { "epoch": 42.12, "grad_norm": 3.882260799407959, "learning_rate": 5.78796992481203e-06, "loss": 0.2747, "step": 28010 }, { "epoch": 42.14, "grad_norm": 4.841667175292969, "learning_rate": 5.786466165413535e-06, "loss": 0.2728, "step": 28020 }, { "epoch": 42.15, "grad_norm": 7.8732523918151855, "learning_rate": 5.784962406015038e-06, "loss": 0.2417, "step": 28030 }, { "epoch": 42.17, "grad_norm": 6.003359317779541, "learning_rate": 5.7834586466165425e-06, "loss": 0.2442, "step": 28040 }, { "epoch": 42.18, "grad_norm": 4.850991249084473, "learning_rate": 5.781954887218045e-06, "loss": 0.2731, "step": 28050 }, { "epoch": 42.2, "grad_norm": 5.1097025871276855, "learning_rate": 5.7804511278195495e-06, "loss": 0.1836, "step": 28060 }, { "epoch": 42.21, "grad_norm": 6.158971309661865, "learning_rate": 5.778947368421053e-06, "loss": 0.2186, "step": 28070 }, { "epoch": 42.23, "grad_norm": 4.05385160446167, "learning_rate": 5.777443609022557e-06, "loss": 0.2081, "step": 28080 }, { "epoch": 42.24, "grad_norm": 6.728810787200928, "learning_rate": 5.77593984962406e-06, "loss": 0.2637, "step": 28090 }, { "epoch": 42.26, "grad_norm": 4.585230350494385, "learning_rate": 5.7744360902255644e-06, "loss": 0.2556, "step": 28100 }, { "epoch": 42.27, "grad_norm": 4.899889945983887, "learning_rate": 5.772932330827068e-06, "loss": 0.2692, "step": 28110 }, { "epoch": 42.29, "grad_norm": 8.909112930297852, "learning_rate": 5.771428571428572e-06, "loss": 0.1978, "step": 28120 }, { "epoch": 42.3, "grad_norm": 5.728882312774658, "learning_rate": 5.769924812030076e-06, "loss": 0.2635, "step": 28130 }, { "epoch": 42.32, "grad_norm": 4.432710647583008, "learning_rate": 5.76842105263158e-06, "loss": 0.2616, "step": 28140 }, { "epoch": 42.33, "grad_norm": 3.3793838024139404, "learning_rate": 5.766917293233083e-06, "loss": 0.2387, "step": 28150 }, { "epoch": 42.35, "grad_norm": 10.177301406860352, "learning_rate": 5.765413533834587e-06, "loss": 0.2494, "step": 28160 }, { "epoch": 42.36, "grad_norm": 5.089664459228516, "learning_rate": 5.763909774436091e-06, "loss": 0.2588, "step": 28170 }, { "epoch": 42.38, "grad_norm": 3.203890800476074, "learning_rate": 5.762406015037595e-06, "loss": 0.3037, "step": 28180 }, { "epoch": 42.39, "grad_norm": 5.658225059509277, "learning_rate": 5.760902255639098e-06, "loss": 0.2522, "step": 28190 }, { "epoch": 42.41, "grad_norm": 3.704941987991333, "learning_rate": 5.759398496240602e-06, "loss": 0.2382, "step": 28200 }, { "epoch": 42.42, "grad_norm": 4.1319780349731445, "learning_rate": 5.757894736842106e-06, "loss": 0.2014, "step": 28210 }, { "epoch": 42.44, "grad_norm": 4.998769760131836, "learning_rate": 5.75639097744361e-06, "loss": 0.2205, "step": 28220 }, { "epoch": 42.45, "grad_norm": 2.3330812454223633, "learning_rate": 5.7548872180451136e-06, "loss": 0.2242, "step": 28230 }, { "epoch": 42.47, "grad_norm": 3.1000936031341553, "learning_rate": 5.753383458646618e-06, "loss": 0.2238, "step": 28240 }, { "epoch": 42.48, "grad_norm": 5.262942790985107, "learning_rate": 5.751879699248121e-06, "loss": 0.2225, "step": 28250 }, { "epoch": 42.5, "grad_norm": 7.2910475730896, "learning_rate": 5.750375939849625e-06, "loss": 0.2829, "step": 28260 }, { "epoch": 42.51, "grad_norm": 2.4421637058258057, "learning_rate": 5.7488721804511285e-06, "loss": 0.284, "step": 28270 }, { "epoch": 42.53, "grad_norm": 2.7164719104766846, "learning_rate": 5.747368421052633e-06, "loss": 0.2569, "step": 28280 }, { "epoch": 42.54, "grad_norm": 5.2328877449035645, "learning_rate": 5.7458646616541355e-06, "loss": 0.2374, "step": 28290 }, { "epoch": 42.56, "grad_norm": 3.4275119304656982, "learning_rate": 5.744360902255639e-06, "loss": 0.2283, "step": 28300 }, { "epoch": 42.57, "grad_norm": 3.65972900390625, "learning_rate": 5.742857142857143e-06, "loss": 0.307, "step": 28310 }, { "epoch": 42.59, "grad_norm": 4.684558391571045, "learning_rate": 5.741353383458647e-06, "loss": 0.2323, "step": 28320 }, { "epoch": 42.6, "grad_norm": 7.553560733795166, "learning_rate": 5.739849624060151e-06, "loss": 0.1697, "step": 28330 }, { "epoch": 42.62, "grad_norm": 5.401693820953369, "learning_rate": 5.738345864661654e-06, "loss": 0.2497, "step": 28340 }, { "epoch": 42.63, "grad_norm": 4.120377540588379, "learning_rate": 5.736842105263158e-06, "loss": 0.285, "step": 28350 }, { "epoch": 42.65, "grad_norm": 6.782698154449463, "learning_rate": 5.735338345864662e-06, "loss": 0.2505, "step": 28360 }, { "epoch": 42.66, "grad_norm": 10.522151947021484, "learning_rate": 5.733834586466166e-06, "loss": 0.234, "step": 28370 }, { "epoch": 42.68, "grad_norm": 6.4431233406066895, "learning_rate": 5.732330827067669e-06, "loss": 0.2463, "step": 28380 }, { "epoch": 42.69, "grad_norm": 5.447474479675293, "learning_rate": 5.730827067669173e-06, "loss": 0.334, "step": 28390 }, { "epoch": 42.71, "grad_norm": 7.193316459655762, "learning_rate": 5.729323308270677e-06, "loss": 0.1952, "step": 28400 }, { "epoch": 42.72, "grad_norm": 3.5891122817993164, "learning_rate": 5.727819548872181e-06, "loss": 0.222, "step": 28410 }, { "epoch": 42.74, "grad_norm": 2.9756336212158203, "learning_rate": 5.726315789473685e-06, "loss": 0.2087, "step": 28420 }, { "epoch": 42.75, "grad_norm": 5.7030720710754395, "learning_rate": 5.724812030075188e-06, "loss": 0.2127, "step": 28430 }, { "epoch": 42.77, "grad_norm": 4.419146537780762, "learning_rate": 5.723308270676692e-06, "loss": 0.1878, "step": 28440 }, { "epoch": 42.78, "grad_norm": 5.313589096069336, "learning_rate": 5.721804511278196e-06, "loss": 0.2445, "step": 28450 }, { "epoch": 42.8, "grad_norm": 5.905307769775391, "learning_rate": 5.7203007518797e-06, "loss": 0.2381, "step": 28460 }, { "epoch": 42.81, "grad_norm": 5.510866641998291, "learning_rate": 5.718796992481204e-06, "loss": 0.2788, "step": 28470 }, { "epoch": 42.83, "grad_norm": 5.849859237670898, "learning_rate": 5.717293233082707e-06, "loss": 0.2436, "step": 28480 }, { "epoch": 42.84, "grad_norm": 5.243779182434082, "learning_rate": 5.715789473684211e-06, "loss": 0.1705, "step": 28490 }, { "epoch": 42.86, "grad_norm": 3.643617868423462, "learning_rate": 5.7142857142857145e-06, "loss": 0.3002, "step": 28500 }, { "epoch": 42.87, "grad_norm": 3.605794906616211, "learning_rate": 5.712781954887219e-06, "loss": 0.1879, "step": 28510 }, { "epoch": 42.89, "grad_norm": 4.405106067657471, "learning_rate": 5.711278195488722e-06, "loss": 0.222, "step": 28520 }, { "epoch": 42.9, "grad_norm": 7.1118597984313965, "learning_rate": 5.709774436090226e-06, "loss": 0.3624, "step": 28530 }, { "epoch": 42.92, "grad_norm": 5.815327167510986, "learning_rate": 5.7082706766917294e-06, "loss": 0.2162, "step": 28540 }, { "epoch": 42.93, "grad_norm": 5.0367021560668945, "learning_rate": 5.706766917293234e-06, "loss": 0.2481, "step": 28550 }, { "epoch": 42.95, "grad_norm": 6.6812005043029785, "learning_rate": 5.705263157894737e-06, "loss": 0.2977, "step": 28560 }, { "epoch": 42.96, "grad_norm": 3.0472841262817383, "learning_rate": 5.703759398496242e-06, "loss": 0.225, "step": 28570 }, { "epoch": 42.98, "grad_norm": 5.001314163208008, "learning_rate": 5.702255639097744e-06, "loss": 0.1936, "step": 28580 }, { "epoch": 42.99, "grad_norm": 7.2746124267578125, "learning_rate": 5.700751879699249e-06, "loss": 0.2208, "step": 28590 }, { "epoch": 43.0, "eval_accuracy": 0.9316, "eval_loss": 0.30223846435546875, "eval_runtime": 84.2035, "eval_samples_per_second": 118.76, "eval_steps_per_second": 0.475, "step": 28595 }, { "epoch": 43.01, "grad_norm": 5.429502010345459, "learning_rate": 5.699248120300752e-06, "loss": 0.2629, "step": 28600 }, { "epoch": 43.02, "grad_norm": 8.030689239501953, "learning_rate": 5.697744360902257e-06, "loss": 0.1887, "step": 28610 }, { "epoch": 43.04, "grad_norm": 4.963591575622559, "learning_rate": 5.696240601503759e-06, "loss": 0.2608, "step": 28620 }, { "epoch": 43.05, "grad_norm": 7.090287685394287, "learning_rate": 5.694736842105264e-06, "loss": 0.248, "step": 28630 }, { "epoch": 43.07, "grad_norm": 5.958043575286865, "learning_rate": 5.693233082706767e-06, "loss": 0.3033, "step": 28640 }, { "epoch": 43.08, "grad_norm": 6.555380821228027, "learning_rate": 5.6917293233082715e-06, "loss": 0.2399, "step": 28650 }, { "epoch": 43.1, "grad_norm": 7.416017055511475, "learning_rate": 5.690225563909775e-06, "loss": 0.2513, "step": 28660 }, { "epoch": 43.11, "grad_norm": 4.798736095428467, "learning_rate": 5.688721804511279e-06, "loss": 0.2721, "step": 28670 }, { "epoch": 43.13, "grad_norm": 6.545383930206299, "learning_rate": 5.687218045112782e-06, "loss": 0.2962, "step": 28680 }, { "epoch": 43.14, "grad_norm": 7.702247142791748, "learning_rate": 5.6857142857142865e-06, "loss": 0.2098, "step": 28690 }, { "epoch": 43.16, "grad_norm": 2.8116698265075684, "learning_rate": 5.68421052631579e-06, "loss": 0.1915, "step": 28700 }, { "epoch": 43.17, "grad_norm": 8.904080390930176, "learning_rate": 5.682706766917294e-06, "loss": 0.3113, "step": 28710 }, { "epoch": 43.19, "grad_norm": 3.884316921234131, "learning_rate": 5.681203007518797e-06, "loss": 0.2209, "step": 28720 }, { "epoch": 43.2, "grad_norm": 3.413797616958618, "learning_rate": 5.679699248120301e-06, "loss": 0.1643, "step": 28730 }, { "epoch": 43.22, "grad_norm": 7.109938144683838, "learning_rate": 5.678195488721805e-06, "loss": 0.2486, "step": 28740 }, { "epoch": 43.23, "grad_norm": 4.222686290740967, "learning_rate": 5.676691729323309e-06, "loss": 0.2572, "step": 28750 }, { "epoch": 43.25, "grad_norm": 4.973549842834473, "learning_rate": 5.675187969924813e-06, "loss": 0.2302, "step": 28760 }, { "epoch": 43.26, "grad_norm": 4.342545986175537, "learning_rate": 5.673684210526317e-06, "loss": 0.2825, "step": 28770 }, { "epoch": 43.28, "grad_norm": 5.619150638580322, "learning_rate": 5.67218045112782e-06, "loss": 0.2352, "step": 28780 }, { "epoch": 43.29, "grad_norm": 6.378529071807861, "learning_rate": 5.670676691729323e-06, "loss": 0.231, "step": 28790 }, { "epoch": 43.31, "grad_norm": 3.949969530105591, "learning_rate": 5.669172932330828e-06, "loss": 0.2399, "step": 28800 }, { "epoch": 43.32, "grad_norm": 3.982423782348633, "learning_rate": 5.66766917293233e-06, "loss": 0.1841, "step": 28810 }, { "epoch": 43.34, "grad_norm": 3.037238359451294, "learning_rate": 5.666165413533835e-06, "loss": 0.2084, "step": 28820 }, { "epoch": 43.35, "grad_norm": 5.667693614959717, "learning_rate": 5.664661654135338e-06, "loss": 0.2061, "step": 28830 }, { "epoch": 43.37, "grad_norm": 5.317113876342773, "learning_rate": 5.663157894736843e-06, "loss": 0.2213, "step": 28840 }, { "epoch": 43.38, "grad_norm": 7.288003921508789, "learning_rate": 5.661654135338346e-06, "loss": 0.3024, "step": 28850 }, { "epoch": 43.4, "grad_norm": 6.291513919830322, "learning_rate": 5.6601503759398505e-06, "loss": 0.2002, "step": 28860 }, { "epoch": 43.41, "grad_norm": 4.533992767333984, "learning_rate": 5.658646616541353e-06, "loss": 0.2687, "step": 28870 }, { "epoch": 43.43, "grad_norm": 5.96859073638916, "learning_rate": 5.6571428571428576e-06, "loss": 0.2142, "step": 28880 }, { "epoch": 43.44, "grad_norm": 5.588306903839111, "learning_rate": 5.655639097744361e-06, "loss": 0.2045, "step": 28890 }, { "epoch": 43.46, "grad_norm": 3.9208312034606934, "learning_rate": 5.6541353383458654e-06, "loss": 0.3291, "step": 28900 }, { "epoch": 43.47, "grad_norm": 4.575656414031982, "learning_rate": 5.652631578947368e-06, "loss": 0.203, "step": 28910 }, { "epoch": 43.49, "grad_norm": 4.715184211730957, "learning_rate": 5.6511278195488725e-06, "loss": 0.2584, "step": 28920 }, { "epoch": 43.5, "grad_norm": 7.345559597015381, "learning_rate": 5.649624060150376e-06, "loss": 0.2279, "step": 28930 }, { "epoch": 43.52, "grad_norm": 8.647790908813477, "learning_rate": 5.64812030075188e-06, "loss": 0.286, "step": 28940 }, { "epoch": 43.53, "grad_norm": 3.186676025390625, "learning_rate": 5.646616541353384e-06, "loss": 0.2159, "step": 28950 }, { "epoch": 43.55, "grad_norm": 5.287961006164551, "learning_rate": 5.645112781954888e-06, "loss": 0.2934, "step": 28960 }, { "epoch": 43.56, "grad_norm": 1.5112135410308838, "learning_rate": 5.643609022556391e-06, "loss": 0.2194, "step": 28970 }, { "epoch": 43.58, "grad_norm": 4.162924289703369, "learning_rate": 5.642105263157895e-06, "loss": 0.1885, "step": 28980 }, { "epoch": 43.59, "grad_norm": 6.321695327758789, "learning_rate": 5.640601503759399e-06, "loss": 0.1848, "step": 28990 }, { "epoch": 43.61, "grad_norm": 7.07379674911499, "learning_rate": 5.639097744360903e-06, "loss": 0.2176, "step": 29000 }, { "epoch": 43.62, "grad_norm": 2.475600242614746, "learning_rate": 5.637593984962406e-06, "loss": 0.2473, "step": 29010 }, { "epoch": 43.64, "grad_norm": 2.369236946105957, "learning_rate": 5.63609022556391e-06, "loss": 0.2338, "step": 29020 }, { "epoch": 43.65, "grad_norm": 8.563982963562012, "learning_rate": 5.634586466165414e-06, "loss": 0.2339, "step": 29030 }, { "epoch": 43.67, "grad_norm": 11.370944023132324, "learning_rate": 5.633082706766918e-06, "loss": 0.2524, "step": 29040 }, { "epoch": 43.68, "grad_norm": 8.391033172607422, "learning_rate": 5.631578947368422e-06, "loss": 0.2441, "step": 29050 }, { "epoch": 43.7, "grad_norm": 4.559657096862793, "learning_rate": 5.630075187969926e-06, "loss": 0.2515, "step": 29060 }, { "epoch": 43.71, "grad_norm": 4.1660637855529785, "learning_rate": 5.628571428571429e-06, "loss": 0.2495, "step": 29070 }, { "epoch": 43.73, "grad_norm": 8.0914306640625, "learning_rate": 5.627067669172933e-06, "loss": 0.2166, "step": 29080 }, { "epoch": 43.74, "grad_norm": 5.4428019523620605, "learning_rate": 5.6255639097744365e-06, "loss": 0.2574, "step": 29090 }, { "epoch": 43.76, "grad_norm": 4.377140045166016, "learning_rate": 5.624060150375941e-06, "loss": 0.2215, "step": 29100 }, { "epoch": 43.77, "grad_norm": 5.647352695465088, "learning_rate": 5.6225563909774436e-06, "loss": 0.2947, "step": 29110 }, { "epoch": 43.79, "grad_norm": 4.882458209991455, "learning_rate": 5.621052631578948e-06, "loss": 0.2522, "step": 29120 }, { "epoch": 43.8, "grad_norm": 6.157678604125977, "learning_rate": 5.6195488721804515e-06, "loss": 0.221, "step": 29130 }, { "epoch": 43.82, "grad_norm": 6.887104511260986, "learning_rate": 5.618045112781956e-06, "loss": 0.2797, "step": 29140 }, { "epoch": 43.83, "grad_norm": 3.868295907974243, "learning_rate": 5.616541353383459e-06, "loss": 0.2551, "step": 29150 }, { "epoch": 43.85, "grad_norm": 9.585166931152344, "learning_rate": 5.615037593984964e-06, "loss": 0.2301, "step": 29160 }, { "epoch": 43.86, "grad_norm": 5.948240756988525, "learning_rate": 5.613533834586466e-06, "loss": 0.2388, "step": 29170 }, { "epoch": 43.88, "grad_norm": 5.998847961425781, "learning_rate": 5.612030075187971e-06, "loss": 0.2077, "step": 29180 }, { "epoch": 43.89, "grad_norm": 5.52202844619751, "learning_rate": 5.610526315789474e-06, "loss": 0.1981, "step": 29190 }, { "epoch": 43.91, "grad_norm": 3.2842657566070557, "learning_rate": 5.609022556390979e-06, "loss": 0.2744, "step": 29200 }, { "epoch": 43.92, "grad_norm": 6.214591026306152, "learning_rate": 5.607518796992481e-06, "loss": 0.1858, "step": 29210 }, { "epoch": 43.94, "grad_norm": 5.361661911010742, "learning_rate": 5.606015037593986e-06, "loss": 0.2499, "step": 29220 }, { "epoch": 43.95, "grad_norm": 5.610089302062988, "learning_rate": 5.604511278195489e-06, "loss": 0.2287, "step": 29230 }, { "epoch": 43.97, "grad_norm": 5.380805492401123, "learning_rate": 5.6030075187969936e-06, "loss": 0.2702, "step": 29240 }, { "epoch": 43.98, "grad_norm": 3.860438823699951, "learning_rate": 5.601503759398497e-06, "loss": 0.2509, "step": 29250 }, { "epoch": 44.0, "grad_norm": 0.10728010535240173, "learning_rate": 5.600000000000001e-06, "loss": 0.1808, "step": 29260 }, { "epoch": 44.0, "eval_accuracy": 0.9304, "eval_loss": 0.30674317479133606, "eval_runtime": 85.2945, "eval_samples_per_second": 117.241, "eval_steps_per_second": 0.469, "step": 29260 }, { "epoch": 44.02, "grad_norm": 7.593850612640381, "learning_rate": 5.598496240601504e-06, "loss": 0.2738, "step": 29270 }, { "epoch": 44.03, "grad_norm": 6.34241247177124, "learning_rate": 5.596992481203008e-06, "loss": 0.2215, "step": 29280 }, { "epoch": 44.05, "grad_norm": 5.706809997558594, "learning_rate": 5.595488721804512e-06, "loss": 0.277, "step": 29290 }, { "epoch": 44.06, "grad_norm": 4.330235481262207, "learning_rate": 5.593984962406015e-06, "loss": 0.2421, "step": 29300 }, { "epoch": 44.08, "grad_norm": 6.897051811218262, "learning_rate": 5.592481203007519e-06, "loss": 0.2199, "step": 29310 }, { "epoch": 44.09, "grad_norm": 3.312359571456909, "learning_rate": 5.5909774436090226e-06, "loss": 0.1884, "step": 29320 }, { "epoch": 44.11, "grad_norm": 4.363166809082031, "learning_rate": 5.589473684210527e-06, "loss": 0.255, "step": 29330 }, { "epoch": 44.12, "grad_norm": 7.3857269287109375, "learning_rate": 5.5879699248120304e-06, "loss": 0.298, "step": 29340 }, { "epoch": 44.14, "grad_norm": 4.645081520080566, "learning_rate": 5.586466165413534e-06, "loss": 0.318, "step": 29350 }, { "epoch": 44.15, "grad_norm": 5.76023006439209, "learning_rate": 5.5849624060150375e-06, "loss": 0.2442, "step": 29360 }, { "epoch": 44.17, "grad_norm": 3.3517050743103027, "learning_rate": 5.583458646616542e-06, "loss": 0.2442, "step": 29370 }, { "epoch": 44.18, "grad_norm": 4.758605480194092, "learning_rate": 5.581954887218045e-06, "loss": 0.2328, "step": 29380 }, { "epoch": 44.2, "grad_norm": 5.125278949737549, "learning_rate": 5.58045112781955e-06, "loss": 0.2316, "step": 29390 }, { "epoch": 44.21, "grad_norm": 5.346681594848633, "learning_rate": 5.578947368421052e-06, "loss": 0.2119, "step": 29400 }, { "epoch": 44.23, "grad_norm": 5.317344665527344, "learning_rate": 5.577443609022557e-06, "loss": 0.1829, "step": 29410 }, { "epoch": 44.24, "grad_norm": 6.973268985748291, "learning_rate": 5.57593984962406e-06, "loss": 0.2207, "step": 29420 }, { "epoch": 44.26, "grad_norm": 2.5319881439208984, "learning_rate": 5.574436090225565e-06, "loss": 0.2677, "step": 29430 }, { "epoch": 44.27, "grad_norm": 1.6542171239852905, "learning_rate": 5.572932330827068e-06, "loss": 0.2092, "step": 29440 }, { "epoch": 44.29, "grad_norm": 5.905990123748779, "learning_rate": 5.571428571428572e-06, "loss": 0.2059, "step": 29450 }, { "epoch": 44.3, "grad_norm": 3.364076614379883, "learning_rate": 5.569924812030075e-06, "loss": 0.2847, "step": 29460 }, { "epoch": 44.32, "grad_norm": 2.6462037563323975, "learning_rate": 5.5684210526315796e-06, "loss": 0.2227, "step": 29470 }, { "epoch": 44.33, "grad_norm": 4.426711559295654, "learning_rate": 5.566917293233083e-06, "loss": 0.2758, "step": 29480 }, { "epoch": 44.35, "grad_norm": 6.097959518432617, "learning_rate": 5.5654135338345875e-06, "loss": 0.256, "step": 29490 }, { "epoch": 44.36, "grad_norm": 3.295834541320801, "learning_rate": 5.56390977443609e-06, "loss": 0.2586, "step": 29500 }, { "epoch": 44.38, "grad_norm": 6.391618251800537, "learning_rate": 5.5624060150375945e-06, "loss": 0.2186, "step": 29510 }, { "epoch": 44.39, "grad_norm": 5.297180652618408, "learning_rate": 5.560902255639098e-06, "loss": 0.2465, "step": 29520 }, { "epoch": 44.41, "grad_norm": 3.471672773361206, "learning_rate": 5.559398496240602e-06, "loss": 0.2363, "step": 29530 }, { "epoch": 44.42, "grad_norm": 3.7030515670776367, "learning_rate": 5.557894736842105e-06, "loss": 0.2308, "step": 29540 }, { "epoch": 44.44, "grad_norm": 4.913259029388428, "learning_rate": 5.556390977443609e-06, "loss": 0.2459, "step": 29550 }, { "epoch": 44.45, "grad_norm": 7.469844818115234, "learning_rate": 5.554887218045113e-06, "loss": 0.1795, "step": 29560 }, { "epoch": 44.47, "grad_norm": 6.876651763916016, "learning_rate": 5.553383458646617e-06, "loss": 0.2296, "step": 29570 }, { "epoch": 44.48, "grad_norm": 3.4335014820098877, "learning_rate": 5.551879699248121e-06, "loss": 0.2769, "step": 29580 }, { "epoch": 44.5, "grad_norm": 5.788983345031738, "learning_rate": 5.550375939849625e-06, "loss": 0.2428, "step": 29590 }, { "epoch": 44.51, "grad_norm": 4.837071895599365, "learning_rate": 5.548872180451128e-06, "loss": 0.2378, "step": 29600 }, { "epoch": 44.53, "grad_norm": 4.5005202293396, "learning_rate": 5.547368421052632e-06, "loss": 0.2397, "step": 29610 }, { "epoch": 44.54, "grad_norm": 4.948091983795166, "learning_rate": 5.545864661654136e-06, "loss": 0.2521, "step": 29620 }, { "epoch": 44.56, "grad_norm": 7.149682998657227, "learning_rate": 5.54436090225564e-06, "loss": 0.2811, "step": 29630 }, { "epoch": 44.57, "grad_norm": 4.339492321014404, "learning_rate": 5.542857142857143e-06, "loss": 0.19, "step": 29640 }, { "epoch": 44.59, "grad_norm": 5.427370548248291, "learning_rate": 5.541353383458647e-06, "loss": 0.2379, "step": 29650 }, { "epoch": 44.6, "grad_norm": 3.6944634914398193, "learning_rate": 5.539849624060151e-06, "loss": 0.183, "step": 29660 }, { "epoch": 44.62, "grad_norm": 5.690896034240723, "learning_rate": 5.538345864661655e-06, "loss": 0.1527, "step": 29670 }, { "epoch": 44.63, "grad_norm": 5.7550048828125, "learning_rate": 5.5368421052631586e-06, "loss": 0.1855, "step": 29680 }, { "epoch": 44.65, "grad_norm": 2.685657024383545, "learning_rate": 5.535338345864663e-06, "loss": 0.2244, "step": 29690 }, { "epoch": 44.66, "grad_norm": 3.263343334197998, "learning_rate": 5.533834586466166e-06, "loss": 0.2051, "step": 29700 }, { "epoch": 44.68, "grad_norm": 5.3920183181762695, "learning_rate": 5.53233082706767e-06, "loss": 0.2987, "step": 29710 }, { "epoch": 44.69, "grad_norm": 9.512574195861816, "learning_rate": 5.5308270676691735e-06, "loss": 0.2528, "step": 29720 }, { "epoch": 44.71, "grad_norm": 8.392505645751953, "learning_rate": 5.529323308270678e-06, "loss": 0.2906, "step": 29730 }, { "epoch": 44.72, "grad_norm": 5.764971733093262, "learning_rate": 5.5278195488721805e-06, "loss": 0.2531, "step": 29740 }, { "epoch": 44.74, "grad_norm": 4.783633232116699, "learning_rate": 5.526315789473685e-06, "loss": 0.3142, "step": 29750 }, { "epoch": 44.75, "grad_norm": 7.186029434204102, "learning_rate": 5.524812030075188e-06, "loss": 0.2501, "step": 29760 }, { "epoch": 44.77, "grad_norm": 1.497518539428711, "learning_rate": 5.523308270676693e-06, "loss": 0.2384, "step": 29770 }, { "epoch": 44.78, "grad_norm": 8.040397644042969, "learning_rate": 5.521804511278196e-06, "loss": 0.1935, "step": 29780 }, { "epoch": 44.8, "grad_norm": 5.375740051269531, "learning_rate": 5.520300751879699e-06, "loss": 0.2086, "step": 29790 }, { "epoch": 44.81, "grad_norm": 4.856134414672852, "learning_rate": 5.518796992481203e-06, "loss": 0.2516, "step": 29800 }, { "epoch": 44.83, "grad_norm": 5.769529819488525, "learning_rate": 5.517293233082707e-06, "loss": 0.2667, "step": 29810 }, { "epoch": 44.84, "grad_norm": 3.9014487266540527, "learning_rate": 5.515789473684211e-06, "loss": 0.2218, "step": 29820 }, { "epoch": 44.86, "grad_norm": 3.6634175777435303, "learning_rate": 5.514285714285714e-06, "loss": 0.2317, "step": 29830 }, { "epoch": 44.87, "grad_norm": 3.2795653343200684, "learning_rate": 5.512781954887218e-06, "loss": 0.2201, "step": 29840 }, { "epoch": 44.89, "grad_norm": 3.049172878265381, "learning_rate": 5.511278195488722e-06, "loss": 0.1521, "step": 29850 }, { "epoch": 44.9, "grad_norm": 3.333717107772827, "learning_rate": 5.509774436090226e-06, "loss": 0.272, "step": 29860 }, { "epoch": 44.92, "grad_norm": 6.023979187011719, "learning_rate": 5.50827067669173e-06, "loss": 0.2422, "step": 29870 }, { "epoch": 44.93, "grad_norm": 2.427889347076416, "learning_rate": 5.506766917293234e-06, "loss": 0.219, "step": 29880 }, { "epoch": 44.95, "grad_norm": 7.4705729484558105, "learning_rate": 5.505263157894737e-06, "loss": 0.2594, "step": 29890 }, { "epoch": 44.96, "grad_norm": 3.8348019123077393, "learning_rate": 5.503759398496241e-06, "loss": 0.2171, "step": 29900 }, { "epoch": 44.98, "grad_norm": 6.202742576599121, "learning_rate": 5.502255639097745e-06, "loss": 0.3124, "step": 29910 }, { "epoch": 44.99, "grad_norm": 6.531280517578125, "learning_rate": 5.500751879699249e-06, "loss": 0.2477, "step": 29920 }, { "epoch": 45.0, "eval_accuracy": 0.9289, "eval_loss": 0.30727890133857727, "eval_runtime": 84.3468, "eval_samples_per_second": 118.558, "eval_steps_per_second": 0.474, "step": 29925 }, { "epoch": 45.01, "grad_norm": 6.2103376388549805, "learning_rate": 5.499248120300752e-06, "loss": 0.3184, "step": 29930 }, { "epoch": 45.02, "grad_norm": 4.898624897003174, "learning_rate": 5.497744360902256e-06, "loss": 0.2263, "step": 29940 }, { "epoch": 45.04, "grad_norm": 5.601806640625, "learning_rate": 5.4962406015037595e-06, "loss": 0.1897, "step": 29950 }, { "epoch": 45.05, "grad_norm": 3.2715799808502197, "learning_rate": 5.494736842105264e-06, "loss": 0.2605, "step": 29960 }, { "epoch": 45.07, "grad_norm": 4.670425891876221, "learning_rate": 5.493233082706767e-06, "loss": 0.2316, "step": 29970 }, { "epoch": 45.08, "grad_norm": 3.2674357891082764, "learning_rate": 5.491729323308272e-06, "loss": 0.1808, "step": 29980 }, { "epoch": 45.1, "grad_norm": 4.537569522857666, "learning_rate": 5.4902255639097744e-06, "loss": 0.2665, "step": 29990 }, { "epoch": 45.11, "grad_norm": 4.6132378578186035, "learning_rate": 5.488721804511279e-06, "loss": 0.2267, "step": 30000 }, { "epoch": 45.13, "grad_norm": 6.702213764190674, "learning_rate": 5.487218045112782e-06, "loss": 0.2676, "step": 30010 }, { "epoch": 45.14, "grad_norm": 5.990668296813965, "learning_rate": 5.485714285714287e-06, "loss": 0.265, "step": 30020 }, { "epoch": 45.16, "grad_norm": 4.7114787101745605, "learning_rate": 5.484210526315789e-06, "loss": 0.204, "step": 30030 }, { "epoch": 45.17, "grad_norm": 2.577725648880005, "learning_rate": 5.482706766917294e-06, "loss": 0.2601, "step": 30040 }, { "epoch": 45.19, "grad_norm": 7.188079833984375, "learning_rate": 5.481203007518797e-06, "loss": 0.2447, "step": 30050 }, { "epoch": 45.2, "grad_norm": 8.671704292297363, "learning_rate": 5.479699248120302e-06, "loss": 0.202, "step": 30060 }, { "epoch": 45.22, "grad_norm": 1.6407883167266846, "learning_rate": 5.478195488721805e-06, "loss": 0.2504, "step": 30070 }, { "epoch": 45.23, "grad_norm": 6.040456295013428, "learning_rate": 5.476691729323309e-06, "loss": 0.2701, "step": 30080 }, { "epoch": 45.25, "grad_norm": 4.857377529144287, "learning_rate": 5.475187969924812e-06, "loss": 0.1846, "step": 30090 }, { "epoch": 45.26, "grad_norm": 4.847814083099365, "learning_rate": 5.4736842105263165e-06, "loss": 0.2355, "step": 30100 }, { "epoch": 45.28, "grad_norm": 5.133174896240234, "learning_rate": 5.47218045112782e-06, "loss": 0.2244, "step": 30110 }, { "epoch": 45.29, "grad_norm": 4.437646389007568, "learning_rate": 5.470676691729324e-06, "loss": 0.2366, "step": 30120 }, { "epoch": 45.31, "grad_norm": 9.677255630493164, "learning_rate": 5.469172932330827e-06, "loss": 0.2258, "step": 30130 }, { "epoch": 45.32, "grad_norm": 10.691313743591309, "learning_rate": 5.4676691729323314e-06, "loss": 0.2529, "step": 30140 }, { "epoch": 45.34, "grad_norm": 4.892699718475342, "learning_rate": 5.466165413533835e-06, "loss": 0.212, "step": 30150 }, { "epoch": 45.35, "grad_norm": 8.477937698364258, "learning_rate": 5.464661654135339e-06, "loss": 0.2291, "step": 30160 }, { "epoch": 45.37, "grad_norm": 4.592231273651123, "learning_rate": 5.463157894736843e-06, "loss": 0.2567, "step": 30170 }, { "epoch": 45.38, "grad_norm": 4.459981918334961, "learning_rate": 5.461654135338346e-06, "loss": 0.2596, "step": 30180 }, { "epoch": 45.4, "grad_norm": 6.071181774139404, "learning_rate": 5.46015037593985e-06, "loss": 0.2445, "step": 30190 }, { "epoch": 45.41, "grad_norm": 5.071913242340088, "learning_rate": 5.458646616541354e-06, "loss": 0.208, "step": 30200 }, { "epoch": 45.43, "grad_norm": 7.438906669616699, "learning_rate": 5.457142857142858e-06, "loss": 0.2321, "step": 30210 }, { "epoch": 45.44, "grad_norm": 5.6083292961120605, "learning_rate": 5.455639097744362e-06, "loss": 0.1798, "step": 30220 }, { "epoch": 45.46, "grad_norm": 5.303403377532959, "learning_rate": 5.454135338345865e-06, "loss": 0.2718, "step": 30230 }, { "epoch": 45.47, "grad_norm": 4.844057083129883, "learning_rate": 5.452631578947369e-06, "loss": 0.1896, "step": 30240 }, { "epoch": 45.49, "grad_norm": 4.131418228149414, "learning_rate": 5.451127819548873e-06, "loss": 0.2972, "step": 30250 }, { "epoch": 45.5, "grad_norm": 4.064949035644531, "learning_rate": 5.449624060150377e-06, "loss": 0.2298, "step": 30260 }, { "epoch": 45.52, "grad_norm": 4.775879859924316, "learning_rate": 5.44812030075188e-06, "loss": 0.2578, "step": 30270 }, { "epoch": 45.53, "grad_norm": 8.634382247924805, "learning_rate": 5.446616541353383e-06, "loss": 0.2508, "step": 30280 }, { "epoch": 45.55, "grad_norm": 2.0520710945129395, "learning_rate": 5.445112781954888e-06, "loss": 0.2033, "step": 30290 }, { "epoch": 45.56, "grad_norm": 7.116695880889893, "learning_rate": 5.443609022556391e-06, "loss": 0.2528, "step": 30300 }, { "epoch": 45.58, "grad_norm": 5.326464653015137, "learning_rate": 5.4421052631578955e-06, "loss": 0.2536, "step": 30310 }, { "epoch": 45.59, "grad_norm": 10.817031860351562, "learning_rate": 5.440601503759398e-06, "loss": 0.2231, "step": 30320 }, { "epoch": 45.61, "grad_norm": 4.3931965827941895, "learning_rate": 5.4390977443609025e-06, "loss": 0.261, "step": 30330 }, { "epoch": 45.62, "grad_norm": 5.171022891998291, "learning_rate": 5.437593984962406e-06, "loss": 0.2064, "step": 30340 }, { "epoch": 45.64, "grad_norm": 6.176837921142578, "learning_rate": 5.4360902255639104e-06, "loss": 0.2539, "step": 30350 }, { "epoch": 45.65, "grad_norm": 5.6013078689575195, "learning_rate": 5.434586466165413e-06, "loss": 0.229, "step": 30360 }, { "epoch": 45.67, "grad_norm": 7.163257122039795, "learning_rate": 5.4330827067669175e-06, "loss": 0.2193, "step": 30370 }, { "epoch": 45.68, "grad_norm": 5.970539093017578, "learning_rate": 5.431578947368421e-06, "loss": 0.2238, "step": 30380 }, { "epoch": 45.7, "grad_norm": 4.074548721313477, "learning_rate": 5.430075187969925e-06, "loss": 0.1859, "step": 30390 }, { "epoch": 45.71, "grad_norm": 6.107884883880615, "learning_rate": 5.428571428571429e-06, "loss": 0.2406, "step": 30400 }, { "epoch": 45.73, "grad_norm": 6.470170497894287, "learning_rate": 5.427067669172933e-06, "loss": 0.2258, "step": 30410 }, { "epoch": 45.74, "grad_norm": 5.55424165725708, "learning_rate": 5.425563909774436e-06, "loss": 0.2494, "step": 30420 }, { "epoch": 45.76, "grad_norm": 7.8290910720825195, "learning_rate": 5.42406015037594e-06, "loss": 0.1809, "step": 30430 }, { "epoch": 45.77, "grad_norm": 3.04001784324646, "learning_rate": 5.422556390977444e-06, "loss": 0.2146, "step": 30440 }, { "epoch": 45.79, "grad_norm": 4.655550479888916, "learning_rate": 5.421052631578948e-06, "loss": 0.2232, "step": 30450 }, { "epoch": 45.8, "grad_norm": 5.950998306274414, "learning_rate": 5.419548872180451e-06, "loss": 0.275, "step": 30460 }, { "epoch": 45.82, "grad_norm": 6.617697238922119, "learning_rate": 5.418045112781955e-06, "loss": 0.1975, "step": 30470 }, { "epoch": 45.83, "grad_norm": 4.946775913238525, "learning_rate": 5.416541353383459e-06, "loss": 0.2631, "step": 30480 }, { "epoch": 45.85, "grad_norm": 8.360345840454102, "learning_rate": 5.415037593984963e-06, "loss": 0.2695, "step": 30490 }, { "epoch": 45.86, "grad_norm": 5.876543998718262, "learning_rate": 5.413533834586467e-06, "loss": 0.253, "step": 30500 }, { "epoch": 45.88, "grad_norm": 5.3216047286987305, "learning_rate": 5.412030075187971e-06, "loss": 0.2594, "step": 30510 }, { "epoch": 45.89, "grad_norm": 3.109912633895874, "learning_rate": 5.410526315789474e-06, "loss": 0.2005, "step": 30520 }, { "epoch": 45.91, "grad_norm": 16.188390731811523, "learning_rate": 5.409022556390978e-06, "loss": 0.1955, "step": 30530 }, { "epoch": 45.92, "grad_norm": 4.817110061645508, "learning_rate": 5.4075187969924815e-06, "loss": 0.2133, "step": 30540 }, { "epoch": 45.94, "grad_norm": 6.045027732849121, "learning_rate": 5.406015037593986e-06, "loss": 0.2392, "step": 30550 }, { "epoch": 45.95, "grad_norm": 3.955580234527588, "learning_rate": 5.4045112781954886e-06, "loss": 0.224, "step": 30560 }, { "epoch": 45.97, "grad_norm": 2.896059036254883, "learning_rate": 5.403007518796993e-06, "loss": 0.2249, "step": 30570 }, { "epoch": 45.98, "grad_norm": 5.959082126617432, "learning_rate": 5.4015037593984964e-06, "loss": 0.2038, "step": 30580 }, { "epoch": 46.0, "grad_norm": 23.55235481262207, "learning_rate": 5.400000000000001e-06, "loss": 0.2059, "step": 30590 }, { "epoch": 46.0, "eval_accuracy": 0.931, "eval_loss": 0.30095645785331726, "eval_runtime": 84.8003, "eval_samples_per_second": 117.924, "eval_steps_per_second": 0.472, "step": 30590 }, { "epoch": 46.02, "grad_norm": 14.315238952636719, "learning_rate": 5.398496240601504e-06, "loss": 0.2133, "step": 30600 }, { "epoch": 46.03, "grad_norm": 3.028569221496582, "learning_rate": 5.396992481203009e-06, "loss": 0.2082, "step": 30610 }, { "epoch": 46.05, "grad_norm": 4.44739294052124, "learning_rate": 5.395488721804511e-06, "loss": 0.3014, "step": 30620 }, { "epoch": 46.06, "grad_norm": 5.786952018737793, "learning_rate": 5.393984962406016e-06, "loss": 0.196, "step": 30630 }, { "epoch": 46.08, "grad_norm": 6.879361152648926, "learning_rate": 5.392481203007519e-06, "loss": 0.2238, "step": 30640 }, { "epoch": 46.09, "grad_norm": 2.7093942165374756, "learning_rate": 5.390977443609024e-06, "loss": 0.236, "step": 30650 }, { "epoch": 46.11, "grad_norm": 4.412895679473877, "learning_rate": 5.389473684210526e-06, "loss": 0.214, "step": 30660 }, { "epoch": 46.12, "grad_norm": 2.3530495166778564, "learning_rate": 5.387969924812031e-06, "loss": 0.2996, "step": 30670 }, { "epoch": 46.14, "grad_norm": 5.1143107414245605, "learning_rate": 5.386466165413534e-06, "loss": 0.2465, "step": 30680 }, { "epoch": 46.15, "grad_norm": 5.444381237030029, "learning_rate": 5.3849624060150385e-06, "loss": 0.2921, "step": 30690 }, { "epoch": 46.17, "grad_norm": 6.554684162139893, "learning_rate": 5.383458646616542e-06, "loss": 0.2532, "step": 30700 }, { "epoch": 46.18, "grad_norm": 6.188846588134766, "learning_rate": 5.3819548872180464e-06, "loss": 0.1924, "step": 30710 }, { "epoch": 46.2, "grad_norm": 4.2928361892700195, "learning_rate": 5.380451127819549e-06, "loss": 0.2115, "step": 30720 }, { "epoch": 46.21, "grad_norm": 1.6907163858413696, "learning_rate": 5.3789473684210535e-06, "loss": 0.3012, "step": 30730 }, { "epoch": 46.23, "grad_norm": 8.911703109741211, "learning_rate": 5.377443609022557e-06, "loss": 0.2386, "step": 30740 }, { "epoch": 46.24, "grad_norm": 12.857329368591309, "learning_rate": 5.375939849624061e-06, "loss": 0.2622, "step": 30750 }, { "epoch": 46.26, "grad_norm": 4.299805641174316, "learning_rate": 5.374436090225564e-06, "loss": 0.2495, "step": 30760 }, { "epoch": 46.27, "grad_norm": 4.013417720794678, "learning_rate": 5.3729323308270675e-06, "loss": 0.2414, "step": 30770 }, { "epoch": 46.29, "grad_norm": 5.057952404022217, "learning_rate": 5.371428571428572e-06, "loss": 0.3043, "step": 30780 }, { "epoch": 46.3, "grad_norm": 2.708482265472412, "learning_rate": 5.3699248120300754e-06, "loss": 0.2512, "step": 30790 }, { "epoch": 46.32, "grad_norm": 4.2768988609313965, "learning_rate": 5.36842105263158e-06, "loss": 0.2791, "step": 30800 }, { "epoch": 46.33, "grad_norm": 10.092656135559082, "learning_rate": 5.3669172932330825e-06, "loss": 0.2439, "step": 30810 }, { "epoch": 46.35, "grad_norm": 7.980869293212891, "learning_rate": 5.365413533834587e-06, "loss": 0.2967, "step": 30820 }, { "epoch": 46.36, "grad_norm": 6.0706000328063965, "learning_rate": 5.36390977443609e-06, "loss": 0.2448, "step": 30830 }, { "epoch": 46.38, "grad_norm": 5.074240207672119, "learning_rate": 5.362406015037595e-06, "loss": 0.2367, "step": 30840 }, { "epoch": 46.39, "grad_norm": 4.816855430603027, "learning_rate": 5.360902255639097e-06, "loss": 0.2318, "step": 30850 }, { "epoch": 46.41, "grad_norm": 3.8746345043182373, "learning_rate": 5.359398496240602e-06, "loss": 0.2336, "step": 30860 }, { "epoch": 46.42, "grad_norm": 5.22980260848999, "learning_rate": 5.357894736842105e-06, "loss": 0.2148, "step": 30870 }, { "epoch": 46.44, "grad_norm": 6.546250343322754, "learning_rate": 5.35639097744361e-06, "loss": 0.2513, "step": 30880 }, { "epoch": 46.45, "grad_norm": 3.120495319366455, "learning_rate": 5.354887218045113e-06, "loss": 0.2673, "step": 30890 }, { "epoch": 46.47, "grad_norm": 4.755849838256836, "learning_rate": 5.3533834586466175e-06, "loss": 0.2706, "step": 30900 }, { "epoch": 46.48, "grad_norm": 3.598883628845215, "learning_rate": 5.35187969924812e-06, "loss": 0.1639, "step": 30910 }, { "epoch": 46.5, "grad_norm": 4.807301044464111, "learning_rate": 5.3503759398496246e-06, "loss": 0.2063, "step": 30920 }, { "epoch": 46.51, "grad_norm": 4.723282337188721, "learning_rate": 5.348872180451128e-06, "loss": 0.1954, "step": 30930 }, { "epoch": 46.53, "grad_norm": 6.898508071899414, "learning_rate": 5.3473684210526325e-06, "loss": 0.274, "step": 30940 }, { "epoch": 46.54, "grad_norm": 1.9944006204605103, "learning_rate": 5.345864661654135e-06, "loss": 0.2548, "step": 30950 }, { "epoch": 46.56, "grad_norm": 4.713189125061035, "learning_rate": 5.3443609022556395e-06, "loss": 0.2342, "step": 30960 }, { "epoch": 46.57, "grad_norm": 5.438635349273682, "learning_rate": 5.342857142857143e-06, "loss": 0.2282, "step": 30970 }, { "epoch": 46.59, "grad_norm": 4.57274055480957, "learning_rate": 5.341353383458647e-06, "loss": 0.2339, "step": 30980 }, { "epoch": 46.6, "grad_norm": 6.225501537322998, "learning_rate": 5.339849624060151e-06, "loss": 0.2293, "step": 30990 }, { "epoch": 46.62, "grad_norm": 10.704837799072266, "learning_rate": 5.338345864661654e-06, "loss": 0.2327, "step": 31000 }, { "epoch": 46.63, "grad_norm": 7.582201957702637, "learning_rate": 5.336842105263158e-06, "loss": 0.2563, "step": 31010 }, { "epoch": 46.65, "grad_norm": 6.811306476593018, "learning_rate": 5.335338345864662e-06, "loss": 0.198, "step": 31020 }, { "epoch": 46.66, "grad_norm": 3.75140118598938, "learning_rate": 5.333834586466166e-06, "loss": 0.2295, "step": 31030 }, { "epoch": 46.68, "grad_norm": 7.410831928253174, "learning_rate": 5.33233082706767e-06, "loss": 0.2624, "step": 31040 }, { "epoch": 46.69, "grad_norm": 4.584934711456299, "learning_rate": 5.330827067669173e-06, "loss": 0.2611, "step": 31050 }, { "epoch": 46.71, "grad_norm": 2.435068130493164, "learning_rate": 5.329323308270677e-06, "loss": 0.2045, "step": 31060 }, { "epoch": 46.72, "grad_norm": 6.5712056159973145, "learning_rate": 5.327819548872181e-06, "loss": 0.1974, "step": 31070 }, { "epoch": 46.74, "grad_norm": 6.2264862060546875, "learning_rate": 5.326315789473685e-06, "loss": 0.3578, "step": 31080 }, { "epoch": 46.75, "grad_norm": 4.949437141418457, "learning_rate": 5.324812030075189e-06, "loss": 0.2336, "step": 31090 }, { "epoch": 46.77, "grad_norm": 8.773530006408691, "learning_rate": 5.323308270676692e-06, "loss": 0.2793, "step": 31100 }, { "epoch": 46.78, "grad_norm": 4.285604000091553, "learning_rate": 5.321804511278196e-06, "loss": 0.2367, "step": 31110 }, { "epoch": 46.8, "grad_norm": 5.313040256500244, "learning_rate": 5.3203007518797e-06, "loss": 0.2804, "step": 31120 }, { "epoch": 46.81, "grad_norm": 4.113051414489746, "learning_rate": 5.3187969924812035e-06, "loss": 0.2158, "step": 31130 }, { "epoch": 46.83, "grad_norm": 3.403592824935913, "learning_rate": 5.317293233082708e-06, "loss": 0.2477, "step": 31140 }, { "epoch": 46.84, "grad_norm": 9.54311466217041, "learning_rate": 5.315789473684211e-06, "loss": 0.2816, "step": 31150 }, { "epoch": 46.86, "grad_norm": 11.368732452392578, "learning_rate": 5.314285714285715e-06, "loss": 0.2512, "step": 31160 }, { "epoch": 46.87, "grad_norm": 8.554758071899414, "learning_rate": 5.3127819548872185e-06, "loss": 0.2527, "step": 31170 }, { "epoch": 46.89, "grad_norm": 4.945856094360352, "learning_rate": 5.311278195488723e-06, "loss": 0.2287, "step": 31180 }, { "epoch": 46.9, "grad_norm": 5.310131072998047, "learning_rate": 5.3097744360902255e-06, "loss": 0.2594, "step": 31190 }, { "epoch": 46.92, "grad_norm": 5.681679725646973, "learning_rate": 5.30827067669173e-06, "loss": 0.2152, "step": 31200 }, { "epoch": 46.93, "grad_norm": 7.65255069732666, "learning_rate": 5.306766917293233e-06, "loss": 0.2017, "step": 31210 }, { "epoch": 46.95, "grad_norm": 7.68698787689209, "learning_rate": 5.305263157894738e-06, "loss": 0.2355, "step": 31220 }, { "epoch": 46.96, "grad_norm": 6.632236003875732, "learning_rate": 5.303759398496241e-06, "loss": 0.2626, "step": 31230 }, { "epoch": 46.98, "grad_norm": 3.1763651371002197, "learning_rate": 5.302255639097746e-06, "loss": 0.2013, "step": 31240 }, { "epoch": 46.99, "grad_norm": 5.487941741943359, "learning_rate": 5.300751879699248e-06, "loss": 0.2156, "step": 31250 }, { "epoch": 47.0, "eval_accuracy": 0.9318, "eval_loss": 0.2919594645500183, "eval_runtime": 84.7462, "eval_samples_per_second": 117.999, "eval_steps_per_second": 0.472, "step": 31255 }, { "epoch": 47.01, "grad_norm": 4.854064464569092, "learning_rate": 5.299248120300753e-06, "loss": 0.2527, "step": 31260 }, { "epoch": 47.02, "grad_norm": 6.271472454071045, "learning_rate": 5.297744360902256e-06, "loss": 0.191, "step": 31270 }, { "epoch": 47.04, "grad_norm": 6.080266952514648, "learning_rate": 5.296240601503759e-06, "loss": 0.2479, "step": 31280 }, { "epoch": 47.05, "grad_norm": 7.085132122039795, "learning_rate": 5.294736842105263e-06, "loss": 0.2624, "step": 31290 }, { "epoch": 47.07, "grad_norm": 4.636129856109619, "learning_rate": 5.293233082706767e-06, "loss": 0.1893, "step": 31300 }, { "epoch": 47.08, "grad_norm": 6.469307899475098, "learning_rate": 5.291729323308271e-06, "loss": 0.1865, "step": 31310 }, { "epoch": 47.1, "grad_norm": 8.775065422058105, "learning_rate": 5.290225563909775e-06, "loss": 0.2459, "step": 31320 }, { "epoch": 47.11, "grad_norm": 3.9796481132507324, "learning_rate": 5.288721804511279e-06, "loss": 0.257, "step": 31330 }, { "epoch": 47.13, "grad_norm": 5.34701681137085, "learning_rate": 5.287218045112782e-06, "loss": 0.199, "step": 31340 }, { "epoch": 47.14, "grad_norm": 6.631876468658447, "learning_rate": 5.285714285714286e-06, "loss": 0.2502, "step": 31350 }, { "epoch": 47.16, "grad_norm": 4.332927227020264, "learning_rate": 5.2842105263157896e-06, "loss": 0.2145, "step": 31360 }, { "epoch": 47.17, "grad_norm": 7.87563419342041, "learning_rate": 5.282706766917294e-06, "loss": 0.2484, "step": 31370 }, { "epoch": 47.19, "grad_norm": 4.990164279937744, "learning_rate": 5.281203007518797e-06, "loss": 0.2739, "step": 31380 }, { "epoch": 47.2, "grad_norm": 5.960522174835205, "learning_rate": 5.279699248120301e-06, "loss": 0.2388, "step": 31390 }, { "epoch": 47.22, "grad_norm": 8.016762733459473, "learning_rate": 5.2781954887218045e-06, "loss": 0.2013, "step": 31400 }, { "epoch": 47.23, "grad_norm": 4.126075744628906, "learning_rate": 5.276691729323309e-06, "loss": 0.2994, "step": 31410 }, { "epoch": 47.25, "grad_norm": 4.315171241760254, "learning_rate": 5.275187969924812e-06, "loss": 0.3082, "step": 31420 }, { "epoch": 47.26, "grad_norm": 6.842277526855469, "learning_rate": 5.273684210526317e-06, "loss": 0.2019, "step": 31430 }, { "epoch": 47.28, "grad_norm": 16.402454376220703, "learning_rate": 5.272180451127819e-06, "loss": 0.3009, "step": 31440 }, { "epoch": 47.29, "grad_norm": 4.734708309173584, "learning_rate": 5.270676691729324e-06, "loss": 0.1851, "step": 31450 }, { "epoch": 47.31, "grad_norm": 7.005868434906006, "learning_rate": 5.269172932330827e-06, "loss": 0.2021, "step": 31460 }, { "epoch": 47.32, "grad_norm": 4.445467948913574, "learning_rate": 5.267669172932332e-06, "loss": 0.2028, "step": 31470 }, { "epoch": 47.34, "grad_norm": 3.294844627380371, "learning_rate": 5.266165413533834e-06, "loss": 0.2901, "step": 31480 }, { "epoch": 47.35, "grad_norm": 5.847194194793701, "learning_rate": 5.264661654135339e-06, "loss": 0.2215, "step": 31490 }, { "epoch": 47.37, "grad_norm": 3.2134320735931396, "learning_rate": 5.263157894736842e-06, "loss": 0.213, "step": 31500 }, { "epoch": 47.38, "grad_norm": 7.5339035987854, "learning_rate": 5.261654135338347e-06, "loss": 0.2378, "step": 31510 }, { "epoch": 47.4, "grad_norm": 3.7223424911499023, "learning_rate": 5.26015037593985e-06, "loss": 0.3221, "step": 31520 }, { "epoch": 47.41, "grad_norm": 4.247413158416748, "learning_rate": 5.2586466165413545e-06, "loss": 0.274, "step": 31530 }, { "epoch": 47.43, "grad_norm": 4.432199478149414, "learning_rate": 5.257142857142857e-06, "loss": 0.2672, "step": 31540 }, { "epoch": 47.44, "grad_norm": 4.548630714416504, "learning_rate": 5.2556390977443615e-06, "loss": 0.2413, "step": 31550 }, { "epoch": 47.46, "grad_norm": 5.084230899810791, "learning_rate": 5.254135338345865e-06, "loss": 0.233, "step": 31560 }, { "epoch": 47.47, "grad_norm": 2.7033839225769043, "learning_rate": 5.252631578947369e-06, "loss": 0.2344, "step": 31570 }, { "epoch": 47.49, "grad_norm": 6.172457218170166, "learning_rate": 5.251127819548872e-06, "loss": 0.2259, "step": 31580 }, { "epoch": 47.5, "grad_norm": 4.346304893493652, "learning_rate": 5.2496240601503764e-06, "loss": 0.2755, "step": 31590 }, { "epoch": 47.52, "grad_norm": 5.721127986907959, "learning_rate": 5.24812030075188e-06, "loss": 0.2132, "step": 31600 }, { "epoch": 47.53, "grad_norm": 4.425881862640381, "learning_rate": 5.246616541353384e-06, "loss": 0.249, "step": 31610 }, { "epoch": 47.55, "grad_norm": 5.007637977600098, "learning_rate": 5.245112781954888e-06, "loss": 0.2291, "step": 31620 }, { "epoch": 47.56, "grad_norm": 5.64668083190918, "learning_rate": 5.243609022556392e-06, "loss": 0.2487, "step": 31630 }, { "epoch": 47.58, "grad_norm": 4.099886417388916, "learning_rate": 5.242105263157895e-06, "loss": 0.2259, "step": 31640 }, { "epoch": 47.59, "grad_norm": 4.3517584800720215, "learning_rate": 5.240601503759399e-06, "loss": 0.1817, "step": 31650 }, { "epoch": 47.61, "grad_norm": 7.435219764709473, "learning_rate": 5.239097744360903e-06, "loss": 0.2105, "step": 31660 }, { "epoch": 47.62, "grad_norm": 4.386735439300537, "learning_rate": 5.237593984962407e-06, "loss": 0.2167, "step": 31670 }, { "epoch": 47.64, "grad_norm": 9.977165222167969, "learning_rate": 5.23609022556391e-06, "loss": 0.2578, "step": 31680 }, { "epoch": 47.65, "grad_norm": 5.100271701812744, "learning_rate": 5.234586466165414e-06, "loss": 0.2037, "step": 31690 }, { "epoch": 47.67, "grad_norm": 12.209273338317871, "learning_rate": 5.233082706766918e-06, "loss": 0.2373, "step": 31700 }, { "epoch": 47.68, "grad_norm": 4.825228691101074, "learning_rate": 5.231578947368422e-06, "loss": 0.217, "step": 31710 }, { "epoch": 47.7, "grad_norm": 4.916600227355957, "learning_rate": 5.2300751879699256e-06, "loss": 0.2243, "step": 31720 }, { "epoch": 47.71, "grad_norm": 4.243865966796875, "learning_rate": 5.22857142857143e-06, "loss": 0.198, "step": 31730 }, { "epoch": 47.73, "grad_norm": 4.085118293762207, "learning_rate": 5.227067669172933e-06, "loss": 0.2393, "step": 31740 }, { "epoch": 47.74, "grad_norm": 3.169874906539917, "learning_rate": 5.225563909774437e-06, "loss": 0.227, "step": 31750 }, { "epoch": 47.76, "grad_norm": 5.03513240814209, "learning_rate": 5.2240601503759405e-06, "loss": 0.1579, "step": 31760 }, { "epoch": 47.77, "grad_norm": 4.764927864074707, "learning_rate": 5.222556390977443e-06, "loss": 0.2179, "step": 31770 }, { "epoch": 47.79, "grad_norm": 3.114337205886841, "learning_rate": 5.2210526315789475e-06, "loss": 0.1934, "step": 31780 }, { "epoch": 47.8, "grad_norm": 4.358081817626953, "learning_rate": 5.219548872180451e-06, "loss": 0.1997, "step": 31790 }, { "epoch": 47.82, "grad_norm": 4.839754104614258, "learning_rate": 5.218045112781955e-06, "loss": 0.2273, "step": 31800 }, { "epoch": 47.83, "grad_norm": 4.898540496826172, "learning_rate": 5.216541353383459e-06, "loss": 0.2084, "step": 31810 }, { "epoch": 47.85, "grad_norm": 7.094751834869385, "learning_rate": 5.215037593984963e-06, "loss": 0.2072, "step": 31820 }, { "epoch": 47.86, "grad_norm": 6.301358699798584, "learning_rate": 5.213533834586466e-06, "loss": 0.2076, "step": 31830 }, { "epoch": 47.88, "grad_norm": 5.953322410583496, "learning_rate": 5.21203007518797e-06, "loss": 0.234, "step": 31840 }, { "epoch": 47.89, "grad_norm": 4.481212615966797, "learning_rate": 5.210526315789474e-06, "loss": 0.2074, "step": 31850 }, { "epoch": 47.91, "grad_norm": 7.783965587615967, "learning_rate": 5.209022556390978e-06, "loss": 0.2597, "step": 31860 }, { "epoch": 47.92, "grad_norm": 10.910694122314453, "learning_rate": 5.207518796992481e-06, "loss": 0.2851, "step": 31870 }, { "epoch": 47.94, "grad_norm": 1.9801486730575562, "learning_rate": 5.206015037593985e-06, "loss": 0.2154, "step": 31880 }, { "epoch": 47.95, "grad_norm": 11.243240356445312, "learning_rate": 5.204511278195489e-06, "loss": 0.2158, "step": 31890 }, { "epoch": 47.97, "grad_norm": 2.5445377826690674, "learning_rate": 5.203007518796993e-06, "loss": 0.2497, "step": 31900 }, { "epoch": 47.98, "grad_norm": 5.46054744720459, "learning_rate": 5.201503759398497e-06, "loss": 0.226, "step": 31910 }, { "epoch": 48.0, "grad_norm": 26.797359466552734, "learning_rate": 5.2e-06, "loss": 0.2719, "step": 31920 }, { "epoch": 48.0, "eval_accuracy": 0.9311, "eval_loss": 0.3056710362434387, "eval_runtime": 84.8487, "eval_samples_per_second": 117.857, "eval_steps_per_second": 0.471, "step": 31920 }, { "epoch": 48.02, "grad_norm": 3.771094799041748, "learning_rate": 5.198496240601504e-06, "loss": 0.197, "step": 31930 }, { "epoch": 48.03, "grad_norm": 3.460973024368286, "learning_rate": 5.196992481203008e-06, "loss": 0.2012, "step": 31940 }, { "epoch": 48.05, "grad_norm": 7.17149543762207, "learning_rate": 5.195488721804512e-06, "loss": 0.2799, "step": 31950 }, { "epoch": 48.06, "grad_norm": 2.8762929439544678, "learning_rate": 5.193984962406016e-06, "loss": 0.2632, "step": 31960 }, { "epoch": 48.08, "grad_norm": 5.614086627960205, "learning_rate": 5.192481203007519e-06, "loss": 0.205, "step": 31970 }, { "epoch": 48.09, "grad_norm": 5.698775768280029, "learning_rate": 5.190977443609023e-06, "loss": 0.2031, "step": 31980 }, { "epoch": 48.11, "grad_norm": 3.4972689151763916, "learning_rate": 5.1894736842105265e-06, "loss": 0.1935, "step": 31990 }, { "epoch": 48.12, "grad_norm": 6.399228096008301, "learning_rate": 5.187969924812031e-06, "loss": 0.201, "step": 32000 }, { "epoch": 48.14, "grad_norm": 6.805654048919678, "learning_rate": 5.186466165413534e-06, "loss": 0.1714, "step": 32010 }, { "epoch": 48.15, "grad_norm": 4.721649646759033, "learning_rate": 5.184962406015038e-06, "loss": 0.2139, "step": 32020 }, { "epoch": 48.17, "grad_norm": 5.464878082275391, "learning_rate": 5.1834586466165414e-06, "loss": 0.2137, "step": 32030 }, { "epoch": 48.18, "grad_norm": 5.358616352081299, "learning_rate": 5.181954887218046e-06, "loss": 0.2196, "step": 32040 }, { "epoch": 48.2, "grad_norm": 8.136012077331543, "learning_rate": 5.180451127819549e-06, "loss": 0.2861, "step": 32050 }, { "epoch": 48.21, "grad_norm": 10.187384605407715, "learning_rate": 5.178947368421054e-06, "loss": 0.2223, "step": 32060 }, { "epoch": 48.23, "grad_norm": 4.251842498779297, "learning_rate": 5.177443609022556e-06, "loss": 0.1949, "step": 32070 }, { "epoch": 48.24, "grad_norm": 6.019373893737793, "learning_rate": 5.175939849624061e-06, "loss": 0.2512, "step": 32080 }, { "epoch": 48.26, "grad_norm": 5.162561416625977, "learning_rate": 5.174436090225564e-06, "loss": 0.2408, "step": 32090 }, { "epoch": 48.27, "grad_norm": 4.33914852142334, "learning_rate": 5.172932330827069e-06, "loss": 0.2901, "step": 32100 }, { "epoch": 48.29, "grad_norm": 6.14069938659668, "learning_rate": 5.171428571428571e-06, "loss": 0.1849, "step": 32110 }, { "epoch": 48.3, "grad_norm": 4.674006462097168, "learning_rate": 5.169924812030076e-06, "loss": 0.2483, "step": 32120 }, { "epoch": 48.32, "grad_norm": 9.517473220825195, "learning_rate": 5.168421052631579e-06, "loss": 0.2329, "step": 32130 }, { "epoch": 48.33, "grad_norm": 3.6502087116241455, "learning_rate": 5.1669172932330835e-06, "loss": 0.2357, "step": 32140 }, { "epoch": 48.35, "grad_norm": 2.8939104080200195, "learning_rate": 5.165413533834587e-06, "loss": 0.265, "step": 32150 }, { "epoch": 48.36, "grad_norm": 4.541179656982422, "learning_rate": 5.163909774436091e-06, "loss": 0.2007, "step": 32160 }, { "epoch": 48.38, "grad_norm": 3.5995399951934814, "learning_rate": 5.162406015037594e-06, "loss": 0.1895, "step": 32170 }, { "epoch": 48.39, "grad_norm": 4.528938293457031, "learning_rate": 5.1609022556390985e-06, "loss": 0.2574, "step": 32180 }, { "epoch": 48.41, "grad_norm": 3.6735799312591553, "learning_rate": 5.159398496240602e-06, "loss": 0.2564, "step": 32190 }, { "epoch": 48.42, "grad_norm": 7.794544219970703, "learning_rate": 5.157894736842106e-06, "loss": 0.2534, "step": 32200 }, { "epoch": 48.44, "grad_norm": 9.559613227844238, "learning_rate": 5.156390977443609e-06, "loss": 0.2308, "step": 32210 }, { "epoch": 48.45, "grad_norm": 5.519898414611816, "learning_rate": 5.154887218045113e-06, "loss": 0.2299, "step": 32220 }, { "epoch": 48.47, "grad_norm": 5.717340469360352, "learning_rate": 5.153383458646617e-06, "loss": 0.2285, "step": 32230 }, { "epoch": 48.48, "grad_norm": 4.257745265960693, "learning_rate": 5.151879699248121e-06, "loss": 0.2257, "step": 32240 }, { "epoch": 48.5, "grad_norm": 3.741241693496704, "learning_rate": 5.150375939849625e-06, "loss": 0.2006, "step": 32250 }, { "epoch": 48.51, "grad_norm": 4.753995418548584, "learning_rate": 5.1488721804511275e-06, "loss": 0.2896, "step": 32260 }, { "epoch": 48.53, "grad_norm": 4.3221635818481445, "learning_rate": 5.147368421052632e-06, "loss": 0.2416, "step": 32270 }, { "epoch": 48.54, "grad_norm": 6.452643394470215, "learning_rate": 5.145864661654135e-06, "loss": 0.2546, "step": 32280 }, { "epoch": 48.56, "grad_norm": 10.680849075317383, "learning_rate": 5.14436090225564e-06, "loss": 0.2578, "step": 32290 }, { "epoch": 48.57, "grad_norm": 10.565488815307617, "learning_rate": 5.142857142857142e-06, "loss": 0.2105, "step": 32300 }, { "epoch": 48.59, "grad_norm": 6.964345455169678, "learning_rate": 5.141353383458647e-06, "loss": 0.2407, "step": 32310 }, { "epoch": 48.6, "grad_norm": 5.355147838592529, "learning_rate": 5.13984962406015e-06, "loss": 0.122, "step": 32320 }, { "epoch": 48.62, "grad_norm": 4.7337565422058105, "learning_rate": 5.138345864661655e-06, "loss": 0.2202, "step": 32330 }, { "epoch": 48.63, "grad_norm": 3.4737660884857178, "learning_rate": 5.136842105263158e-06, "loss": 0.2382, "step": 32340 }, { "epoch": 48.65, "grad_norm": 3.8572425842285156, "learning_rate": 5.1353383458646625e-06, "loss": 0.2735, "step": 32350 }, { "epoch": 48.66, "grad_norm": 3.9931530952453613, "learning_rate": 5.133834586466165e-06, "loss": 0.2399, "step": 32360 }, { "epoch": 48.68, "grad_norm": 6.738966941833496, "learning_rate": 5.1323308270676696e-06, "loss": 0.2595, "step": 32370 }, { "epoch": 48.69, "grad_norm": 5.455244064331055, "learning_rate": 5.130827067669173e-06, "loss": 0.1811, "step": 32380 }, { "epoch": 48.71, "grad_norm": 8.760961532592773, "learning_rate": 5.1293233082706774e-06, "loss": 0.2323, "step": 32390 }, { "epoch": 48.72, "grad_norm": 1.5300565958023071, "learning_rate": 5.12781954887218e-06, "loss": 0.2225, "step": 32400 }, { "epoch": 48.74, "grad_norm": 4.378961563110352, "learning_rate": 5.1263157894736845e-06, "loss": 0.1771, "step": 32410 }, { "epoch": 48.75, "grad_norm": 5.027568340301514, "learning_rate": 5.124812030075188e-06, "loss": 0.1993, "step": 32420 }, { "epoch": 48.77, "grad_norm": 3.989525556564331, "learning_rate": 5.123308270676692e-06, "loss": 0.2182, "step": 32430 }, { "epoch": 48.78, "grad_norm": 5.23897123336792, "learning_rate": 5.121804511278196e-06, "loss": 0.1689, "step": 32440 }, { "epoch": 48.8, "grad_norm": 6.119510173797607, "learning_rate": 5.1203007518797e-06, "loss": 0.1778, "step": 32450 }, { "epoch": 48.81, "grad_norm": 5.041784286499023, "learning_rate": 5.118796992481203e-06, "loss": 0.2215, "step": 32460 }, { "epoch": 48.83, "grad_norm": 4.170535564422607, "learning_rate": 5.117293233082707e-06, "loss": 0.1809, "step": 32470 }, { "epoch": 48.84, "grad_norm": 5.3539204597473145, "learning_rate": 5.115789473684211e-06, "loss": 0.2089, "step": 32480 }, { "epoch": 48.86, "grad_norm": 3.4489662647247314, "learning_rate": 5.114285714285715e-06, "loss": 0.1827, "step": 32490 }, { "epoch": 48.87, "grad_norm": 3.438225507736206, "learning_rate": 5.112781954887218e-06, "loss": 0.205, "step": 32500 }, { "epoch": 48.89, "grad_norm": 5.150193214416504, "learning_rate": 5.111278195488722e-06, "loss": 0.195, "step": 32510 }, { "epoch": 48.9, "grad_norm": 7.413297176361084, "learning_rate": 5.109774436090226e-06, "loss": 0.2422, "step": 32520 }, { "epoch": 48.92, "grad_norm": 4.774962902069092, "learning_rate": 5.10827067669173e-06, "loss": 0.2325, "step": 32530 }, { "epoch": 48.93, "grad_norm": 6.498291969299316, "learning_rate": 5.106766917293234e-06, "loss": 0.257, "step": 32540 }, { "epoch": 48.95, "grad_norm": 5.5614728927612305, "learning_rate": 5.105263157894738e-06, "loss": 0.18, "step": 32550 }, { "epoch": 48.96, "grad_norm": 9.62303352355957, "learning_rate": 5.103759398496241e-06, "loss": 0.2386, "step": 32560 }, { "epoch": 48.98, "grad_norm": 7.92848539352417, "learning_rate": 5.102255639097745e-06, "loss": 0.2292, "step": 32570 }, { "epoch": 48.99, "grad_norm": 4.110299587249756, "learning_rate": 5.1007518796992485e-06, "loss": 0.2156, "step": 32580 }, { "epoch": 49.0, "eval_accuracy": 0.9292, "eval_loss": 0.31269803643226624, "eval_runtime": 84.4748, "eval_samples_per_second": 118.379, "eval_steps_per_second": 0.474, "step": 32585 }, { "epoch": 49.01, "grad_norm": 7.004157543182373, "learning_rate": 5.099248120300753e-06, "loss": 0.3448, "step": 32590 }, { "epoch": 49.02, "grad_norm": 4.223759651184082, "learning_rate": 5.097744360902256e-06, "loss": 0.2868, "step": 32600 }, { "epoch": 49.04, "grad_norm": 3.9704689979553223, "learning_rate": 5.09624060150376e-06, "loss": 0.2549, "step": 32610 }, { "epoch": 49.05, "grad_norm": 3.7155163288116455, "learning_rate": 5.0947368421052635e-06, "loss": 0.2238, "step": 32620 }, { "epoch": 49.07, "grad_norm": 4.680662631988525, "learning_rate": 5.093233082706768e-06, "loss": 0.2062, "step": 32630 }, { "epoch": 49.08, "grad_norm": 4.773167610168457, "learning_rate": 5.091729323308271e-06, "loss": 0.1666, "step": 32640 }, { "epoch": 49.1, "grad_norm": 8.678242683410645, "learning_rate": 5.090225563909776e-06, "loss": 0.2808, "step": 32650 }, { "epoch": 49.11, "grad_norm": 5.068692207336426, "learning_rate": 5.088721804511278e-06, "loss": 0.2623, "step": 32660 }, { "epoch": 49.13, "grad_norm": 2.034926414489746, "learning_rate": 5.087218045112783e-06, "loss": 0.2039, "step": 32670 }, { "epoch": 49.14, "grad_norm": 7.186866283416748, "learning_rate": 5.085714285714286e-06, "loss": 0.2584, "step": 32680 }, { "epoch": 49.16, "grad_norm": 7.276577472686768, "learning_rate": 5.084210526315791e-06, "loss": 0.1756, "step": 32690 }, { "epoch": 49.17, "grad_norm": 3.856623888015747, "learning_rate": 5.082706766917293e-06, "loss": 0.1908, "step": 32700 }, { "epoch": 49.19, "grad_norm": 4.449421405792236, "learning_rate": 5.081203007518798e-06, "loss": 0.233, "step": 32710 }, { "epoch": 49.2, "grad_norm": 7.532309055328369, "learning_rate": 5.079699248120301e-06, "loss": 0.2067, "step": 32720 }, { "epoch": 49.22, "grad_norm": 5.170470237731934, "learning_rate": 5.0781954887218056e-06, "loss": 0.2273, "step": 32730 }, { "epoch": 49.23, "grad_norm": 3.9134700298309326, "learning_rate": 5.076691729323309e-06, "loss": 0.2791, "step": 32740 }, { "epoch": 49.25, "grad_norm": 6.696217060089111, "learning_rate": 5.075187969924813e-06, "loss": 0.224, "step": 32750 }, { "epoch": 49.26, "grad_norm": 9.15458869934082, "learning_rate": 5.073684210526316e-06, "loss": 0.2591, "step": 32760 }, { "epoch": 49.28, "grad_norm": 3.095736026763916, "learning_rate": 5.07218045112782e-06, "loss": 0.1681, "step": 32770 }, { "epoch": 49.29, "grad_norm": 5.608436107635498, "learning_rate": 5.070676691729324e-06, "loss": 0.2269, "step": 32780 }, { "epoch": 49.31, "grad_norm": 5.287787437438965, "learning_rate": 5.069172932330827e-06, "loss": 0.2443, "step": 32790 }, { "epoch": 49.32, "grad_norm": 8.475576400756836, "learning_rate": 5.067669172932331e-06, "loss": 0.3255, "step": 32800 }, { "epoch": 49.34, "grad_norm": 3.967184066772461, "learning_rate": 5.0661654135338346e-06, "loss": 0.1712, "step": 32810 }, { "epoch": 49.35, "grad_norm": 7.337131023406982, "learning_rate": 5.064661654135339e-06, "loss": 0.3166, "step": 32820 }, { "epoch": 49.37, "grad_norm": 6.623044967651367, "learning_rate": 5.0631578947368424e-06, "loss": 0.3035, "step": 32830 }, { "epoch": 49.38, "grad_norm": 5.275474548339844, "learning_rate": 5.061654135338346e-06, "loss": 0.2025, "step": 32840 }, { "epoch": 49.4, "grad_norm": 6.363266468048096, "learning_rate": 5.0601503759398495e-06, "loss": 0.3033, "step": 32850 }, { "epoch": 49.41, "grad_norm": 3.8887858390808105, "learning_rate": 5.058646616541354e-06, "loss": 0.2282, "step": 32860 }, { "epoch": 49.43, "grad_norm": 7.158453941345215, "learning_rate": 5.057142857142857e-06, "loss": 0.2427, "step": 32870 }, { "epoch": 49.44, "grad_norm": 5.235705375671387, "learning_rate": 5.055639097744362e-06, "loss": 0.3072, "step": 32880 }, { "epoch": 49.46, "grad_norm": 3.328618049621582, "learning_rate": 5.054135338345864e-06, "loss": 0.2078, "step": 32890 }, { "epoch": 49.47, "grad_norm": 7.1164398193359375, "learning_rate": 5.052631578947369e-06, "loss": 0.2707, "step": 32900 }, { "epoch": 49.49, "grad_norm": 4.149364948272705, "learning_rate": 5.051127819548872e-06, "loss": 0.1759, "step": 32910 }, { "epoch": 49.5, "grad_norm": 5.859124183654785, "learning_rate": 5.049624060150377e-06, "loss": 0.2588, "step": 32920 }, { "epoch": 49.52, "grad_norm": 7.663355350494385, "learning_rate": 5.04812030075188e-06, "loss": 0.285, "step": 32930 }, { "epoch": 49.53, "grad_norm": 5.694937229156494, "learning_rate": 5.046616541353384e-06, "loss": 0.2546, "step": 32940 }, { "epoch": 49.55, "grad_norm": 7.81641960144043, "learning_rate": 5.045112781954887e-06, "loss": 0.1886, "step": 32950 }, { "epoch": 49.56, "grad_norm": 4.236837387084961, "learning_rate": 5.043609022556392e-06, "loss": 0.2051, "step": 32960 }, { "epoch": 49.58, "grad_norm": 4.034359931945801, "learning_rate": 5.042105263157895e-06, "loss": 0.1713, "step": 32970 }, { "epoch": 49.59, "grad_norm": 10.613497734069824, "learning_rate": 5.0406015037593995e-06, "loss": 0.2747, "step": 32980 }, { "epoch": 49.61, "grad_norm": 6.468116760253906, "learning_rate": 5.039097744360902e-06, "loss": 0.2624, "step": 32990 }, { "epoch": 49.62, "grad_norm": 6.871288776397705, "learning_rate": 5.0375939849624065e-06, "loss": 0.2418, "step": 33000 }, { "epoch": 49.64, "grad_norm": 5.56874942779541, "learning_rate": 5.03609022556391e-06, "loss": 0.2581, "step": 33010 }, { "epoch": 49.65, "grad_norm": 4.636134147644043, "learning_rate": 5.034586466165414e-06, "loss": 0.2183, "step": 33020 }, { "epoch": 49.67, "grad_norm": 3.762443780899048, "learning_rate": 5.033082706766917e-06, "loss": 0.2067, "step": 33030 }, { "epoch": 49.68, "grad_norm": 2.3653151988983154, "learning_rate": 5.0315789473684214e-06, "loss": 0.1524, "step": 33040 }, { "epoch": 49.7, "grad_norm": 1.23353111743927, "learning_rate": 5.030075187969925e-06, "loss": 0.2496, "step": 33050 }, { "epoch": 49.71, "grad_norm": 6.81777811050415, "learning_rate": 5.028571428571429e-06, "loss": 0.2662, "step": 33060 }, { "epoch": 49.73, "grad_norm": 3.0888495445251465, "learning_rate": 5.027067669172933e-06, "loss": 0.2782, "step": 33070 }, { "epoch": 49.74, "grad_norm": 5.303954124450684, "learning_rate": 5.025563909774437e-06, "loss": 0.205, "step": 33080 }, { "epoch": 49.76, "grad_norm": 3.523970603942871, "learning_rate": 5.02406015037594e-06, "loss": 0.1989, "step": 33090 }, { "epoch": 49.77, "grad_norm": 5.378790378570557, "learning_rate": 5.022556390977444e-06, "loss": 0.2331, "step": 33100 }, { "epoch": 49.79, "grad_norm": 5.677578926086426, "learning_rate": 5.021052631578948e-06, "loss": 0.2528, "step": 33110 }, { "epoch": 49.8, "grad_norm": 9.107217788696289, "learning_rate": 5.019548872180452e-06, "loss": 0.1952, "step": 33120 }, { "epoch": 49.82, "grad_norm": 4.993006706237793, "learning_rate": 5.018045112781955e-06, "loss": 0.1862, "step": 33130 }, { "epoch": 49.83, "grad_norm": 5.515392780303955, "learning_rate": 5.016541353383459e-06, "loss": 0.2022, "step": 33140 }, { "epoch": 49.85, "grad_norm": 8.87382698059082, "learning_rate": 5.015037593984963e-06, "loss": 0.2332, "step": 33150 }, { "epoch": 49.86, "grad_norm": 6.892481327056885, "learning_rate": 5.013533834586467e-06, "loss": 0.1816, "step": 33160 }, { "epoch": 49.88, "grad_norm": 2.5804250240325928, "learning_rate": 5.0120300751879706e-06, "loss": 0.223, "step": 33170 }, { "epoch": 49.89, "grad_norm": 20.23832130432129, "learning_rate": 5.010526315789475e-06, "loss": 0.1953, "step": 33180 }, { "epoch": 49.91, "grad_norm": 3.5330405235290527, "learning_rate": 5.009022556390978e-06, "loss": 0.2249, "step": 33190 }, { "epoch": 49.92, "grad_norm": 7.592809677124023, "learning_rate": 5.007518796992482e-06, "loss": 0.3143, "step": 33200 }, { "epoch": 49.94, "grad_norm": 5.459137916564941, "learning_rate": 5.0060150375939855e-06, "loss": 0.2856, "step": 33210 }, { "epoch": 49.95, "grad_norm": 5.517751216888428, "learning_rate": 5.00451127819549e-06, "loss": 0.1942, "step": 33220 }, { "epoch": 49.97, "grad_norm": 5.425515651702881, "learning_rate": 5.0030075187969925e-06, "loss": 0.207, "step": 33230 }, { "epoch": 49.98, "grad_norm": 4.391520023345947, "learning_rate": 5.001503759398497e-06, "loss": 0.2186, "step": 33240 }, { "epoch": 50.0, "grad_norm": 8.936457633972168, "learning_rate": 5e-06, "loss": 0.2562, "step": 33250 }, { "epoch": 50.0, "eval_accuracy": 0.93, "eval_loss": 0.31154853105545044, "eval_runtime": 84.9326, "eval_samples_per_second": 117.74, "eval_steps_per_second": 0.471, "step": 33250 }, { "epoch": 50.02, "grad_norm": 4.7076497077941895, "learning_rate": 4.998496240601504e-06, "loss": 0.2196, "step": 33260 }, { "epoch": 50.03, "grad_norm": 3.4939560890197754, "learning_rate": 4.996992481203008e-06, "loss": 0.2139, "step": 33270 }, { "epoch": 50.05, "grad_norm": 3.0447769165039062, "learning_rate": 4.995488721804512e-06, "loss": 0.1865, "step": 33280 }, { "epoch": 50.06, "grad_norm": 2.098543405532837, "learning_rate": 4.993984962406015e-06, "loss": 0.2419, "step": 33290 }, { "epoch": 50.08, "grad_norm": 5.257837772369385, "learning_rate": 4.992481203007519e-06, "loss": 0.2051, "step": 33300 }, { "epoch": 50.09, "grad_norm": 4.075168609619141, "learning_rate": 4.990977443609023e-06, "loss": 0.2148, "step": 33310 }, { "epoch": 50.11, "grad_norm": 3.889608144760132, "learning_rate": 4.989473684210527e-06, "loss": 0.2621, "step": 33320 }, { "epoch": 50.12, "grad_norm": 5.086583614349365, "learning_rate": 4.98796992481203e-06, "loss": 0.2163, "step": 33330 }, { "epoch": 50.14, "grad_norm": 4.367302894592285, "learning_rate": 4.986466165413535e-06, "loss": 0.2182, "step": 33340 }, { "epoch": 50.15, "grad_norm": 4.624885559082031, "learning_rate": 4.984962406015038e-06, "loss": 0.2048, "step": 33350 }, { "epoch": 50.17, "grad_norm": 5.809417247772217, "learning_rate": 4.983458646616542e-06, "loss": 0.2646, "step": 33360 }, { "epoch": 50.18, "grad_norm": 2.8429479598999023, "learning_rate": 4.981954887218046e-06, "loss": 0.2445, "step": 33370 }, { "epoch": 50.2, "grad_norm": 7.78208065032959, "learning_rate": 4.9804511278195495e-06, "loss": 0.186, "step": 33380 }, { "epoch": 50.21, "grad_norm": 5.874340534210205, "learning_rate": 4.978947368421053e-06, "loss": 0.2056, "step": 33390 }, { "epoch": 50.23, "grad_norm": 3.485530138015747, "learning_rate": 4.977443609022557e-06, "loss": 0.2269, "step": 33400 }, { "epoch": 50.24, "grad_norm": 6.335367679595947, "learning_rate": 4.975939849624061e-06, "loss": 0.1929, "step": 33410 }, { "epoch": 50.26, "grad_norm": 7.2522382736206055, "learning_rate": 4.9744360902255645e-06, "loss": 0.2561, "step": 33420 }, { "epoch": 50.27, "grad_norm": 8.870294570922852, "learning_rate": 4.972932330827068e-06, "loss": 0.2507, "step": 33430 }, { "epoch": 50.29, "grad_norm": 2.679324150085449, "learning_rate": 4.971428571428572e-06, "loss": 0.1811, "step": 33440 }, { "epoch": 50.3, "grad_norm": 4.810848236083984, "learning_rate": 4.969924812030076e-06, "loss": 0.2152, "step": 33450 }, { "epoch": 50.32, "grad_norm": 7.02167272567749, "learning_rate": 4.968421052631579e-06, "loss": 0.1854, "step": 33460 }, { "epoch": 50.33, "grad_norm": 3.6129260063171387, "learning_rate": 4.966917293233084e-06, "loss": 0.234, "step": 33470 }, { "epoch": 50.35, "grad_norm": 3.452362060546875, "learning_rate": 4.965413533834587e-06, "loss": 0.2666, "step": 33480 }, { "epoch": 50.36, "grad_norm": 6.474914073944092, "learning_rate": 4.963909774436091e-06, "loss": 0.2418, "step": 33490 }, { "epoch": 50.38, "grad_norm": 3.1944398880004883, "learning_rate": 4.962406015037594e-06, "loss": 0.2529, "step": 33500 }, { "epoch": 50.39, "grad_norm": 4.8096699714660645, "learning_rate": 4.960902255639098e-06, "loss": 0.2176, "step": 33510 }, { "epoch": 50.41, "grad_norm": 4.8568925857543945, "learning_rate": 4.959398496240601e-06, "loss": 0.1876, "step": 33520 }, { "epoch": 50.42, "grad_norm": 5.153539657592773, "learning_rate": 4.957894736842106e-06, "loss": 0.2234, "step": 33530 }, { "epoch": 50.44, "grad_norm": 12.457379341125488, "learning_rate": 4.956390977443609e-06, "loss": 0.2267, "step": 33540 }, { "epoch": 50.45, "grad_norm": 3.840301990509033, "learning_rate": 4.954887218045113e-06, "loss": 0.231, "step": 33550 }, { "epoch": 50.47, "grad_norm": 4.2671966552734375, "learning_rate": 4.953383458646617e-06, "loss": 0.2221, "step": 33560 }, { "epoch": 50.48, "grad_norm": 1.5949358940124512, "learning_rate": 4.951879699248121e-06, "loss": 0.2137, "step": 33570 }, { "epoch": 50.5, "grad_norm": 4.011662006378174, "learning_rate": 4.950375939849624e-06, "loss": 0.2406, "step": 33580 }, { "epoch": 50.51, "grad_norm": 5.408310413360596, "learning_rate": 4.948872180451128e-06, "loss": 0.2115, "step": 33590 }, { "epoch": 50.53, "grad_norm": 13.068920135498047, "learning_rate": 4.947368421052632e-06, "loss": 0.2343, "step": 33600 }, { "epoch": 50.54, "grad_norm": 5.289875507354736, "learning_rate": 4.9458646616541356e-06, "loss": 0.212, "step": 33610 }, { "epoch": 50.56, "grad_norm": 9.420212745666504, "learning_rate": 4.944360902255639e-06, "loss": 0.2435, "step": 33620 }, { "epoch": 50.57, "grad_norm": 4.894474506378174, "learning_rate": 4.9428571428571435e-06, "loss": 0.1806, "step": 33630 }, { "epoch": 50.59, "grad_norm": 5.293659687042236, "learning_rate": 4.941353383458647e-06, "loss": 0.2205, "step": 33640 }, { "epoch": 50.6, "grad_norm": 3.308318853378296, "learning_rate": 4.9398496240601505e-06, "loss": 0.2025, "step": 33650 }, { "epoch": 50.62, "grad_norm": 6.303762435913086, "learning_rate": 4.938345864661655e-06, "loss": 0.2399, "step": 33660 }, { "epoch": 50.63, "grad_norm": 5.376105308532715, "learning_rate": 4.936842105263158e-06, "loss": 0.1792, "step": 33670 }, { "epoch": 50.65, "grad_norm": 7.851215839385986, "learning_rate": 4.935338345864662e-06, "loss": 0.2101, "step": 33680 }, { "epoch": 50.66, "grad_norm": 4.721893310546875, "learning_rate": 4.933834586466165e-06, "loss": 0.2835, "step": 33690 }, { "epoch": 50.68, "grad_norm": 5.323988914489746, "learning_rate": 4.93233082706767e-06, "loss": 0.2572, "step": 33700 }, { "epoch": 50.69, "grad_norm": 6.7880964279174805, "learning_rate": 4.930827067669173e-06, "loss": 0.2099, "step": 33710 }, { "epoch": 50.71, "grad_norm": 4.452812194824219, "learning_rate": 4.929323308270677e-06, "loss": 0.2393, "step": 33720 }, { "epoch": 50.72, "grad_norm": 3.799269676208496, "learning_rate": 4.927819548872181e-06, "loss": 0.1869, "step": 33730 }, { "epoch": 50.74, "grad_norm": 4.9699273109436035, "learning_rate": 4.926315789473685e-06, "loss": 0.242, "step": 33740 }, { "epoch": 50.75, "grad_norm": 3.6977415084838867, "learning_rate": 4.924812030075188e-06, "loss": 0.1882, "step": 33750 }, { "epoch": 50.77, "grad_norm": 4.581582546234131, "learning_rate": 4.923308270676692e-06, "loss": 0.2257, "step": 33760 }, { "epoch": 50.78, "grad_norm": 6.505821228027344, "learning_rate": 4.921804511278196e-06, "loss": 0.2212, "step": 33770 }, { "epoch": 50.8, "grad_norm": 6.289919376373291, "learning_rate": 4.9203007518797e-06, "loss": 0.2553, "step": 33780 }, { "epoch": 50.81, "grad_norm": 3.341113567352295, "learning_rate": 4.918796992481203e-06, "loss": 0.2567, "step": 33790 }, { "epoch": 50.83, "grad_norm": 4.804263114929199, "learning_rate": 4.9172932330827075e-06, "loss": 0.2432, "step": 33800 }, { "epoch": 50.84, "grad_norm": 6.457090854644775, "learning_rate": 4.915789473684211e-06, "loss": 0.223, "step": 33810 }, { "epoch": 50.86, "grad_norm": 7.253601551055908, "learning_rate": 4.9142857142857145e-06, "loss": 0.2747, "step": 33820 }, { "epoch": 50.87, "grad_norm": 4.229115962982178, "learning_rate": 4.912781954887219e-06, "loss": 0.2616, "step": 33830 }, { "epoch": 50.89, "grad_norm": 3.4108991622924805, "learning_rate": 4.9112781954887224e-06, "loss": 0.2176, "step": 33840 }, { "epoch": 50.9, "grad_norm": 6.427495002746582, "learning_rate": 4.909774436090226e-06, "loss": 0.1867, "step": 33850 }, { "epoch": 50.92, "grad_norm": 4.823879718780518, "learning_rate": 4.9082706766917295e-06, "loss": 0.2758, "step": 33860 }, { "epoch": 50.93, "grad_norm": 5.894002914428711, "learning_rate": 4.906766917293234e-06, "loss": 0.275, "step": 33870 }, { "epoch": 50.95, "grad_norm": 7.533518314361572, "learning_rate": 4.905263157894737e-06, "loss": 0.2269, "step": 33880 }, { "epoch": 50.96, "grad_norm": 6.328519344329834, "learning_rate": 4.903759398496241e-06, "loss": 0.2842, "step": 33890 }, { "epoch": 50.98, "grad_norm": 2.4413256645202637, "learning_rate": 4.902255639097745e-06, "loss": 0.1786, "step": 33900 }, { "epoch": 50.99, "grad_norm": 2.602782964706421, "learning_rate": 4.900751879699249e-06, "loss": 0.1847, "step": 33910 }, { "epoch": 51.0, "eval_accuracy": 0.9311, "eval_loss": 0.3058427572250366, "eval_runtime": 84.9491, "eval_samples_per_second": 117.718, "eval_steps_per_second": 0.471, "step": 33915 }, { "epoch": 51.01, "grad_norm": 12.357138633728027, "learning_rate": 4.899248120300752e-06, "loss": 0.1841, "step": 33920 }, { "epoch": 51.02, "grad_norm": 6.953237056732178, "learning_rate": 4.897744360902257e-06, "loss": 0.2775, "step": 33930 }, { "epoch": 51.04, "grad_norm": 4.370208263397217, "learning_rate": 4.89624060150376e-06, "loss": 0.216, "step": 33940 }, { "epoch": 51.05, "grad_norm": 5.940341472625732, "learning_rate": 4.894736842105264e-06, "loss": 0.2888, "step": 33950 }, { "epoch": 51.07, "grad_norm": 2.9402363300323486, "learning_rate": 4.893233082706767e-06, "loss": 0.2437, "step": 33960 }, { "epoch": 51.08, "grad_norm": 6.118091583251953, "learning_rate": 4.8917293233082716e-06, "loss": 0.2558, "step": 33970 }, { "epoch": 51.1, "grad_norm": 3.255457878112793, "learning_rate": 4.890225563909775e-06, "loss": 0.217, "step": 33980 }, { "epoch": 51.11, "grad_norm": 6.439407825469971, "learning_rate": 4.888721804511279e-06, "loss": 0.2119, "step": 33990 }, { "epoch": 51.13, "grad_norm": 5.628686904907227, "learning_rate": 4.887218045112782e-06, "loss": 0.2647, "step": 34000 }, { "epoch": 51.14, "grad_norm": 3.908249616622925, "learning_rate": 4.885714285714286e-06, "loss": 0.221, "step": 34010 }, { "epoch": 51.16, "grad_norm": 5.72270393371582, "learning_rate": 4.88421052631579e-06, "loss": 0.2116, "step": 34020 }, { "epoch": 51.17, "grad_norm": 6.743198394775391, "learning_rate": 4.8827067669172935e-06, "loss": 0.2716, "step": 34030 }, { "epoch": 51.19, "grad_norm": 6.313354969024658, "learning_rate": 4.881203007518797e-06, "loss": 0.242, "step": 34040 }, { "epoch": 51.2, "grad_norm": 4.181947708129883, "learning_rate": 4.8796992481203006e-06, "loss": 0.2448, "step": 34050 }, { "epoch": 51.22, "grad_norm": 7.440803050994873, "learning_rate": 4.878195488721805e-06, "loss": 0.2734, "step": 34060 }, { "epoch": 51.23, "grad_norm": 7.924306869506836, "learning_rate": 4.8766917293233085e-06, "loss": 0.2438, "step": 34070 }, { "epoch": 51.25, "grad_norm": 4.542608737945557, "learning_rate": 4.875187969924812e-06, "loss": 0.2291, "step": 34080 }, { "epoch": 51.26, "grad_norm": 3.60923171043396, "learning_rate": 4.873684210526316e-06, "loss": 0.201, "step": 34090 }, { "epoch": 51.28, "grad_norm": 7.753152847290039, "learning_rate": 4.87218045112782e-06, "loss": 0.2213, "step": 34100 }, { "epoch": 51.29, "grad_norm": 4.770752429962158, "learning_rate": 4.870676691729323e-06, "loss": 0.1961, "step": 34110 }, { "epoch": 51.31, "grad_norm": 4.9310479164123535, "learning_rate": 4.869172932330828e-06, "loss": 0.1574, "step": 34120 }, { "epoch": 51.32, "grad_norm": 3.8556582927703857, "learning_rate": 4.867669172932331e-06, "loss": 0.2608, "step": 34130 }, { "epoch": 51.34, "grad_norm": 6.298683166503906, "learning_rate": 4.866165413533835e-06, "loss": 0.2558, "step": 34140 }, { "epoch": 51.35, "grad_norm": 3.553882122039795, "learning_rate": 4.864661654135338e-06, "loss": 0.152, "step": 34150 }, { "epoch": 51.37, "grad_norm": 4.3429975509643555, "learning_rate": 4.863157894736843e-06, "loss": 0.2239, "step": 34160 }, { "epoch": 51.38, "grad_norm": 3.7060399055480957, "learning_rate": 4.861654135338346e-06, "loss": 0.268, "step": 34170 }, { "epoch": 51.4, "grad_norm": 4.588679313659668, "learning_rate": 4.86015037593985e-06, "loss": 0.2724, "step": 34180 }, { "epoch": 51.41, "grad_norm": 3.715130567550659, "learning_rate": 4.858646616541354e-06, "loss": 0.1813, "step": 34190 }, { "epoch": 51.43, "grad_norm": 6.059221267700195, "learning_rate": 4.857142857142858e-06, "loss": 0.201, "step": 34200 }, { "epoch": 51.44, "grad_norm": 9.141858100891113, "learning_rate": 4.855639097744361e-06, "loss": 0.2786, "step": 34210 }, { "epoch": 51.46, "grad_norm": 5.355362892150879, "learning_rate": 4.854135338345865e-06, "loss": 0.2175, "step": 34220 }, { "epoch": 51.47, "grad_norm": 4.845487117767334, "learning_rate": 4.852631578947369e-06, "loss": 0.2597, "step": 34230 }, { "epoch": 51.49, "grad_norm": 4.735893249511719, "learning_rate": 4.8511278195488725e-06, "loss": 0.2157, "step": 34240 }, { "epoch": 51.5, "grad_norm": 5.093930244445801, "learning_rate": 4.849624060150376e-06, "loss": 0.1836, "step": 34250 }, { "epoch": 51.52, "grad_norm": 1.877554178237915, "learning_rate": 4.84812030075188e-06, "loss": 0.1809, "step": 34260 }, { "epoch": 51.53, "grad_norm": 5.213804721832275, "learning_rate": 4.846616541353384e-06, "loss": 0.2439, "step": 34270 }, { "epoch": 51.55, "grad_norm": 2.9461185932159424, "learning_rate": 4.8451127819548874e-06, "loss": 0.1654, "step": 34280 }, { "epoch": 51.56, "grad_norm": 3.598033905029297, "learning_rate": 4.843609022556392e-06, "loss": 0.1807, "step": 34290 }, { "epoch": 51.58, "grad_norm": 3.246472120285034, "learning_rate": 4.842105263157895e-06, "loss": 0.2873, "step": 34300 }, { "epoch": 51.59, "grad_norm": 7.0338921546936035, "learning_rate": 4.840601503759399e-06, "loss": 0.257, "step": 34310 }, { "epoch": 51.61, "grad_norm": 7.394874095916748, "learning_rate": 4.839097744360902e-06, "loss": 0.1845, "step": 34320 }, { "epoch": 51.62, "grad_norm": 5.90761661529541, "learning_rate": 4.837593984962407e-06, "loss": 0.2476, "step": 34330 }, { "epoch": 51.64, "grad_norm": 2.5447800159454346, "learning_rate": 4.83609022556391e-06, "loss": 0.2252, "step": 34340 }, { "epoch": 51.65, "grad_norm": 6.951369285583496, "learning_rate": 4.834586466165414e-06, "loss": 0.198, "step": 34350 }, { "epoch": 51.67, "grad_norm": 8.201958656311035, "learning_rate": 4.833082706766918e-06, "loss": 0.2374, "step": 34360 }, { "epoch": 51.68, "grad_norm": 7.837949752807617, "learning_rate": 4.831578947368422e-06, "loss": 0.1674, "step": 34370 }, { "epoch": 51.7, "grad_norm": 5.410804748535156, "learning_rate": 4.830075187969925e-06, "loss": 0.2734, "step": 34380 }, { "epoch": 51.71, "grad_norm": 3.385745048522949, "learning_rate": 4.8285714285714295e-06, "loss": 0.234, "step": 34390 }, { "epoch": 51.73, "grad_norm": 7.365379810333252, "learning_rate": 4.827067669172933e-06, "loss": 0.2139, "step": 34400 }, { "epoch": 51.74, "grad_norm": 6.774968147277832, "learning_rate": 4.8255639097744366e-06, "loss": 0.151, "step": 34410 }, { "epoch": 51.76, "grad_norm": 5.092763900756836, "learning_rate": 4.82406015037594e-06, "loss": 0.2152, "step": 34420 }, { "epoch": 51.77, "grad_norm": 3.2947609424591064, "learning_rate": 4.8225563909774445e-06, "loss": 0.2309, "step": 34430 }, { "epoch": 51.79, "grad_norm": 4.706571102142334, "learning_rate": 4.821052631578948e-06, "loss": 0.2546, "step": 34440 }, { "epoch": 51.8, "grad_norm": 8.115716934204102, "learning_rate": 4.8195488721804515e-06, "loss": 0.1969, "step": 34450 }, { "epoch": 51.82, "grad_norm": 7.135550022125244, "learning_rate": 4.818045112781956e-06, "loss": 0.2594, "step": 34460 }, { "epoch": 51.83, "grad_norm": 4.663880348205566, "learning_rate": 4.816541353383459e-06, "loss": 0.245, "step": 34470 }, { "epoch": 51.85, "grad_norm": 5.021750450134277, "learning_rate": 4.815037593984963e-06, "loss": 0.2571, "step": 34480 }, { "epoch": 51.86, "grad_norm": 3.6975669860839844, "learning_rate": 4.813533834586466e-06, "loss": 0.2923, "step": 34490 }, { "epoch": 51.88, "grad_norm": 5.429826259613037, "learning_rate": 4.81203007518797e-06, "loss": 0.158, "step": 34500 }, { "epoch": 51.89, "grad_norm": 5.443629264831543, "learning_rate": 4.8105263157894735e-06, "loss": 0.1788, "step": 34510 }, { "epoch": 51.91, "grad_norm": 2.6010379791259766, "learning_rate": 4.809022556390978e-06, "loss": 0.2203, "step": 34520 }, { "epoch": 51.92, "grad_norm": 2.72519588470459, "learning_rate": 4.807518796992481e-06, "loss": 0.2021, "step": 34530 }, { "epoch": 51.94, "grad_norm": 7.415553569793701, "learning_rate": 4.806015037593985e-06, "loss": 0.2145, "step": 34540 }, { "epoch": 51.95, "grad_norm": 3.181380033493042, "learning_rate": 4.804511278195489e-06, "loss": 0.1729, "step": 34550 }, { "epoch": 51.97, "grad_norm": 4.509653091430664, "learning_rate": 4.803007518796993e-06, "loss": 0.1733, "step": 34560 }, { "epoch": 51.98, "grad_norm": 2.9621808528900146, "learning_rate": 4.801503759398496e-06, "loss": 0.2322, "step": 34570 }, { "epoch": 52.0, "grad_norm": 16.024185180664062, "learning_rate": 4.800000000000001e-06, "loss": 0.2453, "step": 34580 }, { "epoch": 52.0, "eval_accuracy": 0.9308, "eval_loss": 0.3179681897163391, "eval_runtime": 84.5142, "eval_samples_per_second": 118.323, "eval_steps_per_second": 0.473, "step": 34580 }, { "epoch": 52.02, "grad_norm": 4.019076347351074, "learning_rate": 4.798496240601504e-06, "loss": 0.1856, "step": 34590 }, { "epoch": 52.03, "grad_norm": 4.832670211791992, "learning_rate": 4.796992481203008e-06, "loss": 0.1791, "step": 34600 }, { "epoch": 52.05, "grad_norm": 3.983247756958008, "learning_rate": 4.795488721804511e-06, "loss": 0.2176, "step": 34610 }, { "epoch": 52.06, "grad_norm": 5.2948832511901855, "learning_rate": 4.7939849624060156e-06, "loss": 0.2737, "step": 34620 }, { "epoch": 52.08, "grad_norm": 3.9428210258483887, "learning_rate": 4.792481203007519e-06, "loss": 0.2457, "step": 34630 }, { "epoch": 52.09, "grad_norm": 7.841516971588135, "learning_rate": 4.790977443609023e-06, "loss": 0.2125, "step": 34640 }, { "epoch": 52.11, "grad_norm": 2.363363265991211, "learning_rate": 4.789473684210527e-06, "loss": 0.1784, "step": 34650 }, { "epoch": 52.12, "grad_norm": 3.259340286254883, "learning_rate": 4.7879699248120305e-06, "loss": 0.2177, "step": 34660 }, { "epoch": 52.14, "grad_norm": 2.5872132778167725, "learning_rate": 4.786466165413534e-06, "loss": 0.229, "step": 34670 }, { "epoch": 52.15, "grad_norm": 6.223973274230957, "learning_rate": 4.7849624060150375e-06, "loss": 0.2612, "step": 34680 }, { "epoch": 52.17, "grad_norm": 8.220977783203125, "learning_rate": 4.783458646616542e-06, "loss": 0.2316, "step": 34690 }, { "epoch": 52.18, "grad_norm": 5.418039321899414, "learning_rate": 4.781954887218045e-06, "loss": 0.2063, "step": 34700 }, { "epoch": 52.2, "grad_norm": 2.79838490486145, "learning_rate": 4.780451127819549e-06, "loss": 0.1965, "step": 34710 }, { "epoch": 52.21, "grad_norm": 5.194334030151367, "learning_rate": 4.778947368421053e-06, "loss": 0.1817, "step": 34720 }, { "epoch": 52.23, "grad_norm": 4.306404113769531, "learning_rate": 4.777443609022557e-06, "loss": 0.1943, "step": 34730 }, { "epoch": 52.24, "grad_norm": 5.353394508361816, "learning_rate": 4.77593984962406e-06, "loss": 0.2019, "step": 34740 }, { "epoch": 52.26, "grad_norm": 4.193369388580322, "learning_rate": 4.774436090225565e-06, "loss": 0.1593, "step": 34750 }, { "epoch": 52.27, "grad_norm": 5.131227493286133, "learning_rate": 4.772932330827068e-06, "loss": 0.2845, "step": 34760 }, { "epoch": 52.29, "grad_norm": 6.315420627593994, "learning_rate": 4.771428571428572e-06, "loss": 0.2163, "step": 34770 }, { "epoch": 52.3, "grad_norm": 5.833438873291016, "learning_rate": 4.769924812030075e-06, "loss": 0.2064, "step": 34780 }, { "epoch": 52.32, "grad_norm": 2.4818150997161865, "learning_rate": 4.76842105263158e-06, "loss": 0.2169, "step": 34790 }, { "epoch": 52.33, "grad_norm": 5.627359390258789, "learning_rate": 4.766917293233083e-06, "loss": 0.2654, "step": 34800 }, { "epoch": 52.35, "grad_norm": 4.044781684875488, "learning_rate": 4.765413533834587e-06, "loss": 0.233, "step": 34810 }, { "epoch": 52.36, "grad_norm": 9.770264625549316, "learning_rate": 4.763909774436091e-06, "loss": 0.2018, "step": 34820 }, { "epoch": 52.38, "grad_norm": 5.436629295349121, "learning_rate": 4.7624060150375945e-06, "loss": 0.1943, "step": 34830 }, { "epoch": 52.39, "grad_norm": 8.624287605285645, "learning_rate": 4.760902255639098e-06, "loss": 0.2665, "step": 34840 }, { "epoch": 52.41, "grad_norm": 5.233580589294434, "learning_rate": 4.759398496240602e-06, "loss": 0.1882, "step": 34850 }, { "epoch": 52.42, "grad_norm": 5.307055950164795, "learning_rate": 4.757894736842106e-06, "loss": 0.2765, "step": 34860 }, { "epoch": 52.44, "grad_norm": 4.9283928871154785, "learning_rate": 4.7563909774436095e-06, "loss": 0.2012, "step": 34870 }, { "epoch": 52.45, "grad_norm": 7.545740604400635, "learning_rate": 4.754887218045113e-06, "loss": 0.1997, "step": 34880 }, { "epoch": 52.47, "grad_norm": 4.995537757873535, "learning_rate": 4.753383458646617e-06, "loss": 0.2371, "step": 34890 }, { "epoch": 52.48, "grad_norm": 4.122995376586914, "learning_rate": 4.751879699248121e-06, "loss": 0.2107, "step": 34900 }, { "epoch": 52.5, "grad_norm": 6.755688667297363, "learning_rate": 4.750375939849624e-06, "loss": 0.1947, "step": 34910 }, { "epoch": 52.51, "grad_norm": 3.243288278579712, "learning_rate": 4.748872180451129e-06, "loss": 0.2091, "step": 34920 }, { "epoch": 52.53, "grad_norm": 4.718216896057129, "learning_rate": 4.747368421052632e-06, "loss": 0.2318, "step": 34930 }, { "epoch": 52.54, "grad_norm": 6.7740936279296875, "learning_rate": 4.745864661654136e-06, "loss": 0.2014, "step": 34940 }, { "epoch": 52.56, "grad_norm": 3.995626449584961, "learning_rate": 4.744360902255639e-06, "loss": 0.2062, "step": 34950 }, { "epoch": 52.57, "grad_norm": 6.141635417938232, "learning_rate": 4.742857142857144e-06, "loss": 0.183, "step": 34960 }, { "epoch": 52.59, "grad_norm": 7.161436557769775, "learning_rate": 4.741353383458647e-06, "loss": 0.2518, "step": 34970 }, { "epoch": 52.6, "grad_norm": 2.2341339588165283, "learning_rate": 4.739849624060151e-06, "loss": 0.2516, "step": 34980 }, { "epoch": 52.62, "grad_norm": 5.752125263214111, "learning_rate": 4.738345864661654e-06, "loss": 0.1559, "step": 34990 }, { "epoch": 52.63, "grad_norm": 4.806941032409668, "learning_rate": 4.736842105263158e-06, "loss": 0.177, "step": 35000 }, { "epoch": 52.65, "grad_norm": 2.0574851036071777, "learning_rate": 4.735338345864662e-06, "loss": 0.1954, "step": 35010 }, { "epoch": 52.66, "grad_norm": 4.443716526031494, "learning_rate": 4.733834586466166e-06, "loss": 0.2045, "step": 35020 }, { "epoch": 52.68, "grad_norm": 3.878716468811035, "learning_rate": 4.732330827067669e-06, "loss": 0.2095, "step": 35030 }, { "epoch": 52.69, "grad_norm": 10.685707092285156, "learning_rate": 4.7308270676691735e-06, "loss": 0.2586, "step": 35040 }, { "epoch": 52.71, "grad_norm": 3.904240369796753, "learning_rate": 4.729323308270677e-06, "loss": 0.3103, "step": 35050 }, { "epoch": 52.72, "grad_norm": 4.6063337326049805, "learning_rate": 4.7278195488721806e-06, "loss": 0.1517, "step": 35060 }, { "epoch": 52.74, "grad_norm": 4.317811489105225, "learning_rate": 4.726315789473684e-06, "loss": 0.216, "step": 35070 }, { "epoch": 52.75, "grad_norm": 7.456090927124023, "learning_rate": 4.7248120300751884e-06, "loss": 0.2438, "step": 35080 }, { "epoch": 52.77, "grad_norm": 7.284855842590332, "learning_rate": 4.723308270676692e-06, "loss": 0.2263, "step": 35090 }, { "epoch": 52.78, "grad_norm": 4.406165599822998, "learning_rate": 4.7218045112781955e-06, "loss": 0.181, "step": 35100 }, { "epoch": 52.8, "grad_norm": 2.9364101886749268, "learning_rate": 4.7203007518797e-06, "loss": 0.2351, "step": 35110 }, { "epoch": 52.81, "grad_norm": 5.638926029205322, "learning_rate": 4.718796992481203e-06, "loss": 0.2289, "step": 35120 }, { "epoch": 52.83, "grad_norm": 4.295120716094971, "learning_rate": 4.717293233082707e-06, "loss": 0.211, "step": 35130 }, { "epoch": 52.84, "grad_norm": 4.422874450683594, "learning_rate": 4.71578947368421e-06, "loss": 0.1947, "step": 35140 }, { "epoch": 52.86, "grad_norm": 0.5836885571479797, "learning_rate": 4.714285714285715e-06, "loss": 0.2299, "step": 35150 }, { "epoch": 52.87, "grad_norm": 4.698862552642822, "learning_rate": 4.712781954887218e-06, "loss": 0.1864, "step": 35160 }, { "epoch": 52.89, "grad_norm": 4.577108860015869, "learning_rate": 4.711278195488722e-06, "loss": 0.2257, "step": 35170 }, { "epoch": 52.9, "grad_norm": 7.971583843231201, "learning_rate": 4.709774436090226e-06, "loss": 0.2093, "step": 35180 }, { "epoch": 52.92, "grad_norm": 6.064690589904785, "learning_rate": 4.70827067669173e-06, "loss": 0.2444, "step": 35190 }, { "epoch": 52.93, "grad_norm": 7.225642681121826, "learning_rate": 4.706766917293233e-06, "loss": 0.2325, "step": 35200 }, { "epoch": 52.95, "grad_norm": 4.481410503387451, "learning_rate": 4.705263157894738e-06, "loss": 0.2357, "step": 35210 }, { "epoch": 52.96, "grad_norm": 4.95440673828125, "learning_rate": 4.703759398496241e-06, "loss": 0.2526, "step": 35220 }, { "epoch": 52.98, "grad_norm": 3.8696534633636475, "learning_rate": 4.702255639097745e-06, "loss": 0.2009, "step": 35230 }, { "epoch": 52.99, "grad_norm": 6.271843910217285, "learning_rate": 4.700751879699248e-06, "loss": 0.2763, "step": 35240 }, { "epoch": 53.0, "eval_accuracy": 0.932, "eval_loss": 0.30757245421409607, "eval_runtime": 84.7331, "eval_samples_per_second": 118.018, "eval_steps_per_second": 0.472, "step": 35245 }, { "epoch": 53.01, "grad_norm": 8.736787796020508, "learning_rate": 4.6992481203007525e-06, "loss": 0.2478, "step": 35250 }, { "epoch": 53.02, "grad_norm": 6.983564853668213, "learning_rate": 4.697744360902256e-06, "loss": 0.2756, "step": 35260 }, { "epoch": 53.04, "grad_norm": 3.659640073776245, "learning_rate": 4.6962406015037595e-06, "loss": 0.2843, "step": 35270 }, { "epoch": 53.05, "grad_norm": 6.107889175415039, "learning_rate": 4.694736842105264e-06, "loss": 0.2212, "step": 35280 }, { "epoch": 53.07, "grad_norm": 3.515962600708008, "learning_rate": 4.693233082706767e-06, "loss": 0.199, "step": 35290 }, { "epoch": 53.08, "grad_norm": 4.900839328765869, "learning_rate": 4.691729323308271e-06, "loss": 0.2398, "step": 35300 }, { "epoch": 53.1, "grad_norm": 4.189992904663086, "learning_rate": 4.690225563909775e-06, "loss": 0.2255, "step": 35310 }, { "epoch": 53.11, "grad_norm": 4.84755277633667, "learning_rate": 4.688721804511279e-06, "loss": 0.1594, "step": 35320 }, { "epoch": 53.13, "grad_norm": 4.748797416687012, "learning_rate": 4.687218045112782e-06, "loss": 0.1627, "step": 35330 }, { "epoch": 53.14, "grad_norm": 9.220650672912598, "learning_rate": 4.685714285714286e-06, "loss": 0.2756, "step": 35340 }, { "epoch": 53.16, "grad_norm": 4.247942924499512, "learning_rate": 4.68421052631579e-06, "loss": 0.1987, "step": 35350 }, { "epoch": 53.17, "grad_norm": 4.994369983673096, "learning_rate": 4.682706766917294e-06, "loss": 0.1973, "step": 35360 }, { "epoch": 53.19, "grad_norm": 5.42191743850708, "learning_rate": 4.681203007518797e-06, "loss": 0.2074, "step": 35370 }, { "epoch": 53.2, "grad_norm": 3.5923995971679688, "learning_rate": 4.679699248120302e-06, "loss": 0.2612, "step": 35380 }, { "epoch": 53.22, "grad_norm": 5.668188095092773, "learning_rate": 4.678195488721805e-06, "loss": 0.2088, "step": 35390 }, { "epoch": 53.23, "grad_norm": 4.510677337646484, "learning_rate": 4.676691729323309e-06, "loss": 0.2987, "step": 35400 }, { "epoch": 53.25, "grad_norm": 6.8448028564453125, "learning_rate": 4.675187969924812e-06, "loss": 0.236, "step": 35410 }, { "epoch": 53.26, "grad_norm": 5.6076130867004395, "learning_rate": 4.6736842105263166e-06, "loss": 0.183, "step": 35420 }, { "epoch": 53.28, "grad_norm": 5.029841423034668, "learning_rate": 4.67218045112782e-06, "loss": 0.1961, "step": 35430 }, { "epoch": 53.29, "grad_norm": 4.591315746307373, "learning_rate": 4.670676691729324e-06, "loss": 0.2227, "step": 35440 }, { "epoch": 53.31, "grad_norm": 8.496804237365723, "learning_rate": 4.669172932330828e-06, "loss": 0.2237, "step": 35450 }, { "epoch": 53.32, "grad_norm": 4.822425842285156, "learning_rate": 4.6676691729323315e-06, "loss": 0.2348, "step": 35460 }, { "epoch": 53.34, "grad_norm": 3.9777705669403076, "learning_rate": 4.666165413533835e-06, "loss": 0.2115, "step": 35470 }, { "epoch": 53.35, "grad_norm": 16.29306983947754, "learning_rate": 4.664661654135339e-06, "loss": 0.1931, "step": 35480 }, { "epoch": 53.37, "grad_norm": 2.485229730606079, "learning_rate": 4.663157894736842e-06, "loss": 0.1755, "step": 35490 }, { "epoch": 53.38, "grad_norm": 5.529495716094971, "learning_rate": 4.661654135338346e-06, "loss": 0.2145, "step": 35500 }, { "epoch": 53.4, "grad_norm": 5.058990478515625, "learning_rate": 4.66015037593985e-06, "loss": 0.2215, "step": 35510 }, { "epoch": 53.41, "grad_norm": 3.2668142318725586, "learning_rate": 4.6586466165413534e-06, "loss": 0.2098, "step": 35520 }, { "epoch": 53.43, "grad_norm": 8.407674789428711, "learning_rate": 4.657142857142857e-06, "loss": 0.1849, "step": 35530 }, { "epoch": 53.44, "grad_norm": 5.4570488929748535, "learning_rate": 4.655639097744361e-06, "loss": 0.2515, "step": 35540 }, { "epoch": 53.46, "grad_norm": 4.499083042144775, "learning_rate": 4.654135338345865e-06, "loss": 0.2259, "step": 35550 }, { "epoch": 53.47, "grad_norm": 9.1441068649292, "learning_rate": 4.652631578947368e-06, "loss": 0.2769, "step": 35560 }, { "epoch": 53.49, "grad_norm": 5.146740436553955, "learning_rate": 4.651127819548873e-06, "loss": 0.2152, "step": 35570 }, { "epoch": 53.5, "grad_norm": 7.009411811828613, "learning_rate": 4.649624060150376e-06, "loss": 0.2264, "step": 35580 }, { "epoch": 53.52, "grad_norm": 6.005060195922852, "learning_rate": 4.64812030075188e-06, "loss": 0.1889, "step": 35590 }, { "epoch": 53.53, "grad_norm": 8.248251914978027, "learning_rate": 4.646616541353383e-06, "loss": 0.2329, "step": 35600 }, { "epoch": 53.55, "grad_norm": 6.950926303863525, "learning_rate": 4.645112781954888e-06, "loss": 0.2437, "step": 35610 }, { "epoch": 53.56, "grad_norm": 6.0409836769104, "learning_rate": 4.643609022556391e-06, "loss": 0.1981, "step": 35620 }, { "epoch": 53.58, "grad_norm": 6.35344123840332, "learning_rate": 4.642105263157895e-06, "loss": 0.2629, "step": 35630 }, { "epoch": 53.59, "grad_norm": 4.371424674987793, "learning_rate": 4.640601503759399e-06, "loss": 0.2816, "step": 35640 }, { "epoch": 53.61, "grad_norm": 6.9506659507751465, "learning_rate": 4.639097744360903e-06, "loss": 0.2337, "step": 35650 }, { "epoch": 53.62, "grad_norm": 4.818626403808594, "learning_rate": 4.637593984962406e-06, "loss": 0.1822, "step": 35660 }, { "epoch": 53.64, "grad_norm": 5.629130840301514, "learning_rate": 4.6360902255639105e-06, "loss": 0.1786, "step": 35670 }, { "epoch": 53.65, "grad_norm": 5.757498264312744, "learning_rate": 4.634586466165414e-06, "loss": 0.1709, "step": 35680 }, { "epoch": 53.67, "grad_norm": 6.2977800369262695, "learning_rate": 4.6330827067669175e-06, "loss": 0.2847, "step": 35690 }, { "epoch": 53.68, "grad_norm": 4.801835536956787, "learning_rate": 4.631578947368421e-06, "loss": 0.2441, "step": 35700 }, { "epoch": 53.7, "grad_norm": 4.978909015655518, "learning_rate": 4.630075187969925e-06, "loss": 0.2422, "step": 35710 }, { "epoch": 53.71, "grad_norm": 2.3563828468322754, "learning_rate": 4.628571428571429e-06, "loss": 0.2588, "step": 35720 }, { "epoch": 53.73, "grad_norm": 5.150228977203369, "learning_rate": 4.6270676691729324e-06, "loss": 0.2531, "step": 35730 }, { "epoch": 53.74, "grad_norm": 6.409964084625244, "learning_rate": 4.625563909774437e-06, "loss": 0.1561, "step": 35740 }, { "epoch": 53.76, "grad_norm": 4.846547603607178, "learning_rate": 4.62406015037594e-06, "loss": 0.2398, "step": 35750 }, { "epoch": 53.77, "grad_norm": 5.468544006347656, "learning_rate": 4.622556390977444e-06, "loss": 0.2449, "step": 35760 }, { "epoch": 53.79, "grad_norm": 5.312195301055908, "learning_rate": 4.621052631578948e-06, "loss": 0.2073, "step": 35770 }, { "epoch": 53.8, "grad_norm": 7.502721309661865, "learning_rate": 4.619548872180452e-06, "loss": 0.246, "step": 35780 }, { "epoch": 53.82, "grad_norm": 3.2691874504089355, "learning_rate": 4.618045112781955e-06, "loss": 0.2128, "step": 35790 }, { "epoch": 53.83, "grad_norm": 8.228704452514648, "learning_rate": 4.616541353383459e-06, "loss": 0.2276, "step": 35800 }, { "epoch": 53.85, "grad_norm": 5.924352169036865, "learning_rate": 4.615037593984963e-06, "loss": 0.219, "step": 35810 }, { "epoch": 53.86, "grad_norm": 3.2145702838897705, "learning_rate": 4.613533834586467e-06, "loss": 0.235, "step": 35820 }, { "epoch": 53.88, "grad_norm": 8.739072799682617, "learning_rate": 4.61203007518797e-06, "loss": 0.1665, "step": 35830 }, { "epoch": 53.89, "grad_norm": 3.9934237003326416, "learning_rate": 4.6105263157894745e-06, "loss": 0.1886, "step": 35840 }, { "epoch": 53.91, "grad_norm": 6.60872745513916, "learning_rate": 4.609022556390978e-06, "loss": 0.2398, "step": 35850 }, { "epoch": 53.92, "grad_norm": 7.813724994659424, "learning_rate": 4.6075187969924816e-06, "loss": 0.2371, "step": 35860 }, { "epoch": 53.94, "grad_norm": 5.628241539001465, "learning_rate": 4.606015037593985e-06, "loss": 0.238, "step": 35870 }, { "epoch": 53.95, "grad_norm": 4.604690074920654, "learning_rate": 4.6045112781954894e-06, "loss": 0.2523, "step": 35880 }, { "epoch": 53.97, "grad_norm": 3.7630534172058105, "learning_rate": 4.603007518796993e-06, "loss": 0.2467, "step": 35890 }, { "epoch": 53.98, "grad_norm": 5.215484142303467, "learning_rate": 4.6015037593984965e-06, "loss": 0.2155, "step": 35900 }, { "epoch": 54.0, "grad_norm": 1.31098210811615, "learning_rate": 4.600000000000001e-06, "loss": 0.1876, "step": 35910 }, { "epoch": 54.0, "eval_accuracy": 0.9318, "eval_loss": 0.3097255229949951, "eval_runtime": 84.7816, "eval_samples_per_second": 117.95, "eval_steps_per_second": 0.472, "step": 35910 }, { "epoch": 54.02, "grad_norm": 1.5609022378921509, "learning_rate": 4.598496240601504e-06, "loss": 0.1557, "step": 35920 }, { "epoch": 54.03, "grad_norm": 5.817564487457275, "learning_rate": 4.596992481203008e-06, "loss": 0.2565, "step": 35930 }, { "epoch": 54.05, "grad_norm": 4.572742938995361, "learning_rate": 4.595488721804512e-06, "loss": 0.2119, "step": 35940 }, { "epoch": 54.06, "grad_norm": 6.757458686828613, "learning_rate": 4.593984962406016e-06, "loss": 0.2227, "step": 35950 }, { "epoch": 54.08, "grad_norm": 5.000956058502197, "learning_rate": 4.592481203007519e-06, "loss": 0.212, "step": 35960 }, { "epoch": 54.09, "grad_norm": 9.407301902770996, "learning_rate": 4.590977443609023e-06, "loss": 0.2416, "step": 35970 }, { "epoch": 54.11, "grad_norm": 5.10260534286499, "learning_rate": 4.589473684210526e-06, "loss": 0.2637, "step": 35980 }, { "epoch": 54.12, "grad_norm": 6.334413528442383, "learning_rate": 4.58796992481203e-06, "loss": 0.2182, "step": 35990 }, { "epoch": 54.14, "grad_norm": 3.6286683082580566, "learning_rate": 4.586466165413534e-06, "loss": 0.1799, "step": 36000 }, { "epoch": 54.15, "grad_norm": 4.822414398193359, "learning_rate": 4.584962406015038e-06, "loss": 0.278, "step": 36010 }, { "epoch": 54.17, "grad_norm": 9.361292839050293, "learning_rate": 4.583458646616541e-06, "loss": 0.2302, "step": 36020 }, { "epoch": 54.18, "grad_norm": 5.843459129333496, "learning_rate": 4.581954887218046e-06, "loss": 0.2186, "step": 36030 }, { "epoch": 54.2, "grad_norm": 3.1918785572052, "learning_rate": 4.580451127819549e-06, "loss": 0.2655, "step": 36040 }, { "epoch": 54.21, "grad_norm": 5.561599254608154, "learning_rate": 4.578947368421053e-06, "loss": 0.2058, "step": 36050 }, { "epoch": 54.23, "grad_norm": 2.6801440715789795, "learning_rate": 4.577443609022556e-06, "loss": 0.2265, "step": 36060 }, { "epoch": 54.24, "grad_norm": 5.820893287658691, "learning_rate": 4.5759398496240605e-06, "loss": 0.2562, "step": 36070 }, { "epoch": 54.26, "grad_norm": 5.376248836517334, "learning_rate": 4.574436090225564e-06, "loss": 0.233, "step": 36080 }, { "epoch": 54.27, "grad_norm": 3.018950939178467, "learning_rate": 4.572932330827068e-06, "loss": 0.2185, "step": 36090 }, { "epoch": 54.29, "grad_norm": 7.458950996398926, "learning_rate": 4.571428571428572e-06, "loss": 0.2014, "step": 36100 }, { "epoch": 54.3, "grad_norm": 4.388574600219727, "learning_rate": 4.5699248120300755e-06, "loss": 0.2267, "step": 36110 }, { "epoch": 54.32, "grad_norm": 4.121464252471924, "learning_rate": 4.568421052631579e-06, "loss": 0.3017, "step": 36120 }, { "epoch": 54.33, "grad_norm": 3.9616527557373047, "learning_rate": 4.566917293233083e-06, "loss": 0.2357, "step": 36130 }, { "epoch": 54.35, "grad_norm": 7.739973545074463, "learning_rate": 4.565413533834587e-06, "loss": 0.2267, "step": 36140 }, { "epoch": 54.36, "grad_norm": 5.042054176330566, "learning_rate": 4.56390977443609e-06, "loss": 0.2686, "step": 36150 }, { "epoch": 54.38, "grad_norm": 6.037046909332275, "learning_rate": 4.562406015037594e-06, "loss": 0.1939, "step": 36160 }, { "epoch": 54.39, "grad_norm": 6.330253601074219, "learning_rate": 4.560902255639098e-06, "loss": 0.2152, "step": 36170 }, { "epoch": 54.41, "grad_norm": 5.244846820831299, "learning_rate": 4.559398496240602e-06, "loss": 0.1767, "step": 36180 }, { "epoch": 54.42, "grad_norm": 4.092870712280273, "learning_rate": 4.557894736842105e-06, "loss": 0.2417, "step": 36190 }, { "epoch": 54.44, "grad_norm": 4.611246585845947, "learning_rate": 4.55639097744361e-06, "loss": 0.2388, "step": 36200 }, { "epoch": 54.45, "grad_norm": 5.9267377853393555, "learning_rate": 4.554887218045113e-06, "loss": 0.2705, "step": 36210 }, { "epoch": 54.47, "grad_norm": 3.914341926574707, "learning_rate": 4.553383458646617e-06, "loss": 0.2293, "step": 36220 }, { "epoch": 54.48, "grad_norm": 6.353694438934326, "learning_rate": 4.551879699248121e-06, "loss": 0.2363, "step": 36230 }, { "epoch": 54.5, "grad_norm": 5.587188720703125, "learning_rate": 4.550375939849625e-06, "loss": 0.2194, "step": 36240 }, { "epoch": 54.51, "grad_norm": 7.160801887512207, "learning_rate": 4.548872180451128e-06, "loss": 0.2358, "step": 36250 }, { "epoch": 54.53, "grad_norm": 4.975449085235596, "learning_rate": 4.547368421052632e-06, "loss": 0.2003, "step": 36260 }, { "epoch": 54.54, "grad_norm": 5.509252071380615, "learning_rate": 4.545864661654136e-06, "loss": 0.2053, "step": 36270 }, { "epoch": 54.56, "grad_norm": 6.666650772094727, "learning_rate": 4.5443609022556395e-06, "loss": 0.1576, "step": 36280 }, { "epoch": 54.57, "grad_norm": 7.263132095336914, "learning_rate": 4.542857142857143e-06, "loss": 0.1459, "step": 36290 }, { "epoch": 54.59, "grad_norm": 4.0889410972595215, "learning_rate": 4.541353383458647e-06, "loss": 0.1864, "step": 36300 }, { "epoch": 54.6, "grad_norm": 5.318135738372803, "learning_rate": 4.539849624060151e-06, "loss": 0.258, "step": 36310 }, { "epoch": 54.62, "grad_norm": 5.723468780517578, "learning_rate": 4.5383458646616544e-06, "loss": 0.2035, "step": 36320 }, { "epoch": 54.63, "grad_norm": 5.231049060821533, "learning_rate": 4.536842105263158e-06, "loss": 0.1719, "step": 36330 }, { "epoch": 54.65, "grad_norm": 5.921431541442871, "learning_rate": 4.535338345864662e-06, "loss": 0.3196, "step": 36340 }, { "epoch": 54.66, "grad_norm": 5.602309703826904, "learning_rate": 4.533834586466166e-06, "loss": 0.2069, "step": 36350 }, { "epoch": 54.68, "grad_norm": 8.991955757141113, "learning_rate": 4.532330827067669e-06, "loss": 0.2646, "step": 36360 }, { "epoch": 54.69, "grad_norm": 3.8575797080993652, "learning_rate": 4.530827067669174e-06, "loss": 0.1887, "step": 36370 }, { "epoch": 54.71, "grad_norm": 3.4165918827056885, "learning_rate": 4.529323308270677e-06, "loss": 0.1746, "step": 36380 }, { "epoch": 54.72, "grad_norm": 5.644247531890869, "learning_rate": 4.527819548872181e-06, "loss": 0.2347, "step": 36390 }, { "epoch": 54.74, "grad_norm": 6.519909381866455, "learning_rate": 4.526315789473685e-06, "loss": 0.232, "step": 36400 }, { "epoch": 54.75, "grad_norm": 3.5762217044830322, "learning_rate": 4.524812030075189e-06, "loss": 0.2537, "step": 36410 }, { "epoch": 54.77, "grad_norm": 4.4489898681640625, "learning_rate": 4.523308270676692e-06, "loss": 0.2041, "step": 36420 }, { "epoch": 54.78, "grad_norm": 5.16048526763916, "learning_rate": 4.521804511278196e-06, "loss": 0.1966, "step": 36430 }, { "epoch": 54.8, "grad_norm": 4.274946689605713, "learning_rate": 4.5203007518797e-06, "loss": 0.2945, "step": 36440 }, { "epoch": 54.81, "grad_norm": 4.338449954986572, "learning_rate": 4.518796992481204e-06, "loss": 0.1972, "step": 36450 }, { "epoch": 54.83, "grad_norm": 5.914998531341553, "learning_rate": 4.517293233082707e-06, "loss": 0.1903, "step": 36460 }, { "epoch": 54.84, "grad_norm": 23.506519317626953, "learning_rate": 4.5157894736842115e-06, "loss": 0.2072, "step": 36470 }, { "epoch": 54.86, "grad_norm": 5.0130743980407715, "learning_rate": 4.514285714285714e-06, "loss": 0.2302, "step": 36480 }, { "epoch": 54.87, "grad_norm": 3.10636568069458, "learning_rate": 4.5127819548872185e-06, "loss": 0.1711, "step": 36490 }, { "epoch": 54.89, "grad_norm": 6.681315898895264, "learning_rate": 4.511278195488722e-06, "loss": 0.2502, "step": 36500 }, { "epoch": 54.9, "grad_norm": 5.002006530761719, "learning_rate": 4.5097744360902255e-06, "loss": 0.1971, "step": 36510 }, { "epoch": 54.92, "grad_norm": 1.6818307638168335, "learning_rate": 4.508270676691729e-06, "loss": 0.1604, "step": 36520 }, { "epoch": 54.93, "grad_norm": 5.976077556610107, "learning_rate": 4.5067669172932334e-06, "loss": 0.2416, "step": 36530 }, { "epoch": 54.95, "grad_norm": 4.581597805023193, "learning_rate": 4.505263157894737e-06, "loss": 0.2433, "step": 36540 }, { "epoch": 54.96, "grad_norm": 6.989867210388184, "learning_rate": 4.5037593984962405e-06, "loss": 0.2687, "step": 36550 }, { "epoch": 54.98, "grad_norm": 8.435478210449219, "learning_rate": 4.502255639097745e-06, "loss": 0.2427, "step": 36560 }, { "epoch": 54.99, "grad_norm": 5.939743518829346, "learning_rate": 4.500751879699248e-06, "loss": 0.1774, "step": 36570 }, { "epoch": 55.0, "eval_accuracy": 0.9321, "eval_loss": 0.3104659616947174, "eval_runtime": 84.6339, "eval_samples_per_second": 118.156, "eval_steps_per_second": 0.473, "step": 36575 }, { "epoch": 55.01, "grad_norm": 4.729255676269531, "learning_rate": 4.499248120300752e-06, "loss": 0.4104, "step": 36580 }, { "epoch": 55.02, "grad_norm": 4.213466644287109, "learning_rate": 4.497744360902256e-06, "loss": 0.24, "step": 36590 }, { "epoch": 55.04, "grad_norm": 2.702681064605713, "learning_rate": 4.49624060150376e-06, "loss": 0.1929, "step": 36600 }, { "epoch": 55.05, "grad_norm": 5.444637775421143, "learning_rate": 4.494736842105263e-06, "loss": 0.2168, "step": 36610 }, { "epoch": 55.07, "grad_norm": 8.909059524536133, "learning_rate": 4.493233082706767e-06, "loss": 0.2456, "step": 36620 }, { "epoch": 55.08, "grad_norm": 3.999340772628784, "learning_rate": 4.491729323308271e-06, "loss": 0.1867, "step": 36630 }, { "epoch": 55.1, "grad_norm": 4.647693157196045, "learning_rate": 4.490225563909775e-06, "loss": 0.2448, "step": 36640 }, { "epoch": 55.11, "grad_norm": 4.264559745788574, "learning_rate": 4.488721804511278e-06, "loss": 0.1637, "step": 36650 }, { "epoch": 55.13, "grad_norm": 2.194603204727173, "learning_rate": 4.4872180451127826e-06, "loss": 0.2602, "step": 36660 }, { "epoch": 55.14, "grad_norm": 3.7559118270874023, "learning_rate": 4.485714285714286e-06, "loss": 0.2043, "step": 36670 }, { "epoch": 55.16, "grad_norm": 3.0917916297912598, "learning_rate": 4.48421052631579e-06, "loss": 0.1839, "step": 36680 }, { "epoch": 55.17, "grad_norm": 4.948066234588623, "learning_rate": 4.482706766917294e-06, "loss": 0.2476, "step": 36690 }, { "epoch": 55.19, "grad_norm": 5.164941310882568, "learning_rate": 4.4812030075187975e-06, "loss": 0.1977, "step": 36700 }, { "epoch": 55.2, "grad_norm": 4.678724765777588, "learning_rate": 4.479699248120301e-06, "loss": 0.2576, "step": 36710 }, { "epoch": 55.22, "grad_norm": 8.130949020385742, "learning_rate": 4.4781954887218045e-06, "loss": 0.232, "step": 36720 }, { "epoch": 55.23, "grad_norm": 4.381587505340576, "learning_rate": 4.476691729323309e-06, "loss": 0.2102, "step": 36730 }, { "epoch": 55.25, "grad_norm": 3.894073724746704, "learning_rate": 4.475187969924812e-06, "loss": 0.2271, "step": 36740 }, { "epoch": 55.26, "grad_norm": 7.002061367034912, "learning_rate": 4.473684210526316e-06, "loss": 0.1887, "step": 36750 }, { "epoch": 55.28, "grad_norm": 5.8444132804870605, "learning_rate": 4.47218045112782e-06, "loss": 0.2588, "step": 36760 }, { "epoch": 55.29, "grad_norm": 3.5753517150878906, "learning_rate": 4.470676691729324e-06, "loss": 0.2312, "step": 36770 }, { "epoch": 55.31, "grad_norm": 7.695021629333496, "learning_rate": 4.469172932330827e-06, "loss": 0.2307, "step": 36780 }, { "epoch": 55.32, "grad_norm": 4.55169153213501, "learning_rate": 4.467669172932331e-06, "loss": 0.2242, "step": 36790 }, { "epoch": 55.34, "grad_norm": 2.349003553390503, "learning_rate": 4.466165413533835e-06, "loss": 0.2035, "step": 36800 }, { "epoch": 55.35, "grad_norm": 5.827582359313965, "learning_rate": 4.464661654135339e-06, "loss": 0.2916, "step": 36810 }, { "epoch": 55.37, "grad_norm": 3.1232826709747314, "learning_rate": 4.463157894736842e-06, "loss": 0.231, "step": 36820 }, { "epoch": 55.38, "grad_norm": 6.336644172668457, "learning_rate": 4.461654135338347e-06, "loss": 0.2344, "step": 36830 }, { "epoch": 55.4, "grad_norm": 4.629255771636963, "learning_rate": 4.46015037593985e-06, "loss": 0.2577, "step": 36840 }, { "epoch": 55.41, "grad_norm": 5.478154182434082, "learning_rate": 4.458646616541354e-06, "loss": 0.2311, "step": 36850 }, { "epoch": 55.43, "grad_norm": 7.846320629119873, "learning_rate": 4.457142857142858e-06, "loss": 0.247, "step": 36860 }, { "epoch": 55.44, "grad_norm": 3.461475133895874, "learning_rate": 4.4556390977443615e-06, "loss": 0.1488, "step": 36870 }, { "epoch": 55.46, "grad_norm": 3.4155285358428955, "learning_rate": 4.454135338345865e-06, "loss": 0.1368, "step": 36880 }, { "epoch": 55.47, "grad_norm": 2.114795684814453, "learning_rate": 4.452631578947369e-06, "loss": 0.1632, "step": 36890 }, { "epoch": 55.49, "grad_norm": 1.200542688369751, "learning_rate": 4.451127819548873e-06, "loss": 0.1591, "step": 36900 }, { "epoch": 55.5, "grad_norm": 4.999506950378418, "learning_rate": 4.4496240601503765e-06, "loss": 0.1666, "step": 36910 }, { "epoch": 55.52, "grad_norm": 3.8099682331085205, "learning_rate": 4.44812030075188e-06, "loss": 0.1601, "step": 36920 }, { "epoch": 55.53, "grad_norm": 7.439540386199951, "learning_rate": 4.446616541353384e-06, "loss": 0.2179, "step": 36930 }, { "epoch": 55.55, "grad_norm": 4.367668151855469, "learning_rate": 4.445112781954888e-06, "loss": 0.1956, "step": 36940 }, { "epoch": 55.56, "grad_norm": 5.904849529266357, "learning_rate": 4.443609022556391e-06, "loss": 0.2132, "step": 36950 }, { "epoch": 55.58, "grad_norm": 9.836445808410645, "learning_rate": 4.442105263157896e-06, "loss": 0.2255, "step": 36960 }, { "epoch": 55.59, "grad_norm": 2.742265224456787, "learning_rate": 4.440601503759399e-06, "loss": 0.2005, "step": 36970 }, { "epoch": 55.61, "grad_norm": 6.371535778045654, "learning_rate": 4.439097744360902e-06, "loss": 0.1944, "step": 36980 }, { "epoch": 55.62, "grad_norm": 3.961942195892334, "learning_rate": 4.437593984962406e-06, "loss": 0.1885, "step": 36990 }, { "epoch": 55.64, "grad_norm": 7.4699320793151855, "learning_rate": 4.43609022556391e-06, "loss": 0.2383, "step": 37000 }, { "epoch": 55.65, "grad_norm": 2.479861259460449, "learning_rate": 4.434586466165413e-06, "loss": 0.2797, "step": 37010 }, { "epoch": 55.67, "grad_norm": 4.376903057098389, "learning_rate": 4.433082706766918e-06, "loss": 0.1898, "step": 37020 }, { "epoch": 55.68, "grad_norm": 5.335219860076904, "learning_rate": 4.431578947368421e-06, "loss": 0.2382, "step": 37030 }, { "epoch": 55.7, "grad_norm": 4.678366184234619, "learning_rate": 4.430075187969925e-06, "loss": 0.2029, "step": 37040 }, { "epoch": 55.71, "grad_norm": 5.352789878845215, "learning_rate": 4.428571428571429e-06, "loss": 0.2286, "step": 37050 }, { "epoch": 55.73, "grad_norm": 5.189038276672363, "learning_rate": 4.427067669172933e-06, "loss": 0.1941, "step": 37060 }, { "epoch": 55.74, "grad_norm": 10.683388710021973, "learning_rate": 4.425563909774436e-06, "loss": 0.195, "step": 37070 }, { "epoch": 55.76, "grad_norm": 6.178273677825928, "learning_rate": 4.42406015037594e-06, "loss": 0.2207, "step": 37080 }, { "epoch": 55.77, "grad_norm": 3.819103479385376, "learning_rate": 4.422556390977444e-06, "loss": 0.2128, "step": 37090 }, { "epoch": 55.79, "grad_norm": 6.3040452003479, "learning_rate": 4.4210526315789476e-06, "loss": 0.1738, "step": 37100 }, { "epoch": 55.8, "grad_norm": 5.171235084533691, "learning_rate": 4.419548872180451e-06, "loss": 0.2041, "step": 37110 }, { "epoch": 55.82, "grad_norm": 5.310617446899414, "learning_rate": 4.4180451127819555e-06, "loss": 0.2201, "step": 37120 }, { "epoch": 55.83, "grad_norm": 7.987985610961914, "learning_rate": 4.416541353383459e-06, "loss": 0.1631, "step": 37130 }, { "epoch": 55.85, "grad_norm": 3.634016275405884, "learning_rate": 4.4150375939849625e-06, "loss": 0.248, "step": 37140 }, { "epoch": 55.86, "grad_norm": 7.253101348876953, "learning_rate": 4.413533834586467e-06, "loss": 0.2631, "step": 37150 }, { "epoch": 55.88, "grad_norm": 5.290616989135742, "learning_rate": 4.41203007518797e-06, "loss": 0.1595, "step": 37160 }, { "epoch": 55.89, "grad_norm": 7.694485187530518, "learning_rate": 4.410526315789474e-06, "loss": 0.2274, "step": 37170 }, { "epoch": 55.91, "grad_norm": 4.015926837921143, "learning_rate": 4.409022556390977e-06, "loss": 0.1889, "step": 37180 }, { "epoch": 55.92, "grad_norm": 6.907248020172119, "learning_rate": 4.407518796992482e-06, "loss": 0.2389, "step": 37190 }, { "epoch": 55.94, "grad_norm": 9.580093383789062, "learning_rate": 4.406015037593985e-06, "loss": 0.1979, "step": 37200 }, { "epoch": 55.95, "grad_norm": 5.89567756652832, "learning_rate": 4.404511278195489e-06, "loss": 0.2167, "step": 37210 }, { "epoch": 55.97, "grad_norm": 6.980398178100586, "learning_rate": 4.403007518796993e-06, "loss": 0.2273, "step": 37220 }, { "epoch": 55.98, "grad_norm": 4.1732869148254395, "learning_rate": 4.401503759398497e-06, "loss": 0.217, "step": 37230 }, { "epoch": 56.0, "grad_norm": 0.04293264448642731, "learning_rate": 4.4e-06, "loss": 0.2011, "step": 37240 }, { "epoch": 56.0, "eval_accuracy": 0.9337, "eval_loss": 0.3107641935348511, "eval_runtime": 84.6401, "eval_samples_per_second": 118.147, "eval_steps_per_second": 0.473, "step": 37240 }, { "epoch": 56.02, "grad_norm": 2.9971933364868164, "learning_rate": 4.398496240601504e-06, "loss": 0.1546, "step": 37250 }, { "epoch": 56.03, "grad_norm": 4.23447322845459, "learning_rate": 4.396992481203008e-06, "loss": 0.2525, "step": 37260 }, { "epoch": 56.05, "grad_norm": 2.2622056007385254, "learning_rate": 4.395488721804512e-06, "loss": 0.192, "step": 37270 }, { "epoch": 56.06, "grad_norm": 6.475707054138184, "learning_rate": 4.393984962406015e-06, "loss": 0.2287, "step": 37280 }, { "epoch": 56.08, "grad_norm": 5.559025764465332, "learning_rate": 4.3924812030075195e-06, "loss": 0.2248, "step": 37290 }, { "epoch": 56.09, "grad_norm": 8.06891918182373, "learning_rate": 4.390977443609023e-06, "loss": 0.1641, "step": 37300 }, { "epoch": 56.11, "grad_norm": 3.4583325386047363, "learning_rate": 4.3894736842105266e-06, "loss": 0.231, "step": 37310 }, { "epoch": 56.12, "grad_norm": 11.369089126586914, "learning_rate": 4.387969924812031e-06, "loss": 0.2368, "step": 37320 }, { "epoch": 56.14, "grad_norm": 3.740903854370117, "learning_rate": 4.3864661654135344e-06, "loss": 0.1868, "step": 37330 }, { "epoch": 56.15, "grad_norm": 7.389920234680176, "learning_rate": 4.384962406015038e-06, "loss": 0.2224, "step": 37340 }, { "epoch": 56.17, "grad_norm": 4.443047046661377, "learning_rate": 4.3834586466165415e-06, "loss": 0.2308, "step": 37350 }, { "epoch": 56.18, "grad_norm": 5.819294452667236, "learning_rate": 4.381954887218046e-06, "loss": 0.1937, "step": 37360 }, { "epoch": 56.2, "grad_norm": 4.005828857421875, "learning_rate": 4.380451127819549e-06, "loss": 0.2269, "step": 37370 }, { "epoch": 56.21, "grad_norm": 9.527501106262207, "learning_rate": 4.378947368421053e-06, "loss": 0.2721, "step": 37380 }, { "epoch": 56.23, "grad_norm": 5.382596492767334, "learning_rate": 4.377443609022557e-06, "loss": 0.2143, "step": 37390 }, { "epoch": 56.24, "grad_norm": 8.085970878601074, "learning_rate": 4.375939849624061e-06, "loss": 0.1549, "step": 37400 }, { "epoch": 56.26, "grad_norm": 7.813641548156738, "learning_rate": 4.374436090225564e-06, "loss": 0.1926, "step": 37410 }, { "epoch": 56.27, "grad_norm": 5.974503993988037, "learning_rate": 4.372932330827069e-06, "loss": 0.2217, "step": 37420 }, { "epoch": 56.29, "grad_norm": 3.7494096755981445, "learning_rate": 4.371428571428572e-06, "loss": 0.1835, "step": 37430 }, { "epoch": 56.3, "grad_norm": 8.034648895263672, "learning_rate": 4.369924812030076e-06, "loss": 0.2319, "step": 37440 }, { "epoch": 56.32, "grad_norm": 5.222734451293945, "learning_rate": 4.368421052631579e-06, "loss": 0.2468, "step": 37450 }, { "epoch": 56.33, "grad_norm": 2.54164981842041, "learning_rate": 4.3669172932330836e-06, "loss": 0.1801, "step": 37460 }, { "epoch": 56.35, "grad_norm": 8.72768783569336, "learning_rate": 4.365413533834586e-06, "loss": 0.2092, "step": 37470 }, { "epoch": 56.36, "grad_norm": 10.628849983215332, "learning_rate": 4.363909774436091e-06, "loss": 0.2025, "step": 37480 }, { "epoch": 56.38, "grad_norm": 5.263908863067627, "learning_rate": 4.362406015037594e-06, "loss": 0.1637, "step": 37490 }, { "epoch": 56.39, "grad_norm": 3.4017181396484375, "learning_rate": 4.360902255639098e-06, "loss": 0.2772, "step": 37500 }, { "epoch": 56.41, "grad_norm": 6.786311626434326, "learning_rate": 4.359398496240602e-06, "loss": 0.1987, "step": 37510 }, { "epoch": 56.42, "grad_norm": 6.281602382659912, "learning_rate": 4.3578947368421055e-06, "loss": 0.1707, "step": 37520 }, { "epoch": 56.44, "grad_norm": 6.129687786102295, "learning_rate": 4.356390977443609e-06, "loss": 0.2591, "step": 37530 }, { "epoch": 56.45, "grad_norm": 4.573347568511963, "learning_rate": 4.3548872180451126e-06, "loss": 0.2142, "step": 37540 }, { "epoch": 56.47, "grad_norm": 4.783934593200684, "learning_rate": 4.353383458646617e-06, "loss": 0.1747, "step": 37550 }, { "epoch": 56.48, "grad_norm": 3.693443775177002, "learning_rate": 4.3518796992481205e-06, "loss": 0.2222, "step": 37560 }, { "epoch": 56.5, "grad_norm": 5.919257640838623, "learning_rate": 4.350375939849624e-06, "loss": 0.2443, "step": 37570 }, { "epoch": 56.51, "grad_norm": 9.530694007873535, "learning_rate": 4.348872180451128e-06, "loss": 0.1908, "step": 37580 }, { "epoch": 56.53, "grad_norm": 5.846238613128662, "learning_rate": 4.347368421052632e-06, "loss": 0.2141, "step": 37590 }, { "epoch": 56.54, "grad_norm": 5.0612616539001465, "learning_rate": 4.345864661654135e-06, "loss": 0.1802, "step": 37600 }, { "epoch": 56.56, "grad_norm": 3.5078976154327393, "learning_rate": 4.34436090225564e-06, "loss": 0.1856, "step": 37610 }, { "epoch": 56.57, "grad_norm": 2.6494715213775635, "learning_rate": 4.342857142857143e-06, "loss": 0.2228, "step": 37620 }, { "epoch": 56.59, "grad_norm": 7.06261682510376, "learning_rate": 4.341353383458647e-06, "loss": 0.1996, "step": 37630 }, { "epoch": 56.6, "grad_norm": 4.200351238250732, "learning_rate": 4.33984962406015e-06, "loss": 0.1868, "step": 37640 }, { "epoch": 56.62, "grad_norm": 3.3782460689544678, "learning_rate": 4.338345864661655e-06, "loss": 0.1933, "step": 37650 }, { "epoch": 56.63, "grad_norm": 5.649930953979492, "learning_rate": 4.336842105263158e-06, "loss": 0.1826, "step": 37660 }, { "epoch": 56.65, "grad_norm": 5.642304420471191, "learning_rate": 4.335338345864662e-06, "loss": 0.232, "step": 37670 }, { "epoch": 56.66, "grad_norm": 4.194146156311035, "learning_rate": 4.333834586466166e-06, "loss": 0.232, "step": 37680 }, { "epoch": 56.68, "grad_norm": 6.466404438018799, "learning_rate": 4.33233082706767e-06, "loss": 0.184, "step": 37690 }, { "epoch": 56.69, "grad_norm": 6.861640930175781, "learning_rate": 4.330827067669173e-06, "loss": 0.2036, "step": 37700 }, { "epoch": 56.71, "grad_norm": 5.0166850090026855, "learning_rate": 4.329323308270677e-06, "loss": 0.2537, "step": 37710 }, { "epoch": 56.72, "grad_norm": 6.62353515625, "learning_rate": 4.327819548872181e-06, "loss": 0.2708, "step": 37720 }, { "epoch": 56.74, "grad_norm": 5.707888603210449, "learning_rate": 4.3263157894736845e-06, "loss": 0.162, "step": 37730 }, { "epoch": 56.75, "grad_norm": 6.166172981262207, "learning_rate": 4.324812030075188e-06, "loss": 0.2149, "step": 37740 }, { "epoch": 56.77, "grad_norm": 8.481252670288086, "learning_rate": 4.323308270676692e-06, "loss": 0.2423, "step": 37750 }, { "epoch": 56.78, "grad_norm": 1.8157340288162231, "learning_rate": 4.321804511278196e-06, "loss": 0.2519, "step": 37760 }, { "epoch": 56.8, "grad_norm": 6.047143459320068, "learning_rate": 4.3203007518796994e-06, "loss": 0.3102, "step": 37770 }, { "epoch": 56.81, "grad_norm": 3.9835569858551025, "learning_rate": 4.318796992481204e-06, "loss": 0.2141, "step": 37780 }, { "epoch": 56.83, "grad_norm": 3.2040421962738037, "learning_rate": 4.317293233082707e-06, "loss": 0.1938, "step": 37790 }, { "epoch": 56.84, "grad_norm": 3.5100557804107666, "learning_rate": 4.315789473684211e-06, "loss": 0.1956, "step": 37800 }, { "epoch": 56.86, "grad_norm": 12.860550880432129, "learning_rate": 4.314285714285714e-06, "loss": 0.2704, "step": 37810 }, { "epoch": 56.87, "grad_norm": 4.295982360839844, "learning_rate": 4.312781954887219e-06, "loss": 0.2478, "step": 37820 }, { "epoch": 56.89, "grad_norm": 5.491456031799316, "learning_rate": 4.311278195488722e-06, "loss": 0.2482, "step": 37830 }, { "epoch": 56.9, "grad_norm": 3.705702781677246, "learning_rate": 4.309774436090226e-06, "loss": 0.189, "step": 37840 }, { "epoch": 56.92, "grad_norm": 9.076353073120117, "learning_rate": 4.30827067669173e-06, "loss": 0.2324, "step": 37850 }, { "epoch": 56.93, "grad_norm": 5.757368564605713, "learning_rate": 4.306766917293234e-06, "loss": 0.2676, "step": 37860 }, { "epoch": 56.95, "grad_norm": 5.485102653503418, "learning_rate": 4.305263157894737e-06, "loss": 0.1747, "step": 37870 }, { "epoch": 56.96, "grad_norm": 3.859330892562866, "learning_rate": 4.3037593984962415e-06, "loss": 0.2029, "step": 37880 }, { "epoch": 56.98, "grad_norm": 2.6711108684539795, "learning_rate": 4.302255639097745e-06, "loss": 0.2083, "step": 37890 }, { "epoch": 56.99, "grad_norm": 5.341872692108154, "learning_rate": 4.3007518796992486e-06, "loss": 0.2142, "step": 37900 }, { "epoch": 57.0, "eval_accuracy": 0.9312, "eval_loss": 0.3190965950489044, "eval_runtime": 84.8921, "eval_samples_per_second": 117.797, "eval_steps_per_second": 0.471, "step": 37905 }, { "epoch": 57.01, "grad_norm": 3.6653482913970947, "learning_rate": 4.299248120300752e-06, "loss": 0.2008, "step": 37910 }, { "epoch": 57.02, "grad_norm": 3.9756722450256348, "learning_rate": 4.2977443609022565e-06, "loss": 0.1872, "step": 37920 }, { "epoch": 57.04, "grad_norm": 9.079606056213379, "learning_rate": 4.29624060150376e-06, "loss": 0.1913, "step": 37930 }, { "epoch": 57.05, "grad_norm": 5.466242790222168, "learning_rate": 4.2947368421052635e-06, "loss": 0.2364, "step": 37940 }, { "epoch": 57.07, "grad_norm": 1.4235931634902954, "learning_rate": 4.293233082706768e-06, "loss": 0.2255, "step": 37950 }, { "epoch": 57.08, "grad_norm": 13.605971336364746, "learning_rate": 4.291729323308271e-06, "loss": 0.2454, "step": 37960 }, { "epoch": 57.1, "grad_norm": 5.704050064086914, "learning_rate": 4.290225563909775e-06, "loss": 0.2411, "step": 37970 }, { "epoch": 57.11, "grad_norm": 4.244338035583496, "learning_rate": 4.288721804511278e-06, "loss": 0.2289, "step": 37980 }, { "epoch": 57.13, "grad_norm": 5.691931247711182, "learning_rate": 4.287218045112782e-06, "loss": 0.2031, "step": 37990 }, { "epoch": 57.14, "grad_norm": 8.121678352355957, "learning_rate": 4.2857142857142855e-06, "loss": 0.2434, "step": 38000 }, { "epoch": 57.16, "grad_norm": 2.5804800987243652, "learning_rate": 4.28421052631579e-06, "loss": 0.1663, "step": 38010 }, { "epoch": 57.17, "grad_norm": 6.460157871246338, "learning_rate": 4.282706766917293e-06, "loss": 0.1966, "step": 38020 }, { "epoch": 57.19, "grad_norm": 6.010727405548096, "learning_rate": 4.281203007518797e-06, "loss": 0.2547, "step": 38030 }, { "epoch": 57.2, "grad_norm": 3.152200937271118, "learning_rate": 4.279699248120301e-06, "loss": 0.1743, "step": 38040 }, { "epoch": 57.22, "grad_norm": 6.710914134979248, "learning_rate": 4.278195488721805e-06, "loss": 0.2268, "step": 38050 }, { "epoch": 57.23, "grad_norm": 2.6717779636383057, "learning_rate": 4.276691729323308e-06, "loss": 0.1521, "step": 38060 }, { "epoch": 57.25, "grad_norm": 5.592037200927734, "learning_rate": 4.275187969924813e-06, "loss": 0.2159, "step": 38070 }, { "epoch": 57.26, "grad_norm": 3.0837130546569824, "learning_rate": 4.273684210526316e-06, "loss": 0.2232, "step": 38080 }, { "epoch": 57.28, "grad_norm": 5.526956558227539, "learning_rate": 4.27218045112782e-06, "loss": 0.1866, "step": 38090 }, { "epoch": 57.29, "grad_norm": 5.77144718170166, "learning_rate": 4.270676691729323e-06, "loss": 0.2533, "step": 38100 }, { "epoch": 57.31, "grad_norm": 3.7679331302642822, "learning_rate": 4.2691729323308276e-06, "loss": 0.2217, "step": 38110 }, { "epoch": 57.32, "grad_norm": 19.891773223876953, "learning_rate": 4.267669172932331e-06, "loss": 0.1851, "step": 38120 }, { "epoch": 57.34, "grad_norm": 8.34205436706543, "learning_rate": 4.266165413533835e-06, "loss": 0.2048, "step": 38130 }, { "epoch": 57.35, "grad_norm": 5.428603172302246, "learning_rate": 4.264661654135339e-06, "loss": 0.2122, "step": 38140 }, { "epoch": 57.37, "grad_norm": 3.5390725135803223, "learning_rate": 4.2631578947368425e-06, "loss": 0.2041, "step": 38150 }, { "epoch": 57.38, "grad_norm": 4.302058219909668, "learning_rate": 4.261654135338346e-06, "loss": 0.1994, "step": 38160 }, { "epoch": 57.4, "grad_norm": 16.373668670654297, "learning_rate": 4.2601503759398495e-06, "loss": 0.2482, "step": 38170 }, { "epoch": 57.41, "grad_norm": 3.5577502250671387, "learning_rate": 4.258646616541354e-06, "loss": 0.1791, "step": 38180 }, { "epoch": 57.43, "grad_norm": 3.964195728302002, "learning_rate": 4.257142857142857e-06, "loss": 0.2346, "step": 38190 }, { "epoch": 57.44, "grad_norm": 4.708043575286865, "learning_rate": 4.255639097744361e-06, "loss": 0.1822, "step": 38200 }, { "epoch": 57.46, "grad_norm": 4.237730503082275, "learning_rate": 4.254135338345865e-06, "loss": 0.1897, "step": 38210 }, { "epoch": 57.47, "grad_norm": 4.254096508026123, "learning_rate": 4.252631578947369e-06, "loss": 0.2153, "step": 38220 }, { "epoch": 57.49, "grad_norm": 2.041449546813965, "learning_rate": 4.251127819548872e-06, "loss": 0.1745, "step": 38230 }, { "epoch": 57.5, "grad_norm": 2.9774394035339355, "learning_rate": 4.249624060150377e-06, "loss": 0.2481, "step": 38240 }, { "epoch": 57.52, "grad_norm": 2.6556615829467773, "learning_rate": 4.24812030075188e-06, "loss": 0.1992, "step": 38250 }, { "epoch": 57.53, "grad_norm": 4.096647262573242, "learning_rate": 4.246616541353384e-06, "loss": 0.2644, "step": 38260 }, { "epoch": 57.55, "grad_norm": 4.746277809143066, "learning_rate": 4.245112781954887e-06, "loss": 0.1919, "step": 38270 }, { "epoch": 57.56, "grad_norm": 5.280014514923096, "learning_rate": 4.243609022556392e-06, "loss": 0.2576, "step": 38280 }, { "epoch": 57.58, "grad_norm": 6.924928188323975, "learning_rate": 4.242105263157895e-06, "loss": 0.1854, "step": 38290 }, { "epoch": 57.59, "grad_norm": 3.799520969390869, "learning_rate": 4.240601503759399e-06, "loss": 0.2675, "step": 38300 }, { "epoch": 57.61, "grad_norm": 5.440141677856445, "learning_rate": 4.239097744360903e-06, "loss": 0.1975, "step": 38310 }, { "epoch": 57.62, "grad_norm": 5.276878356933594, "learning_rate": 4.2375939849624065e-06, "loss": 0.2251, "step": 38320 }, { "epoch": 57.64, "grad_norm": 5.117806434631348, "learning_rate": 4.23609022556391e-06, "loss": 0.1818, "step": 38330 }, { "epoch": 57.65, "grad_norm": 5.137388229370117, "learning_rate": 4.2345864661654144e-06, "loss": 0.2429, "step": 38340 }, { "epoch": 57.67, "grad_norm": 1.6786266565322876, "learning_rate": 4.233082706766918e-06, "loss": 0.2236, "step": 38350 }, { "epoch": 57.68, "grad_norm": 4.657053470611572, "learning_rate": 4.2315789473684215e-06, "loss": 0.2304, "step": 38360 }, { "epoch": 57.7, "grad_norm": 6.737263202667236, "learning_rate": 4.230075187969925e-06, "loss": 0.1869, "step": 38370 }, { "epoch": 57.71, "grad_norm": 8.260579109191895, "learning_rate": 4.228571428571429e-06, "loss": 0.2162, "step": 38380 }, { "epoch": 57.73, "grad_norm": 4.183199405670166, "learning_rate": 4.227067669172933e-06, "loss": 0.2047, "step": 38390 }, { "epoch": 57.74, "grad_norm": 6.769223213195801, "learning_rate": 4.225563909774436e-06, "loss": 0.1686, "step": 38400 }, { "epoch": 57.76, "grad_norm": 6.208756923675537, "learning_rate": 4.224060150375941e-06, "loss": 0.2128, "step": 38410 }, { "epoch": 57.77, "grad_norm": 5.576897144317627, "learning_rate": 4.222556390977444e-06, "loss": 0.1921, "step": 38420 }, { "epoch": 57.79, "grad_norm": 6.998033046722412, "learning_rate": 4.221052631578948e-06, "loss": 0.2155, "step": 38430 }, { "epoch": 57.8, "grad_norm": 4.374655246734619, "learning_rate": 4.219548872180451e-06, "loss": 0.2439, "step": 38440 }, { "epoch": 57.82, "grad_norm": 3.7079293727874756, "learning_rate": 4.218045112781956e-06, "loss": 0.228, "step": 38450 }, { "epoch": 57.83, "grad_norm": 7.11881685256958, "learning_rate": 4.216541353383459e-06, "loss": 0.2366, "step": 38460 }, { "epoch": 57.85, "grad_norm": 11.91295051574707, "learning_rate": 4.215037593984963e-06, "loss": 0.2144, "step": 38470 }, { "epoch": 57.86, "grad_norm": 4.202861309051514, "learning_rate": 4.213533834586466e-06, "loss": 0.2088, "step": 38480 }, { "epoch": 57.88, "grad_norm": 2.2428081035614014, "learning_rate": 4.21203007518797e-06, "loss": 0.2238, "step": 38490 }, { "epoch": 57.89, "grad_norm": 6.293793678283691, "learning_rate": 4.210526315789474e-06, "loss": 0.246, "step": 38500 }, { "epoch": 57.91, "grad_norm": 3.7611231803894043, "learning_rate": 4.209022556390978e-06, "loss": 0.2574, "step": 38510 }, { "epoch": 57.92, "grad_norm": 3.548879623413086, "learning_rate": 4.207518796992481e-06, "loss": 0.2064, "step": 38520 }, { "epoch": 57.94, "grad_norm": 3.5769736766815186, "learning_rate": 4.2060150375939855e-06, "loss": 0.1324, "step": 38530 }, { "epoch": 57.95, "grad_norm": 6.342837810516357, "learning_rate": 4.204511278195489e-06, "loss": 0.1634, "step": 38540 }, { "epoch": 57.97, "grad_norm": 3.722198724746704, "learning_rate": 4.2030075187969926e-06, "loss": 0.2034, "step": 38550 }, { "epoch": 57.98, "grad_norm": 6.514225482940674, "learning_rate": 4.201503759398496e-06, "loss": 0.2175, "step": 38560 }, { "epoch": 58.0, "grad_norm": 0.08241364359855652, "learning_rate": 4.2000000000000004e-06, "loss": 0.1931, "step": 38570 }, { "epoch": 58.0, "eval_accuracy": 0.9299, "eval_loss": 0.3219141960144043, "eval_runtime": 84.8061, "eval_samples_per_second": 117.916, "eval_steps_per_second": 0.472, "step": 38570 }, { "epoch": 58.02, "grad_norm": 3.8991994857788086, "learning_rate": 4.198496240601504e-06, "loss": 0.2143, "step": 38580 }, { "epoch": 58.03, "grad_norm": 4.180047035217285, "learning_rate": 4.1969924812030075e-06, "loss": 0.1819, "step": 38590 }, { "epoch": 58.05, "grad_norm": 9.162282943725586, "learning_rate": 4.195488721804512e-06, "loss": 0.2224, "step": 38600 }, { "epoch": 58.06, "grad_norm": 3.6437289714813232, "learning_rate": 4.193984962406015e-06, "loss": 0.1961, "step": 38610 }, { "epoch": 58.08, "grad_norm": 5.206214904785156, "learning_rate": 4.192481203007519e-06, "loss": 0.2267, "step": 38620 }, { "epoch": 58.09, "grad_norm": 7.128195762634277, "learning_rate": 4.190977443609022e-06, "loss": 0.2716, "step": 38630 }, { "epoch": 58.11, "grad_norm": 6.48333740234375, "learning_rate": 4.189473684210527e-06, "loss": 0.2381, "step": 38640 }, { "epoch": 58.12, "grad_norm": 6.768774509429932, "learning_rate": 4.18796992481203e-06, "loss": 0.2204, "step": 38650 }, { "epoch": 58.14, "grad_norm": 7.444092273712158, "learning_rate": 4.186466165413534e-06, "loss": 0.2563, "step": 38660 }, { "epoch": 58.15, "grad_norm": 5.427529335021973, "learning_rate": 4.184962406015038e-06, "loss": 0.1636, "step": 38670 }, { "epoch": 58.17, "grad_norm": 9.827468872070312, "learning_rate": 4.183458646616542e-06, "loss": 0.2042, "step": 38680 }, { "epoch": 58.18, "grad_norm": 7.735494613647461, "learning_rate": 4.181954887218045e-06, "loss": 0.2047, "step": 38690 }, { "epoch": 58.2, "grad_norm": 4.068072319030762, "learning_rate": 4.18045112781955e-06, "loss": 0.2105, "step": 38700 }, { "epoch": 58.21, "grad_norm": 5.502895832061768, "learning_rate": 4.178947368421053e-06, "loss": 0.2376, "step": 38710 }, { "epoch": 58.23, "grad_norm": 4.636054515838623, "learning_rate": 4.177443609022557e-06, "loss": 0.2079, "step": 38720 }, { "epoch": 58.24, "grad_norm": 3.9590959548950195, "learning_rate": 4.17593984962406e-06, "loss": 0.2144, "step": 38730 }, { "epoch": 58.26, "grad_norm": 4.759368896484375, "learning_rate": 4.1744360902255645e-06, "loss": 0.1763, "step": 38740 }, { "epoch": 58.27, "grad_norm": 4.7535400390625, "learning_rate": 4.172932330827068e-06, "loss": 0.1848, "step": 38750 }, { "epoch": 58.29, "grad_norm": 8.80599308013916, "learning_rate": 4.1714285714285715e-06, "loss": 0.2202, "step": 38760 }, { "epoch": 58.3, "grad_norm": 5.612436771392822, "learning_rate": 4.169924812030076e-06, "loss": 0.1748, "step": 38770 }, { "epoch": 58.32, "grad_norm": 5.2577972412109375, "learning_rate": 4.1684210526315794e-06, "loss": 0.2463, "step": 38780 }, { "epoch": 58.33, "grad_norm": 6.162819862365723, "learning_rate": 4.166917293233083e-06, "loss": 0.1727, "step": 38790 }, { "epoch": 58.35, "grad_norm": 5.589300155639648, "learning_rate": 4.165413533834587e-06, "loss": 0.2035, "step": 38800 }, { "epoch": 58.36, "grad_norm": 3.496182680130005, "learning_rate": 4.163909774436091e-06, "loss": 0.1862, "step": 38810 }, { "epoch": 58.38, "grad_norm": 7.816127777099609, "learning_rate": 4.162406015037594e-06, "loss": 0.1733, "step": 38820 }, { "epoch": 58.39, "grad_norm": 8.98507308959961, "learning_rate": 4.160902255639098e-06, "loss": 0.1806, "step": 38830 }, { "epoch": 58.41, "grad_norm": 3.1957809925079346, "learning_rate": 4.159398496240602e-06, "loss": 0.1946, "step": 38840 }, { "epoch": 58.42, "grad_norm": 2.860567331314087, "learning_rate": 4.157894736842106e-06, "loss": 0.2619, "step": 38850 }, { "epoch": 58.44, "grad_norm": 6.364305019378662, "learning_rate": 4.156390977443609e-06, "loss": 0.2311, "step": 38860 }, { "epoch": 58.45, "grad_norm": 8.90013599395752, "learning_rate": 4.154887218045114e-06, "loss": 0.2202, "step": 38870 }, { "epoch": 58.47, "grad_norm": 6.431805610656738, "learning_rate": 4.153383458646617e-06, "loss": 0.1814, "step": 38880 }, { "epoch": 58.48, "grad_norm": 7.369435787200928, "learning_rate": 4.151879699248121e-06, "loss": 0.2696, "step": 38890 }, { "epoch": 58.5, "grad_norm": 4.004574775695801, "learning_rate": 4.150375939849624e-06, "loss": 0.1867, "step": 38900 }, { "epoch": 58.51, "grad_norm": 3.348543643951416, "learning_rate": 4.1488721804511286e-06, "loss": 0.1993, "step": 38910 }, { "epoch": 58.53, "grad_norm": 3.0820324420928955, "learning_rate": 4.147368421052632e-06, "loss": 0.2131, "step": 38920 }, { "epoch": 58.54, "grad_norm": 5.479022026062012, "learning_rate": 4.145864661654136e-06, "loss": 0.255, "step": 38930 }, { "epoch": 58.56, "grad_norm": 5.253530979156494, "learning_rate": 4.14436090225564e-06, "loss": 0.1944, "step": 38940 }, { "epoch": 58.57, "grad_norm": 3.170219659805298, "learning_rate": 4.1428571428571435e-06, "loss": 0.1768, "step": 38950 }, { "epoch": 58.59, "grad_norm": 3.0692005157470703, "learning_rate": 4.141353383458647e-06, "loss": 0.2054, "step": 38960 }, { "epoch": 58.6, "grad_norm": 5.293381214141846, "learning_rate": 4.1398496240601505e-06, "loss": 0.2317, "step": 38970 }, { "epoch": 58.62, "grad_norm": 5.903608322143555, "learning_rate": 4.138345864661654e-06, "loss": 0.2098, "step": 38980 }, { "epoch": 58.63, "grad_norm": 6.2776665687561035, "learning_rate": 4.136842105263158e-06, "loss": 0.2591, "step": 38990 }, { "epoch": 58.65, "grad_norm": 1.9607787132263184, "learning_rate": 4.135338345864662e-06, "loss": 0.1817, "step": 39000 }, { "epoch": 58.66, "grad_norm": 4.821249961853027, "learning_rate": 4.1338345864661654e-06, "loss": 0.1306, "step": 39010 }, { "epoch": 58.68, "grad_norm": 4.712565898895264, "learning_rate": 4.132330827067669e-06, "loss": 0.2086, "step": 39020 }, { "epoch": 58.69, "grad_norm": 4.460330486297607, "learning_rate": 4.130827067669173e-06, "loss": 0.1441, "step": 39030 }, { "epoch": 58.71, "grad_norm": 4.247945308685303, "learning_rate": 4.129323308270677e-06, "loss": 0.2307, "step": 39040 }, { "epoch": 58.72, "grad_norm": 5.8864521980285645, "learning_rate": 4.12781954887218e-06, "loss": 0.2461, "step": 39050 }, { "epoch": 58.74, "grad_norm": 4.723229885101318, "learning_rate": 4.126315789473685e-06, "loss": 0.2235, "step": 39060 }, { "epoch": 58.75, "grad_norm": 3.2411322593688965, "learning_rate": 4.124812030075188e-06, "loss": 0.1957, "step": 39070 }, { "epoch": 58.77, "grad_norm": 7.346851348876953, "learning_rate": 4.123308270676692e-06, "loss": 0.2102, "step": 39080 }, { "epoch": 58.78, "grad_norm": 5.536585330963135, "learning_rate": 4.121804511278195e-06, "loss": 0.2596, "step": 39090 }, { "epoch": 58.8, "grad_norm": 2.5167481899261475, "learning_rate": 4.1203007518797e-06, "loss": 0.1739, "step": 39100 }, { "epoch": 58.81, "grad_norm": 2.2827956676483154, "learning_rate": 4.118796992481203e-06, "loss": 0.2394, "step": 39110 }, { "epoch": 58.83, "grad_norm": 3.863255262374878, "learning_rate": 4.117293233082707e-06, "loss": 0.205, "step": 39120 }, { "epoch": 58.84, "grad_norm": 5.798130035400391, "learning_rate": 4.115789473684211e-06, "loss": 0.2017, "step": 39130 }, { "epoch": 58.86, "grad_norm": 3.510673761367798, "learning_rate": 4.114285714285715e-06, "loss": 0.2036, "step": 39140 }, { "epoch": 58.87, "grad_norm": 4.811961650848389, "learning_rate": 4.112781954887218e-06, "loss": 0.2651, "step": 39150 }, { "epoch": 58.89, "grad_norm": 6.230356216430664, "learning_rate": 4.1112781954887225e-06, "loss": 0.1561, "step": 39160 }, { "epoch": 58.9, "grad_norm": 5.7233757972717285, "learning_rate": 4.109774436090226e-06, "loss": 0.1541, "step": 39170 }, { "epoch": 58.92, "grad_norm": 6.532334327697754, "learning_rate": 4.1082706766917295e-06, "loss": 0.2049, "step": 39180 }, { "epoch": 58.93, "grad_norm": 4.839334011077881, "learning_rate": 4.106766917293233e-06, "loss": 0.1846, "step": 39190 }, { "epoch": 58.95, "grad_norm": 8.426265716552734, "learning_rate": 4.105263157894737e-06, "loss": 0.2213, "step": 39200 }, { "epoch": 58.96, "grad_norm": 3.9503211975097656, "learning_rate": 4.103759398496241e-06, "loss": 0.1891, "step": 39210 }, { "epoch": 58.98, "grad_norm": 5.404723167419434, "learning_rate": 4.1022556390977444e-06, "loss": 0.2554, "step": 39220 }, { "epoch": 58.99, "grad_norm": 2.1365013122558594, "learning_rate": 4.100751879699249e-06, "loss": 0.2328, "step": 39230 }, { "epoch": 59.0, "eval_accuracy": 0.9316, "eval_loss": 0.315520316362381, "eval_runtime": 84.5641, "eval_samples_per_second": 118.253, "eval_steps_per_second": 0.473, "step": 39235 }, { "epoch": 59.01, "grad_norm": 6.348912239074707, "learning_rate": 4.099248120300752e-06, "loss": 0.174, "step": 39240 }, { "epoch": 59.02, "grad_norm": 7.103587627410889, "learning_rate": 4.097744360902256e-06, "loss": 0.2624, "step": 39250 }, { "epoch": 59.04, "grad_norm": 3.9258017539978027, "learning_rate": 4.09624060150376e-06, "loss": 0.2086, "step": 39260 }, { "epoch": 59.05, "grad_norm": 4.404173851013184, "learning_rate": 4.094736842105264e-06, "loss": 0.2327, "step": 39270 }, { "epoch": 59.07, "grad_norm": 4.899421691894531, "learning_rate": 4.093233082706767e-06, "loss": 0.2527, "step": 39280 }, { "epoch": 59.08, "grad_norm": 5.5442094802856445, "learning_rate": 4.091729323308271e-06, "loss": 0.1896, "step": 39290 }, { "epoch": 59.1, "grad_norm": 3.816484212875366, "learning_rate": 4.090225563909775e-06, "loss": 0.2282, "step": 39300 }, { "epoch": 59.11, "grad_norm": 7.135293006896973, "learning_rate": 4.088721804511279e-06, "loss": 0.217, "step": 39310 }, { "epoch": 59.13, "grad_norm": 7.170220375061035, "learning_rate": 4.087218045112782e-06, "loss": 0.2658, "step": 39320 }, { "epoch": 59.14, "grad_norm": 3.617730140686035, "learning_rate": 4.0857142857142865e-06, "loss": 0.1469, "step": 39330 }, { "epoch": 59.16, "grad_norm": 3.9617254734039307, "learning_rate": 4.08421052631579e-06, "loss": 0.2034, "step": 39340 }, { "epoch": 59.17, "grad_norm": 5.964526653289795, "learning_rate": 4.0827067669172936e-06, "loss": 0.2128, "step": 39350 }, { "epoch": 59.19, "grad_norm": 5.290602684020996, "learning_rate": 4.081203007518797e-06, "loss": 0.2284, "step": 39360 }, { "epoch": 59.2, "grad_norm": 9.3060941696167, "learning_rate": 4.0796992481203015e-06, "loss": 0.2395, "step": 39370 }, { "epoch": 59.22, "grad_norm": 7.939350128173828, "learning_rate": 4.078195488721805e-06, "loss": 0.2119, "step": 39380 }, { "epoch": 59.23, "grad_norm": 5.00067138671875, "learning_rate": 4.0766917293233085e-06, "loss": 0.1631, "step": 39390 }, { "epoch": 59.25, "grad_norm": 5.9795918464660645, "learning_rate": 4.075187969924813e-06, "loss": 0.1553, "step": 39400 }, { "epoch": 59.26, "grad_norm": 2.5546774864196777, "learning_rate": 4.073684210526316e-06, "loss": 0.159, "step": 39410 }, { "epoch": 59.28, "grad_norm": 8.247179985046387, "learning_rate": 4.07218045112782e-06, "loss": 0.2191, "step": 39420 }, { "epoch": 59.29, "grad_norm": 5.384758949279785, "learning_rate": 4.070676691729324e-06, "loss": 0.1586, "step": 39430 }, { "epoch": 59.31, "grad_norm": 2.1114683151245117, "learning_rate": 4.069172932330828e-06, "loss": 0.2198, "step": 39440 }, { "epoch": 59.32, "grad_norm": 7.196309566497803, "learning_rate": 4.067669172932331e-06, "loss": 0.2132, "step": 39450 }, { "epoch": 59.34, "grad_norm": 12.441351890563965, "learning_rate": 4.066165413533835e-06, "loss": 0.1764, "step": 39460 }, { "epoch": 59.35, "grad_norm": 4.480091094970703, "learning_rate": 4.064661654135338e-06, "loss": 0.2407, "step": 39470 }, { "epoch": 59.37, "grad_norm": 4.517475605010986, "learning_rate": 4.063157894736842e-06, "loss": 0.2106, "step": 39480 }, { "epoch": 59.38, "grad_norm": 6.213492393493652, "learning_rate": 4.061654135338346e-06, "loss": 0.2005, "step": 39490 }, { "epoch": 59.4, "grad_norm": 3.382683515548706, "learning_rate": 4.06015037593985e-06, "loss": 0.17, "step": 39500 }, { "epoch": 59.41, "grad_norm": 5.262118816375732, "learning_rate": 4.058646616541353e-06, "loss": 0.2167, "step": 39510 }, { "epoch": 59.43, "grad_norm": 8.032793998718262, "learning_rate": 4.057142857142858e-06, "loss": 0.2287, "step": 39520 }, { "epoch": 59.44, "grad_norm": 5.3855767250061035, "learning_rate": 4.055639097744361e-06, "loss": 0.1747, "step": 39530 }, { "epoch": 59.46, "grad_norm": 8.112833976745605, "learning_rate": 4.054135338345865e-06, "loss": 0.1867, "step": 39540 }, { "epoch": 59.47, "grad_norm": 4.0277910232543945, "learning_rate": 4.052631578947368e-06, "loss": 0.2311, "step": 39550 }, { "epoch": 59.49, "grad_norm": 2.4710066318511963, "learning_rate": 4.0511278195488725e-06, "loss": 0.1935, "step": 39560 }, { "epoch": 59.5, "grad_norm": 7.679681301116943, "learning_rate": 4.049624060150376e-06, "loss": 0.2504, "step": 39570 }, { "epoch": 59.52, "grad_norm": 8.625081062316895, "learning_rate": 4.04812030075188e-06, "loss": 0.1737, "step": 39580 }, { "epoch": 59.53, "grad_norm": 4.943390846252441, "learning_rate": 4.046616541353384e-06, "loss": 0.2392, "step": 39590 }, { "epoch": 59.55, "grad_norm": 3.67423677444458, "learning_rate": 4.0451127819548875e-06, "loss": 0.1376, "step": 39600 }, { "epoch": 59.56, "grad_norm": 5.872762203216553, "learning_rate": 4.043609022556391e-06, "loss": 0.22, "step": 39610 }, { "epoch": 59.58, "grad_norm": 6.167865753173828, "learning_rate": 4.042105263157895e-06, "loss": 0.2386, "step": 39620 }, { "epoch": 59.59, "grad_norm": 3.7467803955078125, "learning_rate": 4.040601503759399e-06, "loss": 0.2012, "step": 39630 }, { "epoch": 59.61, "grad_norm": 4.501016139984131, "learning_rate": 4.039097744360902e-06, "loss": 0.2285, "step": 39640 }, { "epoch": 59.62, "grad_norm": 5.832193851470947, "learning_rate": 4.037593984962406e-06, "loss": 0.1822, "step": 39650 }, { "epoch": 59.64, "grad_norm": 3.999112844467163, "learning_rate": 4.03609022556391e-06, "loss": 0.2316, "step": 39660 }, { "epoch": 59.65, "grad_norm": 9.898731231689453, "learning_rate": 4.034586466165414e-06, "loss": 0.2284, "step": 39670 }, { "epoch": 59.67, "grad_norm": 4.816281318664551, "learning_rate": 4.033082706766917e-06, "loss": 0.168, "step": 39680 }, { "epoch": 59.68, "grad_norm": 6.9781293869018555, "learning_rate": 4.031578947368422e-06, "loss": 0.2007, "step": 39690 }, { "epoch": 59.7, "grad_norm": 4.193832874298096, "learning_rate": 4.030075187969925e-06, "loss": 0.2435, "step": 39700 }, { "epoch": 59.71, "grad_norm": 3.802783966064453, "learning_rate": 4.028571428571429e-06, "loss": 0.2144, "step": 39710 }, { "epoch": 59.73, "grad_norm": 2.7991650104522705, "learning_rate": 4.027067669172933e-06, "loss": 0.1264, "step": 39720 }, { "epoch": 59.74, "grad_norm": 4.3173394203186035, "learning_rate": 4.025563909774437e-06, "loss": 0.1968, "step": 39730 }, { "epoch": 59.76, "grad_norm": 4.434994220733643, "learning_rate": 4.02406015037594e-06, "loss": 0.2191, "step": 39740 }, { "epoch": 59.77, "grad_norm": 3.0349013805389404, "learning_rate": 4.022556390977444e-06, "loss": 0.1819, "step": 39750 }, { "epoch": 59.79, "grad_norm": 3.8737924098968506, "learning_rate": 4.021052631578948e-06, "loss": 0.2129, "step": 39760 }, { "epoch": 59.8, "grad_norm": 6.809603214263916, "learning_rate": 4.0195488721804515e-06, "loss": 0.2261, "step": 39770 }, { "epoch": 59.82, "grad_norm": 1.9871443510055542, "learning_rate": 4.018045112781955e-06, "loss": 0.1694, "step": 39780 }, { "epoch": 59.83, "grad_norm": 7.0583977699279785, "learning_rate": 4.016541353383459e-06, "loss": 0.2135, "step": 39790 }, { "epoch": 59.85, "grad_norm": 5.820276737213135, "learning_rate": 4.015037593984963e-06, "loss": 0.2536, "step": 39800 }, { "epoch": 59.86, "grad_norm": 6.490072250366211, "learning_rate": 4.0135338345864665e-06, "loss": 0.1756, "step": 39810 }, { "epoch": 59.88, "grad_norm": 7.442599296569824, "learning_rate": 4.01203007518797e-06, "loss": 0.1695, "step": 39820 }, { "epoch": 59.89, "grad_norm": 5.296566486358643, "learning_rate": 4.010526315789474e-06, "loss": 0.2458, "step": 39830 }, { "epoch": 59.91, "grad_norm": 13.77005672454834, "learning_rate": 4.009022556390978e-06, "loss": 0.218, "step": 39840 }, { "epoch": 59.92, "grad_norm": 4.534656524658203, "learning_rate": 4.007518796992481e-06, "loss": 0.2051, "step": 39850 }, { "epoch": 59.94, "grad_norm": 6.368296146392822, "learning_rate": 4.006015037593986e-06, "loss": 0.1526, "step": 39860 }, { "epoch": 59.95, "grad_norm": 7.145541667938232, "learning_rate": 4.004511278195489e-06, "loss": 0.2447, "step": 39870 }, { "epoch": 59.97, "grad_norm": 4.66719388961792, "learning_rate": 4.003007518796993e-06, "loss": 0.259, "step": 39880 }, { "epoch": 59.98, "grad_norm": 3.6382246017456055, "learning_rate": 4.001503759398497e-06, "loss": 0.1915, "step": 39890 }, { "epoch": 60.0, "grad_norm": 0.5675032138824463, "learning_rate": 4.000000000000001e-06, "loss": 0.145, "step": 39900 }, { "epoch": 60.0, "eval_accuracy": 0.9295, "eval_loss": 0.3215762674808502, "eval_runtime": 85.0976, "eval_samples_per_second": 117.512, "eval_steps_per_second": 0.47, "step": 39900 }, { "epoch": 60.02, "grad_norm": 2.28545880317688, "learning_rate": 3.998496240601504e-06, "loss": 0.1901, "step": 39910 }, { "epoch": 60.03, "grad_norm": 15.001397132873535, "learning_rate": 3.996992481203008e-06, "loss": 0.1951, "step": 39920 }, { "epoch": 60.05, "grad_norm": 6.06882905960083, "learning_rate": 3.995488721804512e-06, "loss": 0.1432, "step": 39930 }, { "epoch": 60.06, "grad_norm": 3.984029769897461, "learning_rate": 3.993984962406016e-06, "loss": 0.2015, "step": 39940 }, { "epoch": 60.08, "grad_norm": 17.75435447692871, "learning_rate": 3.992481203007519e-06, "loss": 0.2074, "step": 39950 }, { "epoch": 60.09, "grad_norm": 6.104353427886963, "learning_rate": 3.990977443609023e-06, "loss": 0.2112, "step": 39960 }, { "epoch": 60.11, "grad_norm": 13.67701530456543, "learning_rate": 3.989473684210526e-06, "loss": 0.226, "step": 39970 }, { "epoch": 60.12, "grad_norm": 3.5879509449005127, "learning_rate": 3.9879699248120305e-06, "loss": 0.176, "step": 39980 }, { "epoch": 60.14, "grad_norm": 4.916759490966797, "learning_rate": 3.986466165413534e-06, "loss": 0.1977, "step": 39990 }, { "epoch": 60.15, "grad_norm": 7.997819423675537, "learning_rate": 3.9849624060150376e-06, "loss": 0.199, "step": 40000 }, { "epoch": 60.17, "grad_norm": 3.076766014099121, "learning_rate": 3.983458646616541e-06, "loss": 0.1479, "step": 40010 }, { "epoch": 60.18, "grad_norm": 6.796036720275879, "learning_rate": 3.9819548872180454e-06, "loss": 0.2257, "step": 40020 }, { "epoch": 60.2, "grad_norm": 6.836452007293701, "learning_rate": 3.980451127819549e-06, "loss": 0.2594, "step": 40030 }, { "epoch": 60.21, "grad_norm": 7.558508396148682, "learning_rate": 3.9789473684210525e-06, "loss": 0.2029, "step": 40040 }, { "epoch": 60.23, "grad_norm": 7.763191223144531, "learning_rate": 3.977443609022557e-06, "loss": 0.1931, "step": 40050 }, { "epoch": 60.24, "grad_norm": 6.8963799476623535, "learning_rate": 3.97593984962406e-06, "loss": 0.2369, "step": 40060 }, { "epoch": 60.26, "grad_norm": 6.124290943145752, "learning_rate": 3.974436090225564e-06, "loss": 0.2332, "step": 40070 }, { "epoch": 60.27, "grad_norm": 2.4686286449432373, "learning_rate": 3.972932330827068e-06, "loss": 0.1754, "step": 40080 }, { "epoch": 60.29, "grad_norm": 4.793791770935059, "learning_rate": 3.971428571428572e-06, "loss": 0.2353, "step": 40090 }, { "epoch": 60.3, "grad_norm": 2.7970879077911377, "learning_rate": 3.969924812030075e-06, "loss": 0.1597, "step": 40100 }, { "epoch": 60.32, "grad_norm": 6.253334045410156, "learning_rate": 3.968421052631579e-06, "loss": 0.2115, "step": 40110 }, { "epoch": 60.33, "grad_norm": 7.9991631507873535, "learning_rate": 3.966917293233083e-06, "loss": 0.2242, "step": 40120 }, { "epoch": 60.35, "grad_norm": 7.119922637939453, "learning_rate": 3.965413533834587e-06, "loss": 0.2398, "step": 40130 }, { "epoch": 60.36, "grad_norm": 2.1998627185821533, "learning_rate": 3.96390977443609e-06, "loss": 0.1425, "step": 40140 }, { "epoch": 60.38, "grad_norm": 5.503266334533691, "learning_rate": 3.9624060150375946e-06, "loss": 0.1403, "step": 40150 }, { "epoch": 60.39, "grad_norm": 4.746212959289551, "learning_rate": 3.960902255639098e-06, "loss": 0.2247, "step": 40160 }, { "epoch": 60.41, "grad_norm": 3.6244451999664307, "learning_rate": 3.959398496240602e-06, "loss": 0.2011, "step": 40170 }, { "epoch": 60.42, "grad_norm": 2.700594663619995, "learning_rate": 3.957894736842106e-06, "loss": 0.1956, "step": 40180 }, { "epoch": 60.44, "grad_norm": 8.941556930541992, "learning_rate": 3.9563909774436095e-06, "loss": 0.2188, "step": 40190 }, { "epoch": 60.45, "grad_norm": 3.507028818130493, "learning_rate": 3.954887218045113e-06, "loss": 0.191, "step": 40200 }, { "epoch": 60.47, "grad_norm": 3.7977683544158936, "learning_rate": 3.9533834586466165e-06, "loss": 0.194, "step": 40210 }, { "epoch": 60.48, "grad_norm": 4.61478328704834, "learning_rate": 3.951879699248121e-06, "loss": 0.2135, "step": 40220 }, { "epoch": 60.5, "grad_norm": 3.350219488143921, "learning_rate": 3.950375939849624e-06, "loss": 0.224, "step": 40230 }, { "epoch": 60.51, "grad_norm": 4.675546169281006, "learning_rate": 3.948872180451128e-06, "loss": 0.1851, "step": 40240 }, { "epoch": 60.53, "grad_norm": 4.169593811035156, "learning_rate": 3.947368421052632e-06, "loss": 0.2605, "step": 40250 }, { "epoch": 60.54, "grad_norm": 4.376767635345459, "learning_rate": 3.945864661654136e-06, "loss": 0.2116, "step": 40260 }, { "epoch": 60.56, "grad_norm": 3.2285714149475098, "learning_rate": 3.944360902255639e-06, "loss": 0.1881, "step": 40270 }, { "epoch": 60.57, "grad_norm": 3.8466668128967285, "learning_rate": 3.942857142857143e-06, "loss": 0.2534, "step": 40280 }, { "epoch": 60.59, "grad_norm": 8.687283515930176, "learning_rate": 3.941353383458647e-06, "loss": 0.2319, "step": 40290 }, { "epoch": 60.6, "grad_norm": 3.930408239364624, "learning_rate": 3.939849624060151e-06, "loss": 0.2331, "step": 40300 }, { "epoch": 60.62, "grad_norm": 3.52876615524292, "learning_rate": 3.938345864661654e-06, "loss": 0.1579, "step": 40310 }, { "epoch": 60.63, "grad_norm": 8.303112030029297, "learning_rate": 3.936842105263159e-06, "loss": 0.1893, "step": 40320 }, { "epoch": 60.65, "grad_norm": 6.3138957023620605, "learning_rate": 3.935338345864662e-06, "loss": 0.192, "step": 40330 }, { "epoch": 60.66, "grad_norm": 3.4053993225097656, "learning_rate": 3.933834586466166e-06, "loss": 0.1935, "step": 40340 }, { "epoch": 60.68, "grad_norm": 9.262184143066406, "learning_rate": 3.93233082706767e-06, "loss": 0.1957, "step": 40350 }, { "epoch": 60.69, "grad_norm": 5.6493754386901855, "learning_rate": 3.9308270676691736e-06, "loss": 0.2533, "step": 40360 }, { "epoch": 60.71, "grad_norm": 11.651237487792969, "learning_rate": 3.929323308270677e-06, "loss": 0.1964, "step": 40370 }, { "epoch": 60.72, "grad_norm": 3.903280019760132, "learning_rate": 3.927819548872181e-06, "loss": 0.1883, "step": 40380 }, { "epoch": 60.74, "grad_norm": 1.6574941873550415, "learning_rate": 3.926315789473685e-06, "loss": 0.2372, "step": 40390 }, { "epoch": 60.75, "grad_norm": 6.501513957977295, "learning_rate": 3.9248120300751885e-06, "loss": 0.13, "step": 40400 }, { "epoch": 60.77, "grad_norm": 5.910683631896973, "learning_rate": 3.923308270676692e-06, "loss": 0.2353, "step": 40410 }, { "epoch": 60.78, "grad_norm": 1.645150065422058, "learning_rate": 3.921804511278196e-06, "loss": 0.2189, "step": 40420 }, { "epoch": 60.8, "grad_norm": 5.449909210205078, "learning_rate": 3.9203007518797e-06, "loss": 0.1899, "step": 40430 }, { "epoch": 60.81, "grad_norm": 4.566460609436035, "learning_rate": 3.918796992481203e-06, "loss": 0.2828, "step": 40440 }, { "epoch": 60.83, "grad_norm": 4.585579872131348, "learning_rate": 3.917293233082707e-06, "loss": 0.1729, "step": 40450 }, { "epoch": 60.84, "grad_norm": 6.5645976066589355, "learning_rate": 3.9157894736842104e-06, "loss": 0.164, "step": 40460 }, { "epoch": 60.86, "grad_norm": 9.433652877807617, "learning_rate": 3.914285714285714e-06, "loss": 0.219, "step": 40470 }, { "epoch": 60.87, "grad_norm": 5.390902042388916, "learning_rate": 3.912781954887218e-06, "loss": 0.2299, "step": 40480 }, { "epoch": 60.89, "grad_norm": 3.7667274475097656, "learning_rate": 3.911278195488722e-06, "loss": 0.2021, "step": 40490 }, { "epoch": 60.9, "grad_norm": 4.511600017547607, "learning_rate": 3.909774436090225e-06, "loss": 0.2136, "step": 40500 }, { "epoch": 60.92, "grad_norm": 3.638052225112915, "learning_rate": 3.90827067669173e-06, "loss": 0.1652, "step": 40510 }, { "epoch": 60.93, "grad_norm": 13.318207740783691, "learning_rate": 3.906766917293233e-06, "loss": 0.1892, "step": 40520 }, { "epoch": 60.95, "grad_norm": 4.776786804199219, "learning_rate": 3.905263157894737e-06, "loss": 0.2, "step": 40530 }, { "epoch": 60.96, "grad_norm": 3.423015832901001, "learning_rate": 3.903759398496241e-06, "loss": 0.2601, "step": 40540 }, { "epoch": 60.98, "grad_norm": 4.982937812805176, "learning_rate": 3.902255639097745e-06, "loss": 0.1368, "step": 40550 }, { "epoch": 60.99, "grad_norm": 7.498082637786865, "learning_rate": 3.900751879699248e-06, "loss": 0.2804, "step": 40560 }, { "epoch": 61.0, "eval_accuracy": 0.9298, "eval_loss": 0.3252774775028229, "eval_runtime": 84.231, "eval_samples_per_second": 118.721, "eval_steps_per_second": 0.475, "step": 40565 }, { "epoch": 61.01, "grad_norm": 2.947619676589966, "learning_rate": 3.899248120300752e-06, "loss": 0.2347, "step": 40570 }, { "epoch": 61.02, "grad_norm": 4.865323543548584, "learning_rate": 3.897744360902256e-06, "loss": 0.1525, "step": 40580 }, { "epoch": 61.04, "grad_norm": 4.381734848022461, "learning_rate": 3.8962406015037596e-06, "loss": 0.2192, "step": 40590 }, { "epoch": 61.05, "grad_norm": 4.15067195892334, "learning_rate": 3.894736842105263e-06, "loss": 0.2132, "step": 40600 }, { "epoch": 61.07, "grad_norm": 4.439324855804443, "learning_rate": 3.8932330827067675e-06, "loss": 0.2321, "step": 40610 }, { "epoch": 61.08, "grad_norm": 3.797804594039917, "learning_rate": 3.891729323308271e-06, "loss": 0.1685, "step": 40620 }, { "epoch": 61.1, "grad_norm": 5.394155502319336, "learning_rate": 3.8902255639097745e-06, "loss": 0.1848, "step": 40630 }, { "epoch": 61.11, "grad_norm": 6.317572116851807, "learning_rate": 3.888721804511279e-06, "loss": 0.2337, "step": 40640 }, { "epoch": 61.13, "grad_norm": 4.338418483734131, "learning_rate": 3.887218045112782e-06, "loss": 0.1896, "step": 40650 }, { "epoch": 61.14, "grad_norm": 4.074448585510254, "learning_rate": 3.885714285714286e-06, "loss": 0.2793, "step": 40660 }, { "epoch": 61.16, "grad_norm": 7.455972671508789, "learning_rate": 3.884210526315789e-06, "loss": 0.2104, "step": 40670 }, { "epoch": 61.17, "grad_norm": 6.7889204025268555, "learning_rate": 3.882706766917294e-06, "loss": 0.2767, "step": 40680 }, { "epoch": 61.19, "grad_norm": 0.9154432415962219, "learning_rate": 3.881203007518797e-06, "loss": 0.1906, "step": 40690 }, { "epoch": 61.2, "grad_norm": 5.013433933258057, "learning_rate": 3.879699248120301e-06, "loss": 0.1908, "step": 40700 }, { "epoch": 61.22, "grad_norm": 4.357288360595703, "learning_rate": 3.878195488721805e-06, "loss": 0.1872, "step": 40710 }, { "epoch": 61.23, "grad_norm": 4.3976569175720215, "learning_rate": 3.876691729323309e-06, "loss": 0.1649, "step": 40720 }, { "epoch": 61.25, "grad_norm": 5.771651744842529, "learning_rate": 3.875187969924812e-06, "loss": 0.2027, "step": 40730 }, { "epoch": 61.26, "grad_norm": 4.635252475738525, "learning_rate": 3.873684210526316e-06, "loss": 0.2115, "step": 40740 }, { "epoch": 61.28, "grad_norm": 3.256643295288086, "learning_rate": 3.87218045112782e-06, "loss": 0.2126, "step": 40750 }, { "epoch": 61.29, "grad_norm": 5.223315715789795, "learning_rate": 3.870676691729324e-06, "loss": 0.2077, "step": 40760 }, { "epoch": 61.31, "grad_norm": 4.067598819732666, "learning_rate": 3.869172932330827e-06, "loss": 0.1596, "step": 40770 }, { "epoch": 61.32, "grad_norm": 3.9475274085998535, "learning_rate": 3.8676691729323315e-06, "loss": 0.2226, "step": 40780 }, { "epoch": 61.34, "grad_norm": 5.894591331481934, "learning_rate": 3.866165413533835e-06, "loss": 0.2033, "step": 40790 }, { "epoch": 61.35, "grad_norm": 7.091558933258057, "learning_rate": 3.8646616541353386e-06, "loss": 0.234, "step": 40800 }, { "epoch": 61.37, "grad_norm": 7.475659370422363, "learning_rate": 3.863157894736843e-06, "loss": 0.1912, "step": 40810 }, { "epoch": 61.38, "grad_norm": 4.879849910736084, "learning_rate": 3.8616541353383464e-06, "loss": 0.1841, "step": 40820 }, { "epoch": 61.4, "grad_norm": 5.782186031341553, "learning_rate": 3.86015037593985e-06, "loss": 0.2058, "step": 40830 }, { "epoch": 61.41, "grad_norm": 5.1282196044921875, "learning_rate": 3.8586466165413535e-06, "loss": 0.2359, "step": 40840 }, { "epoch": 61.43, "grad_norm": 5.245487213134766, "learning_rate": 3.857142857142858e-06, "loss": 0.2142, "step": 40850 }, { "epoch": 61.44, "grad_norm": 5.231210708618164, "learning_rate": 3.855639097744361e-06, "loss": 0.2342, "step": 40860 }, { "epoch": 61.46, "grad_norm": 2.8753762245178223, "learning_rate": 3.854135338345865e-06, "loss": 0.2064, "step": 40870 }, { "epoch": 61.47, "grad_norm": 10.435833930969238, "learning_rate": 3.852631578947369e-06, "loss": 0.1609, "step": 40880 }, { "epoch": 61.49, "grad_norm": 6.82497501373291, "learning_rate": 3.851127819548873e-06, "loss": 0.237, "step": 40890 }, { "epoch": 61.5, "grad_norm": 3.6825904846191406, "learning_rate": 3.849624060150376e-06, "loss": 0.2074, "step": 40900 }, { "epoch": 61.52, "grad_norm": 2.7955589294433594, "learning_rate": 3.848120300751881e-06, "loss": 0.1649, "step": 40910 }, { "epoch": 61.53, "grad_norm": 5.636986255645752, "learning_rate": 3.846616541353384e-06, "loss": 0.1815, "step": 40920 }, { "epoch": 61.55, "grad_norm": 5.697868347167969, "learning_rate": 3.845112781954888e-06, "loss": 0.1894, "step": 40930 }, { "epoch": 61.56, "grad_norm": 4.8334808349609375, "learning_rate": 3.843609022556391e-06, "loss": 0.2018, "step": 40940 }, { "epoch": 61.58, "grad_norm": 6.514235496520996, "learning_rate": 3.842105263157895e-06, "loss": 0.2102, "step": 40950 }, { "epoch": 61.59, "grad_norm": 4.935739994049072, "learning_rate": 3.840601503759398e-06, "loss": 0.1594, "step": 40960 }, { "epoch": 61.61, "grad_norm": 14.143031120300293, "learning_rate": 3.839097744360903e-06, "loss": 0.2094, "step": 40970 }, { "epoch": 61.62, "grad_norm": 4.090781211853027, "learning_rate": 3.837593984962406e-06, "loss": 0.2485, "step": 40980 }, { "epoch": 61.64, "grad_norm": 9.650825500488281, "learning_rate": 3.83609022556391e-06, "loss": 0.1868, "step": 40990 }, { "epoch": 61.65, "grad_norm": 3.9108171463012695, "learning_rate": 3.834586466165414e-06, "loss": 0.2519, "step": 41000 }, { "epoch": 61.67, "grad_norm": 3.324169874191284, "learning_rate": 3.8330827067669175e-06, "loss": 0.1668, "step": 41010 }, { "epoch": 61.68, "grad_norm": 6.214301586151123, "learning_rate": 3.831578947368421e-06, "loss": 0.2764, "step": 41020 }, { "epoch": 61.7, "grad_norm": 4.068943500518799, "learning_rate": 3.830075187969925e-06, "loss": 0.2038, "step": 41030 }, { "epoch": 61.71, "grad_norm": 3.677924633026123, "learning_rate": 3.828571428571429e-06, "loss": 0.1743, "step": 41040 }, { "epoch": 61.73, "grad_norm": 4.874640464782715, "learning_rate": 3.8270676691729325e-06, "loss": 0.2046, "step": 41050 }, { "epoch": 61.74, "grad_norm": 4.538558006286621, "learning_rate": 3.825563909774436e-06, "loss": 0.2542, "step": 41060 }, { "epoch": 61.76, "grad_norm": 7.119689464569092, "learning_rate": 3.82406015037594e-06, "loss": 0.1539, "step": 41070 }, { "epoch": 61.77, "grad_norm": 6.170871257781982, "learning_rate": 3.822556390977444e-06, "loss": 0.3122, "step": 41080 }, { "epoch": 61.79, "grad_norm": 4.091243267059326, "learning_rate": 3.821052631578947e-06, "loss": 0.2069, "step": 41090 }, { "epoch": 61.8, "grad_norm": 5.074096202850342, "learning_rate": 3.819548872180452e-06, "loss": 0.2046, "step": 41100 }, { "epoch": 61.82, "grad_norm": 6.762053966522217, "learning_rate": 3.818045112781955e-06, "loss": 0.1954, "step": 41110 }, { "epoch": 61.83, "grad_norm": 3.479602098464966, "learning_rate": 3.816541353383459e-06, "loss": 0.2208, "step": 41120 }, { "epoch": 61.85, "grad_norm": 4.815489768981934, "learning_rate": 3.815037593984962e-06, "loss": 0.191, "step": 41130 }, { "epoch": 61.86, "grad_norm": 3.5152673721313477, "learning_rate": 3.8135338345864663e-06, "loss": 0.245, "step": 41140 }, { "epoch": 61.88, "grad_norm": 2.4947726726531982, "learning_rate": 3.81203007518797e-06, "loss": 0.2218, "step": 41150 }, { "epoch": 61.89, "grad_norm": 3.991682291030884, "learning_rate": 3.810526315789474e-06, "loss": 0.166, "step": 41160 }, { "epoch": 61.91, "grad_norm": 5.845560550689697, "learning_rate": 3.8090225563909777e-06, "loss": 0.1897, "step": 41170 }, { "epoch": 61.92, "grad_norm": 4.220813274383545, "learning_rate": 3.8075187969924816e-06, "loss": 0.1668, "step": 41180 }, { "epoch": 61.94, "grad_norm": 6.980546951293945, "learning_rate": 3.806015037593985e-06, "loss": 0.185, "step": 41190 }, { "epoch": 61.95, "grad_norm": 4.129945278167725, "learning_rate": 3.804511278195489e-06, "loss": 0.2469, "step": 41200 }, { "epoch": 61.97, "grad_norm": 2.966998815536499, "learning_rate": 3.803007518796993e-06, "loss": 0.2203, "step": 41210 }, { "epoch": 61.98, "grad_norm": 4.130844593048096, "learning_rate": 3.8015037593984965e-06, "loss": 0.2189, "step": 41220 }, { "epoch": 62.0, "grad_norm": 19.200584411621094, "learning_rate": 3.8000000000000005e-06, "loss": 0.1696, "step": 41230 }, { "epoch": 62.0, "eval_accuracy": 0.9315, "eval_loss": 0.30856576561927795, "eval_runtime": 84.694, "eval_samples_per_second": 118.072, "eval_steps_per_second": 0.472, "step": 41230 }, { "epoch": 62.02, "grad_norm": 3.957939386367798, "learning_rate": 3.798496240601504e-06, "loss": 0.1619, "step": 41240 }, { "epoch": 62.03, "grad_norm": 5.774256229400635, "learning_rate": 3.796992481203008e-06, "loss": 0.1798, "step": 41250 }, { "epoch": 62.05, "grad_norm": 4.3362274169921875, "learning_rate": 3.795488721804512e-06, "loss": 0.1837, "step": 41260 }, { "epoch": 62.06, "grad_norm": 5.490597248077393, "learning_rate": 3.7939849624060154e-06, "loss": 0.168, "step": 41270 }, { "epoch": 62.08, "grad_norm": 5.736985683441162, "learning_rate": 3.7924812030075193e-06, "loss": 0.2156, "step": 41280 }, { "epoch": 62.09, "grad_norm": 5.470156192779541, "learning_rate": 3.790977443609023e-06, "loss": 0.1872, "step": 41290 }, { "epoch": 62.11, "grad_norm": 6.704098701477051, "learning_rate": 3.789473684210527e-06, "loss": 0.2517, "step": 41300 }, { "epoch": 62.12, "grad_norm": 3.4402177333831787, "learning_rate": 3.7879699248120303e-06, "loss": 0.1928, "step": 41310 }, { "epoch": 62.14, "grad_norm": 4.056411266326904, "learning_rate": 3.7864661654135343e-06, "loss": 0.2138, "step": 41320 }, { "epoch": 62.15, "grad_norm": 6.119357109069824, "learning_rate": 3.784962406015038e-06, "loss": 0.2381, "step": 41330 }, { "epoch": 62.17, "grad_norm": 6.0787787437438965, "learning_rate": 3.7834586466165417e-06, "loss": 0.1763, "step": 41340 }, { "epoch": 62.18, "grad_norm": 5.722217082977295, "learning_rate": 3.7819548872180457e-06, "loss": 0.1505, "step": 41350 }, { "epoch": 62.2, "grad_norm": 1.9974137544631958, "learning_rate": 3.780451127819549e-06, "loss": 0.1912, "step": 41360 }, { "epoch": 62.21, "grad_norm": 3.360246181488037, "learning_rate": 3.778947368421053e-06, "loss": 0.2063, "step": 41370 }, { "epoch": 62.23, "grad_norm": 2.2372615337371826, "learning_rate": 3.777443609022557e-06, "loss": 0.1898, "step": 41380 }, { "epoch": 62.24, "grad_norm": 7.954653263092041, "learning_rate": 3.7759398496240606e-06, "loss": 0.1837, "step": 41390 }, { "epoch": 62.26, "grad_norm": 4.030917644500732, "learning_rate": 3.7744360902255645e-06, "loss": 0.206, "step": 41400 }, { "epoch": 62.27, "grad_norm": 6.891762733459473, "learning_rate": 3.772932330827068e-06, "loss": 0.2181, "step": 41410 }, { "epoch": 62.29, "grad_norm": 6.047714710235596, "learning_rate": 3.771428571428572e-06, "loss": 0.2758, "step": 41420 }, { "epoch": 62.3, "grad_norm": 3.0663442611694336, "learning_rate": 3.769924812030076e-06, "loss": 0.1705, "step": 41430 }, { "epoch": 62.32, "grad_norm": 5.566664218902588, "learning_rate": 3.768421052631579e-06, "loss": 0.1676, "step": 41440 }, { "epoch": 62.33, "grad_norm": 3.831843614578247, "learning_rate": 3.7669172932330825e-06, "loss": 0.1544, "step": 41450 }, { "epoch": 62.35, "grad_norm": 3.094122886657715, "learning_rate": 3.7654135338345865e-06, "loss": 0.2025, "step": 41460 }, { "epoch": 62.36, "grad_norm": 10.858261108398438, "learning_rate": 3.7639097744360904e-06, "loss": 0.2037, "step": 41470 }, { "epoch": 62.38, "grad_norm": 3.284749746322632, "learning_rate": 3.762406015037594e-06, "loss": 0.177, "step": 41480 }, { "epoch": 62.39, "grad_norm": 5.909909248352051, "learning_rate": 3.760902255639098e-06, "loss": 0.1802, "step": 41490 }, { "epoch": 62.41, "grad_norm": 2.958390474319458, "learning_rate": 3.7593984962406014e-06, "loss": 0.1886, "step": 41500 }, { "epoch": 62.42, "grad_norm": 5.828741550445557, "learning_rate": 3.7578947368421053e-06, "loss": 0.2374, "step": 41510 }, { "epoch": 62.44, "grad_norm": 6.119723320007324, "learning_rate": 3.7563909774436093e-06, "loss": 0.1787, "step": 41520 }, { "epoch": 62.45, "grad_norm": 5.897652626037598, "learning_rate": 3.754887218045113e-06, "loss": 0.1704, "step": 41530 }, { "epoch": 62.47, "grad_norm": 3.6232078075408936, "learning_rate": 3.7533834586466168e-06, "loss": 0.2112, "step": 41540 }, { "epoch": 62.48, "grad_norm": 4.357728958129883, "learning_rate": 3.7518796992481203e-06, "loss": 0.1745, "step": 41550 }, { "epoch": 62.5, "grad_norm": 3.9227254390716553, "learning_rate": 3.7503759398496242e-06, "loss": 0.1279, "step": 41560 }, { "epoch": 62.51, "grad_norm": 4.716411113739014, "learning_rate": 3.748872180451128e-06, "loss": 0.1731, "step": 41570 }, { "epoch": 62.53, "grad_norm": 5.446840763092041, "learning_rate": 3.7473684210526317e-06, "loss": 0.173, "step": 41580 }, { "epoch": 62.54, "grad_norm": 9.927755355834961, "learning_rate": 3.7458646616541356e-06, "loss": 0.2158, "step": 41590 }, { "epoch": 62.56, "grad_norm": 4.404961585998535, "learning_rate": 3.744360902255639e-06, "loss": 0.2027, "step": 41600 }, { "epoch": 62.57, "grad_norm": 8.573219299316406, "learning_rate": 3.742857142857143e-06, "loss": 0.2426, "step": 41610 }, { "epoch": 62.59, "grad_norm": 5.887312889099121, "learning_rate": 3.741353383458647e-06, "loss": 0.249, "step": 41620 }, { "epoch": 62.6, "grad_norm": 8.018643379211426, "learning_rate": 3.7398496240601505e-06, "loss": 0.1449, "step": 41630 }, { "epoch": 62.62, "grad_norm": 5.178997993469238, "learning_rate": 3.7383458646616545e-06, "loss": 0.2288, "step": 41640 }, { "epoch": 62.63, "grad_norm": 4.393047332763672, "learning_rate": 3.736842105263158e-06, "loss": 0.2121, "step": 41650 }, { "epoch": 62.65, "grad_norm": 4.626613616943359, "learning_rate": 3.735338345864662e-06, "loss": 0.1793, "step": 41660 }, { "epoch": 62.66, "grad_norm": 2.9352829456329346, "learning_rate": 3.733834586466166e-06, "loss": 0.1641, "step": 41670 }, { "epoch": 62.68, "grad_norm": 7.237253665924072, "learning_rate": 3.7323308270676694e-06, "loss": 0.1669, "step": 41680 }, { "epoch": 62.69, "grad_norm": 7.4211835861206055, "learning_rate": 3.7308270676691734e-06, "loss": 0.1607, "step": 41690 }, { "epoch": 62.71, "grad_norm": 5.203456401824951, "learning_rate": 3.729323308270677e-06, "loss": 0.1946, "step": 41700 }, { "epoch": 62.72, "grad_norm": 6.299890995025635, "learning_rate": 3.727819548872181e-06, "loss": 0.17, "step": 41710 }, { "epoch": 62.74, "grad_norm": 6.215569972991943, "learning_rate": 3.7263157894736848e-06, "loss": 0.1552, "step": 41720 }, { "epoch": 62.75, "grad_norm": 6.16441011428833, "learning_rate": 3.7248120300751883e-06, "loss": 0.1511, "step": 41730 }, { "epoch": 62.77, "grad_norm": 2.391936779022217, "learning_rate": 3.7233082706766922e-06, "loss": 0.2533, "step": 41740 }, { "epoch": 62.78, "grad_norm": 5.521732807159424, "learning_rate": 3.7218045112781957e-06, "loss": 0.2338, "step": 41750 }, { "epoch": 62.8, "grad_norm": 2.640953779220581, "learning_rate": 3.7203007518796997e-06, "loss": 0.2111, "step": 41760 }, { "epoch": 62.81, "grad_norm": 4.677870273590088, "learning_rate": 3.718796992481203e-06, "loss": 0.2166, "step": 41770 }, { "epoch": 62.83, "grad_norm": 4.7918291091918945, "learning_rate": 3.717293233082707e-06, "loss": 0.2068, "step": 41780 }, { "epoch": 62.84, "grad_norm": 4.125340938568115, "learning_rate": 3.715789473684211e-06, "loss": 0.1522, "step": 41790 }, { "epoch": 62.86, "grad_norm": 5.1403117179870605, "learning_rate": 3.7142857142857146e-06, "loss": 0.2198, "step": 41800 }, { "epoch": 62.87, "grad_norm": 4.696916580200195, "learning_rate": 3.7127819548872185e-06, "loss": 0.1715, "step": 41810 }, { "epoch": 62.89, "grad_norm": 3.3158819675445557, "learning_rate": 3.711278195488722e-06, "loss": 0.172, "step": 41820 }, { "epoch": 62.9, "grad_norm": 6.180699825286865, "learning_rate": 3.709774436090226e-06, "loss": 0.2321, "step": 41830 }, { "epoch": 62.92, "grad_norm": 5.126304626464844, "learning_rate": 3.70827067669173e-06, "loss": 0.1684, "step": 41840 }, { "epoch": 62.93, "grad_norm": 7.246421813964844, "learning_rate": 3.7067669172932335e-06, "loss": 0.1933, "step": 41850 }, { "epoch": 62.95, "grad_norm": 4.105726718902588, "learning_rate": 3.7052631578947374e-06, "loss": 0.2189, "step": 41860 }, { "epoch": 62.96, "grad_norm": 3.8485569953918457, "learning_rate": 3.703759398496241e-06, "loss": 0.1698, "step": 41870 }, { "epoch": 62.98, "grad_norm": 3.669081211090088, "learning_rate": 3.702255639097745e-06, "loss": 0.1472, "step": 41880 }, { "epoch": 62.99, "grad_norm": 8.504827499389648, "learning_rate": 3.700751879699249e-06, "loss": 0.2194, "step": 41890 }, { "epoch": 63.0, "eval_accuracy": 0.9313, "eval_loss": 0.31697988510131836, "eval_runtime": 84.413, "eval_samples_per_second": 118.465, "eval_steps_per_second": 0.474, "step": 41895 }, { "epoch": 63.01, "grad_norm": 3.0113203525543213, "learning_rate": 3.6992481203007523e-06, "loss": 0.3075, "step": 41900 }, { "epoch": 63.02, "grad_norm": 5.44133186340332, "learning_rate": 3.6977443609022563e-06, "loss": 0.2535, "step": 41910 }, { "epoch": 63.04, "grad_norm": 4.054553985595703, "learning_rate": 3.69624060150376e-06, "loss": 0.1859, "step": 41920 }, { "epoch": 63.05, "grad_norm": 2.3150360584259033, "learning_rate": 3.6947368421052637e-06, "loss": 0.1969, "step": 41930 }, { "epoch": 63.07, "grad_norm": 5.139898777008057, "learning_rate": 3.693233082706767e-06, "loss": 0.1992, "step": 41940 }, { "epoch": 63.08, "grad_norm": 3.828563928604126, "learning_rate": 3.6917293233082708e-06, "loss": 0.2596, "step": 41950 }, { "epoch": 63.1, "grad_norm": 5.204103469848633, "learning_rate": 3.6902255639097743e-06, "loss": 0.1468, "step": 41960 }, { "epoch": 63.11, "grad_norm": 2.3387629985809326, "learning_rate": 3.6887218045112782e-06, "loss": 0.1782, "step": 41970 }, { "epoch": 63.13, "grad_norm": 5.02309513092041, "learning_rate": 3.687218045112782e-06, "loss": 0.2029, "step": 41980 }, { "epoch": 63.14, "grad_norm": 6.977344512939453, "learning_rate": 3.6857142857142857e-06, "loss": 0.1883, "step": 41990 }, { "epoch": 63.16, "grad_norm": 6.206759929656982, "learning_rate": 3.6842105263157896e-06, "loss": 0.2007, "step": 42000 }, { "epoch": 63.17, "grad_norm": 7.947546482086182, "learning_rate": 3.682706766917293e-06, "loss": 0.2051, "step": 42010 }, { "epoch": 63.19, "grad_norm": 5.88115119934082, "learning_rate": 3.681203007518797e-06, "loss": 0.168, "step": 42020 }, { "epoch": 63.2, "grad_norm": 5.9370198249816895, "learning_rate": 3.679699248120301e-06, "loss": 0.1566, "step": 42030 }, { "epoch": 63.22, "grad_norm": 4.835606098175049, "learning_rate": 3.6781954887218046e-06, "loss": 0.1499, "step": 42040 }, { "epoch": 63.23, "grad_norm": 4.858092308044434, "learning_rate": 3.6766917293233085e-06, "loss": 0.1637, "step": 42050 }, { "epoch": 63.25, "grad_norm": 3.878875970840454, "learning_rate": 3.675187969924812e-06, "loss": 0.2207, "step": 42060 }, { "epoch": 63.26, "grad_norm": 4.815737724304199, "learning_rate": 3.673684210526316e-06, "loss": 0.1478, "step": 42070 }, { "epoch": 63.28, "grad_norm": 5.654510498046875, "learning_rate": 3.67218045112782e-06, "loss": 0.1988, "step": 42080 }, { "epoch": 63.29, "grad_norm": 7.474091053009033, "learning_rate": 3.6706766917293234e-06, "loss": 0.1832, "step": 42090 }, { "epoch": 63.31, "grad_norm": 8.080961227416992, "learning_rate": 3.6691729323308274e-06, "loss": 0.1956, "step": 42100 }, { "epoch": 63.32, "grad_norm": 5.121028423309326, "learning_rate": 3.667669172932331e-06, "loss": 0.1844, "step": 42110 }, { "epoch": 63.34, "grad_norm": 5.617638111114502, "learning_rate": 3.666165413533835e-06, "loss": 0.2037, "step": 42120 }, { "epoch": 63.35, "grad_norm": 5.1372270584106445, "learning_rate": 3.6646616541353388e-06, "loss": 0.171, "step": 42130 }, { "epoch": 63.37, "grad_norm": 5.457404136657715, "learning_rate": 3.6631578947368423e-06, "loss": 0.1537, "step": 42140 }, { "epoch": 63.38, "grad_norm": 9.514487266540527, "learning_rate": 3.6616541353383462e-06, "loss": 0.2131, "step": 42150 }, { "epoch": 63.4, "grad_norm": 6.3367414474487305, "learning_rate": 3.6601503759398498e-06, "loss": 0.1886, "step": 42160 }, { "epoch": 63.41, "grad_norm": 7.155886650085449, "learning_rate": 3.6586466165413537e-06, "loss": 0.2358, "step": 42170 }, { "epoch": 63.43, "grad_norm": 4.044269561767578, "learning_rate": 3.6571428571428576e-06, "loss": 0.271, "step": 42180 }, { "epoch": 63.44, "grad_norm": 7.966317653656006, "learning_rate": 3.655639097744361e-06, "loss": 0.182, "step": 42190 }, { "epoch": 63.46, "grad_norm": 8.241473197937012, "learning_rate": 3.654135338345865e-06, "loss": 0.2768, "step": 42200 }, { "epoch": 63.47, "grad_norm": 5.769531726837158, "learning_rate": 3.6526315789473686e-06, "loss": 0.2628, "step": 42210 }, { "epoch": 63.49, "grad_norm": 4.371389865875244, "learning_rate": 3.6511278195488726e-06, "loss": 0.2603, "step": 42220 }, { "epoch": 63.5, "grad_norm": 3.9458630084991455, "learning_rate": 3.649624060150376e-06, "loss": 0.1757, "step": 42230 }, { "epoch": 63.52, "grad_norm": 8.014843940734863, "learning_rate": 3.64812030075188e-06, "loss": 0.1979, "step": 42240 }, { "epoch": 63.53, "grad_norm": 4.720882892608643, "learning_rate": 3.646616541353384e-06, "loss": 0.2039, "step": 42250 }, { "epoch": 63.55, "grad_norm": 13.893559455871582, "learning_rate": 3.6451127819548875e-06, "loss": 0.2065, "step": 42260 }, { "epoch": 63.56, "grad_norm": 5.4362406730651855, "learning_rate": 3.6436090225563914e-06, "loss": 0.169, "step": 42270 }, { "epoch": 63.58, "grad_norm": 4.523813247680664, "learning_rate": 3.642105263157895e-06, "loss": 0.2152, "step": 42280 }, { "epoch": 63.59, "grad_norm": 2.5966227054595947, "learning_rate": 3.640601503759399e-06, "loss": 0.1696, "step": 42290 }, { "epoch": 63.61, "grad_norm": 5.089998245239258, "learning_rate": 3.639097744360903e-06, "loss": 0.1484, "step": 42300 }, { "epoch": 63.62, "grad_norm": 5.716888427734375, "learning_rate": 3.6375939849624064e-06, "loss": 0.1763, "step": 42310 }, { "epoch": 63.64, "grad_norm": 5.487515926361084, "learning_rate": 3.6360902255639103e-06, "loss": 0.1831, "step": 42320 }, { "epoch": 63.65, "grad_norm": 3.316849708557129, "learning_rate": 3.634586466165414e-06, "loss": 0.2312, "step": 42330 }, { "epoch": 63.67, "grad_norm": 5.662327766418457, "learning_rate": 3.6330827067669178e-06, "loss": 0.2383, "step": 42340 }, { "epoch": 63.68, "grad_norm": 6.571043968200684, "learning_rate": 3.6315789473684217e-06, "loss": 0.2039, "step": 42350 }, { "epoch": 63.7, "grad_norm": 5.3168044090271, "learning_rate": 3.6300751879699252e-06, "loss": 0.1692, "step": 42360 }, { "epoch": 63.71, "grad_norm": 3.067420482635498, "learning_rate": 3.628571428571429e-06, "loss": 0.1852, "step": 42370 }, { "epoch": 63.73, "grad_norm": 8.511592864990234, "learning_rate": 3.6270676691729327e-06, "loss": 0.1966, "step": 42380 }, { "epoch": 63.74, "grad_norm": 6.0220232009887695, "learning_rate": 3.6255639097744366e-06, "loss": 0.1325, "step": 42390 }, { "epoch": 63.76, "grad_norm": 5.685608386993408, "learning_rate": 3.6240601503759406e-06, "loss": 0.228, "step": 42400 }, { "epoch": 63.77, "grad_norm": 6.7746052742004395, "learning_rate": 3.622556390977444e-06, "loss": 0.2469, "step": 42410 }, { "epoch": 63.79, "grad_norm": 5.100920677185059, "learning_rate": 3.621052631578948e-06, "loss": 0.1907, "step": 42420 }, { "epoch": 63.8, "grad_norm": 3.674767255783081, "learning_rate": 3.6195488721804515e-06, "loss": 0.1922, "step": 42430 }, { "epoch": 63.82, "grad_norm": 7.852967739105225, "learning_rate": 3.618045112781955e-06, "loss": 0.1973, "step": 42440 }, { "epoch": 63.83, "grad_norm": 3.643383741378784, "learning_rate": 3.6165413533834586e-06, "loss": 0.1633, "step": 42450 }, { "epoch": 63.85, "grad_norm": 5.850439071655273, "learning_rate": 3.6150375939849625e-06, "loss": 0.1915, "step": 42460 }, { "epoch": 63.86, "grad_norm": 4.816995620727539, "learning_rate": 3.613533834586466e-06, "loss": 0.2416, "step": 42470 }, { "epoch": 63.88, "grad_norm": 4.024276256561279, "learning_rate": 3.61203007518797e-06, "loss": 0.207, "step": 42480 }, { "epoch": 63.89, "grad_norm": 4.65853214263916, "learning_rate": 3.610526315789474e-06, "loss": 0.1428, "step": 42490 }, { "epoch": 63.91, "grad_norm": 1.6227343082427979, "learning_rate": 3.6090225563909775e-06, "loss": 0.2439, "step": 42500 }, { "epoch": 63.92, "grad_norm": 5.130610466003418, "learning_rate": 3.6075187969924814e-06, "loss": 0.2491, "step": 42510 }, { "epoch": 63.94, "grad_norm": 7.856945991516113, "learning_rate": 3.606015037593985e-06, "loss": 0.2532, "step": 42520 }, { "epoch": 63.95, "grad_norm": 3.6116113662719727, "learning_rate": 3.604511278195489e-06, "loss": 0.258, "step": 42530 }, { "epoch": 63.97, "grad_norm": 6.224160671234131, "learning_rate": 3.603007518796993e-06, "loss": 0.2729, "step": 42540 }, { "epoch": 63.98, "grad_norm": 5.5140156745910645, "learning_rate": 3.6015037593984963e-06, "loss": 0.1669, "step": 42550 }, { "epoch": 64.0, "grad_norm": 13.927995681762695, "learning_rate": 3.6000000000000003e-06, "loss": 0.2297, "step": 42560 }, { "epoch": 64.0, "eval_accuracy": 0.9293, "eval_loss": 0.3231370151042938, "eval_runtime": 85.3183, "eval_samples_per_second": 117.208, "eval_steps_per_second": 0.469, "step": 42560 }, { "epoch": 64.02, "grad_norm": 8.015310287475586, "learning_rate": 3.5984962406015038e-06, "loss": 0.2013, "step": 42570 }, { "epoch": 64.03, "grad_norm": 7.057667255401611, "learning_rate": 3.5969924812030077e-06, "loss": 0.2238, "step": 42580 }, { "epoch": 64.05, "grad_norm": 4.543213367462158, "learning_rate": 3.5954887218045117e-06, "loss": 0.1907, "step": 42590 }, { "epoch": 64.06, "grad_norm": 2.699389934539795, "learning_rate": 3.593984962406015e-06, "loss": 0.1875, "step": 42600 }, { "epoch": 64.08, "grad_norm": 10.956419944763184, "learning_rate": 3.592481203007519e-06, "loss": 0.2209, "step": 42610 }, { "epoch": 64.09, "grad_norm": 3.574812173843384, "learning_rate": 3.5909774436090226e-06, "loss": 0.2168, "step": 42620 }, { "epoch": 64.11, "grad_norm": 4.676037311553955, "learning_rate": 3.5894736842105266e-06, "loss": 0.1879, "step": 42630 }, { "epoch": 64.12, "grad_norm": 3.858914375305176, "learning_rate": 3.5879699248120305e-06, "loss": 0.2086, "step": 42640 }, { "epoch": 64.14, "grad_norm": 4.243492126464844, "learning_rate": 3.586466165413534e-06, "loss": 0.2135, "step": 42650 }, { "epoch": 64.15, "grad_norm": 5.530261039733887, "learning_rate": 3.584962406015038e-06, "loss": 0.2014, "step": 42660 }, { "epoch": 64.17, "grad_norm": 5.095816135406494, "learning_rate": 3.5834586466165415e-06, "loss": 0.1919, "step": 42670 }, { "epoch": 64.18, "grad_norm": 2.273388624191284, "learning_rate": 3.5819548872180455e-06, "loss": 0.15, "step": 42680 }, { "epoch": 64.2, "grad_norm": 5.889043807983398, "learning_rate": 3.580451127819549e-06, "loss": 0.2731, "step": 42690 }, { "epoch": 64.21, "grad_norm": 3.004957437515259, "learning_rate": 3.578947368421053e-06, "loss": 0.2121, "step": 42700 }, { "epoch": 64.23, "grad_norm": 3.732741594314575, "learning_rate": 3.577443609022557e-06, "loss": 0.1757, "step": 42710 }, { "epoch": 64.24, "grad_norm": 4.3243584632873535, "learning_rate": 3.5759398496240604e-06, "loss": 0.2344, "step": 42720 }, { "epoch": 64.26, "grad_norm": 4.092397689819336, "learning_rate": 3.5744360902255643e-06, "loss": 0.1653, "step": 42730 }, { "epoch": 64.27, "grad_norm": 5.2698187828063965, "learning_rate": 3.572932330827068e-06, "loss": 0.19, "step": 42740 }, { "epoch": 64.29, "grad_norm": 7.864371299743652, "learning_rate": 3.5714285714285718e-06, "loss": 0.1977, "step": 42750 }, { "epoch": 64.3, "grad_norm": 2.6256632804870605, "learning_rate": 3.5699248120300757e-06, "loss": 0.2334, "step": 42760 }, { "epoch": 64.32, "grad_norm": 2.710421085357666, "learning_rate": 3.5684210526315792e-06, "loss": 0.1919, "step": 42770 }, { "epoch": 64.33, "grad_norm": 5.321908950805664, "learning_rate": 3.566917293233083e-06, "loss": 0.1552, "step": 42780 }, { "epoch": 64.35, "grad_norm": 6.268770694732666, "learning_rate": 3.5654135338345867e-06, "loss": 0.1363, "step": 42790 }, { "epoch": 64.36, "grad_norm": 4.892660617828369, "learning_rate": 3.5639097744360906e-06, "loss": 0.1603, "step": 42800 }, { "epoch": 64.38, "grad_norm": 5.775477409362793, "learning_rate": 3.5624060150375946e-06, "loss": 0.2558, "step": 42810 }, { "epoch": 64.39, "grad_norm": 7.529040813446045, "learning_rate": 3.560902255639098e-06, "loss": 0.2444, "step": 42820 }, { "epoch": 64.41, "grad_norm": 2.752485513687134, "learning_rate": 3.559398496240602e-06, "loss": 0.2147, "step": 42830 }, { "epoch": 64.42, "grad_norm": 7.767008304595947, "learning_rate": 3.5578947368421056e-06, "loss": 0.2318, "step": 42840 }, { "epoch": 64.44, "grad_norm": 6.706613063812256, "learning_rate": 3.5563909774436095e-06, "loss": 0.192, "step": 42850 }, { "epoch": 64.45, "grad_norm": 3.0275185108184814, "learning_rate": 3.5548872180451135e-06, "loss": 0.1605, "step": 42860 }, { "epoch": 64.47, "grad_norm": 4.424815654754639, "learning_rate": 3.553383458646617e-06, "loss": 0.1919, "step": 42870 }, { "epoch": 64.48, "grad_norm": 4.253983020782471, "learning_rate": 3.551879699248121e-06, "loss": 0.2079, "step": 42880 }, { "epoch": 64.5, "grad_norm": 4.817580699920654, "learning_rate": 3.5503759398496244e-06, "loss": 0.1557, "step": 42890 }, { "epoch": 64.51, "grad_norm": 10.810401916503906, "learning_rate": 3.5488721804511284e-06, "loss": 0.1925, "step": 42900 }, { "epoch": 64.53, "grad_norm": 4.242228984832764, "learning_rate": 3.5473684210526323e-06, "loss": 0.1776, "step": 42910 }, { "epoch": 64.54, "grad_norm": 3.9990408420562744, "learning_rate": 3.545864661654136e-06, "loss": 0.1627, "step": 42920 }, { "epoch": 64.56, "grad_norm": 5.8364105224609375, "learning_rate": 3.544360902255639e-06, "loss": 0.1859, "step": 42930 }, { "epoch": 64.57, "grad_norm": 8.85914134979248, "learning_rate": 3.542857142857143e-06, "loss": 0.2357, "step": 42940 }, { "epoch": 64.59, "grad_norm": 6.398134708404541, "learning_rate": 3.541353383458647e-06, "loss": 0.2228, "step": 42950 }, { "epoch": 64.6, "grad_norm": 1.9391348361968994, "learning_rate": 3.5398496240601503e-06, "loss": 0.1869, "step": 42960 }, { "epoch": 64.62, "grad_norm": 4.14783239364624, "learning_rate": 3.5383458646616543e-06, "loss": 0.2012, "step": 42970 }, { "epoch": 64.63, "grad_norm": 2.7045633792877197, "learning_rate": 3.536842105263158e-06, "loss": 0.1754, "step": 42980 }, { "epoch": 64.65, "grad_norm": 4.491335391998291, "learning_rate": 3.5353383458646617e-06, "loss": 0.2195, "step": 42990 }, { "epoch": 64.66, "grad_norm": 5.1978678703308105, "learning_rate": 3.5338345864661657e-06, "loss": 0.1891, "step": 43000 }, { "epoch": 64.68, "grad_norm": 1.4553602933883667, "learning_rate": 3.532330827067669e-06, "loss": 0.1871, "step": 43010 }, { "epoch": 64.69, "grad_norm": 3.4015984535217285, "learning_rate": 3.530827067669173e-06, "loss": 0.2434, "step": 43020 }, { "epoch": 64.71, "grad_norm": 6.651217937469482, "learning_rate": 3.5293233082706767e-06, "loss": 0.1889, "step": 43030 }, { "epoch": 64.72, "grad_norm": 4.7721848487854, "learning_rate": 3.5278195488721806e-06, "loss": 0.222, "step": 43040 }, { "epoch": 64.74, "grad_norm": 2.7029495239257812, "learning_rate": 3.5263157894736846e-06, "loss": 0.1711, "step": 43050 }, { "epoch": 64.75, "grad_norm": 11.2900972366333, "learning_rate": 3.524812030075188e-06, "loss": 0.19, "step": 43060 }, { "epoch": 64.77, "grad_norm": 2.7460989952087402, "learning_rate": 3.523308270676692e-06, "loss": 0.213, "step": 43070 }, { "epoch": 64.78, "grad_norm": 3.8076796531677246, "learning_rate": 3.5218045112781955e-06, "loss": 0.1665, "step": 43080 }, { "epoch": 64.8, "grad_norm": 4.072009086608887, "learning_rate": 3.5203007518796995e-06, "loss": 0.1745, "step": 43090 }, { "epoch": 64.81, "grad_norm": 2.836899995803833, "learning_rate": 3.5187969924812034e-06, "loss": 0.1815, "step": 43100 }, { "epoch": 64.83, "grad_norm": 6.04483699798584, "learning_rate": 3.517293233082707e-06, "loss": 0.1649, "step": 43110 }, { "epoch": 64.84, "grad_norm": 3.882234811782837, "learning_rate": 3.515789473684211e-06, "loss": 0.1793, "step": 43120 }, { "epoch": 64.86, "grad_norm": 6.106405735015869, "learning_rate": 3.5142857142857144e-06, "loss": 0.2961, "step": 43130 }, { "epoch": 64.87, "grad_norm": 3.823786973953247, "learning_rate": 3.5127819548872183e-06, "loss": 0.1846, "step": 43140 }, { "epoch": 64.89, "grad_norm": 4.050282955169678, "learning_rate": 3.511278195488722e-06, "loss": 0.2784, "step": 43150 }, { "epoch": 64.9, "grad_norm": 4.701742649078369, "learning_rate": 3.509774436090226e-06, "loss": 0.1875, "step": 43160 }, { "epoch": 64.92, "grad_norm": 6.7218708992004395, "learning_rate": 3.5082706766917297e-06, "loss": 0.1865, "step": 43170 }, { "epoch": 64.93, "grad_norm": 4.6365132331848145, "learning_rate": 3.5067669172932333e-06, "loss": 0.2224, "step": 43180 }, { "epoch": 64.95, "grad_norm": 8.348788261413574, "learning_rate": 3.505263157894737e-06, "loss": 0.2309, "step": 43190 }, { "epoch": 64.96, "grad_norm": 6.804657936096191, "learning_rate": 3.5037593984962407e-06, "loss": 0.1802, "step": 43200 }, { "epoch": 64.98, "grad_norm": 6.6085405349731445, "learning_rate": 3.5022556390977447e-06, "loss": 0.2321, "step": 43210 }, { "epoch": 64.99, "grad_norm": 6.796005725860596, "learning_rate": 3.5007518796992486e-06, "loss": 0.2108, "step": 43220 }, { "epoch": 65.0, "eval_accuracy": 0.9313, "eval_loss": 0.31611478328704834, "eval_runtime": 84.6549, "eval_samples_per_second": 118.127, "eval_steps_per_second": 0.473, "step": 43225 }, { "epoch": 65.01, "grad_norm": 33.686763763427734, "learning_rate": 3.499248120300752e-06, "loss": 0.2085, "step": 43230 }, { "epoch": 65.02, "grad_norm": 4.142588138580322, "learning_rate": 3.497744360902256e-06, "loss": 0.1216, "step": 43240 }, { "epoch": 65.04, "grad_norm": 4.3597612380981445, "learning_rate": 3.4962406015037596e-06, "loss": 0.202, "step": 43250 }, { "epoch": 65.05, "grad_norm": 5.954403877258301, "learning_rate": 3.4947368421052635e-06, "loss": 0.1543, "step": 43260 }, { "epoch": 65.07, "grad_norm": 4.9531474113464355, "learning_rate": 3.4932330827067675e-06, "loss": 0.2382, "step": 43270 }, { "epoch": 65.08, "grad_norm": 4.433995723724365, "learning_rate": 3.491729323308271e-06, "loss": 0.2213, "step": 43280 }, { "epoch": 65.1, "grad_norm": 9.919368743896484, "learning_rate": 3.490225563909775e-06, "loss": 0.2256, "step": 43290 }, { "epoch": 65.11, "grad_norm": 1.1881593465805054, "learning_rate": 3.4887218045112785e-06, "loss": 0.2056, "step": 43300 }, { "epoch": 65.13, "grad_norm": 5.632315635681152, "learning_rate": 3.4872180451127824e-06, "loss": 0.2989, "step": 43310 }, { "epoch": 65.14, "grad_norm": 7.14915132522583, "learning_rate": 3.4857142857142863e-06, "loss": 0.2547, "step": 43320 }, { "epoch": 65.16, "grad_norm": 3.911741018295288, "learning_rate": 3.48421052631579e-06, "loss": 0.2429, "step": 43330 }, { "epoch": 65.17, "grad_norm": 2.1838467121124268, "learning_rate": 3.482706766917294e-06, "loss": 0.1845, "step": 43340 }, { "epoch": 65.19, "grad_norm": 3.118250846862793, "learning_rate": 3.4812030075187973e-06, "loss": 0.2137, "step": 43350 }, { "epoch": 65.2, "grad_norm": 4.089922904968262, "learning_rate": 3.4796992481203013e-06, "loss": 0.1868, "step": 43360 }, { "epoch": 65.22, "grad_norm": 5.602779388427734, "learning_rate": 3.478195488721805e-06, "loss": 0.17, "step": 43370 }, { "epoch": 65.23, "grad_norm": 8.60183334350586, "learning_rate": 3.4766917293233087e-06, "loss": 0.2105, "step": 43380 }, { "epoch": 65.25, "grad_norm": 5.107520580291748, "learning_rate": 3.4751879699248127e-06, "loss": 0.2588, "step": 43390 }, { "epoch": 65.26, "grad_norm": 10.866013526916504, "learning_rate": 3.473684210526316e-06, "loss": 0.1978, "step": 43400 }, { "epoch": 65.28, "grad_norm": 4.005733489990234, "learning_rate": 3.47218045112782e-06, "loss": 0.1811, "step": 43410 }, { "epoch": 65.29, "grad_norm": 3.853426694869995, "learning_rate": 3.470676691729324e-06, "loss": 0.2217, "step": 43420 }, { "epoch": 65.31, "grad_norm": 5.65993070602417, "learning_rate": 3.469172932330827e-06, "loss": 0.216, "step": 43430 }, { "epoch": 65.32, "grad_norm": 3.9332337379455566, "learning_rate": 3.4676691729323307e-06, "loss": 0.2144, "step": 43440 }, { "epoch": 65.34, "grad_norm": 5.378814697265625, "learning_rate": 3.4661654135338346e-06, "loss": 0.1719, "step": 43450 }, { "epoch": 65.35, "grad_norm": 4.357859134674072, "learning_rate": 3.4646616541353386e-06, "loss": 0.1897, "step": 43460 }, { "epoch": 65.37, "grad_norm": 3.1881186962127686, "learning_rate": 3.463157894736842e-06, "loss": 0.1678, "step": 43470 }, { "epoch": 65.38, "grad_norm": 3.442866325378418, "learning_rate": 3.461654135338346e-06, "loss": 0.1805, "step": 43480 }, { "epoch": 65.4, "grad_norm": 9.554397583007812, "learning_rate": 3.4601503759398496e-06, "loss": 0.178, "step": 43490 }, { "epoch": 65.41, "grad_norm": 3.902580976486206, "learning_rate": 3.4586466165413535e-06, "loss": 0.1855, "step": 43500 }, { "epoch": 65.43, "grad_norm": 5.2838850021362305, "learning_rate": 3.4571428571428574e-06, "loss": 0.2158, "step": 43510 }, { "epoch": 65.44, "grad_norm": 3.7892587184906006, "learning_rate": 3.455639097744361e-06, "loss": 0.1871, "step": 43520 }, { "epoch": 65.46, "grad_norm": 3.122152090072632, "learning_rate": 3.454135338345865e-06, "loss": 0.142, "step": 43530 }, { "epoch": 65.47, "grad_norm": 5.339186191558838, "learning_rate": 3.4526315789473684e-06, "loss": 0.1896, "step": 43540 }, { "epoch": 65.49, "grad_norm": 6.880535125732422, "learning_rate": 3.4511278195488724e-06, "loss": 0.238, "step": 43550 }, { "epoch": 65.5, "grad_norm": 4.716340065002441, "learning_rate": 3.4496240601503763e-06, "loss": 0.1584, "step": 43560 }, { "epoch": 65.52, "grad_norm": 5.00018835067749, "learning_rate": 3.44812030075188e-06, "loss": 0.211, "step": 43570 }, { "epoch": 65.53, "grad_norm": 2.4130773544311523, "learning_rate": 3.4466165413533838e-06, "loss": 0.2621, "step": 43580 }, { "epoch": 65.55, "grad_norm": 0.7229984998703003, "learning_rate": 3.4451127819548873e-06, "loss": 0.213, "step": 43590 }, { "epoch": 65.56, "grad_norm": 7.815069675445557, "learning_rate": 3.4436090225563912e-06, "loss": 0.2066, "step": 43600 }, { "epoch": 65.58, "grad_norm": 4.840019702911377, "learning_rate": 3.4421052631578947e-06, "loss": 0.182, "step": 43610 }, { "epoch": 65.59, "grad_norm": 5.94028377532959, "learning_rate": 3.4406015037593987e-06, "loss": 0.1918, "step": 43620 }, { "epoch": 65.61, "grad_norm": 5.4512939453125, "learning_rate": 3.4390977443609026e-06, "loss": 0.2402, "step": 43630 }, { "epoch": 65.62, "grad_norm": 4.47932243347168, "learning_rate": 3.437593984962406e-06, "loss": 0.206, "step": 43640 }, { "epoch": 65.64, "grad_norm": 5.5348310470581055, "learning_rate": 3.43609022556391e-06, "loss": 0.238, "step": 43650 }, { "epoch": 65.65, "grad_norm": 6.362865924835205, "learning_rate": 3.4345864661654136e-06, "loss": 0.1752, "step": 43660 }, { "epoch": 65.67, "grad_norm": 6.699802875518799, "learning_rate": 3.4330827067669176e-06, "loss": 0.2295, "step": 43670 }, { "epoch": 65.68, "grad_norm": 2.2860307693481445, "learning_rate": 3.4315789473684215e-06, "loss": 0.1659, "step": 43680 }, { "epoch": 65.7, "grad_norm": 2.297809362411499, "learning_rate": 3.430075187969925e-06, "loss": 0.1341, "step": 43690 }, { "epoch": 65.71, "grad_norm": 7.806978702545166, "learning_rate": 3.428571428571429e-06, "loss": 0.1756, "step": 43700 }, { "epoch": 65.73, "grad_norm": 8.060872077941895, "learning_rate": 3.4270676691729325e-06, "loss": 0.208, "step": 43710 }, { "epoch": 65.74, "grad_norm": 4.351926803588867, "learning_rate": 3.4255639097744364e-06, "loss": 0.1765, "step": 43720 }, { "epoch": 65.76, "grad_norm": 6.094818592071533, "learning_rate": 3.4240601503759404e-06, "loss": 0.219, "step": 43730 }, { "epoch": 65.77, "grad_norm": 5.638463497161865, "learning_rate": 3.422556390977444e-06, "loss": 0.2396, "step": 43740 }, { "epoch": 65.79, "grad_norm": 3.789339303970337, "learning_rate": 3.421052631578948e-06, "loss": 0.1845, "step": 43750 }, { "epoch": 65.8, "grad_norm": 7.909036636352539, "learning_rate": 3.4195488721804513e-06, "loss": 0.2344, "step": 43760 }, { "epoch": 65.82, "grad_norm": 6.027281761169434, "learning_rate": 3.4180451127819553e-06, "loss": 0.1474, "step": 43770 }, { "epoch": 65.83, "grad_norm": 5.84333610534668, "learning_rate": 3.4165413533834592e-06, "loss": 0.147, "step": 43780 }, { "epoch": 65.85, "grad_norm": 8.227249145507812, "learning_rate": 3.4150375939849627e-06, "loss": 0.2314, "step": 43790 }, { "epoch": 65.86, "grad_norm": 3.2717232704162598, "learning_rate": 3.4135338345864667e-06, "loss": 0.2419, "step": 43800 }, { "epoch": 65.88, "grad_norm": 5.8105926513671875, "learning_rate": 3.41203007518797e-06, "loss": 0.2375, "step": 43810 }, { "epoch": 65.89, "grad_norm": 3.668238878250122, "learning_rate": 3.410526315789474e-06, "loss": 0.234, "step": 43820 }, { "epoch": 65.91, "grad_norm": 5.884802341461182, "learning_rate": 3.409022556390978e-06, "loss": 0.1602, "step": 43830 }, { "epoch": 65.92, "grad_norm": 3.720730781555176, "learning_rate": 3.4075187969924816e-06, "loss": 0.2016, "step": 43840 }, { "epoch": 65.94, "grad_norm": 3.9387388229370117, "learning_rate": 3.4060150375939856e-06, "loss": 0.1904, "step": 43850 }, { "epoch": 65.95, "grad_norm": 7.595775604248047, "learning_rate": 3.404511278195489e-06, "loss": 0.1867, "step": 43860 }, { "epoch": 65.97, "grad_norm": 4.808709621429443, "learning_rate": 3.403007518796993e-06, "loss": 0.2034, "step": 43870 }, { "epoch": 65.98, "grad_norm": 5.746549606323242, "learning_rate": 3.401503759398497e-06, "loss": 0.1736, "step": 43880 }, { "epoch": 66.0, "grad_norm": 0.12739986181259155, "learning_rate": 3.4000000000000005e-06, "loss": 0.1696, "step": 43890 }, { "epoch": 66.0, "eval_accuracy": 0.929, "eval_loss": 0.32690301537513733, "eval_runtime": 84.7083, "eval_samples_per_second": 118.052, "eval_steps_per_second": 0.472, "step": 43890 }, { "epoch": 66.02, "grad_norm": 7.90481424331665, "learning_rate": 3.3984962406015044e-06, "loss": 0.1554, "step": 43900 }, { "epoch": 66.03, "grad_norm": 6.900293350219727, "learning_rate": 3.396992481203008e-06, "loss": 0.1998, "step": 43910 }, { "epoch": 66.05, "grad_norm": 5.227355003356934, "learning_rate": 3.3954887218045115e-06, "loss": 0.1637, "step": 43920 }, { "epoch": 66.06, "grad_norm": 3.3653359413146973, "learning_rate": 3.393984962406015e-06, "loss": 0.1715, "step": 43930 }, { "epoch": 66.08, "grad_norm": 7.483799934387207, "learning_rate": 3.392481203007519e-06, "loss": 0.1775, "step": 43940 }, { "epoch": 66.09, "grad_norm": 6.125503063201904, "learning_rate": 3.3909774436090224e-06, "loss": 0.2052, "step": 43950 }, { "epoch": 66.11, "grad_norm": 2.947721004486084, "learning_rate": 3.3894736842105264e-06, "loss": 0.1686, "step": 43960 }, { "epoch": 66.12, "grad_norm": 4.430096626281738, "learning_rate": 3.3879699248120303e-06, "loss": 0.1511, "step": 43970 }, { "epoch": 66.14, "grad_norm": 4.3853349685668945, "learning_rate": 3.386466165413534e-06, "loss": 0.2122, "step": 43980 }, { "epoch": 66.15, "grad_norm": 3.0524966716766357, "learning_rate": 3.384962406015038e-06, "loss": 0.138, "step": 43990 }, { "epoch": 66.17, "grad_norm": 2.938044786453247, "learning_rate": 3.3834586466165413e-06, "loss": 0.2325, "step": 44000 }, { "epoch": 66.18, "grad_norm": 4.645559787750244, "learning_rate": 3.3819548872180453e-06, "loss": 0.2521, "step": 44010 }, { "epoch": 66.2, "grad_norm": 7.084641456604004, "learning_rate": 3.380451127819549e-06, "loss": 0.2237, "step": 44020 }, { "epoch": 66.21, "grad_norm": 8.430667877197266, "learning_rate": 3.3789473684210527e-06, "loss": 0.2012, "step": 44030 }, { "epoch": 66.23, "grad_norm": 2.169158697128296, "learning_rate": 3.3774436090225567e-06, "loss": 0.1534, "step": 44040 }, { "epoch": 66.24, "grad_norm": 4.5548553466796875, "learning_rate": 3.37593984962406e-06, "loss": 0.2439, "step": 44050 }, { "epoch": 66.26, "grad_norm": 3.5269806385040283, "learning_rate": 3.374436090225564e-06, "loss": 0.2261, "step": 44060 }, { "epoch": 66.27, "grad_norm": 4.3473334312438965, "learning_rate": 3.3729323308270676e-06, "loss": 0.1643, "step": 44070 }, { "epoch": 66.29, "grad_norm": 1.9924315214157104, "learning_rate": 3.3714285714285716e-06, "loss": 0.1739, "step": 44080 }, { "epoch": 66.3, "grad_norm": 4.501370906829834, "learning_rate": 3.3699248120300755e-06, "loss": 0.2115, "step": 44090 }, { "epoch": 66.32, "grad_norm": 9.435755729675293, "learning_rate": 3.368421052631579e-06, "loss": 0.1906, "step": 44100 }, { "epoch": 66.33, "grad_norm": 5.539870738983154, "learning_rate": 3.366917293233083e-06, "loss": 0.1738, "step": 44110 }, { "epoch": 66.35, "grad_norm": 6.538062572479248, "learning_rate": 3.3654135338345865e-06, "loss": 0.1997, "step": 44120 }, { "epoch": 66.36, "grad_norm": 3.478639602661133, "learning_rate": 3.3639097744360904e-06, "loss": 0.2232, "step": 44130 }, { "epoch": 66.38, "grad_norm": 3.3659911155700684, "learning_rate": 3.3624060150375944e-06, "loss": 0.1294, "step": 44140 }, { "epoch": 66.39, "grad_norm": 1.8332750797271729, "learning_rate": 3.360902255639098e-06, "loss": 0.171, "step": 44150 }, { "epoch": 66.41, "grad_norm": 6.612475872039795, "learning_rate": 3.359398496240602e-06, "loss": 0.2437, "step": 44160 }, { "epoch": 66.42, "grad_norm": 4.539286136627197, "learning_rate": 3.3578947368421054e-06, "loss": 0.1853, "step": 44170 }, { "epoch": 66.44, "grad_norm": 6.16352653503418, "learning_rate": 3.3563909774436093e-06, "loss": 0.2118, "step": 44180 }, { "epoch": 66.45, "grad_norm": 6.134418487548828, "learning_rate": 3.3548872180451133e-06, "loss": 0.2087, "step": 44190 }, { "epoch": 66.47, "grad_norm": 3.634429454803467, "learning_rate": 3.3533834586466168e-06, "loss": 0.1724, "step": 44200 }, { "epoch": 66.48, "grad_norm": 3.4543323516845703, "learning_rate": 3.3518796992481207e-06, "loss": 0.1575, "step": 44210 }, { "epoch": 66.5, "grad_norm": 4.991121768951416, "learning_rate": 3.3503759398496242e-06, "loss": 0.2694, "step": 44220 }, { "epoch": 66.51, "grad_norm": 5.8151936531066895, "learning_rate": 3.348872180451128e-06, "loss": 0.2191, "step": 44230 }, { "epoch": 66.53, "grad_norm": 4.985229969024658, "learning_rate": 3.347368421052632e-06, "loss": 0.2083, "step": 44240 }, { "epoch": 66.54, "grad_norm": 4.52158260345459, "learning_rate": 3.3458646616541356e-06, "loss": 0.2116, "step": 44250 }, { "epoch": 66.56, "grad_norm": 5.498661518096924, "learning_rate": 3.3443609022556396e-06, "loss": 0.2242, "step": 44260 }, { "epoch": 66.57, "grad_norm": 4.108017444610596, "learning_rate": 3.342857142857143e-06, "loss": 0.2415, "step": 44270 }, { "epoch": 66.59, "grad_norm": 7.99500846862793, "learning_rate": 3.341353383458647e-06, "loss": 0.2235, "step": 44280 }, { "epoch": 66.6, "grad_norm": 3.367448329925537, "learning_rate": 3.339849624060151e-06, "loss": 0.1975, "step": 44290 }, { "epoch": 66.62, "grad_norm": 3.148461103439331, "learning_rate": 3.3383458646616545e-06, "loss": 0.1706, "step": 44300 }, { "epoch": 66.63, "grad_norm": 5.184333324432373, "learning_rate": 3.3368421052631584e-06, "loss": 0.158, "step": 44310 }, { "epoch": 66.65, "grad_norm": 8.876405715942383, "learning_rate": 3.335338345864662e-06, "loss": 0.2529, "step": 44320 }, { "epoch": 66.66, "grad_norm": 1.495922565460205, "learning_rate": 3.333834586466166e-06, "loss": 0.1659, "step": 44330 }, { "epoch": 66.68, "grad_norm": 4.976419925689697, "learning_rate": 3.3323308270676694e-06, "loss": 0.2705, "step": 44340 }, { "epoch": 66.69, "grad_norm": 8.409575462341309, "learning_rate": 3.3308270676691734e-06, "loss": 0.237, "step": 44350 }, { "epoch": 66.71, "grad_norm": 11.252154350280762, "learning_rate": 3.3293233082706773e-06, "loss": 0.225, "step": 44360 }, { "epoch": 66.72, "grad_norm": 6.9660820960998535, "learning_rate": 3.327819548872181e-06, "loss": 0.1612, "step": 44370 }, { "epoch": 66.74, "grad_norm": 7.809027671813965, "learning_rate": 3.3263157894736848e-06, "loss": 0.1657, "step": 44380 }, { "epoch": 66.75, "grad_norm": 4.254601955413818, "learning_rate": 3.3248120300751883e-06, "loss": 0.1746, "step": 44390 }, { "epoch": 66.77, "grad_norm": 7.9498291015625, "learning_rate": 3.3233082706766922e-06, "loss": 0.1704, "step": 44400 }, { "epoch": 66.78, "grad_norm": 2.0477476119995117, "learning_rate": 3.321804511278196e-06, "loss": 0.1805, "step": 44410 }, { "epoch": 66.8, "grad_norm": 4.7251715660095215, "learning_rate": 3.3203007518796993e-06, "loss": 0.1856, "step": 44420 }, { "epoch": 66.81, "grad_norm": 3.7293553352355957, "learning_rate": 3.3187969924812032e-06, "loss": 0.1233, "step": 44430 }, { "epoch": 66.83, "grad_norm": 6.081485748291016, "learning_rate": 3.3172932330827067e-06, "loss": 0.1719, "step": 44440 }, { "epoch": 66.84, "grad_norm": 5.9813079833984375, "learning_rate": 3.3157894736842107e-06, "loss": 0.186, "step": 44450 }, { "epoch": 66.86, "grad_norm": 6.273433208465576, "learning_rate": 3.314285714285714e-06, "loss": 0.236, "step": 44460 }, { "epoch": 66.87, "grad_norm": 6.025407314300537, "learning_rate": 3.312781954887218e-06, "loss": 0.2297, "step": 44470 }, { "epoch": 66.89, "grad_norm": 4.0798659324646, "learning_rate": 3.3112781954887217e-06, "loss": 0.1626, "step": 44480 }, { "epoch": 66.9, "grad_norm": 1.6640973091125488, "learning_rate": 3.3097744360902256e-06, "loss": 0.1482, "step": 44490 }, { "epoch": 66.92, "grad_norm": 5.056227207183838, "learning_rate": 3.3082706766917295e-06, "loss": 0.2078, "step": 44500 }, { "epoch": 66.93, "grad_norm": 4.699299335479736, "learning_rate": 3.306766917293233e-06, "loss": 0.2021, "step": 44510 }, { "epoch": 66.95, "grad_norm": 2.141939163208008, "learning_rate": 3.305263157894737e-06, "loss": 0.1897, "step": 44520 }, { "epoch": 66.96, "grad_norm": 4.3747687339782715, "learning_rate": 3.3037593984962405e-06, "loss": 0.2014, "step": 44530 }, { "epoch": 66.98, "grad_norm": 5.673415660858154, "learning_rate": 3.3022556390977445e-06, "loss": 0.2335, "step": 44540 }, { "epoch": 66.99, "grad_norm": 2.832839250564575, "learning_rate": 3.3007518796992484e-06, "loss": 0.1946, "step": 44550 }, { "epoch": 67.0, "eval_accuracy": 0.9302, "eval_loss": 0.330706924200058, "eval_runtime": 84.1739, "eval_samples_per_second": 118.802, "eval_steps_per_second": 0.475, "step": 44555 }, { "epoch": 67.01, "grad_norm": 5.633582592010498, "learning_rate": 3.299248120300752e-06, "loss": 0.1437, "step": 44560 }, { "epoch": 67.02, "grad_norm": 3.9651787281036377, "learning_rate": 3.297744360902256e-06, "loss": 0.157, "step": 44570 }, { "epoch": 67.04, "grad_norm": 3.220197916030884, "learning_rate": 3.2962406015037594e-06, "loss": 0.1883, "step": 44580 }, { "epoch": 67.05, "grad_norm": 6.965760707855225, "learning_rate": 3.2947368421052633e-06, "loss": 0.2133, "step": 44590 }, { "epoch": 67.07, "grad_norm": 7.967811107635498, "learning_rate": 3.2932330827067673e-06, "loss": 0.1972, "step": 44600 }, { "epoch": 67.08, "grad_norm": 8.779460906982422, "learning_rate": 3.291729323308271e-06, "loss": 0.2013, "step": 44610 }, { "epoch": 67.1, "grad_norm": 4.243956565856934, "learning_rate": 3.2902255639097747e-06, "loss": 0.1606, "step": 44620 }, { "epoch": 67.11, "grad_norm": 6.18154764175415, "learning_rate": 3.2887218045112783e-06, "loss": 0.231, "step": 44630 }, { "epoch": 67.13, "grad_norm": 4.825216293334961, "learning_rate": 3.287218045112782e-06, "loss": 0.2034, "step": 44640 }, { "epoch": 67.14, "grad_norm": 4.287315845489502, "learning_rate": 3.285714285714286e-06, "loss": 0.1463, "step": 44650 }, { "epoch": 67.16, "grad_norm": 4.091511249542236, "learning_rate": 3.2842105263157897e-06, "loss": 0.1423, "step": 44660 }, { "epoch": 67.17, "grad_norm": 4.6609978675842285, "learning_rate": 3.2827067669172936e-06, "loss": 0.2448, "step": 44670 }, { "epoch": 67.19, "grad_norm": 7.273349761962891, "learning_rate": 3.281203007518797e-06, "loss": 0.1618, "step": 44680 }, { "epoch": 67.2, "grad_norm": 6.079645156860352, "learning_rate": 3.279699248120301e-06, "loss": 0.2034, "step": 44690 }, { "epoch": 67.22, "grad_norm": 4.420186996459961, "learning_rate": 3.278195488721805e-06, "loss": 0.1605, "step": 44700 }, { "epoch": 67.23, "grad_norm": 2.5898821353912354, "learning_rate": 3.2766917293233085e-06, "loss": 0.1425, "step": 44710 }, { "epoch": 67.25, "grad_norm": 5.7673468589782715, "learning_rate": 3.2751879699248125e-06, "loss": 0.1487, "step": 44720 }, { "epoch": 67.26, "grad_norm": 4.787065505981445, "learning_rate": 3.273684210526316e-06, "loss": 0.2058, "step": 44730 }, { "epoch": 67.28, "grad_norm": 8.583816528320312, "learning_rate": 3.27218045112782e-06, "loss": 0.194, "step": 44740 }, { "epoch": 67.29, "grad_norm": 6.30866813659668, "learning_rate": 3.270676691729324e-06, "loss": 0.2036, "step": 44750 }, { "epoch": 67.31, "grad_norm": 3.9293291568756104, "learning_rate": 3.2691729323308274e-06, "loss": 0.1755, "step": 44760 }, { "epoch": 67.32, "grad_norm": 6.662651538848877, "learning_rate": 3.2676691729323313e-06, "loss": 0.1482, "step": 44770 }, { "epoch": 67.34, "grad_norm": 4.203287124633789, "learning_rate": 3.266165413533835e-06, "loss": 0.263, "step": 44780 }, { "epoch": 67.35, "grad_norm": 5.568451881408691, "learning_rate": 3.264661654135339e-06, "loss": 0.1302, "step": 44790 }, { "epoch": 67.37, "grad_norm": 11.907421112060547, "learning_rate": 3.2631578947368423e-06, "loss": 0.2896, "step": 44800 }, { "epoch": 67.38, "grad_norm": 7.300894737243652, "learning_rate": 3.2616541353383463e-06, "loss": 0.2324, "step": 44810 }, { "epoch": 67.4, "grad_norm": 0.4205104112625122, "learning_rate": 3.26015037593985e-06, "loss": 0.1475, "step": 44820 }, { "epoch": 67.41, "grad_norm": 2.742250919342041, "learning_rate": 3.2586466165413537e-06, "loss": 0.223, "step": 44830 }, { "epoch": 67.43, "grad_norm": 4.337477684020996, "learning_rate": 3.2571428571428577e-06, "loss": 0.1887, "step": 44840 }, { "epoch": 67.44, "grad_norm": 6.059717178344727, "learning_rate": 3.255639097744361e-06, "loss": 0.2286, "step": 44850 }, { "epoch": 67.46, "grad_norm": 8.457310676574707, "learning_rate": 3.254135338345865e-06, "loss": 0.1812, "step": 44860 }, { "epoch": 67.47, "grad_norm": 2.976374626159668, "learning_rate": 3.252631578947369e-06, "loss": 0.2129, "step": 44870 }, { "epoch": 67.49, "grad_norm": 5.421755790710449, "learning_rate": 3.2511278195488726e-06, "loss": 0.1674, "step": 44880 }, { "epoch": 67.5, "grad_norm": 7.67271614074707, "learning_rate": 3.2496240601503765e-06, "loss": 0.1708, "step": 44890 }, { "epoch": 67.52, "grad_norm": 7.604538440704346, "learning_rate": 3.24812030075188e-06, "loss": 0.205, "step": 44900 }, { "epoch": 67.53, "grad_norm": 5.754815578460693, "learning_rate": 3.246616541353384e-06, "loss": 0.1922, "step": 44910 }, { "epoch": 67.55, "grad_norm": 5.512721061706543, "learning_rate": 3.245112781954887e-06, "loss": 0.1671, "step": 44920 }, { "epoch": 67.56, "grad_norm": 5.614846706390381, "learning_rate": 3.243609022556391e-06, "loss": 0.1837, "step": 44930 }, { "epoch": 67.58, "grad_norm": 6.0121612548828125, "learning_rate": 3.2421052631578945e-06, "loss": 0.221, "step": 44940 }, { "epoch": 67.59, "grad_norm": 4.721285820007324, "learning_rate": 3.2406015037593985e-06, "loss": 0.1704, "step": 44950 }, { "epoch": 67.61, "grad_norm": 5.494776248931885, "learning_rate": 3.2390977443609024e-06, "loss": 0.2032, "step": 44960 }, { "epoch": 67.62, "grad_norm": 3.387678861618042, "learning_rate": 3.237593984962406e-06, "loss": 0.2162, "step": 44970 }, { "epoch": 67.64, "grad_norm": 5.2446417808532715, "learning_rate": 3.23609022556391e-06, "loss": 0.138, "step": 44980 }, { "epoch": 67.65, "grad_norm": 6.1764349937438965, "learning_rate": 3.2345864661654134e-06, "loss": 0.1532, "step": 44990 }, { "epoch": 67.67, "grad_norm": 4.519616603851318, "learning_rate": 3.2330827067669174e-06, "loss": 0.1948, "step": 45000 }, { "epoch": 67.68, "grad_norm": 7.864477634429932, "learning_rate": 3.2315789473684213e-06, "loss": 0.1927, "step": 45010 }, { "epoch": 67.7, "grad_norm": 2.657796859741211, "learning_rate": 3.230075187969925e-06, "loss": 0.1907, "step": 45020 }, { "epoch": 67.71, "grad_norm": 6.408271312713623, "learning_rate": 3.2285714285714288e-06, "loss": 0.1843, "step": 45030 }, { "epoch": 67.73, "grad_norm": 4.096652984619141, "learning_rate": 3.2270676691729323e-06, "loss": 0.174, "step": 45040 }, { "epoch": 67.74, "grad_norm": 4.657118797302246, "learning_rate": 3.2255639097744362e-06, "loss": 0.1888, "step": 45050 }, { "epoch": 67.76, "grad_norm": 4.9201860427856445, "learning_rate": 3.22406015037594e-06, "loss": 0.2123, "step": 45060 }, { "epoch": 67.77, "grad_norm": 5.590874195098877, "learning_rate": 3.2225563909774437e-06, "loss": 0.1487, "step": 45070 }, { "epoch": 67.79, "grad_norm": 7.1157050132751465, "learning_rate": 3.2210526315789476e-06, "loss": 0.193, "step": 45080 }, { "epoch": 67.8, "grad_norm": 2.877906560897827, "learning_rate": 3.219548872180451e-06, "loss": 0.1419, "step": 45090 }, { "epoch": 67.82, "grad_norm": 5.331236839294434, "learning_rate": 3.218045112781955e-06, "loss": 0.1581, "step": 45100 }, { "epoch": 67.83, "grad_norm": 6.2138991355896, "learning_rate": 3.216541353383459e-06, "loss": 0.201, "step": 45110 }, { "epoch": 67.85, "grad_norm": 3.055180072784424, "learning_rate": 3.2150375939849625e-06, "loss": 0.1932, "step": 45120 }, { "epoch": 67.86, "grad_norm": 5.881607532501221, "learning_rate": 3.2135338345864665e-06, "loss": 0.2327, "step": 45130 }, { "epoch": 67.88, "grad_norm": 7.188892841339111, "learning_rate": 3.21203007518797e-06, "loss": 0.1651, "step": 45140 }, { "epoch": 67.89, "grad_norm": 3.549654960632324, "learning_rate": 3.210526315789474e-06, "loss": 0.2572, "step": 45150 }, { "epoch": 67.91, "grad_norm": 9.034676551818848, "learning_rate": 3.209022556390978e-06, "loss": 0.1358, "step": 45160 }, { "epoch": 67.92, "grad_norm": 4.870662689208984, "learning_rate": 3.2075187969924814e-06, "loss": 0.207, "step": 45170 }, { "epoch": 67.94, "grad_norm": 1.617182731628418, "learning_rate": 3.2060150375939854e-06, "loss": 0.2525, "step": 45180 }, { "epoch": 67.95, "grad_norm": 2.620441198348999, "learning_rate": 3.204511278195489e-06, "loss": 0.1688, "step": 45190 }, { "epoch": 67.97, "grad_norm": 9.695724487304688, "learning_rate": 3.203007518796993e-06, "loss": 0.233, "step": 45200 }, { "epoch": 67.98, "grad_norm": 6.005362033843994, "learning_rate": 3.2015037593984968e-06, "loss": 0.1813, "step": 45210 }, { "epoch": 68.0, "grad_norm": 0.01271333172917366, "learning_rate": 3.2000000000000003e-06, "loss": 0.1492, "step": 45220 }, { "epoch": 68.0, "eval_accuracy": 0.9296, "eval_loss": 0.32478708028793335, "eval_runtime": 84.6456, "eval_samples_per_second": 118.14, "eval_steps_per_second": 0.473, "step": 45220 }, { "epoch": 68.02, "grad_norm": 9.229415893554688, "learning_rate": 3.1984962406015042e-06, "loss": 0.2163, "step": 45230 }, { "epoch": 68.03, "grad_norm": 5.0831298828125, "learning_rate": 3.1969924812030077e-06, "loss": 0.2027, "step": 45240 }, { "epoch": 68.05, "grad_norm": 4.618161678314209, "learning_rate": 3.1954887218045117e-06, "loss": 0.2085, "step": 45250 }, { "epoch": 68.06, "grad_norm": 2.447524070739746, "learning_rate": 3.193984962406015e-06, "loss": 0.1593, "step": 45260 }, { "epoch": 68.08, "grad_norm": 5.367290496826172, "learning_rate": 3.192481203007519e-06, "loss": 0.1527, "step": 45270 }, { "epoch": 68.09, "grad_norm": 3.5726702213287354, "learning_rate": 3.190977443609023e-06, "loss": 0.1963, "step": 45280 }, { "epoch": 68.11, "grad_norm": 6.074667930603027, "learning_rate": 3.1894736842105266e-06, "loss": 0.2226, "step": 45290 }, { "epoch": 68.12, "grad_norm": 6.223268508911133, "learning_rate": 3.1879699248120305e-06, "loss": 0.277, "step": 45300 }, { "epoch": 68.14, "grad_norm": 4.411190509796143, "learning_rate": 3.186466165413534e-06, "loss": 0.1662, "step": 45310 }, { "epoch": 68.15, "grad_norm": 5.630961894989014, "learning_rate": 3.184962406015038e-06, "loss": 0.2317, "step": 45320 }, { "epoch": 68.17, "grad_norm": 6.117316246032715, "learning_rate": 3.183458646616542e-06, "loss": 0.2044, "step": 45330 }, { "epoch": 68.18, "grad_norm": 6.058127403259277, "learning_rate": 3.1819548872180455e-06, "loss": 0.1925, "step": 45340 }, { "epoch": 68.2, "grad_norm": 4.601078033447266, "learning_rate": 3.1804511278195494e-06, "loss": 0.2433, "step": 45350 }, { "epoch": 68.21, "grad_norm": 11.11681842803955, "learning_rate": 3.178947368421053e-06, "loss": 0.2035, "step": 45360 }, { "epoch": 68.23, "grad_norm": 4.8060503005981445, "learning_rate": 3.177443609022557e-06, "loss": 0.1838, "step": 45370 }, { "epoch": 68.24, "grad_norm": 3.692517042160034, "learning_rate": 3.175939849624061e-06, "loss": 0.1765, "step": 45380 }, { "epoch": 68.26, "grad_norm": 7.551518440246582, "learning_rate": 3.1744360902255643e-06, "loss": 0.1897, "step": 45390 }, { "epoch": 68.27, "grad_norm": 4.985076427459717, "learning_rate": 3.1729323308270683e-06, "loss": 0.216, "step": 45400 }, { "epoch": 68.29, "grad_norm": 8.353325843811035, "learning_rate": 3.1714285714285714e-06, "loss": 0.196, "step": 45410 }, { "epoch": 68.3, "grad_norm": 6.800642490386963, "learning_rate": 3.1699248120300753e-06, "loss": 0.1798, "step": 45420 }, { "epoch": 68.32, "grad_norm": 7.222949028015137, "learning_rate": 3.168421052631579e-06, "loss": 0.1988, "step": 45430 }, { "epoch": 68.33, "grad_norm": 7.158170700073242, "learning_rate": 3.1669172932330828e-06, "loss": 0.2191, "step": 45440 }, { "epoch": 68.35, "grad_norm": 5.271254062652588, "learning_rate": 3.1654135338345863e-06, "loss": 0.2169, "step": 45450 }, { "epoch": 68.36, "grad_norm": 2.8725428581237793, "learning_rate": 3.1639097744360902e-06, "loss": 0.2091, "step": 45460 }, { "epoch": 68.38, "grad_norm": 6.836148262023926, "learning_rate": 3.162406015037594e-06, "loss": 0.2319, "step": 45470 }, { "epoch": 68.39, "grad_norm": 3.735705614089966, "learning_rate": 3.1609022556390977e-06, "loss": 0.2037, "step": 45480 }, { "epoch": 68.41, "grad_norm": 6.039109706878662, "learning_rate": 3.1593984962406016e-06, "loss": 0.2016, "step": 45490 }, { "epoch": 68.42, "grad_norm": 6.477905750274658, "learning_rate": 3.157894736842105e-06, "loss": 0.185, "step": 45500 }, { "epoch": 68.44, "grad_norm": 5.0428900718688965, "learning_rate": 3.156390977443609e-06, "loss": 0.192, "step": 45510 }, { "epoch": 68.45, "grad_norm": 4.29931640625, "learning_rate": 3.154887218045113e-06, "loss": 0.1585, "step": 45520 }, { "epoch": 68.47, "grad_norm": 4.26162052154541, "learning_rate": 3.1533834586466166e-06, "loss": 0.1979, "step": 45530 }, { "epoch": 68.48, "grad_norm": 6.1737380027771, "learning_rate": 3.1518796992481205e-06, "loss": 0.1783, "step": 45540 }, { "epoch": 68.5, "grad_norm": 3.1564579010009766, "learning_rate": 3.150375939849624e-06, "loss": 0.1443, "step": 45550 }, { "epoch": 68.51, "grad_norm": 2.920849323272705, "learning_rate": 3.148872180451128e-06, "loss": 0.2208, "step": 45560 }, { "epoch": 68.53, "grad_norm": 4.952502727508545, "learning_rate": 3.147368421052632e-06, "loss": 0.1836, "step": 45570 }, { "epoch": 68.54, "grad_norm": 2.473483085632324, "learning_rate": 3.1458646616541354e-06, "loss": 0.1324, "step": 45580 }, { "epoch": 68.56, "grad_norm": 5.73502779006958, "learning_rate": 3.1443609022556394e-06, "loss": 0.2821, "step": 45590 }, { "epoch": 68.57, "grad_norm": 3.3458425998687744, "learning_rate": 3.142857142857143e-06, "loss": 0.1789, "step": 45600 }, { "epoch": 68.59, "grad_norm": 5.300927639007568, "learning_rate": 3.141353383458647e-06, "loss": 0.2018, "step": 45610 }, { "epoch": 68.6, "grad_norm": 6.269550323486328, "learning_rate": 3.1398496240601508e-06, "loss": 0.1991, "step": 45620 }, { "epoch": 68.62, "grad_norm": 6.622176170349121, "learning_rate": 3.1383458646616543e-06, "loss": 0.1805, "step": 45630 }, { "epoch": 68.63, "grad_norm": 7.759119987487793, "learning_rate": 3.1368421052631582e-06, "loss": 0.2455, "step": 45640 }, { "epoch": 68.65, "grad_norm": 5.007221221923828, "learning_rate": 3.1353383458646618e-06, "loss": 0.1756, "step": 45650 }, { "epoch": 68.66, "grad_norm": 5.451401710510254, "learning_rate": 3.1338345864661657e-06, "loss": 0.2058, "step": 45660 }, { "epoch": 68.68, "grad_norm": 0.3662077784538269, "learning_rate": 3.1323308270676696e-06, "loss": 0.2024, "step": 45670 }, { "epoch": 68.69, "grad_norm": 7.444847583770752, "learning_rate": 3.130827067669173e-06, "loss": 0.2119, "step": 45680 }, { "epoch": 68.71, "grad_norm": 4.96779203414917, "learning_rate": 3.129323308270677e-06, "loss": 0.2708, "step": 45690 }, { "epoch": 68.72, "grad_norm": 5.92564582824707, "learning_rate": 3.1278195488721806e-06, "loss": 0.1653, "step": 45700 }, { "epoch": 68.74, "grad_norm": 5.052578449249268, "learning_rate": 3.1263157894736846e-06, "loss": 0.1841, "step": 45710 }, { "epoch": 68.75, "grad_norm": 3.4545114040374756, "learning_rate": 3.124812030075188e-06, "loss": 0.2163, "step": 45720 }, { "epoch": 68.77, "grad_norm": 5.480828762054443, "learning_rate": 3.123308270676692e-06, "loss": 0.1242, "step": 45730 }, { "epoch": 68.78, "grad_norm": 4.088112831115723, "learning_rate": 3.121804511278196e-06, "loss": 0.1112, "step": 45740 }, { "epoch": 68.8, "grad_norm": 4.465242862701416, "learning_rate": 3.1203007518796995e-06, "loss": 0.2095, "step": 45750 }, { "epoch": 68.81, "grad_norm": 7.828632831573486, "learning_rate": 3.1187969924812034e-06, "loss": 0.246, "step": 45760 }, { "epoch": 68.83, "grad_norm": 4.115504741668701, "learning_rate": 3.117293233082707e-06, "loss": 0.1527, "step": 45770 }, { "epoch": 68.84, "grad_norm": 4.330153465270996, "learning_rate": 3.115789473684211e-06, "loss": 0.1638, "step": 45780 }, { "epoch": 68.86, "grad_norm": 5.463741302490234, "learning_rate": 3.114285714285715e-06, "loss": 0.2238, "step": 45790 }, { "epoch": 68.87, "grad_norm": 7.970693111419678, "learning_rate": 3.1127819548872184e-06, "loss": 0.1802, "step": 45800 }, { "epoch": 68.89, "grad_norm": 4.3561577796936035, "learning_rate": 3.1112781954887223e-06, "loss": 0.2137, "step": 45810 }, { "epoch": 68.9, "grad_norm": 5.0850958824157715, "learning_rate": 3.109774436090226e-06, "loss": 0.1676, "step": 45820 }, { "epoch": 68.92, "grad_norm": 4.8231730461120605, "learning_rate": 3.1082706766917298e-06, "loss": 0.1825, "step": 45830 }, { "epoch": 68.93, "grad_norm": 3.689068555831909, "learning_rate": 3.1067669172932337e-06, "loss": 0.1906, "step": 45840 }, { "epoch": 68.95, "grad_norm": 3.177267074584961, "learning_rate": 3.1052631578947372e-06, "loss": 0.1653, "step": 45850 }, { "epoch": 68.96, "grad_norm": 3.5612058639526367, "learning_rate": 3.103759398496241e-06, "loss": 0.1703, "step": 45860 }, { "epoch": 68.98, "grad_norm": 7.400056838989258, "learning_rate": 3.1022556390977447e-06, "loss": 0.1732, "step": 45870 }, { "epoch": 68.99, "grad_norm": 8.14755916595459, "learning_rate": 3.1007518796992486e-06, "loss": 0.223, "step": 45880 }, { "epoch": 69.0, "eval_accuracy": 0.9293, "eval_loss": 0.33156681060791016, "eval_runtime": 84.519, "eval_samples_per_second": 118.317, "eval_steps_per_second": 0.473, "step": 45885 }, { "epoch": 69.01, "grad_norm": 4.614025592803955, "learning_rate": 3.0992481203007526e-06, "loss": 0.1577, "step": 45890 }, { "epoch": 69.02, "grad_norm": 5.442816257476807, "learning_rate": 3.097744360902256e-06, "loss": 0.2265, "step": 45900 }, { "epoch": 69.04, "grad_norm": 5.297948837280273, "learning_rate": 3.096240601503759e-06, "loss": 0.199, "step": 45910 }, { "epoch": 69.05, "grad_norm": 5.042107582092285, "learning_rate": 3.094736842105263e-06, "loss": 0.1949, "step": 45920 }, { "epoch": 69.07, "grad_norm": 6.529865264892578, "learning_rate": 3.093233082706767e-06, "loss": 0.2015, "step": 45930 }, { "epoch": 69.08, "grad_norm": 6.556023120880127, "learning_rate": 3.0917293233082706e-06, "loss": 0.2051, "step": 45940 }, { "epoch": 69.1, "grad_norm": 5.027461051940918, "learning_rate": 3.0902255639097745e-06, "loss": 0.1894, "step": 45950 }, { "epoch": 69.11, "grad_norm": 3.367723226547241, "learning_rate": 3.088721804511278e-06, "loss": 0.1466, "step": 45960 }, { "epoch": 69.13, "grad_norm": 4.521204471588135, "learning_rate": 3.087218045112782e-06, "loss": 0.1885, "step": 45970 }, { "epoch": 69.14, "grad_norm": 5.166673183441162, "learning_rate": 3.085714285714286e-06, "loss": 0.1593, "step": 45980 }, { "epoch": 69.16, "grad_norm": 4.3414812088012695, "learning_rate": 3.0842105263157895e-06, "loss": 0.1804, "step": 45990 }, { "epoch": 69.17, "grad_norm": 6.790599822998047, "learning_rate": 3.0827067669172934e-06, "loss": 0.177, "step": 46000 }, { "epoch": 69.19, "grad_norm": 5.932426452636719, "learning_rate": 3.081203007518797e-06, "loss": 0.2503, "step": 46010 }, { "epoch": 69.2, "grad_norm": 3.4542813301086426, "learning_rate": 3.079699248120301e-06, "loss": 0.1616, "step": 46020 }, { "epoch": 69.22, "grad_norm": 3.6076695919036865, "learning_rate": 3.078195488721805e-06, "loss": 0.1794, "step": 46030 }, { "epoch": 69.23, "grad_norm": 6.4292378425598145, "learning_rate": 3.0766917293233083e-06, "loss": 0.1487, "step": 46040 }, { "epoch": 69.25, "grad_norm": 7.210880279541016, "learning_rate": 3.0751879699248123e-06, "loss": 0.1703, "step": 46050 }, { "epoch": 69.26, "grad_norm": 2.0395233631134033, "learning_rate": 3.0736842105263158e-06, "loss": 0.1636, "step": 46060 }, { "epoch": 69.28, "grad_norm": 9.782295227050781, "learning_rate": 3.0721804511278197e-06, "loss": 0.1979, "step": 46070 }, { "epoch": 69.29, "grad_norm": 3.0649471282958984, "learning_rate": 3.0706766917293237e-06, "loss": 0.2242, "step": 46080 }, { "epoch": 69.31, "grad_norm": 7.512526035308838, "learning_rate": 3.069172932330827e-06, "loss": 0.2796, "step": 46090 }, { "epoch": 69.32, "grad_norm": 3.8779592514038086, "learning_rate": 3.067669172932331e-06, "loss": 0.1872, "step": 46100 }, { "epoch": 69.34, "grad_norm": 4.834461212158203, "learning_rate": 3.0661654135338346e-06, "loss": 0.1784, "step": 46110 }, { "epoch": 69.35, "grad_norm": 4.335732460021973, "learning_rate": 3.0646616541353386e-06, "loss": 0.1715, "step": 46120 }, { "epoch": 69.37, "grad_norm": 8.79086971282959, "learning_rate": 3.0631578947368425e-06, "loss": 0.2193, "step": 46130 }, { "epoch": 69.38, "grad_norm": 3.5167338848114014, "learning_rate": 3.061654135338346e-06, "loss": 0.1857, "step": 46140 }, { "epoch": 69.4, "grad_norm": 3.6358115673065186, "learning_rate": 3.06015037593985e-06, "loss": 0.1648, "step": 46150 }, { "epoch": 69.41, "grad_norm": 4.419709205627441, "learning_rate": 3.0586466165413535e-06, "loss": 0.1463, "step": 46160 }, { "epoch": 69.43, "grad_norm": 12.194480895996094, "learning_rate": 3.0571428571428575e-06, "loss": 0.2125, "step": 46170 }, { "epoch": 69.44, "grad_norm": 6.218225002288818, "learning_rate": 3.055639097744361e-06, "loss": 0.1879, "step": 46180 }, { "epoch": 69.46, "grad_norm": 4.367229461669922, "learning_rate": 3.054135338345865e-06, "loss": 0.1527, "step": 46190 }, { "epoch": 69.47, "grad_norm": 4.397371768951416, "learning_rate": 3.052631578947369e-06, "loss": 0.214, "step": 46200 }, { "epoch": 69.49, "grad_norm": 9.216353416442871, "learning_rate": 3.0511278195488724e-06, "loss": 0.2037, "step": 46210 }, { "epoch": 69.5, "grad_norm": 4.541748523712158, "learning_rate": 3.0496240601503763e-06, "loss": 0.1943, "step": 46220 }, { "epoch": 69.52, "grad_norm": 2.6720430850982666, "learning_rate": 3.04812030075188e-06, "loss": 0.1699, "step": 46230 }, { "epoch": 69.53, "grad_norm": 3.862180471420288, "learning_rate": 3.0466165413533838e-06, "loss": 0.1799, "step": 46240 }, { "epoch": 69.55, "grad_norm": 3.2997727394104004, "learning_rate": 3.0451127819548877e-06, "loss": 0.1758, "step": 46250 }, { "epoch": 69.56, "grad_norm": 6.742743015289307, "learning_rate": 3.0436090225563912e-06, "loss": 0.1675, "step": 46260 }, { "epoch": 69.58, "grad_norm": 3.4689953327178955, "learning_rate": 3.042105263157895e-06, "loss": 0.1566, "step": 46270 }, { "epoch": 69.59, "grad_norm": 4.367392063140869, "learning_rate": 3.0406015037593987e-06, "loss": 0.1839, "step": 46280 }, { "epoch": 69.61, "grad_norm": 2.263700008392334, "learning_rate": 3.0390977443609027e-06, "loss": 0.1933, "step": 46290 }, { "epoch": 69.62, "grad_norm": 7.104362487792969, "learning_rate": 3.0375939849624066e-06, "loss": 0.1841, "step": 46300 }, { "epoch": 69.64, "grad_norm": 7.7885236740112305, "learning_rate": 3.03609022556391e-06, "loss": 0.2721, "step": 46310 }, { "epoch": 69.65, "grad_norm": 5.928685665130615, "learning_rate": 3.034586466165414e-06, "loss": 0.2017, "step": 46320 }, { "epoch": 69.67, "grad_norm": 3.4155795574188232, "learning_rate": 3.0330827067669176e-06, "loss": 0.1821, "step": 46330 }, { "epoch": 69.68, "grad_norm": 4.222193717956543, "learning_rate": 3.0315789473684215e-06, "loss": 0.2059, "step": 46340 }, { "epoch": 69.7, "grad_norm": 6.882046222686768, "learning_rate": 3.0300751879699255e-06, "loss": 0.1614, "step": 46350 }, { "epoch": 69.71, "grad_norm": 4.649013996124268, "learning_rate": 3.028571428571429e-06, "loss": 0.2039, "step": 46360 }, { "epoch": 69.73, "grad_norm": 3.898035764694214, "learning_rate": 3.027067669172933e-06, "loss": 0.1677, "step": 46370 }, { "epoch": 69.74, "grad_norm": 4.432783603668213, "learning_rate": 3.0255639097744364e-06, "loss": 0.1762, "step": 46380 }, { "epoch": 69.76, "grad_norm": 8.104802131652832, "learning_rate": 3.0240601503759404e-06, "loss": 0.1619, "step": 46390 }, { "epoch": 69.77, "grad_norm": 4.149446487426758, "learning_rate": 3.0225563909774443e-06, "loss": 0.1904, "step": 46400 }, { "epoch": 69.79, "grad_norm": 6.308215141296387, "learning_rate": 3.0210526315789474e-06, "loss": 0.1996, "step": 46410 }, { "epoch": 69.8, "grad_norm": 8.179604530334473, "learning_rate": 3.019548872180451e-06, "loss": 0.2209, "step": 46420 }, { "epoch": 69.82, "grad_norm": 5.956761837005615, "learning_rate": 3.018045112781955e-06, "loss": 0.1432, "step": 46430 }, { "epoch": 69.83, "grad_norm": 5.909665584564209, "learning_rate": 3.016541353383459e-06, "loss": 0.2528, "step": 46440 }, { "epoch": 69.85, "grad_norm": 6.1077399253845215, "learning_rate": 3.0150375939849623e-06, "loss": 0.2055, "step": 46450 }, { "epoch": 69.86, "grad_norm": 4.353606700897217, "learning_rate": 3.0135338345864663e-06, "loss": 0.2351, "step": 46460 }, { "epoch": 69.88, "grad_norm": 5.350019931793213, "learning_rate": 3.01203007518797e-06, "loss": 0.1732, "step": 46470 }, { "epoch": 69.89, "grad_norm": 3.917721748352051, "learning_rate": 3.0105263157894737e-06, "loss": 0.1791, "step": 46480 }, { "epoch": 69.91, "grad_norm": 10.580587387084961, "learning_rate": 3.0090225563909777e-06, "loss": 0.1899, "step": 46490 }, { "epoch": 69.92, "grad_norm": 4.388562202453613, "learning_rate": 3.007518796992481e-06, "loss": 0.2287, "step": 46500 }, { "epoch": 69.94, "grad_norm": 5.693699359893799, "learning_rate": 3.006015037593985e-06, "loss": 0.2259, "step": 46510 }, { "epoch": 69.95, "grad_norm": 3.3817298412323, "learning_rate": 3.0045112781954887e-06, "loss": 0.2412, "step": 46520 }, { "epoch": 69.97, "grad_norm": 1.847433090209961, "learning_rate": 3.0030075187969926e-06, "loss": 0.1723, "step": 46530 }, { "epoch": 69.98, "grad_norm": 6.4717254638671875, "learning_rate": 3.0015037593984966e-06, "loss": 0.2032, "step": 46540 }, { "epoch": 70.0, "grad_norm": 0.9074813723564148, "learning_rate": 3e-06, "loss": 0.1738, "step": 46550 }, { "epoch": 70.0, "eval_accuracy": 0.9295, "eval_loss": 0.3248044550418854, "eval_runtime": 84.7278, "eval_samples_per_second": 118.025, "eval_steps_per_second": 0.472, "step": 46550 }, { "epoch": 70.02, "grad_norm": 5.463174819946289, "learning_rate": 2.998496240601504e-06, "loss": 0.2036, "step": 46560 }, { "epoch": 70.03, "grad_norm": 2.8487234115600586, "learning_rate": 2.9969924812030075e-06, "loss": 0.1619, "step": 46570 }, { "epoch": 70.05, "grad_norm": 4.683668613433838, "learning_rate": 2.9954887218045115e-06, "loss": 0.2471, "step": 46580 }, { "epoch": 70.06, "grad_norm": 5.568604946136475, "learning_rate": 2.9939849624060154e-06, "loss": 0.2199, "step": 46590 }, { "epoch": 70.08, "grad_norm": 8.11655330657959, "learning_rate": 2.992481203007519e-06, "loss": 0.1753, "step": 46600 }, { "epoch": 70.09, "grad_norm": 7.468130588531494, "learning_rate": 2.990977443609023e-06, "loss": 0.1809, "step": 46610 }, { "epoch": 70.11, "grad_norm": 6.2125115394592285, "learning_rate": 2.9894736842105264e-06, "loss": 0.2751, "step": 46620 }, { "epoch": 70.12, "grad_norm": 3.6229348182678223, "learning_rate": 2.9879699248120303e-06, "loss": 0.1723, "step": 46630 }, { "epoch": 70.14, "grad_norm": 4.364720344543457, "learning_rate": 2.986466165413534e-06, "loss": 0.1679, "step": 46640 }, { "epoch": 70.15, "grad_norm": 6.14267110824585, "learning_rate": 2.984962406015038e-06, "loss": 0.2083, "step": 46650 }, { "epoch": 70.17, "grad_norm": 3.9673428535461426, "learning_rate": 2.9834586466165418e-06, "loss": 0.1551, "step": 46660 }, { "epoch": 70.18, "grad_norm": 4.621110916137695, "learning_rate": 2.9819548872180453e-06, "loss": 0.1569, "step": 46670 }, { "epoch": 70.2, "grad_norm": 8.371132850646973, "learning_rate": 2.9804511278195492e-06, "loss": 0.1963, "step": 46680 }, { "epoch": 70.21, "grad_norm": 5.156669616699219, "learning_rate": 2.9789473684210527e-06, "loss": 0.2096, "step": 46690 }, { "epoch": 70.23, "grad_norm": 4.981484413146973, "learning_rate": 2.9774436090225567e-06, "loss": 0.1313, "step": 46700 }, { "epoch": 70.24, "grad_norm": 5.359274387359619, "learning_rate": 2.9759398496240606e-06, "loss": 0.183, "step": 46710 }, { "epoch": 70.26, "grad_norm": 6.65374231338501, "learning_rate": 2.974436090225564e-06, "loss": 0.2083, "step": 46720 }, { "epoch": 70.27, "grad_norm": 7.133225917816162, "learning_rate": 2.972932330827068e-06, "loss": 0.2509, "step": 46730 }, { "epoch": 70.29, "grad_norm": 4.245519638061523, "learning_rate": 2.9714285714285716e-06, "loss": 0.1871, "step": 46740 }, { "epoch": 70.3, "grad_norm": 5.218483924865723, "learning_rate": 2.9699248120300755e-06, "loss": 0.1595, "step": 46750 }, { "epoch": 70.32, "grad_norm": 6.059344291687012, "learning_rate": 2.9684210526315795e-06, "loss": 0.1771, "step": 46760 }, { "epoch": 70.33, "grad_norm": 4.6386237144470215, "learning_rate": 2.966917293233083e-06, "loss": 0.1868, "step": 46770 }, { "epoch": 70.35, "grad_norm": 4.3146257400512695, "learning_rate": 2.965413533834587e-06, "loss": 0.1941, "step": 46780 }, { "epoch": 70.36, "grad_norm": 4.2247395515441895, "learning_rate": 2.9639097744360905e-06, "loss": 0.2796, "step": 46790 }, { "epoch": 70.38, "grad_norm": 4.537256717681885, "learning_rate": 2.9624060150375944e-06, "loss": 0.1675, "step": 46800 }, { "epoch": 70.39, "grad_norm": 10.793876647949219, "learning_rate": 2.9609022556390983e-06, "loss": 0.1738, "step": 46810 }, { "epoch": 70.41, "grad_norm": 3.7752058506011963, "learning_rate": 2.959398496240602e-06, "loss": 0.1864, "step": 46820 }, { "epoch": 70.42, "grad_norm": 5.952646732330322, "learning_rate": 2.957894736842106e-06, "loss": 0.1994, "step": 46830 }, { "epoch": 70.44, "grad_norm": 2.576373815536499, "learning_rate": 2.9563909774436093e-06, "loss": 0.1891, "step": 46840 }, { "epoch": 70.45, "grad_norm": 3.7196900844573975, "learning_rate": 2.9548872180451133e-06, "loss": 0.2391, "step": 46850 }, { "epoch": 70.47, "grad_norm": 4.4083571434021, "learning_rate": 2.9533834586466172e-06, "loss": 0.2241, "step": 46860 }, { "epoch": 70.48, "grad_norm": 4.769841194152832, "learning_rate": 2.9518796992481207e-06, "loss": 0.1591, "step": 46870 }, { "epoch": 70.5, "grad_norm": 4.998278617858887, "learning_rate": 2.9503759398496247e-06, "loss": 0.1785, "step": 46880 }, { "epoch": 70.51, "grad_norm": 4.37769889831543, "learning_rate": 2.948872180451128e-06, "loss": 0.16, "step": 46890 }, { "epoch": 70.53, "grad_norm": 7.736121654510498, "learning_rate": 2.9473684210526317e-06, "loss": 0.2141, "step": 46900 }, { "epoch": 70.54, "grad_norm": 8.18359375, "learning_rate": 2.9458646616541352e-06, "loss": 0.2537, "step": 46910 }, { "epoch": 70.56, "grad_norm": 7.136547088623047, "learning_rate": 2.944360902255639e-06, "loss": 0.1986, "step": 46920 }, { "epoch": 70.57, "grad_norm": 2.9580116271972656, "learning_rate": 2.9428571428571427e-06, "loss": 0.1481, "step": 46930 }, { "epoch": 70.59, "grad_norm": 5.023125171661377, "learning_rate": 2.9413533834586466e-06, "loss": 0.2149, "step": 46940 }, { "epoch": 70.6, "grad_norm": 5.102907657623291, "learning_rate": 2.9398496240601506e-06, "loss": 0.1215, "step": 46950 }, { "epoch": 70.62, "grad_norm": 3.1318459510803223, "learning_rate": 2.938345864661654e-06, "loss": 0.1812, "step": 46960 }, { "epoch": 70.63, "grad_norm": 3.511244058609009, "learning_rate": 2.936842105263158e-06, "loss": 0.1735, "step": 46970 }, { "epoch": 70.65, "grad_norm": 8.335837364196777, "learning_rate": 2.9353383458646616e-06, "loss": 0.1951, "step": 46980 }, { "epoch": 70.66, "grad_norm": 3.936018228530884, "learning_rate": 2.9338345864661655e-06, "loss": 0.2294, "step": 46990 }, { "epoch": 70.68, "grad_norm": 6.189547538757324, "learning_rate": 2.9323308270676694e-06, "loss": 0.2205, "step": 47000 }, { "epoch": 70.69, "grad_norm": 3.3733437061309814, "learning_rate": 2.930827067669173e-06, "loss": 0.1982, "step": 47010 }, { "epoch": 70.71, "grad_norm": 3.5287833213806152, "learning_rate": 2.929323308270677e-06, "loss": 0.1972, "step": 47020 }, { "epoch": 70.72, "grad_norm": 21.65901756286621, "learning_rate": 2.9278195488721804e-06, "loss": 0.2244, "step": 47030 }, { "epoch": 70.74, "grad_norm": 7.8496904373168945, "learning_rate": 2.9263157894736844e-06, "loss": 0.114, "step": 47040 }, { "epoch": 70.75, "grad_norm": 3.459660053253174, "learning_rate": 2.9248120300751883e-06, "loss": 0.2043, "step": 47050 }, { "epoch": 70.77, "grad_norm": 6.128682613372803, "learning_rate": 2.923308270676692e-06, "loss": 0.1774, "step": 47060 }, { "epoch": 70.78, "grad_norm": 6.694797515869141, "learning_rate": 2.9218045112781958e-06, "loss": 0.1901, "step": 47070 }, { "epoch": 70.8, "grad_norm": 7.470389366149902, "learning_rate": 2.9203007518796993e-06, "loss": 0.1957, "step": 47080 }, { "epoch": 70.81, "grad_norm": 5.193848133087158, "learning_rate": 2.9187969924812032e-06, "loss": 0.1428, "step": 47090 }, { "epoch": 70.83, "grad_norm": 5.4394049644470215, "learning_rate": 2.9172932330827068e-06, "loss": 0.1873, "step": 47100 }, { "epoch": 70.84, "grad_norm": 5.622503280639648, "learning_rate": 2.9157894736842107e-06, "loss": 0.2147, "step": 47110 }, { "epoch": 70.86, "grad_norm": 9.180618286132812, "learning_rate": 2.9142857142857146e-06, "loss": 0.1662, "step": 47120 }, { "epoch": 70.87, "grad_norm": 3.7462944984436035, "learning_rate": 2.912781954887218e-06, "loss": 0.1925, "step": 47130 }, { "epoch": 70.89, "grad_norm": 6.142997741699219, "learning_rate": 2.911278195488722e-06, "loss": 0.1398, "step": 47140 }, { "epoch": 70.9, "grad_norm": 7.028275966644287, "learning_rate": 2.9097744360902256e-06, "loss": 0.2052, "step": 47150 }, { "epoch": 70.92, "grad_norm": 4.399960517883301, "learning_rate": 2.9082706766917296e-06, "loss": 0.2089, "step": 47160 }, { "epoch": 70.93, "grad_norm": 2.562202215194702, "learning_rate": 2.9067669172932335e-06, "loss": 0.1564, "step": 47170 }, { "epoch": 70.95, "grad_norm": 9.725334167480469, "learning_rate": 2.905263157894737e-06, "loss": 0.1702, "step": 47180 }, { "epoch": 70.96, "grad_norm": 4.6251654624938965, "learning_rate": 2.903759398496241e-06, "loss": 0.247, "step": 47190 }, { "epoch": 70.98, "grad_norm": 5.6048760414123535, "learning_rate": 2.9022556390977445e-06, "loss": 0.2587, "step": 47200 }, { "epoch": 70.99, "grad_norm": 2.857663631439209, "learning_rate": 2.9007518796992484e-06, "loss": 0.2251, "step": 47210 }, { "epoch": 71.0, "eval_accuracy": 0.9305, "eval_loss": 0.32967016100883484, "eval_runtime": 84.8297, "eval_samples_per_second": 117.883, "eval_steps_per_second": 0.472, "step": 47215 }, { "epoch": 71.01, "grad_norm": 3.354151487350464, "learning_rate": 2.8992481203007524e-06, "loss": 0.1378, "step": 47220 }, { "epoch": 71.02, "grad_norm": 14.883716583251953, "learning_rate": 2.897744360902256e-06, "loss": 0.1888, "step": 47230 }, { "epoch": 71.04, "grad_norm": 3.659508228302002, "learning_rate": 2.89624060150376e-06, "loss": 0.1611, "step": 47240 }, { "epoch": 71.05, "grad_norm": 4.597053527832031, "learning_rate": 2.8947368421052634e-06, "loss": 0.1918, "step": 47250 }, { "epoch": 71.07, "grad_norm": 4.022745132446289, "learning_rate": 2.8932330827067673e-06, "loss": 0.2231, "step": 47260 }, { "epoch": 71.08, "grad_norm": 3.9845337867736816, "learning_rate": 2.8917293233082712e-06, "loss": 0.2327, "step": 47270 }, { "epoch": 71.1, "grad_norm": 6.1547698974609375, "learning_rate": 2.8902255639097748e-06, "loss": 0.2559, "step": 47280 }, { "epoch": 71.11, "grad_norm": 10.55440616607666, "learning_rate": 2.8887218045112787e-06, "loss": 0.166, "step": 47290 }, { "epoch": 71.13, "grad_norm": 4.419111251831055, "learning_rate": 2.8872180451127822e-06, "loss": 0.1644, "step": 47300 }, { "epoch": 71.14, "grad_norm": 2.8730411529541016, "learning_rate": 2.885714285714286e-06, "loss": 0.1481, "step": 47310 }, { "epoch": 71.16, "grad_norm": 4.056236267089844, "learning_rate": 2.88421052631579e-06, "loss": 0.2006, "step": 47320 }, { "epoch": 71.17, "grad_norm": 5.246281623840332, "learning_rate": 2.8827067669172936e-06, "loss": 0.2015, "step": 47330 }, { "epoch": 71.19, "grad_norm": 4.46673059463501, "learning_rate": 2.8812030075187976e-06, "loss": 0.1138, "step": 47340 }, { "epoch": 71.2, "grad_norm": 9.421850204467773, "learning_rate": 2.879699248120301e-06, "loss": 0.2566, "step": 47350 }, { "epoch": 71.22, "grad_norm": 5.661787033081055, "learning_rate": 2.878195488721805e-06, "loss": 0.1804, "step": 47360 }, { "epoch": 71.23, "grad_norm": 4.693019390106201, "learning_rate": 2.876691729323309e-06, "loss": 0.1765, "step": 47370 }, { "epoch": 71.25, "grad_norm": 6.069882392883301, "learning_rate": 2.8751879699248125e-06, "loss": 0.173, "step": 47380 }, { "epoch": 71.26, "grad_norm": 8.419574737548828, "learning_rate": 2.8736842105263164e-06, "loss": 0.1686, "step": 47390 }, { "epoch": 71.28, "grad_norm": 2.8149406909942627, "learning_rate": 2.8721804511278195e-06, "loss": 0.2335, "step": 47400 }, { "epoch": 71.29, "grad_norm": 5.66928243637085, "learning_rate": 2.8706766917293235e-06, "loss": 0.2031, "step": 47410 }, { "epoch": 71.31, "grad_norm": 3.403899908065796, "learning_rate": 2.869172932330827e-06, "loss": 0.2124, "step": 47420 }, { "epoch": 71.32, "grad_norm": 5.2177910804748535, "learning_rate": 2.867669172932331e-06, "loss": 0.1582, "step": 47430 }, { "epoch": 71.34, "grad_norm": 5.742595195770264, "learning_rate": 2.8661654135338344e-06, "loss": 0.186, "step": 47440 }, { "epoch": 71.35, "grad_norm": 4.816529273986816, "learning_rate": 2.8646616541353384e-06, "loss": 0.1572, "step": 47450 }, { "epoch": 71.37, "grad_norm": 4.075284004211426, "learning_rate": 2.8631578947368423e-06, "loss": 0.1586, "step": 47460 }, { "epoch": 71.38, "grad_norm": 5.9553446769714355, "learning_rate": 2.861654135338346e-06, "loss": 0.1871, "step": 47470 }, { "epoch": 71.4, "grad_norm": 5.976673126220703, "learning_rate": 2.86015037593985e-06, "loss": 0.1897, "step": 47480 }, { "epoch": 71.41, "grad_norm": 3.2416350841522217, "learning_rate": 2.8586466165413533e-06, "loss": 0.1402, "step": 47490 }, { "epoch": 71.43, "grad_norm": 3.216466188430786, "learning_rate": 2.8571428571428573e-06, "loss": 0.1308, "step": 47500 }, { "epoch": 71.44, "grad_norm": 3.74507737159729, "learning_rate": 2.855639097744361e-06, "loss": 0.2048, "step": 47510 }, { "epoch": 71.46, "grad_norm": 2.735323905944824, "learning_rate": 2.8541353383458647e-06, "loss": 0.2007, "step": 47520 }, { "epoch": 71.47, "grad_norm": 4.694336414337158, "learning_rate": 2.8526315789473687e-06, "loss": 0.1947, "step": 47530 }, { "epoch": 71.49, "grad_norm": 4.027325630187988, "learning_rate": 2.851127819548872e-06, "loss": 0.1442, "step": 47540 }, { "epoch": 71.5, "grad_norm": 4.482545852661133, "learning_rate": 2.849624060150376e-06, "loss": 0.1949, "step": 47550 }, { "epoch": 71.52, "grad_norm": 21.274612426757812, "learning_rate": 2.8481203007518796e-06, "loss": 0.2099, "step": 47560 }, { "epoch": 71.53, "grad_norm": 7.941802978515625, "learning_rate": 2.8466165413533836e-06, "loss": 0.1769, "step": 47570 }, { "epoch": 71.55, "grad_norm": 11.027046203613281, "learning_rate": 2.8451127819548875e-06, "loss": 0.2497, "step": 47580 }, { "epoch": 71.56, "grad_norm": 1.5711476802825928, "learning_rate": 2.843609022556391e-06, "loss": 0.1957, "step": 47590 }, { "epoch": 71.58, "grad_norm": 8.369176864624023, "learning_rate": 2.842105263157895e-06, "loss": 0.2131, "step": 47600 }, { "epoch": 71.59, "grad_norm": 4.709144592285156, "learning_rate": 2.8406015037593985e-06, "loss": 0.2316, "step": 47610 }, { "epoch": 71.61, "grad_norm": 4.679172992706299, "learning_rate": 2.8390977443609024e-06, "loss": 0.1916, "step": 47620 }, { "epoch": 71.62, "grad_norm": 7.5052924156188965, "learning_rate": 2.8375939849624064e-06, "loss": 0.1613, "step": 47630 }, { "epoch": 71.64, "grad_norm": 6.649926662445068, "learning_rate": 2.83609022556391e-06, "loss": 0.1848, "step": 47640 }, { "epoch": 71.65, "grad_norm": 5.853745937347412, "learning_rate": 2.834586466165414e-06, "loss": 0.1762, "step": 47650 }, { "epoch": 71.67, "grad_norm": 6.767156600952148, "learning_rate": 2.8330827067669174e-06, "loss": 0.1358, "step": 47660 }, { "epoch": 71.68, "grad_norm": 2.320805311203003, "learning_rate": 2.8315789473684213e-06, "loss": 0.1576, "step": 47670 }, { "epoch": 71.7, "grad_norm": 6.782162666320801, "learning_rate": 2.8300751879699253e-06, "loss": 0.2299, "step": 47680 }, { "epoch": 71.71, "grad_norm": 5.48624324798584, "learning_rate": 2.8285714285714288e-06, "loss": 0.1981, "step": 47690 }, { "epoch": 71.73, "grad_norm": 4.4011335372924805, "learning_rate": 2.8270676691729327e-06, "loss": 0.1662, "step": 47700 }, { "epoch": 71.74, "grad_norm": 2.359865188598633, "learning_rate": 2.8255639097744362e-06, "loss": 0.1419, "step": 47710 }, { "epoch": 71.76, "grad_norm": 6.685113430023193, "learning_rate": 2.82406015037594e-06, "loss": 0.2208, "step": 47720 }, { "epoch": 71.77, "grad_norm": 4.598508358001709, "learning_rate": 2.822556390977444e-06, "loss": 0.1872, "step": 47730 }, { "epoch": 71.79, "grad_norm": 4.389933109283447, "learning_rate": 2.8210526315789476e-06, "loss": 0.196, "step": 47740 }, { "epoch": 71.8, "grad_norm": 3.9596188068389893, "learning_rate": 2.8195488721804516e-06, "loss": 0.121, "step": 47750 }, { "epoch": 71.82, "grad_norm": 1.1457980871200562, "learning_rate": 2.818045112781955e-06, "loss": 0.2009, "step": 47760 }, { "epoch": 71.83, "grad_norm": 6.121349811553955, "learning_rate": 2.816541353383459e-06, "loss": 0.1835, "step": 47770 }, { "epoch": 71.85, "grad_norm": 4.898597717285156, "learning_rate": 2.815037593984963e-06, "loss": 0.1345, "step": 47780 }, { "epoch": 71.86, "grad_norm": 10.096803665161133, "learning_rate": 2.8135338345864665e-06, "loss": 0.1851, "step": 47790 }, { "epoch": 71.88, "grad_norm": 5.386133193969727, "learning_rate": 2.8120300751879705e-06, "loss": 0.1939, "step": 47800 }, { "epoch": 71.89, "grad_norm": 4.9606428146362305, "learning_rate": 2.810526315789474e-06, "loss": 0.1756, "step": 47810 }, { "epoch": 71.91, "grad_norm": 3.927384853363037, "learning_rate": 2.809022556390978e-06, "loss": 0.1958, "step": 47820 }, { "epoch": 71.92, "grad_norm": 1.8455919027328491, "learning_rate": 2.807518796992482e-06, "loss": 0.1686, "step": 47830 }, { "epoch": 71.94, "grad_norm": 4.488922119140625, "learning_rate": 2.8060150375939854e-06, "loss": 0.1532, "step": 47840 }, { "epoch": 71.95, "grad_norm": 5.330088138580322, "learning_rate": 2.8045112781954893e-06, "loss": 0.1574, "step": 47850 }, { "epoch": 71.97, "grad_norm": 5.4682207107543945, "learning_rate": 2.803007518796993e-06, "loss": 0.1734, "step": 47860 }, { "epoch": 71.98, "grad_norm": 6.902002811431885, "learning_rate": 2.8015037593984968e-06, "loss": 0.1673, "step": 47870 }, { "epoch": 72.0, "grad_norm": 0.017349636182188988, "learning_rate": 2.8000000000000003e-06, "loss": 0.1518, "step": 47880 }, { "epoch": 72.0, "eval_accuracy": 0.9311, "eval_loss": 0.3322134017944336, "eval_runtime": 84.7025, "eval_samples_per_second": 118.06, "eval_steps_per_second": 0.472, "step": 47880 }, { "epoch": 72.02, "grad_norm": 2.0250072479248047, "learning_rate": 2.798496240601504e-06, "loss": 0.2221, "step": 47890 }, { "epoch": 72.03, "grad_norm": 3.505276679992676, "learning_rate": 2.7969924812030073e-06, "loss": 0.1402, "step": 47900 }, { "epoch": 72.05, "grad_norm": 4.723272323608398, "learning_rate": 2.7954887218045113e-06, "loss": 0.1832, "step": 47910 }, { "epoch": 72.06, "grad_norm": 11.436491012573242, "learning_rate": 2.7939849624060152e-06, "loss": 0.1595, "step": 47920 }, { "epoch": 72.08, "grad_norm": 8.044598579406738, "learning_rate": 2.7924812030075187e-06, "loss": 0.2156, "step": 47930 }, { "epoch": 72.09, "grad_norm": 5.603691577911377, "learning_rate": 2.7909774436090227e-06, "loss": 0.1677, "step": 47940 }, { "epoch": 72.11, "grad_norm": 3.3360657691955566, "learning_rate": 2.789473684210526e-06, "loss": 0.1723, "step": 47950 }, { "epoch": 72.12, "grad_norm": 5.758209228515625, "learning_rate": 2.78796992481203e-06, "loss": 0.1905, "step": 47960 }, { "epoch": 72.14, "grad_norm": 6.080915451049805, "learning_rate": 2.786466165413534e-06, "loss": 0.2027, "step": 47970 }, { "epoch": 72.15, "grad_norm": 4.1375017166137695, "learning_rate": 2.7849624060150376e-06, "loss": 0.1241, "step": 47980 }, { "epoch": 72.17, "grad_norm": 4.435542106628418, "learning_rate": 2.7834586466165415e-06, "loss": 0.2314, "step": 47990 }, { "epoch": 72.18, "grad_norm": 3.8110594749450684, "learning_rate": 2.781954887218045e-06, "loss": 0.1544, "step": 48000 }, { "epoch": 72.2, "grad_norm": 3.3765087127685547, "learning_rate": 2.780451127819549e-06, "loss": 0.2048, "step": 48010 }, { "epoch": 72.21, "grad_norm": 6.244101047515869, "learning_rate": 2.7789473684210525e-06, "loss": 0.1541, "step": 48020 }, { "epoch": 72.23, "grad_norm": 8.180159568786621, "learning_rate": 2.7774436090225565e-06, "loss": 0.1783, "step": 48030 }, { "epoch": 72.24, "grad_norm": 2.5639536380767822, "learning_rate": 2.7759398496240604e-06, "loss": 0.1643, "step": 48040 }, { "epoch": 72.26, "grad_norm": 3.273380756378174, "learning_rate": 2.774436090225564e-06, "loss": 0.1452, "step": 48050 }, { "epoch": 72.27, "grad_norm": 5.514414310455322, "learning_rate": 2.772932330827068e-06, "loss": 0.1401, "step": 48060 }, { "epoch": 72.29, "grad_norm": 6.009876251220703, "learning_rate": 2.7714285714285714e-06, "loss": 0.1812, "step": 48070 }, { "epoch": 72.3, "grad_norm": 1.5807594060897827, "learning_rate": 2.7699248120300753e-06, "loss": 0.1392, "step": 48080 }, { "epoch": 72.32, "grad_norm": 4.397557258605957, "learning_rate": 2.7684210526315793e-06, "loss": 0.1882, "step": 48090 }, { "epoch": 72.33, "grad_norm": 5.294480323791504, "learning_rate": 2.766917293233083e-06, "loss": 0.2203, "step": 48100 }, { "epoch": 72.35, "grad_norm": 2.36199688911438, "learning_rate": 2.7654135338345867e-06, "loss": 0.2092, "step": 48110 }, { "epoch": 72.36, "grad_norm": 6.017412185668945, "learning_rate": 2.7639097744360903e-06, "loss": 0.1849, "step": 48120 }, { "epoch": 72.38, "grad_norm": 6.914893627166748, "learning_rate": 2.762406015037594e-06, "loss": 0.1619, "step": 48130 }, { "epoch": 72.39, "grad_norm": 3.912938117980957, "learning_rate": 2.760902255639098e-06, "loss": 0.2099, "step": 48140 }, { "epoch": 72.41, "grad_norm": 4.893369197845459, "learning_rate": 2.7593984962406017e-06, "loss": 0.245, "step": 48150 }, { "epoch": 72.42, "grad_norm": 3.4922471046447754, "learning_rate": 2.7578947368421056e-06, "loss": 0.1452, "step": 48160 }, { "epoch": 72.44, "grad_norm": 10.81135082244873, "learning_rate": 2.756390977443609e-06, "loss": 0.1703, "step": 48170 }, { "epoch": 72.45, "grad_norm": 5.498162269592285, "learning_rate": 2.754887218045113e-06, "loss": 0.1733, "step": 48180 }, { "epoch": 72.47, "grad_norm": 3.2235593795776367, "learning_rate": 2.753383458646617e-06, "loss": 0.2167, "step": 48190 }, { "epoch": 72.48, "grad_norm": 7.933281421661377, "learning_rate": 2.7518796992481205e-06, "loss": 0.2088, "step": 48200 }, { "epoch": 72.5, "grad_norm": 3.2224316596984863, "learning_rate": 2.7503759398496245e-06, "loss": 0.1841, "step": 48210 }, { "epoch": 72.51, "grad_norm": 5.602688312530518, "learning_rate": 2.748872180451128e-06, "loss": 0.1924, "step": 48220 }, { "epoch": 72.53, "grad_norm": 4.668603897094727, "learning_rate": 2.747368421052632e-06, "loss": 0.2254, "step": 48230 }, { "epoch": 72.54, "grad_norm": 5.575139045715332, "learning_rate": 2.745864661654136e-06, "loss": 0.1475, "step": 48240 }, { "epoch": 72.56, "grad_norm": 5.640378475189209, "learning_rate": 2.7443609022556394e-06, "loss": 0.2488, "step": 48250 }, { "epoch": 72.57, "grad_norm": 5.799993515014648, "learning_rate": 2.7428571428571433e-06, "loss": 0.206, "step": 48260 }, { "epoch": 72.59, "grad_norm": 4.525552272796631, "learning_rate": 2.741353383458647e-06, "loss": 0.1442, "step": 48270 }, { "epoch": 72.6, "grad_norm": 6.230297565460205, "learning_rate": 2.739849624060151e-06, "loss": 0.2157, "step": 48280 }, { "epoch": 72.62, "grad_norm": 4.961235046386719, "learning_rate": 2.7383458646616543e-06, "loss": 0.1658, "step": 48290 }, { "epoch": 72.63, "grad_norm": 6.751160621643066, "learning_rate": 2.7368421052631583e-06, "loss": 0.2332, "step": 48300 }, { "epoch": 72.65, "grad_norm": 4.099827766418457, "learning_rate": 2.735338345864662e-06, "loss": 0.185, "step": 48310 }, { "epoch": 72.66, "grad_norm": 6.973018646240234, "learning_rate": 2.7338345864661657e-06, "loss": 0.1865, "step": 48320 }, { "epoch": 72.68, "grad_norm": 4.904829978942871, "learning_rate": 2.7323308270676697e-06, "loss": 0.1717, "step": 48330 }, { "epoch": 72.69, "grad_norm": 7.860152721405029, "learning_rate": 2.730827067669173e-06, "loss": 0.1862, "step": 48340 }, { "epoch": 72.71, "grad_norm": 5.617115020751953, "learning_rate": 2.729323308270677e-06, "loss": 0.2077, "step": 48350 }, { "epoch": 72.72, "grad_norm": 4.507359504699707, "learning_rate": 2.727819548872181e-06, "loss": 0.2003, "step": 48360 }, { "epoch": 72.74, "grad_norm": 4.982139587402344, "learning_rate": 2.7263157894736846e-06, "loss": 0.237, "step": 48370 }, { "epoch": 72.75, "grad_norm": 4.0217604637146, "learning_rate": 2.7248120300751885e-06, "loss": 0.1453, "step": 48380 }, { "epoch": 72.77, "grad_norm": 7.618423938751221, "learning_rate": 2.7233082706766916e-06, "loss": 0.1914, "step": 48390 }, { "epoch": 72.78, "grad_norm": 6.787216663360596, "learning_rate": 2.7218045112781956e-06, "loss": 0.1451, "step": 48400 }, { "epoch": 72.8, "grad_norm": 4.273850440979004, "learning_rate": 2.720300751879699e-06, "loss": 0.2088, "step": 48410 }, { "epoch": 72.81, "grad_norm": 5.214813709259033, "learning_rate": 2.718796992481203e-06, "loss": 0.1597, "step": 48420 }, { "epoch": 72.83, "grad_norm": 4.873306751251221, "learning_rate": 2.7172932330827066e-06, "loss": 0.1827, "step": 48430 }, { "epoch": 72.84, "grad_norm": 5.74524450302124, "learning_rate": 2.7157894736842105e-06, "loss": 0.1795, "step": 48440 }, { "epoch": 72.86, "grad_norm": 6.21406364440918, "learning_rate": 2.7142857142857144e-06, "loss": 0.2171, "step": 48450 }, { "epoch": 72.87, "grad_norm": 3.394968032836914, "learning_rate": 2.712781954887218e-06, "loss": 0.1758, "step": 48460 }, { "epoch": 72.89, "grad_norm": 2.936285972595215, "learning_rate": 2.711278195488722e-06, "loss": 0.2162, "step": 48470 }, { "epoch": 72.9, "grad_norm": 2.2436184883117676, "learning_rate": 2.7097744360902254e-06, "loss": 0.1331, "step": 48480 }, { "epoch": 72.92, "grad_norm": 4.331397533416748, "learning_rate": 2.7082706766917294e-06, "loss": 0.1934, "step": 48490 }, { "epoch": 72.93, "grad_norm": 8.824311256408691, "learning_rate": 2.7067669172932333e-06, "loss": 0.2538, "step": 48500 }, { "epoch": 72.95, "grad_norm": 7.463961124420166, "learning_rate": 2.705263157894737e-06, "loss": 0.212, "step": 48510 }, { "epoch": 72.96, "grad_norm": 9.955108642578125, "learning_rate": 2.7037593984962408e-06, "loss": 0.2191, "step": 48520 }, { "epoch": 72.98, "grad_norm": 3.3155603408813477, "learning_rate": 2.7022556390977443e-06, "loss": 0.1842, "step": 48530 }, { "epoch": 72.99, "grad_norm": 3.515291929244995, "learning_rate": 2.7007518796992482e-06, "loss": 0.1914, "step": 48540 }, { "epoch": 73.0, "eval_accuracy": 0.931, "eval_loss": 0.3263280391693115, "eval_runtime": 84.3768, "eval_samples_per_second": 118.516, "eval_steps_per_second": 0.474, "step": 48545 }, { "epoch": 73.01, "grad_norm": 5.640552520751953, "learning_rate": 2.699248120300752e-06, "loss": 0.1428, "step": 48550 }, { "epoch": 73.02, "grad_norm": 4.772902965545654, "learning_rate": 2.6977443609022557e-06, "loss": 0.1821, "step": 48560 }, { "epoch": 73.04, "grad_norm": 3.3709876537323, "learning_rate": 2.6962406015037596e-06, "loss": 0.1939, "step": 48570 }, { "epoch": 73.05, "grad_norm": 8.709583282470703, "learning_rate": 2.694736842105263e-06, "loss": 0.24, "step": 48580 }, { "epoch": 73.07, "grad_norm": 3.556995391845703, "learning_rate": 2.693233082706767e-06, "loss": 0.1939, "step": 48590 }, { "epoch": 73.08, "grad_norm": 4.488801002502441, "learning_rate": 2.691729323308271e-06, "loss": 0.1767, "step": 48600 }, { "epoch": 73.1, "grad_norm": 3.0920093059539795, "learning_rate": 2.6902255639097746e-06, "loss": 0.148, "step": 48610 }, { "epoch": 73.11, "grad_norm": 7.643060207366943, "learning_rate": 2.6887218045112785e-06, "loss": 0.1924, "step": 48620 }, { "epoch": 73.13, "grad_norm": 11.217390060424805, "learning_rate": 2.687218045112782e-06, "loss": 0.2318, "step": 48630 }, { "epoch": 73.14, "grad_norm": 8.313175201416016, "learning_rate": 2.685714285714286e-06, "loss": 0.1629, "step": 48640 }, { "epoch": 73.16, "grad_norm": 4.154397010803223, "learning_rate": 2.68421052631579e-06, "loss": 0.1958, "step": 48650 }, { "epoch": 73.17, "grad_norm": 3.842160224914551, "learning_rate": 2.6827067669172934e-06, "loss": 0.1451, "step": 48660 }, { "epoch": 73.19, "grad_norm": 6.915648460388184, "learning_rate": 2.6812030075187974e-06, "loss": 0.1452, "step": 48670 }, { "epoch": 73.2, "grad_norm": 4.855998992919922, "learning_rate": 2.679699248120301e-06, "loss": 0.1455, "step": 48680 }, { "epoch": 73.22, "grad_norm": 1.8087424039840698, "learning_rate": 2.678195488721805e-06, "loss": 0.1554, "step": 48690 }, { "epoch": 73.23, "grad_norm": 1.4605119228363037, "learning_rate": 2.6766917293233088e-06, "loss": 0.2174, "step": 48700 }, { "epoch": 73.25, "grad_norm": 6.26193380355835, "learning_rate": 2.6751879699248123e-06, "loss": 0.2441, "step": 48710 }, { "epoch": 73.26, "grad_norm": 6.614605903625488, "learning_rate": 2.6736842105263162e-06, "loss": 0.2272, "step": 48720 }, { "epoch": 73.28, "grad_norm": 3.6685211658477783, "learning_rate": 2.6721804511278197e-06, "loss": 0.1562, "step": 48730 }, { "epoch": 73.29, "grad_norm": 4.120753288269043, "learning_rate": 2.6706766917293237e-06, "loss": 0.1567, "step": 48740 }, { "epoch": 73.31, "grad_norm": 16.986003875732422, "learning_rate": 2.669172932330827e-06, "loss": 0.2109, "step": 48750 }, { "epoch": 73.32, "grad_norm": 6.729957580566406, "learning_rate": 2.667669172932331e-06, "loss": 0.2327, "step": 48760 }, { "epoch": 73.34, "grad_norm": 3.7136998176574707, "learning_rate": 2.666165413533835e-06, "loss": 0.2161, "step": 48770 }, { "epoch": 73.35, "grad_norm": 3.925506591796875, "learning_rate": 2.6646616541353386e-06, "loss": 0.2049, "step": 48780 }, { "epoch": 73.37, "grad_norm": 11.8861665725708, "learning_rate": 2.6631578947368426e-06, "loss": 0.1432, "step": 48790 }, { "epoch": 73.38, "grad_norm": 4.450233459472656, "learning_rate": 2.661654135338346e-06, "loss": 0.1958, "step": 48800 }, { "epoch": 73.4, "grad_norm": 7.333054542541504, "learning_rate": 2.66015037593985e-06, "loss": 0.2231, "step": 48810 }, { "epoch": 73.41, "grad_norm": 5.88569974899292, "learning_rate": 2.658646616541354e-06, "loss": 0.1502, "step": 48820 }, { "epoch": 73.43, "grad_norm": 4.52285099029541, "learning_rate": 2.6571428571428575e-06, "loss": 0.2144, "step": 48830 }, { "epoch": 73.44, "grad_norm": 6.34915828704834, "learning_rate": 2.6556390977443614e-06, "loss": 0.221, "step": 48840 }, { "epoch": 73.46, "grad_norm": 2.9954657554626465, "learning_rate": 2.654135338345865e-06, "loss": 0.1306, "step": 48850 }, { "epoch": 73.47, "grad_norm": 4.723332405090332, "learning_rate": 2.652631578947369e-06, "loss": 0.1725, "step": 48860 }, { "epoch": 73.49, "grad_norm": 5.571535587310791, "learning_rate": 2.651127819548873e-06, "loss": 0.2305, "step": 48870 }, { "epoch": 73.5, "grad_norm": 6.174119472503662, "learning_rate": 2.6496240601503763e-06, "loss": 0.2203, "step": 48880 }, { "epoch": 73.52, "grad_norm": 3.709364175796509, "learning_rate": 2.6481203007518794e-06, "loss": 0.1775, "step": 48890 }, { "epoch": 73.53, "grad_norm": 7.268237590789795, "learning_rate": 2.6466165413533834e-06, "loss": 0.212, "step": 48900 }, { "epoch": 73.55, "grad_norm": 1.238796591758728, "learning_rate": 2.6451127819548873e-06, "loss": 0.1747, "step": 48910 }, { "epoch": 73.56, "grad_norm": 3.121783971786499, "learning_rate": 2.643609022556391e-06, "loss": 0.1938, "step": 48920 }, { "epoch": 73.58, "grad_norm": 4.7542500495910645, "learning_rate": 2.6421052631578948e-06, "loss": 0.2038, "step": 48930 }, { "epoch": 73.59, "grad_norm": 5.120677471160889, "learning_rate": 2.6406015037593983e-06, "loss": 0.2355, "step": 48940 }, { "epoch": 73.61, "grad_norm": 6.299097537994385, "learning_rate": 2.6390977443609022e-06, "loss": 0.2069, "step": 48950 }, { "epoch": 73.62, "grad_norm": 4.0084428787231445, "learning_rate": 2.637593984962406e-06, "loss": 0.1697, "step": 48960 }, { "epoch": 73.64, "grad_norm": 3.608576536178589, "learning_rate": 2.6360902255639097e-06, "loss": 0.1726, "step": 48970 }, { "epoch": 73.65, "grad_norm": 5.771646499633789, "learning_rate": 2.6345864661654137e-06, "loss": 0.2716, "step": 48980 }, { "epoch": 73.67, "grad_norm": 8.164382934570312, "learning_rate": 2.633082706766917e-06, "loss": 0.1862, "step": 48990 }, { "epoch": 73.68, "grad_norm": 5.360281467437744, "learning_rate": 2.631578947368421e-06, "loss": 0.163, "step": 49000 }, { "epoch": 73.7, "grad_norm": 5.156652450561523, "learning_rate": 2.630075187969925e-06, "loss": 0.1928, "step": 49010 }, { "epoch": 73.71, "grad_norm": 3.7978944778442383, "learning_rate": 2.6285714285714286e-06, "loss": 0.1318, "step": 49020 }, { "epoch": 73.73, "grad_norm": 7.672252655029297, "learning_rate": 2.6270676691729325e-06, "loss": 0.2079, "step": 49030 }, { "epoch": 73.74, "grad_norm": 4.497259616851807, "learning_rate": 2.625563909774436e-06, "loss": 0.2074, "step": 49040 }, { "epoch": 73.76, "grad_norm": 4.420539855957031, "learning_rate": 2.62406015037594e-06, "loss": 0.1372, "step": 49050 }, { "epoch": 73.77, "grad_norm": 4.705691814422607, "learning_rate": 2.622556390977444e-06, "loss": 0.2247, "step": 49060 }, { "epoch": 73.79, "grad_norm": 4.325020790100098, "learning_rate": 2.6210526315789474e-06, "loss": 0.1768, "step": 49070 }, { "epoch": 73.8, "grad_norm": 4.0322442054748535, "learning_rate": 2.6195488721804514e-06, "loss": 0.1707, "step": 49080 }, { "epoch": 73.82, "grad_norm": 7.42966890335083, "learning_rate": 2.618045112781955e-06, "loss": 0.2521, "step": 49090 }, { "epoch": 73.83, "grad_norm": 4.630422115325928, "learning_rate": 2.616541353383459e-06, "loss": 0.2168, "step": 49100 }, { "epoch": 73.85, "grad_norm": 5.983313083648682, "learning_rate": 2.6150375939849628e-06, "loss": 0.2102, "step": 49110 }, { "epoch": 73.86, "grad_norm": 2.551830291748047, "learning_rate": 2.6135338345864663e-06, "loss": 0.1586, "step": 49120 }, { "epoch": 73.88, "grad_norm": 6.646661281585693, "learning_rate": 2.6120300751879702e-06, "loss": 0.2198, "step": 49130 }, { "epoch": 73.89, "grad_norm": 5.7124176025390625, "learning_rate": 2.6105263157894738e-06, "loss": 0.2081, "step": 49140 }, { "epoch": 73.91, "grad_norm": 15.754138946533203, "learning_rate": 2.6090225563909777e-06, "loss": 0.1661, "step": 49150 }, { "epoch": 73.92, "grad_norm": 5.884487152099609, "learning_rate": 2.6075187969924817e-06, "loss": 0.1548, "step": 49160 }, { "epoch": 73.94, "grad_norm": 1.3139573335647583, "learning_rate": 2.606015037593985e-06, "loss": 0.1773, "step": 49170 }, { "epoch": 73.95, "grad_norm": 4.792730808258057, "learning_rate": 2.604511278195489e-06, "loss": 0.1666, "step": 49180 }, { "epoch": 73.97, "grad_norm": 1.9457217454910278, "learning_rate": 2.6030075187969926e-06, "loss": 0.1575, "step": 49190 }, { "epoch": 73.98, "grad_norm": 6.806911468505859, "learning_rate": 2.6015037593984966e-06, "loss": 0.1794, "step": 49200 }, { "epoch": 74.0, "grad_norm": 0.026680290699005127, "learning_rate": 2.6e-06, "loss": 0.2097, "step": 49210 }, { "epoch": 74.0, "eval_accuracy": 0.9294, "eval_loss": 0.33669278025627136, "eval_runtime": 84.9311, "eval_samples_per_second": 117.743, "eval_steps_per_second": 0.471, "step": 49210 }, { "epoch": 74.02, "grad_norm": 6.460815906524658, "learning_rate": 2.598496240601504e-06, "loss": 0.1717, "step": 49220 }, { "epoch": 74.03, "grad_norm": 5.829145908355713, "learning_rate": 2.596992481203008e-06, "loss": 0.2016, "step": 49230 }, { "epoch": 74.05, "grad_norm": 11.389643669128418, "learning_rate": 2.5954887218045115e-06, "loss": 0.2195, "step": 49240 }, { "epoch": 74.06, "grad_norm": 2.792567253112793, "learning_rate": 2.5939849624060154e-06, "loss": 0.1553, "step": 49250 }, { "epoch": 74.08, "grad_norm": 3.7517435550689697, "learning_rate": 2.592481203007519e-06, "loss": 0.1597, "step": 49260 }, { "epoch": 74.09, "grad_norm": 4.992012023925781, "learning_rate": 2.590977443609023e-06, "loss": 0.1487, "step": 49270 }, { "epoch": 74.11, "grad_norm": 4.339962482452393, "learning_rate": 2.589473684210527e-06, "loss": 0.1671, "step": 49280 }, { "epoch": 74.12, "grad_norm": 2.0285701751708984, "learning_rate": 2.5879699248120304e-06, "loss": 0.1598, "step": 49290 }, { "epoch": 74.14, "grad_norm": 3.2112770080566406, "learning_rate": 2.5864661654135343e-06, "loss": 0.1299, "step": 49300 }, { "epoch": 74.15, "grad_norm": 3.4635910987854004, "learning_rate": 2.584962406015038e-06, "loss": 0.1812, "step": 49310 }, { "epoch": 74.17, "grad_norm": 8.604695320129395, "learning_rate": 2.5834586466165418e-06, "loss": 0.2481, "step": 49320 }, { "epoch": 74.18, "grad_norm": 3.5391902923583984, "learning_rate": 2.5819548872180457e-06, "loss": 0.1636, "step": 49330 }, { "epoch": 74.2, "grad_norm": 1.695279836654663, "learning_rate": 2.5804511278195492e-06, "loss": 0.1739, "step": 49340 }, { "epoch": 74.21, "grad_norm": 4.957259178161621, "learning_rate": 2.578947368421053e-06, "loss": 0.1369, "step": 49350 }, { "epoch": 74.23, "grad_norm": 6.202530860900879, "learning_rate": 2.5774436090225567e-06, "loss": 0.2257, "step": 49360 }, { "epoch": 74.24, "grad_norm": 7.947466850280762, "learning_rate": 2.5759398496240606e-06, "loss": 0.1961, "step": 49370 }, { "epoch": 74.26, "grad_norm": 6.451323509216309, "learning_rate": 2.5744360902255637e-06, "loss": 0.2195, "step": 49380 }, { "epoch": 74.27, "grad_norm": 5.297338008880615, "learning_rate": 2.5729323308270677e-06, "loss": 0.1799, "step": 49390 }, { "epoch": 74.29, "grad_norm": 4.922117233276367, "learning_rate": 2.571428571428571e-06, "loss": 0.2173, "step": 49400 }, { "epoch": 74.3, "grad_norm": 1.2803465127944946, "learning_rate": 2.569924812030075e-06, "loss": 0.1658, "step": 49410 }, { "epoch": 74.32, "grad_norm": 2.5603795051574707, "learning_rate": 2.568421052631579e-06, "loss": 0.1427, "step": 49420 }, { "epoch": 74.33, "grad_norm": 5.846049785614014, "learning_rate": 2.5669172932330826e-06, "loss": 0.2481, "step": 49430 }, { "epoch": 74.35, "grad_norm": 6.60819149017334, "learning_rate": 2.5654135338345865e-06, "loss": 0.1853, "step": 49440 }, { "epoch": 74.36, "grad_norm": 3.9946815967559814, "learning_rate": 2.56390977443609e-06, "loss": 0.2012, "step": 49450 }, { "epoch": 74.38, "grad_norm": 9.012832641601562, "learning_rate": 2.562406015037594e-06, "loss": 0.2052, "step": 49460 }, { "epoch": 74.39, "grad_norm": 4.631398677825928, "learning_rate": 2.560902255639098e-06, "loss": 0.1995, "step": 49470 }, { "epoch": 74.41, "grad_norm": 3.723080635070801, "learning_rate": 2.5593984962406015e-06, "loss": 0.1542, "step": 49480 }, { "epoch": 74.42, "grad_norm": 5.1829633712768555, "learning_rate": 2.5578947368421054e-06, "loss": 0.2244, "step": 49490 }, { "epoch": 74.44, "grad_norm": 6.623849391937256, "learning_rate": 2.556390977443609e-06, "loss": 0.2161, "step": 49500 }, { "epoch": 74.45, "grad_norm": 7.301265239715576, "learning_rate": 2.554887218045113e-06, "loss": 0.1301, "step": 49510 }, { "epoch": 74.47, "grad_norm": 3.6955883502960205, "learning_rate": 2.553383458646617e-06, "loss": 0.1758, "step": 49520 }, { "epoch": 74.48, "grad_norm": 9.507878303527832, "learning_rate": 2.5518796992481203e-06, "loss": 0.1833, "step": 49530 }, { "epoch": 74.5, "grad_norm": 3.5736141204833984, "learning_rate": 2.5503759398496243e-06, "loss": 0.1582, "step": 49540 }, { "epoch": 74.51, "grad_norm": 4.5012006759643555, "learning_rate": 2.548872180451128e-06, "loss": 0.1849, "step": 49550 }, { "epoch": 74.53, "grad_norm": 1.9890389442443848, "learning_rate": 2.5473684210526317e-06, "loss": 0.2109, "step": 49560 }, { "epoch": 74.54, "grad_norm": 6.087845325469971, "learning_rate": 2.5458646616541357e-06, "loss": 0.2116, "step": 49570 }, { "epoch": 74.56, "grad_norm": 4.9256157875061035, "learning_rate": 2.544360902255639e-06, "loss": 0.1751, "step": 49580 }, { "epoch": 74.57, "grad_norm": 7.381353855133057, "learning_rate": 2.542857142857143e-06, "loss": 0.1588, "step": 49590 }, { "epoch": 74.59, "grad_norm": 2.9443299770355225, "learning_rate": 2.5413533834586467e-06, "loss": 0.1763, "step": 49600 }, { "epoch": 74.6, "grad_norm": 4.252871513366699, "learning_rate": 2.5398496240601506e-06, "loss": 0.1819, "step": 49610 }, { "epoch": 74.62, "grad_norm": 4.36507511138916, "learning_rate": 2.5383458646616545e-06, "loss": 0.1788, "step": 49620 }, { "epoch": 74.63, "grad_norm": 5.069661617279053, "learning_rate": 2.536842105263158e-06, "loss": 0.2279, "step": 49630 }, { "epoch": 74.65, "grad_norm": 1.5738500356674194, "learning_rate": 2.535338345864662e-06, "loss": 0.1562, "step": 49640 }, { "epoch": 74.66, "grad_norm": 3.425536632537842, "learning_rate": 2.5338345864661655e-06, "loss": 0.1367, "step": 49650 }, { "epoch": 74.68, "grad_norm": 7.227283954620361, "learning_rate": 2.5323308270676695e-06, "loss": 0.1524, "step": 49660 }, { "epoch": 74.69, "grad_norm": 4.1548075675964355, "learning_rate": 2.530827067669173e-06, "loss": 0.1361, "step": 49670 }, { "epoch": 74.71, "grad_norm": 5.1794610023498535, "learning_rate": 2.529323308270677e-06, "loss": 0.1373, "step": 49680 }, { "epoch": 74.72, "grad_norm": 7.747637748718262, "learning_rate": 2.527819548872181e-06, "loss": 0.1758, "step": 49690 }, { "epoch": 74.74, "grad_norm": 3.845520496368408, "learning_rate": 2.5263157894736844e-06, "loss": 0.2025, "step": 49700 }, { "epoch": 74.75, "grad_norm": 8.356101989746094, "learning_rate": 2.5248120300751883e-06, "loss": 0.1597, "step": 49710 }, { "epoch": 74.77, "grad_norm": 3.1833062171936035, "learning_rate": 2.523308270676692e-06, "loss": 0.1411, "step": 49720 }, { "epoch": 74.78, "grad_norm": 6.692790508270264, "learning_rate": 2.521804511278196e-06, "loss": 0.1809, "step": 49730 }, { "epoch": 74.8, "grad_norm": 6.7796454429626465, "learning_rate": 2.5203007518796997e-06, "loss": 0.1418, "step": 49740 }, { "epoch": 74.81, "grad_norm": 4.885359287261963, "learning_rate": 2.5187969924812033e-06, "loss": 0.2639, "step": 49750 }, { "epoch": 74.83, "grad_norm": 6.379741668701172, "learning_rate": 2.517293233082707e-06, "loss": 0.2007, "step": 49760 }, { "epoch": 74.84, "grad_norm": 7.056103706359863, "learning_rate": 2.5157894736842107e-06, "loss": 0.2262, "step": 49770 }, { "epoch": 74.86, "grad_norm": 7.777561664581299, "learning_rate": 2.5142857142857147e-06, "loss": 0.2332, "step": 49780 }, { "epoch": 74.87, "grad_norm": 5.086285591125488, "learning_rate": 2.5127819548872186e-06, "loss": 0.2878, "step": 49790 }, { "epoch": 74.89, "grad_norm": 2.857342481613159, "learning_rate": 2.511278195488722e-06, "loss": 0.1854, "step": 49800 }, { "epoch": 74.9, "grad_norm": 2.6614773273468018, "learning_rate": 2.509774436090226e-06, "loss": 0.1806, "step": 49810 }, { "epoch": 74.92, "grad_norm": 6.267194747924805, "learning_rate": 2.5082706766917296e-06, "loss": 0.1506, "step": 49820 }, { "epoch": 74.93, "grad_norm": 7.167734622955322, "learning_rate": 2.5067669172932335e-06, "loss": 0.1308, "step": 49830 }, { "epoch": 74.95, "grad_norm": 3.8216676712036133, "learning_rate": 2.5052631578947375e-06, "loss": 0.1914, "step": 49840 }, { "epoch": 74.96, "grad_norm": 4.203033447265625, "learning_rate": 2.503759398496241e-06, "loss": 0.1303, "step": 49850 }, { "epoch": 74.98, "grad_norm": 2.4965991973876953, "learning_rate": 2.502255639097745e-06, "loss": 0.1785, "step": 49860 }, { "epoch": 74.99, "grad_norm": 1.2828130722045898, "learning_rate": 2.5007518796992484e-06, "loss": 0.1423, "step": 49870 }, { "epoch": 75.0, "eval_accuracy": 0.9299, "eval_loss": 0.3285817503929138, "eval_runtime": 84.9307, "eval_samples_per_second": 117.743, "eval_steps_per_second": 0.471, "step": 49875 }, { "epoch": 75.01, "grad_norm": 3.812678337097168, "learning_rate": 2.499248120300752e-06, "loss": 0.1716, "step": 49880 }, { "epoch": 75.02, "grad_norm": 1.7645814418792725, "learning_rate": 2.497744360902256e-06, "loss": 0.1242, "step": 49890 }, { "epoch": 75.04, "grad_norm": 3.069406509399414, "learning_rate": 2.4962406015037594e-06, "loss": 0.1708, "step": 49900 }, { "epoch": 75.05, "grad_norm": 4.357547283172607, "learning_rate": 2.4947368421052634e-06, "loss": 0.1933, "step": 49910 }, { "epoch": 75.07, "grad_norm": 6.222829818725586, "learning_rate": 2.4932330827067673e-06, "loss": 0.1914, "step": 49920 }, { "epoch": 75.08, "grad_norm": 5.128503322601318, "learning_rate": 2.491729323308271e-06, "loss": 0.2163, "step": 49930 }, { "epoch": 75.1, "grad_norm": 4.528236389160156, "learning_rate": 2.4902255639097748e-06, "loss": 0.1979, "step": 49940 }, { "epoch": 75.11, "grad_norm": 3.4620494842529297, "learning_rate": 2.4887218045112783e-06, "loss": 0.1922, "step": 49950 }, { "epoch": 75.13, "grad_norm": 7.054966449737549, "learning_rate": 2.4872180451127822e-06, "loss": 0.242, "step": 49960 }, { "epoch": 75.14, "grad_norm": 5.699782371520996, "learning_rate": 2.485714285714286e-06, "loss": 0.1489, "step": 49970 }, { "epoch": 75.16, "grad_norm": 6.1800408363342285, "learning_rate": 2.4842105263157897e-06, "loss": 0.1506, "step": 49980 }, { "epoch": 75.17, "grad_norm": 2.10766863822937, "learning_rate": 2.4827067669172936e-06, "loss": 0.1478, "step": 49990 }, { "epoch": 75.19, "grad_norm": 3.423696279525757, "learning_rate": 2.481203007518797e-06, "loss": 0.1526, "step": 50000 }, { "epoch": 75.2, "grad_norm": 3.7721095085144043, "learning_rate": 2.4796992481203007e-06, "loss": 0.2094, "step": 50010 }, { "epoch": 75.22, "grad_norm": 5.331075668334961, "learning_rate": 2.4781954887218046e-06, "loss": 0.1597, "step": 50020 }, { "epoch": 75.23, "grad_norm": 6.622517108917236, "learning_rate": 2.4766917293233086e-06, "loss": 0.2322, "step": 50030 }, { "epoch": 75.25, "grad_norm": 6.970418930053711, "learning_rate": 2.475187969924812e-06, "loss": 0.1812, "step": 50040 }, { "epoch": 75.26, "grad_norm": 7.753775119781494, "learning_rate": 2.473684210526316e-06, "loss": 0.1515, "step": 50050 }, { "epoch": 75.28, "grad_norm": 13.116275787353516, "learning_rate": 2.4721804511278195e-06, "loss": 0.1887, "step": 50060 }, { "epoch": 75.29, "grad_norm": 4.448867321014404, "learning_rate": 2.4706766917293235e-06, "loss": 0.1433, "step": 50070 }, { "epoch": 75.31, "grad_norm": 3.7794928550720215, "learning_rate": 2.4691729323308274e-06, "loss": 0.1493, "step": 50080 }, { "epoch": 75.32, "grad_norm": 5.0668416023254395, "learning_rate": 2.467669172932331e-06, "loss": 0.1536, "step": 50090 }, { "epoch": 75.34, "grad_norm": 3.405148983001709, "learning_rate": 2.466165413533835e-06, "loss": 0.222, "step": 50100 }, { "epoch": 75.35, "grad_norm": 5.900206565856934, "learning_rate": 2.4646616541353384e-06, "loss": 0.1388, "step": 50110 }, { "epoch": 75.37, "grad_norm": 4.416190147399902, "learning_rate": 2.4631578947368424e-06, "loss": 0.1744, "step": 50120 }, { "epoch": 75.38, "grad_norm": 7.959252834320068, "learning_rate": 2.461654135338346e-06, "loss": 0.2386, "step": 50130 }, { "epoch": 75.4, "grad_norm": 4.957620143890381, "learning_rate": 2.46015037593985e-06, "loss": 0.1717, "step": 50140 }, { "epoch": 75.41, "grad_norm": 6.785633087158203, "learning_rate": 2.4586466165413538e-06, "loss": 0.1783, "step": 50150 }, { "epoch": 75.43, "grad_norm": 8.179920196533203, "learning_rate": 2.4571428571428573e-06, "loss": 0.2003, "step": 50160 }, { "epoch": 75.44, "grad_norm": 2.4973435401916504, "learning_rate": 2.4556390977443612e-06, "loss": 0.1877, "step": 50170 }, { "epoch": 75.46, "grad_norm": 4.815390586853027, "learning_rate": 2.4541353383458647e-06, "loss": 0.26, "step": 50180 }, { "epoch": 75.47, "grad_norm": 6.23560905456543, "learning_rate": 2.4526315789473687e-06, "loss": 0.1683, "step": 50190 }, { "epoch": 75.49, "grad_norm": 0.35644736886024475, "learning_rate": 2.4511278195488726e-06, "loss": 0.1489, "step": 50200 }, { "epoch": 75.5, "grad_norm": 5.562798500061035, "learning_rate": 2.449624060150376e-06, "loss": 0.2343, "step": 50210 }, { "epoch": 75.52, "grad_norm": 6.464638710021973, "learning_rate": 2.44812030075188e-06, "loss": 0.2042, "step": 50220 }, { "epoch": 75.53, "grad_norm": 3.009085178375244, "learning_rate": 2.4466165413533836e-06, "loss": 0.1565, "step": 50230 }, { "epoch": 75.55, "grad_norm": 3.3092687129974365, "learning_rate": 2.4451127819548875e-06, "loss": 0.1583, "step": 50240 }, { "epoch": 75.56, "grad_norm": 1.3774584531784058, "learning_rate": 2.443609022556391e-06, "loss": 0.1763, "step": 50250 }, { "epoch": 75.58, "grad_norm": 4.398240089416504, "learning_rate": 2.442105263157895e-06, "loss": 0.2157, "step": 50260 }, { "epoch": 75.59, "grad_norm": 2.5172154903411865, "learning_rate": 2.4406015037593985e-06, "loss": 0.2076, "step": 50270 }, { "epoch": 75.61, "grad_norm": 5.356011867523193, "learning_rate": 2.4390977443609025e-06, "loss": 0.1441, "step": 50280 }, { "epoch": 75.62, "grad_norm": 5.028241157531738, "learning_rate": 2.437593984962406e-06, "loss": 0.1921, "step": 50290 }, { "epoch": 75.64, "grad_norm": 5.889922142028809, "learning_rate": 2.43609022556391e-06, "loss": 0.1315, "step": 50300 }, { "epoch": 75.65, "grad_norm": 6.243462562561035, "learning_rate": 2.434586466165414e-06, "loss": 0.2106, "step": 50310 }, { "epoch": 75.67, "grad_norm": 6.075836181640625, "learning_rate": 2.4330827067669174e-06, "loss": 0.2055, "step": 50320 }, { "epoch": 75.68, "grad_norm": 5.1835150718688965, "learning_rate": 2.4315789473684213e-06, "loss": 0.2298, "step": 50330 }, { "epoch": 75.7, "grad_norm": 4.660686016082764, "learning_rate": 2.430075187969925e-06, "loss": 0.173, "step": 50340 }, { "epoch": 75.71, "grad_norm": 3.6795620918273926, "learning_rate": 2.428571428571429e-06, "loss": 0.1852, "step": 50350 }, { "epoch": 75.73, "grad_norm": 4.784815788269043, "learning_rate": 2.4270676691729323e-06, "loss": 0.1739, "step": 50360 }, { "epoch": 75.74, "grad_norm": 6.113933563232422, "learning_rate": 2.4255639097744363e-06, "loss": 0.2325, "step": 50370 }, { "epoch": 75.76, "grad_norm": 3.9142940044403076, "learning_rate": 2.42406015037594e-06, "loss": 0.1929, "step": 50380 }, { "epoch": 75.77, "grad_norm": 3.883265256881714, "learning_rate": 2.4225563909774437e-06, "loss": 0.2146, "step": 50390 }, { "epoch": 75.79, "grad_norm": 10.637557029724121, "learning_rate": 2.4210526315789477e-06, "loss": 0.2488, "step": 50400 }, { "epoch": 75.8, "grad_norm": 3.309053659439087, "learning_rate": 2.419548872180451e-06, "loss": 0.2057, "step": 50410 }, { "epoch": 75.82, "grad_norm": 7.711801052093506, "learning_rate": 2.418045112781955e-06, "loss": 0.2042, "step": 50420 }, { "epoch": 75.83, "grad_norm": 3.687312602996826, "learning_rate": 2.416541353383459e-06, "loss": 0.1693, "step": 50430 }, { "epoch": 75.85, "grad_norm": 5.525257587432861, "learning_rate": 2.4150375939849626e-06, "loss": 0.2097, "step": 50440 }, { "epoch": 75.86, "grad_norm": 4.86057186126709, "learning_rate": 2.4135338345864665e-06, "loss": 0.1844, "step": 50450 }, { "epoch": 75.88, "grad_norm": 2.9280784130096436, "learning_rate": 2.41203007518797e-06, "loss": 0.13, "step": 50460 }, { "epoch": 75.89, "grad_norm": 2.978746175765991, "learning_rate": 2.410526315789474e-06, "loss": 0.1435, "step": 50470 }, { "epoch": 75.91, "grad_norm": 7.612166404724121, "learning_rate": 2.409022556390978e-06, "loss": 0.1784, "step": 50480 }, { "epoch": 75.92, "grad_norm": 5.406614303588867, "learning_rate": 2.4075187969924814e-06, "loss": 0.2408, "step": 50490 }, { "epoch": 75.94, "grad_norm": 7.032416820526123, "learning_rate": 2.406015037593985e-06, "loss": 0.1511, "step": 50500 }, { "epoch": 75.95, "grad_norm": 6.901172161102295, "learning_rate": 2.404511278195489e-06, "loss": 0.2211, "step": 50510 }, { "epoch": 75.97, "grad_norm": 5.923864841461182, "learning_rate": 2.4030075187969924e-06, "loss": 0.18, "step": 50520 }, { "epoch": 75.98, "grad_norm": 7.222829341888428, "learning_rate": 2.4015037593984964e-06, "loss": 0.2284, "step": 50530 }, { "epoch": 76.0, "grad_norm": 0.01305259671062231, "learning_rate": 2.4000000000000003e-06, "loss": 0.1953, "step": 50540 }, { "epoch": 76.0, "eval_accuracy": 0.9307, "eval_loss": 0.3337118625640869, "eval_runtime": 84.2661, "eval_samples_per_second": 118.672, "eval_steps_per_second": 0.475, "step": 50540 }, { "epoch": 76.02, "grad_norm": 5.751012802124023, "learning_rate": 2.398496240601504e-06, "loss": 0.1967, "step": 50550 }, { "epoch": 76.03, "grad_norm": 4.3412089347839355, "learning_rate": 2.3969924812030078e-06, "loss": 0.1634, "step": 50560 }, { "epoch": 76.05, "grad_norm": 5.5192155838012695, "learning_rate": 2.3954887218045113e-06, "loss": 0.1627, "step": 50570 }, { "epoch": 76.06, "grad_norm": 12.006901741027832, "learning_rate": 2.3939849624060152e-06, "loss": 0.2048, "step": 50580 }, { "epoch": 76.08, "grad_norm": 3.821943521499634, "learning_rate": 2.3924812030075188e-06, "loss": 0.1681, "step": 50590 }, { "epoch": 76.09, "grad_norm": 7.498505115509033, "learning_rate": 2.3909774436090227e-06, "loss": 0.2294, "step": 50600 }, { "epoch": 76.11, "grad_norm": 1.650476336479187, "learning_rate": 2.3894736842105266e-06, "loss": 0.1929, "step": 50610 }, { "epoch": 76.12, "grad_norm": 3.873107671737671, "learning_rate": 2.38796992481203e-06, "loss": 0.1953, "step": 50620 }, { "epoch": 76.14, "grad_norm": 5.848465442657471, "learning_rate": 2.386466165413534e-06, "loss": 0.2391, "step": 50630 }, { "epoch": 76.15, "grad_norm": 10.972421646118164, "learning_rate": 2.3849624060150376e-06, "loss": 0.1747, "step": 50640 }, { "epoch": 76.17, "grad_norm": 3.4140889644622803, "learning_rate": 2.3834586466165416e-06, "loss": 0.1541, "step": 50650 }, { "epoch": 76.18, "grad_norm": 4.576592922210693, "learning_rate": 2.3819548872180455e-06, "loss": 0.2146, "step": 50660 }, { "epoch": 76.2, "grad_norm": 7.381382465362549, "learning_rate": 2.380451127819549e-06, "loss": 0.1912, "step": 50670 }, { "epoch": 76.21, "grad_norm": 7.457504749298096, "learning_rate": 2.378947368421053e-06, "loss": 0.1681, "step": 50680 }, { "epoch": 76.23, "grad_norm": 10.005772590637207, "learning_rate": 2.3774436090225565e-06, "loss": 0.2002, "step": 50690 }, { "epoch": 76.24, "grad_norm": 8.001717567443848, "learning_rate": 2.3759398496240604e-06, "loss": 0.2091, "step": 50700 }, { "epoch": 76.26, "grad_norm": 4.301360130310059, "learning_rate": 2.3744360902255644e-06, "loss": 0.1585, "step": 50710 }, { "epoch": 76.27, "grad_norm": 2.692290782928467, "learning_rate": 2.372932330827068e-06, "loss": 0.1992, "step": 50720 }, { "epoch": 76.29, "grad_norm": 7.287367820739746, "learning_rate": 2.371428571428572e-06, "loss": 0.2296, "step": 50730 }, { "epoch": 76.3, "grad_norm": 5.782092094421387, "learning_rate": 2.3699248120300754e-06, "loss": 0.1614, "step": 50740 }, { "epoch": 76.32, "grad_norm": 1.4290227890014648, "learning_rate": 2.368421052631579e-06, "loss": 0.1392, "step": 50750 }, { "epoch": 76.33, "grad_norm": 5.241323471069336, "learning_rate": 2.366917293233083e-06, "loss": 0.119, "step": 50760 }, { "epoch": 76.35, "grad_norm": 5.544551849365234, "learning_rate": 2.3654135338345868e-06, "loss": 0.1617, "step": 50770 }, { "epoch": 76.36, "grad_norm": 4.271751880645752, "learning_rate": 2.3639097744360903e-06, "loss": 0.1435, "step": 50780 }, { "epoch": 76.38, "grad_norm": 3.508246660232544, "learning_rate": 2.3624060150375942e-06, "loss": 0.1811, "step": 50790 }, { "epoch": 76.39, "grad_norm": 6.839121341705322, "learning_rate": 2.3609022556390977e-06, "loss": 0.1943, "step": 50800 }, { "epoch": 76.41, "grad_norm": 6.446866512298584, "learning_rate": 2.3593984962406017e-06, "loss": 0.2103, "step": 50810 }, { "epoch": 76.42, "grad_norm": 3.7222402095794678, "learning_rate": 2.357894736842105e-06, "loss": 0.1546, "step": 50820 }, { "epoch": 76.44, "grad_norm": 9.640228271484375, "learning_rate": 2.356390977443609e-06, "loss": 0.2003, "step": 50830 }, { "epoch": 76.45, "grad_norm": 5.100734710693359, "learning_rate": 2.354887218045113e-06, "loss": 0.1579, "step": 50840 }, { "epoch": 76.47, "grad_norm": 2.1371684074401855, "learning_rate": 2.3533834586466166e-06, "loss": 0.1618, "step": 50850 }, { "epoch": 76.48, "grad_norm": 7.091713905334473, "learning_rate": 2.3518796992481205e-06, "loss": 0.1943, "step": 50860 }, { "epoch": 76.5, "grad_norm": 5.84473180770874, "learning_rate": 2.350375939849624e-06, "loss": 0.15, "step": 50870 }, { "epoch": 76.51, "grad_norm": 4.024866104125977, "learning_rate": 2.348872180451128e-06, "loss": 0.166, "step": 50880 }, { "epoch": 76.53, "grad_norm": 5.06535530090332, "learning_rate": 2.347368421052632e-06, "loss": 0.1813, "step": 50890 }, { "epoch": 76.54, "grad_norm": 5.515821933746338, "learning_rate": 2.3458646616541355e-06, "loss": 0.1672, "step": 50900 }, { "epoch": 76.56, "grad_norm": 5.713620185852051, "learning_rate": 2.3443609022556394e-06, "loss": 0.2043, "step": 50910 }, { "epoch": 76.57, "grad_norm": 4.078904151916504, "learning_rate": 2.342857142857143e-06, "loss": 0.1606, "step": 50920 }, { "epoch": 76.59, "grad_norm": 5.5044660568237305, "learning_rate": 2.341353383458647e-06, "loss": 0.1721, "step": 50930 }, { "epoch": 76.6, "grad_norm": 6.557012557983398, "learning_rate": 2.339849624060151e-06, "loss": 0.1614, "step": 50940 }, { "epoch": 76.62, "grad_norm": 2.52939772605896, "learning_rate": 2.3383458646616543e-06, "loss": 0.1872, "step": 50950 }, { "epoch": 76.63, "grad_norm": 1.881983757019043, "learning_rate": 2.3368421052631583e-06, "loss": 0.1311, "step": 50960 }, { "epoch": 76.65, "grad_norm": 3.7978851795196533, "learning_rate": 2.335338345864662e-06, "loss": 0.213, "step": 50970 }, { "epoch": 76.66, "grad_norm": 6.723447799682617, "learning_rate": 2.3338345864661657e-06, "loss": 0.1595, "step": 50980 }, { "epoch": 76.68, "grad_norm": 4.094229698181152, "learning_rate": 2.3323308270676697e-06, "loss": 0.1888, "step": 50990 }, { "epoch": 76.69, "grad_norm": 5.497591495513916, "learning_rate": 2.330827067669173e-06, "loss": 0.1733, "step": 51000 }, { "epoch": 76.71, "grad_norm": 6.945542812347412, "learning_rate": 2.3293233082706767e-06, "loss": 0.225, "step": 51010 }, { "epoch": 76.72, "grad_norm": 7.769118309020996, "learning_rate": 2.3278195488721807e-06, "loss": 0.2114, "step": 51020 }, { "epoch": 76.74, "grad_norm": 5.106688976287842, "learning_rate": 2.326315789473684e-06, "loss": 0.1166, "step": 51030 }, { "epoch": 76.75, "grad_norm": 7.118248462677002, "learning_rate": 2.324812030075188e-06, "loss": 0.1807, "step": 51040 }, { "epoch": 76.77, "grad_norm": 4.859402656555176, "learning_rate": 2.3233082706766916e-06, "loss": 0.217, "step": 51050 }, { "epoch": 76.78, "grad_norm": 4.8244829177856445, "learning_rate": 2.3218045112781956e-06, "loss": 0.1698, "step": 51060 }, { "epoch": 76.8, "grad_norm": 4.1632914543151855, "learning_rate": 2.3203007518796995e-06, "loss": 0.1531, "step": 51070 }, { "epoch": 76.81, "grad_norm": 5.575042247772217, "learning_rate": 2.318796992481203e-06, "loss": 0.1539, "step": 51080 }, { "epoch": 76.83, "grad_norm": 4.370471000671387, "learning_rate": 2.317293233082707e-06, "loss": 0.1313, "step": 51090 }, { "epoch": 76.84, "grad_norm": 4.721175193786621, "learning_rate": 2.3157894736842105e-06, "loss": 0.1698, "step": 51100 }, { "epoch": 76.86, "grad_norm": 6.6235857009887695, "learning_rate": 2.3142857142857145e-06, "loss": 0.2313, "step": 51110 }, { "epoch": 76.87, "grad_norm": 15.119837760925293, "learning_rate": 2.3127819548872184e-06, "loss": 0.163, "step": 51120 }, { "epoch": 76.89, "grad_norm": 2.5521862506866455, "learning_rate": 2.311278195488722e-06, "loss": 0.2031, "step": 51130 }, { "epoch": 76.9, "grad_norm": 8.849630355834961, "learning_rate": 2.309774436090226e-06, "loss": 0.182, "step": 51140 }, { "epoch": 76.92, "grad_norm": 6.023241996765137, "learning_rate": 2.3082706766917294e-06, "loss": 0.1807, "step": 51150 }, { "epoch": 76.93, "grad_norm": 6.313910007476807, "learning_rate": 2.3067669172932333e-06, "loss": 0.1622, "step": 51160 }, { "epoch": 76.95, "grad_norm": 7.0026116371154785, "learning_rate": 2.3052631578947373e-06, "loss": 0.1677, "step": 51170 }, { "epoch": 76.96, "grad_norm": 5.539087772369385, "learning_rate": 2.3037593984962408e-06, "loss": 0.1771, "step": 51180 }, { "epoch": 76.98, "grad_norm": 2.8735461235046387, "learning_rate": 2.3022556390977447e-06, "loss": 0.1641, "step": 51190 }, { "epoch": 76.99, "grad_norm": 4.759913921356201, "learning_rate": 2.3007518796992482e-06, "loss": 0.1599, "step": 51200 }, { "epoch": 77.0, "eval_accuracy": 0.9313, "eval_loss": 0.3295079469680786, "eval_runtime": 84.3369, "eval_samples_per_second": 118.572, "eval_steps_per_second": 0.474, "step": 51205 }, { "epoch": 77.01, "grad_norm": 3.568769693374634, "learning_rate": 2.299248120300752e-06, "loss": 0.1626, "step": 51210 }, { "epoch": 77.02, "grad_norm": 6.137712478637695, "learning_rate": 2.297744360902256e-06, "loss": 0.1651, "step": 51220 }, { "epoch": 77.04, "grad_norm": 2.2381041049957275, "learning_rate": 2.2962406015037596e-06, "loss": 0.1933, "step": 51230 }, { "epoch": 77.05, "grad_norm": 6.399120330810547, "learning_rate": 2.294736842105263e-06, "loss": 0.1864, "step": 51240 }, { "epoch": 77.07, "grad_norm": 5.5201334953308105, "learning_rate": 2.293233082706767e-06, "loss": 0.2318, "step": 51250 }, { "epoch": 77.08, "grad_norm": 6.623318195343018, "learning_rate": 2.2917293233082706e-06, "loss": 0.2449, "step": 51260 }, { "epoch": 77.1, "grad_norm": 5.644393444061279, "learning_rate": 2.2902255639097746e-06, "loss": 0.2073, "step": 51270 }, { "epoch": 77.11, "grad_norm": 5.483371257781982, "learning_rate": 2.288721804511278e-06, "loss": 0.185, "step": 51280 }, { "epoch": 77.13, "grad_norm": 5.581244468688965, "learning_rate": 2.287218045112782e-06, "loss": 0.2171, "step": 51290 }, { "epoch": 77.14, "grad_norm": 7.645112991333008, "learning_rate": 2.285714285714286e-06, "loss": 0.2034, "step": 51300 }, { "epoch": 77.16, "grad_norm": 5.600865364074707, "learning_rate": 2.2842105263157895e-06, "loss": 0.2298, "step": 51310 }, { "epoch": 77.17, "grad_norm": 8.792081832885742, "learning_rate": 2.2827067669172934e-06, "loss": 0.1311, "step": 51320 }, { "epoch": 77.19, "grad_norm": 4.581193447113037, "learning_rate": 2.281203007518797e-06, "loss": 0.1835, "step": 51330 }, { "epoch": 77.2, "grad_norm": 3.9883158206939697, "learning_rate": 2.279699248120301e-06, "loss": 0.1439, "step": 51340 }, { "epoch": 77.22, "grad_norm": 3.542494535446167, "learning_rate": 2.278195488721805e-06, "loss": 0.1538, "step": 51350 }, { "epoch": 77.23, "grad_norm": 3.4795663356781006, "learning_rate": 2.2766917293233084e-06, "loss": 0.1669, "step": 51360 }, { "epoch": 77.25, "grad_norm": 5.9982194900512695, "learning_rate": 2.2751879699248123e-06, "loss": 0.1479, "step": 51370 }, { "epoch": 77.26, "grad_norm": 4.513314247131348, "learning_rate": 2.273684210526316e-06, "loss": 0.2073, "step": 51380 }, { "epoch": 77.28, "grad_norm": 3.709369421005249, "learning_rate": 2.2721804511278198e-06, "loss": 0.1449, "step": 51390 }, { "epoch": 77.29, "grad_norm": 4.331890106201172, "learning_rate": 2.2706766917293237e-06, "loss": 0.1677, "step": 51400 }, { "epoch": 77.31, "grad_norm": 3.835942506790161, "learning_rate": 2.2691729323308272e-06, "loss": 0.1495, "step": 51410 }, { "epoch": 77.32, "grad_norm": 4.771125316619873, "learning_rate": 2.267669172932331e-06, "loss": 0.1771, "step": 51420 }, { "epoch": 77.34, "grad_norm": 2.807570695877075, "learning_rate": 2.2661654135338347e-06, "loss": 0.1593, "step": 51430 }, { "epoch": 77.35, "grad_norm": 2.986487627029419, "learning_rate": 2.2646616541353386e-06, "loss": 0.1418, "step": 51440 }, { "epoch": 77.37, "grad_norm": 3.4816768169403076, "learning_rate": 2.2631578947368426e-06, "loss": 0.1787, "step": 51450 }, { "epoch": 77.38, "grad_norm": 4.014631271362305, "learning_rate": 2.261654135338346e-06, "loss": 0.1266, "step": 51460 }, { "epoch": 77.4, "grad_norm": 6.388309955596924, "learning_rate": 2.26015037593985e-06, "loss": 0.2472, "step": 51470 }, { "epoch": 77.41, "grad_norm": 3.9746556282043457, "learning_rate": 2.2586466165413536e-06, "loss": 0.2174, "step": 51480 }, { "epoch": 77.43, "grad_norm": 5.2674455642700195, "learning_rate": 2.257142857142857e-06, "loss": 0.1419, "step": 51490 }, { "epoch": 77.44, "grad_norm": 4.513287544250488, "learning_rate": 2.255639097744361e-06, "loss": 0.1721, "step": 51500 }, { "epoch": 77.46, "grad_norm": 7.286464691162109, "learning_rate": 2.2541353383458645e-06, "loss": 0.1816, "step": 51510 }, { "epoch": 77.47, "grad_norm": 3.867978572845459, "learning_rate": 2.2526315789473685e-06, "loss": 0.1384, "step": 51520 }, { "epoch": 77.49, "grad_norm": 0.8837634921073914, "learning_rate": 2.2511278195488724e-06, "loss": 0.1681, "step": 51530 }, { "epoch": 77.5, "grad_norm": 8.97939395904541, "learning_rate": 2.249624060150376e-06, "loss": 0.1419, "step": 51540 }, { "epoch": 77.52, "grad_norm": 3.708096742630005, "learning_rate": 2.24812030075188e-06, "loss": 0.1476, "step": 51550 }, { "epoch": 77.53, "grad_norm": 3.5969924926757812, "learning_rate": 2.2466165413533834e-06, "loss": 0.188, "step": 51560 }, { "epoch": 77.55, "grad_norm": 2.7595226764678955, "learning_rate": 2.2451127819548873e-06, "loss": 0.2098, "step": 51570 }, { "epoch": 77.56, "grad_norm": 5.377525329589844, "learning_rate": 2.2436090225563913e-06, "loss": 0.2238, "step": 51580 }, { "epoch": 77.58, "grad_norm": 8.016778945922852, "learning_rate": 2.242105263157895e-06, "loss": 0.1233, "step": 51590 }, { "epoch": 77.59, "grad_norm": 4.480692386627197, "learning_rate": 2.2406015037593987e-06, "loss": 0.2026, "step": 51600 }, { "epoch": 77.61, "grad_norm": 6.609382629394531, "learning_rate": 2.2390977443609023e-06, "loss": 0.1878, "step": 51610 }, { "epoch": 77.62, "grad_norm": 6.064496994018555, "learning_rate": 2.237593984962406e-06, "loss": 0.2227, "step": 51620 }, { "epoch": 77.64, "grad_norm": 2.538266181945801, "learning_rate": 2.23609022556391e-06, "loss": 0.1967, "step": 51630 }, { "epoch": 77.65, "grad_norm": 5.404106140136719, "learning_rate": 2.2345864661654137e-06, "loss": 0.254, "step": 51640 }, { "epoch": 77.67, "grad_norm": 5.271445274353027, "learning_rate": 2.2330827067669176e-06, "loss": 0.132, "step": 51650 }, { "epoch": 77.68, "grad_norm": 4.448204040527344, "learning_rate": 2.231578947368421e-06, "loss": 0.1674, "step": 51660 }, { "epoch": 77.7, "grad_norm": 8.482146263122559, "learning_rate": 2.230075187969925e-06, "loss": 0.2233, "step": 51670 }, { "epoch": 77.71, "grad_norm": 3.992495536804199, "learning_rate": 2.228571428571429e-06, "loss": 0.141, "step": 51680 }, { "epoch": 77.73, "grad_norm": 9.992817878723145, "learning_rate": 2.2270676691729325e-06, "loss": 0.2356, "step": 51690 }, { "epoch": 77.74, "grad_norm": 3.759432554244995, "learning_rate": 2.2255639097744365e-06, "loss": 0.1199, "step": 51700 }, { "epoch": 77.76, "grad_norm": 4.334221363067627, "learning_rate": 2.22406015037594e-06, "loss": 0.1971, "step": 51710 }, { "epoch": 77.77, "grad_norm": 1.510976791381836, "learning_rate": 2.222556390977444e-06, "loss": 0.1708, "step": 51720 }, { "epoch": 77.79, "grad_norm": 5.777114391326904, "learning_rate": 2.221052631578948e-06, "loss": 0.2107, "step": 51730 }, { "epoch": 77.8, "grad_norm": 2.896449327468872, "learning_rate": 2.219548872180451e-06, "loss": 0.1942, "step": 51740 }, { "epoch": 77.82, "grad_norm": 17.983856201171875, "learning_rate": 2.218045112781955e-06, "loss": 0.1846, "step": 51750 }, { "epoch": 77.83, "grad_norm": 7.573366165161133, "learning_rate": 2.216541353383459e-06, "loss": 0.1972, "step": 51760 }, { "epoch": 77.85, "grad_norm": 6.8880109786987305, "learning_rate": 2.2150375939849624e-06, "loss": 0.18, "step": 51770 }, { "epoch": 77.86, "grad_norm": 2.3339192867279053, "learning_rate": 2.2135338345864663e-06, "loss": 0.1842, "step": 51780 }, { "epoch": 77.88, "grad_norm": 3.973484754562378, "learning_rate": 2.21203007518797e-06, "loss": 0.2218, "step": 51790 }, { "epoch": 77.89, "grad_norm": 3.132448673248291, "learning_rate": 2.2105263157894738e-06, "loss": 0.2052, "step": 51800 }, { "epoch": 77.91, "grad_norm": 5.713258743286133, "learning_rate": 2.2090225563909777e-06, "loss": 0.1334, "step": 51810 }, { "epoch": 77.92, "grad_norm": 4.339270114898682, "learning_rate": 2.2075187969924812e-06, "loss": 0.2167, "step": 51820 }, { "epoch": 77.94, "grad_norm": 1.3507264852523804, "learning_rate": 2.206015037593985e-06, "loss": 0.2518, "step": 51830 }, { "epoch": 77.95, "grad_norm": 7.650753974914551, "learning_rate": 2.2045112781954887e-06, "loss": 0.2293, "step": 51840 }, { "epoch": 77.97, "grad_norm": 5.412219524383545, "learning_rate": 2.2030075187969927e-06, "loss": 0.1966, "step": 51850 }, { "epoch": 77.98, "grad_norm": 7.676565647125244, "learning_rate": 2.2015037593984966e-06, "loss": 0.2069, "step": 51860 }, { "epoch": 78.0, "grad_norm": 0.12734095752239227, "learning_rate": 2.2e-06, "loss": 0.2077, "step": 51870 }, { "epoch": 78.0, "eval_accuracy": 0.9312, "eval_loss": 0.32849153876304626, "eval_runtime": 85.6428, "eval_samples_per_second": 116.764, "eval_steps_per_second": 0.467, "step": 51870 }, { "epoch": 78.02, "grad_norm": 3.1705241203308105, "learning_rate": 2.198496240601504e-06, "loss": 0.1415, "step": 51880 }, { "epoch": 78.03, "grad_norm": 4.243330955505371, "learning_rate": 2.1969924812030076e-06, "loss": 0.1765, "step": 51890 }, { "epoch": 78.05, "grad_norm": 7.999730110168457, "learning_rate": 2.1954887218045115e-06, "loss": 0.2041, "step": 51900 }, { "epoch": 78.06, "grad_norm": 4.726391792297363, "learning_rate": 2.1939849624060155e-06, "loss": 0.2337, "step": 51910 }, { "epoch": 78.08, "grad_norm": 4.67405891418457, "learning_rate": 2.192481203007519e-06, "loss": 0.1231, "step": 51920 }, { "epoch": 78.09, "grad_norm": 6.686806678771973, "learning_rate": 2.190977443609023e-06, "loss": 0.1047, "step": 51930 }, { "epoch": 78.11, "grad_norm": 2.4821736812591553, "learning_rate": 2.1894736842105264e-06, "loss": 0.1818, "step": 51940 }, { "epoch": 78.12, "grad_norm": 5.056117057800293, "learning_rate": 2.1879699248120304e-06, "loss": 0.1644, "step": 51950 }, { "epoch": 78.14, "grad_norm": 5.994955062866211, "learning_rate": 2.1864661654135343e-06, "loss": 0.1509, "step": 51960 }, { "epoch": 78.15, "grad_norm": 4.456724643707275, "learning_rate": 2.184962406015038e-06, "loss": 0.1468, "step": 51970 }, { "epoch": 78.17, "grad_norm": 3.952169895172119, "learning_rate": 2.1834586466165418e-06, "loss": 0.1785, "step": 51980 }, { "epoch": 78.18, "grad_norm": 3.9694976806640625, "learning_rate": 2.1819548872180453e-06, "loss": 0.2603, "step": 51990 }, { "epoch": 78.2, "grad_norm": 4.302563190460205, "learning_rate": 2.180451127819549e-06, "loss": 0.1793, "step": 52000 }, { "epoch": 78.21, "grad_norm": 1.5013707876205444, "learning_rate": 2.1789473684210528e-06, "loss": 0.2053, "step": 52010 }, { "epoch": 78.23, "grad_norm": 4.439993858337402, "learning_rate": 2.1774436090225563e-06, "loss": 0.1417, "step": 52020 }, { "epoch": 78.24, "grad_norm": 4.53973388671875, "learning_rate": 2.1759398496240602e-06, "loss": 0.1573, "step": 52030 }, { "epoch": 78.26, "grad_norm": 6.344988822937012, "learning_rate": 2.174436090225564e-06, "loss": 0.1905, "step": 52040 }, { "epoch": 78.27, "grad_norm": 3.4215738773345947, "learning_rate": 2.1729323308270677e-06, "loss": 0.1587, "step": 52050 }, { "epoch": 78.29, "grad_norm": 5.529082298278809, "learning_rate": 2.1714285714285716e-06, "loss": 0.1935, "step": 52060 }, { "epoch": 78.3, "grad_norm": 2.7385475635528564, "learning_rate": 2.169924812030075e-06, "loss": 0.1677, "step": 52070 }, { "epoch": 78.32, "grad_norm": 2.511803150177002, "learning_rate": 2.168421052631579e-06, "loss": 0.2112, "step": 52080 }, { "epoch": 78.33, "grad_norm": 4.103121757507324, "learning_rate": 2.166917293233083e-06, "loss": 0.1629, "step": 52090 }, { "epoch": 78.35, "grad_norm": 4.167917728424072, "learning_rate": 2.1654135338345866e-06, "loss": 0.1722, "step": 52100 }, { "epoch": 78.36, "grad_norm": 3.2801246643066406, "learning_rate": 2.1639097744360905e-06, "loss": 0.2039, "step": 52110 }, { "epoch": 78.38, "grad_norm": 4.19163703918457, "learning_rate": 2.162406015037594e-06, "loss": 0.181, "step": 52120 }, { "epoch": 78.39, "grad_norm": 3.487081527709961, "learning_rate": 2.160902255639098e-06, "loss": 0.1752, "step": 52130 }, { "epoch": 78.41, "grad_norm": 2.9936952590942383, "learning_rate": 2.159398496240602e-06, "loss": 0.2173, "step": 52140 }, { "epoch": 78.42, "grad_norm": 4.499261379241943, "learning_rate": 2.1578947368421054e-06, "loss": 0.1907, "step": 52150 }, { "epoch": 78.44, "grad_norm": 5.857420921325684, "learning_rate": 2.1563909774436094e-06, "loss": 0.1689, "step": 52160 }, { "epoch": 78.45, "grad_norm": 4.8537468910217285, "learning_rate": 2.154887218045113e-06, "loss": 0.1216, "step": 52170 }, { "epoch": 78.47, "grad_norm": 6.32722806930542, "learning_rate": 2.153383458646617e-06, "loss": 0.2262, "step": 52180 }, { "epoch": 78.48, "grad_norm": 11.071094512939453, "learning_rate": 2.1518796992481208e-06, "loss": 0.1113, "step": 52190 }, { "epoch": 78.5, "grad_norm": 5.603166580200195, "learning_rate": 2.1503759398496243e-06, "loss": 0.167, "step": 52200 }, { "epoch": 78.51, "grad_norm": 5.128636837005615, "learning_rate": 2.1488721804511282e-06, "loss": 0.1974, "step": 52210 }, { "epoch": 78.53, "grad_norm": 2.067934036254883, "learning_rate": 2.1473684210526317e-06, "loss": 0.1654, "step": 52220 }, { "epoch": 78.54, "grad_norm": 4.146607398986816, "learning_rate": 2.1458646616541357e-06, "loss": 0.1807, "step": 52230 }, { "epoch": 78.56, "grad_norm": 2.6241300106048584, "learning_rate": 2.144360902255639e-06, "loss": 0.1445, "step": 52240 }, { "epoch": 78.57, "grad_norm": 4.428233623504639, "learning_rate": 2.1428571428571427e-06, "loss": 0.1427, "step": 52250 }, { "epoch": 78.59, "grad_norm": 4.425225734710693, "learning_rate": 2.1413533834586467e-06, "loss": 0.1506, "step": 52260 }, { "epoch": 78.6, "grad_norm": 2.6560704708099365, "learning_rate": 2.1398496240601506e-06, "loss": 0.1788, "step": 52270 }, { "epoch": 78.62, "grad_norm": 3.350560188293457, "learning_rate": 2.138345864661654e-06, "loss": 0.1513, "step": 52280 }, { "epoch": 78.63, "grad_norm": 3.5324301719665527, "learning_rate": 2.136842105263158e-06, "loss": 0.1701, "step": 52290 }, { "epoch": 78.65, "grad_norm": 4.827585697174072, "learning_rate": 2.1353383458646616e-06, "loss": 0.2025, "step": 52300 }, { "epoch": 78.66, "grad_norm": 0.8045023679733276, "learning_rate": 2.1338345864661655e-06, "loss": 0.1426, "step": 52310 }, { "epoch": 78.68, "grad_norm": 4.572910308837891, "learning_rate": 2.1323308270676695e-06, "loss": 0.1788, "step": 52320 }, { "epoch": 78.69, "grad_norm": 5.141964435577393, "learning_rate": 2.130827067669173e-06, "loss": 0.191, "step": 52330 }, { "epoch": 78.71, "grad_norm": 16.9417667388916, "learning_rate": 2.129323308270677e-06, "loss": 0.1658, "step": 52340 }, { "epoch": 78.72, "grad_norm": 7.1351399421691895, "learning_rate": 2.1278195488721805e-06, "loss": 0.1802, "step": 52350 }, { "epoch": 78.74, "grad_norm": 5.015010833740234, "learning_rate": 2.1263157894736844e-06, "loss": 0.227, "step": 52360 }, { "epoch": 78.75, "grad_norm": 4.296870231628418, "learning_rate": 2.1248120300751883e-06, "loss": 0.1983, "step": 52370 }, { "epoch": 78.77, "grad_norm": 6.891214370727539, "learning_rate": 2.123308270676692e-06, "loss": 0.2017, "step": 52380 }, { "epoch": 78.78, "grad_norm": 4.369607925415039, "learning_rate": 2.121804511278196e-06, "loss": 0.1861, "step": 52390 }, { "epoch": 78.8, "grad_norm": 7.993550777435303, "learning_rate": 2.1203007518796993e-06, "loss": 0.1986, "step": 52400 }, { "epoch": 78.81, "grad_norm": 5.03591251373291, "learning_rate": 2.1187969924812033e-06, "loss": 0.1679, "step": 52410 }, { "epoch": 78.83, "grad_norm": 3.6212151050567627, "learning_rate": 2.1172932330827072e-06, "loss": 0.2262, "step": 52420 }, { "epoch": 78.84, "grad_norm": 6.3097310066223145, "learning_rate": 2.1157894736842107e-06, "loss": 0.2434, "step": 52430 }, { "epoch": 78.86, "grad_norm": 8.02677059173584, "learning_rate": 2.1142857142857147e-06, "loss": 0.1698, "step": 52440 }, { "epoch": 78.87, "grad_norm": 6.799673080444336, "learning_rate": 2.112781954887218e-06, "loss": 0.2062, "step": 52450 }, { "epoch": 78.89, "grad_norm": 3.5356316566467285, "learning_rate": 2.111278195488722e-06, "loss": 0.1969, "step": 52460 }, { "epoch": 78.9, "grad_norm": 4.078927993774414, "learning_rate": 2.1097744360902257e-06, "loss": 0.1694, "step": 52470 }, { "epoch": 78.92, "grad_norm": 4.125448226928711, "learning_rate": 2.1082706766917296e-06, "loss": 0.1752, "step": 52480 }, { "epoch": 78.93, "grad_norm": 5.614774703979492, "learning_rate": 2.106766917293233e-06, "loss": 0.1573, "step": 52490 }, { "epoch": 78.95, "grad_norm": 3.6688623428344727, "learning_rate": 2.105263157894737e-06, "loss": 0.1792, "step": 52500 }, { "epoch": 78.96, "grad_norm": 3.9358396530151367, "learning_rate": 2.1037593984962406e-06, "loss": 0.1619, "step": 52510 }, { "epoch": 78.98, "grad_norm": 3.6908819675445557, "learning_rate": 2.1022556390977445e-06, "loss": 0.1755, "step": 52520 }, { "epoch": 78.99, "grad_norm": 6.504554271697998, "learning_rate": 2.100751879699248e-06, "loss": 0.2053, "step": 52530 }, { "epoch": 79.0, "eval_accuracy": 0.9309, "eval_loss": 0.32775041460990906, "eval_runtime": 84.345, "eval_samples_per_second": 118.561, "eval_steps_per_second": 0.474, "step": 52535 }, { "epoch": 79.01, "grad_norm": 4.1648664474487305, "learning_rate": 2.099248120300752e-06, "loss": 0.2267, "step": 52540 }, { "epoch": 79.02, "grad_norm": 7.5626020431518555, "learning_rate": 2.097744360902256e-06, "loss": 0.2043, "step": 52550 }, { "epoch": 79.04, "grad_norm": 4.533354759216309, "learning_rate": 2.0962406015037594e-06, "loss": 0.1998, "step": 52560 }, { "epoch": 79.05, "grad_norm": 2.196251630783081, "learning_rate": 2.0947368421052634e-06, "loss": 0.1536, "step": 52570 }, { "epoch": 79.07, "grad_norm": 8.702381134033203, "learning_rate": 2.093233082706767e-06, "loss": 0.1829, "step": 52580 }, { "epoch": 79.08, "grad_norm": 6.189172267913818, "learning_rate": 2.091729323308271e-06, "loss": 0.2646, "step": 52590 }, { "epoch": 79.1, "grad_norm": 14.350298881530762, "learning_rate": 2.090225563909775e-06, "loss": 0.1486, "step": 52600 }, { "epoch": 79.11, "grad_norm": 5.265256404876709, "learning_rate": 2.0887218045112783e-06, "loss": 0.1852, "step": 52610 }, { "epoch": 79.13, "grad_norm": 5.7867350578308105, "learning_rate": 2.0872180451127823e-06, "loss": 0.1778, "step": 52620 }, { "epoch": 79.14, "grad_norm": 3.8867268562316895, "learning_rate": 2.0857142857142858e-06, "loss": 0.1716, "step": 52630 }, { "epoch": 79.16, "grad_norm": 10.908556938171387, "learning_rate": 2.0842105263157897e-06, "loss": 0.2087, "step": 52640 }, { "epoch": 79.17, "grad_norm": 8.96177864074707, "learning_rate": 2.0827067669172937e-06, "loss": 0.1587, "step": 52650 }, { "epoch": 79.19, "grad_norm": 4.244720935821533, "learning_rate": 2.081203007518797e-06, "loss": 0.1488, "step": 52660 }, { "epoch": 79.2, "grad_norm": 3.8978004455566406, "learning_rate": 2.079699248120301e-06, "loss": 0.1361, "step": 52670 }, { "epoch": 79.22, "grad_norm": 1.49534010887146, "learning_rate": 2.0781954887218046e-06, "loss": 0.1506, "step": 52680 }, { "epoch": 79.23, "grad_norm": 6.862305164337158, "learning_rate": 2.0766917293233086e-06, "loss": 0.2312, "step": 52690 }, { "epoch": 79.25, "grad_norm": 5.542629718780518, "learning_rate": 2.075187969924812e-06, "loss": 0.1923, "step": 52700 }, { "epoch": 79.26, "grad_norm": 5.134133338928223, "learning_rate": 2.073684210526316e-06, "loss": 0.1765, "step": 52710 }, { "epoch": 79.28, "grad_norm": 4.834011554718018, "learning_rate": 2.07218045112782e-06, "loss": 0.1291, "step": 52720 }, { "epoch": 79.29, "grad_norm": 9.411290168762207, "learning_rate": 2.0706766917293235e-06, "loss": 0.1661, "step": 52730 }, { "epoch": 79.31, "grad_norm": 4.970304489135742, "learning_rate": 2.069172932330827e-06, "loss": 0.1876, "step": 52740 }, { "epoch": 79.32, "grad_norm": 4.562291622161865, "learning_rate": 2.067669172932331e-06, "loss": 0.1942, "step": 52750 }, { "epoch": 79.34, "grad_norm": 5.195777416229248, "learning_rate": 2.0661654135338345e-06, "loss": 0.1821, "step": 52760 }, { "epoch": 79.35, "grad_norm": 6.937760829925537, "learning_rate": 2.0646616541353384e-06, "loss": 0.2023, "step": 52770 }, { "epoch": 79.37, "grad_norm": 6.619980812072754, "learning_rate": 2.0631578947368424e-06, "loss": 0.1777, "step": 52780 }, { "epoch": 79.38, "grad_norm": 2.9072442054748535, "learning_rate": 2.061654135338346e-06, "loss": 0.1435, "step": 52790 }, { "epoch": 79.4, "grad_norm": 1.8150124549865723, "learning_rate": 2.06015037593985e-06, "loss": 0.1548, "step": 52800 }, { "epoch": 79.41, "grad_norm": 4.043828010559082, "learning_rate": 2.0586466165413533e-06, "loss": 0.1577, "step": 52810 }, { "epoch": 79.43, "grad_norm": 4.577513694763184, "learning_rate": 2.0571428571428573e-06, "loss": 0.2405, "step": 52820 }, { "epoch": 79.44, "grad_norm": 4.094361782073975, "learning_rate": 2.0556390977443612e-06, "loss": 0.1409, "step": 52830 }, { "epoch": 79.46, "grad_norm": 3.072739362716675, "learning_rate": 2.0541353383458648e-06, "loss": 0.1992, "step": 52840 }, { "epoch": 79.47, "grad_norm": 5.828579425811768, "learning_rate": 2.0526315789473687e-06, "loss": 0.1278, "step": 52850 }, { "epoch": 79.49, "grad_norm": 5.253977298736572, "learning_rate": 2.0511278195488722e-06, "loss": 0.1984, "step": 52860 }, { "epoch": 79.5, "grad_norm": 3.553118944168091, "learning_rate": 2.049624060150376e-06, "loss": 0.1384, "step": 52870 }, { "epoch": 79.52, "grad_norm": 7.722125053405762, "learning_rate": 2.04812030075188e-06, "loss": 0.2084, "step": 52880 }, { "epoch": 79.53, "grad_norm": 3.237384557723999, "learning_rate": 2.0466165413533836e-06, "loss": 0.163, "step": 52890 }, { "epoch": 79.55, "grad_norm": 11.586670875549316, "learning_rate": 2.0451127819548876e-06, "loss": 0.2027, "step": 52900 }, { "epoch": 79.56, "grad_norm": 5.43867301940918, "learning_rate": 2.043609022556391e-06, "loss": 0.1672, "step": 52910 }, { "epoch": 79.58, "grad_norm": 6.87939977645874, "learning_rate": 2.042105263157895e-06, "loss": 0.1675, "step": 52920 }, { "epoch": 79.59, "grad_norm": 7.075170993804932, "learning_rate": 2.0406015037593985e-06, "loss": 0.2382, "step": 52930 }, { "epoch": 79.61, "grad_norm": 4.605086326599121, "learning_rate": 2.0390977443609025e-06, "loss": 0.1238, "step": 52940 }, { "epoch": 79.62, "grad_norm": 4.68049955368042, "learning_rate": 2.0375939849624064e-06, "loss": 0.1679, "step": 52950 }, { "epoch": 79.64, "grad_norm": 4.511661529541016, "learning_rate": 2.03609022556391e-06, "loss": 0.1604, "step": 52960 }, { "epoch": 79.65, "grad_norm": 5.428410530090332, "learning_rate": 2.034586466165414e-06, "loss": 0.1244, "step": 52970 }, { "epoch": 79.67, "grad_norm": 4.728567600250244, "learning_rate": 2.0330827067669174e-06, "loss": 0.1375, "step": 52980 }, { "epoch": 79.68, "grad_norm": 4.785933017730713, "learning_rate": 2.031578947368421e-06, "loss": 0.1517, "step": 52990 }, { "epoch": 79.7, "grad_norm": 6.504793167114258, "learning_rate": 2.030075187969925e-06, "loss": 0.1692, "step": 53000 }, { "epoch": 79.71, "grad_norm": 4.675958156585693, "learning_rate": 2.028571428571429e-06, "loss": 0.1915, "step": 53010 }, { "epoch": 79.73, "grad_norm": 5.776334285736084, "learning_rate": 2.0270676691729323e-06, "loss": 0.1526, "step": 53020 }, { "epoch": 79.74, "grad_norm": 11.183079719543457, "learning_rate": 2.0255639097744363e-06, "loss": 0.1586, "step": 53030 }, { "epoch": 79.76, "grad_norm": 8.511357307434082, "learning_rate": 2.02406015037594e-06, "loss": 0.1941, "step": 53040 }, { "epoch": 79.77, "grad_norm": 22.351648330688477, "learning_rate": 2.0225563909774437e-06, "loss": 0.1433, "step": 53050 }, { "epoch": 79.79, "grad_norm": 7.769514083862305, "learning_rate": 2.0210526315789477e-06, "loss": 0.1788, "step": 53060 }, { "epoch": 79.8, "grad_norm": 5.4393415451049805, "learning_rate": 2.019548872180451e-06, "loss": 0.2059, "step": 53070 }, { "epoch": 79.82, "grad_norm": 5.8692145347595215, "learning_rate": 2.018045112781955e-06, "loss": 0.2085, "step": 53080 }, { "epoch": 79.83, "grad_norm": 7.843011379241943, "learning_rate": 2.0165413533834587e-06, "loss": 0.2123, "step": 53090 }, { "epoch": 79.85, "grad_norm": 3.590719223022461, "learning_rate": 2.0150375939849626e-06, "loss": 0.1455, "step": 53100 }, { "epoch": 79.86, "grad_norm": 2.503929853439331, "learning_rate": 2.0135338345864665e-06, "loss": 0.1364, "step": 53110 }, { "epoch": 79.88, "grad_norm": 4.913735866546631, "learning_rate": 2.01203007518797e-06, "loss": 0.159, "step": 53120 }, { "epoch": 79.89, "grad_norm": 7.748683929443359, "learning_rate": 2.010526315789474e-06, "loss": 0.1912, "step": 53130 }, { "epoch": 79.91, "grad_norm": 5.099629878997803, "learning_rate": 2.0090225563909775e-06, "loss": 0.2093, "step": 53140 }, { "epoch": 79.92, "grad_norm": 8.263333320617676, "learning_rate": 2.0075187969924815e-06, "loss": 0.1407, "step": 53150 }, { "epoch": 79.94, "grad_norm": 9.621796607971191, "learning_rate": 2.006015037593985e-06, "loss": 0.1398, "step": 53160 }, { "epoch": 79.95, "grad_norm": 4.470506191253662, "learning_rate": 2.004511278195489e-06, "loss": 0.1811, "step": 53170 }, { "epoch": 79.97, "grad_norm": 1.4590644836425781, "learning_rate": 2.003007518796993e-06, "loss": 0.2257, "step": 53180 }, { "epoch": 79.98, "grad_norm": 5.552732944488525, "learning_rate": 2.0015037593984964e-06, "loss": 0.2092, "step": 53190 }, { "epoch": 80.0, "grad_norm": 0.11614461988210678, "learning_rate": 2.0000000000000003e-06, "loss": 0.1846, "step": 53200 }, { "epoch": 80.0, "eval_accuracy": 0.9307, "eval_loss": 0.3290669918060303, "eval_runtime": 85.0743, "eval_samples_per_second": 117.544, "eval_steps_per_second": 0.47, "step": 53200 }, { "epoch": 80.02, "grad_norm": 7.221068859100342, "learning_rate": 1.998496240601504e-06, "loss": 0.2097, "step": 53210 }, { "epoch": 80.03, "grad_norm": 8.729702949523926, "learning_rate": 1.996992481203008e-06, "loss": 0.1546, "step": 53220 }, { "epoch": 80.05, "grad_norm": 6.764802932739258, "learning_rate": 1.9954887218045113e-06, "loss": 0.1736, "step": 53230 }, { "epoch": 80.06, "grad_norm": 6.79409646987915, "learning_rate": 1.9939849624060153e-06, "loss": 0.2107, "step": 53240 }, { "epoch": 80.08, "grad_norm": 6.066366672515869, "learning_rate": 1.9924812030075188e-06, "loss": 0.2023, "step": 53250 }, { "epoch": 80.09, "grad_norm": 6.539670944213867, "learning_rate": 1.9909774436090227e-06, "loss": 0.1947, "step": 53260 }, { "epoch": 80.11, "grad_norm": 4.000879287719727, "learning_rate": 1.9894736842105262e-06, "loss": 0.1613, "step": 53270 }, { "epoch": 80.12, "grad_norm": 6.228635787963867, "learning_rate": 1.98796992481203e-06, "loss": 0.1893, "step": 53280 }, { "epoch": 80.14, "grad_norm": 3.983610153198242, "learning_rate": 1.986466165413534e-06, "loss": 0.176, "step": 53290 }, { "epoch": 80.15, "grad_norm": 4.762825965881348, "learning_rate": 1.9849624060150376e-06, "loss": 0.2192, "step": 53300 }, { "epoch": 80.17, "grad_norm": 5.903609275817871, "learning_rate": 1.9834586466165416e-06, "loss": 0.169, "step": 53310 }, { "epoch": 80.18, "grad_norm": 5.4333176612854, "learning_rate": 1.981954887218045e-06, "loss": 0.1943, "step": 53320 }, { "epoch": 80.2, "grad_norm": 6.44717264175415, "learning_rate": 1.980451127819549e-06, "loss": 0.2043, "step": 53330 }, { "epoch": 80.21, "grad_norm": 8.601734161376953, "learning_rate": 1.978947368421053e-06, "loss": 0.1577, "step": 53340 }, { "epoch": 80.23, "grad_norm": 3.8428754806518555, "learning_rate": 1.9774436090225565e-06, "loss": 0.2007, "step": 53350 }, { "epoch": 80.24, "grad_norm": 6.386511325836182, "learning_rate": 1.9759398496240604e-06, "loss": 0.2227, "step": 53360 }, { "epoch": 80.26, "grad_norm": 4.776179790496826, "learning_rate": 1.974436090225564e-06, "loss": 0.1553, "step": 53370 }, { "epoch": 80.27, "grad_norm": 4.775459289550781, "learning_rate": 1.972932330827068e-06, "loss": 0.1948, "step": 53380 }, { "epoch": 80.29, "grad_norm": 7.2186760902404785, "learning_rate": 1.9714285714285714e-06, "loss": 0.164, "step": 53390 }, { "epoch": 80.3, "grad_norm": 12.228165626525879, "learning_rate": 1.9699248120300754e-06, "loss": 0.17, "step": 53400 }, { "epoch": 80.32, "grad_norm": 3.7471702098846436, "learning_rate": 1.9684210526315793e-06, "loss": 0.1536, "step": 53410 }, { "epoch": 80.33, "grad_norm": 4.057961940765381, "learning_rate": 1.966917293233083e-06, "loss": 0.1399, "step": 53420 }, { "epoch": 80.35, "grad_norm": 5.822246074676514, "learning_rate": 1.9654135338345868e-06, "loss": 0.1627, "step": 53430 }, { "epoch": 80.36, "grad_norm": 2.5543007850646973, "learning_rate": 1.9639097744360903e-06, "loss": 0.1273, "step": 53440 }, { "epoch": 80.38, "grad_norm": 5.922774314880371, "learning_rate": 1.9624060150375942e-06, "loss": 0.212, "step": 53450 }, { "epoch": 80.39, "grad_norm": 4.007785320281982, "learning_rate": 1.960902255639098e-06, "loss": 0.1515, "step": 53460 }, { "epoch": 80.41, "grad_norm": 2.3833940029144287, "learning_rate": 1.9593984962406017e-06, "loss": 0.1828, "step": 53470 }, { "epoch": 80.42, "grad_norm": 7.1434431076049805, "learning_rate": 1.9578947368421052e-06, "loss": 0.2086, "step": 53480 }, { "epoch": 80.44, "grad_norm": 5.60752534866333, "learning_rate": 1.956390977443609e-06, "loss": 0.1768, "step": 53490 }, { "epoch": 80.45, "grad_norm": 9.981124877929688, "learning_rate": 1.9548872180451127e-06, "loss": 0.1785, "step": 53500 }, { "epoch": 80.47, "grad_norm": 2.1084699630737305, "learning_rate": 1.9533834586466166e-06, "loss": 0.2514, "step": 53510 }, { "epoch": 80.48, "grad_norm": 3.638641834259033, "learning_rate": 1.9518796992481206e-06, "loss": 0.1162, "step": 53520 }, { "epoch": 80.5, "grad_norm": 4.928158760070801, "learning_rate": 1.950375939849624e-06, "loss": 0.1602, "step": 53530 }, { "epoch": 80.51, "grad_norm": 4.468033790588379, "learning_rate": 1.948872180451128e-06, "loss": 0.0994, "step": 53540 }, { "epoch": 80.53, "grad_norm": 6.67031717300415, "learning_rate": 1.9473684210526315e-06, "loss": 0.1815, "step": 53550 }, { "epoch": 80.54, "grad_norm": 4.985138893127441, "learning_rate": 1.9458646616541355e-06, "loss": 0.2014, "step": 53560 }, { "epoch": 80.56, "grad_norm": 4.438155174255371, "learning_rate": 1.9443609022556394e-06, "loss": 0.1987, "step": 53570 }, { "epoch": 80.57, "grad_norm": 6.645019054412842, "learning_rate": 1.942857142857143e-06, "loss": 0.1801, "step": 53580 }, { "epoch": 80.59, "grad_norm": 2.9220592975616455, "learning_rate": 1.941353383458647e-06, "loss": 0.15, "step": 53590 }, { "epoch": 80.6, "grad_norm": 3.196962356567383, "learning_rate": 1.9398496240601504e-06, "loss": 0.1512, "step": 53600 }, { "epoch": 80.62, "grad_norm": 6.56658935546875, "learning_rate": 1.9383458646616544e-06, "loss": 0.1686, "step": 53610 }, { "epoch": 80.63, "grad_norm": 8.974480628967285, "learning_rate": 1.936842105263158e-06, "loss": 0.1591, "step": 53620 }, { "epoch": 80.65, "grad_norm": 9.335490226745605, "learning_rate": 1.935338345864662e-06, "loss": 0.1731, "step": 53630 }, { "epoch": 80.66, "grad_norm": 8.802009582519531, "learning_rate": 1.9338345864661658e-06, "loss": 0.1393, "step": 53640 }, { "epoch": 80.68, "grad_norm": 2.8996877670288086, "learning_rate": 1.9323308270676693e-06, "loss": 0.1955, "step": 53650 }, { "epoch": 80.69, "grad_norm": 8.079301834106445, "learning_rate": 1.9308270676691732e-06, "loss": 0.2178, "step": 53660 }, { "epoch": 80.71, "grad_norm": 4.58353853225708, "learning_rate": 1.9293233082706767e-06, "loss": 0.252, "step": 53670 }, { "epoch": 80.72, "grad_norm": 1.9493281841278076, "learning_rate": 1.9278195488721807e-06, "loss": 0.15, "step": 53680 }, { "epoch": 80.74, "grad_norm": 2.8279833793640137, "learning_rate": 1.9263157894736846e-06, "loss": 0.1748, "step": 53690 }, { "epoch": 80.75, "grad_norm": 6.2720818519592285, "learning_rate": 1.924812030075188e-06, "loss": 0.1943, "step": 53700 }, { "epoch": 80.77, "grad_norm": 5.162914752960205, "learning_rate": 1.923308270676692e-06, "loss": 0.1387, "step": 53710 }, { "epoch": 80.78, "grad_norm": 5.782294273376465, "learning_rate": 1.9218045112781956e-06, "loss": 0.1838, "step": 53720 }, { "epoch": 80.8, "grad_norm": 5.94707727432251, "learning_rate": 1.920300751879699e-06, "loss": 0.1411, "step": 53730 }, { "epoch": 80.81, "grad_norm": 7.4608073234558105, "learning_rate": 1.918796992481203e-06, "loss": 0.1491, "step": 53740 }, { "epoch": 80.83, "grad_norm": 3.1741888523101807, "learning_rate": 1.917293233082707e-06, "loss": 0.1506, "step": 53750 }, { "epoch": 80.84, "grad_norm": 5.37153959274292, "learning_rate": 1.9157894736842105e-06, "loss": 0.2074, "step": 53760 }, { "epoch": 80.86, "grad_norm": 2.308962821960449, "learning_rate": 1.9142857142857145e-06, "loss": 0.17, "step": 53770 }, { "epoch": 80.87, "grad_norm": 2.749995470046997, "learning_rate": 1.912781954887218e-06, "loss": 0.1656, "step": 53780 }, { "epoch": 80.89, "grad_norm": 4.785193920135498, "learning_rate": 1.911278195488722e-06, "loss": 0.1801, "step": 53790 }, { "epoch": 80.9, "grad_norm": 5.815794467926025, "learning_rate": 1.909774436090226e-06, "loss": 0.1982, "step": 53800 }, { "epoch": 80.92, "grad_norm": 4.500646591186523, "learning_rate": 1.9082706766917294e-06, "loss": 0.1509, "step": 53810 }, { "epoch": 80.93, "grad_norm": 3.7033073902130127, "learning_rate": 1.9067669172932331e-06, "loss": 0.1613, "step": 53820 }, { "epoch": 80.95, "grad_norm": 5.228376865386963, "learning_rate": 1.905263157894737e-06, "loss": 0.1766, "step": 53830 }, { "epoch": 80.96, "grad_norm": 5.924206256866455, "learning_rate": 1.9037593984962408e-06, "loss": 0.1858, "step": 53840 }, { "epoch": 80.98, "grad_norm": 2.352541923522949, "learning_rate": 1.9022556390977445e-06, "loss": 0.1322, "step": 53850 }, { "epoch": 80.99, "grad_norm": 5.281574249267578, "learning_rate": 1.9007518796992483e-06, "loss": 0.1909, "step": 53860 }, { "epoch": 81.0, "eval_accuracy": 0.9291, "eval_loss": 0.34166744351387024, "eval_runtime": 84.595, "eval_samples_per_second": 118.21, "eval_steps_per_second": 0.473, "step": 53865 }, { "epoch": 81.01, "grad_norm": 6.562623977661133, "learning_rate": 1.899248120300752e-06, "loss": 0.1835, "step": 53870 }, { "epoch": 81.02, "grad_norm": 6.715619087219238, "learning_rate": 1.897744360902256e-06, "loss": 0.2188, "step": 53880 }, { "epoch": 81.04, "grad_norm": 3.6184804439544678, "learning_rate": 1.8962406015037597e-06, "loss": 0.1587, "step": 53890 }, { "epoch": 81.05, "grad_norm": 7.747524261474609, "learning_rate": 1.8947368421052634e-06, "loss": 0.1861, "step": 53900 }, { "epoch": 81.07, "grad_norm": 4.222412586212158, "learning_rate": 1.8932330827067671e-06, "loss": 0.1427, "step": 53910 }, { "epoch": 81.08, "grad_norm": 7.423925399780273, "learning_rate": 1.8917293233082709e-06, "loss": 0.2178, "step": 53920 }, { "epoch": 81.1, "grad_norm": 6.593136310577393, "learning_rate": 1.8902255639097746e-06, "loss": 0.2083, "step": 53930 }, { "epoch": 81.11, "grad_norm": 5.739638805389404, "learning_rate": 1.8887218045112785e-06, "loss": 0.2182, "step": 53940 }, { "epoch": 81.13, "grad_norm": 4.201329708099365, "learning_rate": 1.8872180451127823e-06, "loss": 0.1661, "step": 53950 }, { "epoch": 81.14, "grad_norm": 4.125972270965576, "learning_rate": 1.885714285714286e-06, "loss": 0.177, "step": 53960 }, { "epoch": 81.16, "grad_norm": 9.847689628601074, "learning_rate": 1.8842105263157895e-06, "loss": 0.1991, "step": 53970 }, { "epoch": 81.17, "grad_norm": 2.6463875770568848, "learning_rate": 1.8827067669172932e-06, "loss": 0.1789, "step": 53980 }, { "epoch": 81.19, "grad_norm": 5.124504089355469, "learning_rate": 1.881203007518797e-06, "loss": 0.1799, "step": 53990 }, { "epoch": 81.2, "grad_norm": 6.498737335205078, "learning_rate": 1.8796992481203007e-06, "loss": 0.1637, "step": 54000 }, { "epoch": 81.22, "grad_norm": 5.662612438201904, "learning_rate": 1.8781954887218046e-06, "loss": 0.1571, "step": 54010 }, { "epoch": 81.23, "grad_norm": 4.6632232666015625, "learning_rate": 1.8766917293233084e-06, "loss": 0.1476, "step": 54020 }, { "epoch": 81.25, "grad_norm": 3.081489324569702, "learning_rate": 1.8751879699248121e-06, "loss": 0.1544, "step": 54030 }, { "epoch": 81.26, "grad_norm": 2.8134820461273193, "learning_rate": 1.8736842105263158e-06, "loss": 0.173, "step": 54040 }, { "epoch": 81.28, "grad_norm": 6.176023483276367, "learning_rate": 1.8721804511278196e-06, "loss": 0.1662, "step": 54050 }, { "epoch": 81.29, "grad_norm": 5.997969150543213, "learning_rate": 1.8706766917293235e-06, "loss": 0.1763, "step": 54060 }, { "epoch": 81.31, "grad_norm": 7.557049751281738, "learning_rate": 1.8691729323308272e-06, "loss": 0.2356, "step": 54070 }, { "epoch": 81.32, "grad_norm": 1.760933518409729, "learning_rate": 1.867669172932331e-06, "loss": 0.1678, "step": 54080 }, { "epoch": 81.34, "grad_norm": 5.9524993896484375, "learning_rate": 1.8661654135338347e-06, "loss": 0.1923, "step": 54090 }, { "epoch": 81.35, "grad_norm": 3.9762470722198486, "learning_rate": 1.8646616541353384e-06, "loss": 0.1556, "step": 54100 }, { "epoch": 81.37, "grad_norm": 3.218543767929077, "learning_rate": 1.8631578947368424e-06, "loss": 0.1857, "step": 54110 }, { "epoch": 81.38, "grad_norm": 6.120358467102051, "learning_rate": 1.8616541353383461e-06, "loss": 0.2167, "step": 54120 }, { "epoch": 81.4, "grad_norm": 4.884064674377441, "learning_rate": 1.8601503759398498e-06, "loss": 0.1552, "step": 54130 }, { "epoch": 81.41, "grad_norm": 5.395893573760986, "learning_rate": 1.8586466165413536e-06, "loss": 0.1528, "step": 54140 }, { "epoch": 81.43, "grad_norm": 2.572039842605591, "learning_rate": 1.8571428571428573e-06, "loss": 0.1606, "step": 54150 }, { "epoch": 81.44, "grad_norm": 2.791781425476074, "learning_rate": 1.855639097744361e-06, "loss": 0.1705, "step": 54160 }, { "epoch": 81.46, "grad_norm": 3.3535261154174805, "learning_rate": 1.854135338345865e-06, "loss": 0.2432, "step": 54170 }, { "epoch": 81.47, "grad_norm": 4.468529224395752, "learning_rate": 1.8526315789473687e-06, "loss": 0.1785, "step": 54180 }, { "epoch": 81.49, "grad_norm": 3.004258871078491, "learning_rate": 1.8511278195488724e-06, "loss": 0.2144, "step": 54190 }, { "epoch": 81.5, "grad_norm": 6.375463485717773, "learning_rate": 1.8496240601503762e-06, "loss": 0.1407, "step": 54200 }, { "epoch": 81.52, "grad_norm": 7.171412467956543, "learning_rate": 1.84812030075188e-06, "loss": 0.1834, "step": 54210 }, { "epoch": 81.53, "grad_norm": 4.493435859680176, "learning_rate": 1.8466165413533834e-06, "loss": 0.1702, "step": 54220 }, { "epoch": 81.55, "grad_norm": 3.168029308319092, "learning_rate": 1.8451127819548871e-06, "loss": 0.1886, "step": 54230 }, { "epoch": 81.56, "grad_norm": 6.606454372406006, "learning_rate": 1.843609022556391e-06, "loss": 0.1678, "step": 54240 }, { "epoch": 81.58, "grad_norm": 7.317569255828857, "learning_rate": 1.8421052631578948e-06, "loss": 0.2086, "step": 54250 }, { "epoch": 81.59, "grad_norm": 5.867735385894775, "learning_rate": 1.8406015037593986e-06, "loss": 0.2158, "step": 54260 }, { "epoch": 81.61, "grad_norm": 6.016121864318848, "learning_rate": 1.8390977443609023e-06, "loss": 0.1706, "step": 54270 }, { "epoch": 81.62, "grad_norm": 2.902163028717041, "learning_rate": 1.837593984962406e-06, "loss": 0.1605, "step": 54280 }, { "epoch": 81.64, "grad_norm": 4.931081771850586, "learning_rate": 1.83609022556391e-06, "loss": 0.1675, "step": 54290 }, { "epoch": 81.65, "grad_norm": 6.6242241859436035, "learning_rate": 1.8345864661654137e-06, "loss": 0.218, "step": 54300 }, { "epoch": 81.67, "grad_norm": 4.538776397705078, "learning_rate": 1.8330827067669174e-06, "loss": 0.1534, "step": 54310 }, { "epoch": 81.68, "grad_norm": 5.3194708824157715, "learning_rate": 1.8315789473684211e-06, "loss": 0.1789, "step": 54320 }, { "epoch": 81.7, "grad_norm": 6.646446704864502, "learning_rate": 1.8300751879699249e-06, "loss": 0.1519, "step": 54330 }, { "epoch": 81.71, "grad_norm": 7.2585129737854, "learning_rate": 1.8285714285714288e-06, "loss": 0.1952, "step": 54340 }, { "epoch": 81.73, "grad_norm": 6.1801323890686035, "learning_rate": 1.8270676691729326e-06, "loss": 0.1726, "step": 54350 }, { "epoch": 81.74, "grad_norm": 5.834466457366943, "learning_rate": 1.8255639097744363e-06, "loss": 0.1767, "step": 54360 }, { "epoch": 81.76, "grad_norm": 5.508495807647705, "learning_rate": 1.82406015037594e-06, "loss": 0.1731, "step": 54370 }, { "epoch": 81.77, "grad_norm": 1.7921135425567627, "learning_rate": 1.8225563909774437e-06, "loss": 0.2335, "step": 54380 }, { "epoch": 81.79, "grad_norm": 5.268810749053955, "learning_rate": 1.8210526315789475e-06, "loss": 0.2098, "step": 54390 }, { "epoch": 81.8, "grad_norm": 5.814802169799805, "learning_rate": 1.8195488721804514e-06, "loss": 0.1643, "step": 54400 }, { "epoch": 81.82, "grad_norm": 4.461940765380859, "learning_rate": 1.8180451127819551e-06, "loss": 0.1374, "step": 54410 }, { "epoch": 81.83, "grad_norm": 3.247183322906494, "learning_rate": 1.8165413533834589e-06, "loss": 0.1355, "step": 54420 }, { "epoch": 81.85, "grad_norm": 1.9381792545318604, "learning_rate": 1.8150375939849626e-06, "loss": 0.1723, "step": 54430 }, { "epoch": 81.86, "grad_norm": 7.478023529052734, "learning_rate": 1.8135338345864663e-06, "loss": 0.1846, "step": 54440 }, { "epoch": 81.88, "grad_norm": 5.697751522064209, "learning_rate": 1.8120300751879703e-06, "loss": 0.1653, "step": 54450 }, { "epoch": 81.89, "grad_norm": 5.588354587554932, "learning_rate": 1.810526315789474e-06, "loss": 0.1838, "step": 54460 }, { "epoch": 81.91, "grad_norm": 6.156862258911133, "learning_rate": 1.8090225563909775e-06, "loss": 0.211, "step": 54470 }, { "epoch": 81.92, "grad_norm": 7.52768087387085, "learning_rate": 1.8075187969924813e-06, "loss": 0.2083, "step": 54480 }, { "epoch": 81.94, "grad_norm": 3.558300018310547, "learning_rate": 1.806015037593985e-06, "loss": 0.1508, "step": 54490 }, { "epoch": 81.95, "grad_norm": 5.417634010314941, "learning_rate": 1.8045112781954887e-06, "loss": 0.1776, "step": 54500 }, { "epoch": 81.97, "grad_norm": 5.08277702331543, "learning_rate": 1.8030075187969925e-06, "loss": 0.1513, "step": 54510 }, { "epoch": 81.98, "grad_norm": 5.362541675567627, "learning_rate": 1.8015037593984964e-06, "loss": 0.1559, "step": 54520 }, { "epoch": 82.0, "grad_norm": 1.1385769844055176, "learning_rate": 1.8000000000000001e-06, "loss": 0.1971, "step": 54530 }, { "epoch": 82.0, "eval_accuracy": 0.9289, "eval_loss": 0.33228030800819397, "eval_runtime": 85.2251, "eval_samples_per_second": 117.336, "eval_steps_per_second": 0.469, "step": 54530 }, { "epoch": 82.02, "grad_norm": 2.4561429023742676, "learning_rate": 1.7984962406015039e-06, "loss": 0.1383, "step": 54540 }, { "epoch": 82.03, "grad_norm": 4.4165425300598145, "learning_rate": 1.7969924812030076e-06, "loss": 0.1265, "step": 54550 }, { "epoch": 82.05, "grad_norm": 8.235807418823242, "learning_rate": 1.7954887218045113e-06, "loss": 0.1736, "step": 54560 }, { "epoch": 82.06, "grad_norm": 5.088337421417236, "learning_rate": 1.7939849624060153e-06, "loss": 0.1613, "step": 54570 }, { "epoch": 82.08, "grad_norm": 4.968101501464844, "learning_rate": 1.792481203007519e-06, "loss": 0.1572, "step": 54580 }, { "epoch": 82.09, "grad_norm": 3.9068446159362793, "learning_rate": 1.7909774436090227e-06, "loss": 0.2085, "step": 54590 }, { "epoch": 82.11, "grad_norm": 4.162015914916992, "learning_rate": 1.7894736842105265e-06, "loss": 0.1924, "step": 54600 }, { "epoch": 82.12, "grad_norm": 5.8883256912231445, "learning_rate": 1.7879699248120302e-06, "loss": 0.1482, "step": 54610 }, { "epoch": 82.14, "grad_norm": 3.305724859237671, "learning_rate": 1.786466165413534e-06, "loss": 0.134, "step": 54620 }, { "epoch": 82.15, "grad_norm": 3.4198994636535645, "learning_rate": 1.7849624060150379e-06, "loss": 0.122, "step": 54630 }, { "epoch": 82.17, "grad_norm": 4.999546051025391, "learning_rate": 1.7834586466165416e-06, "loss": 0.2511, "step": 54640 }, { "epoch": 82.18, "grad_norm": 3.644953489303589, "learning_rate": 1.7819548872180453e-06, "loss": 0.1224, "step": 54650 }, { "epoch": 82.2, "grad_norm": 2.552194833755493, "learning_rate": 1.780451127819549e-06, "loss": 0.1872, "step": 54660 }, { "epoch": 82.21, "grad_norm": 6.016075134277344, "learning_rate": 1.7789473684210528e-06, "loss": 0.1731, "step": 54670 }, { "epoch": 82.23, "grad_norm": 3.725740671157837, "learning_rate": 1.7774436090225567e-06, "loss": 0.1697, "step": 54680 }, { "epoch": 82.24, "grad_norm": 4.1656999588012695, "learning_rate": 1.7759398496240605e-06, "loss": 0.1599, "step": 54690 }, { "epoch": 82.26, "grad_norm": 5.3262481689453125, "learning_rate": 1.7744360902255642e-06, "loss": 0.2012, "step": 54700 }, { "epoch": 82.27, "grad_norm": 4.335447788238525, "learning_rate": 1.772932330827068e-06, "loss": 0.1789, "step": 54710 }, { "epoch": 82.29, "grad_norm": 5.814763069152832, "learning_rate": 1.7714285714285714e-06, "loss": 0.1574, "step": 54720 }, { "epoch": 82.3, "grad_norm": 8.759305000305176, "learning_rate": 1.7699248120300752e-06, "loss": 0.1821, "step": 54730 }, { "epoch": 82.32, "grad_norm": 4.221161842346191, "learning_rate": 1.768421052631579e-06, "loss": 0.1759, "step": 54740 }, { "epoch": 82.33, "grad_norm": 4.3325910568237305, "learning_rate": 1.7669172932330828e-06, "loss": 0.258, "step": 54750 }, { "epoch": 82.35, "grad_norm": 4.424845218658447, "learning_rate": 1.7654135338345866e-06, "loss": 0.2111, "step": 54760 }, { "epoch": 82.36, "grad_norm": 6.010334014892578, "learning_rate": 1.7639097744360903e-06, "loss": 0.2227, "step": 54770 }, { "epoch": 82.38, "grad_norm": 4.39691162109375, "learning_rate": 1.762406015037594e-06, "loss": 0.1556, "step": 54780 }, { "epoch": 82.39, "grad_norm": 4.894425868988037, "learning_rate": 1.7609022556390978e-06, "loss": 0.1579, "step": 54790 }, { "epoch": 82.41, "grad_norm": 5.324883937835693, "learning_rate": 1.7593984962406017e-06, "loss": 0.2109, "step": 54800 }, { "epoch": 82.42, "grad_norm": 6.122479438781738, "learning_rate": 1.7578947368421054e-06, "loss": 0.2117, "step": 54810 }, { "epoch": 82.44, "grad_norm": 5.467765808105469, "learning_rate": 1.7563909774436092e-06, "loss": 0.1589, "step": 54820 }, { "epoch": 82.45, "grad_norm": 8.767003059387207, "learning_rate": 1.754887218045113e-06, "loss": 0.1147, "step": 54830 }, { "epoch": 82.47, "grad_norm": 9.435832977294922, "learning_rate": 1.7533834586466166e-06, "loss": 0.1292, "step": 54840 }, { "epoch": 82.48, "grad_norm": 5.217706203460693, "learning_rate": 1.7518796992481204e-06, "loss": 0.148, "step": 54850 }, { "epoch": 82.5, "grad_norm": 2.389409303665161, "learning_rate": 1.7503759398496243e-06, "loss": 0.1228, "step": 54860 }, { "epoch": 82.51, "grad_norm": 1.7925937175750732, "learning_rate": 1.748872180451128e-06, "loss": 0.1766, "step": 54870 }, { "epoch": 82.53, "grad_norm": 2.3438923358917236, "learning_rate": 1.7473684210526318e-06, "loss": 0.1621, "step": 54880 }, { "epoch": 82.54, "grad_norm": 5.975131511688232, "learning_rate": 1.7458646616541355e-06, "loss": 0.1248, "step": 54890 }, { "epoch": 82.56, "grad_norm": 12.676349639892578, "learning_rate": 1.7443609022556392e-06, "loss": 0.1825, "step": 54900 }, { "epoch": 82.57, "grad_norm": 9.043052673339844, "learning_rate": 1.7428571428571432e-06, "loss": 0.1963, "step": 54910 }, { "epoch": 82.59, "grad_norm": 2.8596065044403076, "learning_rate": 1.741353383458647e-06, "loss": 0.1685, "step": 54920 }, { "epoch": 82.6, "grad_norm": 4.849795341491699, "learning_rate": 1.7398496240601506e-06, "loss": 0.1839, "step": 54930 }, { "epoch": 82.62, "grad_norm": 7.803761959075928, "learning_rate": 1.7383458646616544e-06, "loss": 0.1732, "step": 54940 }, { "epoch": 82.63, "grad_norm": 6.521337985992432, "learning_rate": 1.736842105263158e-06, "loss": 0.1562, "step": 54950 }, { "epoch": 82.65, "grad_norm": 12.948254585266113, "learning_rate": 1.735338345864662e-06, "loss": 0.1772, "step": 54960 }, { "epoch": 82.66, "grad_norm": 3.319334030151367, "learning_rate": 1.7338345864661653e-06, "loss": 0.2134, "step": 54970 }, { "epoch": 82.68, "grad_norm": 2.6258463859558105, "learning_rate": 1.7323308270676693e-06, "loss": 0.1354, "step": 54980 }, { "epoch": 82.69, "grad_norm": 5.683926105499268, "learning_rate": 1.730827067669173e-06, "loss": 0.1092, "step": 54990 }, { "epoch": 82.71, "grad_norm": 6.205657958984375, "learning_rate": 1.7293233082706767e-06, "loss": 0.1635, "step": 55000 }, { "epoch": 82.72, "grad_norm": 2.16133713722229, "learning_rate": 1.7278195488721805e-06, "loss": 0.1746, "step": 55010 }, { "epoch": 82.74, "grad_norm": 5.119611740112305, "learning_rate": 1.7263157894736842e-06, "loss": 0.1608, "step": 55020 }, { "epoch": 82.75, "grad_norm": 5.32130241394043, "learning_rate": 1.7248120300751882e-06, "loss": 0.1386, "step": 55030 }, { "epoch": 82.77, "grad_norm": 5.220561981201172, "learning_rate": 1.7233082706766919e-06, "loss": 0.192, "step": 55040 }, { "epoch": 82.78, "grad_norm": 5.081111431121826, "learning_rate": 1.7218045112781956e-06, "loss": 0.1744, "step": 55050 }, { "epoch": 82.8, "grad_norm": 7.443409442901611, "learning_rate": 1.7203007518796993e-06, "loss": 0.2045, "step": 55060 }, { "epoch": 82.81, "grad_norm": 5.770323276519775, "learning_rate": 1.718796992481203e-06, "loss": 0.1719, "step": 55070 }, { "epoch": 82.83, "grad_norm": 5.356820106506348, "learning_rate": 1.7172932330827068e-06, "loss": 0.2003, "step": 55080 }, { "epoch": 82.84, "grad_norm": 4.85361385345459, "learning_rate": 1.7157894736842107e-06, "loss": 0.1951, "step": 55090 }, { "epoch": 82.86, "grad_norm": 4.393242359161377, "learning_rate": 1.7142857142857145e-06, "loss": 0.198, "step": 55100 }, { "epoch": 82.87, "grad_norm": 2.715772867202759, "learning_rate": 1.7127819548872182e-06, "loss": 0.131, "step": 55110 }, { "epoch": 82.89, "grad_norm": 5.954652786254883, "learning_rate": 1.711278195488722e-06, "loss": 0.2029, "step": 55120 }, { "epoch": 82.9, "grad_norm": 3.8711795806884766, "learning_rate": 1.7097744360902257e-06, "loss": 0.1509, "step": 55130 }, { "epoch": 82.92, "grad_norm": 4.712859630584717, "learning_rate": 1.7082706766917296e-06, "loss": 0.1485, "step": 55140 }, { "epoch": 82.93, "grad_norm": 6.353842735290527, "learning_rate": 1.7067669172932333e-06, "loss": 0.1959, "step": 55150 }, { "epoch": 82.95, "grad_norm": 4.206510066986084, "learning_rate": 1.705263157894737e-06, "loss": 0.1318, "step": 55160 }, { "epoch": 82.96, "grad_norm": 4.608467102050781, "learning_rate": 1.7037593984962408e-06, "loss": 0.2017, "step": 55170 }, { "epoch": 82.98, "grad_norm": 5.684893608093262, "learning_rate": 1.7022556390977445e-06, "loss": 0.1917, "step": 55180 }, { "epoch": 82.99, "grad_norm": 4.197262763977051, "learning_rate": 1.7007518796992485e-06, "loss": 0.1739, "step": 55190 }, { "epoch": 83.0, "eval_accuracy": 0.9323, "eval_loss": 0.3265763223171234, "eval_runtime": 84.4148, "eval_samples_per_second": 118.463, "eval_steps_per_second": 0.474, "step": 55195 }, { "epoch": 83.01, "grad_norm": 4.629222869873047, "learning_rate": 1.6992481203007522e-06, "loss": 0.1463, "step": 55200 }, { "epoch": 83.02, "grad_norm": 5.791532516479492, "learning_rate": 1.6977443609022557e-06, "loss": 0.1871, "step": 55210 }, { "epoch": 83.04, "grad_norm": 6.979246616363525, "learning_rate": 1.6962406015037595e-06, "loss": 0.1502, "step": 55220 }, { "epoch": 83.05, "grad_norm": 1.3585783243179321, "learning_rate": 1.6947368421052632e-06, "loss": 0.1733, "step": 55230 }, { "epoch": 83.07, "grad_norm": 1.0612996816635132, "learning_rate": 1.693233082706767e-06, "loss": 0.1586, "step": 55240 }, { "epoch": 83.08, "grad_norm": 3.546823740005493, "learning_rate": 1.6917293233082707e-06, "loss": 0.1586, "step": 55250 }, { "epoch": 83.1, "grad_norm": 4.408979892730713, "learning_rate": 1.6902255639097746e-06, "loss": 0.2008, "step": 55260 }, { "epoch": 83.11, "grad_norm": 4.838244438171387, "learning_rate": 1.6887218045112783e-06, "loss": 0.1753, "step": 55270 }, { "epoch": 83.13, "grad_norm": 2.8008768558502197, "learning_rate": 1.687218045112782e-06, "loss": 0.1515, "step": 55280 }, { "epoch": 83.14, "grad_norm": 1.7678091526031494, "learning_rate": 1.6857142857142858e-06, "loss": 0.1716, "step": 55290 }, { "epoch": 83.16, "grad_norm": 4.2498087882995605, "learning_rate": 1.6842105263157895e-06, "loss": 0.2125, "step": 55300 }, { "epoch": 83.17, "grad_norm": 4.752126216888428, "learning_rate": 1.6827067669172933e-06, "loss": 0.195, "step": 55310 }, { "epoch": 83.19, "grad_norm": 2.648486375808716, "learning_rate": 1.6812030075187972e-06, "loss": 0.1554, "step": 55320 }, { "epoch": 83.2, "grad_norm": 5.214283466339111, "learning_rate": 1.679699248120301e-06, "loss": 0.2508, "step": 55330 }, { "epoch": 83.22, "grad_norm": 7.676024913787842, "learning_rate": 1.6781954887218047e-06, "loss": 0.1883, "step": 55340 }, { "epoch": 83.23, "grad_norm": 5.258351802825928, "learning_rate": 1.6766917293233084e-06, "loss": 0.1941, "step": 55350 }, { "epoch": 83.25, "grad_norm": 5.472609043121338, "learning_rate": 1.6751879699248121e-06, "loss": 0.1522, "step": 55360 }, { "epoch": 83.26, "grad_norm": 3.57065749168396, "learning_rate": 1.673684210526316e-06, "loss": 0.1801, "step": 55370 }, { "epoch": 83.28, "grad_norm": 4.2197651863098145, "learning_rate": 1.6721804511278198e-06, "loss": 0.1963, "step": 55380 }, { "epoch": 83.29, "grad_norm": 7.963762283325195, "learning_rate": 1.6706766917293235e-06, "loss": 0.2155, "step": 55390 }, { "epoch": 83.31, "grad_norm": 4.304222106933594, "learning_rate": 1.6691729323308273e-06, "loss": 0.225, "step": 55400 }, { "epoch": 83.32, "grad_norm": 4.590184211730957, "learning_rate": 1.667669172932331e-06, "loss": 0.174, "step": 55410 }, { "epoch": 83.34, "grad_norm": 2.6624538898468018, "learning_rate": 1.6661654135338347e-06, "loss": 0.1555, "step": 55420 }, { "epoch": 83.35, "grad_norm": 6.655846118927002, "learning_rate": 1.6646616541353387e-06, "loss": 0.1236, "step": 55430 }, { "epoch": 83.37, "grad_norm": 5.1804633140563965, "learning_rate": 1.6631578947368424e-06, "loss": 0.2144, "step": 55440 }, { "epoch": 83.38, "grad_norm": 6.537972450256348, "learning_rate": 1.6616541353383461e-06, "loss": 0.1807, "step": 55450 }, { "epoch": 83.4, "grad_norm": 3.086345911026001, "learning_rate": 1.6601503759398496e-06, "loss": 0.1505, "step": 55460 }, { "epoch": 83.41, "grad_norm": 6.289426326751709, "learning_rate": 1.6586466165413534e-06, "loss": 0.1967, "step": 55470 }, { "epoch": 83.43, "grad_norm": 4.445835113525391, "learning_rate": 1.657142857142857e-06, "loss": 0.153, "step": 55480 }, { "epoch": 83.44, "grad_norm": 5.393485069274902, "learning_rate": 1.6556390977443608e-06, "loss": 0.1946, "step": 55490 }, { "epoch": 83.46, "grad_norm": 4.735557556152344, "learning_rate": 1.6541353383458648e-06, "loss": 0.1649, "step": 55500 }, { "epoch": 83.47, "grad_norm": 2.2363061904907227, "learning_rate": 1.6526315789473685e-06, "loss": 0.2182, "step": 55510 }, { "epoch": 83.49, "grad_norm": 1.6198318004608154, "learning_rate": 1.6511278195488722e-06, "loss": 0.2171, "step": 55520 }, { "epoch": 83.5, "grad_norm": 3.38696551322937, "learning_rate": 1.649624060150376e-06, "loss": 0.1405, "step": 55530 }, { "epoch": 83.52, "grad_norm": 1.2411551475524902, "learning_rate": 1.6481203007518797e-06, "loss": 0.1632, "step": 55540 }, { "epoch": 83.53, "grad_norm": 3.3203301429748535, "learning_rate": 1.6466165413533836e-06, "loss": 0.1592, "step": 55550 }, { "epoch": 83.55, "grad_norm": 13.928276062011719, "learning_rate": 1.6451127819548874e-06, "loss": 0.1434, "step": 55560 }, { "epoch": 83.56, "grad_norm": 3.75093150138855, "learning_rate": 1.643609022556391e-06, "loss": 0.2471, "step": 55570 }, { "epoch": 83.58, "grad_norm": 6.298830032348633, "learning_rate": 1.6421052631578948e-06, "loss": 0.1624, "step": 55580 }, { "epoch": 83.59, "grad_norm": 4.140471935272217, "learning_rate": 1.6406015037593986e-06, "loss": 0.1668, "step": 55590 }, { "epoch": 83.61, "grad_norm": 7.234468936920166, "learning_rate": 1.6390977443609025e-06, "loss": 0.1933, "step": 55600 }, { "epoch": 83.62, "grad_norm": 5.53890323638916, "learning_rate": 1.6375939849624062e-06, "loss": 0.1686, "step": 55610 }, { "epoch": 83.64, "grad_norm": 7.111166000366211, "learning_rate": 1.63609022556391e-06, "loss": 0.1447, "step": 55620 }, { "epoch": 83.65, "grad_norm": 4.891413688659668, "learning_rate": 1.6345864661654137e-06, "loss": 0.1894, "step": 55630 }, { "epoch": 83.67, "grad_norm": 5.022510051727295, "learning_rate": 1.6330827067669174e-06, "loss": 0.1572, "step": 55640 }, { "epoch": 83.68, "grad_norm": 6.799849033355713, "learning_rate": 1.6315789473684212e-06, "loss": 0.2438, "step": 55650 }, { "epoch": 83.7, "grad_norm": 6.336165904998779, "learning_rate": 1.630075187969925e-06, "loss": 0.1738, "step": 55660 }, { "epoch": 83.71, "grad_norm": 4.65312385559082, "learning_rate": 1.6285714285714288e-06, "loss": 0.1406, "step": 55670 }, { "epoch": 83.73, "grad_norm": 6.864182472229004, "learning_rate": 1.6270676691729326e-06, "loss": 0.1773, "step": 55680 }, { "epoch": 83.74, "grad_norm": 3.7218823432922363, "learning_rate": 1.6255639097744363e-06, "loss": 0.1292, "step": 55690 }, { "epoch": 83.76, "grad_norm": 6.442845821380615, "learning_rate": 1.62406015037594e-06, "loss": 0.1512, "step": 55700 }, { "epoch": 83.77, "grad_norm": 4.894304275512695, "learning_rate": 1.6225563909774435e-06, "loss": 0.162, "step": 55710 }, { "epoch": 83.79, "grad_norm": 5.067921161651611, "learning_rate": 1.6210526315789473e-06, "loss": 0.1662, "step": 55720 }, { "epoch": 83.8, "grad_norm": 10.18139362335205, "learning_rate": 1.6195488721804512e-06, "loss": 0.2123, "step": 55730 }, { "epoch": 83.82, "grad_norm": 6.891566753387451, "learning_rate": 1.618045112781955e-06, "loss": 0.1701, "step": 55740 }, { "epoch": 83.83, "grad_norm": 3.438103437423706, "learning_rate": 1.6165413533834587e-06, "loss": 0.1598, "step": 55750 }, { "epoch": 83.85, "grad_norm": 3.0639941692352295, "learning_rate": 1.6150375939849624e-06, "loss": 0.2099, "step": 55760 }, { "epoch": 83.86, "grad_norm": 5.10312557220459, "learning_rate": 1.6135338345864661e-06, "loss": 0.1678, "step": 55770 }, { "epoch": 83.88, "grad_norm": 3.48979115486145, "learning_rate": 1.61203007518797e-06, "loss": 0.1578, "step": 55780 }, { "epoch": 83.89, "grad_norm": 7.103468418121338, "learning_rate": 1.6105263157894738e-06, "loss": 0.1769, "step": 55790 }, { "epoch": 83.91, "grad_norm": 4.037766456604004, "learning_rate": 1.6090225563909775e-06, "loss": 0.2101, "step": 55800 }, { "epoch": 83.92, "grad_norm": 3.874589204788208, "learning_rate": 1.6075187969924813e-06, "loss": 0.2364, "step": 55810 }, { "epoch": 83.94, "grad_norm": 4.208032608032227, "learning_rate": 1.606015037593985e-06, "loss": 0.1712, "step": 55820 }, { "epoch": 83.95, "grad_norm": 4.456460475921631, "learning_rate": 1.604511278195489e-06, "loss": 0.2004, "step": 55830 }, { "epoch": 83.97, "grad_norm": 6.270118236541748, "learning_rate": 1.6030075187969927e-06, "loss": 0.156, "step": 55840 }, { "epoch": 83.98, "grad_norm": 5.951742172241211, "learning_rate": 1.6015037593984964e-06, "loss": 0.2439, "step": 55850 }, { "epoch": 84.0, "grad_norm": 0.09190316498279572, "learning_rate": 1.6000000000000001e-06, "loss": 0.1537, "step": 55860 }, { "epoch": 84.0, "eval_accuracy": 0.9294, "eval_loss": 0.33126822113990784, "eval_runtime": 84.9554, "eval_samples_per_second": 117.709, "eval_steps_per_second": 0.471, "step": 55860 }, { "epoch": 84.02, "grad_norm": 4.058501243591309, "learning_rate": 1.5984962406015039e-06, "loss": 0.238, "step": 55870 }, { "epoch": 84.03, "grad_norm": 3.7976138591766357, "learning_rate": 1.5969924812030076e-06, "loss": 0.1423, "step": 55880 }, { "epoch": 84.05, "grad_norm": 4.4287614822387695, "learning_rate": 1.5954887218045115e-06, "loss": 0.1356, "step": 55890 }, { "epoch": 84.06, "grad_norm": 5.282792568206787, "learning_rate": 1.5939849624060153e-06, "loss": 0.1902, "step": 55900 }, { "epoch": 84.08, "grad_norm": 5.178953170776367, "learning_rate": 1.592481203007519e-06, "loss": 0.1712, "step": 55910 }, { "epoch": 84.09, "grad_norm": 0.7578862905502319, "learning_rate": 1.5909774436090227e-06, "loss": 0.1615, "step": 55920 }, { "epoch": 84.11, "grad_norm": 6.100287437438965, "learning_rate": 1.5894736842105265e-06, "loss": 0.144, "step": 55930 }, { "epoch": 84.12, "grad_norm": 10.830288887023926, "learning_rate": 1.5879699248120304e-06, "loss": 0.1449, "step": 55940 }, { "epoch": 84.14, "grad_norm": 6.316339015960693, "learning_rate": 1.5864661654135341e-06, "loss": 0.202, "step": 55950 }, { "epoch": 84.15, "grad_norm": 4.204655647277832, "learning_rate": 1.5849624060150377e-06, "loss": 0.1957, "step": 55960 }, { "epoch": 84.17, "grad_norm": 5.749233722686768, "learning_rate": 1.5834586466165414e-06, "loss": 0.2423, "step": 55970 }, { "epoch": 84.18, "grad_norm": 2.607123851776123, "learning_rate": 1.5819548872180451e-06, "loss": 0.1752, "step": 55980 }, { "epoch": 84.2, "grad_norm": 3.5867726802825928, "learning_rate": 1.5804511278195489e-06, "loss": 0.2333, "step": 55990 }, { "epoch": 84.21, "grad_norm": 3.1847939491271973, "learning_rate": 1.5789473684210526e-06, "loss": 0.1552, "step": 56000 }, { "epoch": 84.23, "grad_norm": 5.968934535980225, "learning_rate": 1.5774436090225565e-06, "loss": 0.155, "step": 56010 }, { "epoch": 84.24, "grad_norm": 3.6189496517181396, "learning_rate": 1.5759398496240603e-06, "loss": 0.1612, "step": 56020 }, { "epoch": 84.26, "grad_norm": 1.3396971225738525, "learning_rate": 1.574436090225564e-06, "loss": 0.1614, "step": 56030 }, { "epoch": 84.27, "grad_norm": 2.855109930038452, "learning_rate": 1.5729323308270677e-06, "loss": 0.1615, "step": 56040 }, { "epoch": 84.29, "grad_norm": 4.582769393920898, "learning_rate": 1.5714285714285714e-06, "loss": 0.2042, "step": 56050 }, { "epoch": 84.3, "grad_norm": 5.339018821716309, "learning_rate": 1.5699248120300754e-06, "loss": 0.1094, "step": 56060 }, { "epoch": 84.32, "grad_norm": 7.760763168334961, "learning_rate": 1.5684210526315791e-06, "loss": 0.2093, "step": 56070 }, { "epoch": 84.33, "grad_norm": 7.610210418701172, "learning_rate": 1.5669172932330829e-06, "loss": 0.1975, "step": 56080 }, { "epoch": 84.35, "grad_norm": 3.7673017978668213, "learning_rate": 1.5654135338345866e-06, "loss": 0.1645, "step": 56090 }, { "epoch": 84.36, "grad_norm": 2.311070203781128, "learning_rate": 1.5639097744360903e-06, "loss": 0.1574, "step": 56100 }, { "epoch": 84.38, "grad_norm": 5.664913177490234, "learning_rate": 1.562406015037594e-06, "loss": 0.1414, "step": 56110 }, { "epoch": 84.39, "grad_norm": 6.023291110992432, "learning_rate": 1.560902255639098e-06, "loss": 0.1766, "step": 56120 }, { "epoch": 84.41, "grad_norm": 4.557356357574463, "learning_rate": 1.5593984962406017e-06, "loss": 0.1912, "step": 56130 }, { "epoch": 84.42, "grad_norm": 3.221834421157837, "learning_rate": 1.5578947368421054e-06, "loss": 0.1643, "step": 56140 }, { "epoch": 84.44, "grad_norm": 7.018657684326172, "learning_rate": 1.5563909774436092e-06, "loss": 0.2211, "step": 56150 }, { "epoch": 84.45, "grad_norm": 5.987710475921631, "learning_rate": 1.554887218045113e-06, "loss": 0.1389, "step": 56160 }, { "epoch": 84.47, "grad_norm": 3.3997559547424316, "learning_rate": 1.5533834586466169e-06, "loss": 0.1328, "step": 56170 }, { "epoch": 84.48, "grad_norm": 3.822932243347168, "learning_rate": 1.5518796992481206e-06, "loss": 0.2355, "step": 56180 }, { "epoch": 84.5, "grad_norm": 4.0138349533081055, "learning_rate": 1.5503759398496243e-06, "loss": 0.1783, "step": 56190 }, { "epoch": 84.51, "grad_norm": 8.904444694519043, "learning_rate": 1.548872180451128e-06, "loss": 0.2122, "step": 56200 }, { "epoch": 84.53, "grad_norm": 7.065569877624512, "learning_rate": 1.5473684210526316e-06, "loss": 0.1961, "step": 56210 }, { "epoch": 84.54, "grad_norm": 8.500096321105957, "learning_rate": 1.5458646616541353e-06, "loss": 0.1663, "step": 56220 }, { "epoch": 84.56, "grad_norm": 4.012298583984375, "learning_rate": 1.544360902255639e-06, "loss": 0.2012, "step": 56230 }, { "epoch": 84.57, "grad_norm": 7.489271640777588, "learning_rate": 1.542857142857143e-06, "loss": 0.1539, "step": 56240 }, { "epoch": 84.59, "grad_norm": 3.202110767364502, "learning_rate": 1.5413533834586467e-06, "loss": 0.2089, "step": 56250 }, { "epoch": 84.6, "grad_norm": 1.4287196397781372, "learning_rate": 1.5398496240601504e-06, "loss": 0.1811, "step": 56260 }, { "epoch": 84.62, "grad_norm": 5.019970417022705, "learning_rate": 1.5383458646616542e-06, "loss": 0.184, "step": 56270 }, { "epoch": 84.63, "grad_norm": 2.368131637573242, "learning_rate": 1.5368421052631579e-06, "loss": 0.1327, "step": 56280 }, { "epoch": 84.65, "grad_norm": 3.924480676651001, "learning_rate": 1.5353383458646618e-06, "loss": 0.1848, "step": 56290 }, { "epoch": 84.66, "grad_norm": 8.339082717895508, "learning_rate": 1.5338345864661656e-06, "loss": 0.2257, "step": 56300 }, { "epoch": 84.68, "grad_norm": 4.868645668029785, "learning_rate": 1.5323308270676693e-06, "loss": 0.1838, "step": 56310 }, { "epoch": 84.69, "grad_norm": 5.948423862457275, "learning_rate": 1.530827067669173e-06, "loss": 0.1669, "step": 56320 }, { "epoch": 84.71, "grad_norm": 3.8026175498962402, "learning_rate": 1.5293233082706768e-06, "loss": 0.1501, "step": 56330 }, { "epoch": 84.72, "grad_norm": 6.754022598266602, "learning_rate": 1.5278195488721805e-06, "loss": 0.1706, "step": 56340 }, { "epoch": 84.74, "grad_norm": 4.1264495849609375, "learning_rate": 1.5263157894736844e-06, "loss": 0.2263, "step": 56350 }, { "epoch": 84.75, "grad_norm": 7.627476692199707, "learning_rate": 1.5248120300751882e-06, "loss": 0.1672, "step": 56360 }, { "epoch": 84.77, "grad_norm": 3.8254830837249756, "learning_rate": 1.5233082706766919e-06, "loss": 0.2137, "step": 56370 }, { "epoch": 84.78, "grad_norm": 4.202638149261475, "learning_rate": 1.5218045112781956e-06, "loss": 0.1923, "step": 56380 }, { "epoch": 84.8, "grad_norm": 4.201712608337402, "learning_rate": 1.5203007518796994e-06, "loss": 0.1436, "step": 56390 }, { "epoch": 84.81, "grad_norm": 4.552309989929199, "learning_rate": 1.5187969924812033e-06, "loss": 0.1477, "step": 56400 }, { "epoch": 84.83, "grad_norm": 5.196654319763184, "learning_rate": 1.517293233082707e-06, "loss": 0.2116, "step": 56410 }, { "epoch": 84.84, "grad_norm": 6.609741687774658, "learning_rate": 1.5157894736842108e-06, "loss": 0.2068, "step": 56420 }, { "epoch": 84.86, "grad_norm": 3.7197117805480957, "learning_rate": 1.5142857142857145e-06, "loss": 0.2105, "step": 56430 }, { "epoch": 84.87, "grad_norm": 1.3698334693908691, "learning_rate": 1.5127819548872182e-06, "loss": 0.2096, "step": 56440 }, { "epoch": 84.89, "grad_norm": 5.72020959854126, "learning_rate": 1.5112781954887222e-06, "loss": 0.1488, "step": 56450 }, { "epoch": 84.9, "grad_norm": 2.5910472869873047, "learning_rate": 1.5097744360902255e-06, "loss": 0.1275, "step": 56460 }, { "epoch": 84.92, "grad_norm": 6.082350254058838, "learning_rate": 1.5082706766917294e-06, "loss": 0.1829, "step": 56470 }, { "epoch": 84.93, "grad_norm": 3.298006296157837, "learning_rate": 1.5067669172932331e-06, "loss": 0.1375, "step": 56480 }, { "epoch": 84.95, "grad_norm": 5.607242107391357, "learning_rate": 1.5052631578947369e-06, "loss": 0.155, "step": 56490 }, { "epoch": 84.96, "grad_norm": 3.7076168060302734, "learning_rate": 1.5037593984962406e-06, "loss": 0.1593, "step": 56500 }, { "epoch": 84.98, "grad_norm": 2.4990506172180176, "learning_rate": 1.5022556390977443e-06, "loss": 0.1322, "step": 56510 }, { "epoch": 84.99, "grad_norm": 2.6433887481689453, "learning_rate": 1.5007518796992483e-06, "loss": 0.1706, "step": 56520 }, { "epoch": 85.0, "eval_accuracy": 0.928, "eval_loss": 0.3395210802555084, "eval_runtime": 84.3204, "eval_samples_per_second": 118.595, "eval_steps_per_second": 0.474, "step": 56525 }, { "epoch": 85.01, "grad_norm": 4.445404529571533, "learning_rate": 1.499248120300752e-06, "loss": 0.162, "step": 56530 }, { "epoch": 85.02, "grad_norm": 9.234296798706055, "learning_rate": 1.4977443609022557e-06, "loss": 0.1999, "step": 56540 }, { "epoch": 85.04, "grad_norm": 4.423348426818848, "learning_rate": 1.4962406015037595e-06, "loss": 0.1403, "step": 56550 }, { "epoch": 85.05, "grad_norm": 2.3429605960845947, "learning_rate": 1.4947368421052632e-06, "loss": 0.199, "step": 56560 }, { "epoch": 85.07, "grad_norm": 5.699217319488525, "learning_rate": 1.493233082706767e-06, "loss": 0.2032, "step": 56570 }, { "epoch": 85.08, "grad_norm": 3.4555978775024414, "learning_rate": 1.4917293233082709e-06, "loss": 0.1305, "step": 56580 }, { "epoch": 85.1, "grad_norm": 4.182602882385254, "learning_rate": 1.4902255639097746e-06, "loss": 0.2042, "step": 56590 }, { "epoch": 85.11, "grad_norm": 7.7609686851501465, "learning_rate": 1.4887218045112783e-06, "loss": 0.2182, "step": 56600 }, { "epoch": 85.13, "grad_norm": 4.991521835327148, "learning_rate": 1.487218045112782e-06, "loss": 0.1919, "step": 56610 }, { "epoch": 85.14, "grad_norm": 5.988854885101318, "learning_rate": 1.4857142857142858e-06, "loss": 0.2383, "step": 56620 }, { "epoch": 85.16, "grad_norm": 3.479478597640991, "learning_rate": 1.4842105263157897e-06, "loss": 0.2143, "step": 56630 }, { "epoch": 85.17, "grad_norm": 4.183614253997803, "learning_rate": 1.4827067669172935e-06, "loss": 0.2017, "step": 56640 }, { "epoch": 85.19, "grad_norm": 4.1281418800354, "learning_rate": 1.4812030075187972e-06, "loss": 0.1597, "step": 56650 }, { "epoch": 85.2, "grad_norm": 4.900537014007568, "learning_rate": 1.479699248120301e-06, "loss": 0.1677, "step": 56660 }, { "epoch": 85.22, "grad_norm": 2.788635492324829, "learning_rate": 1.4781954887218047e-06, "loss": 0.1888, "step": 56670 }, { "epoch": 85.23, "grad_norm": 8.006430625915527, "learning_rate": 1.4766917293233086e-06, "loss": 0.1544, "step": 56680 }, { "epoch": 85.25, "grad_norm": 4.344766616821289, "learning_rate": 1.4751879699248123e-06, "loss": 0.147, "step": 56690 }, { "epoch": 85.26, "grad_norm": 5.809935092926025, "learning_rate": 1.4736842105263159e-06, "loss": 0.1912, "step": 56700 }, { "epoch": 85.28, "grad_norm": 6.294594764709473, "learning_rate": 1.4721804511278196e-06, "loss": 0.1366, "step": 56710 }, { "epoch": 85.29, "grad_norm": 5.718160152435303, "learning_rate": 1.4706766917293233e-06, "loss": 0.1554, "step": 56720 }, { "epoch": 85.31, "grad_norm": 4.895095348358154, "learning_rate": 1.469172932330827e-06, "loss": 0.2095, "step": 56730 }, { "epoch": 85.32, "grad_norm": 4.996368885040283, "learning_rate": 1.4676691729323308e-06, "loss": 0.2099, "step": 56740 }, { "epoch": 85.34, "grad_norm": 4.071951866149902, "learning_rate": 1.4661654135338347e-06, "loss": 0.1789, "step": 56750 }, { "epoch": 85.35, "grad_norm": 4.790773391723633, "learning_rate": 1.4646616541353385e-06, "loss": 0.2203, "step": 56760 }, { "epoch": 85.37, "grad_norm": 3.539854049682617, "learning_rate": 1.4631578947368422e-06, "loss": 0.1297, "step": 56770 }, { "epoch": 85.38, "grad_norm": 8.937888145446777, "learning_rate": 1.461654135338346e-06, "loss": 0.2044, "step": 56780 }, { "epoch": 85.4, "grad_norm": 5.165222644805908, "learning_rate": 1.4601503759398496e-06, "loss": 0.1447, "step": 56790 }, { "epoch": 85.41, "grad_norm": 7.947958946228027, "learning_rate": 1.4586466165413534e-06, "loss": 0.2182, "step": 56800 }, { "epoch": 85.43, "grad_norm": 3.317690849304199, "learning_rate": 1.4571428571428573e-06, "loss": 0.2299, "step": 56810 }, { "epoch": 85.44, "grad_norm": 3.5233917236328125, "learning_rate": 1.455639097744361e-06, "loss": 0.2098, "step": 56820 }, { "epoch": 85.46, "grad_norm": 3.6351675987243652, "learning_rate": 1.4541353383458648e-06, "loss": 0.1576, "step": 56830 }, { "epoch": 85.47, "grad_norm": 5.808753490447998, "learning_rate": 1.4526315789473685e-06, "loss": 0.1761, "step": 56840 }, { "epoch": 85.49, "grad_norm": 4.585028171539307, "learning_rate": 1.4511278195488722e-06, "loss": 0.1982, "step": 56850 }, { "epoch": 85.5, "grad_norm": 5.568889617919922, "learning_rate": 1.4496240601503762e-06, "loss": 0.1516, "step": 56860 }, { "epoch": 85.52, "grad_norm": 9.540451049804688, "learning_rate": 1.44812030075188e-06, "loss": 0.1922, "step": 56870 }, { "epoch": 85.53, "grad_norm": 3.1969432830810547, "learning_rate": 1.4466165413533836e-06, "loss": 0.1595, "step": 56880 }, { "epoch": 85.55, "grad_norm": 5.823395252227783, "learning_rate": 1.4451127819548874e-06, "loss": 0.2047, "step": 56890 }, { "epoch": 85.56, "grad_norm": 5.089601039886475, "learning_rate": 1.4436090225563911e-06, "loss": 0.2095, "step": 56900 }, { "epoch": 85.58, "grad_norm": 6.277038097381592, "learning_rate": 1.442105263157895e-06, "loss": 0.154, "step": 56910 }, { "epoch": 85.59, "grad_norm": 4.296693801879883, "learning_rate": 1.4406015037593988e-06, "loss": 0.1405, "step": 56920 }, { "epoch": 85.61, "grad_norm": 4.177289962768555, "learning_rate": 1.4390977443609025e-06, "loss": 0.1869, "step": 56930 }, { "epoch": 85.62, "grad_norm": 5.647763729095459, "learning_rate": 1.4375939849624062e-06, "loss": 0.2275, "step": 56940 }, { "epoch": 85.64, "grad_norm": 6.22650146484375, "learning_rate": 1.4360902255639098e-06, "loss": 0.1854, "step": 56950 }, { "epoch": 85.65, "grad_norm": 1.4636141061782837, "learning_rate": 1.4345864661654135e-06, "loss": 0.1481, "step": 56960 }, { "epoch": 85.67, "grad_norm": 5.929630279541016, "learning_rate": 1.4330827067669172e-06, "loss": 0.1671, "step": 56970 }, { "epoch": 85.68, "grad_norm": 6.411832809448242, "learning_rate": 1.4315789473684212e-06, "loss": 0.1598, "step": 56980 }, { "epoch": 85.7, "grad_norm": 4.0760602951049805, "learning_rate": 1.430075187969925e-06, "loss": 0.1493, "step": 56990 }, { "epoch": 85.71, "grad_norm": 5.515470027923584, "learning_rate": 1.4285714285714286e-06, "loss": 0.133, "step": 57000 }, { "epoch": 85.73, "grad_norm": 4.49883508682251, "learning_rate": 1.4270676691729324e-06, "loss": 0.2057, "step": 57010 }, { "epoch": 85.74, "grad_norm": 6.55890417098999, "learning_rate": 1.425563909774436e-06, "loss": 0.1498, "step": 57020 }, { "epoch": 85.76, "grad_norm": 6.194241523742676, "learning_rate": 1.4240601503759398e-06, "loss": 0.1963, "step": 57030 }, { "epoch": 85.77, "grad_norm": 5.173956394195557, "learning_rate": 1.4225563909774438e-06, "loss": 0.1443, "step": 57040 }, { "epoch": 85.79, "grad_norm": 4.395569801330566, "learning_rate": 1.4210526315789475e-06, "loss": 0.1895, "step": 57050 }, { "epoch": 85.8, "grad_norm": 6.59354305267334, "learning_rate": 1.4195488721804512e-06, "loss": 0.1671, "step": 57060 }, { "epoch": 85.82, "grad_norm": 4.2721028327941895, "learning_rate": 1.418045112781955e-06, "loss": 0.1004, "step": 57070 }, { "epoch": 85.83, "grad_norm": 6.9995436668396, "learning_rate": 1.4165413533834587e-06, "loss": 0.1782, "step": 57080 }, { "epoch": 85.85, "grad_norm": 4.45127010345459, "learning_rate": 1.4150375939849626e-06, "loss": 0.2011, "step": 57090 }, { "epoch": 85.86, "grad_norm": 7.801600456237793, "learning_rate": 1.4135338345864664e-06, "loss": 0.1099, "step": 57100 }, { "epoch": 85.88, "grad_norm": 5.012579917907715, "learning_rate": 1.41203007518797e-06, "loss": 0.1656, "step": 57110 }, { "epoch": 85.89, "grad_norm": 6.178292751312256, "learning_rate": 1.4105263157894738e-06, "loss": 0.1934, "step": 57120 }, { "epoch": 85.91, "grad_norm": 3.175626039505005, "learning_rate": 1.4090225563909776e-06, "loss": 0.1558, "step": 57130 }, { "epoch": 85.92, "grad_norm": 7.903815746307373, "learning_rate": 1.4075187969924815e-06, "loss": 0.1989, "step": 57140 }, { "epoch": 85.94, "grad_norm": 5.2723774909973145, "learning_rate": 1.4060150375939852e-06, "loss": 0.1617, "step": 57150 }, { "epoch": 85.95, "grad_norm": 7.170572280883789, "learning_rate": 1.404511278195489e-06, "loss": 0.1336, "step": 57160 }, { "epoch": 85.97, "grad_norm": 8.153802871704102, "learning_rate": 1.4030075187969927e-06, "loss": 0.1318, "step": 57170 }, { "epoch": 85.98, "grad_norm": 1.0930061340332031, "learning_rate": 1.4015037593984964e-06, "loss": 0.1619, "step": 57180 }, { "epoch": 86.0, "grad_norm": 1.082709789276123, "learning_rate": 1.4000000000000001e-06, "loss": 0.199, "step": 57190 }, { "epoch": 86.0, "eval_accuracy": 0.9303, "eval_loss": 0.33443817496299744, "eval_runtime": 84.4787, "eval_samples_per_second": 118.373, "eval_steps_per_second": 0.473, "step": 57190 }, { "epoch": 86.02, "grad_norm": 4.792495250701904, "learning_rate": 1.3984962406015037e-06, "loss": 0.1635, "step": 57200 }, { "epoch": 86.03, "grad_norm": 6.919180870056152, "learning_rate": 1.3969924812030076e-06, "loss": 0.1994, "step": 57210 }, { "epoch": 86.05, "grad_norm": 3.3340089321136475, "learning_rate": 1.3954887218045113e-06, "loss": 0.1505, "step": 57220 }, { "epoch": 86.06, "grad_norm": 4.591991424560547, "learning_rate": 1.393984962406015e-06, "loss": 0.1473, "step": 57230 }, { "epoch": 86.08, "grad_norm": 5.2993550300598145, "learning_rate": 1.3924812030075188e-06, "loss": 0.1736, "step": 57240 }, { "epoch": 86.09, "grad_norm": 3.2715325355529785, "learning_rate": 1.3909774436090225e-06, "loss": 0.1788, "step": 57250 }, { "epoch": 86.11, "grad_norm": 5.598659515380859, "learning_rate": 1.3894736842105263e-06, "loss": 0.1715, "step": 57260 }, { "epoch": 86.12, "grad_norm": 5.6875, "learning_rate": 1.3879699248120302e-06, "loss": 0.1439, "step": 57270 }, { "epoch": 86.14, "grad_norm": 3.4048891067504883, "learning_rate": 1.386466165413534e-06, "loss": 0.1514, "step": 57280 }, { "epoch": 86.15, "grad_norm": 6.038202285766602, "learning_rate": 1.3849624060150377e-06, "loss": 0.1718, "step": 57290 }, { "epoch": 86.17, "grad_norm": 4.095704078674316, "learning_rate": 1.3834586466165414e-06, "loss": 0.159, "step": 57300 }, { "epoch": 86.18, "grad_norm": 6.841707706451416, "learning_rate": 1.3819548872180451e-06, "loss": 0.1779, "step": 57310 }, { "epoch": 86.2, "grad_norm": 2.8215293884277344, "learning_rate": 1.380451127819549e-06, "loss": 0.1868, "step": 57320 }, { "epoch": 86.21, "grad_norm": 6.776797294616699, "learning_rate": 1.3789473684210528e-06, "loss": 0.183, "step": 57330 }, { "epoch": 86.23, "grad_norm": 9.345000267028809, "learning_rate": 1.3774436090225565e-06, "loss": 0.1724, "step": 57340 }, { "epoch": 86.24, "grad_norm": 5.53676700592041, "learning_rate": 1.3759398496240603e-06, "loss": 0.186, "step": 57350 }, { "epoch": 86.26, "grad_norm": 6.88644552230835, "learning_rate": 1.374436090225564e-06, "loss": 0.1801, "step": 57360 }, { "epoch": 86.27, "grad_norm": 10.067435264587402, "learning_rate": 1.372932330827068e-06, "loss": 0.1977, "step": 57370 }, { "epoch": 86.29, "grad_norm": 3.2209675312042236, "learning_rate": 1.3714285714285717e-06, "loss": 0.1715, "step": 57380 }, { "epoch": 86.3, "grad_norm": 5.24599027633667, "learning_rate": 1.3699248120300754e-06, "loss": 0.1733, "step": 57390 }, { "epoch": 86.32, "grad_norm": 4.074387550354004, "learning_rate": 1.3684210526315791e-06, "loss": 0.1588, "step": 57400 }, { "epoch": 86.33, "grad_norm": 3.2896995544433594, "learning_rate": 1.3669172932330829e-06, "loss": 0.1384, "step": 57410 }, { "epoch": 86.35, "grad_norm": 6.851515293121338, "learning_rate": 1.3654135338345866e-06, "loss": 0.2287, "step": 57420 }, { "epoch": 86.36, "grad_norm": 3.76686954498291, "learning_rate": 1.3639097744360905e-06, "loss": 0.1623, "step": 57430 }, { "epoch": 86.38, "grad_norm": 7.297066688537598, "learning_rate": 1.3624060150375943e-06, "loss": 0.1177, "step": 57440 }, { "epoch": 86.39, "grad_norm": 4.633073329925537, "learning_rate": 1.3609022556390978e-06, "loss": 0.1634, "step": 57450 }, { "epoch": 86.41, "grad_norm": 5.141488552093506, "learning_rate": 1.3593984962406015e-06, "loss": 0.2001, "step": 57460 }, { "epoch": 86.42, "grad_norm": 4.2999467849731445, "learning_rate": 1.3578947368421052e-06, "loss": 0.1434, "step": 57470 }, { "epoch": 86.44, "grad_norm": 1.721150517463684, "learning_rate": 1.356390977443609e-06, "loss": 0.2166, "step": 57480 }, { "epoch": 86.45, "grad_norm": 6.083622455596924, "learning_rate": 1.3548872180451127e-06, "loss": 0.1603, "step": 57490 }, { "epoch": 86.47, "grad_norm": 8.409628868103027, "learning_rate": 1.3533834586466167e-06, "loss": 0.1633, "step": 57500 }, { "epoch": 86.48, "grad_norm": 5.203959941864014, "learning_rate": 1.3518796992481204e-06, "loss": 0.191, "step": 57510 }, { "epoch": 86.5, "grad_norm": 3.071166515350342, "learning_rate": 1.3503759398496241e-06, "loss": 0.1879, "step": 57520 }, { "epoch": 86.51, "grad_norm": 3.929776191711426, "learning_rate": 1.3488721804511278e-06, "loss": 0.1791, "step": 57530 }, { "epoch": 86.53, "grad_norm": 4.9922709465026855, "learning_rate": 1.3473684210526316e-06, "loss": 0.1235, "step": 57540 }, { "epoch": 86.54, "grad_norm": 3.8462400436401367, "learning_rate": 1.3458646616541355e-06, "loss": 0.1654, "step": 57550 }, { "epoch": 86.56, "grad_norm": 3.1140220165252686, "learning_rate": 1.3443609022556392e-06, "loss": 0.2052, "step": 57560 }, { "epoch": 86.57, "grad_norm": 8.724045753479004, "learning_rate": 1.342857142857143e-06, "loss": 0.1703, "step": 57570 }, { "epoch": 86.59, "grad_norm": 4.744037628173828, "learning_rate": 1.3413533834586467e-06, "loss": 0.2456, "step": 57580 }, { "epoch": 86.6, "grad_norm": 2.585832118988037, "learning_rate": 1.3398496240601504e-06, "loss": 0.1464, "step": 57590 }, { "epoch": 86.62, "grad_norm": 5.518332004547119, "learning_rate": 1.3383458646616544e-06, "loss": 0.1899, "step": 57600 }, { "epoch": 86.63, "grad_norm": 12.576923370361328, "learning_rate": 1.3368421052631581e-06, "loss": 0.1964, "step": 57610 }, { "epoch": 86.65, "grad_norm": 2.010282516479492, "learning_rate": 1.3353383458646618e-06, "loss": 0.1554, "step": 57620 }, { "epoch": 86.66, "grad_norm": 2.587585687637329, "learning_rate": 1.3338345864661656e-06, "loss": 0.1413, "step": 57630 }, { "epoch": 86.68, "grad_norm": 4.673181533813477, "learning_rate": 1.3323308270676693e-06, "loss": 0.209, "step": 57640 }, { "epoch": 86.69, "grad_norm": 3.9027647972106934, "learning_rate": 1.330827067669173e-06, "loss": 0.176, "step": 57650 }, { "epoch": 86.71, "grad_norm": 7.965885639190674, "learning_rate": 1.329323308270677e-06, "loss": 0.1014, "step": 57660 }, { "epoch": 86.72, "grad_norm": 4.3658881187438965, "learning_rate": 1.3278195488721807e-06, "loss": 0.1876, "step": 57670 }, { "epoch": 86.74, "grad_norm": 8.340523719787598, "learning_rate": 1.3263157894736844e-06, "loss": 0.1785, "step": 57680 }, { "epoch": 86.75, "grad_norm": 4.654315948486328, "learning_rate": 1.3248120300751882e-06, "loss": 0.1522, "step": 57690 }, { "epoch": 86.77, "grad_norm": 7.557748317718506, "learning_rate": 1.3233082706766917e-06, "loss": 0.2559, "step": 57700 }, { "epoch": 86.78, "grad_norm": 5.0106329917907715, "learning_rate": 1.3218045112781954e-06, "loss": 0.1944, "step": 57710 }, { "epoch": 86.8, "grad_norm": 5.76795768737793, "learning_rate": 1.3203007518796992e-06, "loss": 0.1632, "step": 57720 }, { "epoch": 86.81, "grad_norm": 4.331972599029541, "learning_rate": 1.318796992481203e-06, "loss": 0.1461, "step": 57730 }, { "epoch": 86.83, "grad_norm": 5.625306606292725, "learning_rate": 1.3172932330827068e-06, "loss": 0.0988, "step": 57740 }, { "epoch": 86.84, "grad_norm": 3.771822452545166, "learning_rate": 1.3157894736842106e-06, "loss": 0.139, "step": 57750 }, { "epoch": 86.86, "grad_norm": 2.988506555557251, "learning_rate": 1.3142857142857143e-06, "loss": 0.1482, "step": 57760 }, { "epoch": 86.87, "grad_norm": 6.377354145050049, "learning_rate": 1.312781954887218e-06, "loss": 0.1966, "step": 57770 }, { "epoch": 86.89, "grad_norm": 26.265287399291992, "learning_rate": 1.311278195488722e-06, "loss": 0.2471, "step": 57780 }, { "epoch": 86.9, "grad_norm": 7.3494062423706055, "learning_rate": 1.3097744360902257e-06, "loss": 0.1744, "step": 57790 }, { "epoch": 86.92, "grad_norm": 6.284552574157715, "learning_rate": 1.3082706766917294e-06, "loss": 0.1936, "step": 57800 }, { "epoch": 86.93, "grad_norm": 2.87434720993042, "learning_rate": 1.3067669172932332e-06, "loss": 0.1445, "step": 57810 }, { "epoch": 86.95, "grad_norm": 3.9225971698760986, "learning_rate": 1.3052631578947369e-06, "loss": 0.1282, "step": 57820 }, { "epoch": 86.96, "grad_norm": 6.310343265533447, "learning_rate": 1.3037593984962408e-06, "loss": 0.1504, "step": 57830 }, { "epoch": 86.98, "grad_norm": 2.322134494781494, "learning_rate": 1.3022556390977446e-06, "loss": 0.1774, "step": 57840 }, { "epoch": 86.99, "grad_norm": 6.937019348144531, "learning_rate": 1.3007518796992483e-06, "loss": 0.2013, "step": 57850 }, { "epoch": 87.0, "eval_accuracy": 0.9294, "eval_loss": 0.33600765466690063, "eval_runtime": 84.8466, "eval_samples_per_second": 117.86, "eval_steps_per_second": 0.471, "step": 57855 }, { "epoch": 87.01, "grad_norm": 6.285330772399902, "learning_rate": 1.299248120300752e-06, "loss": 0.1726, "step": 57860 }, { "epoch": 87.02, "grad_norm": 5.90061616897583, "learning_rate": 1.2977443609022557e-06, "loss": 0.1748, "step": 57870 }, { "epoch": 87.04, "grad_norm": 4.586902141571045, "learning_rate": 1.2962406015037595e-06, "loss": 0.1611, "step": 57880 }, { "epoch": 87.05, "grad_norm": 6.66023063659668, "learning_rate": 1.2947368421052634e-06, "loss": 0.1986, "step": 57890 }, { "epoch": 87.07, "grad_norm": 5.385473728179932, "learning_rate": 1.2932330827067672e-06, "loss": 0.1067, "step": 57900 }, { "epoch": 87.08, "grad_norm": 6.819510459899902, "learning_rate": 1.2917293233082709e-06, "loss": 0.2073, "step": 57910 }, { "epoch": 87.1, "grad_norm": 6.095653533935547, "learning_rate": 1.2902255639097746e-06, "loss": 0.182, "step": 57920 }, { "epoch": 87.11, "grad_norm": 5.213226318359375, "learning_rate": 1.2887218045112783e-06, "loss": 0.1669, "step": 57930 }, { "epoch": 87.13, "grad_norm": 6.771183013916016, "learning_rate": 1.2872180451127819e-06, "loss": 0.1516, "step": 57940 }, { "epoch": 87.14, "grad_norm": 4.856118202209473, "learning_rate": 1.2857142857142856e-06, "loss": 0.1819, "step": 57950 }, { "epoch": 87.16, "grad_norm": 4.535681247711182, "learning_rate": 1.2842105263157895e-06, "loss": 0.14, "step": 57960 }, { "epoch": 87.17, "grad_norm": 4.036566734313965, "learning_rate": 1.2827067669172933e-06, "loss": 0.136, "step": 57970 }, { "epoch": 87.19, "grad_norm": 4.947079181671143, "learning_rate": 1.281203007518797e-06, "loss": 0.1617, "step": 57980 }, { "epoch": 87.2, "grad_norm": 2.912419557571411, "learning_rate": 1.2796992481203007e-06, "loss": 0.1863, "step": 57990 }, { "epoch": 87.22, "grad_norm": 5.682669639587402, "learning_rate": 1.2781954887218045e-06, "loss": 0.1317, "step": 58000 }, { "epoch": 87.23, "grad_norm": 5.527998447418213, "learning_rate": 1.2766917293233084e-06, "loss": 0.1618, "step": 58010 }, { "epoch": 87.25, "grad_norm": 6.747208595275879, "learning_rate": 1.2751879699248121e-06, "loss": 0.1686, "step": 58020 }, { "epoch": 87.26, "grad_norm": 5.503549098968506, "learning_rate": 1.2736842105263159e-06, "loss": 0.1881, "step": 58030 }, { "epoch": 87.28, "grad_norm": 5.4713029861450195, "learning_rate": 1.2721804511278196e-06, "loss": 0.184, "step": 58040 }, { "epoch": 87.29, "grad_norm": 2.895097494125366, "learning_rate": 1.2706766917293233e-06, "loss": 0.1819, "step": 58050 }, { "epoch": 87.31, "grad_norm": 6.448250770568848, "learning_rate": 1.2691729323308273e-06, "loss": 0.1763, "step": 58060 }, { "epoch": 87.32, "grad_norm": 3.6069977283477783, "learning_rate": 1.267669172932331e-06, "loss": 0.1478, "step": 58070 }, { "epoch": 87.34, "grad_norm": 13.077876091003418, "learning_rate": 1.2661654135338347e-06, "loss": 0.1803, "step": 58080 }, { "epoch": 87.35, "grad_norm": 2.1468687057495117, "learning_rate": 1.2646616541353385e-06, "loss": 0.1337, "step": 58090 }, { "epoch": 87.37, "grad_norm": 4.480571269989014, "learning_rate": 1.2631578947368422e-06, "loss": 0.1444, "step": 58100 }, { "epoch": 87.38, "grad_norm": 8.891602516174316, "learning_rate": 1.261654135338346e-06, "loss": 0.1664, "step": 58110 }, { "epoch": 87.4, "grad_norm": 8.419784545898438, "learning_rate": 1.2601503759398499e-06, "loss": 0.1594, "step": 58120 }, { "epoch": 87.41, "grad_norm": 3.2222964763641357, "learning_rate": 1.2586466165413536e-06, "loss": 0.1197, "step": 58130 }, { "epoch": 87.43, "grad_norm": 6.81358528137207, "learning_rate": 1.2571428571428573e-06, "loss": 0.1934, "step": 58140 }, { "epoch": 87.44, "grad_norm": 8.876537322998047, "learning_rate": 1.255639097744361e-06, "loss": 0.1814, "step": 58150 }, { "epoch": 87.46, "grad_norm": 7.15813684463501, "learning_rate": 1.2541353383458648e-06, "loss": 0.1791, "step": 58160 }, { "epoch": 87.47, "grad_norm": 7.933776378631592, "learning_rate": 1.2526315789473687e-06, "loss": 0.1833, "step": 58170 }, { "epoch": 87.49, "grad_norm": 4.0624284744262695, "learning_rate": 1.2511278195488725e-06, "loss": 0.1686, "step": 58180 }, { "epoch": 87.5, "grad_norm": 6.974576950073242, "learning_rate": 1.249624060150376e-06, "loss": 0.151, "step": 58190 }, { "epoch": 87.52, "grad_norm": 5.813340663909912, "learning_rate": 1.2481203007518797e-06, "loss": 0.158, "step": 58200 }, { "epoch": 87.53, "grad_norm": 4.3520073890686035, "learning_rate": 1.2466165413533837e-06, "loss": 0.1725, "step": 58210 }, { "epoch": 87.55, "grad_norm": 5.577434539794922, "learning_rate": 1.2451127819548874e-06, "loss": 0.178, "step": 58220 }, { "epoch": 87.56, "grad_norm": 1.6522976160049438, "learning_rate": 1.2436090225563911e-06, "loss": 0.17, "step": 58230 }, { "epoch": 87.58, "grad_norm": 8.239200592041016, "learning_rate": 1.2421052631578948e-06, "loss": 0.1651, "step": 58240 }, { "epoch": 87.59, "grad_norm": 3.1411798000335693, "learning_rate": 1.2406015037593986e-06, "loss": 0.1839, "step": 58250 }, { "epoch": 87.61, "grad_norm": 8.265312194824219, "learning_rate": 1.2390977443609023e-06, "loss": 0.1692, "step": 58260 }, { "epoch": 87.62, "grad_norm": 7.520175457000732, "learning_rate": 1.237593984962406e-06, "loss": 0.1909, "step": 58270 }, { "epoch": 87.64, "grad_norm": 9.53011703491211, "learning_rate": 1.2360902255639098e-06, "loss": 0.2348, "step": 58280 }, { "epoch": 87.65, "grad_norm": 5.072795391082764, "learning_rate": 1.2345864661654137e-06, "loss": 0.1431, "step": 58290 }, { "epoch": 87.67, "grad_norm": 4.908288955688477, "learning_rate": 1.2330827067669174e-06, "loss": 0.2498, "step": 58300 }, { "epoch": 87.68, "grad_norm": 5.3725762367248535, "learning_rate": 1.2315789473684212e-06, "loss": 0.1617, "step": 58310 }, { "epoch": 87.7, "grad_norm": 5.9502034187316895, "learning_rate": 1.230075187969925e-06, "loss": 0.0983, "step": 58320 }, { "epoch": 87.71, "grad_norm": 7.810244083404541, "learning_rate": 1.2285714285714286e-06, "loss": 0.1523, "step": 58330 }, { "epoch": 87.73, "grad_norm": 3.383789539337158, "learning_rate": 1.2270676691729324e-06, "loss": 0.1778, "step": 58340 }, { "epoch": 87.74, "grad_norm": 3.804659366607666, "learning_rate": 1.2255639097744363e-06, "loss": 0.1638, "step": 58350 }, { "epoch": 87.76, "grad_norm": 6.533144950866699, "learning_rate": 1.22406015037594e-06, "loss": 0.1497, "step": 58360 }, { "epoch": 87.77, "grad_norm": 5.189228057861328, "learning_rate": 1.2225563909774438e-06, "loss": 0.15, "step": 58370 }, { "epoch": 87.79, "grad_norm": 7.200681209564209, "learning_rate": 1.2210526315789475e-06, "loss": 0.151, "step": 58380 }, { "epoch": 87.8, "grad_norm": 5.005568027496338, "learning_rate": 1.2195488721804512e-06, "loss": 0.1967, "step": 58390 }, { "epoch": 87.82, "grad_norm": 7.443274974822998, "learning_rate": 1.218045112781955e-06, "loss": 0.2047, "step": 58400 }, { "epoch": 87.83, "grad_norm": 8.02253532409668, "learning_rate": 1.2165413533834587e-06, "loss": 0.1725, "step": 58410 }, { "epoch": 87.85, "grad_norm": 4.530572414398193, "learning_rate": 1.2150375939849624e-06, "loss": 0.2269, "step": 58420 }, { "epoch": 87.86, "grad_norm": 6.738208293914795, "learning_rate": 1.2135338345864662e-06, "loss": 0.1799, "step": 58430 }, { "epoch": 87.88, "grad_norm": 4.552396297454834, "learning_rate": 1.21203007518797e-06, "loss": 0.1505, "step": 58440 }, { "epoch": 87.89, "grad_norm": 3.540454387664795, "learning_rate": 1.2105263157894738e-06, "loss": 0.1816, "step": 58450 }, { "epoch": 87.91, "grad_norm": 4.805243015289307, "learning_rate": 1.2090225563909776e-06, "loss": 0.1457, "step": 58460 }, { "epoch": 87.92, "grad_norm": 6.110500812530518, "learning_rate": 1.2075187969924813e-06, "loss": 0.1308, "step": 58470 }, { "epoch": 87.94, "grad_norm": 2.2697036266326904, "learning_rate": 1.206015037593985e-06, "loss": 0.1508, "step": 58480 }, { "epoch": 87.95, "grad_norm": 1.7307939529418945, "learning_rate": 1.204511278195489e-06, "loss": 0.1436, "step": 58490 }, { "epoch": 87.97, "grad_norm": 4.120843410491943, "learning_rate": 1.2030075187969925e-06, "loss": 0.1766, "step": 58500 }, { "epoch": 87.98, "grad_norm": 5.4482927322387695, "learning_rate": 1.2015037593984962e-06, "loss": 0.2227, "step": 58510 }, { "epoch": 88.0, "grad_norm": 0.018793661147356033, "learning_rate": 1.2000000000000002e-06, "loss": 0.1495, "step": 58520 }, { "epoch": 88.0, "eval_accuracy": 0.9307, "eval_loss": 0.33708590269088745, "eval_runtime": 84.8059, "eval_samples_per_second": 117.916, "eval_steps_per_second": 0.472, "step": 58520 }, { "epoch": 88.02, "grad_norm": 4.496594429016113, "learning_rate": 1.1984962406015039e-06, "loss": 0.2237, "step": 58530 }, { "epoch": 88.03, "grad_norm": 4.44071102142334, "learning_rate": 1.1969924812030076e-06, "loss": 0.2014, "step": 58540 }, { "epoch": 88.05, "grad_norm": 4.390169620513916, "learning_rate": 1.1954887218045113e-06, "loss": 0.1108, "step": 58550 }, { "epoch": 88.06, "grad_norm": 5.913539409637451, "learning_rate": 1.193984962406015e-06, "loss": 0.1861, "step": 58560 }, { "epoch": 88.08, "grad_norm": 4.491770267486572, "learning_rate": 1.1924812030075188e-06, "loss": 0.1935, "step": 58570 }, { "epoch": 88.09, "grad_norm": 5.476345539093018, "learning_rate": 1.1909774436090228e-06, "loss": 0.1692, "step": 58580 }, { "epoch": 88.11, "grad_norm": 5.3462347984313965, "learning_rate": 1.1894736842105265e-06, "loss": 0.1797, "step": 58590 }, { "epoch": 88.12, "grad_norm": 2.1195619106292725, "learning_rate": 1.1879699248120302e-06, "loss": 0.1823, "step": 58600 }, { "epoch": 88.14, "grad_norm": 8.027178764343262, "learning_rate": 1.186466165413534e-06, "loss": 0.2147, "step": 58610 }, { "epoch": 88.15, "grad_norm": 6.265287399291992, "learning_rate": 1.1849624060150377e-06, "loss": 0.1477, "step": 58620 }, { "epoch": 88.17, "grad_norm": 4.6563191413879395, "learning_rate": 1.1834586466165414e-06, "loss": 0.1371, "step": 58630 }, { "epoch": 88.18, "grad_norm": 3.4797229766845703, "learning_rate": 1.1819548872180451e-06, "loss": 0.1688, "step": 58640 }, { "epoch": 88.2, "grad_norm": 12.818514823913574, "learning_rate": 1.1804511278195489e-06, "loss": 0.1646, "step": 58650 }, { "epoch": 88.21, "grad_norm": 6.959251880645752, "learning_rate": 1.1789473684210526e-06, "loss": 0.1892, "step": 58660 }, { "epoch": 88.23, "grad_norm": 5.956885814666748, "learning_rate": 1.1774436090225565e-06, "loss": 0.191, "step": 58670 }, { "epoch": 88.24, "grad_norm": 3.1932976245880127, "learning_rate": 1.1759398496240603e-06, "loss": 0.187, "step": 58680 }, { "epoch": 88.26, "grad_norm": 2.2420973777770996, "learning_rate": 1.174436090225564e-06, "loss": 0.1762, "step": 58690 }, { "epoch": 88.27, "grad_norm": 3.847198486328125, "learning_rate": 1.1729323308270677e-06, "loss": 0.1542, "step": 58700 }, { "epoch": 88.29, "grad_norm": 7.925451755523682, "learning_rate": 1.1714285714285715e-06, "loss": 0.1671, "step": 58710 }, { "epoch": 88.3, "grad_norm": 4.539743423461914, "learning_rate": 1.1699248120300754e-06, "loss": 0.1702, "step": 58720 }, { "epoch": 88.32, "grad_norm": 0.4386990964412689, "learning_rate": 1.1684210526315791e-06, "loss": 0.1528, "step": 58730 }, { "epoch": 88.33, "grad_norm": 0.1927802413702011, "learning_rate": 1.1669172932330829e-06, "loss": 0.1165, "step": 58740 }, { "epoch": 88.35, "grad_norm": 4.060121059417725, "learning_rate": 1.1654135338345866e-06, "loss": 0.1751, "step": 58750 }, { "epoch": 88.36, "grad_norm": 6.014774799346924, "learning_rate": 1.1639097744360903e-06, "loss": 0.1487, "step": 58760 }, { "epoch": 88.38, "grad_norm": 2.9679887294769287, "learning_rate": 1.162406015037594e-06, "loss": 0.1832, "step": 58770 }, { "epoch": 88.39, "grad_norm": 3.4474730491638184, "learning_rate": 1.1609022556390978e-06, "loss": 0.181, "step": 58780 }, { "epoch": 88.41, "grad_norm": 5.39216947555542, "learning_rate": 1.1593984962406015e-06, "loss": 0.1964, "step": 58790 }, { "epoch": 88.42, "grad_norm": 3.7178587913513184, "learning_rate": 1.1578947368421053e-06, "loss": 0.1406, "step": 58800 }, { "epoch": 88.44, "grad_norm": 2.4193215370178223, "learning_rate": 1.1563909774436092e-06, "loss": 0.1928, "step": 58810 }, { "epoch": 88.45, "grad_norm": 5.395383834838867, "learning_rate": 1.154887218045113e-06, "loss": 0.2289, "step": 58820 }, { "epoch": 88.47, "grad_norm": 3.297912120819092, "learning_rate": 1.1533834586466167e-06, "loss": 0.1687, "step": 58830 }, { "epoch": 88.48, "grad_norm": 6.842408180236816, "learning_rate": 1.1518796992481204e-06, "loss": 0.1959, "step": 58840 }, { "epoch": 88.5, "grad_norm": 2.9778201580047607, "learning_rate": 1.1503759398496241e-06, "loss": 0.1847, "step": 58850 }, { "epoch": 88.51, "grad_norm": 4.785979270935059, "learning_rate": 1.148872180451128e-06, "loss": 0.1789, "step": 58860 }, { "epoch": 88.53, "grad_norm": 2.7606077194213867, "learning_rate": 1.1473684210526316e-06, "loss": 0.1995, "step": 58870 }, { "epoch": 88.54, "grad_norm": 6.0019097328186035, "learning_rate": 1.1458646616541353e-06, "loss": 0.1979, "step": 58880 }, { "epoch": 88.56, "grad_norm": 3.31396746635437, "learning_rate": 1.144360902255639e-06, "loss": 0.1191, "step": 58890 }, { "epoch": 88.57, "grad_norm": 5.867440223693848, "learning_rate": 1.142857142857143e-06, "loss": 0.1948, "step": 58900 }, { "epoch": 88.59, "grad_norm": 6.043757438659668, "learning_rate": 1.1413533834586467e-06, "loss": 0.145, "step": 58910 }, { "epoch": 88.6, "grad_norm": 5.402288913726807, "learning_rate": 1.1398496240601504e-06, "loss": 0.166, "step": 58920 }, { "epoch": 88.62, "grad_norm": 6.221014976501465, "learning_rate": 1.1383458646616542e-06, "loss": 0.2292, "step": 58930 }, { "epoch": 88.63, "grad_norm": 6.4407572746276855, "learning_rate": 1.136842105263158e-06, "loss": 0.1624, "step": 58940 }, { "epoch": 88.65, "grad_norm": 2.440732955932617, "learning_rate": 1.1353383458646619e-06, "loss": 0.1674, "step": 58950 }, { "epoch": 88.66, "grad_norm": 4.425859451293945, "learning_rate": 1.1338345864661656e-06, "loss": 0.212, "step": 58960 }, { "epoch": 88.68, "grad_norm": 5.0957818031311035, "learning_rate": 1.1323308270676693e-06, "loss": 0.2014, "step": 58970 }, { "epoch": 88.69, "grad_norm": 3.4791855812072754, "learning_rate": 1.130827067669173e-06, "loss": 0.2379, "step": 58980 }, { "epoch": 88.71, "grad_norm": 8.51900577545166, "learning_rate": 1.1293233082706768e-06, "loss": 0.1812, "step": 58990 }, { "epoch": 88.72, "grad_norm": 3.879889965057373, "learning_rate": 1.1278195488721805e-06, "loss": 0.1396, "step": 59000 }, { "epoch": 88.74, "grad_norm": 4.356947422027588, "learning_rate": 1.1263157894736842e-06, "loss": 0.1634, "step": 59010 }, { "epoch": 88.75, "grad_norm": 3.2641735076904297, "learning_rate": 1.124812030075188e-06, "loss": 0.1805, "step": 59020 }, { "epoch": 88.77, "grad_norm": 7.944269180297852, "learning_rate": 1.1233082706766917e-06, "loss": 0.1875, "step": 59030 }, { "epoch": 88.78, "grad_norm": 3.0705432891845703, "learning_rate": 1.1218045112781956e-06, "loss": 0.1884, "step": 59040 }, { "epoch": 88.8, "grad_norm": 5.9474196434021, "learning_rate": 1.1203007518796994e-06, "loss": 0.1614, "step": 59050 }, { "epoch": 88.81, "grad_norm": 8.783775329589844, "learning_rate": 1.118796992481203e-06, "loss": 0.2516, "step": 59060 }, { "epoch": 88.83, "grad_norm": 4.90156888961792, "learning_rate": 1.1172932330827068e-06, "loss": 0.1655, "step": 59070 }, { "epoch": 88.84, "grad_norm": 4.247372627258301, "learning_rate": 1.1157894736842106e-06, "loss": 0.2167, "step": 59080 }, { "epoch": 88.86, "grad_norm": 4.955306053161621, "learning_rate": 1.1142857142857145e-06, "loss": 0.159, "step": 59090 }, { "epoch": 88.87, "grad_norm": 4.56100606918335, "learning_rate": 1.1127819548872182e-06, "loss": 0.1346, "step": 59100 }, { "epoch": 88.89, "grad_norm": 4.857382297515869, "learning_rate": 1.111278195488722e-06, "loss": 0.1887, "step": 59110 }, { "epoch": 88.9, "grad_norm": 4.279732704162598, "learning_rate": 1.1097744360902255e-06, "loss": 0.1426, "step": 59120 }, { "epoch": 88.92, "grad_norm": 4.930658340454102, "learning_rate": 1.1082706766917294e-06, "loss": 0.2294, "step": 59130 }, { "epoch": 88.93, "grad_norm": 5.308464050292969, "learning_rate": 1.1067669172932332e-06, "loss": 0.1437, "step": 59140 }, { "epoch": 88.95, "grad_norm": 5.982918739318848, "learning_rate": 1.1052631578947369e-06, "loss": 0.2073, "step": 59150 }, { "epoch": 88.96, "grad_norm": 6.234976291656494, "learning_rate": 1.1037593984962406e-06, "loss": 0.1346, "step": 59160 }, { "epoch": 88.98, "grad_norm": 12.91275691986084, "learning_rate": 1.1022556390977444e-06, "loss": 0.2041, "step": 59170 }, { "epoch": 88.99, "grad_norm": 2.407877206802368, "learning_rate": 1.1007518796992483e-06, "loss": 0.1042, "step": 59180 }, { "epoch": 89.0, "eval_accuracy": 0.9316, "eval_loss": 0.33024507761001587, "eval_runtime": 84.4728, "eval_samples_per_second": 118.381, "eval_steps_per_second": 0.474, "step": 59185 }, { "epoch": 89.01, "grad_norm": 9.109488487243652, "learning_rate": 1.099248120300752e-06, "loss": 0.187, "step": 59190 }, { "epoch": 89.02, "grad_norm": 2.8222405910491943, "learning_rate": 1.0977443609022558e-06, "loss": 0.1887, "step": 59200 }, { "epoch": 89.04, "grad_norm": 3.990464448928833, "learning_rate": 1.0962406015037595e-06, "loss": 0.1464, "step": 59210 }, { "epoch": 89.05, "grad_norm": 3.481008529663086, "learning_rate": 1.0947368421052632e-06, "loss": 0.1941, "step": 59220 }, { "epoch": 89.07, "grad_norm": 4.007563591003418, "learning_rate": 1.0932330827067672e-06, "loss": 0.2157, "step": 59230 }, { "epoch": 89.08, "grad_norm": 4.087463855743408, "learning_rate": 1.0917293233082709e-06, "loss": 0.1788, "step": 59240 }, { "epoch": 89.1, "grad_norm": 7.297598838806152, "learning_rate": 1.0902255639097744e-06, "loss": 0.1653, "step": 59250 }, { "epoch": 89.11, "grad_norm": 4.800395488739014, "learning_rate": 1.0887218045112781e-06, "loss": 0.2075, "step": 59260 }, { "epoch": 89.13, "grad_norm": 3.8315725326538086, "learning_rate": 1.087218045112782e-06, "loss": 0.1383, "step": 59270 }, { "epoch": 89.14, "grad_norm": 6.9601898193359375, "learning_rate": 1.0857142857142858e-06, "loss": 0.1993, "step": 59280 }, { "epoch": 89.16, "grad_norm": 6.42357063293457, "learning_rate": 1.0842105263157895e-06, "loss": 0.1716, "step": 59290 }, { "epoch": 89.17, "grad_norm": 4.080452919006348, "learning_rate": 1.0827067669172933e-06, "loss": 0.198, "step": 59300 }, { "epoch": 89.19, "grad_norm": 4.082363605499268, "learning_rate": 1.081203007518797e-06, "loss": 0.1456, "step": 59310 }, { "epoch": 89.2, "grad_norm": 2.618720531463623, "learning_rate": 1.079699248120301e-06, "loss": 0.2309, "step": 59320 }, { "epoch": 89.22, "grad_norm": 6.8665080070495605, "learning_rate": 1.0781954887218047e-06, "loss": 0.1487, "step": 59330 }, { "epoch": 89.23, "grad_norm": 6.956995487213135, "learning_rate": 1.0766917293233084e-06, "loss": 0.2283, "step": 59340 }, { "epoch": 89.25, "grad_norm": 4.275158405303955, "learning_rate": 1.0751879699248121e-06, "loss": 0.1732, "step": 59350 }, { "epoch": 89.26, "grad_norm": 6.3027024269104, "learning_rate": 1.0736842105263159e-06, "loss": 0.1888, "step": 59360 }, { "epoch": 89.28, "grad_norm": 6.413710594177246, "learning_rate": 1.0721804511278196e-06, "loss": 0.2427, "step": 59370 }, { "epoch": 89.29, "grad_norm": 5.2090888023376465, "learning_rate": 1.0706766917293233e-06, "loss": 0.1906, "step": 59380 }, { "epoch": 89.31, "grad_norm": 5.576053142547607, "learning_rate": 1.069172932330827e-06, "loss": 0.1398, "step": 59390 }, { "epoch": 89.32, "grad_norm": 3.6044418811798096, "learning_rate": 1.0676691729323308e-06, "loss": 0.1232, "step": 59400 }, { "epoch": 89.34, "grad_norm": 3.0662240982055664, "learning_rate": 1.0661654135338347e-06, "loss": 0.189, "step": 59410 }, { "epoch": 89.35, "grad_norm": 3.142246961593628, "learning_rate": 1.0646616541353385e-06, "loss": 0.1447, "step": 59420 }, { "epoch": 89.37, "grad_norm": 3.3175511360168457, "learning_rate": 1.0631578947368422e-06, "loss": 0.127, "step": 59430 }, { "epoch": 89.38, "grad_norm": 4.238184928894043, "learning_rate": 1.061654135338346e-06, "loss": 0.1321, "step": 59440 }, { "epoch": 89.4, "grad_norm": 7.08275842666626, "learning_rate": 1.0601503759398497e-06, "loss": 0.1795, "step": 59450 }, { "epoch": 89.41, "grad_norm": 7.184491157531738, "learning_rate": 1.0586466165413536e-06, "loss": 0.2363, "step": 59460 }, { "epoch": 89.43, "grad_norm": 4.729690074920654, "learning_rate": 1.0571428571428573e-06, "loss": 0.1752, "step": 59470 }, { "epoch": 89.44, "grad_norm": 5.500245094299316, "learning_rate": 1.055639097744361e-06, "loss": 0.1708, "step": 59480 }, { "epoch": 89.46, "grad_norm": 4.5759406089782715, "learning_rate": 1.0541353383458648e-06, "loss": 0.1884, "step": 59490 }, { "epoch": 89.47, "grad_norm": 2.6944706439971924, "learning_rate": 1.0526315789473685e-06, "loss": 0.1609, "step": 59500 }, { "epoch": 89.49, "grad_norm": 3.369946002960205, "learning_rate": 1.0511278195488723e-06, "loss": 0.1926, "step": 59510 }, { "epoch": 89.5, "grad_norm": 4.653051376342773, "learning_rate": 1.049624060150376e-06, "loss": 0.1455, "step": 59520 }, { "epoch": 89.52, "grad_norm": 0.8550413846969604, "learning_rate": 1.0481203007518797e-06, "loss": 0.1589, "step": 59530 }, { "epoch": 89.53, "grad_norm": 4.918313026428223, "learning_rate": 1.0466165413533835e-06, "loss": 0.0954, "step": 59540 }, { "epoch": 89.55, "grad_norm": 6.69185209274292, "learning_rate": 1.0451127819548874e-06, "loss": 0.1798, "step": 59550 }, { "epoch": 89.56, "grad_norm": 6.381489276885986, "learning_rate": 1.0436090225563911e-06, "loss": 0.1493, "step": 59560 }, { "epoch": 89.58, "grad_norm": 2.553143262863159, "learning_rate": 1.0421052631578949e-06, "loss": 0.1313, "step": 59570 }, { "epoch": 89.59, "grad_norm": 2.599313974380493, "learning_rate": 1.0406015037593986e-06, "loss": 0.171, "step": 59580 }, { "epoch": 89.61, "grad_norm": 6.928783893585205, "learning_rate": 1.0390977443609023e-06, "loss": 0.2035, "step": 59590 }, { "epoch": 89.62, "grad_norm": 4.362387657165527, "learning_rate": 1.037593984962406e-06, "loss": 0.1783, "step": 59600 }, { "epoch": 89.64, "grad_norm": 2.9104678630828857, "learning_rate": 1.03609022556391e-06, "loss": 0.1895, "step": 59610 }, { "epoch": 89.65, "grad_norm": 6.155645370483398, "learning_rate": 1.0345864661654135e-06, "loss": 0.1856, "step": 59620 }, { "epoch": 89.67, "grad_norm": 5.490917205810547, "learning_rate": 1.0330827067669172e-06, "loss": 0.151, "step": 59630 }, { "epoch": 89.68, "grad_norm": 0.3577495515346527, "learning_rate": 1.0315789473684212e-06, "loss": 0.1647, "step": 59640 }, { "epoch": 89.7, "grad_norm": 3.6894378662109375, "learning_rate": 1.030075187969925e-06, "loss": 0.1578, "step": 59650 }, { "epoch": 89.71, "grad_norm": 7.005927562713623, "learning_rate": 1.0285714285714286e-06, "loss": 0.1644, "step": 59660 }, { "epoch": 89.73, "grad_norm": 2.355557680130005, "learning_rate": 1.0270676691729324e-06, "loss": 0.1616, "step": 59670 }, { "epoch": 89.74, "grad_norm": 5.249048233032227, "learning_rate": 1.0255639097744361e-06, "loss": 0.1285, "step": 59680 }, { "epoch": 89.76, "grad_norm": 9.33356761932373, "learning_rate": 1.02406015037594e-06, "loss": 0.1326, "step": 59690 }, { "epoch": 89.77, "grad_norm": 3.254373550415039, "learning_rate": 1.0225563909774438e-06, "loss": 0.1191, "step": 59700 }, { "epoch": 89.79, "grad_norm": 4.756227493286133, "learning_rate": 1.0210526315789475e-06, "loss": 0.1221, "step": 59710 }, { "epoch": 89.8, "grad_norm": 4.242854118347168, "learning_rate": 1.0195488721804512e-06, "loss": 0.1932, "step": 59720 }, { "epoch": 89.82, "grad_norm": 6.5166425704956055, "learning_rate": 1.018045112781955e-06, "loss": 0.1373, "step": 59730 }, { "epoch": 89.83, "grad_norm": 2.136077880859375, "learning_rate": 1.0165413533834587e-06, "loss": 0.1699, "step": 59740 }, { "epoch": 89.85, "grad_norm": 2.9859583377838135, "learning_rate": 1.0150375939849624e-06, "loss": 0.1677, "step": 59750 }, { "epoch": 89.86, "grad_norm": 5.3651933670043945, "learning_rate": 1.0135338345864662e-06, "loss": 0.176, "step": 59760 }, { "epoch": 89.88, "grad_norm": 4.6981520652771, "learning_rate": 1.01203007518797e-06, "loss": 0.1921, "step": 59770 }, { "epoch": 89.89, "grad_norm": 4.584779262542725, "learning_rate": 1.0105263157894738e-06, "loss": 0.1803, "step": 59780 }, { "epoch": 89.91, "grad_norm": 7.521881580352783, "learning_rate": 1.0090225563909776e-06, "loss": 0.1587, "step": 59790 }, { "epoch": 89.92, "grad_norm": 7.652347564697266, "learning_rate": 1.0075187969924813e-06, "loss": 0.1852, "step": 59800 }, { "epoch": 89.94, "grad_norm": 1.188881516456604, "learning_rate": 1.006015037593985e-06, "loss": 0.1649, "step": 59810 }, { "epoch": 89.95, "grad_norm": 3.557762622833252, "learning_rate": 1.0045112781954888e-06, "loss": 0.1391, "step": 59820 }, { "epoch": 89.97, "grad_norm": 1.8400624990463257, "learning_rate": 1.0030075187969925e-06, "loss": 0.2029, "step": 59830 }, { "epoch": 89.98, "grad_norm": 5.2794575691223145, "learning_rate": 1.0015037593984964e-06, "loss": 0.1536, "step": 59840 }, { "epoch": 90.0, "grad_norm": 43.79523849487305, "learning_rate": 1.0000000000000002e-06, "loss": 0.1681, "step": 59850 }, { "epoch": 90.0, "eval_accuracy": 0.9295, "eval_loss": 0.3303840756416321, "eval_runtime": 84.4298, "eval_samples_per_second": 118.442, "eval_steps_per_second": 0.474, "step": 59850 }, { "epoch": 90.02, "grad_norm": 4.819916725158691, "learning_rate": 9.98496240601504e-07, "loss": 0.1482, "step": 59860 }, { "epoch": 90.03, "grad_norm": 8.389203071594238, "learning_rate": 9.969924812030076e-07, "loss": 0.204, "step": 59870 }, { "epoch": 90.05, "grad_norm": 4.255865097045898, "learning_rate": 9.954887218045114e-07, "loss": 0.2107, "step": 59880 }, { "epoch": 90.06, "grad_norm": 2.4185428619384766, "learning_rate": 9.93984962406015e-07, "loss": 0.2002, "step": 59890 }, { "epoch": 90.08, "grad_norm": 6.136438846588135, "learning_rate": 9.924812030075188e-07, "loss": 0.1432, "step": 59900 }, { "epoch": 90.09, "grad_norm": 4.596153259277344, "learning_rate": 9.909774436090226e-07, "loss": 0.2041, "step": 59910 }, { "epoch": 90.11, "grad_norm": 6.638514518737793, "learning_rate": 9.894736842105265e-07, "loss": 0.1594, "step": 59920 }, { "epoch": 90.12, "grad_norm": 5.299813270568848, "learning_rate": 9.879699248120302e-07, "loss": 0.1713, "step": 59930 }, { "epoch": 90.14, "grad_norm": 6.116607666015625, "learning_rate": 9.86466165413534e-07, "loss": 0.1566, "step": 59940 }, { "epoch": 90.15, "grad_norm": 6.558013916015625, "learning_rate": 9.849624060150377e-07, "loss": 0.203, "step": 59950 }, { "epoch": 90.17, "grad_norm": 2.691136598587036, "learning_rate": 9.834586466165414e-07, "loss": 0.1761, "step": 59960 }, { "epoch": 90.18, "grad_norm": 6.008741855621338, "learning_rate": 9.819548872180451e-07, "loss": 0.2017, "step": 59970 }, { "epoch": 90.2, "grad_norm": 7.76501989364624, "learning_rate": 9.80451127819549e-07, "loss": 0.188, "step": 59980 }, { "epoch": 90.21, "grad_norm": 3.065690517425537, "learning_rate": 9.789473684210526e-07, "loss": 0.1711, "step": 59990 }, { "epoch": 90.23, "grad_norm": 6.988502502441406, "learning_rate": 9.774436090225563e-07, "loss": 0.1323, "step": 60000 }, { "epoch": 90.24, "grad_norm": 4.135454177856445, "learning_rate": 9.759398496240603e-07, "loss": 0.1522, "step": 60010 }, { "epoch": 90.26, "grad_norm": 5.1034016609191895, "learning_rate": 9.74436090225564e-07, "loss": 0.1998, "step": 60020 }, { "epoch": 90.27, "grad_norm": 2.8112096786499023, "learning_rate": 9.729323308270677e-07, "loss": 0.1588, "step": 60030 }, { "epoch": 90.29, "grad_norm": 2.276792049407959, "learning_rate": 9.714285714285715e-07, "loss": 0.152, "step": 60040 }, { "epoch": 90.3, "grad_norm": 16.462791442871094, "learning_rate": 9.699248120300752e-07, "loss": 0.1913, "step": 60050 }, { "epoch": 90.32, "grad_norm": 7.889447212219238, "learning_rate": 9.68421052631579e-07, "loss": 0.1824, "step": 60060 }, { "epoch": 90.33, "grad_norm": 5.8780837059021, "learning_rate": 9.669172932330829e-07, "loss": 0.1683, "step": 60070 }, { "epoch": 90.35, "grad_norm": 1.407557487487793, "learning_rate": 9.654135338345866e-07, "loss": 0.1696, "step": 60080 }, { "epoch": 90.36, "grad_norm": 2.4125938415527344, "learning_rate": 9.639097744360903e-07, "loss": 0.1517, "step": 60090 }, { "epoch": 90.38, "grad_norm": 6.825364112854004, "learning_rate": 9.62406015037594e-07, "loss": 0.217, "step": 60100 }, { "epoch": 90.39, "grad_norm": 5.1629743576049805, "learning_rate": 9.609022556390978e-07, "loss": 0.1541, "step": 60110 }, { "epoch": 90.41, "grad_norm": 6.873205184936523, "learning_rate": 9.593984962406015e-07, "loss": 0.1416, "step": 60120 }, { "epoch": 90.42, "grad_norm": 5.59041690826416, "learning_rate": 9.578947368421053e-07, "loss": 0.153, "step": 60130 }, { "epoch": 90.44, "grad_norm": 3.045102834701538, "learning_rate": 9.56390977443609e-07, "loss": 0.1723, "step": 60140 }, { "epoch": 90.45, "grad_norm": 3.393899917602539, "learning_rate": 9.54887218045113e-07, "loss": 0.1852, "step": 60150 }, { "epoch": 90.47, "grad_norm": 5.689571380615234, "learning_rate": 9.533834586466166e-07, "loss": 0.2115, "step": 60160 }, { "epoch": 90.48, "grad_norm": 5.110867023468018, "learning_rate": 9.518796992481204e-07, "loss": 0.1877, "step": 60170 }, { "epoch": 90.5, "grad_norm": 2.7676916122436523, "learning_rate": 9.503759398496241e-07, "loss": 0.1966, "step": 60180 }, { "epoch": 90.51, "grad_norm": 0.4056377410888672, "learning_rate": 9.48872180451128e-07, "loss": 0.1819, "step": 60190 }, { "epoch": 90.53, "grad_norm": 15.291031837463379, "learning_rate": 9.473684210526317e-07, "loss": 0.1564, "step": 60200 }, { "epoch": 90.54, "grad_norm": 4.443658828735352, "learning_rate": 9.458646616541354e-07, "loss": 0.1336, "step": 60210 }, { "epoch": 90.56, "grad_norm": 8.213922500610352, "learning_rate": 9.443609022556393e-07, "loss": 0.1578, "step": 60220 }, { "epoch": 90.57, "grad_norm": 5.976234436035156, "learning_rate": 9.42857142857143e-07, "loss": 0.1815, "step": 60230 }, { "epoch": 90.59, "grad_norm": 3.436178684234619, "learning_rate": 9.413533834586466e-07, "loss": 0.1836, "step": 60240 }, { "epoch": 90.6, "grad_norm": 7.1853203773498535, "learning_rate": 9.398496240601504e-07, "loss": 0.1733, "step": 60250 }, { "epoch": 90.62, "grad_norm": 3.7736470699310303, "learning_rate": 9.383458646616542e-07, "loss": 0.0901, "step": 60260 }, { "epoch": 90.63, "grad_norm": 6.868650436401367, "learning_rate": 9.368421052631579e-07, "loss": 0.1495, "step": 60270 }, { "epoch": 90.65, "grad_norm": 6.130927085876465, "learning_rate": 9.353383458646618e-07, "loss": 0.175, "step": 60280 }, { "epoch": 90.66, "grad_norm": 2.877918004989624, "learning_rate": 9.338345864661655e-07, "loss": 0.1292, "step": 60290 }, { "epoch": 90.68, "grad_norm": 6.6926493644714355, "learning_rate": 9.323308270676692e-07, "loss": 0.1629, "step": 60300 }, { "epoch": 90.69, "grad_norm": 4.85024356842041, "learning_rate": 9.308270676691731e-07, "loss": 0.1462, "step": 60310 }, { "epoch": 90.71, "grad_norm": 4.398621559143066, "learning_rate": 9.293233082706768e-07, "loss": 0.1805, "step": 60320 }, { "epoch": 90.72, "grad_norm": 4.464656829833984, "learning_rate": 9.278195488721805e-07, "loss": 0.1624, "step": 60330 }, { "epoch": 90.74, "grad_norm": 2.839735984802246, "learning_rate": 9.263157894736844e-07, "loss": 0.1813, "step": 60340 }, { "epoch": 90.75, "grad_norm": 3.697030544281006, "learning_rate": 9.248120300751881e-07, "loss": 0.1472, "step": 60350 }, { "epoch": 90.77, "grad_norm": 4.30585241317749, "learning_rate": 9.233082706766917e-07, "loss": 0.1624, "step": 60360 }, { "epoch": 90.78, "grad_norm": 5.344664096832275, "learning_rate": 9.218045112781955e-07, "loss": 0.1486, "step": 60370 }, { "epoch": 90.8, "grad_norm": 3.9240634441375732, "learning_rate": 9.203007518796993e-07, "loss": 0.1843, "step": 60380 }, { "epoch": 90.81, "grad_norm": 2.0143322944641113, "learning_rate": 9.18796992481203e-07, "loss": 0.1147, "step": 60390 }, { "epoch": 90.83, "grad_norm": 6.274852275848389, "learning_rate": 9.172932330827068e-07, "loss": 0.169, "step": 60400 }, { "epoch": 90.84, "grad_norm": 5.181715488433838, "learning_rate": 9.157894736842106e-07, "loss": 0.1845, "step": 60410 }, { "epoch": 90.86, "grad_norm": 0.9931633472442627, "learning_rate": 9.142857142857144e-07, "loss": 0.1365, "step": 60420 }, { "epoch": 90.87, "grad_norm": 5.815258502960205, "learning_rate": 9.127819548872181e-07, "loss": 0.1571, "step": 60430 }, { "epoch": 90.89, "grad_norm": 4.237338542938232, "learning_rate": 9.112781954887219e-07, "loss": 0.1783, "step": 60440 }, { "epoch": 90.9, "grad_norm": 4.938007354736328, "learning_rate": 9.097744360902257e-07, "loss": 0.1615, "step": 60450 }, { "epoch": 90.92, "grad_norm": 7.971113204956055, "learning_rate": 9.082706766917294e-07, "loss": 0.1811, "step": 60460 }, { "epoch": 90.93, "grad_norm": 6.53290319442749, "learning_rate": 9.067669172932332e-07, "loss": 0.1816, "step": 60470 }, { "epoch": 90.95, "grad_norm": 5.972143173217773, "learning_rate": 9.05263157894737e-07, "loss": 0.2215, "step": 60480 }, { "epoch": 90.96, "grad_norm": 6.27506685256958, "learning_rate": 9.037593984962406e-07, "loss": 0.1527, "step": 60490 }, { "epoch": 90.98, "grad_norm": 3.595836877822876, "learning_rate": 9.022556390977444e-07, "loss": 0.1536, "step": 60500 }, { "epoch": 90.99, "grad_norm": 4.775961399078369, "learning_rate": 9.007518796992482e-07, "loss": 0.1802, "step": 60510 }, { "epoch": 91.0, "eval_accuracy": 0.9298, "eval_loss": 0.3351175785064697, "eval_runtime": 84.5299, "eval_samples_per_second": 118.301, "eval_steps_per_second": 0.473, "step": 60515 }, { "epoch": 91.01, "grad_norm": 7.370502948760986, "learning_rate": 8.992481203007519e-07, "loss": 0.2365, "step": 60520 }, { "epoch": 91.02, "grad_norm": 8.751474380493164, "learning_rate": 8.977443609022557e-07, "loss": 0.2011, "step": 60530 }, { "epoch": 91.04, "grad_norm": 3.2370705604553223, "learning_rate": 8.962406015037595e-07, "loss": 0.1858, "step": 60540 }, { "epoch": 91.05, "grad_norm": 5.268539905548096, "learning_rate": 8.947368421052632e-07, "loss": 0.2265, "step": 60550 }, { "epoch": 91.07, "grad_norm": 5.824682235717773, "learning_rate": 8.93233082706767e-07, "loss": 0.17, "step": 60560 }, { "epoch": 91.08, "grad_norm": 4.85299015045166, "learning_rate": 8.917293233082708e-07, "loss": 0.1715, "step": 60570 }, { "epoch": 91.1, "grad_norm": 3.4943690299987793, "learning_rate": 8.902255639097745e-07, "loss": 0.1739, "step": 60580 }, { "epoch": 91.11, "grad_norm": 5.409356594085693, "learning_rate": 8.887218045112784e-07, "loss": 0.1693, "step": 60590 }, { "epoch": 91.13, "grad_norm": 6.857398986816406, "learning_rate": 8.872180451127821e-07, "loss": 0.1883, "step": 60600 }, { "epoch": 91.14, "grad_norm": 4.355463027954102, "learning_rate": 8.857142857142857e-07, "loss": 0.1444, "step": 60610 }, { "epoch": 91.16, "grad_norm": 5.453840255737305, "learning_rate": 8.842105263157895e-07, "loss": 0.1129, "step": 60620 }, { "epoch": 91.17, "grad_norm": 2.509838581085205, "learning_rate": 8.827067669172933e-07, "loss": 0.1524, "step": 60630 }, { "epoch": 91.19, "grad_norm": 4.419145107269287, "learning_rate": 8.81203007518797e-07, "loss": 0.1007, "step": 60640 }, { "epoch": 91.2, "grad_norm": 2.9368622303009033, "learning_rate": 8.796992481203009e-07, "loss": 0.1373, "step": 60650 }, { "epoch": 91.22, "grad_norm": 5.6778082847595215, "learning_rate": 8.781954887218046e-07, "loss": 0.1914, "step": 60660 }, { "epoch": 91.23, "grad_norm": 4.565642833709717, "learning_rate": 8.766917293233083e-07, "loss": 0.1885, "step": 60670 }, { "epoch": 91.25, "grad_norm": 12.5923433303833, "learning_rate": 8.751879699248122e-07, "loss": 0.1841, "step": 60680 }, { "epoch": 91.26, "grad_norm": 3.8963727951049805, "learning_rate": 8.736842105263159e-07, "loss": 0.2082, "step": 60690 }, { "epoch": 91.28, "grad_norm": 7.847858428955078, "learning_rate": 8.721804511278196e-07, "loss": 0.2347, "step": 60700 }, { "epoch": 91.29, "grad_norm": 6.093050003051758, "learning_rate": 8.706766917293235e-07, "loss": 0.2005, "step": 60710 }, { "epoch": 91.31, "grad_norm": 6.70009708404541, "learning_rate": 8.691729323308272e-07, "loss": 0.1581, "step": 60720 }, { "epoch": 91.32, "grad_norm": 5.4743852615356445, "learning_rate": 8.67669172932331e-07, "loss": 0.1938, "step": 60730 }, { "epoch": 91.34, "grad_norm": 6.605056285858154, "learning_rate": 8.661654135338346e-07, "loss": 0.2517, "step": 60740 }, { "epoch": 91.35, "grad_norm": 5.257135391235352, "learning_rate": 8.646616541353384e-07, "loss": 0.1354, "step": 60750 }, { "epoch": 91.37, "grad_norm": 4.14031982421875, "learning_rate": 8.631578947368421e-07, "loss": 0.2008, "step": 60760 }, { "epoch": 91.38, "grad_norm": 4.664227485656738, "learning_rate": 8.616541353383459e-07, "loss": 0.166, "step": 60770 }, { "epoch": 91.4, "grad_norm": 7.249124526977539, "learning_rate": 8.601503759398497e-07, "loss": 0.1729, "step": 60780 }, { "epoch": 91.41, "grad_norm": 1.5819720029830933, "learning_rate": 8.586466165413534e-07, "loss": 0.1673, "step": 60790 }, { "epoch": 91.43, "grad_norm": 8.89696216583252, "learning_rate": 8.571428571428572e-07, "loss": 0.1627, "step": 60800 }, { "epoch": 91.44, "grad_norm": 6.409267425537109, "learning_rate": 8.55639097744361e-07, "loss": 0.1917, "step": 60810 }, { "epoch": 91.46, "grad_norm": 13.142133712768555, "learning_rate": 8.541353383458648e-07, "loss": 0.2156, "step": 60820 }, { "epoch": 91.47, "grad_norm": 3.5235838890075684, "learning_rate": 8.526315789473685e-07, "loss": 0.1644, "step": 60830 }, { "epoch": 91.49, "grad_norm": 5.23291015625, "learning_rate": 8.511278195488723e-07, "loss": 0.2202, "step": 60840 }, { "epoch": 91.5, "grad_norm": 8.659149169921875, "learning_rate": 8.496240601503761e-07, "loss": 0.1695, "step": 60850 }, { "epoch": 91.52, "grad_norm": 5.686830043792725, "learning_rate": 8.481203007518797e-07, "loss": 0.1524, "step": 60860 }, { "epoch": 91.53, "grad_norm": 5.107367992401123, "learning_rate": 8.466165413533835e-07, "loss": 0.1736, "step": 60870 }, { "epoch": 91.55, "grad_norm": 8.266801834106445, "learning_rate": 8.451127819548873e-07, "loss": 0.1509, "step": 60880 }, { "epoch": 91.56, "grad_norm": 8.239102363586426, "learning_rate": 8.43609022556391e-07, "loss": 0.1974, "step": 60890 }, { "epoch": 91.58, "grad_norm": 4.153801918029785, "learning_rate": 8.421052631578948e-07, "loss": 0.2202, "step": 60900 }, { "epoch": 91.59, "grad_norm": 4.520059585571289, "learning_rate": 8.406015037593986e-07, "loss": 0.1404, "step": 60910 }, { "epoch": 91.61, "grad_norm": 3.584502935409546, "learning_rate": 8.390977443609023e-07, "loss": 0.1914, "step": 60920 }, { "epoch": 91.62, "grad_norm": 5.777580261230469, "learning_rate": 8.375939849624061e-07, "loss": 0.1338, "step": 60930 }, { "epoch": 91.64, "grad_norm": 4.816416263580322, "learning_rate": 8.360902255639099e-07, "loss": 0.1983, "step": 60940 }, { "epoch": 91.65, "grad_norm": 1.8106648921966553, "learning_rate": 8.345864661654136e-07, "loss": 0.1728, "step": 60950 }, { "epoch": 91.67, "grad_norm": 6.586467266082764, "learning_rate": 8.330827067669174e-07, "loss": 0.1635, "step": 60960 }, { "epoch": 91.68, "grad_norm": 4.0947136878967285, "learning_rate": 8.315789473684212e-07, "loss": 0.1938, "step": 60970 }, { "epoch": 91.7, "grad_norm": 6.031853199005127, "learning_rate": 8.300751879699248e-07, "loss": 0.18, "step": 60980 }, { "epoch": 91.71, "grad_norm": 5.872425079345703, "learning_rate": 8.285714285714285e-07, "loss": 0.1693, "step": 60990 }, { "epoch": 91.73, "grad_norm": 9.427947998046875, "learning_rate": 8.270676691729324e-07, "loss": 0.187, "step": 61000 }, { "epoch": 91.74, "grad_norm": 5.179823398590088, "learning_rate": 8.255639097744361e-07, "loss": 0.1372, "step": 61010 }, { "epoch": 91.76, "grad_norm": 5.486731052398682, "learning_rate": 8.240601503759398e-07, "loss": 0.1654, "step": 61020 }, { "epoch": 91.77, "grad_norm": 3.1775834560394287, "learning_rate": 8.225563909774437e-07, "loss": 0.176, "step": 61030 }, { "epoch": 91.79, "grad_norm": 4.589211463928223, "learning_rate": 8.210526315789474e-07, "loss": 0.1452, "step": 61040 }, { "epoch": 91.8, "grad_norm": 3.6685729026794434, "learning_rate": 8.195488721804513e-07, "loss": 0.1883, "step": 61050 }, { "epoch": 91.82, "grad_norm": 4.512982368469238, "learning_rate": 8.18045112781955e-07, "loss": 0.1589, "step": 61060 }, { "epoch": 91.83, "grad_norm": 9.20313549041748, "learning_rate": 8.165413533834587e-07, "loss": 0.1399, "step": 61070 }, { "epoch": 91.85, "grad_norm": 5.7398905754089355, "learning_rate": 8.150375939849625e-07, "loss": 0.1689, "step": 61080 }, { "epoch": 91.86, "grad_norm": 4.950761795043945, "learning_rate": 8.135338345864663e-07, "loss": 0.1259, "step": 61090 }, { "epoch": 91.88, "grad_norm": 7.066133499145508, "learning_rate": 8.1203007518797e-07, "loss": 0.1673, "step": 61100 }, { "epoch": 91.89, "grad_norm": 4.5306596755981445, "learning_rate": 8.105263157894736e-07, "loss": 0.173, "step": 61110 }, { "epoch": 91.91, "grad_norm": 6.547530651092529, "learning_rate": 8.090225563909775e-07, "loss": 0.1469, "step": 61120 }, { "epoch": 91.92, "grad_norm": 6.574265003204346, "learning_rate": 8.075187969924812e-07, "loss": 0.1464, "step": 61130 }, { "epoch": 91.94, "grad_norm": 2.8120083808898926, "learning_rate": 8.06015037593985e-07, "loss": 0.1422, "step": 61140 }, { "epoch": 91.95, "grad_norm": 3.274015188217163, "learning_rate": 8.045112781954888e-07, "loss": 0.187, "step": 61150 }, { "epoch": 91.97, "grad_norm": 16.76445198059082, "learning_rate": 8.030075187969925e-07, "loss": 0.1365, "step": 61160 }, { "epoch": 91.98, "grad_norm": 3.863375663757324, "learning_rate": 8.015037593984963e-07, "loss": 0.1375, "step": 61170 }, { "epoch": 92.0, "grad_norm": 23.6773738861084, "learning_rate": 8.000000000000001e-07, "loss": 0.268, "step": 61180 }, { "epoch": 92.0, "eval_accuracy": 0.9305, "eval_loss": 0.33316439390182495, "eval_runtime": 84.8569, "eval_samples_per_second": 117.845, "eval_steps_per_second": 0.471, "step": 61180 }, { "epoch": 92.02, "grad_norm": 6.592239856719971, "learning_rate": 7.984962406015038e-07, "loss": 0.1531, "step": 61190 }, { "epoch": 92.03, "grad_norm": 3.465878963470459, "learning_rate": 7.969924812030076e-07, "loss": 0.1796, "step": 61200 }, { "epoch": 92.05, "grad_norm": 5.356743812561035, "learning_rate": 7.954887218045114e-07, "loss": 0.1227, "step": 61210 }, { "epoch": 92.06, "grad_norm": 9.6128511428833, "learning_rate": 7.939849624060152e-07, "loss": 0.1512, "step": 61220 }, { "epoch": 92.08, "grad_norm": 5.72312068939209, "learning_rate": 7.924812030075188e-07, "loss": 0.1641, "step": 61230 }, { "epoch": 92.09, "grad_norm": 2.734785556793213, "learning_rate": 7.909774436090226e-07, "loss": 0.2045, "step": 61240 }, { "epoch": 92.11, "grad_norm": 5.764986991882324, "learning_rate": 7.894736842105263e-07, "loss": 0.1482, "step": 61250 }, { "epoch": 92.12, "grad_norm": 6.9798173904418945, "learning_rate": 7.879699248120301e-07, "loss": 0.2035, "step": 61260 }, { "epoch": 92.14, "grad_norm": 6.008689880371094, "learning_rate": 7.864661654135339e-07, "loss": 0.1314, "step": 61270 }, { "epoch": 92.15, "grad_norm": 4.28978157043457, "learning_rate": 7.849624060150377e-07, "loss": 0.1305, "step": 61280 }, { "epoch": 92.17, "grad_norm": 2.4776744842529297, "learning_rate": 7.834586466165414e-07, "loss": 0.1741, "step": 61290 }, { "epoch": 92.18, "grad_norm": 6.749001502990723, "learning_rate": 7.819548872180452e-07, "loss": 0.1177, "step": 61300 }, { "epoch": 92.2, "grad_norm": 5.488044738769531, "learning_rate": 7.80451127819549e-07, "loss": 0.2004, "step": 61310 }, { "epoch": 92.21, "grad_norm": 6.313375949859619, "learning_rate": 7.789473684210527e-07, "loss": 0.2079, "step": 61320 }, { "epoch": 92.23, "grad_norm": 4.431061267852783, "learning_rate": 7.774436090225565e-07, "loss": 0.135, "step": 61330 }, { "epoch": 92.24, "grad_norm": 4.987079620361328, "learning_rate": 7.759398496240603e-07, "loss": 0.1677, "step": 61340 }, { "epoch": 92.26, "grad_norm": 6.565682411193848, "learning_rate": 7.74436090225564e-07, "loss": 0.1398, "step": 61350 }, { "epoch": 92.27, "grad_norm": 8.182050704956055, "learning_rate": 7.729323308270676e-07, "loss": 0.1806, "step": 61360 }, { "epoch": 92.29, "grad_norm": 1.415570855140686, "learning_rate": 7.714285714285715e-07, "loss": 0.1538, "step": 61370 }, { "epoch": 92.3, "grad_norm": 7.348145961761475, "learning_rate": 7.699248120300752e-07, "loss": 0.152, "step": 61380 }, { "epoch": 92.32, "grad_norm": 2.7620201110839844, "learning_rate": 7.684210526315789e-07, "loss": 0.2018, "step": 61390 }, { "epoch": 92.33, "grad_norm": 6.486240863800049, "learning_rate": 7.669172932330828e-07, "loss": 0.257, "step": 61400 }, { "epoch": 92.35, "grad_norm": 4.345780372619629, "learning_rate": 7.654135338345865e-07, "loss": 0.2216, "step": 61410 }, { "epoch": 92.36, "grad_norm": 3.1555426120758057, "learning_rate": 7.639097744360902e-07, "loss": 0.1257, "step": 61420 }, { "epoch": 92.38, "grad_norm": 4.688722610473633, "learning_rate": 7.624060150375941e-07, "loss": 0.1059, "step": 61430 }, { "epoch": 92.39, "grad_norm": 4.647052764892578, "learning_rate": 7.609022556390978e-07, "loss": 0.1449, "step": 61440 }, { "epoch": 92.41, "grad_norm": 7.654082775115967, "learning_rate": 7.593984962406016e-07, "loss": 0.2042, "step": 61450 }, { "epoch": 92.42, "grad_norm": 12.962769508361816, "learning_rate": 7.578947368421054e-07, "loss": 0.1656, "step": 61460 }, { "epoch": 92.44, "grad_norm": 4.620476245880127, "learning_rate": 7.563909774436091e-07, "loss": 0.1846, "step": 61470 }, { "epoch": 92.45, "grad_norm": 6.064635753631592, "learning_rate": 7.548872180451127e-07, "loss": 0.1734, "step": 61480 }, { "epoch": 92.47, "grad_norm": 4.810222625732422, "learning_rate": 7.533834586466166e-07, "loss": 0.1596, "step": 61490 }, { "epoch": 92.48, "grad_norm": 6.1294755935668945, "learning_rate": 7.518796992481203e-07, "loss": 0.1405, "step": 61500 }, { "epoch": 92.5, "grad_norm": 6.559119701385498, "learning_rate": 7.503759398496241e-07, "loss": 0.1389, "step": 61510 }, { "epoch": 92.51, "grad_norm": 4.981884479522705, "learning_rate": 7.488721804511279e-07, "loss": 0.1999, "step": 61520 }, { "epoch": 92.53, "grad_norm": 6.0175604820251465, "learning_rate": 7.473684210526316e-07, "loss": 0.194, "step": 61530 }, { "epoch": 92.54, "grad_norm": 6.699028968811035, "learning_rate": 7.458646616541354e-07, "loss": 0.2085, "step": 61540 }, { "epoch": 92.56, "grad_norm": 9.893536567687988, "learning_rate": 7.443609022556392e-07, "loss": 0.1569, "step": 61550 }, { "epoch": 92.57, "grad_norm": 3.370506525039673, "learning_rate": 7.428571428571429e-07, "loss": 0.1606, "step": 61560 }, { "epoch": 92.59, "grad_norm": 4.382241725921631, "learning_rate": 7.413533834586467e-07, "loss": 0.1473, "step": 61570 }, { "epoch": 92.6, "grad_norm": 1.784295916557312, "learning_rate": 7.398496240601505e-07, "loss": 0.1692, "step": 61580 }, { "epoch": 92.62, "grad_norm": 2.770347833633423, "learning_rate": 7.383458646616543e-07, "loss": 0.1704, "step": 61590 }, { "epoch": 92.63, "grad_norm": 3.2620766162872314, "learning_rate": 7.368421052631579e-07, "loss": 0.1742, "step": 61600 }, { "epoch": 92.65, "grad_norm": 8.29354476928711, "learning_rate": 7.353383458646617e-07, "loss": 0.1271, "step": 61610 }, { "epoch": 92.66, "grad_norm": 3.8892619609832764, "learning_rate": 7.338345864661654e-07, "loss": 0.2461, "step": 61620 }, { "epoch": 92.68, "grad_norm": 7.872462272644043, "learning_rate": 7.323308270676692e-07, "loss": 0.1877, "step": 61630 }, { "epoch": 92.69, "grad_norm": 4.30030632019043, "learning_rate": 7.30827067669173e-07, "loss": 0.167, "step": 61640 }, { "epoch": 92.71, "grad_norm": 3.3043644428253174, "learning_rate": 7.293233082706767e-07, "loss": 0.2256, "step": 61650 }, { "epoch": 92.72, "grad_norm": 6.955836296081543, "learning_rate": 7.278195488721805e-07, "loss": 0.2138, "step": 61660 }, { "epoch": 92.74, "grad_norm": 2.1110410690307617, "learning_rate": 7.263157894736843e-07, "loss": 0.1773, "step": 61670 }, { "epoch": 92.75, "grad_norm": 6.610522747039795, "learning_rate": 7.248120300751881e-07, "loss": 0.1779, "step": 61680 }, { "epoch": 92.77, "grad_norm": 4.267979145050049, "learning_rate": 7.233082706766918e-07, "loss": 0.1249, "step": 61690 }, { "epoch": 92.78, "grad_norm": 5.974915027618408, "learning_rate": 7.218045112781956e-07, "loss": 0.1679, "step": 61700 }, { "epoch": 92.8, "grad_norm": 2.948686122894287, "learning_rate": 7.203007518796994e-07, "loss": 0.1572, "step": 61710 }, { "epoch": 92.81, "grad_norm": 4.3910040855407715, "learning_rate": 7.187969924812031e-07, "loss": 0.1469, "step": 61720 }, { "epoch": 92.83, "grad_norm": 5.496255874633789, "learning_rate": 7.172932330827067e-07, "loss": 0.1574, "step": 61730 }, { "epoch": 92.84, "grad_norm": 6.861441135406494, "learning_rate": 7.157894736842106e-07, "loss": 0.161, "step": 61740 }, { "epoch": 92.86, "grad_norm": 6.682887554168701, "learning_rate": 7.142857142857143e-07, "loss": 0.1741, "step": 61750 }, { "epoch": 92.87, "grad_norm": 5.851301670074463, "learning_rate": 7.12781954887218e-07, "loss": 0.2264, "step": 61760 }, { "epoch": 92.89, "grad_norm": 6.062599182128906, "learning_rate": 7.112781954887219e-07, "loss": 0.1853, "step": 61770 }, { "epoch": 92.9, "grad_norm": 3.691725969314575, "learning_rate": 7.097744360902256e-07, "loss": 0.1196, "step": 61780 }, { "epoch": 92.92, "grad_norm": 6.096661567687988, "learning_rate": 7.082706766917293e-07, "loss": 0.1816, "step": 61790 }, { "epoch": 92.93, "grad_norm": 0.75389564037323, "learning_rate": 7.067669172932332e-07, "loss": 0.1673, "step": 61800 }, { "epoch": 92.95, "grad_norm": 8.029284477233887, "learning_rate": 7.052631578947369e-07, "loss": 0.1728, "step": 61810 }, { "epoch": 92.96, "grad_norm": 9.124761581420898, "learning_rate": 7.037593984962407e-07, "loss": 0.2143, "step": 61820 }, { "epoch": 92.98, "grad_norm": 6.918426036834717, "learning_rate": 7.022556390977445e-07, "loss": 0.1845, "step": 61830 }, { "epoch": 92.99, "grad_norm": 6.886169910430908, "learning_rate": 7.007518796992482e-07, "loss": 0.1807, "step": 61840 }, { "epoch": 93.0, "eval_accuracy": 0.9307, "eval_loss": 0.3299960494041443, "eval_runtime": 84.6466, "eval_samples_per_second": 118.138, "eval_steps_per_second": 0.473, "step": 61845 }, { "epoch": 93.01, "grad_norm": 5.7528886795043945, "learning_rate": 6.992481203007518e-07, "loss": 0.1765, "step": 61850 }, { "epoch": 93.02, "grad_norm": 6.218891620635986, "learning_rate": 6.977443609022557e-07, "loss": 0.1465, "step": 61860 }, { "epoch": 93.04, "grad_norm": 3.1827406883239746, "learning_rate": 6.962406015037594e-07, "loss": 0.1436, "step": 61870 }, { "epoch": 93.05, "grad_norm": 2.038198232650757, "learning_rate": 6.947368421052631e-07, "loss": 0.1519, "step": 61880 }, { "epoch": 93.07, "grad_norm": 4.038834095001221, "learning_rate": 6.93233082706767e-07, "loss": 0.1381, "step": 61890 }, { "epoch": 93.08, "grad_norm": 8.468280792236328, "learning_rate": 6.917293233082707e-07, "loss": 0.1598, "step": 61900 }, { "epoch": 93.1, "grad_norm": 5.541633129119873, "learning_rate": 6.902255639097745e-07, "loss": 0.1888, "step": 61910 }, { "epoch": 93.11, "grad_norm": 6.266165733337402, "learning_rate": 6.887218045112783e-07, "loss": 0.1338, "step": 61920 }, { "epoch": 93.13, "grad_norm": 2.7548935413360596, "learning_rate": 6.87218045112782e-07, "loss": 0.1435, "step": 61930 }, { "epoch": 93.14, "grad_norm": 6.2727131843566895, "learning_rate": 6.857142857142858e-07, "loss": 0.1858, "step": 61940 }, { "epoch": 93.16, "grad_norm": 8.00541877746582, "learning_rate": 6.842105263157896e-07, "loss": 0.1809, "step": 61950 }, { "epoch": 93.17, "grad_norm": 4.776533126831055, "learning_rate": 6.827067669172933e-07, "loss": 0.1105, "step": 61960 }, { "epoch": 93.19, "grad_norm": 0.5957467555999756, "learning_rate": 6.812030075187971e-07, "loss": 0.1535, "step": 61970 }, { "epoch": 93.2, "grad_norm": 2.1154375076293945, "learning_rate": 6.796992481203008e-07, "loss": 0.1391, "step": 61980 }, { "epoch": 93.22, "grad_norm": 3.501743793487549, "learning_rate": 6.781954887218045e-07, "loss": 0.1809, "step": 61990 }, { "epoch": 93.23, "grad_norm": 19.21070098876953, "learning_rate": 6.766917293233083e-07, "loss": 0.2236, "step": 62000 }, { "epoch": 93.25, "grad_norm": 8.024877548217773, "learning_rate": 6.751879699248121e-07, "loss": 0.1522, "step": 62010 }, { "epoch": 93.26, "grad_norm": 3.7269623279571533, "learning_rate": 6.736842105263158e-07, "loss": 0.1898, "step": 62020 }, { "epoch": 93.28, "grad_norm": 3.2718605995178223, "learning_rate": 6.721804511278196e-07, "loss": 0.1798, "step": 62030 }, { "epoch": 93.29, "grad_norm": 5.502279281616211, "learning_rate": 6.706766917293234e-07, "loss": 0.1604, "step": 62040 }, { "epoch": 93.31, "grad_norm": 5.675206661224365, "learning_rate": 6.691729323308272e-07, "loss": 0.1656, "step": 62050 }, { "epoch": 93.32, "grad_norm": 5.183054447174072, "learning_rate": 6.676691729323309e-07, "loss": 0.2103, "step": 62060 }, { "epoch": 93.34, "grad_norm": 1.9447113275527954, "learning_rate": 6.661654135338347e-07, "loss": 0.1621, "step": 62070 }, { "epoch": 93.35, "grad_norm": 3.6219539642333984, "learning_rate": 6.646616541353385e-07, "loss": 0.126, "step": 62080 }, { "epoch": 93.37, "grad_norm": 4.979419708251953, "learning_rate": 6.631578947368422e-07, "loss": 0.1749, "step": 62090 }, { "epoch": 93.38, "grad_norm": 7.001568794250488, "learning_rate": 6.616541353383458e-07, "loss": 0.1297, "step": 62100 }, { "epoch": 93.4, "grad_norm": 7.3389387130737305, "learning_rate": 6.601503759398496e-07, "loss": 0.1794, "step": 62110 }, { "epoch": 93.41, "grad_norm": 4.73595666885376, "learning_rate": 6.586466165413534e-07, "loss": 0.1852, "step": 62120 }, { "epoch": 93.43, "grad_norm": 5.426881790161133, "learning_rate": 6.571428571428571e-07, "loss": 0.2275, "step": 62130 }, { "epoch": 93.44, "grad_norm": 7.103865146636963, "learning_rate": 6.55639097744361e-07, "loss": 0.1765, "step": 62140 }, { "epoch": 93.46, "grad_norm": 5.244080066680908, "learning_rate": 6.541353383458647e-07, "loss": 0.1559, "step": 62150 }, { "epoch": 93.47, "grad_norm": 11.155570030212402, "learning_rate": 6.526315789473684e-07, "loss": 0.191, "step": 62160 }, { "epoch": 93.49, "grad_norm": 3.5410802364349365, "learning_rate": 6.511278195488723e-07, "loss": 0.1308, "step": 62170 }, { "epoch": 93.5, "grad_norm": 5.664346218109131, "learning_rate": 6.49624060150376e-07, "loss": 0.1734, "step": 62180 }, { "epoch": 93.52, "grad_norm": 3.049161434173584, "learning_rate": 6.481203007518797e-07, "loss": 0.1595, "step": 62190 }, { "epoch": 93.53, "grad_norm": 3.4412500858306885, "learning_rate": 6.466165413533836e-07, "loss": 0.1986, "step": 62200 }, { "epoch": 93.55, "grad_norm": 5.76616096496582, "learning_rate": 6.451127819548873e-07, "loss": 0.2203, "step": 62210 }, { "epoch": 93.56, "grad_norm": 3.9202961921691895, "learning_rate": 6.436090225563909e-07, "loss": 0.1739, "step": 62220 }, { "epoch": 93.58, "grad_norm": 5.559597969055176, "learning_rate": 6.421052631578948e-07, "loss": 0.1472, "step": 62230 }, { "epoch": 93.59, "grad_norm": 6.698195934295654, "learning_rate": 6.406015037593985e-07, "loss": 0.1822, "step": 62240 }, { "epoch": 93.61, "grad_norm": 7.657590866088867, "learning_rate": 6.390977443609022e-07, "loss": 0.1469, "step": 62250 }, { "epoch": 93.62, "grad_norm": 9.91557788848877, "learning_rate": 6.375939849624061e-07, "loss": 0.1382, "step": 62260 }, { "epoch": 93.64, "grad_norm": 4.059289932250977, "learning_rate": 6.360902255639098e-07, "loss": 0.1408, "step": 62270 }, { "epoch": 93.65, "grad_norm": 3.5155844688415527, "learning_rate": 6.345864661654136e-07, "loss": 0.153, "step": 62280 }, { "epoch": 93.67, "grad_norm": 2.9289627075195312, "learning_rate": 6.330827067669174e-07, "loss": 0.133, "step": 62290 }, { "epoch": 93.68, "grad_norm": 1.8064905405044556, "learning_rate": 6.315789473684211e-07, "loss": 0.1543, "step": 62300 }, { "epoch": 93.7, "grad_norm": 5.917304039001465, "learning_rate": 6.300751879699249e-07, "loss": 0.1764, "step": 62310 }, { "epoch": 93.71, "grad_norm": 9.10999584197998, "learning_rate": 6.285714285714287e-07, "loss": 0.1601, "step": 62320 }, { "epoch": 93.73, "grad_norm": 9.873708724975586, "learning_rate": 6.270676691729324e-07, "loss": 0.2098, "step": 62330 }, { "epoch": 93.74, "grad_norm": 4.124228000640869, "learning_rate": 6.255639097744362e-07, "loss": 0.1395, "step": 62340 }, { "epoch": 93.76, "grad_norm": 5.858664035797119, "learning_rate": 6.240601503759399e-07, "loss": 0.2124, "step": 62350 }, { "epoch": 93.77, "grad_norm": 5.991408348083496, "learning_rate": 6.225563909774437e-07, "loss": 0.1532, "step": 62360 }, { "epoch": 93.79, "grad_norm": 3.827173948287964, "learning_rate": 6.210526315789474e-07, "loss": 0.1139, "step": 62370 }, { "epoch": 93.8, "grad_norm": 3.3806679248809814, "learning_rate": 6.195488721804512e-07, "loss": 0.1869, "step": 62380 }, { "epoch": 93.82, "grad_norm": 5.624974250793457, "learning_rate": 6.180451127819549e-07, "loss": 0.2374, "step": 62390 }, { "epoch": 93.83, "grad_norm": 3.611284017562866, "learning_rate": 6.165413533834587e-07, "loss": 0.2001, "step": 62400 }, { "epoch": 93.85, "grad_norm": 3.4336488246917725, "learning_rate": 6.150375939849625e-07, "loss": 0.2229, "step": 62410 }, { "epoch": 93.86, "grad_norm": 5.959348201751709, "learning_rate": 6.135338345864662e-07, "loss": 0.1408, "step": 62420 }, { "epoch": 93.88, "grad_norm": 4.848119258880615, "learning_rate": 6.1203007518797e-07, "loss": 0.1566, "step": 62430 }, { "epoch": 93.89, "grad_norm": 4.178713798522949, "learning_rate": 6.105263157894738e-07, "loss": 0.1452, "step": 62440 }, { "epoch": 93.91, "grad_norm": 7.14044713973999, "learning_rate": 6.090225563909775e-07, "loss": 0.15, "step": 62450 }, { "epoch": 93.92, "grad_norm": 6.114139556884766, "learning_rate": 6.075187969924812e-07, "loss": 0.1249, "step": 62460 }, { "epoch": 93.94, "grad_norm": 2.516326904296875, "learning_rate": 6.06015037593985e-07, "loss": 0.112, "step": 62470 }, { "epoch": 93.95, "grad_norm": 6.2728705406188965, "learning_rate": 6.045112781954888e-07, "loss": 0.1425, "step": 62480 }, { "epoch": 93.97, "grad_norm": 5.063921928405762, "learning_rate": 6.030075187969925e-07, "loss": 0.1956, "step": 62490 }, { "epoch": 93.98, "grad_norm": 1.3252296447753906, "learning_rate": 6.015037593984962e-07, "loss": 0.1332, "step": 62500 }, { "epoch": 94.0, "grad_norm": 0.03227702155709267, "learning_rate": 6.000000000000001e-07, "loss": 0.1855, "step": 62510 }, { "epoch": 94.0, "eval_accuracy": 0.9303, "eval_loss": 0.33146244287490845, "eval_runtime": 85.2753, "eval_samples_per_second": 117.267, "eval_steps_per_second": 0.469, "step": 62510 }, { "epoch": 94.02, "grad_norm": 4.525468826293945, "learning_rate": 5.984962406015038e-07, "loss": 0.1478, "step": 62520 }, { "epoch": 94.03, "grad_norm": 3.999218702316284, "learning_rate": 5.969924812030075e-07, "loss": 0.1595, "step": 62530 }, { "epoch": 94.05, "grad_norm": 5.01107120513916, "learning_rate": 5.954887218045114e-07, "loss": 0.1798, "step": 62540 }, { "epoch": 94.06, "grad_norm": 3.2505600452423096, "learning_rate": 5.939849624060151e-07, "loss": 0.1772, "step": 62550 }, { "epoch": 94.08, "grad_norm": 5.565808296203613, "learning_rate": 5.924812030075188e-07, "loss": 0.163, "step": 62560 }, { "epoch": 94.09, "grad_norm": 5.079768180847168, "learning_rate": 5.909774436090226e-07, "loss": 0.1596, "step": 62570 }, { "epoch": 94.11, "grad_norm": 6.390072345733643, "learning_rate": 5.894736842105263e-07, "loss": 0.1867, "step": 62580 }, { "epoch": 94.12, "grad_norm": 3.9409000873565674, "learning_rate": 5.879699248120301e-07, "loss": 0.1428, "step": 62590 }, { "epoch": 94.14, "grad_norm": 7.87678337097168, "learning_rate": 5.864661654135339e-07, "loss": 0.139, "step": 62600 }, { "epoch": 94.15, "grad_norm": 3.9159417152404785, "learning_rate": 5.849624060150377e-07, "loss": 0.1449, "step": 62610 }, { "epoch": 94.17, "grad_norm": 5.255906581878662, "learning_rate": 5.834586466165414e-07, "loss": 0.1411, "step": 62620 }, { "epoch": 94.18, "grad_norm": 4.5999436378479, "learning_rate": 5.819548872180452e-07, "loss": 0.1697, "step": 62630 }, { "epoch": 94.2, "grad_norm": 10.575860977172852, "learning_rate": 5.804511278195489e-07, "loss": 0.1787, "step": 62640 }, { "epoch": 94.21, "grad_norm": 8.932710647583008, "learning_rate": 5.789473684210526e-07, "loss": 0.1589, "step": 62650 }, { "epoch": 94.23, "grad_norm": 8.139424324035645, "learning_rate": 5.774436090225565e-07, "loss": 0.1776, "step": 62660 }, { "epoch": 94.24, "grad_norm": 1.6099450588226318, "learning_rate": 5.759398496240602e-07, "loss": 0.1939, "step": 62670 }, { "epoch": 94.26, "grad_norm": 2.6869306564331055, "learning_rate": 5.74436090225564e-07, "loss": 0.1988, "step": 62680 }, { "epoch": 94.27, "grad_norm": 5.661433219909668, "learning_rate": 5.729323308270677e-07, "loss": 0.2013, "step": 62690 }, { "epoch": 94.29, "grad_norm": 6.6180315017700195, "learning_rate": 5.714285714285715e-07, "loss": 0.188, "step": 62700 }, { "epoch": 94.3, "grad_norm": 4.5534563064575195, "learning_rate": 5.699248120300752e-07, "loss": 0.1996, "step": 62710 }, { "epoch": 94.32, "grad_norm": 5.5353193283081055, "learning_rate": 5.68421052631579e-07, "loss": 0.1214, "step": 62720 }, { "epoch": 94.33, "grad_norm": 3.558548927307129, "learning_rate": 5.669172932330828e-07, "loss": 0.1451, "step": 62730 }, { "epoch": 94.35, "grad_norm": 4.777967929840088, "learning_rate": 5.654135338345865e-07, "loss": 0.1993, "step": 62740 }, { "epoch": 94.36, "grad_norm": 5.252827167510986, "learning_rate": 5.639097744360903e-07, "loss": 0.1442, "step": 62750 }, { "epoch": 94.38, "grad_norm": 6.862250804901123, "learning_rate": 5.62406015037594e-07, "loss": 0.191, "step": 62760 }, { "epoch": 94.39, "grad_norm": 7.284124851226807, "learning_rate": 5.609022556390978e-07, "loss": 0.1813, "step": 62770 }, { "epoch": 94.41, "grad_norm": 4.488624572753906, "learning_rate": 5.593984962406016e-07, "loss": 0.1546, "step": 62780 }, { "epoch": 94.42, "grad_norm": 4.856323719024658, "learning_rate": 5.578947368421053e-07, "loss": 0.1553, "step": 62790 }, { "epoch": 94.44, "grad_norm": 5.90994119644165, "learning_rate": 5.563909774436091e-07, "loss": 0.168, "step": 62800 }, { "epoch": 94.45, "grad_norm": 4.328577041625977, "learning_rate": 5.548872180451127e-07, "loss": 0.1539, "step": 62810 }, { "epoch": 94.47, "grad_norm": 2.8183932304382324, "learning_rate": 5.533834586466166e-07, "loss": 0.1973, "step": 62820 }, { "epoch": 94.48, "grad_norm": 5.438328266143799, "learning_rate": 5.518796992481203e-07, "loss": 0.17, "step": 62830 }, { "epoch": 94.5, "grad_norm": 4.676559925079346, "learning_rate": 5.503759398496241e-07, "loss": 0.131, "step": 62840 }, { "epoch": 94.51, "grad_norm": 8.041285514831543, "learning_rate": 5.488721804511279e-07, "loss": 0.1931, "step": 62850 }, { "epoch": 94.53, "grad_norm": 6.983100414276123, "learning_rate": 5.473684210526316e-07, "loss": 0.1621, "step": 62860 }, { "epoch": 94.54, "grad_norm": 8.173839569091797, "learning_rate": 5.458646616541354e-07, "loss": 0.1366, "step": 62870 }, { "epoch": 94.56, "grad_norm": 4.067495346069336, "learning_rate": 5.443609022556391e-07, "loss": 0.137, "step": 62880 }, { "epoch": 94.57, "grad_norm": 1.778714656829834, "learning_rate": 5.428571428571429e-07, "loss": 0.1684, "step": 62890 }, { "epoch": 94.59, "grad_norm": 4.966789722442627, "learning_rate": 5.413533834586466e-07, "loss": 0.1576, "step": 62900 }, { "epoch": 94.6, "grad_norm": 6.58605432510376, "learning_rate": 5.398496240601505e-07, "loss": 0.1515, "step": 62910 }, { "epoch": 94.62, "grad_norm": 5.37683629989624, "learning_rate": 5.383458646616542e-07, "loss": 0.2039, "step": 62920 }, { "epoch": 94.63, "grad_norm": 4.260867595672607, "learning_rate": 5.368421052631579e-07, "loss": 0.169, "step": 62930 }, { "epoch": 94.65, "grad_norm": 8.92151927947998, "learning_rate": 5.353383458646617e-07, "loss": 0.1674, "step": 62940 }, { "epoch": 94.66, "grad_norm": 5.447167873382568, "learning_rate": 5.338345864661654e-07, "loss": 0.1623, "step": 62950 }, { "epoch": 94.68, "grad_norm": 5.827322483062744, "learning_rate": 5.323308270676692e-07, "loss": 0.1627, "step": 62960 }, { "epoch": 94.69, "grad_norm": 3.478543281555176, "learning_rate": 5.30827067669173e-07, "loss": 0.1619, "step": 62970 }, { "epoch": 94.71, "grad_norm": 2.6951584815979004, "learning_rate": 5.293233082706768e-07, "loss": 0.1155, "step": 62980 }, { "epoch": 94.72, "grad_norm": 4.290323734283447, "learning_rate": 5.278195488721805e-07, "loss": 0.2299, "step": 62990 }, { "epoch": 94.74, "grad_norm": 2.3207318782806396, "learning_rate": 5.263157894736843e-07, "loss": 0.1377, "step": 63000 }, { "epoch": 94.75, "grad_norm": 4.391146659851074, "learning_rate": 5.24812030075188e-07, "loss": 0.1383, "step": 63010 }, { "epoch": 94.77, "grad_norm": 8.625882148742676, "learning_rate": 5.233082706766917e-07, "loss": 0.1567, "step": 63020 }, { "epoch": 94.78, "grad_norm": 5.556321620941162, "learning_rate": 5.218045112781956e-07, "loss": 0.1483, "step": 63030 }, { "epoch": 94.8, "grad_norm": 3.1565639972686768, "learning_rate": 5.203007518796993e-07, "loss": 0.1173, "step": 63040 }, { "epoch": 94.81, "grad_norm": 1.9764903783798218, "learning_rate": 5.18796992481203e-07, "loss": 0.1378, "step": 63050 }, { "epoch": 94.83, "grad_norm": 6.802098274230957, "learning_rate": 5.172932330827068e-07, "loss": 0.1888, "step": 63060 }, { "epoch": 94.84, "grad_norm": 10.17620849609375, "learning_rate": 5.157894736842106e-07, "loss": 0.1397, "step": 63070 }, { "epoch": 94.86, "grad_norm": 2.024324893951416, "learning_rate": 5.142857142857143e-07, "loss": 0.1553, "step": 63080 }, { "epoch": 94.87, "grad_norm": 4.072267532348633, "learning_rate": 5.127819548872181e-07, "loss": 0.1659, "step": 63090 }, { "epoch": 94.89, "grad_norm": 5.023469924926758, "learning_rate": 5.112781954887219e-07, "loss": 0.1639, "step": 63100 }, { "epoch": 94.9, "grad_norm": 8.388982772827148, "learning_rate": 5.097744360902256e-07, "loss": 0.1692, "step": 63110 }, { "epoch": 94.92, "grad_norm": 3.7290244102478027, "learning_rate": 5.082706766917294e-07, "loss": 0.1179, "step": 63120 }, { "epoch": 94.93, "grad_norm": 7.133794784545898, "learning_rate": 5.067669172932331e-07, "loss": 0.1777, "step": 63130 }, { "epoch": 94.95, "grad_norm": 1.3478121757507324, "learning_rate": 5.052631578947369e-07, "loss": 0.1594, "step": 63140 }, { "epoch": 94.96, "grad_norm": 2.427905321121216, "learning_rate": 5.037593984962407e-07, "loss": 0.1275, "step": 63150 }, { "epoch": 94.98, "grad_norm": 4.035090446472168, "learning_rate": 5.022556390977444e-07, "loss": 0.1687, "step": 63160 }, { "epoch": 94.99, "grad_norm": 3.4016542434692383, "learning_rate": 5.007518796992482e-07, "loss": 0.1747, "step": 63170 }, { "epoch": 95.0, "eval_accuracy": 0.9295, "eval_loss": 0.33236411213874817, "eval_runtime": 84.9502, "eval_samples_per_second": 117.716, "eval_steps_per_second": 0.471, "step": 63175 }, { "epoch": 95.01, "grad_norm": 6.205942630767822, "learning_rate": 4.99248120300752e-07, "loss": 0.1474, "step": 63180 }, { "epoch": 95.02, "grad_norm": 6.6230788230896, "learning_rate": 4.977443609022557e-07, "loss": 0.1549, "step": 63190 }, { "epoch": 95.04, "grad_norm": 8.845964431762695, "learning_rate": 4.962406015037594e-07, "loss": 0.1962, "step": 63200 }, { "epoch": 95.05, "grad_norm": 3.8196167945861816, "learning_rate": 4.947368421052632e-07, "loss": 0.1145, "step": 63210 }, { "epoch": 95.07, "grad_norm": 4.0402607917785645, "learning_rate": 4.93233082706767e-07, "loss": 0.1538, "step": 63220 }, { "epoch": 95.08, "grad_norm": 0.3398207724094391, "learning_rate": 4.917293233082707e-07, "loss": 0.1827, "step": 63230 }, { "epoch": 95.1, "grad_norm": 5.126389503479004, "learning_rate": 4.902255639097745e-07, "loss": 0.1691, "step": 63240 }, { "epoch": 95.11, "grad_norm": 5.045697212219238, "learning_rate": 4.887218045112782e-07, "loss": 0.1115, "step": 63250 }, { "epoch": 95.13, "grad_norm": 6.75160551071167, "learning_rate": 4.87218045112782e-07, "loss": 0.1862, "step": 63260 }, { "epoch": 95.14, "grad_norm": 3.8719749450683594, "learning_rate": 4.857142857142857e-07, "loss": 0.1878, "step": 63270 }, { "epoch": 95.16, "grad_norm": 6.269413471221924, "learning_rate": 4.842105263157895e-07, "loss": 0.1563, "step": 63280 }, { "epoch": 95.17, "grad_norm": 4.618571758270264, "learning_rate": 4.827067669172933e-07, "loss": 0.1733, "step": 63290 }, { "epoch": 95.19, "grad_norm": 4.135458469390869, "learning_rate": 4.81203007518797e-07, "loss": 0.2016, "step": 63300 }, { "epoch": 95.2, "grad_norm": 7.51487398147583, "learning_rate": 4.796992481203008e-07, "loss": 0.1643, "step": 63310 }, { "epoch": 95.22, "grad_norm": 4.156686305999756, "learning_rate": 4.781954887218045e-07, "loss": 0.1999, "step": 63320 }, { "epoch": 95.23, "grad_norm": 8.415448188781738, "learning_rate": 4.766917293233083e-07, "loss": 0.2051, "step": 63330 }, { "epoch": 95.25, "grad_norm": 3.9293386936187744, "learning_rate": 4.7518796992481207e-07, "loss": 0.1811, "step": 63340 }, { "epoch": 95.26, "grad_norm": 5.730102062225342, "learning_rate": 4.7368421052631585e-07, "loss": 0.1573, "step": 63350 }, { "epoch": 95.28, "grad_norm": 5.62534236907959, "learning_rate": 4.7218045112781963e-07, "loss": 0.1802, "step": 63360 }, { "epoch": 95.29, "grad_norm": 5.367505073547363, "learning_rate": 4.706766917293233e-07, "loss": 0.1676, "step": 63370 }, { "epoch": 95.31, "grad_norm": 5.48174524307251, "learning_rate": 4.691729323308271e-07, "loss": 0.1394, "step": 63380 }, { "epoch": 95.32, "grad_norm": 5.894782543182373, "learning_rate": 4.676691729323309e-07, "loss": 0.1703, "step": 63390 }, { "epoch": 95.34, "grad_norm": 3.4429852962493896, "learning_rate": 4.661654135338346e-07, "loss": 0.1782, "step": 63400 }, { "epoch": 95.35, "grad_norm": 5.609094142913818, "learning_rate": 4.646616541353384e-07, "loss": 0.2212, "step": 63410 }, { "epoch": 95.37, "grad_norm": 6.826117992401123, "learning_rate": 4.631578947368422e-07, "loss": 0.1549, "step": 63420 }, { "epoch": 95.38, "grad_norm": 6.199655532836914, "learning_rate": 4.6165413533834585e-07, "loss": 0.2035, "step": 63430 }, { "epoch": 95.4, "grad_norm": 3.159210443496704, "learning_rate": 4.6015037593984964e-07, "loss": 0.168, "step": 63440 }, { "epoch": 95.41, "grad_norm": 4.340883255004883, "learning_rate": 4.586466165413534e-07, "loss": 0.1506, "step": 63450 }, { "epoch": 95.43, "grad_norm": 6.630001068115234, "learning_rate": 4.571428571428572e-07, "loss": 0.1342, "step": 63460 }, { "epoch": 95.44, "grad_norm": 6.111904144287109, "learning_rate": 4.5563909774436094e-07, "loss": 0.1454, "step": 63470 }, { "epoch": 95.46, "grad_norm": 8.334033012390137, "learning_rate": 4.541353383458647e-07, "loss": 0.1583, "step": 63480 }, { "epoch": 95.47, "grad_norm": 3.537055730819702, "learning_rate": 4.526315789473685e-07, "loss": 0.1506, "step": 63490 }, { "epoch": 95.49, "grad_norm": 4.5531792640686035, "learning_rate": 4.511278195488722e-07, "loss": 0.1923, "step": 63500 }, { "epoch": 95.5, "grad_norm": 4.920889854431152, "learning_rate": 4.4962406015037597e-07, "loss": 0.1556, "step": 63510 }, { "epoch": 95.52, "grad_norm": 4.547194004058838, "learning_rate": 4.4812030075187975e-07, "loss": 0.1974, "step": 63520 }, { "epoch": 95.53, "grad_norm": 3.0299079418182373, "learning_rate": 4.466165413533835e-07, "loss": 0.1801, "step": 63530 }, { "epoch": 95.55, "grad_norm": 5.605795383453369, "learning_rate": 4.4511278195488726e-07, "loss": 0.1776, "step": 63540 }, { "epoch": 95.56, "grad_norm": 8.868003845214844, "learning_rate": 4.4360902255639105e-07, "loss": 0.1175, "step": 63550 }, { "epoch": 95.58, "grad_norm": 5.188925266265869, "learning_rate": 4.421052631578947e-07, "loss": 0.1546, "step": 63560 }, { "epoch": 95.59, "grad_norm": 3.829498529434204, "learning_rate": 4.406015037593985e-07, "loss": 0.2129, "step": 63570 }, { "epoch": 95.61, "grad_norm": 7.33263635635376, "learning_rate": 4.390977443609023e-07, "loss": 0.2169, "step": 63580 }, { "epoch": 95.62, "grad_norm": 6.37952995300293, "learning_rate": 4.375939849624061e-07, "loss": 0.1823, "step": 63590 }, { "epoch": 95.64, "grad_norm": 4.5436201095581055, "learning_rate": 4.360902255639098e-07, "loss": 0.1293, "step": 63600 }, { "epoch": 95.65, "grad_norm": 4.836911201477051, "learning_rate": 4.345864661654136e-07, "loss": 0.1519, "step": 63610 }, { "epoch": 95.67, "grad_norm": 3.8880579471588135, "learning_rate": 4.330827067669173e-07, "loss": 0.1459, "step": 63620 }, { "epoch": 95.68, "grad_norm": 5.021939277648926, "learning_rate": 4.3157894736842105e-07, "loss": 0.2607, "step": 63630 }, { "epoch": 95.7, "grad_norm": 5.220824241638184, "learning_rate": 4.3007518796992484e-07, "loss": 0.2041, "step": 63640 }, { "epoch": 95.71, "grad_norm": 6.0544514656066895, "learning_rate": 4.285714285714286e-07, "loss": 0.1999, "step": 63650 }, { "epoch": 95.73, "grad_norm": 3.8796777725219727, "learning_rate": 4.270676691729324e-07, "loss": 0.1769, "step": 63660 }, { "epoch": 95.74, "grad_norm": 3.2138679027557373, "learning_rate": 4.2556390977443613e-07, "loss": 0.1565, "step": 63670 }, { "epoch": 95.76, "grad_norm": 6.524970054626465, "learning_rate": 4.2406015037593987e-07, "loss": 0.1141, "step": 63680 }, { "epoch": 95.77, "grad_norm": 5.798688888549805, "learning_rate": 4.2255639097744365e-07, "loss": 0.182, "step": 63690 }, { "epoch": 95.79, "grad_norm": 4.808920383453369, "learning_rate": 4.210526315789474e-07, "loss": 0.1496, "step": 63700 }, { "epoch": 95.8, "grad_norm": 2.4658915996551514, "learning_rate": 4.1954887218045116e-07, "loss": 0.1157, "step": 63710 }, { "epoch": 95.82, "grad_norm": 3.5132291316986084, "learning_rate": 4.1804511278195495e-07, "loss": 0.1528, "step": 63720 }, { "epoch": 95.83, "grad_norm": 8.57204818725586, "learning_rate": 4.165413533834587e-07, "loss": 0.2021, "step": 63730 }, { "epoch": 95.85, "grad_norm": 6.247355937957764, "learning_rate": 4.150375939849624e-07, "loss": 0.1266, "step": 63740 }, { "epoch": 95.86, "grad_norm": 6.308277606964111, "learning_rate": 4.135338345864662e-07, "loss": 0.1494, "step": 63750 }, { "epoch": 95.88, "grad_norm": 4.962404251098633, "learning_rate": 4.120300751879699e-07, "loss": 0.1847, "step": 63760 }, { "epoch": 95.89, "grad_norm": 3.808089017868042, "learning_rate": 4.105263157894737e-07, "loss": 0.1727, "step": 63770 }, { "epoch": 95.91, "grad_norm": 4.107529640197754, "learning_rate": 4.090225563909775e-07, "loss": 0.1562, "step": 63780 }, { "epoch": 95.92, "grad_norm": 6.463099956512451, "learning_rate": 4.075187969924813e-07, "loss": 0.145, "step": 63790 }, { "epoch": 95.94, "grad_norm": 9.739261627197266, "learning_rate": 4.06015037593985e-07, "loss": 0.1749, "step": 63800 }, { "epoch": 95.95, "grad_norm": 4.032473087310791, "learning_rate": 4.0451127819548874e-07, "loss": 0.1542, "step": 63810 }, { "epoch": 95.97, "grad_norm": 4.938335418701172, "learning_rate": 4.030075187969925e-07, "loss": 0.1901, "step": 63820 }, { "epoch": 95.98, "grad_norm": 5.375936985015869, "learning_rate": 4.0150375939849625e-07, "loss": 0.2043, "step": 63830 }, { "epoch": 96.0, "grad_norm": 15.463907241821289, "learning_rate": 4.0000000000000003e-07, "loss": 0.1783, "step": 63840 }, { "epoch": 96.0, "eval_accuracy": 0.9315, "eval_loss": 0.3313089907169342, "eval_runtime": 85.2264, "eval_samples_per_second": 117.334, "eval_steps_per_second": 0.469, "step": 63840 }, { "epoch": 96.02, "grad_norm": 3.8098232746124268, "learning_rate": 3.984962406015038e-07, "loss": 0.177, "step": 63850 }, { "epoch": 96.03, "grad_norm": 2.248171806335449, "learning_rate": 3.969924812030076e-07, "loss": 0.1043, "step": 63860 }, { "epoch": 96.05, "grad_norm": 5.304133415222168, "learning_rate": 3.954887218045113e-07, "loss": 0.1221, "step": 63870 }, { "epoch": 96.06, "grad_norm": 9.222192764282227, "learning_rate": 3.9398496240601506e-07, "loss": 0.1829, "step": 63880 }, { "epoch": 96.08, "grad_norm": 5.404617786407471, "learning_rate": 3.9248120300751885e-07, "loss": 0.1773, "step": 63890 }, { "epoch": 96.09, "grad_norm": 4.069046974182129, "learning_rate": 3.909774436090226e-07, "loss": 0.1894, "step": 63900 }, { "epoch": 96.11, "grad_norm": 5.911056995391846, "learning_rate": 3.8947368421052636e-07, "loss": 0.1733, "step": 63910 }, { "epoch": 96.12, "grad_norm": 5.389181137084961, "learning_rate": 3.8796992481203015e-07, "loss": 0.1594, "step": 63920 }, { "epoch": 96.14, "grad_norm": 5.494384288787842, "learning_rate": 3.864661654135338e-07, "loss": 0.1526, "step": 63930 }, { "epoch": 96.15, "grad_norm": 3.113043785095215, "learning_rate": 3.849624060150376e-07, "loss": 0.1355, "step": 63940 }, { "epoch": 96.17, "grad_norm": 6.085422039031982, "learning_rate": 3.834586466165414e-07, "loss": 0.1163, "step": 63950 }, { "epoch": 96.18, "grad_norm": 4.339461326599121, "learning_rate": 3.819548872180451e-07, "loss": 0.1856, "step": 63960 }, { "epoch": 96.2, "grad_norm": 5.448220729827881, "learning_rate": 3.804511278195489e-07, "loss": 0.1677, "step": 63970 }, { "epoch": 96.21, "grad_norm": 8.586206436157227, "learning_rate": 3.789473684210527e-07, "loss": 0.1541, "step": 63980 }, { "epoch": 96.23, "grad_norm": 7.591394424438477, "learning_rate": 3.7744360902255637e-07, "loss": 0.1415, "step": 63990 }, { "epoch": 96.24, "grad_norm": 6.6830153465271, "learning_rate": 3.7593984962406015e-07, "loss": 0.1787, "step": 64000 }, { "epoch": 96.26, "grad_norm": 4.420645713806152, "learning_rate": 3.7443609022556394e-07, "loss": 0.1118, "step": 64010 }, { "epoch": 96.27, "grad_norm": 6.308743000030518, "learning_rate": 3.729323308270677e-07, "loss": 0.1742, "step": 64020 }, { "epoch": 96.29, "grad_norm": 5.491370677947998, "learning_rate": 3.7142857142857145e-07, "loss": 0.1964, "step": 64030 }, { "epoch": 96.3, "grad_norm": 3.9227075576782227, "learning_rate": 3.6992481203007523e-07, "loss": 0.1793, "step": 64040 }, { "epoch": 96.32, "grad_norm": 2.4654548168182373, "learning_rate": 3.6842105263157896e-07, "loss": 0.1948, "step": 64050 }, { "epoch": 96.33, "grad_norm": 4.7875590324401855, "learning_rate": 3.669172932330827e-07, "loss": 0.1703, "step": 64060 }, { "epoch": 96.35, "grad_norm": 5.792588233947754, "learning_rate": 3.654135338345865e-07, "loss": 0.2115, "step": 64070 }, { "epoch": 96.36, "grad_norm": 3.1025025844573975, "learning_rate": 3.6390977443609026e-07, "loss": 0.1557, "step": 64080 }, { "epoch": 96.38, "grad_norm": 7.800917625427246, "learning_rate": 3.6240601503759405e-07, "loss": 0.1808, "step": 64090 }, { "epoch": 96.39, "grad_norm": 5.239587306976318, "learning_rate": 3.609022556390978e-07, "loss": 0.1319, "step": 64100 }, { "epoch": 96.41, "grad_norm": 4.564650058746338, "learning_rate": 3.5939849624060156e-07, "loss": 0.1893, "step": 64110 }, { "epoch": 96.42, "grad_norm": 5.047513484954834, "learning_rate": 3.578947368421053e-07, "loss": 0.1427, "step": 64120 }, { "epoch": 96.44, "grad_norm": 5.4176249504089355, "learning_rate": 3.56390977443609e-07, "loss": 0.1932, "step": 64130 }, { "epoch": 96.45, "grad_norm": 5.930634021759033, "learning_rate": 3.548872180451128e-07, "loss": 0.1444, "step": 64140 }, { "epoch": 96.47, "grad_norm": 4.005468368530273, "learning_rate": 3.533834586466166e-07, "loss": 0.163, "step": 64150 }, { "epoch": 96.48, "grad_norm": 6.028830528259277, "learning_rate": 3.518796992481204e-07, "loss": 0.1771, "step": 64160 }, { "epoch": 96.5, "grad_norm": 2.6578989028930664, "learning_rate": 3.503759398496241e-07, "loss": 0.1031, "step": 64170 }, { "epoch": 96.51, "grad_norm": 5.434506416320801, "learning_rate": 3.4887218045112784e-07, "loss": 0.1739, "step": 64180 }, { "epoch": 96.53, "grad_norm": 5.756526470184326, "learning_rate": 3.4736842105263157e-07, "loss": 0.1191, "step": 64190 }, { "epoch": 96.54, "grad_norm": 4.299066543579102, "learning_rate": 3.4586466165413535e-07, "loss": 0.1522, "step": 64200 }, { "epoch": 96.56, "grad_norm": 4.130573272705078, "learning_rate": 3.4436090225563913e-07, "loss": 0.1381, "step": 64210 }, { "epoch": 96.57, "grad_norm": 5.61849308013916, "learning_rate": 3.428571428571429e-07, "loss": 0.1467, "step": 64220 }, { "epoch": 96.59, "grad_norm": 4.367659568786621, "learning_rate": 3.4135338345864665e-07, "loss": 0.1557, "step": 64230 }, { "epoch": 96.6, "grad_norm": 2.8992245197296143, "learning_rate": 3.398496240601504e-07, "loss": 0.1537, "step": 64240 }, { "epoch": 96.62, "grad_norm": 3.538975715637207, "learning_rate": 3.3834586466165416e-07, "loss": 0.1396, "step": 64250 }, { "epoch": 96.63, "grad_norm": 8.194727897644043, "learning_rate": 3.368421052631579e-07, "loss": 0.1698, "step": 64260 }, { "epoch": 96.65, "grad_norm": 10.290156364440918, "learning_rate": 3.353383458646617e-07, "loss": 0.1561, "step": 64270 }, { "epoch": 96.66, "grad_norm": 5.7417073249816895, "learning_rate": 3.3383458646616546e-07, "loss": 0.1215, "step": 64280 }, { "epoch": 96.68, "grad_norm": 6.722829818725586, "learning_rate": 3.3233082706766924e-07, "loss": 0.2179, "step": 64290 }, { "epoch": 96.69, "grad_norm": 8.941481590270996, "learning_rate": 3.308270676691729e-07, "loss": 0.1763, "step": 64300 }, { "epoch": 96.71, "grad_norm": 4.601995468139648, "learning_rate": 3.293233082706767e-07, "loss": 0.1611, "step": 64310 }, { "epoch": 96.72, "grad_norm": 7.5421671867370605, "learning_rate": 3.278195488721805e-07, "loss": 0.1459, "step": 64320 }, { "epoch": 96.74, "grad_norm": 2.4349803924560547, "learning_rate": 3.263157894736842e-07, "loss": 0.1892, "step": 64330 }, { "epoch": 96.75, "grad_norm": 1.8686342239379883, "learning_rate": 3.24812030075188e-07, "loss": 0.1407, "step": 64340 }, { "epoch": 96.77, "grad_norm": 5.259500503540039, "learning_rate": 3.233082706766918e-07, "loss": 0.1057, "step": 64350 }, { "epoch": 96.78, "grad_norm": 4.295063018798828, "learning_rate": 3.2180451127819547e-07, "loss": 0.1865, "step": 64360 }, { "epoch": 96.8, "grad_norm": 4.434532642364502, "learning_rate": 3.2030075187969925e-07, "loss": 0.1993, "step": 64370 }, { "epoch": 96.81, "grad_norm": 5.843010902404785, "learning_rate": 3.1879699248120303e-07, "loss": 0.166, "step": 64380 }, { "epoch": 96.83, "grad_norm": 4.545680046081543, "learning_rate": 3.172932330827068e-07, "loss": 0.135, "step": 64390 }, { "epoch": 96.84, "grad_norm": 6.0127739906311035, "learning_rate": 3.1578947368421055e-07, "loss": 0.201, "step": 64400 }, { "epoch": 96.86, "grad_norm": 6.95708703994751, "learning_rate": 3.1428571428571433e-07, "loss": 0.1584, "step": 64410 }, { "epoch": 96.87, "grad_norm": 0.3541220724582672, "learning_rate": 3.127819548872181e-07, "loss": 0.143, "step": 64420 }, { "epoch": 96.89, "grad_norm": 4.712253570556641, "learning_rate": 3.1127819548872185e-07, "loss": 0.1417, "step": 64430 }, { "epoch": 96.9, "grad_norm": 5.915809154510498, "learning_rate": 3.097744360902256e-07, "loss": 0.135, "step": 64440 }, { "epoch": 96.92, "grad_norm": 7.92987060546875, "learning_rate": 3.0827067669172936e-07, "loss": 0.1586, "step": 64450 }, { "epoch": 96.93, "grad_norm": 3.605088472366333, "learning_rate": 3.067669172932331e-07, "loss": 0.1384, "step": 64460 }, { "epoch": 96.95, "grad_norm": 5.079803943634033, "learning_rate": 3.052631578947369e-07, "loss": 0.177, "step": 64470 }, { "epoch": 96.96, "grad_norm": 5.349300384521484, "learning_rate": 3.037593984962406e-07, "loss": 0.2294, "step": 64480 }, { "epoch": 96.98, "grad_norm": 9.588523864746094, "learning_rate": 3.022556390977444e-07, "loss": 0.1532, "step": 64490 }, { "epoch": 96.99, "grad_norm": 3.1783509254455566, "learning_rate": 3.007518796992481e-07, "loss": 0.1256, "step": 64500 }, { "epoch": 97.0, "eval_accuracy": 0.9308, "eval_loss": 0.332674503326416, "eval_runtime": 84.9569, "eval_samples_per_second": 117.707, "eval_steps_per_second": 0.471, "step": 64505 }, { "epoch": 97.01, "grad_norm": 7.344843864440918, "learning_rate": 2.992481203007519e-07, "loss": 0.1059, "step": 64510 }, { "epoch": 97.02, "grad_norm": 10.291905403137207, "learning_rate": 2.977443609022557e-07, "loss": 0.1662, "step": 64520 }, { "epoch": 97.04, "grad_norm": 8.140653610229492, "learning_rate": 2.962406015037594e-07, "loss": 0.2008, "step": 64530 }, { "epoch": 97.05, "grad_norm": 3.3860104084014893, "learning_rate": 2.9473684210526315e-07, "loss": 0.1893, "step": 64540 }, { "epoch": 97.07, "grad_norm": 7.287869930267334, "learning_rate": 2.9323308270676693e-07, "loss": 0.1382, "step": 64550 }, { "epoch": 97.08, "grad_norm": 8.065459251403809, "learning_rate": 2.917293233082707e-07, "loss": 0.1349, "step": 64560 }, { "epoch": 97.1, "grad_norm": 5.940702438354492, "learning_rate": 2.9022556390977445e-07, "loss": 0.1896, "step": 64570 }, { "epoch": 97.11, "grad_norm": 8.597441673278809, "learning_rate": 2.8872180451127823e-07, "loss": 0.1467, "step": 64580 }, { "epoch": 97.13, "grad_norm": 4.228404998779297, "learning_rate": 2.87218045112782e-07, "loss": 0.1584, "step": 64590 }, { "epoch": 97.14, "grad_norm": 8.518570899963379, "learning_rate": 2.8571428571428575e-07, "loss": 0.1133, "step": 64600 }, { "epoch": 97.16, "grad_norm": 5.979735374450684, "learning_rate": 2.842105263157895e-07, "loss": 0.1639, "step": 64610 }, { "epoch": 97.17, "grad_norm": 4.625314235687256, "learning_rate": 2.8270676691729326e-07, "loss": 0.1755, "step": 64620 }, { "epoch": 97.19, "grad_norm": 2.0218687057495117, "learning_rate": 2.81203007518797e-07, "loss": 0.1521, "step": 64630 }, { "epoch": 97.2, "grad_norm": 6.187188148498535, "learning_rate": 2.796992481203008e-07, "loss": 0.162, "step": 64640 }, { "epoch": 97.22, "grad_norm": 6.775570869445801, "learning_rate": 2.7819548872180456e-07, "loss": 0.1703, "step": 64650 }, { "epoch": 97.23, "grad_norm": 1.6887646913528442, "learning_rate": 2.766917293233083e-07, "loss": 0.2072, "step": 64660 }, { "epoch": 97.25, "grad_norm": 5.309747695922852, "learning_rate": 2.751879699248121e-07, "loss": 0.1646, "step": 64670 }, { "epoch": 97.26, "grad_norm": 5.663767337799072, "learning_rate": 2.736842105263158e-07, "loss": 0.166, "step": 64680 }, { "epoch": 97.28, "grad_norm": 5.132472991943359, "learning_rate": 2.7218045112781954e-07, "loss": 0.1857, "step": 64690 }, { "epoch": 97.29, "grad_norm": 5.090039253234863, "learning_rate": 2.706766917293233e-07, "loss": 0.195, "step": 64700 }, { "epoch": 97.31, "grad_norm": 9.945984840393066, "learning_rate": 2.691729323308271e-07, "loss": 0.1869, "step": 64710 }, { "epoch": 97.32, "grad_norm": 9.467114448547363, "learning_rate": 2.6766917293233083e-07, "loss": 0.1387, "step": 64720 }, { "epoch": 97.34, "grad_norm": 2.083440065383911, "learning_rate": 2.661654135338346e-07, "loss": 0.1623, "step": 64730 }, { "epoch": 97.35, "grad_norm": 3.3765485286712646, "learning_rate": 2.646616541353384e-07, "loss": 0.1059, "step": 64740 }, { "epoch": 97.37, "grad_norm": 2.4343514442443848, "learning_rate": 2.6315789473684213e-07, "loss": 0.193, "step": 64750 }, { "epoch": 97.38, "grad_norm": 7.881185531616211, "learning_rate": 2.6165413533834586e-07, "loss": 0.162, "step": 64760 }, { "epoch": 97.4, "grad_norm": 3.8608689308166504, "learning_rate": 2.6015037593984965e-07, "loss": 0.1648, "step": 64770 }, { "epoch": 97.41, "grad_norm": 3.745732069015503, "learning_rate": 2.586466165413534e-07, "loss": 0.1706, "step": 64780 }, { "epoch": 97.43, "grad_norm": 1.355181336402893, "learning_rate": 2.5714285714285716e-07, "loss": 0.1558, "step": 64790 }, { "epoch": 97.44, "grad_norm": 3.218841075897217, "learning_rate": 2.5563909774436095e-07, "loss": 0.1518, "step": 64800 }, { "epoch": 97.46, "grad_norm": 1.6268922090530396, "learning_rate": 2.541353383458647e-07, "loss": 0.1573, "step": 64810 }, { "epoch": 97.47, "grad_norm": 2.534693479537964, "learning_rate": 2.5263157894736846e-07, "loss": 0.1684, "step": 64820 }, { "epoch": 97.49, "grad_norm": 5.223088264465332, "learning_rate": 2.511278195488722e-07, "loss": 0.1612, "step": 64830 }, { "epoch": 97.5, "grad_norm": 4.601364612579346, "learning_rate": 2.49624060150376e-07, "loss": 0.1334, "step": 64840 }, { "epoch": 97.52, "grad_norm": 3.1391961574554443, "learning_rate": 2.481203007518797e-07, "loss": 0.1509, "step": 64850 }, { "epoch": 97.53, "grad_norm": 5.453991889953613, "learning_rate": 2.466165413533835e-07, "loss": 0.1266, "step": 64860 }, { "epoch": 97.55, "grad_norm": 4.62083625793457, "learning_rate": 2.4511278195488727e-07, "loss": 0.158, "step": 64870 }, { "epoch": 97.56, "grad_norm": 1.933967113494873, "learning_rate": 2.43609022556391e-07, "loss": 0.0955, "step": 64880 }, { "epoch": 97.58, "grad_norm": 5.738483428955078, "learning_rate": 2.4210526315789473e-07, "loss": 0.1555, "step": 64890 }, { "epoch": 97.59, "grad_norm": 4.406610012054443, "learning_rate": 2.406015037593985e-07, "loss": 0.1667, "step": 64900 }, { "epoch": 97.61, "grad_norm": 4.557981491088867, "learning_rate": 2.3909774436090225e-07, "loss": 0.1796, "step": 64910 }, { "epoch": 97.62, "grad_norm": 4.654937744140625, "learning_rate": 2.3759398496240603e-07, "loss": 0.1713, "step": 64920 }, { "epoch": 97.64, "grad_norm": 5.671332359313965, "learning_rate": 2.3609022556390982e-07, "loss": 0.1292, "step": 64930 }, { "epoch": 97.65, "grad_norm": 5.643190860748291, "learning_rate": 2.3458646616541355e-07, "loss": 0.1565, "step": 64940 }, { "epoch": 97.67, "grad_norm": 2.340914487838745, "learning_rate": 2.330827067669173e-07, "loss": 0.128, "step": 64950 }, { "epoch": 97.68, "grad_norm": 7.151415824890137, "learning_rate": 2.315789473684211e-07, "loss": 0.1618, "step": 64960 }, { "epoch": 97.7, "grad_norm": 7.448176860809326, "learning_rate": 2.3007518796992482e-07, "loss": 0.1396, "step": 64970 }, { "epoch": 97.71, "grad_norm": 5.021811008453369, "learning_rate": 2.285714285714286e-07, "loss": 0.1772, "step": 64980 }, { "epoch": 97.73, "grad_norm": 0.7833675146102905, "learning_rate": 2.2706766917293236e-07, "loss": 0.1125, "step": 64990 }, { "epoch": 97.74, "grad_norm": 7.163773536682129, "learning_rate": 2.255639097744361e-07, "loss": 0.2541, "step": 65000 }, { "epoch": 97.76, "grad_norm": 7.621840000152588, "learning_rate": 2.2406015037593987e-07, "loss": 0.1367, "step": 65010 }, { "epoch": 97.77, "grad_norm": 3.7870657444000244, "learning_rate": 2.2255639097744363e-07, "loss": 0.1336, "step": 65020 }, { "epoch": 97.79, "grad_norm": 4.785586357116699, "learning_rate": 2.2105263157894736e-07, "loss": 0.1779, "step": 65030 }, { "epoch": 97.8, "grad_norm": 2.2367138862609863, "learning_rate": 2.1954887218045115e-07, "loss": 0.1886, "step": 65040 }, { "epoch": 97.82, "grad_norm": 4.493763446807861, "learning_rate": 2.180451127819549e-07, "loss": 0.1943, "step": 65050 }, { "epoch": 97.83, "grad_norm": 6.870443344116211, "learning_rate": 2.1654135338345866e-07, "loss": 0.0937, "step": 65060 }, { "epoch": 97.85, "grad_norm": 6.161388874053955, "learning_rate": 2.1503759398496242e-07, "loss": 0.192, "step": 65070 }, { "epoch": 97.86, "grad_norm": 5.1376729011535645, "learning_rate": 2.135338345864662e-07, "loss": 0.1587, "step": 65080 }, { "epoch": 97.88, "grad_norm": 3.9065380096435547, "learning_rate": 2.1203007518796993e-07, "loss": 0.1104, "step": 65090 }, { "epoch": 97.89, "grad_norm": 5.1056227684021, "learning_rate": 2.105263157894737e-07, "loss": 0.157, "step": 65100 }, { "epoch": 97.91, "grad_norm": 3.8293356895446777, "learning_rate": 2.0902255639097747e-07, "loss": 0.17, "step": 65110 }, { "epoch": 97.92, "grad_norm": 4.883228302001953, "learning_rate": 2.075187969924812e-07, "loss": 0.1989, "step": 65120 }, { "epoch": 97.94, "grad_norm": 6.1433210372924805, "learning_rate": 2.0601503759398496e-07, "loss": 0.2161, "step": 65130 }, { "epoch": 97.95, "grad_norm": 3.8383498191833496, "learning_rate": 2.0451127819548875e-07, "loss": 0.2358, "step": 65140 }, { "epoch": 97.97, "grad_norm": 3.27778959274292, "learning_rate": 2.030075187969925e-07, "loss": 0.1498, "step": 65150 }, { "epoch": 97.98, "grad_norm": 5.0947265625, "learning_rate": 2.0150375939849626e-07, "loss": 0.1577, "step": 65160 }, { "epoch": 98.0, "grad_norm": 0.12277551740407944, "learning_rate": 2.0000000000000002e-07, "loss": 0.0984, "step": 65170 }, { "epoch": 98.0, "eval_accuracy": 0.9317, "eval_loss": 0.32914501428604126, "eval_runtime": 84.4819, "eval_samples_per_second": 118.369, "eval_steps_per_second": 0.473, "step": 65170 }, { "epoch": 98.02, "grad_norm": 8.998836517333984, "learning_rate": 1.984962406015038e-07, "loss": 0.1479, "step": 65180 }, { "epoch": 98.03, "grad_norm": 4.802471160888672, "learning_rate": 1.9699248120300753e-07, "loss": 0.1792, "step": 65190 }, { "epoch": 98.05, "grad_norm": 6.974554061889648, "learning_rate": 1.954887218045113e-07, "loss": 0.1599, "step": 65200 }, { "epoch": 98.06, "grad_norm": 5.7014641761779785, "learning_rate": 1.9398496240601507e-07, "loss": 0.1527, "step": 65210 }, { "epoch": 98.08, "grad_norm": 7.29013729095459, "learning_rate": 1.924812030075188e-07, "loss": 0.1443, "step": 65220 }, { "epoch": 98.09, "grad_norm": 3.837073802947998, "learning_rate": 1.9097744360902256e-07, "loss": 0.1044, "step": 65230 }, { "epoch": 98.11, "grad_norm": 9.046390533447266, "learning_rate": 1.8947368421052634e-07, "loss": 0.1598, "step": 65240 }, { "epoch": 98.12, "grad_norm": 7.997859954833984, "learning_rate": 1.8796992481203008e-07, "loss": 0.2044, "step": 65250 }, { "epoch": 98.14, "grad_norm": 5.434107780456543, "learning_rate": 1.8646616541353386e-07, "loss": 0.1655, "step": 65260 }, { "epoch": 98.15, "grad_norm": 4.262781620025635, "learning_rate": 1.8496240601503762e-07, "loss": 0.1393, "step": 65270 }, { "epoch": 98.17, "grad_norm": 6.696924209594727, "learning_rate": 1.8345864661654135e-07, "loss": 0.1341, "step": 65280 }, { "epoch": 98.18, "grad_norm": 6.295987129211426, "learning_rate": 1.8195488721804513e-07, "loss": 0.1655, "step": 65290 }, { "epoch": 98.2, "grad_norm": 2.8167364597320557, "learning_rate": 1.804511278195489e-07, "loss": 0.1448, "step": 65300 }, { "epoch": 98.21, "grad_norm": 2.0722157955169678, "learning_rate": 1.7894736842105265e-07, "loss": 0.2011, "step": 65310 }, { "epoch": 98.23, "grad_norm": 4.755069255828857, "learning_rate": 1.774436090225564e-07, "loss": 0.1573, "step": 65320 }, { "epoch": 98.24, "grad_norm": 11.782632827758789, "learning_rate": 1.759398496240602e-07, "loss": 0.1736, "step": 65330 }, { "epoch": 98.26, "grad_norm": 4.000039577484131, "learning_rate": 1.7443609022556392e-07, "loss": 0.1365, "step": 65340 }, { "epoch": 98.27, "grad_norm": 4.4005656242370605, "learning_rate": 1.7293233082706767e-07, "loss": 0.1795, "step": 65350 }, { "epoch": 98.29, "grad_norm": 7.827691555023193, "learning_rate": 1.7142857142857146e-07, "loss": 0.1507, "step": 65360 }, { "epoch": 98.3, "grad_norm": 3.5540237426757812, "learning_rate": 1.699248120300752e-07, "loss": 0.1653, "step": 65370 }, { "epoch": 98.32, "grad_norm": 4.963322639465332, "learning_rate": 1.6842105263157895e-07, "loss": 0.1742, "step": 65380 }, { "epoch": 98.33, "grad_norm": 4.910261154174805, "learning_rate": 1.6691729323308273e-07, "loss": 0.1477, "step": 65390 }, { "epoch": 98.35, "grad_norm": 7.770264148712158, "learning_rate": 1.6541353383458646e-07, "loss": 0.1713, "step": 65400 }, { "epoch": 98.36, "grad_norm": 2.305351495742798, "learning_rate": 1.6390977443609025e-07, "loss": 0.1471, "step": 65410 }, { "epoch": 98.38, "grad_norm": 5.237599849700928, "learning_rate": 1.62406015037594e-07, "loss": 0.2242, "step": 65420 }, { "epoch": 98.39, "grad_norm": 3.8604493141174316, "learning_rate": 1.6090225563909773e-07, "loss": 0.1789, "step": 65430 }, { "epoch": 98.41, "grad_norm": 4.205555438995361, "learning_rate": 1.5939849624060152e-07, "loss": 0.1286, "step": 65440 }, { "epoch": 98.42, "grad_norm": 6.994670867919922, "learning_rate": 1.5789473684210527e-07, "loss": 0.204, "step": 65450 }, { "epoch": 98.44, "grad_norm": 6.597128391265869, "learning_rate": 1.5639097744360906e-07, "loss": 0.1172, "step": 65460 }, { "epoch": 98.45, "grad_norm": 9.189870834350586, "learning_rate": 1.548872180451128e-07, "loss": 0.1601, "step": 65470 }, { "epoch": 98.47, "grad_norm": 5.771359920501709, "learning_rate": 1.5338345864661655e-07, "loss": 0.1457, "step": 65480 }, { "epoch": 98.48, "grad_norm": 6.4306840896606445, "learning_rate": 1.518796992481203e-07, "loss": 0.1385, "step": 65490 }, { "epoch": 98.5, "grad_norm": 3.8584532737731934, "learning_rate": 1.5037593984962406e-07, "loss": 0.1687, "step": 65500 }, { "epoch": 98.51, "grad_norm": 4.318583965301514, "learning_rate": 1.4887218045112784e-07, "loss": 0.1136, "step": 65510 }, { "epoch": 98.53, "grad_norm": 4.790411472320557, "learning_rate": 1.4736842105263158e-07, "loss": 0.176, "step": 65520 }, { "epoch": 98.54, "grad_norm": 7.689479351043701, "learning_rate": 1.4586466165413536e-07, "loss": 0.17, "step": 65530 }, { "epoch": 98.56, "grad_norm": 6.151116371154785, "learning_rate": 1.4436090225563912e-07, "loss": 0.1942, "step": 65540 }, { "epoch": 98.57, "grad_norm": 5.650688648223877, "learning_rate": 1.4285714285714287e-07, "loss": 0.2166, "step": 65550 }, { "epoch": 98.59, "grad_norm": 4.956643581390381, "learning_rate": 1.4135338345864663e-07, "loss": 0.1672, "step": 65560 }, { "epoch": 98.6, "grad_norm": 1.9977216720581055, "learning_rate": 1.398496240601504e-07, "loss": 0.153, "step": 65570 }, { "epoch": 98.62, "grad_norm": 6.504947185516357, "learning_rate": 1.3834586466165415e-07, "loss": 0.2146, "step": 65580 }, { "epoch": 98.63, "grad_norm": 3.9041435718536377, "learning_rate": 1.368421052631579e-07, "loss": 0.2097, "step": 65590 }, { "epoch": 98.65, "grad_norm": 6.294933795928955, "learning_rate": 1.3533834586466166e-07, "loss": 0.1532, "step": 65600 }, { "epoch": 98.66, "grad_norm": 3.49302339553833, "learning_rate": 1.3383458646616542e-07, "loss": 0.1407, "step": 65610 }, { "epoch": 98.68, "grad_norm": 6.846986293792725, "learning_rate": 1.323308270676692e-07, "loss": 0.1436, "step": 65620 }, { "epoch": 98.69, "grad_norm": 8.050239562988281, "learning_rate": 1.3082706766917293e-07, "loss": 0.1993, "step": 65630 }, { "epoch": 98.71, "grad_norm": 2.393629789352417, "learning_rate": 1.293233082706767e-07, "loss": 0.1998, "step": 65640 }, { "epoch": 98.72, "grad_norm": 3.4583864212036133, "learning_rate": 1.2781954887218047e-07, "loss": 0.2058, "step": 65650 }, { "epoch": 98.74, "grad_norm": 1.8351730108261108, "learning_rate": 1.2631578947368423e-07, "loss": 0.0967, "step": 65660 }, { "epoch": 98.75, "grad_norm": 4.4519195556640625, "learning_rate": 1.24812030075188e-07, "loss": 0.1455, "step": 65670 }, { "epoch": 98.77, "grad_norm": 3.725358009338379, "learning_rate": 1.2330827067669174e-07, "loss": 0.1497, "step": 65680 }, { "epoch": 98.78, "grad_norm": 5.022524356842041, "learning_rate": 1.218045112781955e-07, "loss": 0.2046, "step": 65690 }, { "epoch": 98.8, "grad_norm": 0.5539684295654297, "learning_rate": 1.2030075187969926e-07, "loss": 0.1661, "step": 65700 }, { "epoch": 98.81, "grad_norm": 5.730692386627197, "learning_rate": 1.1879699248120302e-07, "loss": 0.1907, "step": 65710 }, { "epoch": 98.83, "grad_norm": 4.800581455230713, "learning_rate": 1.1729323308270677e-07, "loss": 0.1923, "step": 65720 }, { "epoch": 98.84, "grad_norm": 7.648699760437012, "learning_rate": 1.1578947368421054e-07, "loss": 0.1876, "step": 65730 }, { "epoch": 98.86, "grad_norm": 6.9826860427856445, "learning_rate": 1.142857142857143e-07, "loss": 0.1962, "step": 65740 }, { "epoch": 98.87, "grad_norm": 4.280595779418945, "learning_rate": 1.1278195488721805e-07, "loss": 0.1995, "step": 65750 }, { "epoch": 98.89, "grad_norm": 9.39132308959961, "learning_rate": 1.1127819548872182e-07, "loss": 0.1467, "step": 65760 }, { "epoch": 98.9, "grad_norm": 8.573371887207031, "learning_rate": 1.0977443609022557e-07, "loss": 0.156, "step": 65770 }, { "epoch": 98.92, "grad_norm": 3.3703553676605225, "learning_rate": 1.0827067669172933e-07, "loss": 0.1639, "step": 65780 }, { "epoch": 98.93, "grad_norm": 5.628261089324951, "learning_rate": 1.067669172932331e-07, "loss": 0.1439, "step": 65790 }, { "epoch": 98.95, "grad_norm": 4.770748615264893, "learning_rate": 1.0526315789473685e-07, "loss": 0.1691, "step": 65800 }, { "epoch": 98.96, "grad_norm": 3.2969655990600586, "learning_rate": 1.037593984962406e-07, "loss": 0.129, "step": 65810 }, { "epoch": 98.98, "grad_norm": 5.129558563232422, "learning_rate": 1.0225563909774437e-07, "loss": 0.1381, "step": 65820 }, { "epoch": 98.99, "grad_norm": 7.410007953643799, "learning_rate": 1.0075187969924813e-07, "loss": 0.1525, "step": 65830 }, { "epoch": 99.0, "eval_accuracy": 0.9311, "eval_loss": 0.33068838715553284, "eval_runtime": 84.5618, "eval_samples_per_second": 118.257, "eval_steps_per_second": 0.473, "step": 65835 }, { "epoch": 99.01, "grad_norm": 4.0409674644470215, "learning_rate": 9.92481203007519e-08, "loss": 0.1796, "step": 65840 }, { "epoch": 99.02, "grad_norm": 6.996447563171387, "learning_rate": 9.774436090225564e-08, "loss": 0.1921, "step": 65850 }, { "epoch": 99.04, "grad_norm": 9.530542373657227, "learning_rate": 9.62406015037594e-08, "loss": 0.1522, "step": 65860 }, { "epoch": 99.05, "grad_norm": 1.6565377712249756, "learning_rate": 9.473684210526317e-08, "loss": 0.1283, "step": 65870 }, { "epoch": 99.07, "grad_norm": 4.351380348205566, "learning_rate": 9.323308270676693e-08, "loss": 0.1222, "step": 65880 }, { "epoch": 99.08, "grad_norm": 5.406339645385742, "learning_rate": 9.172932330827067e-08, "loss": 0.1445, "step": 65890 }, { "epoch": 99.1, "grad_norm": 3.567111015319824, "learning_rate": 9.022556390977444e-08, "loss": 0.0886, "step": 65900 }, { "epoch": 99.11, "grad_norm": 4.423712730407715, "learning_rate": 8.87218045112782e-08, "loss": 0.158, "step": 65910 }, { "epoch": 99.13, "grad_norm": 1.147392988204956, "learning_rate": 8.721804511278196e-08, "loss": 0.1439, "step": 65920 }, { "epoch": 99.14, "grad_norm": 4.6901936531066895, "learning_rate": 8.571428571428573e-08, "loss": 0.1749, "step": 65930 }, { "epoch": 99.16, "grad_norm": 9.2613525390625, "learning_rate": 8.421052631578947e-08, "loss": 0.1984, "step": 65940 }, { "epoch": 99.17, "grad_norm": 3.2010562419891357, "learning_rate": 8.270676691729323e-08, "loss": 0.115, "step": 65950 }, { "epoch": 99.19, "grad_norm": 2.4863340854644775, "learning_rate": 8.1203007518797e-08, "loss": 0.1331, "step": 65960 }, { "epoch": 99.2, "grad_norm": 2.9829752445220947, "learning_rate": 7.969924812030076e-08, "loss": 0.1423, "step": 65970 }, { "epoch": 99.22, "grad_norm": 4.100430011749268, "learning_rate": 7.819548872180453e-08, "loss": 0.1391, "step": 65980 }, { "epoch": 99.23, "grad_norm": 4.504250526428223, "learning_rate": 7.669172932330827e-08, "loss": 0.1529, "step": 65990 }, { "epoch": 99.25, "grad_norm": 3.5403831005096436, "learning_rate": 7.518796992481203e-08, "loss": 0.1882, "step": 66000 }, { "epoch": 99.26, "grad_norm": 4.3315253257751465, "learning_rate": 7.368421052631579e-08, "loss": 0.1514, "step": 66010 }, { "epoch": 99.28, "grad_norm": 5.822681903839111, "learning_rate": 7.218045112781956e-08, "loss": 0.1829, "step": 66020 }, { "epoch": 99.29, "grad_norm": 5.7567596435546875, "learning_rate": 7.067669172932332e-08, "loss": 0.1862, "step": 66030 }, { "epoch": 99.31, "grad_norm": 7.93897819519043, "learning_rate": 6.917293233082707e-08, "loss": 0.1586, "step": 66040 }, { "epoch": 99.32, "grad_norm": 2.92777156829834, "learning_rate": 6.766917293233083e-08, "loss": 0.1325, "step": 66050 }, { "epoch": 99.34, "grad_norm": 5.535176753997803, "learning_rate": 6.61654135338346e-08, "loss": 0.192, "step": 66060 }, { "epoch": 99.35, "grad_norm": 6.798808574676514, "learning_rate": 6.466165413533834e-08, "loss": 0.171, "step": 66070 }, { "epoch": 99.37, "grad_norm": 7.453383922576904, "learning_rate": 6.315789473684211e-08, "loss": 0.2143, "step": 66080 }, { "epoch": 99.38, "grad_norm": 5.353199005126953, "learning_rate": 6.165413533834587e-08, "loss": 0.165, "step": 66090 }, { "epoch": 99.4, "grad_norm": 3.9845528602600098, "learning_rate": 6.015037593984963e-08, "loss": 0.1643, "step": 66100 }, { "epoch": 99.41, "grad_norm": 10.434816360473633, "learning_rate": 5.864661654135339e-08, "loss": 0.2099, "step": 66110 }, { "epoch": 99.43, "grad_norm": 3.1787760257720947, "learning_rate": 5.714285714285715e-08, "loss": 0.1539, "step": 66120 }, { "epoch": 99.44, "grad_norm": 6.48075532913208, "learning_rate": 5.563909774436091e-08, "loss": 0.1741, "step": 66130 }, { "epoch": 99.46, "grad_norm": 3.269533634185791, "learning_rate": 5.4135338345864665e-08, "loss": 0.1438, "step": 66140 }, { "epoch": 99.47, "grad_norm": 8.6683988571167, "learning_rate": 5.263157894736842e-08, "loss": 0.1949, "step": 66150 }, { "epoch": 99.49, "grad_norm": 4.051426887512207, "learning_rate": 5.1127819548872186e-08, "loss": 0.1634, "step": 66160 }, { "epoch": 99.5, "grad_norm": 6.472201347351074, "learning_rate": 4.962406015037595e-08, "loss": 0.1474, "step": 66170 }, { "epoch": 99.52, "grad_norm": 4.984071731567383, "learning_rate": 4.81203007518797e-08, "loss": 0.1707, "step": 66180 }, { "epoch": 99.53, "grad_norm": 5.94837760925293, "learning_rate": 4.6616541353383465e-08, "loss": 0.2083, "step": 66190 }, { "epoch": 99.55, "grad_norm": 5.263061046600342, "learning_rate": 4.511278195488722e-08, "loss": 0.1573, "step": 66200 }, { "epoch": 99.56, "grad_norm": 5.428894996643066, "learning_rate": 4.360902255639098e-08, "loss": 0.1753, "step": 66210 }, { "epoch": 99.58, "grad_norm": 6.092723846435547, "learning_rate": 4.2105263157894737e-08, "loss": 0.1658, "step": 66220 }, { "epoch": 99.59, "grad_norm": 5.707245826721191, "learning_rate": 4.06015037593985e-08, "loss": 0.1583, "step": 66230 }, { "epoch": 99.61, "grad_norm": 5.840432167053223, "learning_rate": 3.9097744360902264e-08, "loss": 0.2175, "step": 66240 }, { "epoch": 99.62, "grad_norm": 5.600442409515381, "learning_rate": 3.7593984962406015e-08, "loss": 0.2012, "step": 66250 }, { "epoch": 99.64, "grad_norm": 8.73216724395752, "learning_rate": 3.609022556390978e-08, "loss": 0.1126, "step": 66260 }, { "epoch": 99.65, "grad_norm": 4.184023857116699, "learning_rate": 3.4586466165413536e-08, "loss": 0.1326, "step": 66270 }, { "epoch": 99.67, "grad_norm": 2.4098730087280273, "learning_rate": 3.30827067669173e-08, "loss": 0.1219, "step": 66280 }, { "epoch": 99.68, "grad_norm": 6.737592697143555, "learning_rate": 3.157894736842106e-08, "loss": 0.1788, "step": 66290 }, { "epoch": 99.7, "grad_norm": 3.4307661056518555, "learning_rate": 3.0075187969924815e-08, "loss": 0.1677, "step": 66300 }, { "epoch": 99.71, "grad_norm": 6.838919162750244, "learning_rate": 2.8571428571428575e-08, "loss": 0.156, "step": 66310 }, { "epoch": 99.73, "grad_norm": 3.750256061553955, "learning_rate": 2.7067669172932333e-08, "loss": 0.1118, "step": 66320 }, { "epoch": 99.74, "grad_norm": 3.541330099105835, "learning_rate": 2.5563909774436093e-08, "loss": 0.1467, "step": 66330 }, { "epoch": 99.76, "grad_norm": 6.930870532989502, "learning_rate": 2.406015037593985e-08, "loss": 0.1469, "step": 66340 }, { "epoch": 99.77, "grad_norm": 4.462535858154297, "learning_rate": 2.255639097744361e-08, "loss": 0.1693, "step": 66350 }, { "epoch": 99.79, "grad_norm": 16.731098175048828, "learning_rate": 2.1052631578947368e-08, "loss": 0.1813, "step": 66360 }, { "epoch": 99.8, "grad_norm": 4.853457450866699, "learning_rate": 1.9548872180451132e-08, "loss": 0.1693, "step": 66370 }, { "epoch": 99.82, "grad_norm": 8.614503860473633, "learning_rate": 1.804511278195489e-08, "loss": 0.1212, "step": 66380 }, { "epoch": 99.83, "grad_norm": 1.8390692472457886, "learning_rate": 1.654135338345865e-08, "loss": 0.1156, "step": 66390 }, { "epoch": 99.85, "grad_norm": 3.2085022926330566, "learning_rate": 1.5037593984962407e-08, "loss": 0.1469, "step": 66400 }, { "epoch": 99.86, "grad_norm": 4.798015594482422, "learning_rate": 1.3533834586466166e-08, "loss": 0.1651, "step": 66410 }, { "epoch": 99.88, "grad_norm": 5.5648322105407715, "learning_rate": 1.2030075187969925e-08, "loss": 0.1682, "step": 66420 }, { "epoch": 99.89, "grad_norm": 4.767200469970703, "learning_rate": 1.0526315789473684e-08, "loss": 0.2008, "step": 66430 }, { "epoch": 99.91, "grad_norm": 2.938972234725952, "learning_rate": 9.022556390977445e-09, "loss": 0.1684, "step": 66440 }, { "epoch": 99.92, "grad_norm": 7.364297389984131, "learning_rate": 7.518796992481204e-09, "loss": 0.1327, "step": 66450 }, { "epoch": 99.94, "grad_norm": 4.032517433166504, "learning_rate": 6.015037593984963e-09, "loss": 0.1577, "step": 66460 }, { "epoch": 99.95, "grad_norm": 5.099705219268799, "learning_rate": 4.511278195488722e-09, "loss": 0.1541, "step": 66470 }, { "epoch": 99.97, "grad_norm": 5.294932842254639, "learning_rate": 3.0075187969924813e-09, "loss": 0.147, "step": 66480 }, { "epoch": 99.98, "grad_norm": 5.263985633850098, "learning_rate": 1.5037593984962407e-09, "loss": 0.0833, "step": 66490 }, { "epoch": 100.0, "grad_norm": 0.18724946677684784, "learning_rate": 0.0, "loss": 0.1471, "step": 66500 }, { "epoch": 100.0, "eval_accuracy": 0.9309, "eval_loss": 0.33010387420654297, "eval_runtime": 84.7295, "eval_samples_per_second": 118.023, "eval_steps_per_second": 0.472, "step": 66500 }, { "epoch": 100.0, "step": 66500, "total_flos": 1.1646058381332455e+21, "train_loss": 0.2947179788530321, "train_runtime": 117326.6726, "train_samples_per_second": 36.224, "train_steps_per_second": 0.567 } ], "logging_steps": 10, "max_steps": 66500, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "total_flos": 1.1646058381332455e+21, "train_batch_size": 64, "trial_name": null, "trial_params": null }