{ "best_metric": 0.8707767328456983, "best_model_checkpoint": "hybrid-cnn-vit/checkpoint-2020", "epoch": 10.0, "eval_steps": 500, "global_step": 2020, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 3.4049220085144043, "learning_rate": 2.4752475247524753e-06, "loss": 0.75, "step": 10 }, { "epoch": 0.1, "grad_norm": 2.486193895339966, "learning_rate": 4.950495049504951e-06, "loss": 0.7094, "step": 20 }, { "epoch": 0.15, "grad_norm": 2.687983989715576, "learning_rate": 7.4257425742574256e-06, "loss": 0.633, "step": 30 }, { "epoch": 0.2, "grad_norm": 6.435194969177246, "learning_rate": 9.900990099009901e-06, "loss": 0.5843, "step": 40 }, { "epoch": 0.25, "grad_norm": 5.748059272766113, "learning_rate": 1.2376237623762377e-05, "loss": 0.5832, "step": 50 }, { "epoch": 0.3, "grad_norm": 3.351829767227173, "learning_rate": 1.4851485148514851e-05, "loss": 0.5679, "step": 60 }, { "epoch": 0.35, "grad_norm": 5.649282455444336, "learning_rate": 1.7326732673267325e-05, "loss": 0.5399, "step": 70 }, { "epoch": 0.4, "grad_norm": 3.6417155265808105, "learning_rate": 1.9801980198019803e-05, "loss": 0.516, "step": 80 }, { "epoch": 0.45, "grad_norm": 3.760568857192993, "learning_rate": 2.227722772277228e-05, "loss": 0.5005, "step": 90 }, { "epoch": 0.5, "grad_norm": 4.888506889343262, "learning_rate": 2.4752475247524754e-05, "loss": 0.5184, "step": 100 }, { "epoch": 0.54, "grad_norm": 7.107848644256592, "learning_rate": 2.722772277227723e-05, "loss": 0.5198, "step": 110 }, { "epoch": 0.59, "grad_norm": 2.687451124191284, "learning_rate": 2.9702970297029702e-05, "loss": 0.5292, "step": 120 }, { "epoch": 0.64, "grad_norm": 2.2040014266967773, "learning_rate": 3.217821782178218e-05, "loss": 0.5167, "step": 130 }, { "epoch": 0.69, "grad_norm": 7.79260778427124, "learning_rate": 3.465346534653465e-05, "loss": 0.5081, "step": 140 }, { "epoch": 0.74, "grad_norm": 4.946630954742432, "learning_rate": 3.712871287128713e-05, "loss": 0.4637, "step": 150 }, { "epoch": 0.79, "grad_norm": 4.178718090057373, "learning_rate": 3.9603960396039605e-05, "loss": 0.4939, "step": 160 }, { "epoch": 0.84, "grad_norm": 2.5444717407226562, "learning_rate": 4.207920792079208e-05, "loss": 0.4808, "step": 170 }, { "epoch": 0.89, "grad_norm": 5.634459972381592, "learning_rate": 4.455445544554456e-05, "loss": 0.4815, "step": 180 }, { "epoch": 0.94, "grad_norm": 7.709349632263184, "learning_rate": 4.702970297029703e-05, "loss": 0.5112, "step": 190 }, { "epoch": 0.99, "grad_norm": 1.794649600982666, "learning_rate": 4.950495049504951e-05, "loss": 0.5277, "step": 200 }, { "epoch": 1.0, "eval_accuracy": 0.8209683037269244, "eval_loss": 0.39027276635169983, "eval_runtime": 34.4714, "eval_samples_per_second": 83.286, "eval_steps_per_second": 2.611, "step": 202 }, { "epoch": 1.04, "grad_norm": 6.730863571166992, "learning_rate": 4.977997799779978e-05, "loss": 0.4614, "step": 210 }, { "epoch": 1.09, "grad_norm": 3.0025227069854736, "learning_rate": 4.950495049504951e-05, "loss": 0.4952, "step": 220 }, { "epoch": 1.14, "grad_norm": 2.116140842437744, "learning_rate": 4.9229922992299234e-05, "loss": 0.4742, "step": 230 }, { "epoch": 1.19, "grad_norm": 2.0951218605041504, "learning_rate": 4.895489548954896e-05, "loss": 0.47, "step": 240 }, { "epoch": 1.24, "grad_norm": 9.745083808898926, "learning_rate": 4.867986798679868e-05, "loss": 0.4991, "step": 250 }, { "epoch": 1.29, "grad_norm": 3.8443119525909424, "learning_rate": 4.8404840484048406e-05, "loss": 0.5179, "step": 260 }, { "epoch": 1.34, "grad_norm": 2.324491262435913, "learning_rate": 4.812981298129813e-05, "loss": 0.4431, "step": 270 }, { "epoch": 1.39, "grad_norm": 4.08018684387207, "learning_rate": 4.785478547854786e-05, "loss": 0.4497, "step": 280 }, { "epoch": 1.44, "grad_norm": 4.045833587646484, "learning_rate": 4.7579757975797585e-05, "loss": 0.4923, "step": 290 }, { "epoch": 1.49, "grad_norm": 5.029973983764648, "learning_rate": 4.730473047304731e-05, "loss": 0.4931, "step": 300 }, { "epoch": 1.53, "grad_norm": 3.3357350826263428, "learning_rate": 4.702970297029703e-05, "loss": 0.4768, "step": 310 }, { "epoch": 1.58, "grad_norm": 6.275207042694092, "learning_rate": 4.675467546754676e-05, "loss": 0.4646, "step": 320 }, { "epoch": 1.63, "grad_norm": 3.9513421058654785, "learning_rate": 4.647964796479648e-05, "loss": 0.4692, "step": 330 }, { "epoch": 1.68, "grad_norm": 2.237048625946045, "learning_rate": 4.62046204620462e-05, "loss": 0.4692, "step": 340 }, { "epoch": 1.73, "grad_norm": 1.937572717666626, "learning_rate": 4.592959295929593e-05, "loss": 0.4475, "step": 350 }, { "epoch": 1.78, "grad_norm": 1.6712441444396973, "learning_rate": 4.5654565456545655e-05, "loss": 0.465, "step": 360 }, { "epoch": 1.83, "grad_norm": 2.3434207439422607, "learning_rate": 4.537953795379538e-05, "loss": 0.473, "step": 370 }, { "epoch": 1.88, "grad_norm": 2.657801389694214, "learning_rate": 4.510451045104511e-05, "loss": 0.456, "step": 380 }, { "epoch": 1.93, "grad_norm": 1.898109793663025, "learning_rate": 4.4829482948294834e-05, "loss": 0.4769, "step": 390 }, { "epoch": 1.98, "grad_norm": 3.6151821613311768, "learning_rate": 4.455445544554456e-05, "loss": 0.4623, "step": 400 }, { "epoch": 2.0, "eval_accuracy": 0.8415186346220829, "eval_loss": 0.3477635383605957, "eval_runtime": 34.2991, "eval_samples_per_second": 83.705, "eval_steps_per_second": 2.624, "step": 404 }, { "epoch": 2.03, "grad_norm": 1.7920018434524536, "learning_rate": 4.427942794279428e-05, "loss": 0.451, "step": 410 }, { "epoch": 2.08, "grad_norm": 3.742645502090454, "learning_rate": 4.4004400440044006e-05, "loss": 0.4107, "step": 420 }, { "epoch": 2.13, "grad_norm": 3.6825478076934814, "learning_rate": 4.372937293729373e-05, "loss": 0.4103, "step": 430 }, { "epoch": 2.18, "grad_norm": 2.2625832557678223, "learning_rate": 4.345434543454346e-05, "loss": 0.4334, "step": 440 }, { "epoch": 2.23, "grad_norm": 2.2035701274871826, "learning_rate": 4.3179317931793185e-05, "loss": 0.4155, "step": 450 }, { "epoch": 2.28, "grad_norm": 4.748138904571533, "learning_rate": 4.2904290429042904e-05, "loss": 0.4452, "step": 460 }, { "epoch": 2.33, "grad_norm": 2.5996172428131104, "learning_rate": 4.262926292629263e-05, "loss": 0.4156, "step": 470 }, { "epoch": 2.38, "grad_norm": 1.9976857900619507, "learning_rate": 4.2354235423542356e-05, "loss": 0.4161, "step": 480 }, { "epoch": 2.43, "grad_norm": 4.618232250213623, "learning_rate": 4.207920792079208e-05, "loss": 0.439, "step": 490 }, { "epoch": 2.48, "grad_norm": 4.22369384765625, "learning_rate": 4.18041804180418e-05, "loss": 0.4711, "step": 500 }, { "epoch": 2.52, "grad_norm": 4.014861106872559, "learning_rate": 4.152915291529153e-05, "loss": 0.4502, "step": 510 }, { "epoch": 2.57, "grad_norm": 1.90145742893219, "learning_rate": 4.1254125412541255e-05, "loss": 0.4216, "step": 520 }, { "epoch": 2.62, "grad_norm": 1.8490346670150757, "learning_rate": 4.097909790979098e-05, "loss": 0.3942, "step": 530 }, { "epoch": 2.67, "grad_norm": 2.4955575466156006, "learning_rate": 4.070407040704071e-05, "loss": 0.4037, "step": 540 }, { "epoch": 2.72, "grad_norm": 2.651855945587158, "learning_rate": 4.042904290429043e-05, "loss": 0.4208, "step": 550 }, { "epoch": 2.77, "grad_norm": 3.947380781173706, "learning_rate": 4.015401540154016e-05, "loss": 0.4216, "step": 560 }, { "epoch": 2.82, "grad_norm": 3.179884433746338, "learning_rate": 3.987898789878988e-05, "loss": 0.4367, "step": 570 }, { "epoch": 2.87, "grad_norm": 2.080159902572632, "learning_rate": 3.9603960396039605e-05, "loss": 0.4239, "step": 580 }, { "epoch": 2.92, "grad_norm": 2.788001775741577, "learning_rate": 3.932893289328933e-05, "loss": 0.4063, "step": 590 }, { "epoch": 2.97, "grad_norm": 2.00004243850708, "learning_rate": 3.905390539053906e-05, "loss": 0.4497, "step": 600 }, { "epoch": 3.0, "eval_accuracy": 0.8519679554162313, "eval_loss": 0.3333553075790405, "eval_runtime": 34.2147, "eval_samples_per_second": 83.911, "eval_steps_per_second": 2.63, "step": 606 }, { "epoch": 3.02, "grad_norm": 2.456493854522705, "learning_rate": 3.877887788778878e-05, "loss": 0.4136, "step": 610 }, { "epoch": 3.07, "grad_norm": 2.922100305557251, "learning_rate": 3.8503850385038503e-05, "loss": 0.4026, "step": 620 }, { "epoch": 3.12, "grad_norm": 2.9372498989105225, "learning_rate": 3.822882288228823e-05, "loss": 0.4093, "step": 630 }, { "epoch": 3.17, "grad_norm": 3.000676393508911, "learning_rate": 3.7953795379537956e-05, "loss": 0.3901, "step": 640 }, { "epoch": 3.22, "grad_norm": 2.4853076934814453, "learning_rate": 3.767876787678768e-05, "loss": 0.3956, "step": 650 }, { "epoch": 3.27, "grad_norm": 2.6341211795806885, "learning_rate": 3.74037403740374e-05, "loss": 0.4142, "step": 660 }, { "epoch": 3.32, "grad_norm": 2.0509798526763916, "learning_rate": 3.712871287128713e-05, "loss": 0.3922, "step": 670 }, { "epoch": 3.37, "grad_norm": 2.622851848602295, "learning_rate": 3.6853685368536854e-05, "loss": 0.4163, "step": 680 }, { "epoch": 3.42, "grad_norm": 2.008538007736206, "learning_rate": 3.657865786578658e-05, "loss": 0.3921, "step": 690 }, { "epoch": 3.47, "grad_norm": 2.5022664070129395, "learning_rate": 3.6303630363036307e-05, "loss": 0.3626, "step": 700 }, { "epoch": 3.51, "grad_norm": 3.2395951747894287, "learning_rate": 3.602860286028603e-05, "loss": 0.4153, "step": 710 }, { "epoch": 3.56, "grad_norm": 2.134999990463257, "learning_rate": 3.575357535753576e-05, "loss": 0.3853, "step": 720 }, { "epoch": 3.61, "grad_norm": 3.0224032402038574, "learning_rate": 3.5478547854785485e-05, "loss": 0.3913, "step": 730 }, { "epoch": 3.66, "grad_norm": 2.18223237991333, "learning_rate": 3.5203520352035205e-05, "loss": 0.4183, "step": 740 }, { "epoch": 3.71, "grad_norm": 2.258599281311035, "learning_rate": 3.492849284928493e-05, "loss": 0.397, "step": 750 }, { "epoch": 3.76, "grad_norm": 2.017249822616577, "learning_rate": 3.465346534653465e-05, "loss": 0.3807, "step": 760 }, { "epoch": 3.81, "grad_norm": 2.045938491821289, "learning_rate": 3.4378437843784377e-05, "loss": 0.4119, "step": 770 }, { "epoch": 3.86, "grad_norm": 5.812851428985596, "learning_rate": 3.41034103410341e-05, "loss": 0.4088, "step": 780 }, { "epoch": 3.91, "grad_norm": 1.954240083694458, "learning_rate": 3.382838283828383e-05, "loss": 0.3862, "step": 790 }, { "epoch": 3.96, "grad_norm": 3.0154707431793213, "learning_rate": 3.3553355335533555e-05, "loss": 0.4074, "step": 800 }, { "epoch": 4.0, "eval_accuracy": 0.8460466736328806, "eval_loss": 0.33968210220336914, "eval_runtime": 34.3325, "eval_samples_per_second": 83.623, "eval_steps_per_second": 2.621, "step": 808 }, { "epoch": 4.01, "grad_norm": 2.900836706161499, "learning_rate": 3.327832783278328e-05, "loss": 0.3948, "step": 810 }, { "epoch": 4.06, "grad_norm": 3.435845375061035, "learning_rate": 3.300330033003301e-05, "loss": 0.3919, "step": 820 }, { "epoch": 4.11, "grad_norm": 2.006223201751709, "learning_rate": 3.272827282728273e-05, "loss": 0.3892, "step": 830 }, { "epoch": 4.16, "grad_norm": 3.857332229614258, "learning_rate": 3.2453245324532453e-05, "loss": 0.3901, "step": 840 }, { "epoch": 4.21, "grad_norm": 3.8714852333068848, "learning_rate": 3.217821782178218e-05, "loss": 0.3751, "step": 850 }, { "epoch": 4.26, "grad_norm": 1.7595456838607788, "learning_rate": 3.1903190319031906e-05, "loss": 0.3789, "step": 860 }, { "epoch": 4.31, "grad_norm": 2.590575695037842, "learning_rate": 3.162816281628163e-05, "loss": 0.3714, "step": 870 }, { "epoch": 4.36, "grad_norm": 2.044952154159546, "learning_rate": 3.135313531353136e-05, "loss": 0.394, "step": 880 }, { "epoch": 4.41, "grad_norm": 2.6872971057891846, "learning_rate": 3.1078107810781085e-05, "loss": 0.3501, "step": 890 }, { "epoch": 4.46, "grad_norm": 4.981396675109863, "learning_rate": 3.0803080308030804e-05, "loss": 0.3813, "step": 900 }, { "epoch": 4.5, "grad_norm": 2.0973191261291504, "learning_rate": 3.052805280528053e-05, "loss": 0.3627, "step": 910 }, { "epoch": 4.55, "grad_norm": 2.8159642219543457, "learning_rate": 3.0253025302530253e-05, "loss": 0.372, "step": 920 }, { "epoch": 4.6, "grad_norm": 4.265486240386963, "learning_rate": 2.9977997799779976e-05, "loss": 0.3565, "step": 930 }, { "epoch": 4.65, "grad_norm": 1.922487735748291, "learning_rate": 2.9702970297029702e-05, "loss": 0.3528, "step": 940 }, { "epoch": 4.7, "grad_norm": 2.1887781620025635, "learning_rate": 2.942794279427943e-05, "loss": 0.346, "step": 950 }, { "epoch": 4.75, "grad_norm": 2.510910749435425, "learning_rate": 2.9152915291529155e-05, "loss": 0.3465, "step": 960 }, { "epoch": 4.8, "grad_norm": 2.1515932083129883, "learning_rate": 2.8877887788778878e-05, "loss": 0.3558, "step": 970 }, { "epoch": 4.85, "grad_norm": 3.1407530307769775, "learning_rate": 2.8602860286028604e-05, "loss": 0.3909, "step": 980 }, { "epoch": 4.9, "grad_norm": 2.861525535583496, "learning_rate": 2.832783278327833e-05, "loss": 0.3804, "step": 990 }, { "epoch": 4.95, "grad_norm": 2.334779739379883, "learning_rate": 2.8052805280528056e-05, "loss": 0.3362, "step": 1000 }, { "epoch": 5.0, "grad_norm": 2.414121150970459, "learning_rate": 2.777777777777778e-05, "loss": 0.3552, "step": 1010 }, { "epoch": 5.0, "eval_accuracy": 0.8624172762103797, "eval_loss": 0.3226765990257263, "eval_runtime": 34.3538, "eval_samples_per_second": 83.572, "eval_steps_per_second": 2.62, "step": 1010 }, { "epoch": 5.05, "grad_norm": 3.0383219718933105, "learning_rate": 2.7502750275027505e-05, "loss": 0.3375, "step": 1020 }, { "epoch": 5.1, "grad_norm": 2.1559934616088867, "learning_rate": 2.722772277227723e-05, "loss": 0.347, "step": 1030 }, { "epoch": 5.15, "grad_norm": 2.3701679706573486, "learning_rate": 2.6952695269526958e-05, "loss": 0.3319, "step": 1040 }, { "epoch": 5.2, "grad_norm": 3.0003325939178467, "learning_rate": 2.667766776677668e-05, "loss": 0.342, "step": 1050 }, { "epoch": 5.25, "grad_norm": 3.154733657836914, "learning_rate": 2.64026402640264e-05, "loss": 0.3753, "step": 1060 }, { "epoch": 5.3, "grad_norm": 2.575256824493408, "learning_rate": 2.6127612761276126e-05, "loss": 0.3037, "step": 1070 }, { "epoch": 5.35, "grad_norm": 2.572767734527588, "learning_rate": 2.5852585258525853e-05, "loss": 0.3334, "step": 1080 }, { "epoch": 5.4, "grad_norm": 2.2883052825927734, "learning_rate": 2.557755775577558e-05, "loss": 0.3367, "step": 1090 }, { "epoch": 5.45, "grad_norm": 2.747040271759033, "learning_rate": 2.53025302530253e-05, "loss": 0.3135, "step": 1100 }, { "epoch": 5.5, "grad_norm": 3.9614763259887695, "learning_rate": 2.5027502750275028e-05, "loss": 0.3513, "step": 1110 }, { "epoch": 5.54, "grad_norm": 3.847574234008789, "learning_rate": 2.4752475247524754e-05, "loss": 0.3443, "step": 1120 }, { "epoch": 5.59, "grad_norm": 2.754849433898926, "learning_rate": 2.447744774477448e-05, "loss": 0.3472, "step": 1130 }, { "epoch": 5.64, "grad_norm": 2.95867657661438, "learning_rate": 2.4202420242024203e-05, "loss": 0.3424, "step": 1140 }, { "epoch": 5.69, "grad_norm": 3.1151840686798096, "learning_rate": 2.392739273927393e-05, "loss": 0.3431, "step": 1150 }, { "epoch": 5.74, "grad_norm": 2.3718581199645996, "learning_rate": 2.3652365236523656e-05, "loss": 0.3366, "step": 1160 }, { "epoch": 5.79, "grad_norm": 2.323246479034424, "learning_rate": 2.337733773377338e-05, "loss": 0.3162, "step": 1170 }, { "epoch": 5.84, "grad_norm": 2.7909843921661377, "learning_rate": 2.31023102310231e-05, "loss": 0.3521, "step": 1180 }, { "epoch": 5.89, "grad_norm": 3.047494888305664, "learning_rate": 2.2827282728272828e-05, "loss": 0.348, "step": 1190 }, { "epoch": 5.94, "grad_norm": 2.6819052696228027, "learning_rate": 2.2552255225522554e-05, "loss": 0.3573, "step": 1200 }, { "epoch": 5.99, "grad_norm": 2.541433334350586, "learning_rate": 2.227722772277228e-05, "loss": 0.3637, "step": 1210 }, { "epoch": 6.0, "eval_accuracy": 0.8617206548241031, "eval_loss": 0.3230111300945282, "eval_runtime": 34.3385, "eval_samples_per_second": 83.609, "eval_steps_per_second": 2.621, "step": 1212 }, { "epoch": 6.04, "grad_norm": 4.097253322601318, "learning_rate": 2.2002200220022003e-05, "loss": 0.3407, "step": 1220 }, { "epoch": 6.09, "grad_norm": 2.9143059253692627, "learning_rate": 2.172717271727173e-05, "loss": 0.3196, "step": 1230 }, { "epoch": 6.14, "grad_norm": 3.2299258708953857, "learning_rate": 2.1452145214521452e-05, "loss": 0.3093, "step": 1240 }, { "epoch": 6.19, "grad_norm": 3.449723958969116, "learning_rate": 2.1177117711771178e-05, "loss": 0.3107, "step": 1250 }, { "epoch": 6.24, "grad_norm": 4.260531425476074, "learning_rate": 2.09020902090209e-05, "loss": 0.3447, "step": 1260 }, { "epoch": 6.29, "grad_norm": 2.3418753147125244, "learning_rate": 2.0627062706270627e-05, "loss": 0.2984, "step": 1270 }, { "epoch": 6.34, "grad_norm": 2.2192769050598145, "learning_rate": 2.0352035203520354e-05, "loss": 0.2918, "step": 1280 }, { "epoch": 6.39, "grad_norm": 2.858307123184204, "learning_rate": 2.007700770077008e-05, "loss": 0.3481, "step": 1290 }, { "epoch": 6.44, "grad_norm": 2.215548038482666, "learning_rate": 1.9801980198019803e-05, "loss": 0.3107, "step": 1300 }, { "epoch": 6.49, "grad_norm": 3.317269802093506, "learning_rate": 1.952695269526953e-05, "loss": 0.3133, "step": 1310 }, { "epoch": 6.53, "grad_norm": 3.2768638134002686, "learning_rate": 1.9251925192519252e-05, "loss": 0.2935, "step": 1320 }, { "epoch": 6.58, "grad_norm": 3.264039993286133, "learning_rate": 1.8976897689768978e-05, "loss": 0.3061, "step": 1330 }, { "epoch": 6.63, "grad_norm": 4.440883159637451, "learning_rate": 1.87018701870187e-05, "loss": 0.335, "step": 1340 }, { "epoch": 6.68, "grad_norm": 2.5896713733673096, "learning_rate": 1.8426842684268427e-05, "loss": 0.3186, "step": 1350 }, { "epoch": 6.73, "grad_norm": 3.132344961166382, "learning_rate": 1.8151815181518153e-05, "loss": 0.3179, "step": 1360 }, { "epoch": 6.78, "grad_norm": 2.9426112174987793, "learning_rate": 1.787678767876788e-05, "loss": 0.3051, "step": 1370 }, { "epoch": 6.83, "grad_norm": 3.3997907638549805, "learning_rate": 1.7601760176017602e-05, "loss": 0.3032, "step": 1380 }, { "epoch": 6.88, "grad_norm": 3.3179874420166016, "learning_rate": 1.7326732673267325e-05, "loss": 0.3073, "step": 1390 }, { "epoch": 6.93, "grad_norm": 3.1379783153533936, "learning_rate": 1.705170517051705e-05, "loss": 0.3032, "step": 1400 }, { "epoch": 6.98, "grad_norm": 2.739069700241089, "learning_rate": 1.6776677667766778e-05, "loss": 0.3316, "step": 1410 }, { "epoch": 7.0, "eval_accuracy": 0.8672936259143156, "eval_loss": 0.31887274980545044, "eval_runtime": 34.4865, "eval_samples_per_second": 83.25, "eval_steps_per_second": 2.61, "step": 1414 }, { "epoch": 7.03, "grad_norm": 2.8164913654327393, "learning_rate": 1.6501650165016504e-05, "loss": 0.2943, "step": 1420 }, { "epoch": 7.08, "grad_norm": 2.99489688873291, "learning_rate": 1.6226622662266227e-05, "loss": 0.286, "step": 1430 }, { "epoch": 7.13, "grad_norm": 2.5896992683410645, "learning_rate": 1.5951595159515953e-05, "loss": 0.2976, "step": 1440 }, { "epoch": 7.18, "grad_norm": 2.5396080017089844, "learning_rate": 1.567656765676568e-05, "loss": 0.2813, "step": 1450 }, { "epoch": 7.23, "grad_norm": 3.6244957447052, "learning_rate": 1.5401540154015402e-05, "loss": 0.2735, "step": 1460 }, { "epoch": 7.28, "grad_norm": 3.4044344425201416, "learning_rate": 1.5126512651265127e-05, "loss": 0.288, "step": 1470 }, { "epoch": 7.33, "grad_norm": 3.5277624130249023, "learning_rate": 1.4851485148514851e-05, "loss": 0.3093, "step": 1480 }, { "epoch": 7.38, "grad_norm": 3.3465511798858643, "learning_rate": 1.4576457645764577e-05, "loss": 0.2734, "step": 1490 }, { "epoch": 7.43, "grad_norm": 3.2350246906280518, "learning_rate": 1.4301430143014302e-05, "loss": 0.2807, "step": 1500 }, { "epoch": 7.48, "grad_norm": 2.3579447269439697, "learning_rate": 1.4026402640264028e-05, "loss": 0.2724, "step": 1510 }, { "epoch": 7.52, "grad_norm": 3.0606653690338135, "learning_rate": 1.3751375137513753e-05, "loss": 0.2754, "step": 1520 }, { "epoch": 7.57, "grad_norm": 3.078380584716797, "learning_rate": 1.3476347634763479e-05, "loss": 0.2796, "step": 1530 }, { "epoch": 7.62, "grad_norm": 3.4750142097473145, "learning_rate": 1.32013201320132e-05, "loss": 0.2913, "step": 1540 }, { "epoch": 7.67, "grad_norm": 2.9180691242218018, "learning_rate": 1.2926292629262926e-05, "loss": 0.2556, "step": 1550 }, { "epoch": 7.72, "grad_norm": 3.396652936935425, "learning_rate": 1.265126512651265e-05, "loss": 0.2898, "step": 1560 }, { "epoch": 7.77, "grad_norm": 3.599966287612915, "learning_rate": 1.2376237623762377e-05, "loss": 0.2612, "step": 1570 }, { "epoch": 7.82, "grad_norm": 3.002462387084961, "learning_rate": 1.2101210121012102e-05, "loss": 0.2932, "step": 1580 }, { "epoch": 7.87, "grad_norm": 3.0669169425964355, "learning_rate": 1.1826182618261828e-05, "loss": 0.3011, "step": 1590 }, { "epoch": 7.92, "grad_norm": 2.8159451484680176, "learning_rate": 1.155115511551155e-05, "loss": 0.259, "step": 1600 }, { "epoch": 7.97, "grad_norm": 2.9513814449310303, "learning_rate": 1.1276127612761277e-05, "loss": 0.31, "step": 1610 }, { "epoch": 8.0, "eval_accuracy": 0.8491814698711251, "eval_loss": 0.380447119474411, "eval_runtime": 34.2815, "eval_samples_per_second": 83.748, "eval_steps_per_second": 2.625, "step": 1616 }, { "epoch": 8.02, "grad_norm": 3.6924493312835693, "learning_rate": 1.1001100110011001e-05, "loss": 0.2748, "step": 1620 }, { "epoch": 8.07, "grad_norm": 2.8759872913360596, "learning_rate": 1.0726072607260726e-05, "loss": 0.2577, "step": 1630 }, { "epoch": 8.12, "grad_norm": 4.450769424438477, "learning_rate": 1.045104510451045e-05, "loss": 0.2568, "step": 1640 }, { "epoch": 8.17, "grad_norm": 3.2912204265594482, "learning_rate": 1.0176017601760177e-05, "loss": 0.2741, "step": 1650 }, { "epoch": 8.22, "grad_norm": 3.2611083984375, "learning_rate": 9.900990099009901e-06, "loss": 0.2713, "step": 1660 }, { "epoch": 8.27, "grad_norm": 3.5394034385681152, "learning_rate": 9.625962596259626e-06, "loss": 0.2614, "step": 1670 }, { "epoch": 8.32, "grad_norm": 2.817716121673584, "learning_rate": 9.35093509350935e-06, "loss": 0.2278, "step": 1680 }, { "epoch": 8.37, "grad_norm": 2.6742143630981445, "learning_rate": 9.075907590759077e-06, "loss": 0.2568, "step": 1690 }, { "epoch": 8.42, "grad_norm": 3.2295913696289062, "learning_rate": 8.800880088008801e-06, "loss": 0.2782, "step": 1700 }, { "epoch": 8.47, "grad_norm": 2.844656467437744, "learning_rate": 8.525852585258526e-06, "loss": 0.245, "step": 1710 }, { "epoch": 8.51, "grad_norm": 3.8193488121032715, "learning_rate": 8.250825082508252e-06, "loss": 0.2474, "step": 1720 }, { "epoch": 8.56, "grad_norm": 3.309065103530884, "learning_rate": 7.975797579757976e-06, "loss": 0.2521, "step": 1730 }, { "epoch": 8.61, "grad_norm": 3.4286129474639893, "learning_rate": 7.700770077007701e-06, "loss": 0.2595, "step": 1740 }, { "epoch": 8.66, "grad_norm": 2.9357540607452393, "learning_rate": 7.4257425742574256e-06, "loss": 0.2674, "step": 1750 }, { "epoch": 8.71, "grad_norm": 2.765472650527954, "learning_rate": 7.150715071507151e-06, "loss": 0.2803, "step": 1760 }, { "epoch": 8.76, "grad_norm": 2.7337937355041504, "learning_rate": 6.875687568756876e-06, "loss": 0.2561, "step": 1770 }, { "epoch": 8.81, "grad_norm": 3.0260534286499023, "learning_rate": 6.6006600660066e-06, "loss": 0.2595, "step": 1780 }, { "epoch": 8.86, "grad_norm": 3.255251884460449, "learning_rate": 6.325632563256325e-06, "loss": 0.2571, "step": 1790 }, { "epoch": 8.91, "grad_norm": 3.837414503097534, "learning_rate": 6.050605060506051e-06, "loss": 0.2672, "step": 1800 }, { "epoch": 8.96, "grad_norm": 2.7164084911346436, "learning_rate": 5.775577557755775e-06, "loss": 0.2324, "step": 1810 }, { "epoch": 9.0, "eval_accuracy": 0.8662486938349008, "eval_loss": 0.3381957411766052, "eval_runtime": 34.3859, "eval_samples_per_second": 83.493, "eval_steps_per_second": 2.617, "step": 1818 }, { "epoch": 9.01, "grad_norm": 3.442255735397339, "learning_rate": 5.500550055005501e-06, "loss": 0.2454, "step": 1820 }, { "epoch": 9.06, "grad_norm": 3.476314067840576, "learning_rate": 5.225522552255225e-06, "loss": 0.2488, "step": 1830 }, { "epoch": 9.11, "grad_norm": 2.7462522983551025, "learning_rate": 4.950495049504951e-06, "loss": 0.2386, "step": 1840 }, { "epoch": 9.16, "grad_norm": 4.024281978607178, "learning_rate": 4.675467546754675e-06, "loss": 0.2434, "step": 1850 }, { "epoch": 9.21, "grad_norm": 3.229743003845215, "learning_rate": 4.400440044004401e-06, "loss": 0.2382, "step": 1860 }, { "epoch": 9.26, "grad_norm": 3.476935386657715, "learning_rate": 4.125412541254126e-06, "loss": 0.2706, "step": 1870 }, { "epoch": 9.31, "grad_norm": 4.351961135864258, "learning_rate": 3.8503850385038505e-06, "loss": 0.2529, "step": 1880 }, { "epoch": 9.36, "grad_norm": 3.750527858734131, "learning_rate": 3.5753575357535755e-06, "loss": 0.2189, "step": 1890 }, { "epoch": 9.41, "grad_norm": 4.025125026702881, "learning_rate": 3.3003300330033e-06, "loss": 0.2535, "step": 1900 }, { "epoch": 9.46, "grad_norm": 3.364833354949951, "learning_rate": 3.0253025302530254e-06, "loss": 0.2475, "step": 1910 }, { "epoch": 9.5, "grad_norm": 2.76141619682312, "learning_rate": 2.7502750275027504e-06, "loss": 0.2248, "step": 1920 }, { "epoch": 9.55, "grad_norm": 3.184504270553589, "learning_rate": 2.4752475247524753e-06, "loss": 0.2374, "step": 1930 }, { "epoch": 9.6, "grad_norm": 3.67197585105896, "learning_rate": 2.2002200220022003e-06, "loss": 0.2336, "step": 1940 }, { "epoch": 9.65, "grad_norm": 3.4729063510894775, "learning_rate": 1.9251925192519253e-06, "loss": 0.2345, "step": 1950 }, { "epoch": 9.7, "grad_norm": 2.817584991455078, "learning_rate": 1.65016501650165e-06, "loss": 0.2057, "step": 1960 }, { "epoch": 9.75, "grad_norm": 3.606154441833496, "learning_rate": 1.3751375137513752e-06, "loss": 0.2526, "step": 1970 }, { "epoch": 9.8, "grad_norm": 3.766678810119629, "learning_rate": 1.1001100110011001e-06, "loss": 0.2465, "step": 1980 }, { "epoch": 9.85, "grad_norm": 2.8478949069976807, "learning_rate": 8.25082508250825e-07, "loss": 0.2305, "step": 1990 }, { "epoch": 9.9, "grad_norm": 3.010897159576416, "learning_rate": 5.500550055005501e-07, "loss": 0.2269, "step": 2000 }, { "epoch": 9.95, "grad_norm": 2.9162042140960693, "learning_rate": 2.7502750275027504e-07, "loss": 0.2408, "step": 2010 }, { "epoch": 10.0, "grad_norm": 5.191891670227051, "learning_rate": 0.0, "loss": 0.234, "step": 2020 }, { "epoch": 10.0, "eval_accuracy": 0.8707767328456983, "eval_loss": 0.3384123742580414, "eval_runtime": 34.2477, "eval_samples_per_second": 83.83, "eval_steps_per_second": 2.628, "step": 2020 }, { "epoch": 10.0, "step": 2020, "total_flos": 6.733106817319895e+19, "train_loss": 0.3648681934517209, "train_runtime": 8202.686, "train_samples_per_second": 31.498, "train_steps_per_second": 0.246 } ], "logging_steps": 10, "max_steps": 2020, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 6.733106817319895e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }