|
{ |
|
"best_metric": 0.38806891441345215, |
|
"best_model_checkpoint": "./model_fine-tune/glot/xlm-r/ckb-Arab/checkpoint-95500", |
|
"epoch": 173.32123411978222, |
|
"eval_steps": 500, |
|
"global_step": 95500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.9074410163339383, |
|
"grad_norm": 3.4398534297943115, |
|
"learning_rate": 9.95e-05, |
|
"loss": 1.6403, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9074410163339383, |
|
"eval_accuracy": 0.752505506780714, |
|
"eval_loss": 1.153350591659546, |
|
"eval_runtime": 53.4289, |
|
"eval_samples_per_second": 82.446, |
|
"eval_steps_per_second": 2.583, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.8148820326678767, |
|
"grad_norm": 2.854556083679199, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 1.1266, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.8148820326678767, |
|
"eval_accuracy": 0.7904873277235048, |
|
"eval_loss": 0.9590327143669128, |
|
"eval_runtime": 53.4057, |
|
"eval_samples_per_second": 82.482, |
|
"eval_steps_per_second": 2.584, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.722323049001815, |
|
"grad_norm": 2.9346060752868652, |
|
"learning_rate": 9.850000000000001e-05, |
|
"loss": 0.9785, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.722323049001815, |
|
"eval_accuracy": 0.8084301658954617, |
|
"eval_loss": 0.8642853498458862, |
|
"eval_runtime": 59.5918, |
|
"eval_samples_per_second": 73.92, |
|
"eval_steps_per_second": 2.316, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.629764065335753, |
|
"grad_norm": 2.695502996444702, |
|
"learning_rate": 9.8e-05, |
|
"loss": 0.8933, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.629764065335753, |
|
"eval_accuracy": 0.8202711962432342, |
|
"eval_loss": 0.8070259094238281, |
|
"eval_runtime": 54.3445, |
|
"eval_samples_per_second": 81.057, |
|
"eval_steps_per_second": 2.539, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.537205081669692, |
|
"grad_norm": 2.3147311210632324, |
|
"learning_rate": 9.75e-05, |
|
"loss": 0.8354, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.537205081669692, |
|
"eval_accuracy": 0.8293452619955832, |
|
"eval_loss": 0.7662876844406128, |
|
"eval_runtime": 54.1554, |
|
"eval_samples_per_second": 81.34, |
|
"eval_steps_per_second": 2.548, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.44464609800363, |
|
"grad_norm": 2.6086342334747314, |
|
"learning_rate": 9.7e-05, |
|
"loss": 0.7909, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.44464609800363, |
|
"eval_accuracy": 0.8355002523340903, |
|
"eval_loss": 0.7356261610984802, |
|
"eval_runtime": 53.6095, |
|
"eval_samples_per_second": 82.168, |
|
"eval_steps_per_second": 2.574, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 6.352087114337568, |
|
"grad_norm": 2.1992199420928955, |
|
"learning_rate": 9.65e-05, |
|
"loss": 0.7546, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 6.352087114337568, |
|
"eval_accuracy": 0.8423681886232234, |
|
"eval_loss": 0.6986920237541199, |
|
"eval_runtime": 55.6018, |
|
"eval_samples_per_second": 79.224, |
|
"eval_steps_per_second": 2.482, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 7.259528130671506, |
|
"grad_norm": 2.3473734855651855, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.7292, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.259528130671506, |
|
"eval_accuracy": 0.8470036500248151, |
|
"eval_loss": 0.6758582592010498, |
|
"eval_runtime": 54.4017, |
|
"eval_samples_per_second": 80.972, |
|
"eval_steps_per_second": 2.537, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 8.166969147005444, |
|
"grad_norm": 2.585818290710449, |
|
"learning_rate": 9.55e-05, |
|
"loss": 0.7025, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 8.166969147005444, |
|
"eval_accuracy": 0.8527411894826806, |
|
"eval_loss": 0.6568289399147034, |
|
"eval_runtime": 57.7681, |
|
"eval_samples_per_second": 76.253, |
|
"eval_steps_per_second": 2.389, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 9.074410163339383, |
|
"grad_norm": 2.4189865589141846, |
|
"learning_rate": 9.5e-05, |
|
"loss": 0.68, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.074410163339383, |
|
"eval_accuracy": 0.8559500862970382, |
|
"eval_loss": 0.6354258060455322, |
|
"eval_runtime": 59.1091, |
|
"eval_samples_per_second": 74.523, |
|
"eval_steps_per_second": 2.335, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.98185117967332, |
|
"grad_norm": 2.443392515182495, |
|
"learning_rate": 9.449999999999999e-05, |
|
"loss": 0.6645, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 9.98185117967332, |
|
"eval_accuracy": 0.8591041578043516, |
|
"eval_loss": 0.6224693655967712, |
|
"eval_runtime": 53.9006, |
|
"eval_samples_per_second": 81.725, |
|
"eval_steps_per_second": 2.56, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 10.88929219600726, |
|
"grad_norm": 2.1055374145507812, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.6385, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 10.88929219600726, |
|
"eval_accuracy": 0.8609373044675259, |
|
"eval_loss": 0.6164940595626831, |
|
"eval_runtime": 53.8382, |
|
"eval_samples_per_second": 81.819, |
|
"eval_steps_per_second": 2.563, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 11.796733212341199, |
|
"grad_norm": 1.9976396560668945, |
|
"learning_rate": 9.350000000000001e-05, |
|
"loss": 0.6305, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 11.796733212341199, |
|
"eval_accuracy": 0.8621495473876636, |
|
"eval_loss": 0.6023448705673218, |
|
"eval_runtime": 54.3137, |
|
"eval_samples_per_second": 81.103, |
|
"eval_steps_per_second": 2.541, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 12.704174228675136, |
|
"grad_norm": 2.6231155395507812, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 0.6143, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 12.704174228675136, |
|
"eval_accuracy": 0.8642217993678152, |
|
"eval_loss": 0.5955979824066162, |
|
"eval_runtime": 55.8185, |
|
"eval_samples_per_second": 78.916, |
|
"eval_steps_per_second": 2.472, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 13.611615245009075, |
|
"grad_norm": 2.0853583812713623, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 0.6012, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 13.611615245009075, |
|
"eval_accuracy": 0.8669231710723286, |
|
"eval_loss": 0.5834583044052124, |
|
"eval_runtime": 52.8539, |
|
"eval_samples_per_second": 83.343, |
|
"eval_steps_per_second": 2.611, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 14.519056261343012, |
|
"grad_norm": 1.9873307943344116, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.59, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 14.519056261343012, |
|
"eval_accuracy": 0.8700438033381165, |
|
"eval_loss": 0.5704456567764282, |
|
"eval_runtime": 52.8195, |
|
"eval_samples_per_second": 83.397, |
|
"eval_steps_per_second": 2.613, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 15.426497277676951, |
|
"grad_norm": 2.106224536895752, |
|
"learning_rate": 9.15e-05, |
|
"loss": 0.5781, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 15.426497277676951, |
|
"eval_accuracy": 0.8698398656877739, |
|
"eval_loss": 0.5724136233329773, |
|
"eval_runtime": 59.543, |
|
"eval_samples_per_second": 73.98, |
|
"eval_steps_per_second": 2.318, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 16.33393829401089, |
|
"grad_norm": 2.0043249130249023, |
|
"learning_rate": 9.1e-05, |
|
"loss": 0.5675, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 16.33393829401089, |
|
"eval_accuracy": 0.8715064286118855, |
|
"eval_loss": 0.5624237656593323, |
|
"eval_runtime": 53.5321, |
|
"eval_samples_per_second": 82.287, |
|
"eval_steps_per_second": 2.578, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 17.24137931034483, |
|
"grad_norm": 2.078568935394287, |
|
"learning_rate": 9.05e-05, |
|
"loss": 0.5566, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 17.24137931034483, |
|
"eval_accuracy": 0.872356935014549, |
|
"eval_loss": 0.5570796728134155, |
|
"eval_runtime": 52.8375, |
|
"eval_samples_per_second": 83.369, |
|
"eval_steps_per_second": 2.612, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 18.148820326678766, |
|
"grad_norm": 2.026803731918335, |
|
"learning_rate": 9e-05, |
|
"loss": 0.5533, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 18.148820326678766, |
|
"eval_accuracy": 0.875522907428112, |
|
"eval_loss": 0.5457433462142944, |
|
"eval_runtime": 52.7025, |
|
"eval_samples_per_second": 83.582, |
|
"eval_steps_per_second": 2.618, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 19.056261343012704, |
|
"grad_norm": 1.8563569784164429, |
|
"learning_rate": 8.950000000000001e-05, |
|
"loss": 0.5396, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 19.056261343012704, |
|
"eval_accuracy": 0.8741645771312478, |
|
"eval_loss": 0.5479493737220764, |
|
"eval_runtime": 52.8826, |
|
"eval_samples_per_second": 83.298, |
|
"eval_steps_per_second": 2.61, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 19.96370235934664, |
|
"grad_norm": 2.1099376678466797, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 0.5359, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 19.96370235934664, |
|
"eval_accuracy": 0.8773543802678719, |
|
"eval_loss": 0.5376391410827637, |
|
"eval_runtime": 53.3193, |
|
"eval_samples_per_second": 82.615, |
|
"eval_steps_per_second": 2.588, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 20.87114337568058, |
|
"grad_norm": 2.5443553924560547, |
|
"learning_rate": 8.850000000000001e-05, |
|
"loss": 0.5246, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 20.87114337568058, |
|
"eval_accuracy": 0.8773027699144483, |
|
"eval_loss": 0.5346177816390991, |
|
"eval_runtime": 51.9467, |
|
"eval_samples_per_second": 84.799, |
|
"eval_steps_per_second": 2.657, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 21.77858439201452, |
|
"grad_norm": 2.047163963317871, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.5173, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 21.77858439201452, |
|
"eval_accuracy": 0.880544279055781, |
|
"eval_loss": 0.5207871794700623, |
|
"eval_runtime": 62.4765, |
|
"eval_samples_per_second": 70.506, |
|
"eval_steps_per_second": 2.209, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 22.686025408348456, |
|
"grad_norm": 1.9300446510314941, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.5111, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 22.686025408348456, |
|
"eval_accuracy": 0.8803666290669171, |
|
"eval_loss": 0.5259021520614624, |
|
"eval_runtime": 53.0987, |
|
"eval_samples_per_second": 82.959, |
|
"eval_steps_per_second": 2.599, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 23.593466424682397, |
|
"grad_norm": 2.314628839492798, |
|
"learning_rate": 8.7e-05, |
|
"loss": 0.505, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 23.593466424682397, |
|
"eval_accuracy": 0.8821161209658513, |
|
"eval_loss": 0.5170288681983948, |
|
"eval_runtime": 60.4353, |
|
"eval_samples_per_second": 72.888, |
|
"eval_steps_per_second": 2.283, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 24.500907441016334, |
|
"grad_norm": 2.187793493270874, |
|
"learning_rate": 8.65e-05, |
|
"loss": 0.4995, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 24.500907441016334, |
|
"eval_accuracy": 0.8803155533947083, |
|
"eval_loss": 0.5203161835670471, |
|
"eval_runtime": 54.2267, |
|
"eval_samples_per_second": 81.233, |
|
"eval_steps_per_second": 2.545, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 25.40834845735027, |
|
"grad_norm": 1.8629887104034424, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.4933, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 25.40834845735027, |
|
"eval_accuracy": 0.8827071990702727, |
|
"eval_loss": 0.5080223679542542, |
|
"eval_runtime": 53.122, |
|
"eval_samples_per_second": 82.922, |
|
"eval_steps_per_second": 2.598, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 26.31578947368421, |
|
"grad_norm": 1.95268976688385, |
|
"learning_rate": 8.55e-05, |
|
"loss": 0.4841, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 26.31578947368421, |
|
"eval_accuracy": 0.8852798720942991, |
|
"eval_loss": 0.4982340335845947, |
|
"eval_runtime": 55.7058, |
|
"eval_samples_per_second": 79.076, |
|
"eval_steps_per_second": 2.477, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 27.22323049001815, |
|
"grad_norm": 1.9664937257766724, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.4769, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 27.22323049001815, |
|
"eval_accuracy": 0.8844890070822018, |
|
"eval_loss": 0.5071918964385986, |
|
"eval_runtime": 57.0471, |
|
"eval_samples_per_second": 77.217, |
|
"eval_steps_per_second": 2.419, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 28.130671506352087, |
|
"grad_norm": 2.0566840171813965, |
|
"learning_rate": 8.450000000000001e-05, |
|
"loss": 0.4782, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 28.130671506352087, |
|
"eval_accuracy": 0.885741458637098, |
|
"eval_loss": 0.4975322186946869, |
|
"eval_runtime": 53.7809, |
|
"eval_samples_per_second": 81.906, |
|
"eval_steps_per_second": 2.566, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 29.038112522686024, |
|
"grad_norm": 1.969655990600586, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.4728, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 29.038112522686024, |
|
"eval_accuracy": 0.8861595126788497, |
|
"eval_loss": 0.498710572719574, |
|
"eval_runtime": 58.2764, |
|
"eval_samples_per_second": 75.588, |
|
"eval_steps_per_second": 2.368, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 29.945553539019965, |
|
"grad_norm": 1.9814519882202148, |
|
"learning_rate": 8.35e-05, |
|
"loss": 0.4648, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 29.945553539019965, |
|
"eval_accuracy": 0.8870825900148245, |
|
"eval_loss": 0.49478381872177124, |
|
"eval_runtime": 53.8125, |
|
"eval_samples_per_second": 81.858, |
|
"eval_steps_per_second": 2.564, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 30.852994555353902, |
|
"grad_norm": 1.8299716711044312, |
|
"learning_rate": 8.3e-05, |
|
"loss": 0.4597, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 30.852994555353902, |
|
"eval_accuracy": 0.8868082072258864, |
|
"eval_loss": 0.49765515327453613, |
|
"eval_runtime": 54.5146, |
|
"eval_samples_per_second": 80.804, |
|
"eval_steps_per_second": 2.531, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 31.76043557168784, |
|
"grad_norm": 2.192680597305298, |
|
"learning_rate": 8.25e-05, |
|
"loss": 0.4569, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 31.76043557168784, |
|
"eval_accuracy": 0.8874967002300411, |
|
"eval_loss": 0.49416524171829224, |
|
"eval_runtime": 53.3923, |
|
"eval_samples_per_second": 82.503, |
|
"eval_steps_per_second": 2.585, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 32.66787658802178, |
|
"grad_norm": 2.5133163928985596, |
|
"learning_rate": 8.2e-05, |
|
"loss": 0.4488, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 32.66787658802178, |
|
"eval_accuracy": 0.8887065044419662, |
|
"eval_loss": 0.49053409695625305, |
|
"eval_runtime": 56.0002, |
|
"eval_samples_per_second": 78.66, |
|
"eval_steps_per_second": 2.464, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 33.57531760435572, |
|
"grad_norm": 1.596177339553833, |
|
"learning_rate": 8.15e-05, |
|
"loss": 0.4456, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 33.57531760435572, |
|
"eval_accuracy": 0.8882017291247682, |
|
"eval_loss": 0.4841141700744629, |
|
"eval_runtime": 53.2223, |
|
"eval_samples_per_second": 82.766, |
|
"eval_steps_per_second": 2.593, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 34.48275862068966, |
|
"grad_norm": 2.740516185760498, |
|
"learning_rate": 8.1e-05, |
|
"loss": 0.4439, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 34.48275862068966, |
|
"eval_accuracy": 0.8906933029564148, |
|
"eval_loss": 0.473172128200531, |
|
"eval_runtime": 53.5999, |
|
"eval_samples_per_second": 82.183, |
|
"eval_steps_per_second": 2.575, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 35.39019963702359, |
|
"grad_norm": 1.7900762557983398, |
|
"learning_rate": 8.05e-05, |
|
"loss": 0.435, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 35.39019963702359, |
|
"eval_accuracy": 0.8905509902844176, |
|
"eval_loss": 0.4774630069732666, |
|
"eval_runtime": 53.7773, |
|
"eval_samples_per_second": 81.912, |
|
"eval_steps_per_second": 2.566, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 36.29764065335753, |
|
"grad_norm": 1.9263832569122314, |
|
"learning_rate": 8e-05, |
|
"loss": 0.4355, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 36.29764065335753, |
|
"eval_accuracy": 0.889703235016953, |
|
"eval_loss": 0.4819239377975464, |
|
"eval_runtime": 54.1016, |
|
"eval_samples_per_second": 81.421, |
|
"eval_steps_per_second": 2.551, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 37.20508166969147, |
|
"grad_norm": 1.9390649795532227, |
|
"learning_rate": 7.950000000000001e-05, |
|
"loss": 0.4327, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 37.20508166969147, |
|
"eval_accuracy": 0.8891170881908429, |
|
"eval_loss": 0.48580387234687805, |
|
"eval_runtime": 59.5973, |
|
"eval_samples_per_second": 73.913, |
|
"eval_steps_per_second": 2.316, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 38.11252268602541, |
|
"grad_norm": 1.9884870052337646, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 0.4254, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 38.11252268602541, |
|
"eval_accuracy": 0.8915290748428344, |
|
"eval_loss": 0.4742184281349182, |
|
"eval_runtime": 54.3464, |
|
"eval_samples_per_second": 81.054, |
|
"eval_steps_per_second": 2.539, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 39.01996370235935, |
|
"grad_norm": 1.5046188831329346, |
|
"learning_rate": 7.850000000000001e-05, |
|
"loss": 0.4229, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 39.01996370235935, |
|
"eval_accuracy": 0.8925229541919623, |
|
"eval_loss": 0.47058817744255066, |
|
"eval_runtime": 56.2804, |
|
"eval_samples_per_second": 78.269, |
|
"eval_steps_per_second": 2.452, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 39.92740471869328, |
|
"grad_norm": 1.9617971181869507, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.4174, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 39.92740471869328, |
|
"eval_accuracy": 0.8919159314703821, |
|
"eval_loss": 0.4736374616622925, |
|
"eval_runtime": 53.1824, |
|
"eval_samples_per_second": 82.828, |
|
"eval_steps_per_second": 2.595, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 40.83484573502722, |
|
"grad_norm": 1.7383509874343872, |
|
"learning_rate": 7.75e-05, |
|
"loss": 0.4151, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 40.83484573502722, |
|
"eval_accuracy": 0.8929338481208785, |
|
"eval_loss": 0.46910572052001953, |
|
"eval_runtime": 53.0493, |
|
"eval_samples_per_second": 83.036, |
|
"eval_steps_per_second": 2.601, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 41.74228675136116, |
|
"grad_norm": 1.895717740058899, |
|
"learning_rate": 7.7e-05, |
|
"loss": 0.4137, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 41.74228675136116, |
|
"eval_accuracy": 0.8939702847059194, |
|
"eval_loss": 0.46362048387527466, |
|
"eval_runtime": 53.7274, |
|
"eval_samples_per_second": 81.988, |
|
"eval_steps_per_second": 2.569, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 42.6497277676951, |
|
"grad_norm": 1.9185525178909302, |
|
"learning_rate": 7.65e-05, |
|
"loss": 0.4124, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 42.6497277676951, |
|
"eval_accuracy": 0.8955085956295108, |
|
"eval_loss": 0.4603004455566406, |
|
"eval_runtime": 52.7296, |
|
"eval_samples_per_second": 83.539, |
|
"eval_steps_per_second": 2.617, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 43.55716878402904, |
|
"grad_norm": 1.9594053030014038, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.4022, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 43.55716878402904, |
|
"eval_accuracy": 0.894325563921544, |
|
"eval_loss": 0.4673307240009308, |
|
"eval_runtime": 52.947, |
|
"eval_samples_per_second": 83.196, |
|
"eval_steps_per_second": 2.606, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 44.46460980036298, |
|
"grad_norm": 1.761846899986267, |
|
"learning_rate": 7.55e-05, |
|
"loss": 0.4035, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 44.46460980036298, |
|
"eval_accuracy": 0.8957788122798188, |
|
"eval_loss": 0.45509716868400574, |
|
"eval_runtime": 55.3592, |
|
"eval_samples_per_second": 79.571, |
|
"eval_steps_per_second": 2.493, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 45.37205081669691, |
|
"grad_norm": 1.936480164527893, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.3996, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 45.37205081669691, |
|
"eval_accuracy": 0.8945571305505521, |
|
"eval_loss": 0.4623182713985443, |
|
"eval_runtime": 53.3019, |
|
"eval_samples_per_second": 82.642, |
|
"eval_steps_per_second": 2.589, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 46.27949183303085, |
|
"grad_norm": 1.907658576965332, |
|
"learning_rate": 7.450000000000001e-05, |
|
"loss": 0.3979, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 46.27949183303085, |
|
"eval_accuracy": 0.8960829529232748, |
|
"eval_loss": 0.4556325376033783, |
|
"eval_runtime": 54.1617, |
|
"eval_samples_per_second": 81.331, |
|
"eval_steps_per_second": 2.548, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 47.186932849364794, |
|
"grad_norm": 1.9181513786315918, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.391, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 47.186932849364794, |
|
"eval_accuracy": 0.8958657660824587, |
|
"eval_loss": 0.4613765776157379, |
|
"eval_runtime": 55.5872, |
|
"eval_samples_per_second": 79.245, |
|
"eval_steps_per_second": 2.483, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 48.09437386569873, |
|
"grad_norm": 1.6843451261520386, |
|
"learning_rate": 7.35e-05, |
|
"loss": 0.391, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 48.09437386569873, |
|
"eval_accuracy": 0.89564026034286, |
|
"eval_loss": 0.45949628949165344, |
|
"eval_runtime": 52.8113, |
|
"eval_samples_per_second": 83.41, |
|
"eval_steps_per_second": 2.613, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 49.00181488203267, |
|
"grad_norm": 1.7681550979614258, |
|
"learning_rate": 7.3e-05, |
|
"loss": 0.3874, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 49.00181488203267, |
|
"eval_accuracy": 0.8962522308149911, |
|
"eval_loss": 0.4545115828514099, |
|
"eval_runtime": 53.1381, |
|
"eval_samples_per_second": 82.897, |
|
"eval_steps_per_second": 2.597, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 49.90925589836661, |
|
"grad_norm": 1.8517777919769287, |
|
"learning_rate": 7.25e-05, |
|
"loss": 0.3835, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 49.90925589836661, |
|
"eval_accuracy": 0.896625333542615, |
|
"eval_loss": 0.45060068368911743, |
|
"eval_runtime": 52.9396, |
|
"eval_samples_per_second": 83.208, |
|
"eval_steps_per_second": 2.607, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 50.81669691470054, |
|
"grad_norm": 1.9447550773620605, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.3779, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 50.81669691470054, |
|
"eval_accuracy": 0.8974899929361903, |
|
"eval_loss": 0.4529257118701935, |
|
"eval_runtime": 56.9809, |
|
"eval_samples_per_second": 77.307, |
|
"eval_steps_per_second": 2.422, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 51.724137931034484, |
|
"grad_norm": 1.7611163854599, |
|
"learning_rate": 7.15e-05, |
|
"loss": 0.3783, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 51.724137931034484, |
|
"eval_accuracy": 0.8984791687632082, |
|
"eval_loss": 0.44696977734565735, |
|
"eval_runtime": 53.9391, |
|
"eval_samples_per_second": 81.666, |
|
"eval_steps_per_second": 2.558, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 52.63157894736842, |
|
"grad_norm": 1.749190092086792, |
|
"learning_rate": 7.1e-05, |
|
"loss": 0.3727, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 52.63157894736842, |
|
"eval_accuracy": 0.8978207320708537, |
|
"eval_loss": 0.4506888687610626, |
|
"eval_runtime": 53.5362, |
|
"eval_samples_per_second": 82.281, |
|
"eval_steps_per_second": 2.578, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 53.53901996370236, |
|
"grad_norm": 1.8609730005264282, |
|
"learning_rate": 7.05e-05, |
|
"loss": 0.3705, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 53.53901996370236, |
|
"eval_accuracy": 0.8976168077767325, |
|
"eval_loss": 0.4500649869441986, |
|
"eval_runtime": 53.4168, |
|
"eval_samples_per_second": 82.465, |
|
"eval_steps_per_second": 2.583, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 54.4464609800363, |
|
"grad_norm": 1.8506393432617188, |
|
"learning_rate": 7e-05, |
|
"loss": 0.3719, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 54.4464609800363, |
|
"eval_accuracy": 0.8993505575402249, |
|
"eval_loss": 0.44908007979393005, |
|
"eval_runtime": 57.767, |
|
"eval_samples_per_second": 76.255, |
|
"eval_steps_per_second": 2.389, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 55.35390199637023, |
|
"grad_norm": 1.8105406761169434, |
|
"learning_rate": 6.95e-05, |
|
"loss": 0.3684, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 55.35390199637023, |
|
"eval_accuracy": 0.8994710323502174, |
|
"eval_loss": 0.44289711117744446, |
|
"eval_runtime": 53.0941, |
|
"eval_samples_per_second": 82.966, |
|
"eval_steps_per_second": 2.599, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 56.261343012704174, |
|
"grad_norm": 1.9500548839569092, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.3621, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 56.261343012704174, |
|
"eval_accuracy": 0.8995646091699282, |
|
"eval_loss": 0.44326120615005493, |
|
"eval_runtime": 57.7269, |
|
"eval_samples_per_second": 76.308, |
|
"eval_steps_per_second": 2.391, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 57.168784029038115, |
|
"grad_norm": 1.771316647529602, |
|
"learning_rate": 6.850000000000001e-05, |
|
"loss": 0.3639, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 57.168784029038115, |
|
"eval_accuracy": 0.8996551918347424, |
|
"eval_loss": 0.4400934875011444, |
|
"eval_runtime": 59.1245, |
|
"eval_samples_per_second": 74.504, |
|
"eval_steps_per_second": 2.334, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 58.07622504537205, |
|
"grad_norm": 1.9457340240478516, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.3603, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 58.07622504537205, |
|
"eval_accuracy": 0.8985898806146979, |
|
"eval_loss": 0.4473365545272827, |
|
"eval_runtime": 53.3011, |
|
"eval_samples_per_second": 82.644, |
|
"eval_steps_per_second": 2.589, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 58.98366606170599, |
|
"grad_norm": 1.8146084547042847, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 0.3568, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 58.98366606170599, |
|
"eval_accuracy": 0.9005719064701833, |
|
"eval_loss": 0.4383050501346588, |
|
"eval_runtime": 57.7373, |
|
"eval_samples_per_second": 76.294, |
|
"eval_steps_per_second": 2.39, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 59.89110707803993, |
|
"grad_norm": 1.809646725654602, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.3516, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 59.89110707803993, |
|
"eval_accuracy": 0.9000993445757237, |
|
"eval_loss": 0.44284284114837646, |
|
"eval_runtime": 53.187, |
|
"eval_samples_per_second": 82.821, |
|
"eval_steps_per_second": 2.595, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 60.798548094373864, |
|
"grad_norm": 1.7659413814544678, |
|
"learning_rate": 6.65e-05, |
|
"loss": 0.3531, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 60.798548094373864, |
|
"eval_accuracy": 0.8994799933629496, |
|
"eval_loss": 0.44589298963546753, |
|
"eval_runtime": 53.0032, |
|
"eval_samples_per_second": 83.108, |
|
"eval_steps_per_second": 2.604, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 61.705989110707804, |
|
"grad_norm": 1.636080265045166, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.3499, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 61.705989110707804, |
|
"eval_accuracy": 0.8999748562089449, |
|
"eval_loss": 0.4367033839225769, |
|
"eval_runtime": 53.0566, |
|
"eval_samples_per_second": 83.025, |
|
"eval_steps_per_second": 2.601, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 62.613430127041745, |
|
"grad_norm": 1.8669129610061646, |
|
"learning_rate": 6.55e-05, |
|
"loss": 0.3489, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 62.613430127041745, |
|
"eval_accuracy": 0.9013975155279503, |
|
"eval_loss": 0.4371834695339203, |
|
"eval_runtime": 53.1608, |
|
"eval_samples_per_second": 82.862, |
|
"eval_steps_per_second": 2.596, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 63.52087114337568, |
|
"grad_norm": 1.9811877012252808, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.3429, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 63.52087114337568, |
|
"eval_accuracy": 0.9018210634557753, |
|
"eval_loss": 0.4384971857070923, |
|
"eval_runtime": 57.6259, |
|
"eval_samples_per_second": 76.441, |
|
"eval_steps_per_second": 2.395, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 64.42831215970962, |
|
"grad_norm": 1.7895385026931763, |
|
"learning_rate": 6.450000000000001e-05, |
|
"loss": 0.3415, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 64.42831215970962, |
|
"eval_accuracy": 0.9028058283836883, |
|
"eval_loss": 0.43259307742118835, |
|
"eval_runtime": 53.0105, |
|
"eval_samples_per_second": 83.097, |
|
"eval_steps_per_second": 2.603, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 65.33575317604355, |
|
"grad_norm": 1.9592262506484985, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.3402, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 65.33575317604355, |
|
"eval_accuracy": 0.901298823973929, |
|
"eval_loss": 0.4333614110946655, |
|
"eval_runtime": 52.9956, |
|
"eval_samples_per_second": 83.12, |
|
"eval_steps_per_second": 2.604, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 66.2431941923775, |
|
"grad_norm": 1.69992196559906, |
|
"learning_rate": 6.35e-05, |
|
"loss": 0.3349, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 66.2431941923775, |
|
"eval_accuracy": 0.90217415310253, |
|
"eval_loss": 0.430584579706192, |
|
"eval_runtime": 52.9983, |
|
"eval_samples_per_second": 83.116, |
|
"eval_steps_per_second": 2.604, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 67.15063520871144, |
|
"grad_norm": 1.70908522605896, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.3387, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 67.15063520871144, |
|
"eval_accuracy": 0.9022541035115501, |
|
"eval_loss": 0.4351217746734619, |
|
"eval_runtime": 53.4219, |
|
"eval_samples_per_second": 82.457, |
|
"eval_steps_per_second": 2.583, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 68.05807622504537, |
|
"grad_norm": 1.7078979015350342, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.3328, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 68.05807622504537, |
|
"eval_accuracy": 0.9027885079291, |
|
"eval_loss": 0.43064776062965393, |
|
"eval_runtime": 52.9742, |
|
"eval_samples_per_second": 83.154, |
|
"eval_steps_per_second": 2.605, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 68.96551724137932, |
|
"grad_norm": 1.6770516633987427, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.33, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 68.96551724137932, |
|
"eval_accuracy": 0.9039243367993435, |
|
"eval_loss": 0.42335009574890137, |
|
"eval_runtime": 52.9527, |
|
"eval_samples_per_second": 83.187, |
|
"eval_steps_per_second": 2.606, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 69.87295825771325, |
|
"grad_norm": 1.7739548683166504, |
|
"learning_rate": 6.15e-05, |
|
"loss": 0.3291, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 69.87295825771325, |
|
"eval_accuracy": 0.9030174309735959, |
|
"eval_loss": 0.42873483896255493, |
|
"eval_runtime": 52.9154, |
|
"eval_samples_per_second": 83.246, |
|
"eval_steps_per_second": 2.608, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 70.78039927404718, |
|
"grad_norm": 1.9532561302185059, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.3288, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 70.78039927404718, |
|
"eval_accuracy": 0.9036126447268178, |
|
"eval_loss": 0.42941179871559143, |
|
"eval_runtime": 53.0126, |
|
"eval_samples_per_second": 83.093, |
|
"eval_steps_per_second": 2.603, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 71.68784029038113, |
|
"grad_norm": 1.9629998207092285, |
|
"learning_rate": 6.05e-05, |
|
"loss": 0.3255, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 71.68784029038113, |
|
"eval_accuracy": 0.904484644880421, |
|
"eval_loss": 0.4233216643333435, |
|
"eval_runtime": 52.8794, |
|
"eval_samples_per_second": 83.303, |
|
"eval_steps_per_second": 2.61, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 72.59528130671507, |
|
"grad_norm": 1.9358775615692139, |
|
"learning_rate": 6e-05, |
|
"loss": 0.3248, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 72.59528130671507, |
|
"eval_accuracy": 0.9041070159565976, |
|
"eval_loss": 0.4286067485809326, |
|
"eval_runtime": 52.8963, |
|
"eval_samples_per_second": 83.276, |
|
"eval_steps_per_second": 2.609, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 73.502722323049, |
|
"grad_norm": 1.6557801961898804, |
|
"learning_rate": 5.95e-05, |
|
"loss": 0.3203, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 73.502722323049, |
|
"eval_accuracy": 0.9040081056486239, |
|
"eval_loss": 0.4276265799999237, |
|
"eval_runtime": 52.8201, |
|
"eval_samples_per_second": 83.396, |
|
"eval_steps_per_second": 2.613, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 74.41016333938293, |
|
"grad_norm": 1.58525812625885, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.3218, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 74.41016333938293, |
|
"eval_accuracy": 0.9048278199949151, |
|
"eval_loss": 0.4280424118041992, |
|
"eval_runtime": 53.6976, |
|
"eval_samples_per_second": 82.034, |
|
"eval_steps_per_second": 2.57, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 75.31760435571688, |
|
"grad_norm": 1.5357856750488281, |
|
"learning_rate": 5.85e-05, |
|
"loss": 0.3198, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 75.31760435571688, |
|
"eval_accuracy": 0.904816966551977, |
|
"eval_loss": 0.41927599906921387, |
|
"eval_runtime": 52.6237, |
|
"eval_samples_per_second": 83.708, |
|
"eval_steps_per_second": 2.622, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 76.22504537205081, |
|
"grad_norm": 1.9634026288986206, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.3149, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 76.22504537205081, |
|
"eval_accuracy": 0.9031350344081517, |
|
"eval_loss": 0.4304438531398773, |
|
"eval_runtime": 52.8667, |
|
"eval_samples_per_second": 83.323, |
|
"eval_steps_per_second": 2.61, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 77.13248638838475, |
|
"grad_norm": 1.563607096672058, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.3144, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 77.13248638838475, |
|
"eval_accuracy": 0.9048774759257809, |
|
"eval_loss": 0.427058607339859, |
|
"eval_runtime": 52.712, |
|
"eval_samples_per_second": 83.567, |
|
"eval_steps_per_second": 2.618, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 78.0399274047187, |
|
"grad_norm": 1.871159553527832, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.3103, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 78.0399274047187, |
|
"eval_accuracy": 0.9048358855975869, |
|
"eval_loss": 0.4271075129508972, |
|
"eval_runtime": 52.7904, |
|
"eval_samples_per_second": 83.443, |
|
"eval_steps_per_second": 2.614, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 78.94736842105263, |
|
"grad_norm": 1.61452054977417, |
|
"learning_rate": 5.65e-05, |
|
"loss": 0.3102, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 78.94736842105263, |
|
"eval_accuracy": 0.9049740443605474, |
|
"eval_loss": 0.42336586117744446, |
|
"eval_runtime": 52.875, |
|
"eval_samples_per_second": 83.31, |
|
"eval_steps_per_second": 2.61, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 79.85480943738656, |
|
"grad_norm": 1.8801889419555664, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.307, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 79.85480943738656, |
|
"eval_accuracy": 0.9055931173260359, |
|
"eval_loss": 0.42033708095550537, |
|
"eval_runtime": 52.627, |
|
"eval_samples_per_second": 83.702, |
|
"eval_steps_per_second": 2.622, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 80.76225045372051, |
|
"grad_norm": 1.5314077138900757, |
|
"learning_rate": 5.550000000000001e-05, |
|
"loss": 0.3037, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 80.76225045372051, |
|
"eval_accuracy": 0.9061196499462085, |
|
"eval_loss": 0.4253558814525604, |
|
"eval_runtime": 52.7114, |
|
"eval_samples_per_second": 83.568, |
|
"eval_steps_per_second": 2.618, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 81.66969147005445, |
|
"grad_norm": 1.7618950605392456, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.3016, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 81.66969147005445, |
|
"eval_accuracy": 0.9062959555947709, |
|
"eval_loss": 0.4181654453277588, |
|
"eval_runtime": 52.7251, |
|
"eval_samples_per_second": 83.547, |
|
"eval_steps_per_second": 2.617, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 82.57713248638838, |
|
"grad_norm": 1.913796067237854, |
|
"learning_rate": 5.45e-05, |
|
"loss": 0.303, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 82.57713248638838, |
|
"eval_accuracy": 0.9072205041995722, |
|
"eval_loss": 0.41656142473220825, |
|
"eval_runtime": 52.7847, |
|
"eval_samples_per_second": 83.452, |
|
"eval_steps_per_second": 2.614, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 83.48457350272233, |
|
"grad_norm": 1.803902506828308, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.3028, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 83.48457350272233, |
|
"eval_accuracy": 0.9070220964556146, |
|
"eval_loss": 0.4230930507183075, |
|
"eval_runtime": 54.7506, |
|
"eval_samples_per_second": 80.456, |
|
"eval_steps_per_second": 2.521, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 84.39201451905626, |
|
"grad_norm": 1.7706644535064697, |
|
"learning_rate": 5.3500000000000006e-05, |
|
"loss": 0.2986, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 84.39201451905626, |
|
"eval_accuracy": 0.9074832026084252, |
|
"eval_loss": 0.4140300452709198, |
|
"eval_runtime": 52.7024, |
|
"eval_samples_per_second": 83.583, |
|
"eval_steps_per_second": 2.618, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 85.2994555353902, |
|
"grad_norm": 1.9356876611709595, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 0.2966, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 85.2994555353902, |
|
"eval_accuracy": 0.9071740298423052, |
|
"eval_loss": 0.4191630482673645, |
|
"eval_runtime": 52.7256, |
|
"eval_samples_per_second": 83.546, |
|
"eval_steps_per_second": 2.617, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 86.20689655172414, |
|
"grad_norm": 1.8884636163711548, |
|
"learning_rate": 5.25e-05, |
|
"loss": 0.2959, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 86.20689655172414, |
|
"eval_accuracy": 0.9080046323395202, |
|
"eval_loss": 0.4184423089027405, |
|
"eval_runtime": 52.5803, |
|
"eval_samples_per_second": 83.777, |
|
"eval_steps_per_second": 2.625, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 87.11433756805808, |
|
"grad_norm": 1.7885215282440186, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.2943, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 87.11433756805808, |
|
"eval_accuracy": 0.9073709179447276, |
|
"eval_loss": 0.4168856143951416, |
|
"eval_runtime": 52.6737, |
|
"eval_samples_per_second": 83.628, |
|
"eval_steps_per_second": 2.62, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 88.02177858439201, |
|
"grad_norm": 1.6675046682357788, |
|
"learning_rate": 5.1500000000000005e-05, |
|
"loss": 0.2932, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 88.02177858439201, |
|
"eval_accuracy": 0.9065622782059254, |
|
"eval_loss": 0.42474210262298584, |
|
"eval_runtime": 57.2893, |
|
"eval_samples_per_second": 76.89, |
|
"eval_steps_per_second": 2.409, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 88.92921960072596, |
|
"grad_norm": 1.7693278789520264, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 0.2913, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 88.92921960072596, |
|
"eval_accuracy": 0.908334698713182, |
|
"eval_loss": 0.41496196389198303, |
|
"eval_runtime": 52.681, |
|
"eval_samples_per_second": 83.616, |
|
"eval_steps_per_second": 2.62, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 89.83666061705989, |
|
"grad_norm": 1.80568528175354, |
|
"learning_rate": 5.05e-05, |
|
"loss": 0.29, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 89.83666061705989, |
|
"eval_accuracy": 0.9067204115993917, |
|
"eval_loss": 0.42078179121017456, |
|
"eval_runtime": 52.5861, |
|
"eval_samples_per_second": 83.767, |
|
"eval_steps_per_second": 2.624, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 90.74410163339383, |
|
"grad_norm": 1.7286852598190308, |
|
"learning_rate": 5e-05, |
|
"loss": 0.288, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 90.74410163339383, |
|
"eval_accuracy": 0.908493707496287, |
|
"eval_loss": 0.41103261709213257, |
|
"eval_runtime": 52.6627, |
|
"eval_samples_per_second": 83.646, |
|
"eval_steps_per_second": 2.62, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 91.65154264972777, |
|
"grad_norm": 1.6064249277114868, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.2846, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 91.65154264972777, |
|
"eval_accuracy": 0.9089611693118373, |
|
"eval_loss": 0.4114561975002289, |
|
"eval_runtime": 52.5213, |
|
"eval_samples_per_second": 83.871, |
|
"eval_steps_per_second": 2.628, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 92.5589836660617, |
|
"grad_norm": 1.6957948207855225, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.2825, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 92.5589836660617, |
|
"eval_accuracy": 0.9070289798162255, |
|
"eval_loss": 0.41827893257141113, |
|
"eval_runtime": 52.6164, |
|
"eval_samples_per_second": 83.719, |
|
"eval_steps_per_second": 2.623, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 93.46642468239564, |
|
"grad_norm": 1.6073497533798218, |
|
"learning_rate": 4.85e-05, |
|
"loss": 0.2834, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 93.46642468239564, |
|
"eval_accuracy": 0.9093731751882107, |
|
"eval_loss": 0.40947192907333374, |
|
"eval_runtime": 55.8553, |
|
"eval_samples_per_second": 78.865, |
|
"eval_steps_per_second": 2.471, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 94.37386569872959, |
|
"grad_norm": 1.8224419355392456, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.2803, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 94.37386569872959, |
|
"eval_accuracy": 0.9078407463018748, |
|
"eval_loss": 0.418082594871521, |
|
"eval_runtime": 57.2453, |
|
"eval_samples_per_second": 76.95, |
|
"eval_steps_per_second": 2.411, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 95.28130671506352, |
|
"grad_norm": 1.9654055833816528, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.2787, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 95.28130671506352, |
|
"eval_accuracy": 0.9089184717302816, |
|
"eval_loss": 0.4162246882915497, |
|
"eval_runtime": 57.2773, |
|
"eval_samples_per_second": 76.907, |
|
"eval_steps_per_second": 2.409, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 96.18874773139746, |
|
"grad_norm": 1.7956877946853638, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.278, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 96.18874773139746, |
|
"eval_accuracy": 0.9096441756342786, |
|
"eval_loss": 0.40937647223472595, |
|
"eval_runtime": 57.2739, |
|
"eval_samples_per_second": 76.911, |
|
"eval_steps_per_second": 2.409, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 97.0961887477314, |
|
"grad_norm": 1.6677452325820923, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 0.2759, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 97.0961887477314, |
|
"eval_accuracy": 0.9090880598745718, |
|
"eval_loss": 0.4136127233505249, |
|
"eval_runtime": 52.9843, |
|
"eval_samples_per_second": 83.138, |
|
"eval_steps_per_second": 2.605, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 98.00362976406534, |
|
"grad_norm": 1.7029211521148682, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.2746, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 98.00362976406534, |
|
"eval_accuracy": 0.9085475626951355, |
|
"eval_loss": 0.4151366651058197, |
|
"eval_runtime": 55.2968, |
|
"eval_samples_per_second": 79.661, |
|
"eval_steps_per_second": 2.496, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 98.91107078039927, |
|
"grad_norm": 1.4931912422180176, |
|
"learning_rate": 4.55e-05, |
|
"loss": 0.2734, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 98.91107078039927, |
|
"eval_accuracy": 0.9087434484443874, |
|
"eval_loss": 0.4170074760913849, |
|
"eval_runtime": 52.6515, |
|
"eval_samples_per_second": 83.663, |
|
"eval_steps_per_second": 2.621, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 99.81851179673322, |
|
"grad_norm": 1.8849012851715088, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.2719, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 99.81851179673322, |
|
"eval_accuracy": 0.9087569026104417, |
|
"eval_loss": 0.41325053572654724, |
|
"eval_runtime": 52.6731, |
|
"eval_samples_per_second": 83.629, |
|
"eval_steps_per_second": 2.62, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 100.72595281306715, |
|
"grad_norm": 2.079172372817993, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 0.271, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 100.72595281306715, |
|
"eval_accuracy": 0.9103295110887096, |
|
"eval_loss": 0.41259288787841797, |
|
"eval_runtime": 52.6285, |
|
"eval_samples_per_second": 83.7, |
|
"eval_steps_per_second": 2.622, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 101.63339382940109, |
|
"grad_norm": 1.8264790773391724, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.267, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 101.63339382940109, |
|
"eval_accuracy": 0.9093908939634668, |
|
"eval_loss": 0.41436412930488586, |
|
"eval_runtime": 53.0842, |
|
"eval_samples_per_second": 82.981, |
|
"eval_steps_per_second": 2.6, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 102.54083484573503, |
|
"grad_norm": 1.6904346942901611, |
|
"learning_rate": 4.35e-05, |
|
"loss": 0.2682, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 102.54083484573503, |
|
"eval_accuracy": 0.9092624512952864, |
|
"eval_loss": 0.413330078125, |
|
"eval_runtime": 57.4299, |
|
"eval_samples_per_second": 76.702, |
|
"eval_steps_per_second": 2.403, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 103.44827586206897, |
|
"grad_norm": 1.7619383335113525, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.2681, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 103.44827586206897, |
|
"eval_accuracy": 0.9103495162664795, |
|
"eval_loss": 0.40912753343582153, |
|
"eval_runtime": 57.2635, |
|
"eval_samples_per_second": 76.925, |
|
"eval_steps_per_second": 2.41, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 104.3557168784029, |
|
"grad_norm": 1.7024521827697754, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.2644, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 104.3557168784029, |
|
"eval_accuracy": 0.9091305589286443, |
|
"eval_loss": 0.4176701605319977, |
|
"eval_runtime": 52.5316, |
|
"eval_samples_per_second": 83.854, |
|
"eval_steps_per_second": 2.627, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 105.26315789473684, |
|
"grad_norm": 1.936393141746521, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.2621, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 105.26315789473684, |
|
"eval_accuracy": 0.909638402972615, |
|
"eval_loss": 0.4138263165950775, |
|
"eval_runtime": 57.3248, |
|
"eval_samples_per_second": 76.843, |
|
"eval_steps_per_second": 2.407, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 106.17059891107078, |
|
"grad_norm": 1.9783495664596558, |
|
"learning_rate": 4.15e-05, |
|
"loss": 0.2618, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 106.17059891107078, |
|
"eval_accuracy": 0.9093778717725997, |
|
"eval_loss": 0.41479504108428955, |
|
"eval_runtime": 52.6769, |
|
"eval_samples_per_second": 83.623, |
|
"eval_steps_per_second": 2.62, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 107.07803992740472, |
|
"grad_norm": 1.8036541938781738, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.2593, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 107.07803992740472, |
|
"eval_accuracy": 0.9108752466798229, |
|
"eval_loss": 0.407368004322052, |
|
"eval_runtime": 52.7623, |
|
"eval_samples_per_second": 83.488, |
|
"eval_steps_per_second": 2.616, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 107.98548094373865, |
|
"grad_norm": 1.7356771230697632, |
|
"learning_rate": 4.05e-05, |
|
"loss": 0.2586, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 107.98548094373865, |
|
"eval_accuracy": 0.9106982338220416, |
|
"eval_loss": 0.4060940444469452, |
|
"eval_runtime": 57.2822, |
|
"eval_samples_per_second": 76.9, |
|
"eval_steps_per_second": 2.409, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 108.8929219600726, |
|
"grad_norm": 1.9993195533752441, |
|
"learning_rate": 4e-05, |
|
"loss": 0.2591, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 108.8929219600726, |
|
"eval_accuracy": 0.910394873244746, |
|
"eval_loss": 0.4131792187690735, |
|
"eval_runtime": 53.0, |
|
"eval_samples_per_second": 83.113, |
|
"eval_steps_per_second": 2.604, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 109.80036297640653, |
|
"grad_norm": 1.8162901401519775, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 0.2558, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 109.80036297640653, |
|
"eval_accuracy": 0.910258865637902, |
|
"eval_loss": 0.4147132933139801, |
|
"eval_runtime": 55.8856, |
|
"eval_samples_per_second": 78.822, |
|
"eval_steps_per_second": 2.469, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 110.70780399274047, |
|
"grad_norm": 1.8106731176376343, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.2541, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 110.70780399274047, |
|
"eval_accuracy": 0.9114573371669734, |
|
"eval_loss": 0.40593624114990234, |
|
"eval_runtime": 52.8519, |
|
"eval_samples_per_second": 83.346, |
|
"eval_steps_per_second": 2.611, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 111.61524500907441, |
|
"grad_norm": 1.748769998550415, |
|
"learning_rate": 3.85e-05, |
|
"loss": 0.2556, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 111.61524500907441, |
|
"eval_accuracy": 0.9115416885324719, |
|
"eval_loss": 0.4094337522983551, |
|
"eval_runtime": 52.8285, |
|
"eval_samples_per_second": 83.383, |
|
"eval_steps_per_second": 2.612, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 112.52268602540835, |
|
"grad_norm": 1.6545246839523315, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.25, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 112.52268602540835, |
|
"eval_accuracy": 0.9117101451094991, |
|
"eval_loss": 0.4004589915275574, |
|
"eval_runtime": 52.6819, |
|
"eval_samples_per_second": 83.615, |
|
"eval_steps_per_second": 2.619, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 113.43012704174228, |
|
"grad_norm": 1.9466238021850586, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.2492, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 113.43012704174228, |
|
"eval_accuracy": 0.9121271025446342, |
|
"eval_loss": 0.4025409519672394, |
|
"eval_runtime": 52.7085, |
|
"eval_samples_per_second": 83.573, |
|
"eval_steps_per_second": 2.618, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 114.33756805807623, |
|
"grad_norm": 1.8718467950820923, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.2505, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 114.33756805807623, |
|
"eval_accuracy": 0.9116951165625715, |
|
"eval_loss": 0.40717950463294983, |
|
"eval_runtime": 52.6815, |
|
"eval_samples_per_second": 83.616, |
|
"eval_steps_per_second": 2.62, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 115.24500907441016, |
|
"grad_norm": 1.6669822931289673, |
|
"learning_rate": 3.65e-05, |
|
"loss": 0.2477, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 115.24500907441016, |
|
"eval_accuracy": 0.9123295778283549, |
|
"eval_loss": 0.40226927399635315, |
|
"eval_runtime": 53.2555, |
|
"eval_samples_per_second": 82.714, |
|
"eval_steps_per_second": 2.591, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 116.1524500907441, |
|
"grad_norm": 1.631198525428772, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.2462, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 116.1524500907441, |
|
"eval_accuracy": 0.9121495562330304, |
|
"eval_loss": 0.4079442322254181, |
|
"eval_runtime": 52.7267, |
|
"eval_samples_per_second": 83.544, |
|
"eval_steps_per_second": 2.617, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 117.05989110707804, |
|
"grad_norm": 1.7638319730758667, |
|
"learning_rate": 3.55e-05, |
|
"loss": 0.2472, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 117.05989110707804, |
|
"eval_accuracy": 0.9112412273671573, |
|
"eval_loss": 0.40657439827919006, |
|
"eval_runtime": 52.7752, |
|
"eval_samples_per_second": 83.467, |
|
"eval_steps_per_second": 2.615, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 117.96733212341198, |
|
"grad_norm": 1.821175217628479, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.2436, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 117.96733212341198, |
|
"eval_accuracy": 0.9124436813445042, |
|
"eval_loss": 0.40668636560440063, |
|
"eval_runtime": 52.7669, |
|
"eval_samples_per_second": 83.48, |
|
"eval_steps_per_second": 2.615, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 118.87477313974591, |
|
"grad_norm": 1.8072514533996582, |
|
"learning_rate": 3.45e-05, |
|
"loss": 0.2432, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 118.87477313974591, |
|
"eval_accuracy": 0.9114819864290975, |
|
"eval_loss": 0.4095401167869568, |
|
"eval_runtime": 52.6486, |
|
"eval_samples_per_second": 83.668, |
|
"eval_steps_per_second": 2.621, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 119.78221415607986, |
|
"grad_norm": 1.8061636686325073, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.2406, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 119.78221415607986, |
|
"eval_accuracy": 0.9118786110814227, |
|
"eval_loss": 0.4098331332206726, |
|
"eval_runtime": 52.7138, |
|
"eval_samples_per_second": 83.564, |
|
"eval_steps_per_second": 2.618, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 120.6896551724138, |
|
"grad_norm": 1.9020588397979736, |
|
"learning_rate": 3.35e-05, |
|
"loss": 0.2421, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 120.6896551724138, |
|
"eval_accuracy": 0.9126196512325608, |
|
"eval_loss": 0.4041764736175537, |
|
"eval_runtime": 52.7206, |
|
"eval_samples_per_second": 83.554, |
|
"eval_steps_per_second": 2.618, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 121.59709618874773, |
|
"grad_norm": 1.8141471147537231, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.2407, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 121.59709618874773, |
|
"eval_accuracy": 0.9133736295696817, |
|
"eval_loss": 0.40654563903808594, |
|
"eval_runtime": 52.5772, |
|
"eval_samples_per_second": 83.782, |
|
"eval_steps_per_second": 2.625, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 122.50453720508168, |
|
"grad_norm": 1.9479206800460815, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.2392, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 122.50453720508168, |
|
"eval_accuracy": 0.9131637376284198, |
|
"eval_loss": 0.4058144688606262, |
|
"eval_runtime": 57.3018, |
|
"eval_samples_per_second": 76.874, |
|
"eval_steps_per_second": 2.408, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 123.41197822141561, |
|
"grad_norm": 2.147027015686035, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.2377, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 123.41197822141561, |
|
"eval_accuracy": 0.9131377603207724, |
|
"eval_loss": 0.4089277386665344, |
|
"eval_runtime": 52.5836, |
|
"eval_samples_per_second": 83.771, |
|
"eval_steps_per_second": 2.624, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 124.31941923774954, |
|
"grad_norm": 1.95304536819458, |
|
"learning_rate": 3.15e-05, |
|
"loss": 0.2372, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 124.31941923774954, |
|
"eval_accuracy": 0.9145953679805113, |
|
"eval_loss": 0.4008789658546448, |
|
"eval_runtime": 52.5737, |
|
"eval_samples_per_second": 83.787, |
|
"eval_steps_per_second": 2.625, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 125.22686025408349, |
|
"grad_norm": 1.8719152212142944, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.234, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 125.22686025408349, |
|
"eval_accuracy": 0.9142573977063571, |
|
"eval_loss": 0.4005224108695984, |
|
"eval_runtime": 54.591, |
|
"eval_samples_per_second": 80.691, |
|
"eval_steps_per_second": 2.528, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 126.13430127041742, |
|
"grad_norm": 1.9268224239349365, |
|
"learning_rate": 3.05e-05, |
|
"loss": 0.2342, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 126.13430127041742, |
|
"eval_accuracy": 0.9132593660123727, |
|
"eval_loss": 0.41198381781578064, |
|
"eval_runtime": 52.5548, |
|
"eval_samples_per_second": 83.817, |
|
"eval_steps_per_second": 2.626, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 127.04174228675136, |
|
"grad_norm": 1.9191150665283203, |
|
"learning_rate": 3e-05, |
|
"loss": 0.2348, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 127.04174228675136, |
|
"eval_accuracy": 0.913298348179974, |
|
"eval_loss": 0.40492549538612366, |
|
"eval_runtime": 52.6333, |
|
"eval_samples_per_second": 83.692, |
|
"eval_steps_per_second": 2.622, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 127.9491833030853, |
|
"grad_norm": 1.5569913387298584, |
|
"learning_rate": 2.95e-05, |
|
"loss": 0.2321, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 127.9491833030853, |
|
"eval_accuracy": 0.9125856672390897, |
|
"eval_loss": 0.4051525592803955, |
|
"eval_runtime": 52.6574, |
|
"eval_samples_per_second": 83.654, |
|
"eval_steps_per_second": 2.621, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 128.85662431941924, |
|
"grad_norm": 1.7746883630752563, |
|
"learning_rate": 2.9e-05, |
|
"loss": 0.2294, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 128.85662431941924, |
|
"eval_accuracy": 0.9139568516478013, |
|
"eval_loss": 0.4073280692100525, |
|
"eval_runtime": 52.6696, |
|
"eval_samples_per_second": 83.635, |
|
"eval_steps_per_second": 2.62, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 129.76406533575317, |
|
"grad_norm": 1.4891724586486816, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.2285, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 129.76406533575317, |
|
"eval_accuracy": 0.9131215447858751, |
|
"eval_loss": 0.40580666065216064, |
|
"eval_runtime": 52.7507, |
|
"eval_samples_per_second": 83.506, |
|
"eval_steps_per_second": 2.616, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 130.6715063520871, |
|
"grad_norm": 1.5842249393463135, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.2285, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 130.6715063520871, |
|
"eval_accuracy": 0.9142407479255579, |
|
"eval_loss": 0.4018247723579407, |
|
"eval_runtime": 54.7129, |
|
"eval_samples_per_second": 80.511, |
|
"eval_steps_per_second": 2.522, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 131.57894736842104, |
|
"grad_norm": 1.8539658784866333, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.2285, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 131.57894736842104, |
|
"eval_accuracy": 0.914012928454821, |
|
"eval_loss": 0.4047853946685791, |
|
"eval_runtime": 53.2071, |
|
"eval_samples_per_second": 82.79, |
|
"eval_steps_per_second": 2.594, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 132.486388384755, |
|
"grad_norm": 1.4111963510513306, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.227, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 132.486388384755, |
|
"eval_accuracy": 0.913627272698819, |
|
"eval_loss": 0.4063122570514679, |
|
"eval_runtime": 54.764, |
|
"eval_samples_per_second": 80.436, |
|
"eval_steps_per_second": 2.52, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 133.39382940108894, |
|
"grad_norm": 1.7000839710235596, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 0.2227, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 133.39382940108894, |
|
"eval_accuracy": 0.9136585735388811, |
|
"eval_loss": 0.40767940878868103, |
|
"eval_runtime": 52.6716, |
|
"eval_samples_per_second": 83.631, |
|
"eval_steps_per_second": 2.62, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 134.30127041742287, |
|
"grad_norm": 1.7322769165039062, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.2227, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 134.30127041742287, |
|
"eval_accuracy": 0.9146321198686937, |
|
"eval_loss": 0.40449175238609314, |
|
"eval_runtime": 57.3022, |
|
"eval_samples_per_second": 76.873, |
|
"eval_steps_per_second": 2.408, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 135.2087114337568, |
|
"grad_norm": 1.8843836784362793, |
|
"learning_rate": 2.5500000000000003e-05, |
|
"loss": 0.2227, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 135.2087114337568, |
|
"eval_accuracy": 0.9153162670489917, |
|
"eval_loss": 0.3996308743953705, |
|
"eval_runtime": 52.9537, |
|
"eval_samples_per_second": 83.186, |
|
"eval_steps_per_second": 2.606, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 136.11615245009074, |
|
"grad_norm": 1.688589334487915, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.2228, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 136.11615245009074, |
|
"eval_accuracy": 0.9148934837092731, |
|
"eval_loss": 0.4007312059402466, |
|
"eval_runtime": 52.7857, |
|
"eval_samples_per_second": 83.451, |
|
"eval_steps_per_second": 2.614, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 137.02359346642467, |
|
"grad_norm": 1.886564016342163, |
|
"learning_rate": 2.45e-05, |
|
"loss": 0.2222, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 137.02359346642467, |
|
"eval_accuracy": 0.9137650871178321, |
|
"eval_loss": 0.40756621956825256, |
|
"eval_runtime": 52.9485, |
|
"eval_samples_per_second": 83.194, |
|
"eval_steps_per_second": 2.606, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 137.93103448275863, |
|
"grad_norm": 1.569810152053833, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.2186, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 137.93103448275863, |
|
"eval_accuracy": 0.9148974119075408, |
|
"eval_loss": 0.4073057770729065, |
|
"eval_runtime": 52.8219, |
|
"eval_samples_per_second": 83.393, |
|
"eval_steps_per_second": 2.613, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 138.83847549909257, |
|
"grad_norm": 1.5704463720321655, |
|
"learning_rate": 2.35e-05, |
|
"loss": 0.2189, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 138.83847549909257, |
|
"eval_accuracy": 0.9138604511070603, |
|
"eval_loss": 0.40438133478164673, |
|
"eval_runtime": 52.8176, |
|
"eval_samples_per_second": 83.4, |
|
"eval_steps_per_second": 2.613, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 139.7459165154265, |
|
"grad_norm": 1.7239934206008911, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.2171, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 139.7459165154265, |
|
"eval_accuracy": 0.9152970248903189, |
|
"eval_loss": 0.4021734297275543, |
|
"eval_runtime": 53.338, |
|
"eval_samples_per_second": 82.587, |
|
"eval_steps_per_second": 2.587, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 140.65335753176043, |
|
"grad_norm": 1.79320228099823, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.2167, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 140.65335753176043, |
|
"eval_accuracy": 0.9152779126251601, |
|
"eval_loss": 0.3990631401538849, |
|
"eval_runtime": 53.0839, |
|
"eval_samples_per_second": 82.982, |
|
"eval_steps_per_second": 2.6, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 141.56079854809437, |
|
"grad_norm": 1.6768089532852173, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.2164, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 141.56079854809437, |
|
"eval_accuracy": 0.9149024426267115, |
|
"eval_loss": 0.4050694704055786, |
|
"eval_runtime": 53.0695, |
|
"eval_samples_per_second": 83.004, |
|
"eval_steps_per_second": 2.6, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 142.4682395644283, |
|
"grad_norm": 1.9307670593261719, |
|
"learning_rate": 2.15e-05, |
|
"loss": 0.2154, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 142.4682395644283, |
|
"eval_accuracy": 0.9162203399408525, |
|
"eval_loss": 0.3998095691204071, |
|
"eval_runtime": 53.1001, |
|
"eval_samples_per_second": 82.956, |
|
"eval_steps_per_second": 2.599, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 143.37568058076226, |
|
"grad_norm": 1.617890477180481, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.215, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 143.37568058076226, |
|
"eval_accuracy": 0.9160746394397012, |
|
"eval_loss": 0.40022924542427063, |
|
"eval_runtime": 53.1216, |
|
"eval_samples_per_second": 82.923, |
|
"eval_steps_per_second": 2.598, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 144.2831215970962, |
|
"grad_norm": 2.0667710304260254, |
|
"learning_rate": 2.05e-05, |
|
"loss": 0.2126, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 144.2831215970962, |
|
"eval_accuracy": 0.9151828126910668, |
|
"eval_loss": 0.4023064076900482, |
|
"eval_runtime": 53.0659, |
|
"eval_samples_per_second": 83.01, |
|
"eval_steps_per_second": 2.601, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 145.19056261343013, |
|
"grad_norm": 1.772654414176941, |
|
"learning_rate": 2e-05, |
|
"loss": 0.2134, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 145.19056261343013, |
|
"eval_accuracy": 0.9151423035495024, |
|
"eval_loss": 0.4016391932964325, |
|
"eval_runtime": 52.86, |
|
"eval_samples_per_second": 83.333, |
|
"eval_steps_per_second": 2.611, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 146.09800362976407, |
|
"grad_norm": 1.6949107646942139, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 0.2121, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 146.09800362976407, |
|
"eval_accuracy": 0.9157573098498856, |
|
"eval_loss": 0.40052202343940735, |
|
"eval_runtime": 52.9508, |
|
"eval_samples_per_second": 83.19, |
|
"eval_steps_per_second": 2.606, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 147.005444646098, |
|
"grad_norm": 1.7470875978469849, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.2118, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 147.005444646098, |
|
"eval_accuracy": 0.9147493650748915, |
|
"eval_loss": 0.4061746895313263, |
|
"eval_runtime": 53.021, |
|
"eval_samples_per_second": 83.08, |
|
"eval_steps_per_second": 2.603, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 147.91288566243193, |
|
"grad_norm": 1.6520947217941284, |
|
"learning_rate": 1.85e-05, |
|
"loss": 0.2092, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 147.91288566243193, |
|
"eval_accuracy": 0.9153819887159277, |
|
"eval_loss": 0.4039769172668457, |
|
"eval_runtime": 52.9737, |
|
"eval_samples_per_second": 83.155, |
|
"eval_steps_per_second": 2.605, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 148.82032667876587, |
|
"grad_norm": 1.625849962234497, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.2071, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 148.82032667876587, |
|
"eval_accuracy": 0.9155334497970264, |
|
"eval_loss": 0.4043760895729065, |
|
"eval_runtime": 52.9528, |
|
"eval_samples_per_second": 83.187, |
|
"eval_steps_per_second": 2.606, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 149.72776769509983, |
|
"grad_norm": 1.706663727760315, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.2049, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 149.72776769509983, |
|
"eval_accuracy": 0.9146216897066012, |
|
"eval_loss": 0.4076862335205078, |
|
"eval_runtime": 53.5645, |
|
"eval_samples_per_second": 82.237, |
|
"eval_steps_per_second": 2.576, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 150.63520871143376, |
|
"grad_norm": 1.9129126071929932, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.2072, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 150.63520871143376, |
|
"eval_accuracy": 0.9153711232101976, |
|
"eval_loss": 0.39911210536956787, |
|
"eval_runtime": 53.0193, |
|
"eval_samples_per_second": 83.083, |
|
"eval_steps_per_second": 2.603, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 151.5426497277677, |
|
"grad_norm": 1.7741316556930542, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.2048, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 151.5426497277677, |
|
"eval_accuracy": 0.9165385170632505, |
|
"eval_loss": 0.3991451859474182, |
|
"eval_runtime": 53.0232, |
|
"eval_samples_per_second": 83.077, |
|
"eval_steps_per_second": 2.603, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 152.45009074410163, |
|
"grad_norm": 1.6072008609771729, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.2064, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 152.45009074410163, |
|
"eval_accuracy": 0.9149193775458487, |
|
"eval_loss": 0.40828338265419006, |
|
"eval_runtime": 52.9412, |
|
"eval_samples_per_second": 83.206, |
|
"eval_steps_per_second": 2.607, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 153.35753176043556, |
|
"grad_norm": 1.7185778617858887, |
|
"learning_rate": 1.55e-05, |
|
"loss": 0.2061, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 153.35753176043556, |
|
"eval_accuracy": 0.9158725538979593, |
|
"eval_loss": 0.40011066198349, |
|
"eval_runtime": 52.978, |
|
"eval_samples_per_second": 83.148, |
|
"eval_steps_per_second": 2.605, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 154.2649727767695, |
|
"grad_norm": 1.7340868711471558, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.202, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 154.2649727767695, |
|
"eval_accuracy": 0.9167605678134148, |
|
"eval_loss": 0.3951858580112457, |
|
"eval_runtime": 53.0994, |
|
"eval_samples_per_second": 82.958, |
|
"eval_steps_per_second": 2.599, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 155.17241379310346, |
|
"grad_norm": 1.725895881652832, |
|
"learning_rate": 1.45e-05, |
|
"loss": 0.2007, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 155.17241379310346, |
|
"eval_accuracy": 0.9168939812952535, |
|
"eval_loss": 0.3987417221069336, |
|
"eval_runtime": 54.957, |
|
"eval_samples_per_second": 80.154, |
|
"eval_steps_per_second": 2.511, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 156.0798548094374, |
|
"grad_norm": 1.5828238725662231, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.2015, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 156.0798548094374, |
|
"eval_accuracy": 0.917391414109539, |
|
"eval_loss": 0.4005061388015747, |
|
"eval_runtime": 52.8456, |
|
"eval_samples_per_second": 83.356, |
|
"eval_steps_per_second": 2.611, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 156.98729582577133, |
|
"grad_norm": 1.631608247756958, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.2017, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 156.98729582577133, |
|
"eval_accuracy": 0.9161869769340681, |
|
"eval_loss": 0.40137505531311035, |
|
"eval_runtime": 53.0267, |
|
"eval_samples_per_second": 83.071, |
|
"eval_steps_per_second": 2.602, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 157.89473684210526, |
|
"grad_norm": 1.9344474077224731, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.1977, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 157.89473684210526, |
|
"eval_accuracy": 0.9169892177992084, |
|
"eval_loss": 0.39951831102371216, |
|
"eval_runtime": 52.9394, |
|
"eval_samples_per_second": 83.208, |
|
"eval_steps_per_second": 2.607, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 158.8021778584392, |
|
"grad_norm": 1.6796910762786865, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.1994, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 158.8021778584392, |
|
"eval_accuracy": 0.9157372947418714, |
|
"eval_loss": 0.4002035856246948, |
|
"eval_runtime": 52.8869, |
|
"eval_samples_per_second": 83.291, |
|
"eval_steps_per_second": 2.609, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 159.70961887477313, |
|
"grad_norm": 1.5331308841705322, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.1987, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 159.70961887477313, |
|
"eval_accuracy": 0.9159691578071264, |
|
"eval_loss": 0.40263810753822327, |
|
"eval_runtime": 57.4412, |
|
"eval_samples_per_second": 76.687, |
|
"eval_steps_per_second": 2.402, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 160.6170598911071, |
|
"grad_norm": 1.8451423645019531, |
|
"learning_rate": 1.1500000000000002e-05, |
|
"loss": 0.1985, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 160.6170598911071, |
|
"eval_accuracy": 0.91600935818353, |
|
"eval_loss": 0.4028289318084717, |
|
"eval_runtime": 53.2779, |
|
"eval_samples_per_second": 82.68, |
|
"eval_steps_per_second": 2.59, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 161.52450090744102, |
|
"grad_norm": 1.6251318454742432, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.1976, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 161.52450090744102, |
|
"eval_accuracy": 0.9168077236536895, |
|
"eval_loss": 0.39661651849746704, |
|
"eval_runtime": 54.3943, |
|
"eval_samples_per_second": 80.983, |
|
"eval_steps_per_second": 2.537, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 162.43194192377496, |
|
"grad_norm": 1.906327486038208, |
|
"learning_rate": 1.05e-05, |
|
"loss": 0.1975, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 162.43194192377496, |
|
"eval_accuracy": 0.9169821754553538, |
|
"eval_loss": 0.3963495194911957, |
|
"eval_runtime": 53.0048, |
|
"eval_samples_per_second": 83.106, |
|
"eval_steps_per_second": 2.604, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 163.3393829401089, |
|
"grad_norm": 1.9544309377670288, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1963, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 163.3393829401089, |
|
"eval_accuracy": 0.9164625099202927, |
|
"eval_loss": 0.404565691947937, |
|
"eval_runtime": 53.9246, |
|
"eval_samples_per_second": 81.688, |
|
"eval_steps_per_second": 2.559, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 164.24682395644282, |
|
"grad_norm": 1.852169156074524, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.1963, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 164.24682395644282, |
|
"eval_accuracy": 0.9155449059728268, |
|
"eval_loss": 0.406656414270401, |
|
"eval_runtime": 52.9408, |
|
"eval_samples_per_second": 83.206, |
|
"eval_steps_per_second": 2.607, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 165.15426497277676, |
|
"grad_norm": 1.7243677377700806, |
|
"learning_rate": 9e-06, |
|
"loss": 0.1985, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 165.15426497277676, |
|
"eval_accuracy": 0.9178093382768805, |
|
"eval_loss": 0.39525070786476135, |
|
"eval_runtime": 52.9935, |
|
"eval_samples_per_second": 83.123, |
|
"eval_steps_per_second": 2.604, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 166.06170598911072, |
|
"grad_norm": 1.7043794393539429, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 0.1936, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 166.06170598911072, |
|
"eval_accuracy": 0.9167059652035036, |
|
"eval_loss": 0.3972921669483185, |
|
"eval_runtime": 52.9476, |
|
"eval_samples_per_second": 83.195, |
|
"eval_steps_per_second": 2.606, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 166.96914700544465, |
|
"grad_norm": 1.6746214628219604, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.1937, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 166.96914700544465, |
|
"eval_accuracy": 0.9164150593646615, |
|
"eval_loss": 0.40151235461235046, |
|
"eval_runtime": 52.9555, |
|
"eval_samples_per_second": 83.183, |
|
"eval_steps_per_second": 2.606, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 167.8765880217786, |
|
"grad_norm": 1.724612832069397, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.1936, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 167.8765880217786, |
|
"eval_accuracy": 0.9173433401523613, |
|
"eval_loss": 0.40141040086746216, |
|
"eval_runtime": 53.1703, |
|
"eval_samples_per_second": 82.847, |
|
"eval_steps_per_second": 2.595, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 168.78402903811252, |
|
"grad_norm": 1.8417091369628906, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.1923, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 168.78402903811252, |
|
"eval_accuracy": 0.9186817674871586, |
|
"eval_loss": 0.39400893449783325, |
|
"eval_runtime": 56.2134, |
|
"eval_samples_per_second": 78.362, |
|
"eval_steps_per_second": 2.455, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 169.69147005444646, |
|
"grad_norm": 1.7512474060058594, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 0.1933, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 169.69147005444646, |
|
"eval_accuracy": 0.9175796477495107, |
|
"eval_loss": 0.3983522951602936, |
|
"eval_runtime": 52.9057, |
|
"eval_samples_per_second": 83.261, |
|
"eval_steps_per_second": 2.608, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 170.5989110707804, |
|
"grad_norm": 1.4073301553726196, |
|
"learning_rate": 6e-06, |
|
"loss": 0.1935, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 170.5989110707804, |
|
"eval_accuracy": 0.9187447843680622, |
|
"eval_loss": 0.3936294913291931, |
|
"eval_runtime": 52.7794, |
|
"eval_samples_per_second": 83.461, |
|
"eval_steps_per_second": 2.615, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 171.50635208711435, |
|
"grad_norm": 1.71249520778656, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 0.1932, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 171.50635208711435, |
|
"eval_accuracy": 0.9183512090422428, |
|
"eval_loss": 0.3950323760509491, |
|
"eval_runtime": 52.9329, |
|
"eval_samples_per_second": 83.219, |
|
"eval_steps_per_second": 2.607, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 172.41379310344828, |
|
"grad_norm": 1.4874622821807861, |
|
"learning_rate": 5e-06, |
|
"loss": 0.1915, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 172.41379310344828, |
|
"eval_accuracy": 0.9178856529139476, |
|
"eval_loss": 0.39599481225013733, |
|
"eval_runtime": 51.846, |
|
"eval_samples_per_second": 84.963, |
|
"eval_steps_per_second": 2.662, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 173.32123411978222, |
|
"grad_norm": 1.898534893989563, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.1891, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 173.32123411978222, |
|
"eval_accuracy": 0.918700790482899, |
|
"eval_loss": 0.38806891441345215, |
|
"eval_runtime": 53.2348, |
|
"eval_samples_per_second": 82.747, |
|
"eval_steps_per_second": 2.592, |
|
"step": 95500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 182, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.06414374804652e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|