xlm-r_ckb-arab / trainer_state.json
DGurgurov's picture
Uploading checkpoint-95500 for xlm-r - ckb-arab
6fb39eb verified
{
"best_metric": 0.38806891441345215,
"best_model_checkpoint": "./model_fine-tune/glot/xlm-r/ckb-Arab/checkpoint-95500",
"epoch": 173.32123411978222,
"eval_steps": 500,
"global_step": 95500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.9074410163339383,
"grad_norm": 3.4398534297943115,
"learning_rate": 9.95e-05,
"loss": 1.6403,
"step": 500
},
{
"epoch": 0.9074410163339383,
"eval_accuracy": 0.752505506780714,
"eval_loss": 1.153350591659546,
"eval_runtime": 53.4289,
"eval_samples_per_second": 82.446,
"eval_steps_per_second": 2.583,
"step": 500
},
{
"epoch": 1.8148820326678767,
"grad_norm": 2.854556083679199,
"learning_rate": 9.900000000000001e-05,
"loss": 1.1266,
"step": 1000
},
{
"epoch": 1.8148820326678767,
"eval_accuracy": 0.7904873277235048,
"eval_loss": 0.9590327143669128,
"eval_runtime": 53.4057,
"eval_samples_per_second": 82.482,
"eval_steps_per_second": 2.584,
"step": 1000
},
{
"epoch": 2.722323049001815,
"grad_norm": 2.9346060752868652,
"learning_rate": 9.850000000000001e-05,
"loss": 0.9785,
"step": 1500
},
{
"epoch": 2.722323049001815,
"eval_accuracy": 0.8084301658954617,
"eval_loss": 0.8642853498458862,
"eval_runtime": 59.5918,
"eval_samples_per_second": 73.92,
"eval_steps_per_second": 2.316,
"step": 1500
},
{
"epoch": 3.629764065335753,
"grad_norm": 2.695502996444702,
"learning_rate": 9.8e-05,
"loss": 0.8933,
"step": 2000
},
{
"epoch": 3.629764065335753,
"eval_accuracy": 0.8202711962432342,
"eval_loss": 0.8070259094238281,
"eval_runtime": 54.3445,
"eval_samples_per_second": 81.057,
"eval_steps_per_second": 2.539,
"step": 2000
},
{
"epoch": 4.537205081669692,
"grad_norm": 2.3147311210632324,
"learning_rate": 9.75e-05,
"loss": 0.8354,
"step": 2500
},
{
"epoch": 4.537205081669692,
"eval_accuracy": 0.8293452619955832,
"eval_loss": 0.7662876844406128,
"eval_runtime": 54.1554,
"eval_samples_per_second": 81.34,
"eval_steps_per_second": 2.548,
"step": 2500
},
{
"epoch": 5.44464609800363,
"grad_norm": 2.6086342334747314,
"learning_rate": 9.7e-05,
"loss": 0.7909,
"step": 3000
},
{
"epoch": 5.44464609800363,
"eval_accuracy": 0.8355002523340903,
"eval_loss": 0.7356261610984802,
"eval_runtime": 53.6095,
"eval_samples_per_second": 82.168,
"eval_steps_per_second": 2.574,
"step": 3000
},
{
"epoch": 6.352087114337568,
"grad_norm": 2.1992199420928955,
"learning_rate": 9.65e-05,
"loss": 0.7546,
"step": 3500
},
{
"epoch": 6.352087114337568,
"eval_accuracy": 0.8423681886232234,
"eval_loss": 0.6986920237541199,
"eval_runtime": 55.6018,
"eval_samples_per_second": 79.224,
"eval_steps_per_second": 2.482,
"step": 3500
},
{
"epoch": 7.259528130671506,
"grad_norm": 2.3473734855651855,
"learning_rate": 9.6e-05,
"loss": 0.7292,
"step": 4000
},
{
"epoch": 7.259528130671506,
"eval_accuracy": 0.8470036500248151,
"eval_loss": 0.6758582592010498,
"eval_runtime": 54.4017,
"eval_samples_per_second": 80.972,
"eval_steps_per_second": 2.537,
"step": 4000
},
{
"epoch": 8.166969147005444,
"grad_norm": 2.585818290710449,
"learning_rate": 9.55e-05,
"loss": 0.7025,
"step": 4500
},
{
"epoch": 8.166969147005444,
"eval_accuracy": 0.8527411894826806,
"eval_loss": 0.6568289399147034,
"eval_runtime": 57.7681,
"eval_samples_per_second": 76.253,
"eval_steps_per_second": 2.389,
"step": 4500
},
{
"epoch": 9.074410163339383,
"grad_norm": 2.4189865589141846,
"learning_rate": 9.5e-05,
"loss": 0.68,
"step": 5000
},
{
"epoch": 9.074410163339383,
"eval_accuracy": 0.8559500862970382,
"eval_loss": 0.6354258060455322,
"eval_runtime": 59.1091,
"eval_samples_per_second": 74.523,
"eval_steps_per_second": 2.335,
"step": 5000
},
{
"epoch": 9.98185117967332,
"grad_norm": 2.443392515182495,
"learning_rate": 9.449999999999999e-05,
"loss": 0.6645,
"step": 5500
},
{
"epoch": 9.98185117967332,
"eval_accuracy": 0.8591041578043516,
"eval_loss": 0.6224693655967712,
"eval_runtime": 53.9006,
"eval_samples_per_second": 81.725,
"eval_steps_per_second": 2.56,
"step": 5500
},
{
"epoch": 10.88929219600726,
"grad_norm": 2.1055374145507812,
"learning_rate": 9.4e-05,
"loss": 0.6385,
"step": 6000
},
{
"epoch": 10.88929219600726,
"eval_accuracy": 0.8609373044675259,
"eval_loss": 0.6164940595626831,
"eval_runtime": 53.8382,
"eval_samples_per_second": 81.819,
"eval_steps_per_second": 2.563,
"step": 6000
},
{
"epoch": 11.796733212341199,
"grad_norm": 1.9976396560668945,
"learning_rate": 9.350000000000001e-05,
"loss": 0.6305,
"step": 6500
},
{
"epoch": 11.796733212341199,
"eval_accuracy": 0.8621495473876636,
"eval_loss": 0.6023448705673218,
"eval_runtime": 54.3137,
"eval_samples_per_second": 81.103,
"eval_steps_per_second": 2.541,
"step": 6500
},
{
"epoch": 12.704174228675136,
"grad_norm": 2.6231155395507812,
"learning_rate": 9.300000000000001e-05,
"loss": 0.6143,
"step": 7000
},
{
"epoch": 12.704174228675136,
"eval_accuracy": 0.8642217993678152,
"eval_loss": 0.5955979824066162,
"eval_runtime": 55.8185,
"eval_samples_per_second": 78.916,
"eval_steps_per_second": 2.472,
"step": 7000
},
{
"epoch": 13.611615245009075,
"grad_norm": 2.0853583812713623,
"learning_rate": 9.250000000000001e-05,
"loss": 0.6012,
"step": 7500
},
{
"epoch": 13.611615245009075,
"eval_accuracy": 0.8669231710723286,
"eval_loss": 0.5834583044052124,
"eval_runtime": 52.8539,
"eval_samples_per_second": 83.343,
"eval_steps_per_second": 2.611,
"step": 7500
},
{
"epoch": 14.519056261343012,
"grad_norm": 1.9873307943344116,
"learning_rate": 9.200000000000001e-05,
"loss": 0.59,
"step": 8000
},
{
"epoch": 14.519056261343012,
"eval_accuracy": 0.8700438033381165,
"eval_loss": 0.5704456567764282,
"eval_runtime": 52.8195,
"eval_samples_per_second": 83.397,
"eval_steps_per_second": 2.613,
"step": 8000
},
{
"epoch": 15.426497277676951,
"grad_norm": 2.106224536895752,
"learning_rate": 9.15e-05,
"loss": 0.5781,
"step": 8500
},
{
"epoch": 15.426497277676951,
"eval_accuracy": 0.8698398656877739,
"eval_loss": 0.5724136233329773,
"eval_runtime": 59.543,
"eval_samples_per_second": 73.98,
"eval_steps_per_second": 2.318,
"step": 8500
},
{
"epoch": 16.33393829401089,
"grad_norm": 2.0043249130249023,
"learning_rate": 9.1e-05,
"loss": 0.5675,
"step": 9000
},
{
"epoch": 16.33393829401089,
"eval_accuracy": 0.8715064286118855,
"eval_loss": 0.5624237656593323,
"eval_runtime": 53.5321,
"eval_samples_per_second": 82.287,
"eval_steps_per_second": 2.578,
"step": 9000
},
{
"epoch": 17.24137931034483,
"grad_norm": 2.078568935394287,
"learning_rate": 9.05e-05,
"loss": 0.5566,
"step": 9500
},
{
"epoch": 17.24137931034483,
"eval_accuracy": 0.872356935014549,
"eval_loss": 0.5570796728134155,
"eval_runtime": 52.8375,
"eval_samples_per_second": 83.369,
"eval_steps_per_second": 2.612,
"step": 9500
},
{
"epoch": 18.148820326678766,
"grad_norm": 2.026803731918335,
"learning_rate": 9e-05,
"loss": 0.5533,
"step": 10000
},
{
"epoch": 18.148820326678766,
"eval_accuracy": 0.875522907428112,
"eval_loss": 0.5457433462142944,
"eval_runtime": 52.7025,
"eval_samples_per_second": 83.582,
"eval_steps_per_second": 2.618,
"step": 10000
},
{
"epoch": 19.056261343012704,
"grad_norm": 1.8563569784164429,
"learning_rate": 8.950000000000001e-05,
"loss": 0.5396,
"step": 10500
},
{
"epoch": 19.056261343012704,
"eval_accuracy": 0.8741645771312478,
"eval_loss": 0.5479493737220764,
"eval_runtime": 52.8826,
"eval_samples_per_second": 83.298,
"eval_steps_per_second": 2.61,
"step": 10500
},
{
"epoch": 19.96370235934664,
"grad_norm": 2.1099376678466797,
"learning_rate": 8.900000000000001e-05,
"loss": 0.5359,
"step": 11000
},
{
"epoch": 19.96370235934664,
"eval_accuracy": 0.8773543802678719,
"eval_loss": 0.5376391410827637,
"eval_runtime": 53.3193,
"eval_samples_per_second": 82.615,
"eval_steps_per_second": 2.588,
"step": 11000
},
{
"epoch": 20.87114337568058,
"grad_norm": 2.5443553924560547,
"learning_rate": 8.850000000000001e-05,
"loss": 0.5246,
"step": 11500
},
{
"epoch": 20.87114337568058,
"eval_accuracy": 0.8773027699144483,
"eval_loss": 0.5346177816390991,
"eval_runtime": 51.9467,
"eval_samples_per_second": 84.799,
"eval_steps_per_second": 2.657,
"step": 11500
},
{
"epoch": 21.77858439201452,
"grad_norm": 2.047163963317871,
"learning_rate": 8.800000000000001e-05,
"loss": 0.5173,
"step": 12000
},
{
"epoch": 21.77858439201452,
"eval_accuracy": 0.880544279055781,
"eval_loss": 0.5207871794700623,
"eval_runtime": 62.4765,
"eval_samples_per_second": 70.506,
"eval_steps_per_second": 2.209,
"step": 12000
},
{
"epoch": 22.686025408348456,
"grad_norm": 1.9300446510314941,
"learning_rate": 8.75e-05,
"loss": 0.5111,
"step": 12500
},
{
"epoch": 22.686025408348456,
"eval_accuracy": 0.8803666290669171,
"eval_loss": 0.5259021520614624,
"eval_runtime": 53.0987,
"eval_samples_per_second": 82.959,
"eval_steps_per_second": 2.599,
"step": 12500
},
{
"epoch": 23.593466424682397,
"grad_norm": 2.314628839492798,
"learning_rate": 8.7e-05,
"loss": 0.505,
"step": 13000
},
{
"epoch": 23.593466424682397,
"eval_accuracy": 0.8821161209658513,
"eval_loss": 0.5170288681983948,
"eval_runtime": 60.4353,
"eval_samples_per_second": 72.888,
"eval_steps_per_second": 2.283,
"step": 13000
},
{
"epoch": 24.500907441016334,
"grad_norm": 2.187793493270874,
"learning_rate": 8.65e-05,
"loss": 0.4995,
"step": 13500
},
{
"epoch": 24.500907441016334,
"eval_accuracy": 0.8803155533947083,
"eval_loss": 0.5203161835670471,
"eval_runtime": 54.2267,
"eval_samples_per_second": 81.233,
"eval_steps_per_second": 2.545,
"step": 13500
},
{
"epoch": 25.40834845735027,
"grad_norm": 1.8629887104034424,
"learning_rate": 8.6e-05,
"loss": 0.4933,
"step": 14000
},
{
"epoch": 25.40834845735027,
"eval_accuracy": 0.8827071990702727,
"eval_loss": 0.5080223679542542,
"eval_runtime": 53.122,
"eval_samples_per_second": 82.922,
"eval_steps_per_second": 2.598,
"step": 14000
},
{
"epoch": 26.31578947368421,
"grad_norm": 1.95268976688385,
"learning_rate": 8.55e-05,
"loss": 0.4841,
"step": 14500
},
{
"epoch": 26.31578947368421,
"eval_accuracy": 0.8852798720942991,
"eval_loss": 0.4982340335845947,
"eval_runtime": 55.7058,
"eval_samples_per_second": 79.076,
"eval_steps_per_second": 2.477,
"step": 14500
},
{
"epoch": 27.22323049001815,
"grad_norm": 1.9664937257766724,
"learning_rate": 8.5e-05,
"loss": 0.4769,
"step": 15000
},
{
"epoch": 27.22323049001815,
"eval_accuracy": 0.8844890070822018,
"eval_loss": 0.5071918964385986,
"eval_runtime": 57.0471,
"eval_samples_per_second": 77.217,
"eval_steps_per_second": 2.419,
"step": 15000
},
{
"epoch": 28.130671506352087,
"grad_norm": 2.0566840171813965,
"learning_rate": 8.450000000000001e-05,
"loss": 0.4782,
"step": 15500
},
{
"epoch": 28.130671506352087,
"eval_accuracy": 0.885741458637098,
"eval_loss": 0.4975322186946869,
"eval_runtime": 53.7809,
"eval_samples_per_second": 81.906,
"eval_steps_per_second": 2.566,
"step": 15500
},
{
"epoch": 29.038112522686024,
"grad_norm": 1.969655990600586,
"learning_rate": 8.4e-05,
"loss": 0.4728,
"step": 16000
},
{
"epoch": 29.038112522686024,
"eval_accuracy": 0.8861595126788497,
"eval_loss": 0.498710572719574,
"eval_runtime": 58.2764,
"eval_samples_per_second": 75.588,
"eval_steps_per_second": 2.368,
"step": 16000
},
{
"epoch": 29.945553539019965,
"grad_norm": 1.9814519882202148,
"learning_rate": 8.35e-05,
"loss": 0.4648,
"step": 16500
},
{
"epoch": 29.945553539019965,
"eval_accuracy": 0.8870825900148245,
"eval_loss": 0.49478381872177124,
"eval_runtime": 53.8125,
"eval_samples_per_second": 81.858,
"eval_steps_per_second": 2.564,
"step": 16500
},
{
"epoch": 30.852994555353902,
"grad_norm": 1.8299716711044312,
"learning_rate": 8.3e-05,
"loss": 0.4597,
"step": 17000
},
{
"epoch": 30.852994555353902,
"eval_accuracy": 0.8868082072258864,
"eval_loss": 0.49765515327453613,
"eval_runtime": 54.5146,
"eval_samples_per_second": 80.804,
"eval_steps_per_second": 2.531,
"step": 17000
},
{
"epoch": 31.76043557168784,
"grad_norm": 2.192680597305298,
"learning_rate": 8.25e-05,
"loss": 0.4569,
"step": 17500
},
{
"epoch": 31.76043557168784,
"eval_accuracy": 0.8874967002300411,
"eval_loss": 0.49416524171829224,
"eval_runtime": 53.3923,
"eval_samples_per_second": 82.503,
"eval_steps_per_second": 2.585,
"step": 17500
},
{
"epoch": 32.66787658802178,
"grad_norm": 2.5133163928985596,
"learning_rate": 8.2e-05,
"loss": 0.4488,
"step": 18000
},
{
"epoch": 32.66787658802178,
"eval_accuracy": 0.8887065044419662,
"eval_loss": 0.49053409695625305,
"eval_runtime": 56.0002,
"eval_samples_per_second": 78.66,
"eval_steps_per_second": 2.464,
"step": 18000
},
{
"epoch": 33.57531760435572,
"grad_norm": 1.596177339553833,
"learning_rate": 8.15e-05,
"loss": 0.4456,
"step": 18500
},
{
"epoch": 33.57531760435572,
"eval_accuracy": 0.8882017291247682,
"eval_loss": 0.4841141700744629,
"eval_runtime": 53.2223,
"eval_samples_per_second": 82.766,
"eval_steps_per_second": 2.593,
"step": 18500
},
{
"epoch": 34.48275862068966,
"grad_norm": 2.740516185760498,
"learning_rate": 8.1e-05,
"loss": 0.4439,
"step": 19000
},
{
"epoch": 34.48275862068966,
"eval_accuracy": 0.8906933029564148,
"eval_loss": 0.473172128200531,
"eval_runtime": 53.5999,
"eval_samples_per_second": 82.183,
"eval_steps_per_second": 2.575,
"step": 19000
},
{
"epoch": 35.39019963702359,
"grad_norm": 1.7900762557983398,
"learning_rate": 8.05e-05,
"loss": 0.435,
"step": 19500
},
{
"epoch": 35.39019963702359,
"eval_accuracy": 0.8905509902844176,
"eval_loss": 0.4774630069732666,
"eval_runtime": 53.7773,
"eval_samples_per_second": 81.912,
"eval_steps_per_second": 2.566,
"step": 19500
},
{
"epoch": 36.29764065335753,
"grad_norm": 1.9263832569122314,
"learning_rate": 8e-05,
"loss": 0.4355,
"step": 20000
},
{
"epoch": 36.29764065335753,
"eval_accuracy": 0.889703235016953,
"eval_loss": 0.4819239377975464,
"eval_runtime": 54.1016,
"eval_samples_per_second": 81.421,
"eval_steps_per_second": 2.551,
"step": 20000
},
{
"epoch": 37.20508166969147,
"grad_norm": 1.9390649795532227,
"learning_rate": 7.950000000000001e-05,
"loss": 0.4327,
"step": 20500
},
{
"epoch": 37.20508166969147,
"eval_accuracy": 0.8891170881908429,
"eval_loss": 0.48580387234687805,
"eval_runtime": 59.5973,
"eval_samples_per_second": 73.913,
"eval_steps_per_second": 2.316,
"step": 20500
},
{
"epoch": 38.11252268602541,
"grad_norm": 1.9884870052337646,
"learning_rate": 7.900000000000001e-05,
"loss": 0.4254,
"step": 21000
},
{
"epoch": 38.11252268602541,
"eval_accuracy": 0.8915290748428344,
"eval_loss": 0.4742184281349182,
"eval_runtime": 54.3464,
"eval_samples_per_second": 81.054,
"eval_steps_per_second": 2.539,
"step": 21000
},
{
"epoch": 39.01996370235935,
"grad_norm": 1.5046188831329346,
"learning_rate": 7.850000000000001e-05,
"loss": 0.4229,
"step": 21500
},
{
"epoch": 39.01996370235935,
"eval_accuracy": 0.8925229541919623,
"eval_loss": 0.47058817744255066,
"eval_runtime": 56.2804,
"eval_samples_per_second": 78.269,
"eval_steps_per_second": 2.452,
"step": 21500
},
{
"epoch": 39.92740471869328,
"grad_norm": 1.9617971181869507,
"learning_rate": 7.800000000000001e-05,
"loss": 0.4174,
"step": 22000
},
{
"epoch": 39.92740471869328,
"eval_accuracy": 0.8919159314703821,
"eval_loss": 0.4736374616622925,
"eval_runtime": 53.1824,
"eval_samples_per_second": 82.828,
"eval_steps_per_second": 2.595,
"step": 22000
},
{
"epoch": 40.83484573502722,
"grad_norm": 1.7383509874343872,
"learning_rate": 7.75e-05,
"loss": 0.4151,
"step": 22500
},
{
"epoch": 40.83484573502722,
"eval_accuracy": 0.8929338481208785,
"eval_loss": 0.46910572052001953,
"eval_runtime": 53.0493,
"eval_samples_per_second": 83.036,
"eval_steps_per_second": 2.601,
"step": 22500
},
{
"epoch": 41.74228675136116,
"grad_norm": 1.895717740058899,
"learning_rate": 7.7e-05,
"loss": 0.4137,
"step": 23000
},
{
"epoch": 41.74228675136116,
"eval_accuracy": 0.8939702847059194,
"eval_loss": 0.46362048387527466,
"eval_runtime": 53.7274,
"eval_samples_per_second": 81.988,
"eval_steps_per_second": 2.569,
"step": 23000
},
{
"epoch": 42.6497277676951,
"grad_norm": 1.9185525178909302,
"learning_rate": 7.65e-05,
"loss": 0.4124,
"step": 23500
},
{
"epoch": 42.6497277676951,
"eval_accuracy": 0.8955085956295108,
"eval_loss": 0.4603004455566406,
"eval_runtime": 52.7296,
"eval_samples_per_second": 83.539,
"eval_steps_per_second": 2.617,
"step": 23500
},
{
"epoch": 43.55716878402904,
"grad_norm": 1.9594053030014038,
"learning_rate": 7.6e-05,
"loss": 0.4022,
"step": 24000
},
{
"epoch": 43.55716878402904,
"eval_accuracy": 0.894325563921544,
"eval_loss": 0.4673307240009308,
"eval_runtime": 52.947,
"eval_samples_per_second": 83.196,
"eval_steps_per_second": 2.606,
"step": 24000
},
{
"epoch": 44.46460980036298,
"grad_norm": 1.761846899986267,
"learning_rate": 7.55e-05,
"loss": 0.4035,
"step": 24500
},
{
"epoch": 44.46460980036298,
"eval_accuracy": 0.8957788122798188,
"eval_loss": 0.45509716868400574,
"eval_runtime": 55.3592,
"eval_samples_per_second": 79.571,
"eval_steps_per_second": 2.493,
"step": 24500
},
{
"epoch": 45.37205081669691,
"grad_norm": 1.936480164527893,
"learning_rate": 7.500000000000001e-05,
"loss": 0.3996,
"step": 25000
},
{
"epoch": 45.37205081669691,
"eval_accuracy": 0.8945571305505521,
"eval_loss": 0.4623182713985443,
"eval_runtime": 53.3019,
"eval_samples_per_second": 82.642,
"eval_steps_per_second": 2.589,
"step": 25000
},
{
"epoch": 46.27949183303085,
"grad_norm": 1.907658576965332,
"learning_rate": 7.450000000000001e-05,
"loss": 0.3979,
"step": 25500
},
{
"epoch": 46.27949183303085,
"eval_accuracy": 0.8960829529232748,
"eval_loss": 0.4556325376033783,
"eval_runtime": 54.1617,
"eval_samples_per_second": 81.331,
"eval_steps_per_second": 2.548,
"step": 25500
},
{
"epoch": 47.186932849364794,
"grad_norm": 1.9181513786315918,
"learning_rate": 7.4e-05,
"loss": 0.391,
"step": 26000
},
{
"epoch": 47.186932849364794,
"eval_accuracy": 0.8958657660824587,
"eval_loss": 0.4613765776157379,
"eval_runtime": 55.5872,
"eval_samples_per_second": 79.245,
"eval_steps_per_second": 2.483,
"step": 26000
},
{
"epoch": 48.09437386569873,
"grad_norm": 1.6843451261520386,
"learning_rate": 7.35e-05,
"loss": 0.391,
"step": 26500
},
{
"epoch": 48.09437386569873,
"eval_accuracy": 0.89564026034286,
"eval_loss": 0.45949628949165344,
"eval_runtime": 52.8113,
"eval_samples_per_second": 83.41,
"eval_steps_per_second": 2.613,
"step": 26500
},
{
"epoch": 49.00181488203267,
"grad_norm": 1.7681550979614258,
"learning_rate": 7.3e-05,
"loss": 0.3874,
"step": 27000
},
{
"epoch": 49.00181488203267,
"eval_accuracy": 0.8962522308149911,
"eval_loss": 0.4545115828514099,
"eval_runtime": 53.1381,
"eval_samples_per_second": 82.897,
"eval_steps_per_second": 2.597,
"step": 27000
},
{
"epoch": 49.90925589836661,
"grad_norm": 1.8517777919769287,
"learning_rate": 7.25e-05,
"loss": 0.3835,
"step": 27500
},
{
"epoch": 49.90925589836661,
"eval_accuracy": 0.896625333542615,
"eval_loss": 0.45060068368911743,
"eval_runtime": 52.9396,
"eval_samples_per_second": 83.208,
"eval_steps_per_second": 2.607,
"step": 27500
},
{
"epoch": 50.81669691470054,
"grad_norm": 1.9447550773620605,
"learning_rate": 7.2e-05,
"loss": 0.3779,
"step": 28000
},
{
"epoch": 50.81669691470054,
"eval_accuracy": 0.8974899929361903,
"eval_loss": 0.4529257118701935,
"eval_runtime": 56.9809,
"eval_samples_per_second": 77.307,
"eval_steps_per_second": 2.422,
"step": 28000
},
{
"epoch": 51.724137931034484,
"grad_norm": 1.7611163854599,
"learning_rate": 7.15e-05,
"loss": 0.3783,
"step": 28500
},
{
"epoch": 51.724137931034484,
"eval_accuracy": 0.8984791687632082,
"eval_loss": 0.44696977734565735,
"eval_runtime": 53.9391,
"eval_samples_per_second": 81.666,
"eval_steps_per_second": 2.558,
"step": 28500
},
{
"epoch": 52.63157894736842,
"grad_norm": 1.749190092086792,
"learning_rate": 7.1e-05,
"loss": 0.3727,
"step": 29000
},
{
"epoch": 52.63157894736842,
"eval_accuracy": 0.8978207320708537,
"eval_loss": 0.4506888687610626,
"eval_runtime": 53.5362,
"eval_samples_per_second": 82.281,
"eval_steps_per_second": 2.578,
"step": 29000
},
{
"epoch": 53.53901996370236,
"grad_norm": 1.8609730005264282,
"learning_rate": 7.05e-05,
"loss": 0.3705,
"step": 29500
},
{
"epoch": 53.53901996370236,
"eval_accuracy": 0.8976168077767325,
"eval_loss": 0.4500649869441986,
"eval_runtime": 53.4168,
"eval_samples_per_second": 82.465,
"eval_steps_per_second": 2.583,
"step": 29500
},
{
"epoch": 54.4464609800363,
"grad_norm": 1.8506393432617188,
"learning_rate": 7e-05,
"loss": 0.3719,
"step": 30000
},
{
"epoch": 54.4464609800363,
"eval_accuracy": 0.8993505575402249,
"eval_loss": 0.44908007979393005,
"eval_runtime": 57.767,
"eval_samples_per_second": 76.255,
"eval_steps_per_second": 2.389,
"step": 30000
},
{
"epoch": 55.35390199637023,
"grad_norm": 1.8105406761169434,
"learning_rate": 6.95e-05,
"loss": 0.3684,
"step": 30500
},
{
"epoch": 55.35390199637023,
"eval_accuracy": 0.8994710323502174,
"eval_loss": 0.44289711117744446,
"eval_runtime": 53.0941,
"eval_samples_per_second": 82.966,
"eval_steps_per_second": 2.599,
"step": 30500
},
{
"epoch": 56.261343012704174,
"grad_norm": 1.9500548839569092,
"learning_rate": 6.9e-05,
"loss": 0.3621,
"step": 31000
},
{
"epoch": 56.261343012704174,
"eval_accuracy": 0.8995646091699282,
"eval_loss": 0.44326120615005493,
"eval_runtime": 57.7269,
"eval_samples_per_second": 76.308,
"eval_steps_per_second": 2.391,
"step": 31000
},
{
"epoch": 57.168784029038115,
"grad_norm": 1.771316647529602,
"learning_rate": 6.850000000000001e-05,
"loss": 0.3639,
"step": 31500
},
{
"epoch": 57.168784029038115,
"eval_accuracy": 0.8996551918347424,
"eval_loss": 0.4400934875011444,
"eval_runtime": 59.1245,
"eval_samples_per_second": 74.504,
"eval_steps_per_second": 2.334,
"step": 31500
},
{
"epoch": 58.07622504537205,
"grad_norm": 1.9457340240478516,
"learning_rate": 6.800000000000001e-05,
"loss": 0.3603,
"step": 32000
},
{
"epoch": 58.07622504537205,
"eval_accuracy": 0.8985898806146979,
"eval_loss": 0.4473365545272827,
"eval_runtime": 53.3011,
"eval_samples_per_second": 82.644,
"eval_steps_per_second": 2.589,
"step": 32000
},
{
"epoch": 58.98366606170599,
"grad_norm": 1.8146084547042847,
"learning_rate": 6.750000000000001e-05,
"loss": 0.3568,
"step": 32500
},
{
"epoch": 58.98366606170599,
"eval_accuracy": 0.9005719064701833,
"eval_loss": 0.4383050501346588,
"eval_runtime": 57.7373,
"eval_samples_per_second": 76.294,
"eval_steps_per_second": 2.39,
"step": 32500
},
{
"epoch": 59.89110707803993,
"grad_norm": 1.809646725654602,
"learning_rate": 6.7e-05,
"loss": 0.3516,
"step": 33000
},
{
"epoch": 59.89110707803993,
"eval_accuracy": 0.9000993445757237,
"eval_loss": 0.44284284114837646,
"eval_runtime": 53.187,
"eval_samples_per_second": 82.821,
"eval_steps_per_second": 2.595,
"step": 33000
},
{
"epoch": 60.798548094373864,
"grad_norm": 1.7659413814544678,
"learning_rate": 6.65e-05,
"loss": 0.3531,
"step": 33500
},
{
"epoch": 60.798548094373864,
"eval_accuracy": 0.8994799933629496,
"eval_loss": 0.44589298963546753,
"eval_runtime": 53.0032,
"eval_samples_per_second": 83.108,
"eval_steps_per_second": 2.604,
"step": 33500
},
{
"epoch": 61.705989110707804,
"grad_norm": 1.636080265045166,
"learning_rate": 6.6e-05,
"loss": 0.3499,
"step": 34000
},
{
"epoch": 61.705989110707804,
"eval_accuracy": 0.8999748562089449,
"eval_loss": 0.4367033839225769,
"eval_runtime": 53.0566,
"eval_samples_per_second": 83.025,
"eval_steps_per_second": 2.601,
"step": 34000
},
{
"epoch": 62.613430127041745,
"grad_norm": 1.8669129610061646,
"learning_rate": 6.55e-05,
"loss": 0.3489,
"step": 34500
},
{
"epoch": 62.613430127041745,
"eval_accuracy": 0.9013975155279503,
"eval_loss": 0.4371834695339203,
"eval_runtime": 53.1608,
"eval_samples_per_second": 82.862,
"eval_steps_per_second": 2.596,
"step": 34500
},
{
"epoch": 63.52087114337568,
"grad_norm": 1.9811877012252808,
"learning_rate": 6.500000000000001e-05,
"loss": 0.3429,
"step": 35000
},
{
"epoch": 63.52087114337568,
"eval_accuracy": 0.9018210634557753,
"eval_loss": 0.4384971857070923,
"eval_runtime": 57.6259,
"eval_samples_per_second": 76.441,
"eval_steps_per_second": 2.395,
"step": 35000
},
{
"epoch": 64.42831215970962,
"grad_norm": 1.7895385026931763,
"learning_rate": 6.450000000000001e-05,
"loss": 0.3415,
"step": 35500
},
{
"epoch": 64.42831215970962,
"eval_accuracy": 0.9028058283836883,
"eval_loss": 0.43259307742118835,
"eval_runtime": 53.0105,
"eval_samples_per_second": 83.097,
"eval_steps_per_second": 2.603,
"step": 35500
},
{
"epoch": 65.33575317604355,
"grad_norm": 1.9592262506484985,
"learning_rate": 6.400000000000001e-05,
"loss": 0.3402,
"step": 36000
},
{
"epoch": 65.33575317604355,
"eval_accuracy": 0.901298823973929,
"eval_loss": 0.4333614110946655,
"eval_runtime": 52.9956,
"eval_samples_per_second": 83.12,
"eval_steps_per_second": 2.604,
"step": 36000
},
{
"epoch": 66.2431941923775,
"grad_norm": 1.69992196559906,
"learning_rate": 6.35e-05,
"loss": 0.3349,
"step": 36500
},
{
"epoch": 66.2431941923775,
"eval_accuracy": 0.90217415310253,
"eval_loss": 0.430584579706192,
"eval_runtime": 52.9983,
"eval_samples_per_second": 83.116,
"eval_steps_per_second": 2.604,
"step": 36500
},
{
"epoch": 67.15063520871144,
"grad_norm": 1.70908522605896,
"learning_rate": 6.3e-05,
"loss": 0.3387,
"step": 37000
},
{
"epoch": 67.15063520871144,
"eval_accuracy": 0.9022541035115501,
"eval_loss": 0.4351217746734619,
"eval_runtime": 53.4219,
"eval_samples_per_second": 82.457,
"eval_steps_per_second": 2.583,
"step": 37000
},
{
"epoch": 68.05807622504537,
"grad_norm": 1.7078979015350342,
"learning_rate": 6.25e-05,
"loss": 0.3328,
"step": 37500
},
{
"epoch": 68.05807622504537,
"eval_accuracy": 0.9027885079291,
"eval_loss": 0.43064776062965393,
"eval_runtime": 52.9742,
"eval_samples_per_second": 83.154,
"eval_steps_per_second": 2.605,
"step": 37500
},
{
"epoch": 68.96551724137932,
"grad_norm": 1.6770516633987427,
"learning_rate": 6.2e-05,
"loss": 0.33,
"step": 38000
},
{
"epoch": 68.96551724137932,
"eval_accuracy": 0.9039243367993435,
"eval_loss": 0.42335009574890137,
"eval_runtime": 52.9527,
"eval_samples_per_second": 83.187,
"eval_steps_per_second": 2.606,
"step": 38000
},
{
"epoch": 69.87295825771325,
"grad_norm": 1.7739548683166504,
"learning_rate": 6.15e-05,
"loss": 0.3291,
"step": 38500
},
{
"epoch": 69.87295825771325,
"eval_accuracy": 0.9030174309735959,
"eval_loss": 0.42873483896255493,
"eval_runtime": 52.9154,
"eval_samples_per_second": 83.246,
"eval_steps_per_second": 2.608,
"step": 38500
},
{
"epoch": 70.78039927404718,
"grad_norm": 1.9532561302185059,
"learning_rate": 6.1e-05,
"loss": 0.3288,
"step": 39000
},
{
"epoch": 70.78039927404718,
"eval_accuracy": 0.9036126447268178,
"eval_loss": 0.42941179871559143,
"eval_runtime": 53.0126,
"eval_samples_per_second": 83.093,
"eval_steps_per_second": 2.603,
"step": 39000
},
{
"epoch": 71.68784029038113,
"grad_norm": 1.9629998207092285,
"learning_rate": 6.05e-05,
"loss": 0.3255,
"step": 39500
},
{
"epoch": 71.68784029038113,
"eval_accuracy": 0.904484644880421,
"eval_loss": 0.4233216643333435,
"eval_runtime": 52.8794,
"eval_samples_per_second": 83.303,
"eval_steps_per_second": 2.61,
"step": 39500
},
{
"epoch": 72.59528130671507,
"grad_norm": 1.9358775615692139,
"learning_rate": 6e-05,
"loss": 0.3248,
"step": 40000
},
{
"epoch": 72.59528130671507,
"eval_accuracy": 0.9041070159565976,
"eval_loss": 0.4286067485809326,
"eval_runtime": 52.8963,
"eval_samples_per_second": 83.276,
"eval_steps_per_second": 2.609,
"step": 40000
},
{
"epoch": 73.502722323049,
"grad_norm": 1.6557801961898804,
"learning_rate": 5.95e-05,
"loss": 0.3203,
"step": 40500
},
{
"epoch": 73.502722323049,
"eval_accuracy": 0.9040081056486239,
"eval_loss": 0.4276265799999237,
"eval_runtime": 52.8201,
"eval_samples_per_second": 83.396,
"eval_steps_per_second": 2.613,
"step": 40500
},
{
"epoch": 74.41016333938293,
"grad_norm": 1.58525812625885,
"learning_rate": 5.9e-05,
"loss": 0.3218,
"step": 41000
},
{
"epoch": 74.41016333938293,
"eval_accuracy": 0.9048278199949151,
"eval_loss": 0.4280424118041992,
"eval_runtime": 53.6976,
"eval_samples_per_second": 82.034,
"eval_steps_per_second": 2.57,
"step": 41000
},
{
"epoch": 75.31760435571688,
"grad_norm": 1.5357856750488281,
"learning_rate": 5.85e-05,
"loss": 0.3198,
"step": 41500
},
{
"epoch": 75.31760435571688,
"eval_accuracy": 0.904816966551977,
"eval_loss": 0.41927599906921387,
"eval_runtime": 52.6237,
"eval_samples_per_second": 83.708,
"eval_steps_per_second": 2.622,
"step": 41500
},
{
"epoch": 76.22504537205081,
"grad_norm": 1.9634026288986206,
"learning_rate": 5.8e-05,
"loss": 0.3149,
"step": 42000
},
{
"epoch": 76.22504537205081,
"eval_accuracy": 0.9031350344081517,
"eval_loss": 0.4304438531398773,
"eval_runtime": 52.8667,
"eval_samples_per_second": 83.323,
"eval_steps_per_second": 2.61,
"step": 42000
},
{
"epoch": 77.13248638838475,
"grad_norm": 1.563607096672058,
"learning_rate": 5.7499999999999995e-05,
"loss": 0.3144,
"step": 42500
},
{
"epoch": 77.13248638838475,
"eval_accuracy": 0.9048774759257809,
"eval_loss": 0.427058607339859,
"eval_runtime": 52.712,
"eval_samples_per_second": 83.567,
"eval_steps_per_second": 2.618,
"step": 42500
},
{
"epoch": 78.0399274047187,
"grad_norm": 1.871159553527832,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.3103,
"step": 43000
},
{
"epoch": 78.0399274047187,
"eval_accuracy": 0.9048358855975869,
"eval_loss": 0.4271075129508972,
"eval_runtime": 52.7904,
"eval_samples_per_second": 83.443,
"eval_steps_per_second": 2.614,
"step": 43000
},
{
"epoch": 78.94736842105263,
"grad_norm": 1.61452054977417,
"learning_rate": 5.65e-05,
"loss": 0.3102,
"step": 43500
},
{
"epoch": 78.94736842105263,
"eval_accuracy": 0.9049740443605474,
"eval_loss": 0.42336586117744446,
"eval_runtime": 52.875,
"eval_samples_per_second": 83.31,
"eval_steps_per_second": 2.61,
"step": 43500
},
{
"epoch": 79.85480943738656,
"grad_norm": 1.8801889419555664,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.307,
"step": 44000
},
{
"epoch": 79.85480943738656,
"eval_accuracy": 0.9055931173260359,
"eval_loss": 0.42033708095550537,
"eval_runtime": 52.627,
"eval_samples_per_second": 83.702,
"eval_steps_per_second": 2.622,
"step": 44000
},
{
"epoch": 80.76225045372051,
"grad_norm": 1.5314077138900757,
"learning_rate": 5.550000000000001e-05,
"loss": 0.3037,
"step": 44500
},
{
"epoch": 80.76225045372051,
"eval_accuracy": 0.9061196499462085,
"eval_loss": 0.4253558814525604,
"eval_runtime": 52.7114,
"eval_samples_per_second": 83.568,
"eval_steps_per_second": 2.618,
"step": 44500
},
{
"epoch": 81.66969147005445,
"grad_norm": 1.7618950605392456,
"learning_rate": 5.500000000000001e-05,
"loss": 0.3016,
"step": 45000
},
{
"epoch": 81.66969147005445,
"eval_accuracy": 0.9062959555947709,
"eval_loss": 0.4181654453277588,
"eval_runtime": 52.7251,
"eval_samples_per_second": 83.547,
"eval_steps_per_second": 2.617,
"step": 45000
},
{
"epoch": 82.57713248638838,
"grad_norm": 1.913796067237854,
"learning_rate": 5.45e-05,
"loss": 0.303,
"step": 45500
},
{
"epoch": 82.57713248638838,
"eval_accuracy": 0.9072205041995722,
"eval_loss": 0.41656142473220825,
"eval_runtime": 52.7847,
"eval_samples_per_second": 83.452,
"eval_steps_per_second": 2.614,
"step": 45500
},
{
"epoch": 83.48457350272233,
"grad_norm": 1.803902506828308,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.3028,
"step": 46000
},
{
"epoch": 83.48457350272233,
"eval_accuracy": 0.9070220964556146,
"eval_loss": 0.4230930507183075,
"eval_runtime": 54.7506,
"eval_samples_per_second": 80.456,
"eval_steps_per_second": 2.521,
"step": 46000
},
{
"epoch": 84.39201451905626,
"grad_norm": 1.7706644535064697,
"learning_rate": 5.3500000000000006e-05,
"loss": 0.2986,
"step": 46500
},
{
"epoch": 84.39201451905626,
"eval_accuracy": 0.9074832026084252,
"eval_loss": 0.4140300452709198,
"eval_runtime": 52.7024,
"eval_samples_per_second": 83.583,
"eval_steps_per_second": 2.618,
"step": 46500
},
{
"epoch": 85.2994555353902,
"grad_norm": 1.9356876611709595,
"learning_rate": 5.300000000000001e-05,
"loss": 0.2966,
"step": 47000
},
{
"epoch": 85.2994555353902,
"eval_accuracy": 0.9071740298423052,
"eval_loss": 0.4191630482673645,
"eval_runtime": 52.7256,
"eval_samples_per_second": 83.546,
"eval_steps_per_second": 2.617,
"step": 47000
},
{
"epoch": 86.20689655172414,
"grad_norm": 1.8884636163711548,
"learning_rate": 5.25e-05,
"loss": 0.2959,
"step": 47500
},
{
"epoch": 86.20689655172414,
"eval_accuracy": 0.9080046323395202,
"eval_loss": 0.4184423089027405,
"eval_runtime": 52.5803,
"eval_samples_per_second": 83.777,
"eval_steps_per_second": 2.625,
"step": 47500
},
{
"epoch": 87.11433756805808,
"grad_norm": 1.7885215282440186,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.2943,
"step": 48000
},
{
"epoch": 87.11433756805808,
"eval_accuracy": 0.9073709179447276,
"eval_loss": 0.4168856143951416,
"eval_runtime": 52.6737,
"eval_samples_per_second": 83.628,
"eval_steps_per_second": 2.62,
"step": 48000
},
{
"epoch": 88.02177858439201,
"grad_norm": 1.6675046682357788,
"learning_rate": 5.1500000000000005e-05,
"loss": 0.2932,
"step": 48500
},
{
"epoch": 88.02177858439201,
"eval_accuracy": 0.9065622782059254,
"eval_loss": 0.42474210262298584,
"eval_runtime": 57.2893,
"eval_samples_per_second": 76.89,
"eval_steps_per_second": 2.409,
"step": 48500
},
{
"epoch": 88.92921960072596,
"grad_norm": 1.7693278789520264,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.2913,
"step": 49000
},
{
"epoch": 88.92921960072596,
"eval_accuracy": 0.908334698713182,
"eval_loss": 0.41496196389198303,
"eval_runtime": 52.681,
"eval_samples_per_second": 83.616,
"eval_steps_per_second": 2.62,
"step": 49000
},
{
"epoch": 89.83666061705989,
"grad_norm": 1.80568528175354,
"learning_rate": 5.05e-05,
"loss": 0.29,
"step": 49500
},
{
"epoch": 89.83666061705989,
"eval_accuracy": 0.9067204115993917,
"eval_loss": 0.42078179121017456,
"eval_runtime": 52.5861,
"eval_samples_per_second": 83.767,
"eval_steps_per_second": 2.624,
"step": 49500
},
{
"epoch": 90.74410163339383,
"grad_norm": 1.7286852598190308,
"learning_rate": 5e-05,
"loss": 0.288,
"step": 50000
},
{
"epoch": 90.74410163339383,
"eval_accuracy": 0.908493707496287,
"eval_loss": 0.41103261709213257,
"eval_runtime": 52.6627,
"eval_samples_per_second": 83.646,
"eval_steps_per_second": 2.62,
"step": 50000
},
{
"epoch": 91.65154264972777,
"grad_norm": 1.6064249277114868,
"learning_rate": 4.9500000000000004e-05,
"loss": 0.2846,
"step": 50500
},
{
"epoch": 91.65154264972777,
"eval_accuracy": 0.9089611693118373,
"eval_loss": 0.4114561975002289,
"eval_runtime": 52.5213,
"eval_samples_per_second": 83.871,
"eval_steps_per_second": 2.628,
"step": 50500
},
{
"epoch": 92.5589836660617,
"grad_norm": 1.6957948207855225,
"learning_rate": 4.9e-05,
"loss": 0.2825,
"step": 51000
},
{
"epoch": 92.5589836660617,
"eval_accuracy": 0.9070289798162255,
"eval_loss": 0.41827893257141113,
"eval_runtime": 52.6164,
"eval_samples_per_second": 83.719,
"eval_steps_per_second": 2.623,
"step": 51000
},
{
"epoch": 93.46642468239564,
"grad_norm": 1.6073497533798218,
"learning_rate": 4.85e-05,
"loss": 0.2834,
"step": 51500
},
{
"epoch": 93.46642468239564,
"eval_accuracy": 0.9093731751882107,
"eval_loss": 0.40947192907333374,
"eval_runtime": 55.8553,
"eval_samples_per_second": 78.865,
"eval_steps_per_second": 2.471,
"step": 51500
},
{
"epoch": 94.37386569872959,
"grad_norm": 1.8224419355392456,
"learning_rate": 4.8e-05,
"loss": 0.2803,
"step": 52000
},
{
"epoch": 94.37386569872959,
"eval_accuracy": 0.9078407463018748,
"eval_loss": 0.418082594871521,
"eval_runtime": 57.2453,
"eval_samples_per_second": 76.95,
"eval_steps_per_second": 2.411,
"step": 52000
},
{
"epoch": 95.28130671506352,
"grad_norm": 1.9654055833816528,
"learning_rate": 4.75e-05,
"loss": 0.2787,
"step": 52500
},
{
"epoch": 95.28130671506352,
"eval_accuracy": 0.9089184717302816,
"eval_loss": 0.4162246882915497,
"eval_runtime": 57.2773,
"eval_samples_per_second": 76.907,
"eval_steps_per_second": 2.409,
"step": 52500
},
{
"epoch": 96.18874773139746,
"grad_norm": 1.7956877946853638,
"learning_rate": 4.7e-05,
"loss": 0.278,
"step": 53000
},
{
"epoch": 96.18874773139746,
"eval_accuracy": 0.9096441756342786,
"eval_loss": 0.40937647223472595,
"eval_runtime": 57.2739,
"eval_samples_per_second": 76.911,
"eval_steps_per_second": 2.409,
"step": 53000
},
{
"epoch": 97.0961887477314,
"grad_norm": 1.6677452325820923,
"learning_rate": 4.6500000000000005e-05,
"loss": 0.2759,
"step": 53500
},
{
"epoch": 97.0961887477314,
"eval_accuracy": 0.9090880598745718,
"eval_loss": 0.4136127233505249,
"eval_runtime": 52.9843,
"eval_samples_per_second": 83.138,
"eval_steps_per_second": 2.605,
"step": 53500
},
{
"epoch": 98.00362976406534,
"grad_norm": 1.7029211521148682,
"learning_rate": 4.600000000000001e-05,
"loss": 0.2746,
"step": 54000
},
{
"epoch": 98.00362976406534,
"eval_accuracy": 0.9085475626951355,
"eval_loss": 0.4151366651058197,
"eval_runtime": 55.2968,
"eval_samples_per_second": 79.661,
"eval_steps_per_second": 2.496,
"step": 54000
},
{
"epoch": 98.91107078039927,
"grad_norm": 1.4931912422180176,
"learning_rate": 4.55e-05,
"loss": 0.2734,
"step": 54500
},
{
"epoch": 98.91107078039927,
"eval_accuracy": 0.9087434484443874,
"eval_loss": 0.4170074760913849,
"eval_runtime": 52.6515,
"eval_samples_per_second": 83.663,
"eval_steps_per_second": 2.621,
"step": 54500
},
{
"epoch": 99.81851179673322,
"grad_norm": 1.8849012851715088,
"learning_rate": 4.5e-05,
"loss": 0.2719,
"step": 55000
},
{
"epoch": 99.81851179673322,
"eval_accuracy": 0.9087569026104417,
"eval_loss": 0.41325053572654724,
"eval_runtime": 52.6731,
"eval_samples_per_second": 83.629,
"eval_steps_per_second": 2.62,
"step": 55000
},
{
"epoch": 100.72595281306715,
"grad_norm": 2.079172372817993,
"learning_rate": 4.4500000000000004e-05,
"loss": 0.271,
"step": 55500
},
{
"epoch": 100.72595281306715,
"eval_accuracy": 0.9103295110887096,
"eval_loss": 0.41259288787841797,
"eval_runtime": 52.6285,
"eval_samples_per_second": 83.7,
"eval_steps_per_second": 2.622,
"step": 55500
},
{
"epoch": 101.63339382940109,
"grad_norm": 1.8264790773391724,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.267,
"step": 56000
},
{
"epoch": 101.63339382940109,
"eval_accuracy": 0.9093908939634668,
"eval_loss": 0.41436412930488586,
"eval_runtime": 53.0842,
"eval_samples_per_second": 82.981,
"eval_steps_per_second": 2.6,
"step": 56000
},
{
"epoch": 102.54083484573503,
"grad_norm": 1.6904346942901611,
"learning_rate": 4.35e-05,
"loss": 0.2682,
"step": 56500
},
{
"epoch": 102.54083484573503,
"eval_accuracy": 0.9092624512952864,
"eval_loss": 0.413330078125,
"eval_runtime": 57.4299,
"eval_samples_per_second": 76.702,
"eval_steps_per_second": 2.403,
"step": 56500
},
{
"epoch": 103.44827586206897,
"grad_norm": 1.7619383335113525,
"learning_rate": 4.3e-05,
"loss": 0.2681,
"step": 57000
},
{
"epoch": 103.44827586206897,
"eval_accuracy": 0.9103495162664795,
"eval_loss": 0.40912753343582153,
"eval_runtime": 57.2635,
"eval_samples_per_second": 76.925,
"eval_steps_per_second": 2.41,
"step": 57000
},
{
"epoch": 104.3557168784029,
"grad_norm": 1.7024521827697754,
"learning_rate": 4.25e-05,
"loss": 0.2644,
"step": 57500
},
{
"epoch": 104.3557168784029,
"eval_accuracy": 0.9091305589286443,
"eval_loss": 0.4176701605319977,
"eval_runtime": 52.5316,
"eval_samples_per_second": 83.854,
"eval_steps_per_second": 2.627,
"step": 57500
},
{
"epoch": 105.26315789473684,
"grad_norm": 1.936393141746521,
"learning_rate": 4.2e-05,
"loss": 0.2621,
"step": 58000
},
{
"epoch": 105.26315789473684,
"eval_accuracy": 0.909638402972615,
"eval_loss": 0.4138263165950775,
"eval_runtime": 57.3248,
"eval_samples_per_second": 76.843,
"eval_steps_per_second": 2.407,
"step": 58000
},
{
"epoch": 106.17059891107078,
"grad_norm": 1.9783495664596558,
"learning_rate": 4.15e-05,
"loss": 0.2618,
"step": 58500
},
{
"epoch": 106.17059891107078,
"eval_accuracy": 0.9093778717725997,
"eval_loss": 0.41479504108428955,
"eval_runtime": 52.6769,
"eval_samples_per_second": 83.623,
"eval_steps_per_second": 2.62,
"step": 58500
},
{
"epoch": 107.07803992740472,
"grad_norm": 1.8036541938781738,
"learning_rate": 4.1e-05,
"loss": 0.2593,
"step": 59000
},
{
"epoch": 107.07803992740472,
"eval_accuracy": 0.9108752466798229,
"eval_loss": 0.407368004322052,
"eval_runtime": 52.7623,
"eval_samples_per_second": 83.488,
"eval_steps_per_second": 2.616,
"step": 59000
},
{
"epoch": 107.98548094373865,
"grad_norm": 1.7356771230697632,
"learning_rate": 4.05e-05,
"loss": 0.2586,
"step": 59500
},
{
"epoch": 107.98548094373865,
"eval_accuracy": 0.9106982338220416,
"eval_loss": 0.4060940444469452,
"eval_runtime": 57.2822,
"eval_samples_per_second": 76.9,
"eval_steps_per_second": 2.409,
"step": 59500
},
{
"epoch": 108.8929219600726,
"grad_norm": 1.9993195533752441,
"learning_rate": 4e-05,
"loss": 0.2591,
"step": 60000
},
{
"epoch": 108.8929219600726,
"eval_accuracy": 0.910394873244746,
"eval_loss": 0.4131792187690735,
"eval_runtime": 53.0,
"eval_samples_per_second": 83.113,
"eval_steps_per_second": 2.604,
"step": 60000
},
{
"epoch": 109.80036297640653,
"grad_norm": 1.8162901401519775,
"learning_rate": 3.9500000000000005e-05,
"loss": 0.2558,
"step": 60500
},
{
"epoch": 109.80036297640653,
"eval_accuracy": 0.910258865637902,
"eval_loss": 0.4147132933139801,
"eval_runtime": 55.8856,
"eval_samples_per_second": 78.822,
"eval_steps_per_second": 2.469,
"step": 60500
},
{
"epoch": 110.70780399274047,
"grad_norm": 1.8106731176376343,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.2541,
"step": 61000
},
{
"epoch": 110.70780399274047,
"eval_accuracy": 0.9114573371669734,
"eval_loss": 0.40593624114990234,
"eval_runtime": 52.8519,
"eval_samples_per_second": 83.346,
"eval_steps_per_second": 2.611,
"step": 61000
},
{
"epoch": 111.61524500907441,
"grad_norm": 1.748769998550415,
"learning_rate": 3.85e-05,
"loss": 0.2556,
"step": 61500
},
{
"epoch": 111.61524500907441,
"eval_accuracy": 0.9115416885324719,
"eval_loss": 0.4094337522983551,
"eval_runtime": 52.8285,
"eval_samples_per_second": 83.383,
"eval_steps_per_second": 2.612,
"step": 61500
},
{
"epoch": 112.52268602540835,
"grad_norm": 1.6545246839523315,
"learning_rate": 3.8e-05,
"loss": 0.25,
"step": 62000
},
{
"epoch": 112.52268602540835,
"eval_accuracy": 0.9117101451094991,
"eval_loss": 0.4004589915275574,
"eval_runtime": 52.6819,
"eval_samples_per_second": 83.615,
"eval_steps_per_second": 2.619,
"step": 62000
},
{
"epoch": 113.43012704174228,
"grad_norm": 1.9466238021850586,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.2492,
"step": 62500
},
{
"epoch": 113.43012704174228,
"eval_accuracy": 0.9121271025446342,
"eval_loss": 0.4025409519672394,
"eval_runtime": 52.7085,
"eval_samples_per_second": 83.573,
"eval_steps_per_second": 2.618,
"step": 62500
},
{
"epoch": 114.33756805807623,
"grad_norm": 1.8718467950820923,
"learning_rate": 3.7e-05,
"loss": 0.2505,
"step": 63000
},
{
"epoch": 114.33756805807623,
"eval_accuracy": 0.9116951165625715,
"eval_loss": 0.40717950463294983,
"eval_runtime": 52.6815,
"eval_samples_per_second": 83.616,
"eval_steps_per_second": 2.62,
"step": 63000
},
{
"epoch": 115.24500907441016,
"grad_norm": 1.6669822931289673,
"learning_rate": 3.65e-05,
"loss": 0.2477,
"step": 63500
},
{
"epoch": 115.24500907441016,
"eval_accuracy": 0.9123295778283549,
"eval_loss": 0.40226927399635315,
"eval_runtime": 53.2555,
"eval_samples_per_second": 82.714,
"eval_steps_per_second": 2.591,
"step": 63500
},
{
"epoch": 116.1524500907441,
"grad_norm": 1.631198525428772,
"learning_rate": 3.6e-05,
"loss": 0.2462,
"step": 64000
},
{
"epoch": 116.1524500907441,
"eval_accuracy": 0.9121495562330304,
"eval_loss": 0.4079442322254181,
"eval_runtime": 52.7267,
"eval_samples_per_second": 83.544,
"eval_steps_per_second": 2.617,
"step": 64000
},
{
"epoch": 117.05989110707804,
"grad_norm": 1.7638319730758667,
"learning_rate": 3.55e-05,
"loss": 0.2472,
"step": 64500
},
{
"epoch": 117.05989110707804,
"eval_accuracy": 0.9112412273671573,
"eval_loss": 0.40657439827919006,
"eval_runtime": 52.7752,
"eval_samples_per_second": 83.467,
"eval_steps_per_second": 2.615,
"step": 64500
},
{
"epoch": 117.96733212341198,
"grad_norm": 1.821175217628479,
"learning_rate": 3.5e-05,
"loss": 0.2436,
"step": 65000
},
{
"epoch": 117.96733212341198,
"eval_accuracy": 0.9124436813445042,
"eval_loss": 0.40668636560440063,
"eval_runtime": 52.7669,
"eval_samples_per_second": 83.48,
"eval_steps_per_second": 2.615,
"step": 65000
},
{
"epoch": 118.87477313974591,
"grad_norm": 1.8072514533996582,
"learning_rate": 3.45e-05,
"loss": 0.2432,
"step": 65500
},
{
"epoch": 118.87477313974591,
"eval_accuracy": 0.9114819864290975,
"eval_loss": 0.4095401167869568,
"eval_runtime": 52.6486,
"eval_samples_per_second": 83.668,
"eval_steps_per_second": 2.621,
"step": 65500
},
{
"epoch": 119.78221415607986,
"grad_norm": 1.8061636686325073,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.2406,
"step": 66000
},
{
"epoch": 119.78221415607986,
"eval_accuracy": 0.9118786110814227,
"eval_loss": 0.4098331332206726,
"eval_runtime": 52.7138,
"eval_samples_per_second": 83.564,
"eval_steps_per_second": 2.618,
"step": 66000
},
{
"epoch": 120.6896551724138,
"grad_norm": 1.9020588397979736,
"learning_rate": 3.35e-05,
"loss": 0.2421,
"step": 66500
},
{
"epoch": 120.6896551724138,
"eval_accuracy": 0.9126196512325608,
"eval_loss": 0.4041764736175537,
"eval_runtime": 52.7206,
"eval_samples_per_second": 83.554,
"eval_steps_per_second": 2.618,
"step": 66500
},
{
"epoch": 121.59709618874773,
"grad_norm": 1.8141471147537231,
"learning_rate": 3.3e-05,
"loss": 0.2407,
"step": 67000
},
{
"epoch": 121.59709618874773,
"eval_accuracy": 0.9133736295696817,
"eval_loss": 0.40654563903808594,
"eval_runtime": 52.5772,
"eval_samples_per_second": 83.782,
"eval_steps_per_second": 2.625,
"step": 67000
},
{
"epoch": 122.50453720508168,
"grad_norm": 1.9479206800460815,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.2392,
"step": 67500
},
{
"epoch": 122.50453720508168,
"eval_accuracy": 0.9131637376284198,
"eval_loss": 0.4058144688606262,
"eval_runtime": 57.3018,
"eval_samples_per_second": 76.874,
"eval_steps_per_second": 2.408,
"step": 67500
},
{
"epoch": 123.41197822141561,
"grad_norm": 2.147027015686035,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.2377,
"step": 68000
},
{
"epoch": 123.41197822141561,
"eval_accuracy": 0.9131377603207724,
"eval_loss": 0.4089277386665344,
"eval_runtime": 52.5836,
"eval_samples_per_second": 83.771,
"eval_steps_per_second": 2.624,
"step": 68000
},
{
"epoch": 124.31941923774954,
"grad_norm": 1.95304536819458,
"learning_rate": 3.15e-05,
"loss": 0.2372,
"step": 68500
},
{
"epoch": 124.31941923774954,
"eval_accuracy": 0.9145953679805113,
"eval_loss": 0.4008789658546448,
"eval_runtime": 52.5737,
"eval_samples_per_second": 83.787,
"eval_steps_per_second": 2.625,
"step": 68500
},
{
"epoch": 125.22686025408349,
"grad_norm": 1.8719152212142944,
"learning_rate": 3.1e-05,
"loss": 0.234,
"step": 69000
},
{
"epoch": 125.22686025408349,
"eval_accuracy": 0.9142573977063571,
"eval_loss": 0.4005224108695984,
"eval_runtime": 54.591,
"eval_samples_per_second": 80.691,
"eval_steps_per_second": 2.528,
"step": 69000
},
{
"epoch": 126.13430127041742,
"grad_norm": 1.9268224239349365,
"learning_rate": 3.05e-05,
"loss": 0.2342,
"step": 69500
},
{
"epoch": 126.13430127041742,
"eval_accuracy": 0.9132593660123727,
"eval_loss": 0.41198381781578064,
"eval_runtime": 52.5548,
"eval_samples_per_second": 83.817,
"eval_steps_per_second": 2.626,
"step": 69500
},
{
"epoch": 127.04174228675136,
"grad_norm": 1.9191150665283203,
"learning_rate": 3e-05,
"loss": 0.2348,
"step": 70000
},
{
"epoch": 127.04174228675136,
"eval_accuracy": 0.913298348179974,
"eval_loss": 0.40492549538612366,
"eval_runtime": 52.6333,
"eval_samples_per_second": 83.692,
"eval_steps_per_second": 2.622,
"step": 70000
},
{
"epoch": 127.9491833030853,
"grad_norm": 1.5569913387298584,
"learning_rate": 2.95e-05,
"loss": 0.2321,
"step": 70500
},
{
"epoch": 127.9491833030853,
"eval_accuracy": 0.9125856672390897,
"eval_loss": 0.4051525592803955,
"eval_runtime": 52.6574,
"eval_samples_per_second": 83.654,
"eval_steps_per_second": 2.621,
"step": 70500
},
{
"epoch": 128.85662431941924,
"grad_norm": 1.7746883630752563,
"learning_rate": 2.9e-05,
"loss": 0.2294,
"step": 71000
},
{
"epoch": 128.85662431941924,
"eval_accuracy": 0.9139568516478013,
"eval_loss": 0.4073280692100525,
"eval_runtime": 52.6696,
"eval_samples_per_second": 83.635,
"eval_steps_per_second": 2.62,
"step": 71000
},
{
"epoch": 129.76406533575317,
"grad_norm": 1.4891724586486816,
"learning_rate": 2.8499999999999998e-05,
"loss": 0.2285,
"step": 71500
},
{
"epoch": 129.76406533575317,
"eval_accuracy": 0.9131215447858751,
"eval_loss": 0.40580666065216064,
"eval_runtime": 52.7507,
"eval_samples_per_second": 83.506,
"eval_steps_per_second": 2.616,
"step": 71500
},
{
"epoch": 130.6715063520871,
"grad_norm": 1.5842249393463135,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.2285,
"step": 72000
},
{
"epoch": 130.6715063520871,
"eval_accuracy": 0.9142407479255579,
"eval_loss": 0.4018247723579407,
"eval_runtime": 54.7129,
"eval_samples_per_second": 80.511,
"eval_steps_per_second": 2.522,
"step": 72000
},
{
"epoch": 131.57894736842104,
"grad_norm": 1.8539658784866333,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.2285,
"step": 72500
},
{
"epoch": 131.57894736842104,
"eval_accuracy": 0.914012928454821,
"eval_loss": 0.4047853946685791,
"eval_runtime": 53.2071,
"eval_samples_per_second": 82.79,
"eval_steps_per_second": 2.594,
"step": 72500
},
{
"epoch": 132.486388384755,
"grad_norm": 1.4111963510513306,
"learning_rate": 2.7000000000000002e-05,
"loss": 0.227,
"step": 73000
},
{
"epoch": 132.486388384755,
"eval_accuracy": 0.913627272698819,
"eval_loss": 0.4063122570514679,
"eval_runtime": 54.764,
"eval_samples_per_second": 80.436,
"eval_steps_per_second": 2.52,
"step": 73000
},
{
"epoch": 133.39382940108894,
"grad_norm": 1.7000839710235596,
"learning_rate": 2.6500000000000004e-05,
"loss": 0.2227,
"step": 73500
},
{
"epoch": 133.39382940108894,
"eval_accuracy": 0.9136585735388811,
"eval_loss": 0.40767940878868103,
"eval_runtime": 52.6716,
"eval_samples_per_second": 83.631,
"eval_steps_per_second": 2.62,
"step": 73500
},
{
"epoch": 134.30127041742287,
"grad_norm": 1.7322769165039062,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.2227,
"step": 74000
},
{
"epoch": 134.30127041742287,
"eval_accuracy": 0.9146321198686937,
"eval_loss": 0.40449175238609314,
"eval_runtime": 57.3022,
"eval_samples_per_second": 76.873,
"eval_steps_per_second": 2.408,
"step": 74000
},
{
"epoch": 135.2087114337568,
"grad_norm": 1.8843836784362793,
"learning_rate": 2.5500000000000003e-05,
"loss": 0.2227,
"step": 74500
},
{
"epoch": 135.2087114337568,
"eval_accuracy": 0.9153162670489917,
"eval_loss": 0.3996308743953705,
"eval_runtime": 52.9537,
"eval_samples_per_second": 83.186,
"eval_steps_per_second": 2.606,
"step": 74500
},
{
"epoch": 136.11615245009074,
"grad_norm": 1.688589334487915,
"learning_rate": 2.5e-05,
"loss": 0.2228,
"step": 75000
},
{
"epoch": 136.11615245009074,
"eval_accuracy": 0.9148934837092731,
"eval_loss": 0.4007312059402466,
"eval_runtime": 52.7857,
"eval_samples_per_second": 83.451,
"eval_steps_per_second": 2.614,
"step": 75000
},
{
"epoch": 137.02359346642467,
"grad_norm": 1.886564016342163,
"learning_rate": 2.45e-05,
"loss": 0.2222,
"step": 75500
},
{
"epoch": 137.02359346642467,
"eval_accuracy": 0.9137650871178321,
"eval_loss": 0.40756621956825256,
"eval_runtime": 52.9485,
"eval_samples_per_second": 83.194,
"eval_steps_per_second": 2.606,
"step": 75500
},
{
"epoch": 137.93103448275863,
"grad_norm": 1.569810152053833,
"learning_rate": 2.4e-05,
"loss": 0.2186,
"step": 76000
},
{
"epoch": 137.93103448275863,
"eval_accuracy": 0.9148974119075408,
"eval_loss": 0.4073057770729065,
"eval_runtime": 52.8219,
"eval_samples_per_second": 83.393,
"eval_steps_per_second": 2.613,
"step": 76000
},
{
"epoch": 138.83847549909257,
"grad_norm": 1.5704463720321655,
"learning_rate": 2.35e-05,
"loss": 0.2189,
"step": 76500
},
{
"epoch": 138.83847549909257,
"eval_accuracy": 0.9138604511070603,
"eval_loss": 0.40438133478164673,
"eval_runtime": 52.8176,
"eval_samples_per_second": 83.4,
"eval_steps_per_second": 2.613,
"step": 76500
},
{
"epoch": 139.7459165154265,
"grad_norm": 1.7239934206008911,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.2171,
"step": 77000
},
{
"epoch": 139.7459165154265,
"eval_accuracy": 0.9152970248903189,
"eval_loss": 0.4021734297275543,
"eval_runtime": 53.338,
"eval_samples_per_second": 82.587,
"eval_steps_per_second": 2.587,
"step": 77000
},
{
"epoch": 140.65335753176043,
"grad_norm": 1.79320228099823,
"learning_rate": 2.25e-05,
"loss": 0.2167,
"step": 77500
},
{
"epoch": 140.65335753176043,
"eval_accuracy": 0.9152779126251601,
"eval_loss": 0.3990631401538849,
"eval_runtime": 53.0839,
"eval_samples_per_second": 82.982,
"eval_steps_per_second": 2.6,
"step": 77500
},
{
"epoch": 141.56079854809437,
"grad_norm": 1.6768089532852173,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.2164,
"step": 78000
},
{
"epoch": 141.56079854809437,
"eval_accuracy": 0.9149024426267115,
"eval_loss": 0.4050694704055786,
"eval_runtime": 53.0695,
"eval_samples_per_second": 83.004,
"eval_steps_per_second": 2.6,
"step": 78000
},
{
"epoch": 142.4682395644283,
"grad_norm": 1.9307670593261719,
"learning_rate": 2.15e-05,
"loss": 0.2154,
"step": 78500
},
{
"epoch": 142.4682395644283,
"eval_accuracy": 0.9162203399408525,
"eval_loss": 0.3998095691204071,
"eval_runtime": 53.1001,
"eval_samples_per_second": 82.956,
"eval_steps_per_second": 2.599,
"step": 78500
},
{
"epoch": 143.37568058076226,
"grad_norm": 1.617890477180481,
"learning_rate": 2.1e-05,
"loss": 0.215,
"step": 79000
},
{
"epoch": 143.37568058076226,
"eval_accuracy": 0.9160746394397012,
"eval_loss": 0.40022924542427063,
"eval_runtime": 53.1216,
"eval_samples_per_second": 82.923,
"eval_steps_per_second": 2.598,
"step": 79000
},
{
"epoch": 144.2831215970962,
"grad_norm": 2.0667710304260254,
"learning_rate": 2.05e-05,
"loss": 0.2126,
"step": 79500
},
{
"epoch": 144.2831215970962,
"eval_accuracy": 0.9151828126910668,
"eval_loss": 0.4023064076900482,
"eval_runtime": 53.0659,
"eval_samples_per_second": 83.01,
"eval_steps_per_second": 2.601,
"step": 79500
},
{
"epoch": 145.19056261343013,
"grad_norm": 1.772654414176941,
"learning_rate": 2e-05,
"loss": 0.2134,
"step": 80000
},
{
"epoch": 145.19056261343013,
"eval_accuracy": 0.9151423035495024,
"eval_loss": 0.4016391932964325,
"eval_runtime": 52.86,
"eval_samples_per_second": 83.333,
"eval_steps_per_second": 2.611,
"step": 80000
},
{
"epoch": 146.09800362976407,
"grad_norm": 1.6949107646942139,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.2121,
"step": 80500
},
{
"epoch": 146.09800362976407,
"eval_accuracy": 0.9157573098498856,
"eval_loss": 0.40052202343940735,
"eval_runtime": 52.9508,
"eval_samples_per_second": 83.19,
"eval_steps_per_second": 2.606,
"step": 80500
},
{
"epoch": 147.005444646098,
"grad_norm": 1.7470875978469849,
"learning_rate": 1.9e-05,
"loss": 0.2118,
"step": 81000
},
{
"epoch": 147.005444646098,
"eval_accuracy": 0.9147493650748915,
"eval_loss": 0.4061746895313263,
"eval_runtime": 53.021,
"eval_samples_per_second": 83.08,
"eval_steps_per_second": 2.603,
"step": 81000
},
{
"epoch": 147.91288566243193,
"grad_norm": 1.6520947217941284,
"learning_rate": 1.85e-05,
"loss": 0.2092,
"step": 81500
},
{
"epoch": 147.91288566243193,
"eval_accuracy": 0.9153819887159277,
"eval_loss": 0.4039769172668457,
"eval_runtime": 52.9737,
"eval_samples_per_second": 83.155,
"eval_steps_per_second": 2.605,
"step": 81500
},
{
"epoch": 148.82032667876587,
"grad_norm": 1.625849962234497,
"learning_rate": 1.8e-05,
"loss": 0.2071,
"step": 82000
},
{
"epoch": 148.82032667876587,
"eval_accuracy": 0.9155334497970264,
"eval_loss": 0.4043760895729065,
"eval_runtime": 52.9528,
"eval_samples_per_second": 83.187,
"eval_steps_per_second": 2.606,
"step": 82000
},
{
"epoch": 149.72776769509983,
"grad_norm": 1.706663727760315,
"learning_rate": 1.75e-05,
"loss": 0.2049,
"step": 82500
},
{
"epoch": 149.72776769509983,
"eval_accuracy": 0.9146216897066012,
"eval_loss": 0.4076862335205078,
"eval_runtime": 53.5645,
"eval_samples_per_second": 82.237,
"eval_steps_per_second": 2.576,
"step": 82500
},
{
"epoch": 150.63520871143376,
"grad_norm": 1.9129126071929932,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.2072,
"step": 83000
},
{
"epoch": 150.63520871143376,
"eval_accuracy": 0.9153711232101976,
"eval_loss": 0.39911210536956787,
"eval_runtime": 53.0193,
"eval_samples_per_second": 83.083,
"eval_steps_per_second": 2.603,
"step": 83000
},
{
"epoch": 151.5426497277677,
"grad_norm": 1.7741316556930542,
"learning_rate": 1.65e-05,
"loss": 0.2048,
"step": 83500
},
{
"epoch": 151.5426497277677,
"eval_accuracy": 0.9165385170632505,
"eval_loss": 0.3991451859474182,
"eval_runtime": 53.0232,
"eval_samples_per_second": 83.077,
"eval_steps_per_second": 2.603,
"step": 83500
},
{
"epoch": 152.45009074410163,
"grad_norm": 1.6072008609771729,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.2064,
"step": 84000
},
{
"epoch": 152.45009074410163,
"eval_accuracy": 0.9149193775458487,
"eval_loss": 0.40828338265419006,
"eval_runtime": 52.9412,
"eval_samples_per_second": 83.206,
"eval_steps_per_second": 2.607,
"step": 84000
},
{
"epoch": 153.35753176043556,
"grad_norm": 1.7185778617858887,
"learning_rate": 1.55e-05,
"loss": 0.2061,
"step": 84500
},
{
"epoch": 153.35753176043556,
"eval_accuracy": 0.9158725538979593,
"eval_loss": 0.40011066198349,
"eval_runtime": 52.978,
"eval_samples_per_second": 83.148,
"eval_steps_per_second": 2.605,
"step": 84500
},
{
"epoch": 154.2649727767695,
"grad_norm": 1.7340868711471558,
"learning_rate": 1.5e-05,
"loss": 0.202,
"step": 85000
},
{
"epoch": 154.2649727767695,
"eval_accuracy": 0.9167605678134148,
"eval_loss": 0.3951858580112457,
"eval_runtime": 53.0994,
"eval_samples_per_second": 82.958,
"eval_steps_per_second": 2.599,
"step": 85000
},
{
"epoch": 155.17241379310346,
"grad_norm": 1.725895881652832,
"learning_rate": 1.45e-05,
"loss": 0.2007,
"step": 85500
},
{
"epoch": 155.17241379310346,
"eval_accuracy": 0.9168939812952535,
"eval_loss": 0.3987417221069336,
"eval_runtime": 54.957,
"eval_samples_per_second": 80.154,
"eval_steps_per_second": 2.511,
"step": 85500
},
{
"epoch": 156.0798548094374,
"grad_norm": 1.5828238725662231,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.2015,
"step": 86000
},
{
"epoch": 156.0798548094374,
"eval_accuracy": 0.917391414109539,
"eval_loss": 0.4005061388015747,
"eval_runtime": 52.8456,
"eval_samples_per_second": 83.356,
"eval_steps_per_second": 2.611,
"step": 86000
},
{
"epoch": 156.98729582577133,
"grad_norm": 1.631608247756958,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.2017,
"step": 86500
},
{
"epoch": 156.98729582577133,
"eval_accuracy": 0.9161869769340681,
"eval_loss": 0.40137505531311035,
"eval_runtime": 53.0267,
"eval_samples_per_second": 83.071,
"eval_steps_per_second": 2.602,
"step": 86500
},
{
"epoch": 157.89473684210526,
"grad_norm": 1.9344474077224731,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.1977,
"step": 87000
},
{
"epoch": 157.89473684210526,
"eval_accuracy": 0.9169892177992084,
"eval_loss": 0.39951831102371216,
"eval_runtime": 52.9394,
"eval_samples_per_second": 83.208,
"eval_steps_per_second": 2.607,
"step": 87000
},
{
"epoch": 158.8021778584392,
"grad_norm": 1.6796910762786865,
"learning_rate": 1.25e-05,
"loss": 0.1994,
"step": 87500
},
{
"epoch": 158.8021778584392,
"eval_accuracy": 0.9157372947418714,
"eval_loss": 0.4002035856246948,
"eval_runtime": 52.8869,
"eval_samples_per_second": 83.291,
"eval_steps_per_second": 2.609,
"step": 87500
},
{
"epoch": 159.70961887477313,
"grad_norm": 1.5331308841705322,
"learning_rate": 1.2e-05,
"loss": 0.1987,
"step": 88000
},
{
"epoch": 159.70961887477313,
"eval_accuracy": 0.9159691578071264,
"eval_loss": 0.40263810753822327,
"eval_runtime": 57.4412,
"eval_samples_per_second": 76.687,
"eval_steps_per_second": 2.402,
"step": 88000
},
{
"epoch": 160.6170598911071,
"grad_norm": 1.8451423645019531,
"learning_rate": 1.1500000000000002e-05,
"loss": 0.1985,
"step": 88500
},
{
"epoch": 160.6170598911071,
"eval_accuracy": 0.91600935818353,
"eval_loss": 0.4028289318084717,
"eval_runtime": 53.2779,
"eval_samples_per_second": 82.68,
"eval_steps_per_second": 2.59,
"step": 88500
},
{
"epoch": 161.52450090744102,
"grad_norm": 1.6251318454742432,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.1976,
"step": 89000
},
{
"epoch": 161.52450090744102,
"eval_accuracy": 0.9168077236536895,
"eval_loss": 0.39661651849746704,
"eval_runtime": 54.3943,
"eval_samples_per_second": 80.983,
"eval_steps_per_second": 2.537,
"step": 89000
},
{
"epoch": 162.43194192377496,
"grad_norm": 1.906327486038208,
"learning_rate": 1.05e-05,
"loss": 0.1975,
"step": 89500
},
{
"epoch": 162.43194192377496,
"eval_accuracy": 0.9169821754553538,
"eval_loss": 0.3963495194911957,
"eval_runtime": 53.0048,
"eval_samples_per_second": 83.106,
"eval_steps_per_second": 2.604,
"step": 89500
},
{
"epoch": 163.3393829401089,
"grad_norm": 1.9544309377670288,
"learning_rate": 1e-05,
"loss": 0.1963,
"step": 90000
},
{
"epoch": 163.3393829401089,
"eval_accuracy": 0.9164625099202927,
"eval_loss": 0.404565691947937,
"eval_runtime": 53.9246,
"eval_samples_per_second": 81.688,
"eval_steps_per_second": 2.559,
"step": 90000
},
{
"epoch": 164.24682395644282,
"grad_norm": 1.852169156074524,
"learning_rate": 9.5e-06,
"loss": 0.1963,
"step": 90500
},
{
"epoch": 164.24682395644282,
"eval_accuracy": 0.9155449059728268,
"eval_loss": 0.406656414270401,
"eval_runtime": 52.9408,
"eval_samples_per_second": 83.206,
"eval_steps_per_second": 2.607,
"step": 90500
},
{
"epoch": 165.15426497277676,
"grad_norm": 1.7243677377700806,
"learning_rate": 9e-06,
"loss": 0.1985,
"step": 91000
},
{
"epoch": 165.15426497277676,
"eval_accuracy": 0.9178093382768805,
"eval_loss": 0.39525070786476135,
"eval_runtime": 52.9935,
"eval_samples_per_second": 83.123,
"eval_steps_per_second": 2.604,
"step": 91000
},
{
"epoch": 166.06170598911072,
"grad_norm": 1.7043794393539429,
"learning_rate": 8.500000000000002e-06,
"loss": 0.1936,
"step": 91500
},
{
"epoch": 166.06170598911072,
"eval_accuracy": 0.9167059652035036,
"eval_loss": 0.3972921669483185,
"eval_runtime": 52.9476,
"eval_samples_per_second": 83.195,
"eval_steps_per_second": 2.606,
"step": 91500
},
{
"epoch": 166.96914700544465,
"grad_norm": 1.6746214628219604,
"learning_rate": 8.000000000000001e-06,
"loss": 0.1937,
"step": 92000
},
{
"epoch": 166.96914700544465,
"eval_accuracy": 0.9164150593646615,
"eval_loss": 0.40151235461235046,
"eval_runtime": 52.9555,
"eval_samples_per_second": 83.183,
"eval_steps_per_second": 2.606,
"step": 92000
},
{
"epoch": 167.8765880217786,
"grad_norm": 1.724612832069397,
"learning_rate": 7.5e-06,
"loss": 0.1936,
"step": 92500
},
{
"epoch": 167.8765880217786,
"eval_accuracy": 0.9173433401523613,
"eval_loss": 0.40141040086746216,
"eval_runtime": 53.1703,
"eval_samples_per_second": 82.847,
"eval_steps_per_second": 2.595,
"step": 92500
},
{
"epoch": 168.78402903811252,
"grad_norm": 1.8417091369628906,
"learning_rate": 7.000000000000001e-06,
"loss": 0.1923,
"step": 93000
},
{
"epoch": 168.78402903811252,
"eval_accuracy": 0.9186817674871586,
"eval_loss": 0.39400893449783325,
"eval_runtime": 56.2134,
"eval_samples_per_second": 78.362,
"eval_steps_per_second": 2.455,
"step": 93000
},
{
"epoch": 169.69147005444646,
"grad_norm": 1.7512474060058594,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.1933,
"step": 93500
},
{
"epoch": 169.69147005444646,
"eval_accuracy": 0.9175796477495107,
"eval_loss": 0.3983522951602936,
"eval_runtime": 52.9057,
"eval_samples_per_second": 83.261,
"eval_steps_per_second": 2.608,
"step": 93500
},
{
"epoch": 170.5989110707804,
"grad_norm": 1.4073301553726196,
"learning_rate": 6e-06,
"loss": 0.1935,
"step": 94000
},
{
"epoch": 170.5989110707804,
"eval_accuracy": 0.9187447843680622,
"eval_loss": 0.3936294913291931,
"eval_runtime": 52.7794,
"eval_samples_per_second": 83.461,
"eval_steps_per_second": 2.615,
"step": 94000
},
{
"epoch": 171.50635208711435,
"grad_norm": 1.71249520778656,
"learning_rate": 5.500000000000001e-06,
"loss": 0.1932,
"step": 94500
},
{
"epoch": 171.50635208711435,
"eval_accuracy": 0.9183512090422428,
"eval_loss": 0.3950323760509491,
"eval_runtime": 52.9329,
"eval_samples_per_second": 83.219,
"eval_steps_per_second": 2.607,
"step": 94500
},
{
"epoch": 172.41379310344828,
"grad_norm": 1.4874622821807861,
"learning_rate": 5e-06,
"loss": 0.1915,
"step": 95000
},
{
"epoch": 172.41379310344828,
"eval_accuracy": 0.9178856529139476,
"eval_loss": 0.39599481225013733,
"eval_runtime": 51.846,
"eval_samples_per_second": 84.963,
"eval_steps_per_second": 2.662,
"step": 95000
},
{
"epoch": 173.32123411978222,
"grad_norm": 1.898534893989563,
"learning_rate": 4.5e-06,
"loss": 0.1891,
"step": 95500
},
{
"epoch": 173.32123411978222,
"eval_accuracy": 0.918700790482899,
"eval_loss": 0.38806891441345215,
"eval_runtime": 53.2348,
"eval_samples_per_second": 82.747,
"eval_steps_per_second": 2.592,
"step": 95500
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 182,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.06414374804652e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}