dehanalkautsar's picture
Upload trainer_state.json with huggingface_hub
5ad3718 verified
{
"best_metric": 1.6794742345809937,
"best_model_checkpoint": "models/dehanalkautsar/mbert-uncased-modified_embedding_table-en/checkpoint-70000",
"epoch": 0.624464048155099,
"eval_steps": 2000,
"global_step": 70000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017841829947288543,
"grad_norm": 31.71474266052246,
"learning_rate": 4.9977697687696264e-05,
"loss": 5.6683,
"step": 2000
},
{
"epoch": 0.017841829947288543,
"eval_loss": 3.4221630096435547,
"eval_runtime": 73.4656,
"eval_samples_per_second": 136.118,
"eval_steps_per_second": 2.137,
"step": 2000
},
{
"epoch": 0.035683659894577086,
"grad_norm": 20.9453182220459,
"learning_rate": 4.995539537539252e-05,
"loss": 3.1985,
"step": 4000
},
{
"epoch": 0.035683659894577086,
"eval_loss": 2.7761876583099365,
"eval_runtime": 73.4155,
"eval_samples_per_second": 136.211,
"eval_steps_per_second": 2.139,
"step": 4000
},
{
"epoch": 0.05352548984186563,
"grad_norm": 18.989194869995117,
"learning_rate": 4.993309306308878e-05,
"loss": 2.7816,
"step": 6000
},
{
"epoch": 0.05352548984186563,
"eval_loss": 2.5367112159729004,
"eval_runtime": 73.3987,
"eval_samples_per_second": 136.242,
"eval_steps_per_second": 2.139,
"step": 6000
},
{
"epoch": 0.07136731978915417,
"grad_norm": 20.059371948242188,
"learning_rate": 4.991079075078504e-05,
"loss": 2.5767,
"step": 8000
},
{
"epoch": 0.07136731978915417,
"eval_loss": 2.3765993118286133,
"eval_runtime": 73.3358,
"eval_samples_per_second": 136.359,
"eval_steps_per_second": 2.141,
"step": 8000
},
{
"epoch": 0.08920914973644271,
"grad_norm": 18.088693618774414,
"learning_rate": 4.9888488438481305e-05,
"loss": 2.4472,
"step": 10000
},
{
"epoch": 0.08920914973644271,
"eval_loss": 2.261122226715088,
"eval_runtime": 73.3532,
"eval_samples_per_second": 136.327,
"eval_steps_per_second": 2.14,
"step": 10000
},
{
"epoch": 0.10705097968373126,
"grad_norm": 18.355438232421875,
"learning_rate": 4.986618612617757e-05,
"loss": 2.3517,
"step": 12000
},
{
"epoch": 0.10705097968373126,
"eval_loss": 2.197890520095825,
"eval_runtime": 73.2175,
"eval_samples_per_second": 136.579,
"eval_steps_per_second": 2.144,
"step": 12000
},
{
"epoch": 0.1248928096310198,
"grad_norm": 19.445158004760742,
"learning_rate": 4.984388381387383e-05,
"loss": 2.2819,
"step": 14000
},
{
"epoch": 0.1248928096310198,
"eval_loss": 2.135493040084839,
"eval_runtime": 73.523,
"eval_samples_per_second": 136.012,
"eval_steps_per_second": 2.135,
"step": 14000
},
{
"epoch": 0.14273463957830834,
"grad_norm": 16.74393081665039,
"learning_rate": 4.9821581501570084e-05,
"loss": 2.2253,
"step": 16000
},
{
"epoch": 0.14273463957830834,
"eval_loss": 2.0676965713500977,
"eval_runtime": 73.4191,
"eval_samples_per_second": 136.204,
"eval_steps_per_second": 2.138,
"step": 16000
},
{
"epoch": 0.1605764695255969,
"grad_norm": 18.755170822143555,
"learning_rate": 4.9799279189266346e-05,
"loss": 2.1737,
"step": 18000
},
{
"epoch": 0.1605764695255969,
"eval_loss": 2.0221915245056152,
"eval_runtime": 73.3608,
"eval_samples_per_second": 136.313,
"eval_steps_per_second": 2.14,
"step": 18000
},
{
"epoch": 0.17841829947288543,
"grad_norm": 18.238853454589844,
"learning_rate": 4.977697687696261e-05,
"loss": 2.1339,
"step": 20000
},
{
"epoch": 0.17841829947288543,
"eval_loss": 1.9968066215515137,
"eval_runtime": 73.376,
"eval_samples_per_second": 136.284,
"eval_steps_per_second": 2.14,
"step": 20000
},
{
"epoch": 0.19626012942017398,
"grad_norm": 16.624298095703125,
"learning_rate": 4.975467456465886e-05,
"loss": 2.097,
"step": 22000
},
{
"epoch": 0.19626012942017398,
"eval_loss": 1.973600149154663,
"eval_runtime": 73.4156,
"eval_samples_per_second": 136.211,
"eval_steps_per_second": 2.139,
"step": 22000
},
{
"epoch": 0.2141019593674625,
"grad_norm": 21.57083511352539,
"learning_rate": 4.9732372252355125e-05,
"loss": 2.0679,
"step": 24000
},
{
"epoch": 0.2141019593674625,
"eval_loss": 1.9505703449249268,
"eval_runtime": 73.2384,
"eval_samples_per_second": 136.54,
"eval_steps_per_second": 2.144,
"step": 24000
},
{
"epoch": 0.23194378931475107,
"grad_norm": 18.031625747680664,
"learning_rate": 4.971006994005139e-05,
"loss": 2.0474,
"step": 26000
},
{
"epoch": 0.23194378931475107,
"eval_loss": 1.9203472137451172,
"eval_runtime": 73.2703,
"eval_samples_per_second": 136.481,
"eval_steps_per_second": 2.143,
"step": 26000
},
{
"epoch": 0.2497856192620396,
"grad_norm": 18.387907028198242,
"learning_rate": 4.968776762774764e-05,
"loss": 2.0235,
"step": 28000
},
{
"epoch": 0.2497856192620396,
"eval_loss": 1.9067487716674805,
"eval_runtime": 73.2711,
"eval_samples_per_second": 136.479,
"eval_steps_per_second": 2.143,
"step": 28000
},
{
"epoch": 0.2676274492093281,
"grad_norm": 16.74208641052246,
"learning_rate": 4.966546531544391e-05,
"loss": 2.0007,
"step": 30000
},
{
"epoch": 0.2676274492093281,
"eval_loss": 1.8875941038131714,
"eval_runtime": 73.3071,
"eval_samples_per_second": 136.412,
"eval_steps_per_second": 2.142,
"step": 30000
},
{
"epoch": 0.2854692791566167,
"grad_norm": 17.28813934326172,
"learning_rate": 4.964316300314017e-05,
"loss": 1.9809,
"step": 32000
},
{
"epoch": 0.2854692791566167,
"eval_loss": 1.8658957481384277,
"eval_runtime": 73.2999,
"eval_samples_per_second": 136.426,
"eval_steps_per_second": 2.142,
"step": 32000
},
{
"epoch": 0.30331110910390524,
"grad_norm": 17.0612735748291,
"learning_rate": 4.962086069083643e-05,
"loss": 1.9672,
"step": 34000
},
{
"epoch": 0.30331110910390524,
"eval_loss": 1.8565300703048706,
"eval_runtime": 73.3279,
"eval_samples_per_second": 136.374,
"eval_steps_per_second": 2.141,
"step": 34000
},
{
"epoch": 0.3211529390511938,
"grad_norm": 17.805253982543945,
"learning_rate": 4.959855837853269e-05,
"loss": 1.9517,
"step": 36000
},
{
"epoch": 0.3211529390511938,
"eval_loss": 1.8271287679672241,
"eval_runtime": 73.2647,
"eval_samples_per_second": 136.491,
"eval_steps_per_second": 2.143,
"step": 36000
},
{
"epoch": 0.33899476899848235,
"grad_norm": 16.978797912597656,
"learning_rate": 4.957625606622895e-05,
"loss": 1.9358,
"step": 38000
},
{
"epoch": 0.33899476899848235,
"eval_loss": 1.8138540983200073,
"eval_runtime": 73.3003,
"eval_samples_per_second": 136.425,
"eval_steps_per_second": 2.142,
"step": 38000
},
{
"epoch": 0.35683659894577086,
"grad_norm": 18.134506225585938,
"learning_rate": 4.955395375392521e-05,
"loss": 1.9204,
"step": 40000
},
{
"epoch": 0.35683659894577086,
"eval_loss": 1.8061386346817017,
"eval_runtime": 73.3032,
"eval_samples_per_second": 136.42,
"eval_steps_per_second": 2.142,
"step": 40000
},
{
"epoch": 0.3746784288930594,
"grad_norm": 16.018447875976562,
"learning_rate": 4.953165144162147e-05,
"loss": 1.9103,
"step": 42000
},
{
"epoch": 0.3746784288930594,
"eval_loss": 1.790651559829712,
"eval_runtime": 73.3055,
"eval_samples_per_second": 136.415,
"eval_steps_per_second": 2.142,
"step": 42000
},
{
"epoch": 0.39252025884034797,
"grad_norm": 18.30422592163086,
"learning_rate": 4.950934912931773e-05,
"loss": 1.8984,
"step": 44000
},
{
"epoch": 0.39252025884034797,
"eval_loss": 1.787701964378357,
"eval_runtime": 73.3134,
"eval_samples_per_second": 136.401,
"eval_steps_per_second": 2.141,
"step": 44000
},
{
"epoch": 0.4103620887876365,
"grad_norm": 16.60624122619629,
"learning_rate": 4.9487046817013986e-05,
"loss": 1.89,
"step": 46000
},
{
"epoch": 0.4103620887876365,
"eval_loss": 1.7718769311904907,
"eval_runtime": 73.3399,
"eval_samples_per_second": 136.351,
"eval_steps_per_second": 2.141,
"step": 46000
},
{
"epoch": 0.428203918734925,
"grad_norm": 15.059417724609375,
"learning_rate": 4.964316300314017e-05,
"loss": 1.8775,
"step": 48000
},
{
"epoch": 0.428203918734925,
"eval_loss": 1.763095736503601,
"eval_runtime": 73.3521,
"eval_samples_per_second": 136.329,
"eval_steps_per_second": 2.14,
"step": 48000
},
{
"epoch": 0.4460457486822136,
"grad_norm": 17.129064559936523,
"learning_rate": 4.9628294794937674e-05,
"loss": 1.8687,
"step": 50000
},
{
"epoch": 0.4460457486822136,
"eval_loss": 1.743654727935791,
"eval_runtime": 73.2498,
"eval_samples_per_second": 136.519,
"eval_steps_per_second": 2.143,
"step": 50000
},
{
"epoch": 0.46388757862950214,
"grad_norm": 16.349536895751953,
"learning_rate": 4.947285443645707e-05,
"loss": 1.8632,
"step": 52000
},
{
"epoch": 0.46388757862950214,
"eval_loss": 1.743268609046936,
"eval_runtime": 73.3921,
"eval_samples_per_second": 136.254,
"eval_steps_per_second": 2.139,
"step": 52000
},
{
"epoch": 0.4817294085767907,
"grad_norm": 16.42721939086914,
"learning_rate": 4.9452579607090025e-05,
"loss": 1.8494,
"step": 54000
},
{
"epoch": 0.4817294085767907,
"eval_loss": 1.7285025119781494,
"eval_runtime": 73.2209,
"eval_samples_per_second": 136.573,
"eval_steps_per_second": 2.144,
"step": 54000
},
{
"epoch": 0.4995712385240792,
"grad_norm": 17.674468994140625,
"learning_rate": 4.9432304777722994e-05,
"loss": 1.8404,
"step": 56000
},
{
"epoch": 0.4995712385240792,
"eval_loss": 1.7261757850646973,
"eval_runtime": 73.1858,
"eval_samples_per_second": 136.639,
"eval_steps_per_second": 2.145,
"step": 56000
},
{
"epoch": 0.5174130684713678,
"grad_norm": 16.304468154907227,
"learning_rate": 4.941202994835596e-05,
"loss": 1.8308,
"step": 58000
},
{
"epoch": 0.5174130684713678,
"eval_loss": 1.7157503366470337,
"eval_runtime": 73.1684,
"eval_samples_per_second": 136.671,
"eval_steps_per_second": 2.146,
"step": 58000
},
{
"epoch": 0.5352548984186563,
"grad_norm": 17.134702682495117,
"learning_rate": 4.939175511898892e-05,
"loss": 1.8245,
"step": 60000
},
{
"epoch": 0.5352548984186563,
"eval_loss": 1.7094610929489136,
"eval_runtime": 73.3218,
"eval_samples_per_second": 136.385,
"eval_steps_per_second": 2.141,
"step": 60000
},
{
"epoch": 0.5530967283659448,
"grad_norm": 17.70859146118164,
"learning_rate": 4.937148028962189e-05,
"loss": 1.8201,
"step": 62000
},
{
"epoch": 0.5530967283659448,
"eval_loss": 1.70658540725708,
"eval_runtime": 73.3241,
"eval_samples_per_second": 136.381,
"eval_steps_per_second": 2.141,
"step": 62000
},
{
"epoch": 0.5709385583132334,
"grad_norm": 16.962129592895508,
"learning_rate": 4.935120546025485e-05,
"loss": 1.8107,
"step": 64000
},
{
"epoch": 0.5709385583132334,
"eval_loss": 1.6914931535720825,
"eval_runtime": 73.2479,
"eval_samples_per_second": 136.523,
"eval_steps_per_second": 2.143,
"step": 64000
},
{
"epoch": 0.5887803882605219,
"grad_norm": 16.842283248901367,
"learning_rate": 4.933093063088781e-05,
"loss": 1.8027,
"step": 66000
},
{
"epoch": 0.5887803882605219,
"eval_loss": 1.683428168296814,
"eval_runtime": 73.2102,
"eval_samples_per_second": 136.593,
"eval_steps_per_second": 2.145,
"step": 66000
},
{
"epoch": 0.6066222182078105,
"grad_norm": 17.075162887573242,
"learning_rate": 4.9310655801520775e-05,
"loss": 1.7991,
"step": 68000
},
{
"epoch": 0.6066222182078105,
"eval_loss": 1.688643217086792,
"eval_runtime": 73.3407,
"eval_samples_per_second": 136.35,
"eval_steps_per_second": 2.141,
"step": 68000
},
{
"epoch": 0.624464048155099,
"grad_norm": 17.02593231201172,
"learning_rate": 4.929038097215374e-05,
"loss": 1.7906,
"step": 70000
},
{
"epoch": 0.624464048155099,
"eval_loss": 1.6794742345809937,
"eval_runtime": 73.266,
"eval_samples_per_second": 136.489,
"eval_steps_per_second": 2.143,
"step": 70000
}
],
"logging_steps": 2000,
"max_steps": 4932224,
"num_input_tokens_seen": 0,
"num_train_epochs": 44,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.716630245376e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}