|
{ |
|
"best_metric": 1.6794742345809937, |
|
"best_model_checkpoint": "models/dehanalkautsar/mbert-uncased-modified_embedding_table-en/checkpoint-70000", |
|
"epoch": 0.624464048155099, |
|
"eval_steps": 2000, |
|
"global_step": 70000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.017841829947288543, |
|
"grad_norm": 31.71474266052246, |
|
"learning_rate": 4.9977697687696264e-05, |
|
"loss": 5.6683, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.017841829947288543, |
|
"eval_loss": 3.4221630096435547, |
|
"eval_runtime": 73.4656, |
|
"eval_samples_per_second": 136.118, |
|
"eval_steps_per_second": 2.137, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.035683659894577086, |
|
"grad_norm": 20.9453182220459, |
|
"learning_rate": 4.995539537539252e-05, |
|
"loss": 3.1985, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.035683659894577086, |
|
"eval_loss": 2.7761876583099365, |
|
"eval_runtime": 73.4155, |
|
"eval_samples_per_second": 136.211, |
|
"eval_steps_per_second": 2.139, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.05352548984186563, |
|
"grad_norm": 18.989194869995117, |
|
"learning_rate": 4.993309306308878e-05, |
|
"loss": 2.7816, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.05352548984186563, |
|
"eval_loss": 2.5367112159729004, |
|
"eval_runtime": 73.3987, |
|
"eval_samples_per_second": 136.242, |
|
"eval_steps_per_second": 2.139, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.07136731978915417, |
|
"grad_norm": 20.059371948242188, |
|
"learning_rate": 4.991079075078504e-05, |
|
"loss": 2.5767, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.07136731978915417, |
|
"eval_loss": 2.3765993118286133, |
|
"eval_runtime": 73.3358, |
|
"eval_samples_per_second": 136.359, |
|
"eval_steps_per_second": 2.141, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.08920914973644271, |
|
"grad_norm": 18.088693618774414, |
|
"learning_rate": 4.9888488438481305e-05, |
|
"loss": 2.4472, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.08920914973644271, |
|
"eval_loss": 2.261122226715088, |
|
"eval_runtime": 73.3532, |
|
"eval_samples_per_second": 136.327, |
|
"eval_steps_per_second": 2.14, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.10705097968373126, |
|
"grad_norm": 18.355438232421875, |
|
"learning_rate": 4.986618612617757e-05, |
|
"loss": 2.3517, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.10705097968373126, |
|
"eval_loss": 2.197890520095825, |
|
"eval_runtime": 73.2175, |
|
"eval_samples_per_second": 136.579, |
|
"eval_steps_per_second": 2.144, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.1248928096310198, |
|
"grad_norm": 19.445158004760742, |
|
"learning_rate": 4.984388381387383e-05, |
|
"loss": 2.2819, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.1248928096310198, |
|
"eval_loss": 2.135493040084839, |
|
"eval_runtime": 73.523, |
|
"eval_samples_per_second": 136.012, |
|
"eval_steps_per_second": 2.135, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.14273463957830834, |
|
"grad_norm": 16.74393081665039, |
|
"learning_rate": 4.9821581501570084e-05, |
|
"loss": 2.2253, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.14273463957830834, |
|
"eval_loss": 2.0676965713500977, |
|
"eval_runtime": 73.4191, |
|
"eval_samples_per_second": 136.204, |
|
"eval_steps_per_second": 2.138, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.1605764695255969, |
|
"grad_norm": 18.755170822143555, |
|
"learning_rate": 4.9799279189266346e-05, |
|
"loss": 2.1737, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.1605764695255969, |
|
"eval_loss": 2.0221915245056152, |
|
"eval_runtime": 73.3608, |
|
"eval_samples_per_second": 136.313, |
|
"eval_steps_per_second": 2.14, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.17841829947288543, |
|
"grad_norm": 18.238853454589844, |
|
"learning_rate": 4.977697687696261e-05, |
|
"loss": 2.1339, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.17841829947288543, |
|
"eval_loss": 1.9968066215515137, |
|
"eval_runtime": 73.376, |
|
"eval_samples_per_second": 136.284, |
|
"eval_steps_per_second": 2.14, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.19626012942017398, |
|
"grad_norm": 16.624298095703125, |
|
"learning_rate": 4.975467456465886e-05, |
|
"loss": 2.097, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.19626012942017398, |
|
"eval_loss": 1.973600149154663, |
|
"eval_runtime": 73.4156, |
|
"eval_samples_per_second": 136.211, |
|
"eval_steps_per_second": 2.139, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.2141019593674625, |
|
"grad_norm": 21.57083511352539, |
|
"learning_rate": 4.9732372252355125e-05, |
|
"loss": 2.0679, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.2141019593674625, |
|
"eval_loss": 1.9505703449249268, |
|
"eval_runtime": 73.2384, |
|
"eval_samples_per_second": 136.54, |
|
"eval_steps_per_second": 2.144, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.23194378931475107, |
|
"grad_norm": 18.031625747680664, |
|
"learning_rate": 4.971006994005139e-05, |
|
"loss": 2.0474, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.23194378931475107, |
|
"eval_loss": 1.9203472137451172, |
|
"eval_runtime": 73.2703, |
|
"eval_samples_per_second": 136.481, |
|
"eval_steps_per_second": 2.143, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.2497856192620396, |
|
"grad_norm": 18.387907028198242, |
|
"learning_rate": 4.968776762774764e-05, |
|
"loss": 2.0235, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.2497856192620396, |
|
"eval_loss": 1.9067487716674805, |
|
"eval_runtime": 73.2711, |
|
"eval_samples_per_second": 136.479, |
|
"eval_steps_per_second": 2.143, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.2676274492093281, |
|
"grad_norm": 16.74208641052246, |
|
"learning_rate": 4.966546531544391e-05, |
|
"loss": 2.0007, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.2676274492093281, |
|
"eval_loss": 1.8875941038131714, |
|
"eval_runtime": 73.3071, |
|
"eval_samples_per_second": 136.412, |
|
"eval_steps_per_second": 2.142, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.2854692791566167, |
|
"grad_norm": 17.28813934326172, |
|
"learning_rate": 4.964316300314017e-05, |
|
"loss": 1.9809, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.2854692791566167, |
|
"eval_loss": 1.8658957481384277, |
|
"eval_runtime": 73.2999, |
|
"eval_samples_per_second": 136.426, |
|
"eval_steps_per_second": 2.142, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.30331110910390524, |
|
"grad_norm": 17.0612735748291, |
|
"learning_rate": 4.962086069083643e-05, |
|
"loss": 1.9672, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.30331110910390524, |
|
"eval_loss": 1.8565300703048706, |
|
"eval_runtime": 73.3279, |
|
"eval_samples_per_second": 136.374, |
|
"eval_steps_per_second": 2.141, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.3211529390511938, |
|
"grad_norm": 17.805253982543945, |
|
"learning_rate": 4.959855837853269e-05, |
|
"loss": 1.9517, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.3211529390511938, |
|
"eval_loss": 1.8271287679672241, |
|
"eval_runtime": 73.2647, |
|
"eval_samples_per_second": 136.491, |
|
"eval_steps_per_second": 2.143, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.33899476899848235, |
|
"grad_norm": 16.978797912597656, |
|
"learning_rate": 4.957625606622895e-05, |
|
"loss": 1.9358, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.33899476899848235, |
|
"eval_loss": 1.8138540983200073, |
|
"eval_runtime": 73.3003, |
|
"eval_samples_per_second": 136.425, |
|
"eval_steps_per_second": 2.142, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.35683659894577086, |
|
"grad_norm": 18.134506225585938, |
|
"learning_rate": 4.955395375392521e-05, |
|
"loss": 1.9204, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.35683659894577086, |
|
"eval_loss": 1.8061386346817017, |
|
"eval_runtime": 73.3032, |
|
"eval_samples_per_second": 136.42, |
|
"eval_steps_per_second": 2.142, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.3746784288930594, |
|
"grad_norm": 16.018447875976562, |
|
"learning_rate": 4.953165144162147e-05, |
|
"loss": 1.9103, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.3746784288930594, |
|
"eval_loss": 1.790651559829712, |
|
"eval_runtime": 73.3055, |
|
"eval_samples_per_second": 136.415, |
|
"eval_steps_per_second": 2.142, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.39252025884034797, |
|
"grad_norm": 18.30422592163086, |
|
"learning_rate": 4.950934912931773e-05, |
|
"loss": 1.8984, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.39252025884034797, |
|
"eval_loss": 1.787701964378357, |
|
"eval_runtime": 73.3134, |
|
"eval_samples_per_second": 136.401, |
|
"eval_steps_per_second": 2.141, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.4103620887876365, |
|
"grad_norm": 16.60624122619629, |
|
"learning_rate": 4.9487046817013986e-05, |
|
"loss": 1.89, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.4103620887876365, |
|
"eval_loss": 1.7718769311904907, |
|
"eval_runtime": 73.3399, |
|
"eval_samples_per_second": 136.351, |
|
"eval_steps_per_second": 2.141, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 0.428203918734925, |
|
"grad_norm": 15.059417724609375, |
|
"learning_rate": 4.964316300314017e-05, |
|
"loss": 1.8775, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.428203918734925, |
|
"eval_loss": 1.763095736503601, |
|
"eval_runtime": 73.3521, |
|
"eval_samples_per_second": 136.329, |
|
"eval_steps_per_second": 2.14, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 0.4460457486822136, |
|
"grad_norm": 17.129064559936523, |
|
"learning_rate": 4.9628294794937674e-05, |
|
"loss": 1.8687, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.4460457486822136, |
|
"eval_loss": 1.743654727935791, |
|
"eval_runtime": 73.2498, |
|
"eval_samples_per_second": 136.519, |
|
"eval_steps_per_second": 2.143, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 0.46388757862950214, |
|
"grad_norm": 16.349536895751953, |
|
"learning_rate": 4.947285443645707e-05, |
|
"loss": 1.8632, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.46388757862950214, |
|
"eval_loss": 1.743268609046936, |
|
"eval_runtime": 73.3921, |
|
"eval_samples_per_second": 136.254, |
|
"eval_steps_per_second": 2.139, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 0.4817294085767907, |
|
"grad_norm": 16.42721939086914, |
|
"learning_rate": 4.9452579607090025e-05, |
|
"loss": 1.8494, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.4817294085767907, |
|
"eval_loss": 1.7285025119781494, |
|
"eval_runtime": 73.2209, |
|
"eval_samples_per_second": 136.573, |
|
"eval_steps_per_second": 2.144, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 0.4995712385240792, |
|
"grad_norm": 17.674468994140625, |
|
"learning_rate": 4.9432304777722994e-05, |
|
"loss": 1.8404, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.4995712385240792, |
|
"eval_loss": 1.7261757850646973, |
|
"eval_runtime": 73.1858, |
|
"eval_samples_per_second": 136.639, |
|
"eval_steps_per_second": 2.145, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 0.5174130684713678, |
|
"grad_norm": 16.304468154907227, |
|
"learning_rate": 4.941202994835596e-05, |
|
"loss": 1.8308, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.5174130684713678, |
|
"eval_loss": 1.7157503366470337, |
|
"eval_runtime": 73.1684, |
|
"eval_samples_per_second": 136.671, |
|
"eval_steps_per_second": 2.146, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 0.5352548984186563, |
|
"grad_norm": 17.134702682495117, |
|
"learning_rate": 4.939175511898892e-05, |
|
"loss": 1.8245, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.5352548984186563, |
|
"eval_loss": 1.7094610929489136, |
|
"eval_runtime": 73.3218, |
|
"eval_samples_per_second": 136.385, |
|
"eval_steps_per_second": 2.141, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 0.5530967283659448, |
|
"grad_norm": 17.70859146118164, |
|
"learning_rate": 4.937148028962189e-05, |
|
"loss": 1.8201, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.5530967283659448, |
|
"eval_loss": 1.70658540725708, |
|
"eval_runtime": 73.3241, |
|
"eval_samples_per_second": 136.381, |
|
"eval_steps_per_second": 2.141, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 0.5709385583132334, |
|
"grad_norm": 16.962129592895508, |
|
"learning_rate": 4.935120546025485e-05, |
|
"loss": 1.8107, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.5709385583132334, |
|
"eval_loss": 1.6914931535720825, |
|
"eval_runtime": 73.2479, |
|
"eval_samples_per_second": 136.523, |
|
"eval_steps_per_second": 2.143, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 0.5887803882605219, |
|
"grad_norm": 16.842283248901367, |
|
"learning_rate": 4.933093063088781e-05, |
|
"loss": 1.8027, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.5887803882605219, |
|
"eval_loss": 1.683428168296814, |
|
"eval_runtime": 73.2102, |
|
"eval_samples_per_second": 136.593, |
|
"eval_steps_per_second": 2.145, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 0.6066222182078105, |
|
"grad_norm": 17.075162887573242, |
|
"learning_rate": 4.9310655801520775e-05, |
|
"loss": 1.7991, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.6066222182078105, |
|
"eval_loss": 1.688643217086792, |
|
"eval_runtime": 73.3407, |
|
"eval_samples_per_second": 136.35, |
|
"eval_steps_per_second": 2.141, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 0.624464048155099, |
|
"grad_norm": 17.02593231201172, |
|
"learning_rate": 4.929038097215374e-05, |
|
"loss": 1.7906, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 0.624464048155099, |
|
"eval_loss": 1.6794742345809937, |
|
"eval_runtime": 73.266, |
|
"eval_samples_per_second": 136.489, |
|
"eval_steps_per_second": 2.143, |
|
"step": 70000 |
|
} |
|
], |
|
"logging_steps": 2000, |
|
"max_steps": 4932224, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 44, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.716630245376e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|