|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 5000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 3e-05, |
|
"loss": 2.3698, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 3e-05, |
|
"loss": 1.8255, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.6054430379746836, |
|
"eval_loss": 1.83919358253479, |
|
"eval_runtime": 4.6476, |
|
"eval_samples_per_second": 107.584, |
|
"eval_steps_per_second": 13.556, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7843, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7546, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"learning_rate": 3e-05, |
|
"loss": 1.7368, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.6078227848101266, |
|
"eval_loss": 1.81111478805542, |
|
"eval_runtime": 4.7063, |
|
"eval_samples_per_second": 106.242, |
|
"eval_steps_per_second": 13.386, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6749, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6689, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.607493670886076, |
|
"eval_loss": 1.8103100061416626, |
|
"eval_runtime": 5.2341, |
|
"eval_samples_per_second": 95.528, |
|
"eval_steps_per_second": 12.037, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"learning_rate": 3e-05, |
|
"loss": 1.6205, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"learning_rate": 3e-05, |
|
"loss": 1.556, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5555, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.6067341772151899, |
|
"eval_loss": 1.8414338827133179, |
|
"eval_runtime": 4.8146, |
|
"eval_samples_per_second": 103.85, |
|
"eval_steps_per_second": 13.085, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4289, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"learning_rate": 3e-05, |
|
"loss": 1.4559, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.6037721518987341, |
|
"eval_loss": 1.8992472887039185, |
|
"eval_runtime": 5.1125, |
|
"eval_samples_per_second": 97.799, |
|
"eval_steps_per_second": 12.323, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3828, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3271, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"learning_rate": 3e-05, |
|
"loss": 1.3514, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.6018227848101266, |
|
"eval_loss": 1.9584064483642578, |
|
"eval_runtime": 4.4025, |
|
"eval_samples_per_second": 113.572, |
|
"eval_steps_per_second": 14.31, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2239, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"learning_rate": 3e-05, |
|
"loss": 1.2491, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.5999746835443038, |
|
"eval_loss": 2.030003070831299, |
|
"eval_runtime": 4.7046, |
|
"eval_samples_per_second": 106.279, |
|
"eval_steps_per_second": 13.391, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1873, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1455, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"learning_rate": 3e-05, |
|
"loss": 1.1749, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.5981518987341772, |
|
"eval_loss": 2.1050591468811035, |
|
"eval_runtime": 4.5572, |
|
"eval_samples_per_second": 109.717, |
|
"eval_steps_per_second": 13.824, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0398, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0769, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.5953924050632912, |
|
"eval_loss": 2.194838762283325, |
|
"eval_runtime": 5.1306, |
|
"eval_samples_per_second": 97.455, |
|
"eval_steps_per_second": 12.279, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0208, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9809, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0134, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.594253164556962, |
|
"eval_loss": 2.2515170574188232, |
|
"eval_runtime": 4.7037, |
|
"eval_samples_per_second": 106.3, |
|
"eval_steps_per_second": 13.394, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 10.4, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8808, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 10.8, |
|
"learning_rate": 3e-05, |
|
"loss": 0.9209, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.592126582278481, |
|
"eval_loss": 2.3421294689178467, |
|
"eval_runtime": 4.5581, |
|
"eval_samples_per_second": 109.695, |
|
"eval_steps_per_second": 13.822, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 11.2, |
|
"learning_rate": 3e-05, |
|
"loss": 0.881, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 11.6, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8321, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"learning_rate": 3e-05, |
|
"loss": 0.8636, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.5905063291139241, |
|
"eval_loss": 2.4442591667175293, |
|
"eval_runtime": 5.1497, |
|
"eval_samples_per_second": 97.093, |
|
"eval_steps_per_second": 12.234, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 12.4, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7437, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7866, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.588, |
|
"eval_loss": 2.557358741760254, |
|
"eval_runtime": 4.558, |
|
"eval_samples_per_second": 109.698, |
|
"eval_steps_per_second": 13.822, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 13.2, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7408, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 13.6, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7067, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"learning_rate": 3e-05, |
|
"loss": 0.7448, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.5866835443037974, |
|
"eval_loss": 2.579989194869995, |
|
"eval_runtime": 4.4082, |
|
"eval_samples_per_second": 113.424, |
|
"eval_steps_per_second": 14.291, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 14.4, |
|
"learning_rate": 3e-05, |
|
"loss": 0.637, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 14.8, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6709, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.5845569620253165, |
|
"eval_loss": 2.6911704540252686, |
|
"eval_runtime": 5.1406, |
|
"eval_samples_per_second": 97.265, |
|
"eval_steps_per_second": 12.255, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 15.2, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6375, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 15.6, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6088, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6439, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.5853164556962025, |
|
"eval_loss": 2.7545602321624756, |
|
"eval_runtime": 4.7065, |
|
"eval_samples_per_second": 106.237, |
|
"eval_steps_per_second": 13.386, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 16.4, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5552, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 16.8, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5869, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.5831139240506329, |
|
"eval_loss": 2.799652338027954, |
|
"eval_runtime": 5.213, |
|
"eval_samples_per_second": 95.914, |
|
"eval_steps_per_second": 12.085, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 17.2, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5547, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 17.6, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5336, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5596, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.5832911392405064, |
|
"eval_loss": 2.843494176864624, |
|
"eval_runtime": 4.6373, |
|
"eval_samples_per_second": 107.822, |
|
"eval_steps_per_second": 13.586, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 18.4, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4871, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 18.8, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5205, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.5832911392405064, |
|
"eval_loss": 2.9509618282318115, |
|
"eval_runtime": 4.4063, |
|
"eval_samples_per_second": 113.473, |
|
"eval_steps_per_second": 14.298, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 19.2, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4924, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 19.6, |
|
"learning_rate": 3e-05, |
|
"loss": 0.4789, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"learning_rate": 3e-05, |
|
"loss": 0.5045, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.5824050632911393, |
|
"eval_loss": 2.9796953201293945, |
|
"eval_runtime": 5.1055, |
|
"eval_samples_per_second": 97.933, |
|
"eval_steps_per_second": 12.34, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 5000, |
|
"total_flos": 3.1967425075347456e+17, |
|
"train_loss": 1.043557526397705, |
|
"train_runtime": 3605.2187, |
|
"train_samples_per_second": 44.38, |
|
"train_steps_per_second": 1.387 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 5000, |
|
"num_train_epochs": 20, |
|
"save_steps": 500, |
|
"total_flos": 3.1967425075347456e+17, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|