{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.987551867219917,
  "eval_steps": 500,
  "global_step": 360,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08298755186721991,
      "grad_norm": 0.4138890994781238,
      "learning_rate": 5e-06,
      "loss": 0.6819,
      "step": 10
    },
    {
      "epoch": 0.16597510373443983,
      "grad_norm": 0.29279799441362253,
      "learning_rate": 5e-06,
      "loss": 0.6152,
      "step": 20
    },
    {
      "epoch": 0.24896265560165975,
      "grad_norm": 0.22155490022279595,
      "learning_rate": 5e-06,
      "loss": 0.5911,
      "step": 30
    },
    {
      "epoch": 0.33195020746887965,
      "grad_norm": 0.21703792973144043,
      "learning_rate": 5e-06,
      "loss": 0.5758,
      "step": 40
    },
    {
      "epoch": 0.4149377593360996,
      "grad_norm": 0.19782125949582666,
      "learning_rate": 5e-06,
      "loss": 0.5658,
      "step": 50
    },
    {
      "epoch": 0.4979253112033195,
      "grad_norm": 0.20429620764864578,
      "learning_rate": 5e-06,
      "loss": 0.5621,
      "step": 60
    },
    {
      "epoch": 0.5809128630705395,
      "grad_norm": 0.1947179711144,
      "learning_rate": 5e-06,
      "loss": 0.5481,
      "step": 70
    },
    {
      "epoch": 0.6639004149377593,
      "grad_norm": 0.20952965041956714,
      "learning_rate": 5e-06,
      "loss": 0.5491,
      "step": 80
    },
    {
      "epoch": 0.7468879668049793,
      "grad_norm": 0.2092027679734135,
      "learning_rate": 5e-06,
      "loss": 0.5473,
      "step": 90
    },
    {
      "epoch": 0.8298755186721992,
      "grad_norm": 0.1973704614234666,
      "learning_rate": 5e-06,
      "loss": 0.5412,
      "step": 100
    },
    {
      "epoch": 0.9128630705394191,
      "grad_norm": 0.22215520376065145,
      "learning_rate": 5e-06,
      "loss": 0.5361,
      "step": 110
    },
    {
      "epoch": 0.995850622406639,
      "grad_norm": 0.20002555613598916,
      "learning_rate": 5e-06,
      "loss": 0.5354,
      "step": 120
    },
    {
      "epoch": 0.995850622406639,
      "eval_loss": 0.5283368229866028,
      "eval_runtime": 121.5237,
      "eval_samples_per_second": 26.703,
      "eval_steps_per_second": 0.42,
      "step": 120
    },
    {
      "epoch": 1.0788381742738589,
      "grad_norm": 0.23151869944663353,
      "learning_rate": 5e-06,
      "loss": 0.5334,
      "step": 130
    },
    {
      "epoch": 1.161825726141079,
      "grad_norm": 0.20416069004838694,
      "learning_rate": 5e-06,
      "loss": 0.5122,
      "step": 140
    },
    {
      "epoch": 1.2448132780082988,
      "grad_norm": 0.21256654137396935,
      "learning_rate": 5e-06,
      "loss": 0.509,
      "step": 150
    },
    {
      "epoch": 1.3278008298755186,
      "grad_norm": 0.21018667523519946,
      "learning_rate": 5e-06,
      "loss": 0.5041,
      "step": 160
    },
    {
      "epoch": 1.4107883817427385,
      "grad_norm": 0.219240042940767,
      "learning_rate": 5e-06,
      "loss": 0.4998,
      "step": 170
    },
    {
      "epoch": 1.4937759336099585,
      "grad_norm": 0.22681455392212077,
      "learning_rate": 5e-06,
      "loss": 0.5037,
      "step": 180
    },
    {
      "epoch": 1.5767634854771784,
      "grad_norm": 0.227133839723048,
      "learning_rate": 5e-06,
      "loss": 0.4977,
      "step": 190
    },
    {
      "epoch": 1.6597510373443982,
      "grad_norm": 0.21040711904959797,
      "learning_rate": 5e-06,
      "loss": 0.4941,
      "step": 200
    },
    {
      "epoch": 1.7427385892116183,
      "grad_norm": 0.23482785666403702,
      "learning_rate": 5e-06,
      "loss": 0.4945,
      "step": 210
    },
    {
      "epoch": 1.8257261410788381,
      "grad_norm": 0.2035179907011211,
      "learning_rate": 5e-06,
      "loss": 0.4904,
      "step": 220
    },
    {
      "epoch": 1.908713692946058,
      "grad_norm": 0.21720290177963564,
      "learning_rate": 5e-06,
      "loss": 0.491,
      "step": 230
    },
    {
      "epoch": 1.991701244813278,
      "grad_norm": 0.2214820393037949,
      "learning_rate": 5e-06,
      "loss": 0.4901,
      "step": 240
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5045989155769348,
      "eval_runtime": 122.3281,
      "eval_samples_per_second": 26.527,
      "eval_steps_per_second": 0.417,
      "step": 241
    },
    {
      "epoch": 2.074688796680498,
      "grad_norm": 0.25601226331965665,
      "learning_rate": 5e-06,
      "loss": 0.4945,
      "step": 250
    },
    {
      "epoch": 2.1576763485477177,
      "grad_norm": 0.2537099080076595,
      "learning_rate": 5e-06,
      "loss": 0.4617,
      "step": 260
    },
    {
      "epoch": 2.240663900414938,
      "grad_norm": 0.2445352596834903,
      "learning_rate": 5e-06,
      "loss": 0.4648,
      "step": 270
    },
    {
      "epoch": 2.323651452282158,
      "grad_norm": 0.24195048816699535,
      "learning_rate": 5e-06,
      "loss": 0.4688,
      "step": 280
    },
    {
      "epoch": 2.4066390041493775,
      "grad_norm": 0.3297443855710949,
      "learning_rate": 5e-06,
      "loss": 0.46,
      "step": 290
    },
    {
      "epoch": 2.4896265560165975,
      "grad_norm": 0.2227067008121754,
      "learning_rate": 5e-06,
      "loss": 0.4679,
      "step": 300
    },
    {
      "epoch": 2.572614107883817,
      "grad_norm": 0.24268677689146825,
      "learning_rate": 5e-06,
      "loss": 0.4642,
      "step": 310
    },
    {
      "epoch": 2.6556016597510372,
      "grad_norm": 0.24131530500929413,
      "learning_rate": 5e-06,
      "loss": 0.4597,
      "step": 320
    },
    {
      "epoch": 2.7385892116182573,
      "grad_norm": 0.22997089130920098,
      "learning_rate": 5e-06,
      "loss": 0.4617,
      "step": 330
    },
    {
      "epoch": 2.821576763485477,
      "grad_norm": 0.23994756278793414,
      "learning_rate": 5e-06,
      "loss": 0.4597,
      "step": 340
    },
    {
      "epoch": 2.904564315352697,
      "grad_norm": 0.23257285232469585,
      "learning_rate": 5e-06,
      "loss": 0.4545,
      "step": 350
    },
    {
      "epoch": 2.987551867219917,
      "grad_norm": 0.22525776234601527,
      "learning_rate": 5e-06,
      "loss": 0.4618,
      "step": 360
    },
    {
      "epoch": 2.987551867219917,
      "eval_loss": 0.49428611993789673,
      "eval_runtime": 121.9138,
      "eval_samples_per_second": 26.617,
      "eval_steps_per_second": 0.418,
      "step": 360
    },
    {
      "epoch": 2.987551867219917,
      "step": 360,
      "total_flos": 602804028702720.0,
      "train_loss": 0.5124640332327949,
      "train_runtime": 20041.158,
      "train_samples_per_second": 9.227,
      "train_steps_per_second": 0.018
    }
  ],
  "logging_steps": 10,
  "max_steps": 360,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 602804028702720.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}