|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.447470817120623, |
|
"eval_steps": 50, |
|
"global_step": 350, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1556420233463035, |
|
"grad_norm": 5.363790512084961, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 2.5426, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.311284046692607, |
|
"grad_norm": 5.297606468200684, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 2.4551, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.4669260700389105, |
|
"grad_norm": 1.6734049320220947, |
|
"learning_rate": 9.999999999999999e-06, |
|
"loss": 1.5765, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.622568093385214, |
|
"grad_norm": 0.5996205806732178, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.9482, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.7782101167315175, |
|
"grad_norm": 0.5326632857322693, |
|
"learning_rate": 1.4994303528285384e-05, |
|
"loss": 0.7767, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.7782101167315175, |
|
"eval_loss": 0.7420370578765869, |
|
"eval_runtime": 75.0351, |
|
"eval_samples_per_second": 3.052, |
|
"eval_steps_per_second": 1.533, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.933852140077821, |
|
"grad_norm": 0.47296807169914246, |
|
"learning_rate": 1.4948783661087592e-05, |
|
"loss": 0.6917, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.0894941634241244, |
|
"grad_norm": 0.5496929883956909, |
|
"learning_rate": 1.485802041113141e-05, |
|
"loss": 0.5968, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.245136186770428, |
|
"grad_norm": 0.5656551122665405, |
|
"learning_rate": 1.4722565067948796e-05, |
|
"loss": 0.5096, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.4007782101167314, |
|
"grad_norm": 0.7151289582252502, |
|
"learning_rate": 1.454324037767081e-05, |
|
"loss": 0.5053, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.556420233463035, |
|
"grad_norm": 0.7926196455955505, |
|
"learning_rate": 1.4321135545726069e-05, |
|
"loss": 0.4193, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.556420233463035, |
|
"eval_loss": 0.4160374701023102, |
|
"eval_runtime": 75.0144, |
|
"eval_samples_per_second": 3.053, |
|
"eval_steps_per_second": 1.533, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.7120622568093387, |
|
"grad_norm": 0.7219254970550537, |
|
"learning_rate": 1.4057599621084365e-05, |
|
"loss": 0.3695, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.867704280155642, |
|
"grad_norm": 0.7828359007835388, |
|
"learning_rate": 1.3754233302229055e-05, |
|
"loss": 0.3554, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.0233463035019454, |
|
"grad_norm": 0.6474707126617432, |
|
"learning_rate": 1.3412879214628194e-05, |
|
"loss": 0.3115, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.178988326848249, |
|
"grad_norm": 0.8858464360237122, |
|
"learning_rate": 1.3035610718758362e-05, |
|
"loss": 0.2754, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.3346303501945527, |
|
"grad_norm": 0.6484001278877258, |
|
"learning_rate": 1.2624719316660416e-05, |
|
"loss": 0.2815, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.3346303501945527, |
|
"eval_loss": 0.3077065050601959, |
|
"eval_runtime": 74.9901, |
|
"eval_samples_per_second": 3.054, |
|
"eval_steps_per_second": 1.534, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.490272373540856, |
|
"grad_norm": 0.9611061811447144, |
|
"learning_rate": 1.218270073351891e-05, |
|
"loss": 0.2641, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.6459143968871595, |
|
"grad_norm": 1.0954933166503906, |
|
"learning_rate": 1.1712239758804626e-05, |
|
"loss": 0.2675, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.801556420233463, |
|
"grad_norm": 0.7623236775398254, |
|
"learning_rate": 1.1216193939054056e-05, |
|
"loss": 0.26, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.9571984435797667, |
|
"grad_norm": 0.8693030476570129, |
|
"learning_rate": 1.0697576221334781e-05, |
|
"loss": 0.2494, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.11284046692607, |
|
"grad_norm": 0.9107272028923035, |
|
"learning_rate": 1.0159536652819018e-05, |
|
"loss": 0.2131, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.11284046692607, |
|
"eval_loss": 0.26981863379478455, |
|
"eval_runtime": 75.23, |
|
"eval_samples_per_second": 3.044, |
|
"eval_steps_per_second": 1.529, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.2684824902723735, |
|
"grad_norm": 0.7745103240013123, |
|
"learning_rate": 9.605343247620892e-06, |
|
"loss": 0.2036, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.424124513618677, |
|
"grad_norm": 0.9593366980552673, |
|
"learning_rate": 9.038362137110888e-06, |
|
"loss": 0.2013, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.5797665369649807, |
|
"grad_norm": 1.2859652042388916, |
|
"learning_rate": 8.462037124273056e-06, |
|
"loss": 0.195, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.735408560311284, |
|
"grad_norm": 1.0107340812683105, |
|
"learning_rate": 7.879868766290346e-06, |
|
"loss": 0.1884, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.8910505836575875, |
|
"grad_norm": 0.8121142387390137, |
|
"learning_rate": 7.2953931124089755e-06, |
|
"loss": 0.2117, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.8910505836575875, |
|
"eval_loss": 0.25344181060791016, |
|
"eval_runtime": 75.1186, |
|
"eval_samples_per_second": 3.049, |
|
"eval_steps_per_second": 1.531, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.046692607003891, |
|
"grad_norm": 0.7917193174362183, |
|
"learning_rate": 6.71216022622653e-06, |
|
"loss": 0.2004, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 4.202334630350195, |
|
"grad_norm": 1.1348683834075928, |
|
"learning_rate": 6.133712622857887e-06, |
|
"loss": 0.1652, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 4.357976653696498, |
|
"grad_norm": 1.1503227949142456, |
|
"learning_rate": 5.563563751950078e-06, |
|
"loss": 0.1647, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.5136186770428015, |
|
"grad_norm": 0.8282884359359741, |
|
"learning_rate": 5.005176657238754e-06, |
|
"loss": 0.1613, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.669260700389105, |
|
"grad_norm": 1.0249297618865967, |
|
"learning_rate": 4.461942942266561e-06, |
|
"loss": 0.1512, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.669260700389105, |
|
"eval_loss": 0.2489164173603058, |
|
"eval_runtime": 75.279, |
|
"eval_samples_per_second": 3.042, |
|
"eval_steps_per_second": 1.528, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.824902723735408, |
|
"grad_norm": 0.966607391834259, |
|
"learning_rate": 3.937162170024189e-06, |
|
"loss": 0.1582, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.980544747081712, |
|
"grad_norm": 0.8376232385635376, |
|
"learning_rate": 3.4340218216391697e-06, |
|
"loss": 0.1485, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 5.136186770428016, |
|
"grad_norm": 0.9548529386520386, |
|
"learning_rate": 2.955577935841975e-06, |
|
"loss": 0.1427, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 5.291828793774319, |
|
"grad_norm": 0.9749653935432434, |
|
"learning_rate": 2.5047365468038768e-06, |
|
"loss": 0.1335, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 5.447470817120623, |
|
"grad_norm": 1.2298961877822876, |
|
"learning_rate": 2.0842360330918544e-06, |
|
"loss": 0.1235, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.447470817120623, |
|
"eval_loss": 0.2625938951969147, |
|
"eval_runtime": 74.8495, |
|
"eval_samples_per_second": 3.059, |
|
"eval_steps_per_second": 1.536, |
|
"step": 350 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 448, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.3919626884425974e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|