{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 1129,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 1.0792332887649536,
      "learning_rate": 6.637168141592921e-06,
      "loss": 2.8189,
      "step": 25
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.3077887296676636,
      "learning_rate": 1.3274336283185841e-05,
      "loss": 2.6588,
      "step": 50
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.2354748249053955,
      "learning_rate": 1.991150442477876e-05,
      "loss": 2.4479,
      "step": 75
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.6841175556182861,
      "learning_rate": 2.6548672566371683e-05,
      "loss": 2.2587,
      "step": 100
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.5395749807357788,
      "learning_rate": 2.9645669291338583e-05,
      "loss": 1.9755,
      "step": 125
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.49534282088279724,
      "learning_rate": 2.890748031496063e-05,
      "loss": 1.8501,
      "step": 150
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.40641939640045166,
      "learning_rate": 2.8169291338582678e-05,
      "loss": 1.8908,
      "step": 175
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.39455121755599976,
      "learning_rate": 2.7431102362204727e-05,
      "loss": 1.7573,
      "step": 200
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.48203709721565247,
      "learning_rate": 2.6722440944881888e-05,
      "loss": 1.6421,
      "step": 225
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.6426458358764648,
      "learning_rate": 2.5984251968503937e-05,
      "loss": 1.738,
      "step": 250
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.446591854095459,
      "learning_rate": 2.5246062992125983e-05,
      "loss": 1.6237,
      "step": 275
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.43455299735069275,
      "learning_rate": 2.450787401574803e-05,
      "loss": 1.6079,
      "step": 300
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.4745838940143585,
      "learning_rate": 2.376968503937008e-05,
      "loss": 1.4791,
      "step": 325
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.4581379294395447,
      "learning_rate": 2.3031496062992126e-05,
      "loss": 1.6389,
      "step": 350
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.4056738018989563,
      "learning_rate": 2.2293307086614175e-05,
      "loss": 1.5783,
      "step": 375
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.48176687955856323,
      "learning_rate": 2.155511811023622e-05,
      "loss": 1.5243,
      "step": 400
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.548370361328125,
      "learning_rate": 2.0816929133858267e-05,
      "loss": 1.4877,
      "step": 425
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.5029275417327881,
      "learning_rate": 2.0078740157480316e-05,
      "loss": 1.5826,
      "step": 450
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.5703559517860413,
      "learning_rate": 1.934055118110236e-05,
      "loss": 1.5118,
      "step": 475
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.4942224323749542,
      "learning_rate": 1.860236220472441e-05,
      "loss": 1.5043,
      "step": 500
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.5140496492385864,
      "learning_rate": 1.786417322834646e-05,
      "loss": 1.53,
      "step": 525
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.5072318911552429,
      "learning_rate": 1.7125984251968505e-05,
      "loss": 1.4875,
      "step": 550
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.5855621695518494,
      "learning_rate": 1.6387795275590554e-05,
      "loss": 1.4771,
      "step": 575
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.47852256894111633,
      "learning_rate": 1.5649606299212596e-05,
      "loss": 1.3911,
      "step": 600
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.4826895594596863,
      "learning_rate": 1.4911417322834647e-05,
      "loss": 1.3855,
      "step": 625
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.5134251117706299,
      "learning_rate": 1.4173228346456692e-05,
      "loss": 1.4103,
      "step": 650
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.4681044816970825,
      "learning_rate": 1.343503937007874e-05,
      "loss": 1.4102,
      "step": 675
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.5529048442840576,
      "learning_rate": 1.2696850393700789e-05,
      "loss": 1.4231,
      "step": 700
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.5898122787475586,
      "learning_rate": 1.1958661417322836e-05,
      "loss": 1.3718,
      "step": 725
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.766099214553833,
      "learning_rate": 1.1220472440944882e-05,
      "loss": 1.3482,
      "step": 750
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.5822522044181824,
      "learning_rate": 1.0482283464566929e-05,
      "loss": 1.2873,
      "step": 775
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.49051281809806824,
      "learning_rate": 9.744094488188976e-06,
      "loss": 1.3268,
      "step": 800
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.5041054487228394,
      "learning_rate": 9.005905511811024e-06,
      "loss": 1.3923,
      "step": 825
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.5318450331687927,
      "learning_rate": 8.267716535433071e-06,
      "loss": 1.3487,
      "step": 850
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.5623608827590942,
      "learning_rate": 7.529527559055118e-06,
      "loss": 1.4091,
      "step": 875
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.5455105900764465,
      "learning_rate": 6.791338582677165e-06,
      "loss": 1.3664,
      "step": 900
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.5180054903030396,
      "learning_rate": 6.053149606299213e-06,
      "loss": 1.3277,
      "step": 925
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.641249418258667,
      "learning_rate": 5.3149606299212595e-06,
      "loss": 1.4015,
      "step": 950
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.4672348201274872,
      "learning_rate": 4.576771653543308e-06,
      "loss": 1.3587,
      "step": 975
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.468883752822876,
      "learning_rate": 3.838582677165354e-06,
      "loss": 1.4637,
      "step": 1000
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.5818473100662231,
      "learning_rate": 3.1003937007874014e-06,
      "loss": 1.3372,
      "step": 1025
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.6030290126800537,
      "learning_rate": 2.3622047244094487e-06,
      "loss": 1.4148,
      "step": 1050
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.5191698670387268,
      "learning_rate": 1.624015748031496e-06,
      "loss": 1.3975,
      "step": 1075
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.48896682262420654,
      "learning_rate": 8.858267716535433e-07,
      "loss": 1.4067,
      "step": 1100
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.46865954995155334,
      "learning_rate": 1.4763779527559055e-07,
      "loss": 1.3374,
      "step": 1125
    }
  ],
  "logging_steps": 25,
  "max_steps": 1129,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 2425451377065984.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}