|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 504, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05952380952380952, |
|
"grad_norm": 11.642287552521111, |
|
"learning_rate": 2.631578947368421e-07, |
|
"loss": 0.8829, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.11904761904761904, |
|
"grad_norm": 7.106858407842631, |
|
"learning_rate": 5.263157894736842e-07, |
|
"loss": 0.8087, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 2.093269967612724, |
|
"learning_rate": 7.894736842105263e-07, |
|
"loss": 0.7296, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 1.2477023318233176, |
|
"learning_rate": 1.0526315789473683e-06, |
|
"loss": 0.6712, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2976190476190476, |
|
"grad_norm": 1.0512343429003204, |
|
"learning_rate": 1.3157894736842106e-06, |
|
"loss": 0.6321, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 4.474475922392763, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 0.6026, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 1.6529279657405915, |
|
"learning_rate": 1.8421052631578946e-06, |
|
"loss": 0.5884, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.47619047619047616, |
|
"grad_norm": 2.7949986781945366, |
|
"learning_rate": 1.9996767546702485e-06, |
|
"loss": 0.5783, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5357142857142857, |
|
"grad_norm": 2.118745503355743, |
|
"learning_rate": 1.996043443883064e-06, |
|
"loss": 0.5706, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5952380952380952, |
|
"grad_norm": 2.0357375426196147, |
|
"learning_rate": 1.988392397752233e-06, |
|
"loss": 0.5598, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6547619047619048, |
|
"grad_norm": 2.2701188124758, |
|
"learning_rate": 1.9767648201496052e-06, |
|
"loss": 0.5569, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 2.865482452040676, |
|
"learning_rate": 1.961223330122206e-06, |
|
"loss": 0.5477, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7738095238095238, |
|
"grad_norm": 2.7451064629160493, |
|
"learning_rate": 1.941851624664209e-06, |
|
"loss": 0.5459, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 2.0988347378452, |
|
"learning_rate": 1.9187540279759314e-06, |
|
"loss": 0.5383, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.8928571428571429, |
|
"grad_norm": 1.7836154166022118, |
|
"learning_rate": 1.8920549296372686e-06, |
|
"loss": 0.5337, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9523809523809523, |
|
"grad_norm": 1.0480422517342471, |
|
"learning_rate": 1.861898114721218e-06, |
|
"loss": 0.528, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.0656253919005394, |
|
"eval_runtime": 116.504, |
|
"eval_samples_per_second": 155.385, |
|
"eval_steps_per_second": 0.609, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0119047619047619, |
|
"grad_norm": 1.8121694809796869, |
|
"learning_rate": 1.8284459894551025e-06, |
|
"loss": 0.5223, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.0714285714285714, |
|
"grad_norm": 1.420187622758248, |
|
"learning_rate": 1.7918787065996015e-06, |
|
"loss": 0.5032, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.130952380952381, |
|
"grad_norm": 1.5424317018915787, |
|
"learning_rate": 1.7523931952557666e-06, |
|
"loss": 0.5004, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.1904761904761905, |
|
"grad_norm": 1.850588186757041, |
|
"learning_rate": 1.7102021003248955e-06, |
|
"loss": 0.498, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.4232612836192142, |
|
"learning_rate": 1.6655326373326793e-06, |
|
"loss": 0.4987, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3095238095238095, |
|
"grad_norm": 1.230835146194867, |
|
"learning_rate": 1.6186253687848507e-06, |
|
"loss": 0.493, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.369047619047619, |
|
"grad_norm": 1.2193965194507899, |
|
"learning_rate": 1.569732908644127e-06, |
|
"loss": 0.4923, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 1.5201451501554284, |
|
"learning_rate": 1.5191185619053519e-06, |
|
"loss": 0.4902, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.4880952380952381, |
|
"grad_norm": 1.2106618553239001, |
|
"learning_rate": 1.4670549065952552e-06, |
|
"loss": 0.4863, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.5476190476190477, |
|
"grad_norm": 1.073369144919772, |
|
"learning_rate": 1.4138223258333096e-06, |
|
"loss": 0.4845, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6071428571428572, |
|
"grad_norm": 1.3569659774066425, |
|
"learning_rate": 1.3597074978591206e-06, |
|
"loss": 0.4823, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 1.1172719170259655, |
|
"learning_rate": 1.3050018521581279e-06, |
|
"loss": 0.4826, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.7261904761904763, |
|
"grad_norm": 1.057235946491605, |
|
"learning_rate": 1.2499999999999999e-06, |
|
"loss": 0.4817, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.7857142857142856, |
|
"grad_norm": 1.1395484188056737, |
|
"learning_rate": 1.1949981478418721e-06, |
|
"loss": 0.4763, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.8452380952380953, |
|
"grad_norm": 0.9673060331448388, |
|
"learning_rate": 1.1402925021408796e-06, |
|
"loss": 0.4751, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.9047619047619047, |
|
"grad_norm": 1.2716355726282513, |
|
"learning_rate": 1.0861776741666901e-06, |
|
"loss": 0.4743, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.9642857142857144, |
|
"grad_norm": 0.8670777947729602, |
|
"learning_rate": 1.032945093404745e-06, |
|
"loss": 0.4738, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.061109066009521484, |
|
"eval_runtime": 116.1265, |
|
"eval_samples_per_second": 155.89, |
|
"eval_steps_per_second": 0.611, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.0238095238095237, |
|
"grad_norm": 1.1781799129689645, |
|
"learning_rate": 9.80881438094648e-07, |
|
"loss": 0.4626, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 1.196256029896588, |
|
"learning_rate": 9.302670913558731e-07, |
|
"loss": 0.4514, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.142857142857143, |
|
"grad_norm": 1.240120670934915, |
|
"learning_rate": 8.813746312151494e-07, |
|
"loss": 0.4466, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.2023809523809526, |
|
"grad_norm": 1.4515250964073299, |
|
"learning_rate": 8.344673626673205e-07, |
|
"loss": 0.4461, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.261904761904762, |
|
"grad_norm": 0.8373786848430113, |
|
"learning_rate": 7.897978996751046e-07, |
|
"loss": 0.4491, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.3214285714285716, |
|
"grad_norm": 1.0346083625080364, |
|
"learning_rate": 7.476068047442332e-07, |
|
"loss": 0.4443, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 0.9459213157979981, |
|
"learning_rate": 7.081212934003984e-07, |
|
"loss": 0.4435, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.4404761904761907, |
|
"grad_norm": 0.9958744135548997, |
|
"learning_rate": 6.715540105448972e-07, |
|
"loss": 0.4428, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.9392635640773219, |
|
"learning_rate": 6.381018852787821e-07, |
|
"loss": 0.4427, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.5595238095238093, |
|
"grad_norm": 0.7294108780936411, |
|
"learning_rate": 6.079450703627314e-07, |
|
"loss": 0.4443, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.619047619047619, |
|
"grad_norm": 0.8142282440596496, |
|
"learning_rate": 5.812459720240681e-07, |
|
"loss": 0.4448, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.678571428571429, |
|
"grad_norm": 0.8207822906148434, |
|
"learning_rate": 5.581483753357905e-07, |
|
"loss": 0.4425, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.738095238095238, |
|
"grad_norm": 0.9245896216419172, |
|
"learning_rate": 5.387766698777935e-07, |
|
"loss": 0.4438, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.7976190476190474, |
|
"grad_norm": 0.7453831990866285, |
|
"learning_rate": 5.232351798503945e-07, |
|
"loss": 0.4423, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.8498775475082037, |
|
"learning_rate": 5.116076022477671e-07, |
|
"loss": 0.4416, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.9166666666666665, |
|
"grad_norm": 0.8873720999371016, |
|
"learning_rate": 5.039565561169362e-07, |
|
"loss": 0.4385, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"grad_norm": 0.8671697549026851, |
|
"learning_rate": 5.003232453297512e-07, |
|
"loss": 0.4405, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.05991922318935394, |
|
"eval_runtime": 116.0997, |
|
"eval_samples_per_second": 155.926, |
|
"eval_steps_per_second": 0.612, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 504, |
|
"total_flos": 3376037568184320.0, |
|
"train_loss": 0.5144212938963421, |
|
"train_runtime": 17119.0234, |
|
"train_samples_per_second": 60.276, |
|
"train_steps_per_second": 0.029 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 504, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3376037568184320.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|