{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.10230179028132992,
  "eval_steps": 9,
  "global_step": 90,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0011366865586814436,
      "eval_loss": 11.095451354980469,
      "eval_runtime": 5.6106,
      "eval_samples_per_second": 132.071,
      "eval_steps_per_second": 33.151,
      "step": 1
    },
    {
      "epoch": 0.0034100596760443308,
      "grad_norm": 2.3856959342956543,
      "learning_rate": 3e-05,
      "loss": 44.3673,
      "step": 3
    },
    {
      "epoch": 0.0068201193520886615,
      "grad_norm": 2.4655117988586426,
      "learning_rate": 6e-05,
      "loss": 44.3703,
      "step": 6
    },
    {
      "epoch": 0.010230179028132993,
      "grad_norm": 2.2212629318237305,
      "learning_rate": 9e-05,
      "loss": 44.3261,
      "step": 9
    },
    {
      "epoch": 0.010230179028132993,
      "eval_loss": 11.074569702148438,
      "eval_runtime": 4.7833,
      "eval_samples_per_second": 154.915,
      "eval_steps_per_second": 38.885,
      "step": 9
    },
    {
      "epoch": 0.013640238704177323,
      "grad_norm": 2.743508815765381,
      "learning_rate": 9.987820251299122e-05,
      "loss": 44.2612,
      "step": 12
    },
    {
      "epoch": 0.017050298380221655,
      "grad_norm": 1.8524738550186157,
      "learning_rate": 9.924038765061042e-05,
      "loss": 44.2154,
      "step": 15
    },
    {
      "epoch": 0.020460358056265986,
      "grad_norm": 1.639198660850525,
      "learning_rate": 9.806308479691595e-05,
      "loss": 44.167,
      "step": 18
    },
    {
      "epoch": 0.020460358056265986,
      "eval_loss": 11.030960083007812,
      "eval_runtime": 4.7631,
      "eval_samples_per_second": 155.569,
      "eval_steps_per_second": 39.05,
      "step": 18
    },
    {
      "epoch": 0.023870417732310314,
      "grad_norm": 1.5570883750915527,
      "learning_rate": 9.635919272833938e-05,
      "loss": 44.1426,
      "step": 21
    },
    {
      "epoch": 0.027280477408354646,
      "grad_norm": 1.5581414699554443,
      "learning_rate": 9.414737964294636e-05,
      "loss": 44.0753,
      "step": 24
    },
    {
      "epoch": 0.030690537084398978,
      "grad_norm": 1.6539247035980225,
      "learning_rate": 9.145187862775209e-05,
      "loss": 44.0163,
      "step": 27
    },
    {
      "epoch": 0.030690537084398978,
      "eval_loss": 10.998085021972656,
      "eval_runtime": 4.7738,
      "eval_samples_per_second": 155.223,
      "eval_steps_per_second": 38.963,
      "step": 27
    },
    {
      "epoch": 0.03410059676044331,
      "grad_norm": 1.4229531288146973,
      "learning_rate": 8.83022221559489e-05,
      "loss": 44.0237,
      "step": 30
    },
    {
      "epoch": 0.03751065643648764,
      "grad_norm": 1.3927640914916992,
      "learning_rate": 8.473291852294987e-05,
      "loss": 43.9566,
      "step": 33
    },
    {
      "epoch": 0.04092071611253197,
      "grad_norm": 1.352387547492981,
      "learning_rate": 8.07830737662829e-05,
      "loss": 43.9265,
      "step": 36
    },
    {
      "epoch": 0.04092071611253197,
      "eval_loss": 10.977602005004883,
      "eval_runtime": 4.7507,
      "eval_samples_per_second": 155.978,
      "eval_steps_per_second": 39.152,
      "step": 36
    },
    {
      "epoch": 0.0443307757885763,
      "grad_norm": 1.2622820138931274,
      "learning_rate": 7.649596321166024e-05,
      "loss": 43.855,
      "step": 39
    },
    {
      "epoch": 0.04774083546462063,
      "grad_norm": 1.1330219507217407,
      "learning_rate": 7.191855733945387e-05,
      "loss": 43.8811,
      "step": 42
    },
    {
      "epoch": 0.05115089514066496,
      "grad_norm": 1.1096593141555786,
      "learning_rate": 6.710100716628344e-05,
      "loss": 43.8743,
      "step": 45
    },
    {
      "epoch": 0.05115089514066496,
      "eval_loss": 10.961272239685059,
      "eval_runtime": 4.7683,
      "eval_samples_per_second": 155.4,
      "eval_steps_per_second": 39.007,
      "step": 45
    },
    {
      "epoch": 0.05456095481670929,
      "grad_norm": 1.0063976049423218,
      "learning_rate": 6.209609477998338e-05,
      "loss": 43.9744,
      "step": 48
    },
    {
      "epoch": 0.057971014492753624,
      "grad_norm": 1.0805094242095947,
      "learning_rate": 5.695865504800327e-05,
      "loss": 43.7906,
      "step": 51
    },
    {
      "epoch": 0.061381074168797956,
      "grad_norm": 1.041139841079712,
      "learning_rate": 5.174497483512506e-05,
      "loss": 43.8027,
      "step": 54
    },
    {
      "epoch": 0.061381074168797956,
      "eval_loss": 10.949912071228027,
      "eval_runtime": 4.7641,
      "eval_samples_per_second": 155.537,
      "eval_steps_per_second": 39.042,
      "step": 54
    },
    {
      "epoch": 0.06479113384484228,
      "grad_norm": 1.2141445875167847,
      "learning_rate": 4.6512176312793736e-05,
      "loss": 43.8766,
      "step": 57
    },
    {
      "epoch": 0.06820119352088662,
      "grad_norm": 1.179465413093567,
      "learning_rate": 4.131759111665349e-05,
      "loss": 43.7685,
      "step": 60
    },
    {
      "epoch": 0.07161125319693094,
      "grad_norm": 0.8835251331329346,
      "learning_rate": 3.6218132209150045e-05,
      "loss": 43.8831,
      "step": 63
    },
    {
      "epoch": 0.07161125319693094,
      "eval_loss": 10.942551612854004,
      "eval_runtime": 4.747,
      "eval_samples_per_second": 156.099,
      "eval_steps_per_second": 39.183,
      "step": 63
    },
    {
      "epoch": 0.07502131287297528,
      "grad_norm": 1.0751994848251343,
      "learning_rate": 3.12696703292044e-05,
      "loss": 43.777,
      "step": 66
    },
    {
      "epoch": 0.0784313725490196,
      "grad_norm": 1.384466528892517,
      "learning_rate": 2.6526421860705473e-05,
      "loss": 43.6297,
      "step": 69
    },
    {
      "epoch": 0.08184143222506395,
      "grad_norm": 1.247775912284851,
      "learning_rate": 2.2040354826462668e-05,
      "loss": 43.7553,
      "step": 72
    },
    {
      "epoch": 0.08184143222506395,
      "eval_loss": 10.936705589294434,
      "eval_runtime": 4.7575,
      "eval_samples_per_second": 155.753,
      "eval_steps_per_second": 39.096,
      "step": 72
    },
    {
      "epoch": 0.08525149190110827,
      "grad_norm": 1.064352035522461,
      "learning_rate": 1.7860619515673033e-05,
      "loss": 43.7996,
      "step": 75
    },
    {
      "epoch": 0.0886615515771526,
      "grad_norm": 0.9701411128044128,
      "learning_rate": 1.4033009983067452e-05,
      "loss": 43.8023,
      "step": 78
    },
    {
      "epoch": 0.09207161125319693,
      "grad_norm": 1.058826208114624,
      "learning_rate": 1.0599462319663905e-05,
      "loss": 43.8552,
      "step": 81
    },
    {
      "epoch": 0.09207161125319693,
      "eval_loss": 10.935081481933594,
      "eval_runtime": 4.232,
      "eval_samples_per_second": 175.096,
      "eval_steps_per_second": 43.951,
      "step": 81
    },
    {
      "epoch": 0.09548167092924126,
      "grad_norm": 1.081790566444397,
      "learning_rate": 7.597595192178702e-06,
      "loss": 43.735,
      "step": 84
    },
    {
      "epoch": 0.0988917306052856,
      "grad_norm": 0.8974946737289429,
      "learning_rate": 5.060297685041659e-06,
      "loss": 43.7617,
      "step": 87
    },
    {
      "epoch": 0.10230179028132992,
      "grad_norm": 1.1310738325119019,
      "learning_rate": 3.0153689607045845e-06,
      "loss": 43.7058,
      "step": 90
    },
    {
      "epoch": 0.10230179028132992,
      "eval_loss": 10.934582710266113,
      "eval_runtime": 4.682,
      "eval_samples_per_second": 158.267,
      "eval_steps_per_second": 39.727,
      "step": 90
    }
  ],
  "logging_steps": 3,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 9,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 798370037760.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}