|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 3804, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.026288117770767613, |
|
"grad_norm": 0.21158406138420105, |
|
"learning_rate": 6.377383300460224e-07, |
|
"loss": 3.4637, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.052576235541535225, |
|
"grad_norm": 0.17568713426589966, |
|
"learning_rate": 1.2886259040105195e-06, |
|
"loss": 3.5087, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.07886435331230283, |
|
"grad_norm": 0.6034949421882629, |
|
"learning_rate": 1.946088099934254e-06, |
|
"loss": 3.4185, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.10515247108307045, |
|
"grad_norm": 0.34301501512527466, |
|
"learning_rate": 2.603550295857988e-06, |
|
"loss": 3.2854, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.13144058885383805, |
|
"grad_norm": 3.483755588531494, |
|
"learning_rate": 3.2610124917817228e-06, |
|
"loss": 3.1951, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.15772870662460567, |
|
"grad_norm": 0.43615999817848206, |
|
"learning_rate": 3.918474687705457e-06, |
|
"loss": 3.0889, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.18401682439537329, |
|
"grad_norm": 0.3760946989059448, |
|
"learning_rate": 4.575936883629192e-06, |
|
"loss": 2.8217, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.2103049421661409, |
|
"grad_norm": 0.2940825819969177, |
|
"learning_rate": 5.2333990795529265e-06, |
|
"loss": 2.5901, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.23659305993690852, |
|
"grad_norm": 0.38412269949913025, |
|
"learning_rate": 5.89086127547666e-06, |
|
"loss": 2.5224, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.2628811777076761, |
|
"grad_norm": 0.25396350026130676, |
|
"learning_rate": 6.548323471400395e-06, |
|
"loss": 2.4714, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2891692954784437, |
|
"grad_norm": 0.21601170301437378, |
|
"learning_rate": 7.20578566732413e-06, |
|
"loss": 2.457, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.31545741324921134, |
|
"grad_norm": 0.23218804597854614, |
|
"learning_rate": 7.863247863247863e-06, |
|
"loss": 2.4262, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.34174553101997895, |
|
"grad_norm": 0.24238590896129608, |
|
"learning_rate": 8.5207100591716e-06, |
|
"loss": 2.4206, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.36803364879074657, |
|
"grad_norm": 0.4151826500892639, |
|
"learning_rate": 9.178172255095332e-06, |
|
"loss": 2.3889, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3943217665615142, |
|
"grad_norm": 0.22028537094593048, |
|
"learning_rate": 9.835634451019067e-06, |
|
"loss": 2.377, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.4206098843322818, |
|
"grad_norm": 0.23977142572402954, |
|
"learning_rate": 1.0493096646942801e-05, |
|
"loss": 2.3533, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.4468980021030494, |
|
"grad_norm": 0.2686922252178192, |
|
"learning_rate": 1.1150558842866538e-05, |
|
"loss": 2.3217, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.47318611987381703, |
|
"grad_norm": 0.2237405776977539, |
|
"learning_rate": 1.1808021038790272e-05, |
|
"loss": 2.3034, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.49947423764458465, |
|
"grad_norm": 0.35920849442481995, |
|
"learning_rate": 1.2465483234714005e-05, |
|
"loss": 2.296, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.5257623554153522, |
|
"grad_norm": 0.32759296894073486, |
|
"learning_rate": 1.312294543063774e-05, |
|
"loss": 2.291, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5520504731861199, |
|
"grad_norm": 0.3541663587093353, |
|
"learning_rate": 1.3780407626561474e-05, |
|
"loss": 2.2739, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5783385909568874, |
|
"grad_norm": 0.3697475492954254, |
|
"learning_rate": 1.4437869822485209e-05, |
|
"loss": 2.2553, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.6046267087276551, |
|
"grad_norm": 0.27857789397239685, |
|
"learning_rate": 1.5095332018408943e-05, |
|
"loss": 2.2625, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.6309148264984227, |
|
"grad_norm": 0.22623978555202484, |
|
"learning_rate": 1.5752794214332678e-05, |
|
"loss": 2.2512, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.6572029442691903, |
|
"grad_norm": 0.3197992146015167, |
|
"learning_rate": 1.641025641025641e-05, |
|
"loss": 2.222, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6834910620399579, |
|
"grad_norm": 0.2565959692001343, |
|
"learning_rate": 1.7067718606180147e-05, |
|
"loss": 2.2295, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.7097791798107256, |
|
"grad_norm": 0.24300645291805267, |
|
"learning_rate": 1.772518080210388e-05, |
|
"loss": 2.217, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.7360672975814931, |
|
"grad_norm": 0.21543020009994507, |
|
"learning_rate": 1.8382642998027616e-05, |
|
"loss": 2.234, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.7623554153522608, |
|
"grad_norm": 0.27316907048225403, |
|
"learning_rate": 1.904010519395135e-05, |
|
"loss": 2.2144, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.7886435331230284, |
|
"grad_norm": 0.3314705789089203, |
|
"learning_rate": 1.9697567389875082e-05, |
|
"loss": 2.2068, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.814931650893796, |
|
"grad_norm": 0.2859819531440735, |
|
"learning_rate": 1.975319567905163e-05, |
|
"loss": 2.1974, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.8412197686645636, |
|
"grad_norm": 0.44442227482795715, |
|
"learning_rate": 1.8051220835680475e-05, |
|
"loss": 2.1802, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.8675078864353313, |
|
"grad_norm": 0.35564538836479187, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 2.1839, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.8937960042060988, |
|
"grad_norm": 0.28175088763237, |
|
"learning_rate": 1.1110865250488047e-05, |
|
"loss": 2.1736, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.9200841219768665, |
|
"grad_norm": 0.28838351368904114, |
|
"learning_rate": 7.0355686110209266e-06, |
|
"loss": 2.1717, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.9463722397476341, |
|
"grad_norm": 0.3840157687664032, |
|
"learning_rate": 3.4570596330386775e-06, |
|
"loss": 2.1674, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.9726603575184016, |
|
"grad_norm": 0.2805449366569519, |
|
"learning_rate": 9.750348092257368e-07, |
|
"loss": 2.1741, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.9989484752891693, |
|
"grad_norm": 0.3055085837841034, |
|
"learning_rate": 5.438766451707489e-09, |
|
"loss": 2.1764, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 3804, |
|
"total_flos": 5.5431121920956826e+17, |
|
"train_loss": 2.4680882914209215, |
|
"train_runtime": 1212.9814, |
|
"train_samples_per_second": 50.165, |
|
"train_steps_per_second": 3.136 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 3804, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.5431121920956826e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|