{
  "best_metric": 2.6849253177642822,
  "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_random-removal-num-adj-earlystop-bpe_seed-42_1e-3/checkpoint-44820",
  "epoch": 19.99972118440863,
  "eval_steps": 500,
  "global_step": 44820,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.44610494618859087,
      "grad_norm": 0.5416804552078247,
      "learning_rate": 3.125e-05,
      "loss": 5.5826,
      "step": 1000
    },
    {
      "epoch": 0.8922098923771817,
      "grad_norm": 0.6205823421478271,
      "learning_rate": 6.25e-05,
      "loss": 4.0965,
      "step": 2000
    },
    {
      "epoch": 0.9997211844086321,
      "eval_accuracy": 0.36173254996280507,
      "eval_loss": 3.8076581954956055,
      "eval_runtime": 72.1326,
      "eval_samples_per_second": 841.547,
      "eval_steps_per_second": 13.156,
      "step": 2241
    },
    {
      "epoch": 1.3385936541571404,
      "grad_norm": 0.5506306290626526,
      "learning_rate": 9.375e-05,
      "loss": 3.6973,
      "step": 3000
    },
    {
      "epoch": 1.7846986003457315,
      "grad_norm": 0.5420992374420166,
      "learning_rate": 0.000125,
      "loss": 3.438,
      "step": 4000
    },
    {
      "epoch": 1.999721184408632,
      "eval_accuracy": 0.4103082413796779,
      "eval_loss": 3.2954118251800537,
      "eval_runtime": 71.9919,
      "eval_samples_per_second": 843.192,
      "eval_steps_per_second": 13.182,
      "step": 4482
    },
    {
      "epoch": 2.23108236212569,
      "grad_norm": 0.4806063771247864,
      "learning_rate": 0.00015625,
      "loss": 3.2452,
      "step": 5000
    },
    {
      "epoch": 2.6771873083142808,
      "grad_norm": 0.45570075511932373,
      "learning_rate": 0.0001875,
      "loss": 3.124,
      "step": 6000
    },
    {
      "epoch": 2.999721184408632,
      "eval_accuracy": 0.4310424299861783,
      "eval_loss": 3.0855491161346436,
      "eval_runtime": 71.529,
      "eval_samples_per_second": 848.648,
      "eval_steps_per_second": 13.267,
      "step": 6723
    },
    {
      "epoch": 3.12357107009424,
      "grad_norm": 0.4047807455062866,
      "learning_rate": 0.00021875,
      "loss": 3.0342,
      "step": 7000
    },
    {
      "epoch": 3.5696760162828305,
      "grad_norm": 0.3797657787799835,
      "learning_rate": 0.00025,
      "loss": 2.964,
      "step": 8000
    },
    {
      "epoch": 3.999721184408632,
      "eval_accuracy": 0.4414220571842397,
      "eval_loss": 2.9820733070373535,
      "eval_runtime": 70.9825,
      "eval_samples_per_second": 855.182,
      "eval_steps_per_second": 13.369,
      "step": 8964
    },
    {
      "epoch": 4.01605977806279,
      "grad_norm": 0.38084328174591064,
      "learning_rate": 0.00028125000000000003,
      "loss": 2.9201,
      "step": 9000
    },
    {
      "epoch": 4.46216472425138,
      "grad_norm": 0.34799855947494507,
      "learning_rate": 0.0003125,
      "loss": 2.8626,
      "step": 10000
    },
    {
      "epoch": 4.908269670439971,
      "grad_norm": 0.3457787334918976,
      "learning_rate": 0.00034375,
      "loss": 2.8431,
      "step": 11000
    },
    {
      "epoch": 4.9997211844086324,
      "eval_accuracy": 0.4475276442389222,
      "eval_loss": 2.921679973602295,
      "eval_runtime": 70.5951,
      "eval_samples_per_second": 859.875,
      "eval_steps_per_second": 13.443,
      "step": 11205
    },
    {
      "epoch": 5.35465343221993,
      "grad_norm": 0.3201024830341339,
      "learning_rate": 0.000375,
      "loss": 2.7975,
      "step": 12000
    },
    {
      "epoch": 5.800758378408521,
      "grad_norm": 0.29664143919944763,
      "learning_rate": 0.00040625000000000004,
      "loss": 2.7861,
      "step": 13000
    },
    {
      "epoch": 5.9997211844086324,
      "eval_accuracy": 0.45178146378397166,
      "eval_loss": 2.882841110229492,
      "eval_runtime": 70.4869,
      "eval_samples_per_second": 861.195,
      "eval_steps_per_second": 13.463,
      "step": 13446
    },
    {
      "epoch": 6.24714214018848,
      "grad_norm": 0.30828335881233215,
      "learning_rate": 0.0004375,
      "loss": 2.7565,
      "step": 14000
    },
    {
      "epoch": 6.69324708637707,
      "grad_norm": 0.2784518599510193,
      "learning_rate": 0.00046871875,
      "loss": 2.7474,
      "step": 15000
    },
    {
      "epoch": 6.9997211844086324,
      "eval_accuracy": 0.4548471778214276,
      "eval_loss": 2.855525016784668,
      "eval_runtime": 70.476,
      "eval_samples_per_second": 861.328,
      "eval_steps_per_second": 13.466,
      "step": 15687
    },
    {
      "epoch": 7.1396308481570285,
      "grad_norm": 0.29578331112861633,
      "learning_rate": 0.00049996875,
      "loss": 2.7271,
      "step": 16000
    },
    {
      "epoch": 7.58573579434562,
      "grad_norm": 0.25437843799591064,
      "learning_rate": 0.00053121875,
      "loss": 2.7083,
      "step": 17000
    },
    {
      "epoch": 7.9997211844086324,
      "eval_accuracy": 0.45683654876378177,
      "eval_loss": 2.836312770843506,
      "eval_runtime": 70.4512,
      "eval_samples_per_second": 861.632,
      "eval_steps_per_second": 13.47,
      "step": 17928
    },
    {
      "epoch": 8.03211955612558,
      "grad_norm": 0.2703135907649994,
      "learning_rate": 0.0005624375,
      "loss": 2.7104,
      "step": 18000
    },
    {
      "epoch": 8.47822450231417,
      "grad_norm": 0.24806231260299683,
      "learning_rate": 0.0005936875,
      "loss": 2.6771,
      "step": 19000
    },
    {
      "epoch": 8.92432944850276,
      "grad_norm": 0.2299337089061737,
      "learning_rate": 0.00062490625,
      "loss": 2.6908,
      "step": 20000
    },
    {
      "epoch": 8.999721184408632,
      "eval_accuracy": 0.4584115589467588,
      "eval_loss": 2.8230695724487305,
      "eval_runtime": 70.6865,
      "eval_samples_per_second": 858.764,
      "eval_steps_per_second": 13.425,
      "step": 20169
    },
    {
      "epoch": 9.37071321028272,
      "grad_norm": 0.2225543111562729,
      "learning_rate": 0.0006561562500000001,
      "loss": 2.6592,
      "step": 21000
    },
    {
      "epoch": 9.81681815647131,
      "grad_norm": 0.21800974011421204,
      "learning_rate": 0.0006873749999999999,
      "loss": 2.6704,
      "step": 22000
    },
    {
      "epoch": 9.999721184408632,
      "eval_accuracy": 0.4594804727485446,
      "eval_loss": 2.8132576942443848,
      "eval_runtime": 70.4993,
      "eval_samples_per_second": 861.044,
      "eval_steps_per_second": 13.461,
      "step": 22410
    },
    {
      "epoch": 10.26320191825127,
      "grad_norm": 0.2179899364709854,
      "learning_rate": 0.000718625,
      "loss": 2.6507,
      "step": 23000
    },
    {
      "epoch": 10.70930686443986,
      "grad_norm": 0.21323835849761963,
      "learning_rate": 0.000749875,
      "loss": 2.6536,
      "step": 24000
    },
    {
      "epoch": 10.999721184408632,
      "eval_accuracy": 0.46032088732895265,
      "eval_loss": 2.8056821823120117,
      "eval_runtime": 70.7016,
      "eval_samples_per_second": 858.58,
      "eval_steps_per_second": 13.423,
      "step": 24651
    },
    {
      "epoch": 11.155690626219819,
      "grad_norm": 0.20048922300338745,
      "learning_rate": 0.00078109375,
      "loss": 2.6449,
      "step": 25000
    },
    {
      "epoch": 11.60179557240841,
      "grad_norm": 0.1998824179172516,
      "learning_rate": 0.00081234375,
      "loss": 2.6385,
      "step": 26000
    },
    {
      "epoch": 11.999721184408632,
      "eval_accuracy": 0.46126305092651365,
      "eval_loss": 2.7991325855255127,
      "eval_runtime": 70.2522,
      "eval_samples_per_second": 864.072,
      "eval_steps_per_second": 13.508,
      "step": 26892
    },
    {
      "epoch": 12.048179334188367,
      "grad_norm": 0.2191251516342163,
      "learning_rate": 0.0008435625,
      "loss": 2.6464,
      "step": 27000
    },
    {
      "epoch": 12.49428428037696,
      "grad_norm": 0.1840791553258896,
      "learning_rate": 0.0008748125,
      "loss": 2.6216,
      "step": 28000
    },
    {
      "epoch": 12.94038922656555,
      "grad_norm": 0.17640572786331177,
      "learning_rate": 0.0009060312499999999,
      "loss": 2.6417,
      "step": 29000
    },
    {
      "epoch": 12.999721184408632,
      "eval_accuracy": 0.46212052058027303,
      "eval_loss": 2.7933452129364014,
      "eval_runtime": 70.3959,
      "eval_samples_per_second": 862.309,
      "eval_steps_per_second": 13.481,
      "step": 29133
    },
    {
      "epoch": 13.386772988345509,
      "grad_norm": 0.18327829241752625,
      "learning_rate": 0.00093728125,
      "loss": 2.6113,
      "step": 30000
    },
    {
      "epoch": 13.8328779345341,
      "grad_norm": 0.18839265406131744,
      "learning_rate": 0.00096853125,
      "loss": 2.6348,
      "step": 31000
    },
    {
      "epoch": 13.999721184408632,
      "eval_accuracy": 0.46265898283930146,
      "eval_loss": 2.788285970687866,
      "eval_runtime": 70.5003,
      "eval_samples_per_second": 861.032,
      "eval_steps_per_second": 13.461,
      "step": 31374
    },
    {
      "epoch": 14.279261696314057,
      "grad_norm": 0.18754950165748596,
      "learning_rate": 0.00099975,
      "loss": 2.6136,
      "step": 32000
    },
    {
      "epoch": 14.72536664250265,
      "grad_norm": 0.17947959899902344,
      "learning_rate": 0.0009226209048361935,
      "loss": 2.6156,
      "step": 33000
    },
    {
      "epoch": 14.999721184408632,
      "eval_accuracy": 0.46483098519212634,
      "eval_loss": 2.7718214988708496,
      "eval_runtime": 70.2832,
      "eval_samples_per_second": 863.692,
      "eval_steps_per_second": 13.503,
      "step": 33615
    },
    {
      "epoch": 15.171750404282607,
      "grad_norm": 0.17530585825443268,
      "learning_rate": 0.0008446957878315133,
      "loss": 2.593,
      "step": 34000
    },
    {
      "epoch": 15.6178553504712,
      "grad_norm": 0.17448733747005463,
      "learning_rate": 0.0007666926677067083,
      "loss": 2.5708,
      "step": 35000
    },
    {
      "epoch": 15.999721184408632,
      "eval_accuracy": 0.46829445713346207,
      "eval_loss": 2.744858741760254,
      "eval_runtime": 71.3532,
      "eval_samples_per_second": 850.739,
      "eval_steps_per_second": 13.3,
      "step": 35856
    },
    {
      "epoch": 16.06423911225116,
      "grad_norm": 0.18279102444648743,
      "learning_rate": 0.0006886895475819032,
      "loss": 2.5623,
      "step": 36000
    },
    {
      "epoch": 16.51034405843975,
      "grad_norm": 0.1779509037733078,
      "learning_rate": 0.0006106864274570983,
      "loss": 2.5179,
      "step": 37000
    },
    {
      "epoch": 16.95644900462834,
      "grad_norm": 0.17570973932743073,
      "learning_rate": 0.0005327613104524182,
      "loss": 2.5219,
      "step": 38000
    },
    {
      "epoch": 16.99972118440863,
      "eval_accuracy": 0.4711115159537614,
      "eval_loss": 2.722611665725708,
      "eval_runtime": 70.6828,
      "eval_samples_per_second": 858.809,
      "eval_steps_per_second": 13.426,
      "step": 38097
    },
    {
      "epoch": 17.4028327664083,
      "grad_norm": 0.1890154480934143,
      "learning_rate": 0.00045475819032761314,
      "loss": 2.4645,
      "step": 39000
    },
    {
      "epoch": 17.84893771259689,
      "grad_norm": 0.18561357259750366,
      "learning_rate": 0.00037683307332293293,
      "loss": 2.4669,
      "step": 40000
    },
    {
      "epoch": 17.99972118440863,
      "eval_accuracy": 0.4742804648670334,
      "eval_loss": 2.701925277709961,
      "eval_runtime": 70.3229,
      "eval_samples_per_second": 863.203,
      "eval_steps_per_second": 13.495,
      "step": 40338
    },
    {
      "epoch": 18.29532147437685,
      "grad_norm": 0.1927938163280487,
      "learning_rate": 0.0002988299531981279,
      "loss": 2.4194,
      "step": 41000
    },
    {
      "epoch": 18.74142642056544,
      "grad_norm": 0.195767343044281,
      "learning_rate": 0.00022090483619344775,
      "loss": 2.4023,
      "step": 42000
    },
    {
      "epoch": 18.99972118440863,
      "eval_accuracy": 0.47688595033420517,
      "eval_loss": 2.689098834991455,
      "eval_runtime": 70.641,
      "eval_samples_per_second": 859.317,
      "eval_steps_per_second": 13.434,
      "step": 42579
    },
    {
      "epoch": 19.187810182345398,
      "grad_norm": 0.2046130895614624,
      "learning_rate": 0.00014290171606864274,
      "loss": 2.3731,
      "step": 43000
    },
    {
      "epoch": 19.63391512853399,
      "grad_norm": 0.2043648362159729,
      "learning_rate": 6.497659906396255e-05,
      "loss": 2.3317,
      "step": 44000
    },
    {
      "epoch": 19.99972118440863,
      "eval_accuracy": 0.4783791090856058,
      "eval_loss": 2.6849253177642822,
      "eval_runtime": 70.7459,
      "eval_samples_per_second": 858.042,
      "eval_steps_per_second": 13.414,
      "step": 44820
    },
    {
      "epoch": 19.99972118440863,
      "step": 44820,
      "total_flos": 1.499260234235904e+18,
      "train_loss": 2.807752750367791,
      "train_runtime": 29691.1283,
      "train_samples_per_second": 386.531,
      "train_steps_per_second": 1.51
    }
  ],
  "logging_steps": 1000,
  "max_steps": 44820,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.499260234235904e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}