{ "best_metric": 2.6849253177642822, "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy_random-removal-num-adj-earlystop-bpe_seed-42_1e-3/checkpoint-44820", "epoch": 19.99972118440863, "eval_steps": 500, "global_step": 44820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.44610494618859087, "grad_norm": 0.5416804552078247, "learning_rate": 3.125e-05, "loss": 5.5826, "step": 1000 }, { "epoch": 0.8922098923771817, "grad_norm": 0.6205823421478271, "learning_rate": 6.25e-05, "loss": 4.0965, "step": 2000 }, { "epoch": 0.9997211844086321, "eval_accuracy": 0.36173254996280507, "eval_loss": 3.8076581954956055, "eval_runtime": 72.1326, "eval_samples_per_second": 841.547, "eval_steps_per_second": 13.156, "step": 2241 }, { "epoch": 1.3385936541571404, "grad_norm": 0.5506306290626526, "learning_rate": 9.375e-05, "loss": 3.6973, "step": 3000 }, { "epoch": 1.7846986003457315, "grad_norm": 0.5420992374420166, "learning_rate": 0.000125, "loss": 3.438, "step": 4000 }, { "epoch": 1.999721184408632, "eval_accuracy": 0.4103082413796779, "eval_loss": 3.2954118251800537, "eval_runtime": 71.9919, "eval_samples_per_second": 843.192, "eval_steps_per_second": 13.182, "step": 4482 }, { "epoch": 2.23108236212569, "grad_norm": 0.4806063771247864, "learning_rate": 0.00015625, "loss": 3.2452, "step": 5000 }, { "epoch": 2.6771873083142808, "grad_norm": 0.45570075511932373, "learning_rate": 0.0001875, "loss": 3.124, "step": 6000 }, { "epoch": 2.999721184408632, "eval_accuracy": 0.4310424299861783, "eval_loss": 3.0855491161346436, "eval_runtime": 71.529, "eval_samples_per_second": 848.648, "eval_steps_per_second": 13.267, "step": 6723 }, { "epoch": 3.12357107009424, "grad_norm": 0.4047807455062866, "learning_rate": 0.00021875, "loss": 3.0342, "step": 7000 }, { "epoch": 3.5696760162828305, "grad_norm": 0.3797657787799835, "learning_rate": 0.00025, "loss": 2.964, "step": 8000 }, { "epoch": 3.999721184408632, "eval_accuracy": 0.4414220571842397, "eval_loss": 2.9820733070373535, "eval_runtime": 70.9825, "eval_samples_per_second": 855.182, "eval_steps_per_second": 13.369, "step": 8964 }, { "epoch": 4.01605977806279, "grad_norm": 0.38084328174591064, "learning_rate": 0.00028125000000000003, "loss": 2.9201, "step": 9000 }, { "epoch": 4.46216472425138, "grad_norm": 0.34799855947494507, "learning_rate": 0.0003125, "loss": 2.8626, "step": 10000 }, { "epoch": 4.908269670439971, "grad_norm": 0.3457787334918976, "learning_rate": 0.00034375, "loss": 2.8431, "step": 11000 }, { "epoch": 4.9997211844086324, "eval_accuracy": 0.4475276442389222, "eval_loss": 2.921679973602295, "eval_runtime": 70.5951, "eval_samples_per_second": 859.875, "eval_steps_per_second": 13.443, "step": 11205 }, { "epoch": 5.35465343221993, "grad_norm": 0.3201024830341339, "learning_rate": 0.000375, "loss": 2.7975, "step": 12000 }, { "epoch": 5.800758378408521, "grad_norm": 0.29664143919944763, "learning_rate": 0.00040625000000000004, "loss": 2.7861, "step": 13000 }, { "epoch": 5.9997211844086324, "eval_accuracy": 0.45178146378397166, "eval_loss": 2.882841110229492, "eval_runtime": 70.4869, "eval_samples_per_second": 861.195, "eval_steps_per_second": 13.463, "step": 13446 }, { "epoch": 6.24714214018848, "grad_norm": 0.30828335881233215, "learning_rate": 0.0004375, "loss": 2.7565, "step": 14000 }, { "epoch": 6.69324708637707, "grad_norm": 0.2784518599510193, "learning_rate": 0.00046871875, "loss": 2.7474, "step": 15000 }, { "epoch": 6.9997211844086324, "eval_accuracy": 0.4548471778214276, "eval_loss": 2.855525016784668, "eval_runtime": 70.476, "eval_samples_per_second": 861.328, "eval_steps_per_second": 13.466, "step": 15687 }, { "epoch": 7.1396308481570285, "grad_norm": 0.29578331112861633, "learning_rate": 0.00049996875, "loss": 2.7271, "step": 16000 }, { "epoch": 7.58573579434562, "grad_norm": 0.25437843799591064, "learning_rate": 0.00053121875, "loss": 2.7083, "step": 17000 }, { "epoch": 7.9997211844086324, "eval_accuracy": 0.45683654876378177, "eval_loss": 2.836312770843506, "eval_runtime": 70.4512, "eval_samples_per_second": 861.632, "eval_steps_per_second": 13.47, "step": 17928 }, { "epoch": 8.03211955612558, "grad_norm": 0.2703135907649994, "learning_rate": 0.0005624375, "loss": 2.7104, "step": 18000 }, { "epoch": 8.47822450231417, "grad_norm": 0.24806231260299683, "learning_rate": 0.0005936875, "loss": 2.6771, "step": 19000 }, { "epoch": 8.92432944850276, "grad_norm": 0.2299337089061737, "learning_rate": 0.00062490625, "loss": 2.6908, "step": 20000 }, { "epoch": 8.999721184408632, "eval_accuracy": 0.4584115589467588, "eval_loss": 2.8230695724487305, "eval_runtime": 70.6865, "eval_samples_per_second": 858.764, "eval_steps_per_second": 13.425, "step": 20169 }, { "epoch": 9.37071321028272, "grad_norm": 0.2225543111562729, "learning_rate": 0.0006561562500000001, "loss": 2.6592, "step": 21000 }, { "epoch": 9.81681815647131, "grad_norm": 0.21800974011421204, "learning_rate": 0.0006873749999999999, "loss": 2.6704, "step": 22000 }, { "epoch": 9.999721184408632, "eval_accuracy": 0.4594804727485446, "eval_loss": 2.8132576942443848, "eval_runtime": 70.4993, "eval_samples_per_second": 861.044, "eval_steps_per_second": 13.461, "step": 22410 }, { "epoch": 10.26320191825127, "grad_norm": 0.2179899364709854, "learning_rate": 0.000718625, "loss": 2.6507, "step": 23000 }, { "epoch": 10.70930686443986, "grad_norm": 0.21323835849761963, "learning_rate": 0.000749875, "loss": 2.6536, "step": 24000 }, { "epoch": 10.999721184408632, "eval_accuracy": 0.46032088732895265, "eval_loss": 2.8056821823120117, "eval_runtime": 70.7016, "eval_samples_per_second": 858.58, "eval_steps_per_second": 13.423, "step": 24651 }, { "epoch": 11.155690626219819, "grad_norm": 0.20048922300338745, "learning_rate": 0.00078109375, "loss": 2.6449, "step": 25000 }, { "epoch": 11.60179557240841, "grad_norm": 0.1998824179172516, "learning_rate": 0.00081234375, "loss": 2.6385, "step": 26000 }, { "epoch": 11.999721184408632, "eval_accuracy": 0.46126305092651365, "eval_loss": 2.7991325855255127, "eval_runtime": 70.2522, "eval_samples_per_second": 864.072, "eval_steps_per_second": 13.508, "step": 26892 }, { "epoch": 12.048179334188367, "grad_norm": 0.2191251516342163, "learning_rate": 0.0008435625, "loss": 2.6464, "step": 27000 }, { "epoch": 12.49428428037696, "grad_norm": 0.1840791553258896, "learning_rate": 0.0008748125, "loss": 2.6216, "step": 28000 }, { "epoch": 12.94038922656555, "grad_norm": 0.17640572786331177, "learning_rate": 0.0009060312499999999, "loss": 2.6417, "step": 29000 }, { "epoch": 12.999721184408632, "eval_accuracy": 0.46212052058027303, "eval_loss": 2.7933452129364014, "eval_runtime": 70.3959, "eval_samples_per_second": 862.309, "eval_steps_per_second": 13.481, "step": 29133 }, { "epoch": 13.386772988345509, "grad_norm": 0.18327829241752625, "learning_rate": 0.00093728125, "loss": 2.6113, "step": 30000 }, { "epoch": 13.8328779345341, "grad_norm": 0.18839265406131744, "learning_rate": 0.00096853125, "loss": 2.6348, "step": 31000 }, { "epoch": 13.999721184408632, "eval_accuracy": 0.46265898283930146, "eval_loss": 2.788285970687866, "eval_runtime": 70.5003, "eval_samples_per_second": 861.032, "eval_steps_per_second": 13.461, "step": 31374 }, { "epoch": 14.279261696314057, "grad_norm": 0.18754950165748596, "learning_rate": 0.00099975, "loss": 2.6136, "step": 32000 }, { "epoch": 14.72536664250265, "grad_norm": 0.17947959899902344, "learning_rate": 0.0009226209048361935, "loss": 2.6156, "step": 33000 }, { "epoch": 14.999721184408632, "eval_accuracy": 0.46483098519212634, "eval_loss": 2.7718214988708496, "eval_runtime": 70.2832, "eval_samples_per_second": 863.692, "eval_steps_per_second": 13.503, "step": 33615 }, { "epoch": 15.171750404282607, "grad_norm": 0.17530585825443268, "learning_rate": 0.0008446957878315133, "loss": 2.593, "step": 34000 }, { "epoch": 15.6178553504712, "grad_norm": 0.17448733747005463, "learning_rate": 0.0007666926677067083, "loss": 2.5708, "step": 35000 }, { "epoch": 15.999721184408632, "eval_accuracy": 0.46829445713346207, "eval_loss": 2.744858741760254, "eval_runtime": 71.3532, "eval_samples_per_second": 850.739, "eval_steps_per_second": 13.3, "step": 35856 }, { "epoch": 16.06423911225116, "grad_norm": 0.18279102444648743, "learning_rate": 0.0006886895475819032, "loss": 2.5623, "step": 36000 }, { "epoch": 16.51034405843975, "grad_norm": 0.1779509037733078, "learning_rate": 0.0006106864274570983, "loss": 2.5179, "step": 37000 }, { "epoch": 16.95644900462834, "grad_norm": 0.17570973932743073, "learning_rate": 0.0005327613104524182, "loss": 2.5219, "step": 38000 }, { "epoch": 16.99972118440863, "eval_accuracy": 0.4711115159537614, "eval_loss": 2.722611665725708, "eval_runtime": 70.6828, "eval_samples_per_second": 858.809, "eval_steps_per_second": 13.426, "step": 38097 }, { "epoch": 17.4028327664083, "grad_norm": 0.1890154480934143, "learning_rate": 0.00045475819032761314, "loss": 2.4645, "step": 39000 }, { "epoch": 17.84893771259689, "grad_norm": 0.18561357259750366, "learning_rate": 0.00037683307332293293, "loss": 2.4669, "step": 40000 }, { "epoch": 17.99972118440863, "eval_accuracy": 0.4742804648670334, "eval_loss": 2.701925277709961, "eval_runtime": 70.3229, "eval_samples_per_second": 863.203, "eval_steps_per_second": 13.495, "step": 40338 }, { "epoch": 18.29532147437685, "grad_norm": 0.1927938163280487, "learning_rate": 0.0002988299531981279, "loss": 2.4194, "step": 41000 }, { "epoch": 18.74142642056544, "grad_norm": 0.195767343044281, "learning_rate": 0.00022090483619344775, "loss": 2.4023, "step": 42000 }, { "epoch": 18.99972118440863, "eval_accuracy": 0.47688595033420517, "eval_loss": 2.689098834991455, "eval_runtime": 70.641, "eval_samples_per_second": 859.317, "eval_steps_per_second": 13.434, "step": 42579 }, { "epoch": 19.187810182345398, "grad_norm": 0.2046130895614624, "learning_rate": 0.00014290171606864274, "loss": 2.3731, "step": 43000 }, { "epoch": 19.63391512853399, "grad_norm": 0.2043648362159729, "learning_rate": 6.497659906396255e-05, "loss": 2.3317, "step": 44000 }, { "epoch": 19.99972118440863, "eval_accuracy": 0.4783791090856058, "eval_loss": 2.6849253177642822, "eval_runtime": 70.7459, "eval_samples_per_second": 858.042, "eval_steps_per_second": 13.414, "step": 44820 }, { "epoch": 19.99972118440863, "step": 44820, "total_flos": 1.499260234235904e+18, "train_loss": 2.807752750367791, "train_runtime": 29691.1283, "train_samples_per_second": 386.531, "train_steps_per_second": 1.51 } ], "logging_steps": 1000, "max_steps": 44820, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.499260234235904e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }