|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 8.010214409722222, |
|
"eval_steps": 1000, |
|
"global_step": 36000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0086102430555555, |
|
"grad_norm": 0.3148804008960724, |
|
"learning_rate": 0.0009998758966336297, |
|
"loss": 5.1098, |
|
"num_input_tokens_seen": 521485760, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0086102430555555, |
|
"eval_loss": 4.390458106994629, |
|
"eval_runtime": 157.9404, |
|
"eval_samples_per_second": 116.5, |
|
"eval_steps_per_second": 7.281, |
|
"num_input_tokens_seen": 521485760, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0169140625, |
|
"grad_norm": 0.24177567660808563, |
|
"learning_rate": 0.0009969004950996173, |
|
"loss": 3.7649, |
|
"num_input_tokens_seen": 1042810727, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.0169140625, |
|
"eval_loss": 4.018285274505615, |
|
"eval_runtime": 157.619, |
|
"eval_samples_per_second": 116.737, |
|
"eval_steps_per_second": 7.296, |
|
"num_input_tokens_seen": 1042810727, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.006030381944444, |
|
"grad_norm": 0.22102612257003784, |
|
"learning_rate": 0.0009899808525182935, |
|
"loss": 3.3244, |
|
"num_input_tokens_seen": 1564614407, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 4.006030381944444, |
|
"eval_loss": 3.883763074874878, |
|
"eval_runtime": 158.112, |
|
"eval_samples_per_second": 116.373, |
|
"eval_steps_per_second": 7.273, |
|
"num_input_tokens_seen": 1564614407, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.0143046875, |
|
"grad_norm": 0.25155559182167053, |
|
"learning_rate": 0.0009791718948528457, |
|
"loss": 3.2291, |
|
"num_input_tokens_seen": 2086545237, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 5.0143046875, |
|
"eval_loss": 3.7907443046569824, |
|
"eval_runtime": 157.3668, |
|
"eval_samples_per_second": 116.924, |
|
"eval_steps_per_second": 7.308, |
|
"num_input_tokens_seen": 2086545237, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 7.002637152777778, |
|
"grad_norm": 0.23016224801540375, |
|
"learning_rate": 0.0009645594202357438, |
|
"loss": 3.1779, |
|
"num_input_tokens_seen": 2608572303, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 7.002637152777778, |
|
"eval_loss": 3.7254457473754883, |
|
"eval_runtime": 158.0679, |
|
"eval_samples_per_second": 116.406, |
|
"eval_steps_per_second": 7.275, |
|
"num_input_tokens_seen": 2608572303, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 8.010796006944444, |
|
"grad_norm": 0.1911187618970871, |
|
"learning_rate": 0.0009462594179299406, |
|
"loss": 3.1106, |
|
"num_input_tokens_seen": 3130451615, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 8.010796006944444, |
|
"eval_loss": 3.6576924324035645, |
|
"eval_runtime": 156.4171, |
|
"eval_samples_per_second": 117.634, |
|
"eval_steps_per_second": 7.352, |
|
"num_input_tokens_seen": 3130451615, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 9.0191015625, |
|
"grad_norm": 0.24357692897319794, |
|
"learning_rate": 0.0009244171476423036, |
|
"loss": 3.0493, |
|
"num_input_tokens_seen": 3652271439, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 9.0191015625, |
|
"eval_loss": 3.6375908851623535, |
|
"eval_runtime": 157.211, |
|
"eval_samples_per_second": 117.04, |
|
"eval_steps_per_second": 7.315, |
|
"num_input_tokens_seen": 3652271439, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.0083489583333334, |
|
"grad_norm": 0.17816534638404846, |
|
"learning_rate": 0.0008992059864973972, |
|
"loss": 3.0005, |
|
"num_input_tokens_seen": 4174145711, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.0083489583333334, |
|
"eval_loss": 3.5855612754821777, |
|
"eval_runtime": 155.3476, |
|
"eval_samples_per_second": 118.444, |
|
"eval_steps_per_second": 7.403, |
|
"num_input_tokens_seen": 4174145711, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.016631076388889, |
|
"grad_norm": 0.1740865856409073, |
|
"learning_rate": 0.0008708260528239789, |
|
"loss": 3.0155, |
|
"num_input_tokens_seen": 4696186623, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.016631076388889, |
|
"eval_loss": 3.580111265182495, |
|
"eval_runtime": 154.9402, |
|
"eval_samples_per_second": 118.755, |
|
"eval_steps_per_second": 7.422, |
|
"num_input_tokens_seen": 4696186623, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.005178819444445, |
|
"grad_norm": 0.19117484986782074, |
|
"learning_rate": 0.0008395026176781626, |
|
"loss": 2.9609, |
|
"num_input_tokens_seen": 5217948966, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.005178819444445, |
|
"eval_loss": 3.5479941368103027, |
|
"eval_runtime": 154.297, |
|
"eval_samples_per_second": 119.251, |
|
"eval_steps_per_second": 7.453, |
|
"num_input_tokens_seen": 5217948966, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 5.013779513888889, |
|
"grad_norm": 0.16193148493766785, |
|
"learning_rate": 0.0008054843167120826, |
|
"loss": 2.9376, |
|
"num_input_tokens_seen": 5739503318, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 5.013779513888889, |
|
"eval_loss": 3.5245985984802246, |
|
"eval_runtime": 154.2866, |
|
"eval_samples_per_second": 119.259, |
|
"eval_steps_per_second": 7.454, |
|
"num_input_tokens_seen": 5739503318, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 7.002284722222222, |
|
"grad_norm": 0.16273680329322815, |
|
"learning_rate": 0.0007690411765816864, |
|
"loss": 2.9641, |
|
"num_input_tokens_seen": 6261603062, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 7.002284722222222, |
|
"eval_loss": 3.5051820278167725, |
|
"eval_runtime": 153.5212, |
|
"eval_samples_per_second": 119.853, |
|
"eval_steps_per_second": 7.491, |
|
"num_input_tokens_seen": 6261603062, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 8.010577256944444, |
|
"grad_norm": 0.1603420227766037, |
|
"learning_rate": 0.0007304624715594139, |
|
"loss": 2.9166, |
|
"num_input_tokens_seen": 6783593829, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 8.010577256944444, |
|
"eval_loss": 3.497760772705078, |
|
"eval_runtime": 153.4387, |
|
"eval_samples_per_second": 119.918, |
|
"eval_steps_per_second": 7.495, |
|
"num_input_tokens_seen": 6783593829, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 9.018918402777778, |
|
"grad_norm": 0.1665274053812027, |
|
"learning_rate": 0.0006900544273653075, |
|
"loss": 2.8821, |
|
"num_input_tokens_seen": 7305434352, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 9.018918402777778, |
|
"eval_loss": 3.4890758991241455, |
|
"eval_runtime": 153.9326, |
|
"eval_samples_per_second": 119.533, |
|
"eval_steps_per_second": 7.471, |
|
"num_input_tokens_seen": 7305434352, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.0083055555555556, |
|
"grad_norm": 0.16452226042747498, |
|
"learning_rate": 0.000648137790442817, |
|
"loss": 2.8766, |
|
"num_input_tokens_seen": 7827274720, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.0083055555555556, |
|
"eval_loss": 3.4698522090911865, |
|
"eval_runtime": 153.8232, |
|
"eval_samples_per_second": 119.618, |
|
"eval_steps_per_second": 7.476, |
|
"num_input_tokens_seen": 7827274720, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.0166302083333334, |
|
"grad_norm": 0.17168712615966797, |
|
"learning_rate": 0.0006050452819736389, |
|
"loss": 2.8809, |
|
"num_input_tokens_seen": 8348682560, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.0166302083333334, |
|
"eval_loss": 3.4530444145202637, |
|
"eval_runtime": 152.7846, |
|
"eval_samples_per_second": 120.431, |
|
"eval_steps_per_second": 7.527, |
|
"num_input_tokens_seen": 8348682560, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 4.005391493055556, |
|
"grad_norm": 0.16454088687896729, |
|
"learning_rate": 0.0005611189568408173, |
|
"loss": 2.885, |
|
"num_input_tokens_seen": 8870476792, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 4.005391493055556, |
|
"eval_loss": 3.442610025405884, |
|
"eval_runtime": 153.4312, |
|
"eval_samples_per_second": 119.923, |
|
"eval_steps_per_second": 7.495, |
|
"num_input_tokens_seen": 8870476792, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 5.013413194444444, |
|
"grad_norm": 0.1566428244113922, |
|
"learning_rate": 0.0005167074885038374, |
|
"loss": 2.8616, |
|
"num_input_tokens_seen": 9392395768, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 5.013413194444444, |
|
"eval_loss": 3.4214282035827637, |
|
"eval_runtime": 153.1872, |
|
"eval_samples_per_second": 120.115, |
|
"eval_steps_per_second": 7.507, |
|
"num_input_tokens_seen": 9392395768, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 7.001969618055556, |
|
"grad_norm": 0.16596154868602753, |
|
"learning_rate": 0.000472163401337526, |
|
"loss": 2.8546, |
|
"num_input_tokens_seen": 9914322356, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 7.001969618055556, |
|
"eval_loss": 3.399761438369751, |
|
"eval_runtime": 153.8483, |
|
"eval_samples_per_second": 119.598, |
|
"eval_steps_per_second": 7.475, |
|
"num_input_tokens_seen": 9914322356, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 8.010303819444445, |
|
"grad_norm": 0.16305798292160034, |
|
"learning_rate": 0.00042784027240358674, |
|
"loss": 2.8039, |
|
"num_input_tokens_seen": 10436233756, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 8.010303819444445, |
|
"eval_loss": 3.3940844535827637, |
|
"eval_runtime": 158.2602, |
|
"eval_samples_per_second": 116.264, |
|
"eval_steps_per_second": 7.267, |
|
"num_input_tokens_seen": 10436233756, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.0082630208333334, |
|
"grad_norm": 0.2344750463962555, |
|
"learning_rate": 0.00038408992486623584, |
|
"loss": 2.7959, |
|
"num_input_tokens_seen": 10958156105, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.0082630208333334, |
|
"eval_loss": 3.4070310592651367, |
|
"eval_runtime": 155.4587, |
|
"eval_samples_per_second": 118.359, |
|
"eval_steps_per_second": 7.397, |
|
"num_input_tokens_seen": 10958156105, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.0165078125, |
|
"grad_norm": 0.16524210572242737, |
|
"learning_rate": 0.0003412596353297288, |
|
"loss": 2.791, |
|
"num_input_tokens_seen": 11479508780, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.0165078125, |
|
"eval_loss": 3.3836262226104736, |
|
"eval_runtime": 155.5662, |
|
"eval_samples_per_second": 118.278, |
|
"eval_steps_per_second": 7.392, |
|
"num_input_tokens_seen": 11479508780, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 4.005475694444445, |
|
"grad_norm": 0.18112713098526, |
|
"learning_rate": 0.0002996893772650602, |
|
"loss": 2.7732, |
|
"num_input_tokens_seen": 12001335662, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 4.005475694444445, |
|
"eval_loss": 3.374772548675537, |
|
"eval_runtime": 158.5198, |
|
"eval_samples_per_second": 116.074, |
|
"eval_steps_per_second": 7.255, |
|
"num_input_tokens_seen": 12001335662, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 5.013543402777778, |
|
"grad_norm": 0.20000149309635162, |
|
"learning_rate": 0.0002597091224066581, |
|
"loss": 2.776, |
|
"num_input_tokens_seen": 12523295262, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 5.013543402777778, |
|
"eval_loss": 3.3714466094970703, |
|
"eval_runtime": 156.2603, |
|
"eval_samples_per_second": 117.752, |
|
"eval_steps_per_second": 7.36, |
|
"num_input_tokens_seen": 12523295262, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.0082881944444444, |
|
"grad_norm": 0.17906545102596283, |
|
"learning_rate": 0.0002216362215397393, |
|
"loss": 2.7496, |
|
"num_input_tokens_seen": 13045103310, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.0082881944444444, |
|
"eval_loss": 3.3571267127990723, |
|
"eval_runtime": 156.601, |
|
"eval_samples_per_second": 117.496, |
|
"eval_steps_per_second": 7.344, |
|
"num_input_tokens_seen": 13045103310, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.0165208333333333, |
|
"grad_norm": 0.17018218338489532, |
|
"learning_rate": 0.00018577288546882165, |
|
"loss": 2.7435, |
|
"num_input_tokens_seen": 13566410636, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.0165208333333333, |
|
"eval_loss": 3.3441579341888428, |
|
"eval_runtime": 157.974, |
|
"eval_samples_per_second": 116.475, |
|
"eval_steps_per_second": 7.28, |
|
"num_input_tokens_seen": 13566410636, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 4.005661458333333, |
|
"grad_norm": 0.16987864673137665, |
|
"learning_rate": 0.00015240378616267886, |
|
"loss": 2.7321, |
|
"num_input_tokens_seen": 14088234739, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 4.005661458333333, |
|
"eval_loss": 3.335869550704956, |
|
"eval_runtime": 156.5095, |
|
"eval_samples_per_second": 117.565, |
|
"eval_steps_per_second": 7.348, |
|
"num_input_tokens_seen": 14088234739, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 5.013948784722222, |
|
"grad_norm": 0.18387989699840546, |
|
"learning_rate": 0.00012179379711709738, |
|
"loss": 2.7137, |
|
"num_input_tokens_seen": 14609933027, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 5.013948784722222, |
|
"eval_loss": 3.323387861251831, |
|
"eval_runtime": 156.9047, |
|
"eval_samples_per_second": 117.269, |
|
"eval_steps_per_second": 7.329, |
|
"num_input_tokens_seen": 14609933027, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 7.002777777777778, |
|
"grad_norm": 0.17292186617851257, |
|
"learning_rate": 9.418589087173441e-05, |
|
"loss": 2.7152, |
|
"num_input_tokens_seen": 15131802301, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 7.002777777777778, |
|
"eval_loss": 3.318844795227051, |
|
"eval_runtime": 156.7029, |
|
"eval_samples_per_second": 117.42, |
|
"eval_steps_per_second": 7.339, |
|
"num_input_tokens_seen": 15131802301, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 8.0111015625, |
|
"grad_norm": 0.17619270086288452, |
|
"learning_rate": 6.979921036993042e-05, |
|
"loss": 2.7042, |
|
"num_input_tokens_seen": 15653732413, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 8.0111015625, |
|
"eval_loss": 3.3149850368499756, |
|
"eval_runtime": 157.8074, |
|
"eval_samples_per_second": 116.598, |
|
"eval_steps_per_second": 7.287, |
|
"num_input_tokens_seen": 15653732413, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.0080494791666668, |
|
"grad_norm": 0.20021264255046844, |
|
"learning_rate": 4.882732947041818e-05, |
|
"loss": 2.7114, |
|
"num_input_tokens_seen": 16175778125, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 1.0080494791666668, |
|
"eval_loss": 3.3138480186462402, |
|
"eval_runtime": 155.6732, |
|
"eval_samples_per_second": 118.196, |
|
"eval_steps_per_second": 7.387, |
|
"num_input_tokens_seen": 16175778125, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 2.0162526041666666, |
|
"grad_norm": 0.16703246533870697, |
|
"learning_rate": 3.143671641844831e-05, |
|
"loss": 2.6986, |
|
"num_input_tokens_seen": 16697712121, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.0162526041666666, |
|
"eval_loss": 3.3022472858428955, |
|
"eval_runtime": 156.1087, |
|
"eval_samples_per_second": 117.867, |
|
"eval_steps_per_second": 7.367, |
|
"num_input_tokens_seen": 16697712121, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 4.0051015625, |
|
"grad_norm": 0.16916561126708984, |
|
"learning_rate": 1.776541247281177e-05, |
|
"loss": 2.6798, |
|
"num_input_tokens_seen": 17219624329, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 4.0051015625, |
|
"eval_loss": 3.301964521408081, |
|
"eval_runtime": 154.9317, |
|
"eval_samples_per_second": 118.762, |
|
"eval_steps_per_second": 7.423, |
|
"num_input_tokens_seen": 17219624329, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 5.013414930555555, |
|
"grad_norm": 0.16186361014842987, |
|
"learning_rate": 7.921936177411049e-06, |
|
"loss": 2.6798, |
|
"num_input_tokens_seen": 17741407609, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 5.013414930555555, |
|
"eval_loss": 3.300182580947876, |
|
"eval_runtime": 155.5972, |
|
"eval_samples_per_second": 118.254, |
|
"eval_steps_per_second": 7.391, |
|
"num_input_tokens_seen": 17741407609, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 7.002003472222222, |
|
"grad_norm": 0.1644630879163742, |
|
"learning_rate": 1.984421974927375e-06, |
|
"loss": 2.7195, |
|
"num_input_tokens_seen": 18263254841, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 7.002003472222222, |
|
"eval_loss": 3.3019583225250244, |
|
"eval_runtime": 155.6463, |
|
"eval_samples_per_second": 118.217, |
|
"eval_steps_per_second": 7.389, |
|
"num_input_tokens_seen": 18263254841, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 8.010214409722222, |
|
"grad_norm": 0.1579432338476181, |
|
"learning_rate": 0.0, |
|
"loss": 2.683, |
|
"num_input_tokens_seen": 18785262825, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 8.010214409722222, |
|
"eval_loss": 3.3019728660583496, |
|
"eval_runtime": 155.7512, |
|
"eval_samples_per_second": 118.137, |
|
"eval_steps_per_second": 7.384, |
|
"num_input_tokens_seen": 18785262825, |
|
"step": 36000 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 36000, |
|
"num_input_tokens_seen": 18785262825, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 1000, |
|
"total_flos": 1.2950229157653494e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|