{ "best_metric": null, "best_model_checkpoint": null, "epoch": 11.91962675448914, "global_step": 76000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "learning_rate": 8.712319219376199e-06, "loss": 3.9787, "step": 1000 }, { "epoch": 0.16, "eval_loss": 3.446138381958008, "eval_runtime": 172.8011, "eval_samples_per_second": 65.578, "eval_steps_per_second": 16.395, "step": 1000 }, { "epoch": 0.31, "learning_rate": 1.7424638438752397e-05, "loss": 3.6486, "step": 2000 }, { "epoch": 0.31, "eval_loss": 3.351425886154175, "eval_runtime": 164.1827, "eval_samples_per_second": 69.021, "eval_steps_per_second": 17.255, "step": 2000 }, { "epoch": 0.47, "learning_rate": 2.6136957658128598e-05, "loss": 3.5509, "step": 3000 }, { "epoch": 0.47, "eval_loss": 3.2961513996124268, "eval_runtime": 164.8921, "eval_samples_per_second": 68.724, "eval_steps_per_second": 17.181, "step": 3000 }, { "epoch": 0.63, "learning_rate": 3.4849276877504794e-05, "loss": 3.488, "step": 4000 }, { "epoch": 0.63, "eval_loss": 3.2645070552825928, "eval_runtime": 175.5668, "eval_samples_per_second": 64.545, "eval_steps_per_second": 16.136, "step": 4000 }, { "epoch": 0.78, "learning_rate": 4.356159609688099e-05, "loss": 3.4485, "step": 5000 }, { "epoch": 0.78, "eval_loss": 3.236461877822876, "eval_runtime": 172.1625, "eval_samples_per_second": 65.822, "eval_steps_per_second": 16.455, "step": 5000 }, { "epoch": 0.94, "learning_rate": 4.985484032435679e-05, "loss": 3.4183, "step": 6000 }, { "epoch": 0.94, "eval_loss": 3.213158130645752, "eval_runtime": 165.7563, "eval_samples_per_second": 68.365, "eval_steps_per_second": 17.091, "step": 6000 }, { "epoch": 1.1, "learning_rate": 4.9298672984727646e-05, "loss": 3.3437, "step": 7000 }, { "epoch": 1.1, "eval_loss": 3.199575185775757, "eval_runtime": 155.1508, "eval_samples_per_second": 73.039, "eval_steps_per_second": 18.26, "step": 7000 }, { "epoch": 1.25, "learning_rate": 4.87425056450985e-05, "loss": 3.2964, "step": 8000 }, { "epoch": 1.25, "eval_loss": 3.1807236671447754, "eval_runtime": 156.1445, "eval_samples_per_second": 72.574, "eval_steps_per_second": 18.143, "step": 8000 }, { "epoch": 1.41, "learning_rate": 4.8186338305469355e-05, "loss": 3.2813, "step": 9000 }, { "epoch": 1.41, "eval_loss": 3.1617496013641357, "eval_runtime": 169.4771, "eval_samples_per_second": 66.864, "eval_steps_per_second": 16.716, "step": 9000 }, { "epoch": 1.57, "learning_rate": 4.76301709658402e-05, "loss": 3.2586, "step": 10000 }, { "epoch": 1.57, "eval_loss": 3.14561128616333, "eval_runtime": 169.05, "eval_samples_per_second": 67.033, "eval_steps_per_second": 16.758, "step": 10000 }, { "epoch": 1.73, "learning_rate": 4.7074003626211057e-05, "loss": 3.248, "step": 11000 }, { "epoch": 1.73, "eval_loss": 3.1361846923828125, "eval_runtime": 165.9904, "eval_samples_per_second": 68.269, "eval_steps_per_second": 17.067, "step": 11000 }, { "epoch": 1.88, "learning_rate": 4.651783628658191e-05, "loss": 3.2413, "step": 12000 }, { "epoch": 1.88, "eval_loss": 3.1259961128234863, "eval_runtime": 167.4544, "eval_samples_per_second": 67.672, "eval_steps_per_second": 16.918, "step": 12000 }, { "epoch": 2.04, "learning_rate": 4.5961668946952765e-05, "loss": 3.1974, "step": 13000 }, { "epoch": 2.04, "eval_loss": 3.121722459793091, "eval_runtime": 167.3385, "eval_samples_per_second": 67.719, "eval_steps_per_second": 16.93, "step": 13000 }, { "epoch": 2.2, "learning_rate": 4.540550160732361e-05, "loss": 3.1014, "step": 14000 }, { "epoch": 2.2, "eval_loss": 3.1170709133148193, "eval_runtime": 166.2081, "eval_samples_per_second": 68.18, "eval_steps_per_second": 17.045, "step": 14000 }, { "epoch": 2.35, "learning_rate": 4.484933426769447e-05, "loss": 3.1037, "step": 15000 }, { "epoch": 2.35, "eval_loss": 3.1060075759887695, "eval_runtime": 163.3667, "eval_samples_per_second": 69.365, "eval_steps_per_second": 17.341, "step": 15000 }, { "epoch": 2.51, "learning_rate": 4.4293166928065315e-05, "loss": 3.1047, "step": 16000 }, { "epoch": 2.51, "eval_loss": 3.1029844284057617, "eval_runtime": 166.2417, "eval_samples_per_second": 68.166, "eval_steps_per_second": 17.041, "step": 16000 }, { "epoch": 2.67, "learning_rate": 4.3736999588436176e-05, "loss": 3.1004, "step": 17000 }, { "epoch": 2.67, "eval_loss": 3.0984201431274414, "eval_runtime": 156.2961, "eval_samples_per_second": 72.503, "eval_steps_per_second": 18.126, "step": 17000 }, { "epoch": 2.82, "learning_rate": 4.3180832248807023e-05, "loss": 3.0932, "step": 18000 }, { "epoch": 2.82, "eval_loss": 3.0867185592651367, "eval_runtime": 163.2041, "eval_samples_per_second": 69.435, "eval_steps_per_second": 17.359, "step": 18000 }, { "epoch": 2.98, "learning_rate": 4.262466490917788e-05, "loss": 3.0966, "step": 19000 }, { "epoch": 2.98, "eval_loss": 3.0811350345611572, "eval_runtime": 165.6977, "eval_samples_per_second": 68.39, "eval_steps_per_second": 17.097, "step": 19000 }, { "epoch": 3.14, "learning_rate": 4.2068497569548725e-05, "loss": 2.9921, "step": 20000 }, { "epoch": 3.14, "eval_loss": 3.089102029800415, "eval_runtime": 158.578, "eval_samples_per_second": 71.46, "eval_steps_per_second": 17.865, "step": 20000 }, { "epoch": 3.29, "learning_rate": 4.151233022991958e-05, "loss": 2.9753, "step": 21000 }, { "epoch": 3.29, "eval_loss": 3.0833535194396973, "eval_runtime": 156.4016, "eval_samples_per_second": 72.454, "eval_steps_per_second": 18.114, "step": 21000 }, { "epoch": 3.45, "learning_rate": 4.0956162890290434e-05, "loss": 2.9872, "step": 22000 }, { "epoch": 3.45, "eval_loss": 3.0813159942626953, "eval_runtime": 154.7129, "eval_samples_per_second": 73.245, "eval_steps_per_second": 18.311, "step": 22000 }, { "epoch": 3.61, "learning_rate": 4.039999555066128e-05, "loss": 2.9916, "step": 23000 }, { "epoch": 3.61, "eval_loss": 3.069411516189575, "eval_runtime": 157.7992, "eval_samples_per_second": 71.813, "eval_steps_per_second": 17.953, "step": 23000 }, { "epoch": 3.76, "learning_rate": 3.9843828211032136e-05, "loss": 2.9885, "step": 24000 }, { "epoch": 3.76, "eval_loss": 3.0638270378112793, "eval_runtime": 155.9919, "eval_samples_per_second": 72.645, "eval_steps_per_second": 18.161, "step": 24000 }, { "epoch": 3.92, "learning_rate": 3.928766087140299e-05, "loss": 2.9925, "step": 25000 }, { "epoch": 3.92, "eval_loss": 3.0645134449005127, "eval_runtime": 154.9825, "eval_samples_per_second": 73.118, "eval_steps_per_second": 18.279, "step": 25000 }, { "epoch": 4.08, "learning_rate": 3.8731493531773845e-05, "loss": 2.9338, "step": 26000 }, { "epoch": 4.08, "eval_loss": 3.072267770767212, "eval_runtime": 160.362, "eval_samples_per_second": 70.665, "eval_steps_per_second": 17.666, "step": 26000 }, { "epoch": 4.23, "learning_rate": 3.817532619214469e-05, "loss": 2.8808, "step": 27000 }, { "epoch": 4.23, "eval_loss": 3.0753729343414307, "eval_runtime": 154.306, "eval_samples_per_second": 73.439, "eval_steps_per_second": 18.36, "step": 27000 }, { "epoch": 4.39, "learning_rate": 3.7619158852515547e-05, "loss": 2.8947, "step": 28000 }, { "epoch": 4.39, "eval_loss": 3.0727920532226562, "eval_runtime": 155.5666, "eval_samples_per_second": 72.843, "eval_steps_per_second": 18.211, "step": 28000 }, { "epoch": 4.55, "learning_rate": 3.7062991512886394e-05, "loss": 2.8972, "step": 29000 }, { "epoch": 4.55, "eval_loss": 3.0629560947418213, "eval_runtime": 155.1806, "eval_samples_per_second": 73.025, "eval_steps_per_second": 18.256, "step": 29000 }, { "epoch": 4.71, "learning_rate": 3.6506824173257255e-05, "loss": 2.8965, "step": 30000 }, { "epoch": 4.71, "eval_loss": 3.063514471054077, "eval_runtime": 154.5456, "eval_samples_per_second": 73.325, "eval_steps_per_second": 18.331, "step": 30000 }, { "epoch": 4.86, "learning_rate": 3.59506568336281e-05, "loss": 2.9067, "step": 31000 }, { "epoch": 4.86, "eval_loss": 3.055640697479248, "eval_runtime": 154.9311, "eval_samples_per_second": 73.142, "eval_steps_per_second": 18.286, "step": 31000 }, { "epoch": 5.02, "learning_rate": 3.539448949399896e-05, "loss": 2.8863, "step": 32000 }, { "epoch": 5.02, "eval_loss": 3.0722622871398926, "eval_runtime": 154.3548, "eval_samples_per_second": 73.415, "eval_steps_per_second": 18.354, "step": 32000 }, { "epoch": 5.18, "learning_rate": 3.4838322154369805e-05, "loss": 2.8005, "step": 33000 }, { "epoch": 5.18, "eval_loss": 3.072225332260132, "eval_runtime": 168.3304, "eval_samples_per_second": 67.32, "eval_steps_per_second": 16.83, "step": 33000 }, { "epoch": 5.33, "learning_rate": 3.4282154814740666e-05, "loss": 2.8136, "step": 34000 }, { "epoch": 5.33, "eval_loss": 3.067979097366333, "eval_runtime": 164.2522, "eval_samples_per_second": 68.991, "eval_steps_per_second": 17.248, "step": 34000 }, { "epoch": 5.49, "learning_rate": 3.3725987475111513e-05, "loss": 2.8092, "step": 35000 }, { "epoch": 5.49, "eval_loss": 3.0639047622680664, "eval_runtime": 167.1507, "eval_samples_per_second": 67.795, "eval_steps_per_second": 16.949, "step": 35000 }, { "epoch": 5.65, "learning_rate": 3.316982013548237e-05, "loss": 2.8186, "step": 36000 }, { "epoch": 5.65, "eval_loss": 3.0562849044799805, "eval_runtime": 158.9079, "eval_samples_per_second": 71.312, "eval_steps_per_second": 17.828, "step": 36000 }, { "epoch": 5.8, "learning_rate": 3.2613652795853215e-05, "loss": 2.8306, "step": 37000 }, { "epoch": 5.8, "eval_loss": 3.0535264015197754, "eval_runtime": 150.4043, "eval_samples_per_second": 75.344, "eval_steps_per_second": 18.836, "step": 37000 }, { "epoch": 5.96, "learning_rate": 3.205748545622407e-05, "loss": 2.8327, "step": 38000 }, { "epoch": 5.96, "eval_loss": 3.0540544986724854, "eval_runtime": 159.9431, "eval_samples_per_second": 70.85, "eval_steps_per_second": 17.713, "step": 38000 }, { "epoch": 6.12, "learning_rate": 3.1501318116594924e-05, "loss": 2.7548, "step": 39000 }, { "epoch": 6.12, "eval_loss": 3.068983554840088, "eval_runtime": 157.2439, "eval_samples_per_second": 72.066, "eval_steps_per_second": 18.017, "step": 39000 }, { "epoch": 6.27, "learning_rate": 3.094515077696578e-05, "loss": 2.7369, "step": 40000 }, { "epoch": 6.27, "eval_loss": 3.067845344543457, "eval_runtime": 151.1866, "eval_samples_per_second": 74.954, "eval_steps_per_second": 18.738, "step": 40000 }, { "epoch": 6.43, "learning_rate": 3.0388983437336626e-05, "loss": 2.7471, "step": 41000 }, { "epoch": 6.43, "eval_loss": 3.063314914703369, "eval_runtime": 150.8738, "eval_samples_per_second": 75.109, "eval_steps_per_second": 18.777, "step": 41000 }, { "epoch": 6.59, "learning_rate": 2.9832816097707477e-05, "loss": 2.7576, "step": 42000 }, { "epoch": 6.59, "eval_loss": 3.0629308223724365, "eval_runtime": 170.5929, "eval_samples_per_second": 66.427, "eval_steps_per_second": 16.607, "step": 42000 }, { "epoch": 6.74, "learning_rate": 2.9276648758078335e-05, "loss": 2.7566, "step": 43000 }, { "epoch": 6.74, "eval_loss": 3.0592966079711914, "eval_runtime": 152.4939, "eval_samples_per_second": 74.311, "eval_steps_per_second": 18.578, "step": 43000 }, { "epoch": 6.9, "learning_rate": 2.8720481418449186e-05, "loss": 2.7642, "step": 44000 }, { "epoch": 6.9, "eval_loss": 3.055969715118408, "eval_runtime": 152.0719, "eval_samples_per_second": 74.517, "eval_steps_per_second": 18.629, "step": 44000 }, { "epoch": 7.06, "learning_rate": 2.8164314078820037e-05, "loss": 2.7268, "step": 45000 }, { "epoch": 7.06, "eval_loss": 3.0751819610595703, "eval_runtime": 150.8723, "eval_samples_per_second": 75.11, "eval_steps_per_second": 18.777, "step": 45000 }, { "epoch": 7.21, "learning_rate": 2.7608146739190888e-05, "loss": 2.6778, "step": 46000 }, { "epoch": 7.21, "eval_loss": 3.0745816230773926, "eval_runtime": 150.773, "eval_samples_per_second": 75.159, "eval_steps_per_second": 18.79, "step": 46000 }, { "epoch": 7.37, "learning_rate": 2.705197939956174e-05, "loss": 2.6895, "step": 47000 }, { "epoch": 7.37, "eval_loss": 3.071889877319336, "eval_runtime": 151.4074, "eval_samples_per_second": 74.844, "eval_steps_per_second": 18.711, "step": 47000 }, { "epoch": 7.53, "learning_rate": 2.6495812059932596e-05, "loss": 2.6908, "step": 48000 }, { "epoch": 7.53, "eval_loss": 3.0682897567749023, "eval_runtime": 150.9084, "eval_samples_per_second": 75.092, "eval_steps_per_second": 18.773, "step": 48000 }, { "epoch": 7.69, "learning_rate": 2.5939644720303447e-05, "loss": 2.6985, "step": 49000 }, { "epoch": 7.69, "eval_loss": 3.06658935546875, "eval_runtime": 151.6999, "eval_samples_per_second": 74.7, "eval_steps_per_second": 18.675, "step": 49000 }, { "epoch": 7.84, "learning_rate": 2.5383477380674298e-05, "loss": 2.6969, "step": 50000 }, { "epoch": 7.84, "eval_loss": 3.0591042041778564, "eval_runtime": 161.1961, "eval_samples_per_second": 70.299, "eval_steps_per_second": 17.575, "step": 50000 }, { "epoch": 8.0, "learning_rate": 2.4827310041045153e-05, "loss": 2.7043, "step": 51000 }, { "epoch": 8.0, "eval_loss": 3.0611181259155273, "eval_runtime": 152.3504, "eval_samples_per_second": 74.381, "eval_steps_per_second": 18.595, "step": 51000 }, { "epoch": 8.16, "learning_rate": 2.4271142701416004e-05, "loss": 2.6225, "step": 52000 }, { "epoch": 8.16, "eval_loss": 3.080932378768921, "eval_runtime": 150.5999, "eval_samples_per_second": 75.246, "eval_steps_per_second": 18.811, "step": 52000 }, { "epoch": 8.31, "learning_rate": 2.3714975361786858e-05, "loss": 2.636, "step": 53000 }, { "epoch": 8.31, "eval_loss": 3.0787675380706787, "eval_runtime": 152.1353, "eval_samples_per_second": 74.486, "eval_steps_per_second": 18.622, "step": 53000 }, { "epoch": 8.47, "learning_rate": 2.315880802215771e-05, "loss": 2.6379, "step": 54000 }, { "epoch": 8.47, "eval_loss": 3.0813567638397217, "eval_runtime": 150.7408, "eval_samples_per_second": 75.175, "eval_steps_per_second": 18.794, "step": 54000 }, { "epoch": 8.63, "learning_rate": 2.260264068252856e-05, "loss": 2.6423, "step": 55000 }, { "epoch": 8.63, "eval_loss": 3.0757715702056885, "eval_runtime": 151.7054, "eval_samples_per_second": 74.697, "eval_steps_per_second": 18.674, "step": 55000 }, { "epoch": 8.78, "learning_rate": 2.2046473342899414e-05, "loss": 2.6417, "step": 56000 }, { "epoch": 8.78, "eval_loss": 3.074010133743286, "eval_runtime": 151.6069, "eval_samples_per_second": 74.746, "eval_steps_per_second": 18.686, "step": 56000 }, { "epoch": 8.94, "learning_rate": 2.1490306003270265e-05, "loss": 2.6507, "step": 57000 }, { "epoch": 8.94, "eval_loss": 3.07344388961792, "eval_runtime": 158.2388, "eval_samples_per_second": 71.613, "eval_steps_per_second": 17.903, "step": 57000 }, { "epoch": 9.1, "learning_rate": 2.0934138663641116e-05, "loss": 2.6058, "step": 58000 }, { "epoch": 9.1, "eval_loss": 3.086090087890625, "eval_runtime": 151.6749, "eval_samples_per_second": 74.712, "eval_steps_per_second": 18.678, "step": 58000 }, { "epoch": 9.25, "learning_rate": 2.0377971324011967e-05, "loss": 2.5833, "step": 59000 }, { "epoch": 9.25, "eval_loss": 3.086378812789917, "eval_runtime": 150.5584, "eval_samples_per_second": 75.266, "eval_steps_per_second": 18.817, "step": 59000 }, { "epoch": 9.41, "learning_rate": 1.982180398438282e-05, "loss": 2.5864, "step": 60000 }, { "epoch": 9.41, "eval_loss": 3.081770896911621, "eval_runtime": 152.7908, "eval_samples_per_second": 74.167, "eval_steps_per_second": 18.542, "step": 60000 }, { "epoch": 9.57, "learning_rate": 1.9265636644753672e-05, "loss": 2.5952, "step": 61000 }, { "epoch": 9.57, "eval_loss": 3.0846848487854004, "eval_runtime": 158.7675, "eval_samples_per_second": 71.375, "eval_steps_per_second": 17.844, "step": 61000 }, { "epoch": 9.72, "learning_rate": 1.8709469305124527e-05, "loss": 2.6003, "step": 62000 }, { "epoch": 9.72, "eval_loss": 3.0796427726745605, "eval_runtime": 151.7749, "eval_samples_per_second": 74.663, "eval_steps_per_second": 18.666, "step": 62000 }, { "epoch": 9.88, "learning_rate": 1.8153301965495378e-05, "loss": 2.6024, "step": 63000 }, { "epoch": 9.88, "eval_loss": 3.076544761657715, "eval_runtime": 150.8625, "eval_samples_per_second": 75.115, "eval_steps_per_second": 18.779, "step": 63000 }, { "epoch": 10.04, "learning_rate": 1.7597134625866232e-05, "loss": 2.5883, "step": 64000 }, { "epoch": 10.04, "eval_loss": 3.0901451110839844, "eval_runtime": 150.7672, "eval_samples_per_second": 75.162, "eval_steps_per_second": 18.791, "step": 64000 }, { "epoch": 10.19, "learning_rate": 1.7040967286237083e-05, "loss": 2.5393, "step": 65000 }, { "epoch": 10.19, "eval_loss": 3.0962793827056885, "eval_runtime": 158.5186, "eval_samples_per_second": 71.487, "eval_steps_per_second": 17.872, "step": 65000 }, { "epoch": 10.35, "learning_rate": 1.6484799946607937e-05, "loss": 2.5485, "step": 66000 }, { "epoch": 10.35, "eval_loss": 3.0939271450042725, "eval_runtime": 150.6091, "eval_samples_per_second": 75.241, "eval_steps_per_second": 18.81, "step": 66000 }, { "epoch": 10.51, "learning_rate": 1.5928632606978788e-05, "loss": 2.5496, "step": 67000 }, { "epoch": 10.51, "eval_loss": 3.092724084854126, "eval_runtime": 151.4272, "eval_samples_per_second": 74.835, "eval_steps_per_second": 18.709, "step": 67000 }, { "epoch": 10.66, "learning_rate": 1.5372465267349643e-05, "loss": 2.5577, "step": 68000 }, { "epoch": 10.66, "eval_loss": 3.0966575145721436, "eval_runtime": 152.0136, "eval_samples_per_second": 74.546, "eval_steps_per_second": 18.636, "step": 68000 }, { "epoch": 10.82, "learning_rate": 1.4816297927720494e-05, "loss": 2.5598, "step": 69000 }, { "epoch": 10.82, "eval_loss": 3.091947078704834, "eval_runtime": 150.5612, "eval_samples_per_second": 75.265, "eval_steps_per_second": 18.816, "step": 69000 }, { "epoch": 10.98, "learning_rate": 1.4260130588091345e-05, "loss": 2.5623, "step": 70000 }, { "epoch": 10.98, "eval_loss": 3.090240955352783, "eval_runtime": 151.6176, "eval_samples_per_second": 74.741, "eval_steps_per_second": 18.685, "step": 70000 }, { "epoch": 11.14, "learning_rate": 1.3703963248462199e-05, "loss": 2.5138, "step": 71000 }, { "epoch": 11.14, "eval_loss": 3.103158473968506, "eval_runtime": 150.096, "eval_samples_per_second": 75.498, "eval_steps_per_second": 18.875, "step": 71000 }, { "epoch": 11.29, "learning_rate": 1.314779590883305e-05, "loss": 2.5142, "step": 72000 }, { "epoch": 11.29, "eval_loss": 3.1014111042022705, "eval_runtime": 151.6463, "eval_samples_per_second": 74.727, "eval_steps_per_second": 18.682, "step": 72000 }, { "epoch": 11.45, "learning_rate": 1.2591628569203902e-05, "loss": 2.514, "step": 73000 }, { "epoch": 11.45, "eval_loss": 3.1006739139556885, "eval_runtime": 158.2805, "eval_samples_per_second": 71.594, "eval_steps_per_second": 17.899, "step": 73000 }, { "epoch": 11.61, "learning_rate": 1.2035461229574755e-05, "loss": 2.5206, "step": 74000 }, { "epoch": 11.61, "eval_loss": 3.1010029315948486, "eval_runtime": 150.3118, "eval_samples_per_second": 75.39, "eval_steps_per_second": 18.847, "step": 74000 }, { "epoch": 11.76, "learning_rate": 1.1479293889945606e-05, "loss": 2.52, "step": 75000 }, { "epoch": 11.76, "eval_loss": 3.098405361175537, "eval_runtime": 156.5194, "eval_samples_per_second": 72.4, "eval_steps_per_second": 18.1, "step": 75000 }, { "epoch": 11.92, "learning_rate": 1.0923126550316459e-05, "loss": 2.5204, "step": 76000 }, { "epoch": 11.92, "eval_loss": 3.0984325408935547, "eval_runtime": 151.2164, "eval_samples_per_second": 74.939, "eval_steps_per_second": 18.735, "step": 76000 } ], "max_steps": 95640, "num_train_epochs": 15, "total_flos": 1.1223588676956365e+18, "trial_name": null, "trial_params": null }