|
{ |
|
"best_metric": 0.40765267610549927, |
|
"best_model_checkpoint": "m2m100_418M_finetuned_fr_to_sw/checkpoint-32000", |
|
"epoch": 5.977956286194657, |
|
"eval_steps": 1000, |
|
"global_step": 32000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 1.992527554642257e-05, |
|
"loss": 2.3549, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 1.9850551092845137e-05, |
|
"loss": 0.8843, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_bleu": 10.6171, |
|
"eval_gen_len": 60.1469, |
|
"eval_loss": 0.76387619972229, |
|
"eval_runtime": 3073.3727, |
|
"eval_samples_per_second": 3.483, |
|
"eval_steps_per_second": 0.436, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 1.97758266392677e-05, |
|
"loss": 0.7804, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 1.9701102185690268e-05, |
|
"loss": 0.7269, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"eval_bleu": 14.3741, |
|
"eval_gen_len": 61.2398, |
|
"eval_loss": 0.6497731804847717, |
|
"eval_runtime": 3071.4883, |
|
"eval_samples_per_second": 3.486, |
|
"eval_steps_per_second": 0.436, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 1.9626377732112836e-05, |
|
"loss": 0.6685, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 1.9551653278535403e-05, |
|
"loss": 0.6504, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_bleu": 19.1778, |
|
"eval_gen_len": 54.184, |
|
"eval_loss": 0.5995421409606934, |
|
"eval_runtime": 2745.8255, |
|
"eval_samples_per_second": 3.899, |
|
"eval_steps_per_second": 0.488, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 1.9476928824957967e-05, |
|
"loss": 0.6243, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 1.9402204371380535e-05, |
|
"loss": 0.6093, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"eval_bleu": 20.3586, |
|
"eval_gen_len": 56.1132, |
|
"eval_loss": 0.5621405243873596, |
|
"eval_runtime": 2804.8241, |
|
"eval_samples_per_second": 3.817, |
|
"eval_steps_per_second": 0.477, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 1.9327479917803102e-05, |
|
"loss": 0.5779, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 1.925275546422567e-05, |
|
"loss": 0.58, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"eval_bleu": 22.497, |
|
"eval_gen_len": 53.262, |
|
"eval_loss": 0.5317678451538086, |
|
"eval_runtime": 2510.0734, |
|
"eval_samples_per_second": 4.265, |
|
"eval_steps_per_second": 0.533, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"learning_rate": 1.9178031010648237e-05, |
|
"loss": 0.5499, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"learning_rate": 1.91033065570708e-05, |
|
"loss": 0.5067, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_bleu": 24.1584, |
|
"eval_gen_len": 56.1712, |
|
"eval_loss": 0.5155890583992004, |
|
"eval_runtime": 2540.4899, |
|
"eval_samples_per_second": 4.214, |
|
"eval_steps_per_second": 0.527, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"learning_rate": 1.902858210349337e-05, |
|
"loss": 0.5104, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"learning_rate": 1.8953857649915936e-05, |
|
"loss": 0.4985, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"eval_bleu": 24.902, |
|
"eval_gen_len": 55.1034, |
|
"eval_loss": 0.5012524127960205, |
|
"eval_runtime": 2433.6643, |
|
"eval_samples_per_second": 4.399, |
|
"eval_steps_per_second": 0.55, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"learning_rate": 1.8879133196338504e-05, |
|
"loss": 0.4949, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"learning_rate": 1.880440874276107e-05, |
|
"loss": 0.4861, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"eval_bleu": 25.8945, |
|
"eval_gen_len": 55.7148, |
|
"eval_loss": 0.48973962664604187, |
|
"eval_runtime": 2476.4919, |
|
"eval_samples_per_second": 4.323, |
|
"eval_steps_per_second": 0.541, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"learning_rate": 1.8729684289183636e-05, |
|
"loss": 0.4827, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"learning_rate": 1.8654959835606203e-05, |
|
"loss": 0.4789, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"eval_bleu": 26.2593, |
|
"eval_gen_len": 54.9688, |
|
"eval_loss": 0.4776358902454376, |
|
"eval_runtime": 2500.3532, |
|
"eval_samples_per_second": 4.282, |
|
"eval_steps_per_second": 0.536, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"learning_rate": 1.858023538202877e-05, |
|
"loss": 0.4757, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"learning_rate": 1.8505510928451338e-05, |
|
"loss": 0.4748, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"eval_bleu": 26.8308, |
|
"eval_gen_len": 53.234, |
|
"eval_loss": 0.4675232470035553, |
|
"eval_runtime": 2354.2254, |
|
"eval_samples_per_second": 4.548, |
|
"eval_steps_per_second": 0.569, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"learning_rate": 1.8430786474873902e-05, |
|
"loss": 0.4721, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"learning_rate": 1.835606202129647e-05, |
|
"loss": 0.4365, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"eval_bleu": 27.8127, |
|
"eval_gen_len": 53.7894, |
|
"eval_loss": 0.46269142627716064, |
|
"eval_runtime": 2320.7276, |
|
"eval_samples_per_second": 4.613, |
|
"eval_steps_per_second": 0.577, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"learning_rate": 1.8281337567719037e-05, |
|
"loss": 0.4124, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"learning_rate": 1.8206613114141605e-05, |
|
"loss": 0.4065, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"eval_bleu": 28.1334, |
|
"eval_gen_len": 52.4625, |
|
"eval_loss": 0.4552680253982544, |
|
"eval_runtime": 2260.3502, |
|
"eval_samples_per_second": 4.736, |
|
"eval_steps_per_second": 0.592, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"learning_rate": 1.8131888660564172e-05, |
|
"loss": 0.4236, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"learning_rate": 1.8057164206986736e-05, |
|
"loss": 0.4159, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"eval_bleu": 28.5473, |
|
"eval_gen_len": 53.1084, |
|
"eval_loss": 0.4502773582935333, |
|
"eval_runtime": 2305.3584, |
|
"eval_samples_per_second": 4.644, |
|
"eval_steps_per_second": 0.581, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"learning_rate": 1.7982439753409304e-05, |
|
"loss": 0.4037, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"learning_rate": 1.790771529983187e-05, |
|
"loss": 0.4078, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"eval_bleu": 28.522, |
|
"eval_gen_len": 53.9566, |
|
"eval_loss": 0.44360852241516113, |
|
"eval_runtime": 2357.175, |
|
"eval_samples_per_second": 4.542, |
|
"eval_steps_per_second": 0.568, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"learning_rate": 1.783299084625444e-05, |
|
"loss": 0.4016, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"learning_rate": 1.7758266392677006e-05, |
|
"loss": 0.4088, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"eval_bleu": 29.6642, |
|
"eval_gen_len": 54.4689, |
|
"eval_loss": 0.439211368560791, |
|
"eval_runtime": 2379.609, |
|
"eval_samples_per_second": 4.499, |
|
"eval_steps_per_second": 0.563, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"learning_rate": 1.768354193909957e-05, |
|
"loss": 0.4046, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"learning_rate": 1.7608817485522138e-05, |
|
"loss": 0.4039, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"eval_bleu": 29.8929, |
|
"eval_gen_len": 55.4612, |
|
"eval_loss": 0.4344358444213867, |
|
"eval_runtime": 2401.3922, |
|
"eval_samples_per_second": 4.458, |
|
"eval_steps_per_second": 0.558, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"learning_rate": 1.7534093031944705e-05, |
|
"loss": 0.3635, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"learning_rate": 1.7459368578367273e-05, |
|
"loss": 0.3537, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"eval_bleu": 30.2302, |
|
"eval_gen_len": 54.3727, |
|
"eval_loss": 0.43423154950141907, |
|
"eval_runtime": 2351.6946, |
|
"eval_samples_per_second": 4.552, |
|
"eval_steps_per_second": 0.569, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"learning_rate": 1.7384644124789837e-05, |
|
"loss": 0.3575, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"learning_rate": 1.7309919671212404e-05, |
|
"loss": 0.3569, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"eval_bleu": 30.1139, |
|
"eval_gen_len": 54.6381, |
|
"eval_loss": 0.4319211542606354, |
|
"eval_runtime": 2402.5651, |
|
"eval_samples_per_second": 4.456, |
|
"eval_steps_per_second": 0.557, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"learning_rate": 1.7235195217634972e-05, |
|
"loss": 0.3564, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"learning_rate": 1.716047076405754e-05, |
|
"loss": 0.3564, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"eval_bleu": 30.8007, |
|
"eval_gen_len": 53.8819, |
|
"eval_loss": 0.42764702439308167, |
|
"eval_runtime": 2333.0447, |
|
"eval_samples_per_second": 4.589, |
|
"eval_steps_per_second": 0.574, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 1.7085746310480107e-05, |
|
"loss": 0.3576, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"learning_rate": 1.701102185690267e-05, |
|
"loss": 0.3637, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"eval_bleu": 30.8698, |
|
"eval_gen_len": 53.7231, |
|
"eval_loss": 0.422607421875, |
|
"eval_runtime": 2331.5045, |
|
"eval_samples_per_second": 4.592, |
|
"eval_steps_per_second": 0.574, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"learning_rate": 1.693629740332524e-05, |
|
"loss": 0.3601, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"learning_rate": 1.6861572949747806e-05, |
|
"loss": 0.3571, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"eval_bleu": 31.1343, |
|
"eval_gen_len": 53.5349, |
|
"eval_loss": 0.41751930117607117, |
|
"eval_runtime": 2304.4971, |
|
"eval_samples_per_second": 4.646, |
|
"eval_steps_per_second": 0.581, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"learning_rate": 1.6786848496170374e-05, |
|
"loss": 0.3441, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"learning_rate": 1.671212404259294e-05, |
|
"loss": 0.3099, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"eval_bleu": 31.3026, |
|
"eval_gen_len": 53.4483, |
|
"eval_loss": 0.421342134475708, |
|
"eval_runtime": 2298.8454, |
|
"eval_samples_per_second": 4.657, |
|
"eval_steps_per_second": 0.582, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"learning_rate": 1.6637399589015505e-05, |
|
"loss": 0.3175, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"learning_rate": 1.6562675135438073e-05, |
|
"loss": 0.3104, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"eval_bleu": 31.1261, |
|
"eval_gen_len": 51.5196, |
|
"eval_loss": 0.4227532744407654, |
|
"eval_runtime": 2198.9363, |
|
"eval_samples_per_second": 4.869, |
|
"eval_steps_per_second": 0.609, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"learning_rate": 1.648795068186064e-05, |
|
"loss": 0.3169, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"learning_rate": 1.6413226228283208e-05, |
|
"loss": 0.3162, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"eval_bleu": 31.9091, |
|
"eval_gen_len": 53.0626, |
|
"eval_loss": 0.4195193946361542, |
|
"eval_runtime": 2270.3312, |
|
"eval_samples_per_second": 4.716, |
|
"eval_steps_per_second": 0.59, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"learning_rate": 1.6338501774705772e-05, |
|
"loss": 0.3128, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"learning_rate": 1.626377732112834e-05, |
|
"loss": 0.3177, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"eval_bleu": 31.5561, |
|
"eval_gen_len": 52.3463, |
|
"eval_loss": 0.4158227741718292, |
|
"eval_runtime": 2237.9742, |
|
"eval_samples_per_second": 4.784, |
|
"eval_steps_per_second": 0.598, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"learning_rate": 1.6189052867550907e-05, |
|
"loss": 0.3216, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"learning_rate": 1.6114328413973474e-05, |
|
"loss": 0.3181, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"eval_bleu": 32.1029, |
|
"eval_gen_len": 53.8831, |
|
"eval_loss": 0.4130856692790985, |
|
"eval_runtime": 2288.7247, |
|
"eval_samples_per_second": 4.678, |
|
"eval_steps_per_second": 0.585, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"learning_rate": 1.6039603960396042e-05, |
|
"loss": 0.3176, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"learning_rate": 1.5964879506818606e-05, |
|
"loss": 0.2941, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 5.04, |
|
"eval_bleu": 32.1061, |
|
"eval_gen_len": 52.7448, |
|
"eval_loss": 0.4150530993938446, |
|
"eval_runtime": 2247.1776, |
|
"eval_samples_per_second": 4.764, |
|
"eval_steps_per_second": 0.596, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 5.14, |
|
"learning_rate": 1.5890155053241173e-05, |
|
"loss": 0.2752, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 5.23, |
|
"learning_rate": 1.581543059966374e-05, |
|
"loss": 0.274, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 5.23, |
|
"eval_bleu": 31.9128, |
|
"eval_gen_len": 52.9394, |
|
"eval_loss": 0.4146653711795807, |
|
"eval_runtime": 2267.9887, |
|
"eval_samples_per_second": 4.72, |
|
"eval_steps_per_second": 0.59, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"learning_rate": 1.574070614608631e-05, |
|
"loss": 0.2852, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"learning_rate": 1.5665981692508876e-05, |
|
"loss": 0.2713, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 5.42, |
|
"eval_bleu": 32.452, |
|
"eval_gen_len": 52.881, |
|
"eval_loss": 0.41379648447036743, |
|
"eval_runtime": 2262.9812, |
|
"eval_samples_per_second": 4.731, |
|
"eval_steps_per_second": 0.592, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 5.51, |
|
"learning_rate": 1.559125723893144e-05, |
|
"loss": 0.2791, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"learning_rate": 1.5516532785354008e-05, |
|
"loss": 0.283, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"eval_bleu": 32.6103, |
|
"eval_gen_len": 53.2173, |
|
"eval_loss": 0.4103504717350006, |
|
"eval_runtime": 2279.2685, |
|
"eval_samples_per_second": 4.697, |
|
"eval_steps_per_second": 0.587, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"learning_rate": 1.5441808331776575e-05, |
|
"loss": 0.2835, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"learning_rate": 1.5367083878199142e-05, |
|
"loss": 0.2866, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 5.79, |
|
"eval_bleu": 32.5888, |
|
"eval_gen_len": 52.9638, |
|
"eval_loss": 0.41094180941581726, |
|
"eval_runtime": 2257.2287, |
|
"eval_samples_per_second": 4.743, |
|
"eval_steps_per_second": 0.593, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 5.88, |
|
"learning_rate": 1.5292359424621707e-05, |
|
"loss": 0.2851, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"learning_rate": 1.5217634971044276e-05, |
|
"loss": 0.2865, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"eval_bleu": 32.6545, |
|
"eval_gen_len": 52.4693, |
|
"eval_loss": 0.40765267610549927, |
|
"eval_runtime": 2233.0208, |
|
"eval_samples_per_second": 4.794, |
|
"eval_steps_per_second": 0.6, |
|
"step": 32000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 133825, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 25, |
|
"save_steps": 1000, |
|
"total_flos": 1.0234506177547469e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|