{
  "best_metric": 5.6603827476501465,
  "best_model_checkpoint": "../checkpoints/nf-bart-newsroom-rqnsf/checkpoint-60000",
  "epoch": 0.9648008490247472,
  "global_step": 60000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "gate_score": 0.371,
      "learning_rate": 4.9725e-05,
      "loss": 592.8312,
      "nf_loss": 591.7519,
      "ppl": 3.3519,
      "step": 2000
    },
    {
      "epoch": 0.06,
      "gate_score": 0.2007,
      "learning_rate": 4.834853544667631e-05,
      "loss": 333.2925,
      "nf_loss": 332.3035,
      "ppl": 2.8873,
      "step": 4000
    },
    {
      "epoch": 0.06,
      "eval_loss": 252.84817504882812,
      "eval_nf_loss": 251.2677459716797,
      "eval_perplexity": 6.363341331481934,
      "eval_runtime": 4381.8372,
      "eval_samples_per_second": 24.837,
      "eval_steps_per_second": 1.552,
      "step": 4000
    },
    {
      "epoch": 0.1,
      "gate_score": 0.1135,
      "learning_rate": 4.6687102294439184e-05,
      "loss": 211.0105,
      "nf_loss": 209.9559,
      "ppl": 3.1158,
      "step": 6000
    },
    {
      "epoch": 0.13,
      "gate_score": 0.0666,
      "learning_rate": 4.5025669142202066e-05,
      "loss": 134.1864,
      "nf_loss": 133.0995,
      "ppl": 3.2307,
      "step": 8000
    },
    {
      "epoch": 0.13,
      "eval_loss": 102.85702514648438,
      "eval_nf_loss": 101.2651596069336,
      "eval_perplexity": 6.448662757873535,
      "eval_runtime": 4005.1516,
      "eval_samples_per_second": 27.173,
      "eval_steps_per_second": 1.698,
      "step": 8000
    },
    {
      "epoch": 0.16,
      "gate_score": 0.0352,
      "learning_rate": 4.3365897423117185e-05,
      "loss": 95.272,
      "nf_loss": 94.1643,
      "ppl": 3.2978,
      "step": 10000
    },
    {
      "epoch": 0.19,
      "gate_score": 0.023,
      "learning_rate": 4.170446427088006e-05,
      "loss": 68.625,
      "nf_loss": 67.5129,
      "ppl": 3.324,
      "step": 12000
    },
    {
      "epoch": 0.19,
      "eval_loss": 53.623291015625,
      "eval_nf_loss": 52.02442169189453,
      "eval_perplexity": 6.536867618560791,
      "eval_runtime": 3924.6092,
      "eval_samples_per_second": 27.731,
      "eval_steps_per_second": 1.733,
      "step": 12000
    },
    {
      "epoch": 0.23,
      "gate_score": 0.023,
      "learning_rate": 4.004386183521906e-05,
      "loss": 49.4308,
      "nf_loss": 48.3272,
      "ppl": 3.2964,
      "step": 14000
    },
    {
      "epoch": 0.26,
      "gate_score": 0.0219,
      "learning_rate": 3.8383259399558055e-05,
      "loss": 36.663,
      "nf_loss": 35.5419,
      "ppl": 3.3952,
      "step": 16000
    },
    {
      "epoch": 0.26,
      "eval_loss": 29.72612762451172,
      "eval_nf_loss": 28.141469955444336,
      "eval_perplexity": 6.444383144378662,
      "eval_runtime": 3904.4886,
      "eval_samples_per_second": 27.874,
      "eval_steps_per_second": 1.742,
      "step": 16000
    },
    {
      "epoch": 0.29,
      "gate_score": 0.0183,
      "learning_rate": 3.672265696389706e-05,
      "loss": 30.1161,
      "nf_loss": 29.0084,
      "ppl": 3.3204,
      "step": 18000
    },
    {
      "epoch": 0.32,
      "gate_score": 0.0188,
      "learning_rate": 3.5061223811659935e-05,
      "loss": 25.2734,
      "nf_loss": 24.1661,
      "ppl": 3.3019,
      "step": 20000
    },
    {
      "epoch": 0.32,
      "eval_loss": 21.10334014892578,
      "eval_nf_loss": 19.523048400878906,
      "eval_perplexity": 6.385104656219482,
      "eval_runtime": 3951.5734,
      "eval_samples_per_second": 27.541,
      "eval_steps_per_second": 1.721,
      "step": 20000
    },
    {
      "epoch": 0.35,
      "gate_score": 0.0189,
      "learning_rate": 3.340062137599894e-05,
      "loss": 21.8002,
      "nf_loss": 20.6893,
      "ppl": 3.3211,
      "step": 22000
    },
    {
      "epoch": 0.39,
      "gate_score": 0.0186,
      "learning_rate": 3.1740018940337936e-05,
      "loss": 19.5679,
      "nf_loss": 18.4664,
      "ppl": 3.2967,
      "step": 24000
    },
    {
      "epoch": 0.39,
      "eval_loss": 16.757369995117188,
      "eval_nf_loss": 15.157673835754395,
      "eval_perplexity": 6.50905179977417,
      "eval_runtime": 4060.1325,
      "eval_samples_per_second": 26.805,
      "eval_steps_per_second": 1.675,
      "step": 24000
    },
    {
      "epoch": 0.42,
      "gate_score": 0.0169,
      "learning_rate": 3.0079416504676937e-05,
      "loss": 17.5505,
      "nf_loss": 16.4471,
      "ppl": 3.2866,
      "step": 26000
    },
    {
      "epoch": 0.45,
      "gate_score": 0.0169,
      "learning_rate": 2.8417983352439813e-05,
      "loss": 16.0243,
      "nf_loss": 14.9265,
      "ppl": 3.2771,
      "step": 28000
    },
    {
      "epoch": 0.45,
      "eval_loss": 13.916472434997559,
      "eval_nf_loss": 12.336326599121094,
      "eval_perplexity": 6.333822250366211,
      "eval_runtime": 4073.3885,
      "eval_samples_per_second": 26.718,
      "eval_steps_per_second": 1.67,
      "step": 28000
    },
    {
      "epoch": 0.48,
      "gate_score": 0.0164,
      "learning_rate": 2.6757380916778813e-05,
      "loss": 14.7704,
      "nf_loss": 13.6695,
      "ppl": 3.2909,
      "step": 30000
    },
    {
      "epoch": 0.51,
      "gate_score": 0.0165,
      "learning_rate": 2.509677848111781e-05,
      "loss": 13.8487,
      "nf_loss": 12.7566,
      "ppl": 3.2604,
      "step": 32000
    },
    {
      "epoch": 0.51,
      "eval_loss": 12.310139656066895,
      "eval_nf_loss": 10.705780029296875,
      "eval_perplexity": 6.516045570373535,
      "eval_runtime": 3950.6921,
      "eval_samples_per_second": 27.548,
      "eval_steps_per_second": 1.722,
      "step": 32000
    },
    {
      "epoch": 0.55,
      "gate_score": 0.0163,
      "learning_rate": 2.3435345328880693e-05,
      "loss": 13.021,
      "nf_loss": 11.939,
      "ppl": 3.2217,
      "step": 34000
    },
    {
      "epoch": 0.58,
      "gate_score": 0.0164,
      "learning_rate": 2.1774742893219694e-05,
      "loss": 12.3656,
      "nf_loss": 11.2731,
      "ppl": 3.2491,
      "step": 36000
    },
    {
      "epoch": 0.58,
      "eval_loss": 11.076099395751953,
      "eval_nf_loss": 9.518845558166504,
      "eval_perplexity": 6.16482400894165,
      "eval_runtime": 3933.1054,
      "eval_samples_per_second": 27.671,
      "eval_steps_per_second": 1.729,
      "step": 36000
    },
    {
      "epoch": 0.61,
      "gate_score": 0.0169,
      "learning_rate": 2.0113309740982573e-05,
      "loss": 11.834,
      "nf_loss": 10.7422,
      "ppl": 3.248,
      "step": 38000
    },
    {
      "epoch": 0.64,
      "gate_score": 0.017,
      "learning_rate": 1.845270730532157e-05,
      "loss": 11.301,
      "nf_loss": 10.2211,
      "ppl": 3.2176,
      "step": 40000
    },
    {
      "epoch": 0.64,
      "eval_loss": 9.942972183227539,
      "eval_nf_loss": 8.422438621520996,
      "eval_perplexity": 5.888382911682129,
      "eval_runtime": 3905.9353,
      "eval_samples_per_second": 27.863,
      "eval_steps_per_second": 1.741,
      "step": 40000
    },
    {
      "epoch": 0.68,
      "gate_score": 0.0178,
      "learning_rate": 1.679293558623669e-05,
      "loss": 10.826,
      "nf_loss": 9.7634,
      "ppl": 3.1432,
      "step": 42000
    },
    {
      "epoch": 0.71,
      "gate_score": 0.0167,
      "learning_rate": 1.513150243399957e-05,
      "loss": 10.5722,
      "nf_loss": 9.4924,
      "ppl": 3.2112,
      "step": 44000
    },
    {
      "epoch": 0.71,
      "eval_loss": 9.34610366821289,
      "eval_nf_loss": 7.819733619689941,
      "eval_perplexity": 5.929094314575195,
      "eval_runtime": 3809.9667,
      "eval_samples_per_second": 28.565,
      "eval_steps_per_second": 1.785,
      "step": 44000
    },
    {
      "epoch": 0.74,
      "gate_score": 0.0164,
      "learning_rate": 1.3470899998338569e-05,
      "loss": 10.1982,
      "nf_loss": 9.1308,
      "ppl": 3.1538,
      "step": 46000
    },
    {
      "epoch": 0.77,
      "gate_score": 0.0156,
      "learning_rate": 1.1810297562677566e-05,
      "loss": 9.9637,
      "nf_loss": 8.9113,
      "ppl": 3.1019,
      "step": 48000
    },
    {
      "epoch": 0.77,
      "eval_loss": 9.272148132324219,
      "eval_nf_loss": 7.770688056945801,
      "eval_perplexity": 5.756178855895996,
      "eval_runtime": 3799.2449,
      "eval_samples_per_second": 28.646,
      "eval_steps_per_second": 1.79,
      "step": 48000
    },
    {
      "epoch": 0.8,
      "gate_score": 0.0159,
      "learning_rate": 1.0148864410440447e-05,
      "loss": 9.7627,
      "nf_loss": 8.701,
      "ppl": 3.1339,
      "step": 50000
    },
    {
      "epoch": 0.84,
      "gate_score": 0.015,
      "learning_rate": 8.488261974779444e-06,
      "loss": 9.5228,
      "nf_loss": 8.4605,
      "ppl": 3.1399,
      "step": 52000
    },
    {
      "epoch": 0.84,
      "eval_loss": 8.568826675415039,
      "eval_nf_loss": 7.074533462524414,
      "eval_perplexity": 5.70849609375,
      "eval_runtime": 3805.5723,
      "eval_samples_per_second": 28.598,
      "eval_steps_per_second": 1.787,
      "step": 52000
    },
    {
      "epoch": 0.87,
      "gate_score": 0.0143,
      "learning_rate": 6.827659539118445e-06,
      "loss": 9.3407,
      "nf_loss": 8.2955,
      "ppl": 3.0762,
      "step": 54000
    },
    {
      "epoch": 0.9,
      "gate_score": 0.0148,
      "learning_rate": 5.167057103457443e-06,
      "loss": 9.1988,
      "nf_loss": 8.1602,
      "ppl": 3.058,
      "step": 56000
    },
    {
      "epoch": 0.9,
      "eval_loss": 8.280665397644043,
      "eval_nf_loss": 6.778278827667236,
      "eval_perplexity": 5.754031658172607,
      "eval_runtime": 3795.1994,
      "eval_samples_per_second": 28.676,
      "eval_steps_per_second": 1.792,
      "step": 56000
    },
    {
      "epoch": 0.93,
      "gate_score": 0.0146,
      "learning_rate": 3.5056239512203227e-06,
      "loss": 9.0572,
      "nf_loss": 8.0245,
      "ppl": 3.0413,
      "step": 58000
    },
    {
      "epoch": 0.96,
      "gate_score": 0.0138,
      "learning_rate": 1.8450215155593216e-06,
      "loss": 9.0079,
      "nf_loss": 7.9878,
      "ppl": 3.002,
      "step": 60000
    },
    {
      "epoch": 0.96,
      "eval_loss": 8.222879409790039,
      "eval_nf_loss": 6.735788822174072,
      "eval_perplexity": 5.6603827476501465,
      "eval_runtime": 3672.6524,
      "eval_samples_per_second": 29.633,
      "eval_steps_per_second": 1.852,
      "step": 60000
    }
  ],
  "max_steps": 62189,
  "num_train_epochs": 1,
  "total_flos": 1.06597659967488e+18,
  "trial_name": null,
  "trial_params": null
}