|
{ |
|
"best_metric": 0.25283440947532654, |
|
"best_model_checkpoint": "tuna6/mistral-saiga-journal-finetune6/checkpoint-1000", |
|
"epoch": 0.06540008502011052, |
|
"eval_steps": 40, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.6344897150993347, |
|
"learning_rate": 0.00028828828828828825, |
|
"loss": 0.8513, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"eval_loss": 0.6471168398857117, |
|
"eval_runtime": 596.3818, |
|
"eval_samples_per_second": 12.821, |
|
"eval_steps_per_second": 1.603, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.4552938938140869, |
|
"learning_rate": 0.00027627627627627627, |
|
"loss": 0.5674, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 0.5346186757087708, |
|
"eval_runtime": 595.5393, |
|
"eval_samples_per_second": 12.839, |
|
"eval_steps_per_second": 1.605, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.505496621131897, |
|
"learning_rate": 0.00026426426426426423, |
|
"loss": 0.4685, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 0.45588552951812744, |
|
"eval_runtime": 595.3281, |
|
"eval_samples_per_second": 12.843, |
|
"eval_steps_per_second": 1.606, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.49343597888946533, |
|
"learning_rate": 0.00025225225225225225, |
|
"loss": 0.4164, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 0.4150841534137726, |
|
"eval_runtime": 595.6775, |
|
"eval_samples_per_second": 12.836, |
|
"eval_steps_per_second": 1.605, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.7332279682159424, |
|
"learning_rate": 0.00024024024024024023, |
|
"loss": 0.3707, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_loss": 0.38310083746910095, |
|
"eval_runtime": 595.6271, |
|
"eval_samples_per_second": 12.837, |
|
"eval_steps_per_second": 1.605, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.30062803626060486, |
|
"learning_rate": 0.0002282282282282282, |
|
"loss": 0.3497, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 0.3645000457763672, |
|
"eval_runtime": 596.1295, |
|
"eval_samples_per_second": 12.826, |
|
"eval_steps_per_second": 1.604, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.3287501633167267, |
|
"learning_rate": 0.0002162162162162162, |
|
"loss": 0.296, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 0.3538116216659546, |
|
"eval_runtime": 597.2837, |
|
"eval_samples_per_second": 12.801, |
|
"eval_steps_per_second": 1.601, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.5396926999092102, |
|
"learning_rate": 0.00020420420420420418, |
|
"loss": 0.2976, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 0.3392348885536194, |
|
"eval_runtime": 596.5398, |
|
"eval_samples_per_second": 12.817, |
|
"eval_steps_per_second": 1.603, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.1267449855804443, |
|
"learning_rate": 0.00019219219219219217, |
|
"loss": 0.3018, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_loss": 0.3298206925392151, |
|
"eval_runtime": 596.9832, |
|
"eval_samples_per_second": 12.808, |
|
"eval_steps_per_second": 1.601, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.375235378742218, |
|
"learning_rate": 0.00018018018018018016, |
|
"loss": 0.3019, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 0.3226415514945984, |
|
"eval_runtime": 597.4868, |
|
"eval_samples_per_second": 12.797, |
|
"eval_steps_per_second": 1.6, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.4212028980255127, |
|
"learning_rate": 0.00016816816816816817, |
|
"loss": 0.323, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 0.31242895126342773, |
|
"eval_runtime": 596.6515, |
|
"eval_samples_per_second": 12.815, |
|
"eval_steps_per_second": 1.602, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.3362777531147003, |
|
"learning_rate": 0.00015615615615615616, |
|
"loss": 0.299, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 0.3068313002586365, |
|
"eval_runtime": 596.6534, |
|
"eval_samples_per_second": 12.815, |
|
"eval_steps_per_second": 1.602, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.29624176025390625, |
|
"learning_rate": 0.00014414414414414412, |
|
"loss": 0.2671, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 0.30176782608032227, |
|
"eval_runtime": 596.3445, |
|
"eval_samples_per_second": 12.821, |
|
"eval_steps_per_second": 1.603, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3254912197589874, |
|
"learning_rate": 0.00013213213213213211, |
|
"loss": 0.2799, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 0.2952657639980316, |
|
"eval_runtime": 596.6573, |
|
"eval_samples_per_second": 12.815, |
|
"eval_steps_per_second": 1.602, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.3114880919456482, |
|
"learning_rate": 0.00012012012012012012, |
|
"loss": 0.2674, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 0.29019972681999207, |
|
"eval_runtime": 596.4197, |
|
"eval_samples_per_second": 12.82, |
|
"eval_steps_per_second": 1.603, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.2812901437282562, |
|
"learning_rate": 0.0001081081081081081, |
|
"loss": 0.2395, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 0.2831648588180542, |
|
"eval_runtime": 596.2022, |
|
"eval_samples_per_second": 12.825, |
|
"eval_steps_per_second": 1.603, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.553597092628479, |
|
"learning_rate": 9.609609609609608e-05, |
|
"loss": 0.2436, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_loss": 0.27831366658210754, |
|
"eval_runtime": 595.5956, |
|
"eval_samples_per_second": 12.838, |
|
"eval_steps_per_second": 1.605, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.46639448404312134, |
|
"learning_rate": 8.408408408408409e-05, |
|
"loss": 0.2537, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 0.27261102199554443, |
|
"eval_runtime": 595.7096, |
|
"eval_samples_per_second": 12.835, |
|
"eval_steps_per_second": 1.605, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3072221279144287, |
|
"learning_rate": 7.207207207207206e-05, |
|
"loss": 0.238, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 0.2690950334072113, |
|
"eval_runtime": 595.7471, |
|
"eval_samples_per_second": 12.834, |
|
"eval_steps_per_second": 1.605, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.3728131055831909, |
|
"learning_rate": 6.006006006006006e-05, |
|
"loss": 0.2583, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 0.2644895017147064, |
|
"eval_runtime": 595.5546, |
|
"eval_samples_per_second": 12.838, |
|
"eval_steps_per_second": 1.605, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.4398050010204315, |
|
"learning_rate": 4.804804804804804e-05, |
|
"loss": 0.2274, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 0.26183322072029114, |
|
"eval_runtime": 595.4684, |
|
"eval_samples_per_second": 12.84, |
|
"eval_steps_per_second": 1.605, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.37743285298347473, |
|
"learning_rate": 3.603603603603603e-05, |
|
"loss": 0.2279, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 0.258406400680542, |
|
"eval_runtime": 595.383, |
|
"eval_samples_per_second": 12.842, |
|
"eval_steps_per_second": 1.606, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.25567543506622314, |
|
"learning_rate": 2.402402402402402e-05, |
|
"loss": 0.2497, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 0.25550028681755066, |
|
"eval_runtime": 595.2117, |
|
"eval_samples_per_second": 12.846, |
|
"eval_steps_per_second": 1.606, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.379567414522171, |
|
"learning_rate": 1.201201201201201e-05, |
|
"loss": 0.2599, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_loss": 0.25368040800094604, |
|
"eval_runtime": 595.2349, |
|
"eval_samples_per_second": 12.845, |
|
"eval_steps_per_second": 1.606, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.3067691922187805, |
|
"learning_rate": 0.0, |
|
"loss": 0.2332, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_loss": 0.25283440947532654, |
|
"eval_runtime": 595.1198, |
|
"eval_samples_per_second": 12.848, |
|
"eval_steps_per_second": 1.606, |
|
"step": 1000 |
|
} |
|
], |
|
"logging_steps": 40, |
|
"max_steps": 1000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 40, |
|
"total_flos": 3.1676545818624e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|