tuna6 / trainer_state.json
andandsoso344's picture
Upload 8 files
576bb16 verified
{
"best_metric": 0.25283440947532654,
"best_model_checkpoint": "tuna6/mistral-saiga-journal-finetune6/checkpoint-1000",
"epoch": 0.06540008502011052,
"eval_steps": 40,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.6344897150993347,
"learning_rate": 0.00028828828828828825,
"loss": 0.8513,
"step": 40
},
{
"epoch": 0.0,
"eval_loss": 0.6471168398857117,
"eval_runtime": 596.3818,
"eval_samples_per_second": 12.821,
"eval_steps_per_second": 1.603,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 0.4552938938140869,
"learning_rate": 0.00027627627627627627,
"loss": 0.5674,
"step": 80
},
{
"epoch": 0.01,
"eval_loss": 0.5346186757087708,
"eval_runtime": 595.5393,
"eval_samples_per_second": 12.839,
"eval_steps_per_second": 1.605,
"step": 80
},
{
"epoch": 0.01,
"grad_norm": 0.505496621131897,
"learning_rate": 0.00026426426426426423,
"loss": 0.4685,
"step": 120
},
{
"epoch": 0.01,
"eval_loss": 0.45588552951812744,
"eval_runtime": 595.3281,
"eval_samples_per_second": 12.843,
"eval_steps_per_second": 1.606,
"step": 120
},
{
"epoch": 0.01,
"grad_norm": 0.49343597888946533,
"learning_rate": 0.00025225225225225225,
"loss": 0.4164,
"step": 160
},
{
"epoch": 0.01,
"eval_loss": 0.4150841534137726,
"eval_runtime": 595.6775,
"eval_samples_per_second": 12.836,
"eval_steps_per_second": 1.605,
"step": 160
},
{
"epoch": 0.01,
"grad_norm": 0.7332279682159424,
"learning_rate": 0.00024024024024024023,
"loss": 0.3707,
"step": 200
},
{
"epoch": 0.01,
"eval_loss": 0.38310083746910095,
"eval_runtime": 595.6271,
"eval_samples_per_second": 12.837,
"eval_steps_per_second": 1.605,
"step": 200
},
{
"epoch": 0.02,
"grad_norm": 0.30062803626060486,
"learning_rate": 0.0002282282282282282,
"loss": 0.3497,
"step": 240
},
{
"epoch": 0.02,
"eval_loss": 0.3645000457763672,
"eval_runtime": 596.1295,
"eval_samples_per_second": 12.826,
"eval_steps_per_second": 1.604,
"step": 240
},
{
"epoch": 0.02,
"grad_norm": 0.3287501633167267,
"learning_rate": 0.0002162162162162162,
"loss": 0.296,
"step": 280
},
{
"epoch": 0.02,
"eval_loss": 0.3538116216659546,
"eval_runtime": 597.2837,
"eval_samples_per_second": 12.801,
"eval_steps_per_second": 1.601,
"step": 280
},
{
"epoch": 0.02,
"grad_norm": 0.5396926999092102,
"learning_rate": 0.00020420420420420418,
"loss": 0.2976,
"step": 320
},
{
"epoch": 0.02,
"eval_loss": 0.3392348885536194,
"eval_runtime": 596.5398,
"eval_samples_per_second": 12.817,
"eval_steps_per_second": 1.603,
"step": 320
},
{
"epoch": 0.02,
"grad_norm": 1.1267449855804443,
"learning_rate": 0.00019219219219219217,
"loss": 0.3018,
"step": 360
},
{
"epoch": 0.02,
"eval_loss": 0.3298206925392151,
"eval_runtime": 596.9832,
"eval_samples_per_second": 12.808,
"eval_steps_per_second": 1.601,
"step": 360
},
{
"epoch": 0.03,
"grad_norm": 0.375235378742218,
"learning_rate": 0.00018018018018018016,
"loss": 0.3019,
"step": 400
},
{
"epoch": 0.03,
"eval_loss": 0.3226415514945984,
"eval_runtime": 597.4868,
"eval_samples_per_second": 12.797,
"eval_steps_per_second": 1.6,
"step": 400
},
{
"epoch": 0.03,
"grad_norm": 0.4212028980255127,
"learning_rate": 0.00016816816816816817,
"loss": 0.323,
"step": 440
},
{
"epoch": 0.03,
"eval_loss": 0.31242895126342773,
"eval_runtime": 596.6515,
"eval_samples_per_second": 12.815,
"eval_steps_per_second": 1.602,
"step": 440
},
{
"epoch": 0.03,
"grad_norm": 0.3362777531147003,
"learning_rate": 0.00015615615615615616,
"loss": 0.299,
"step": 480
},
{
"epoch": 0.03,
"eval_loss": 0.3068313002586365,
"eval_runtime": 596.6534,
"eval_samples_per_second": 12.815,
"eval_steps_per_second": 1.602,
"step": 480
},
{
"epoch": 0.03,
"grad_norm": 0.29624176025390625,
"learning_rate": 0.00014414414414414412,
"loss": 0.2671,
"step": 520
},
{
"epoch": 0.03,
"eval_loss": 0.30176782608032227,
"eval_runtime": 596.3445,
"eval_samples_per_second": 12.821,
"eval_steps_per_second": 1.603,
"step": 520
},
{
"epoch": 0.04,
"grad_norm": 0.3254912197589874,
"learning_rate": 0.00013213213213213211,
"loss": 0.2799,
"step": 560
},
{
"epoch": 0.04,
"eval_loss": 0.2952657639980316,
"eval_runtime": 596.6573,
"eval_samples_per_second": 12.815,
"eval_steps_per_second": 1.602,
"step": 560
},
{
"epoch": 0.04,
"grad_norm": 0.3114880919456482,
"learning_rate": 0.00012012012012012012,
"loss": 0.2674,
"step": 600
},
{
"epoch": 0.04,
"eval_loss": 0.29019972681999207,
"eval_runtime": 596.4197,
"eval_samples_per_second": 12.82,
"eval_steps_per_second": 1.603,
"step": 600
},
{
"epoch": 0.04,
"grad_norm": 0.2812901437282562,
"learning_rate": 0.0001081081081081081,
"loss": 0.2395,
"step": 640
},
{
"epoch": 0.04,
"eval_loss": 0.2831648588180542,
"eval_runtime": 596.2022,
"eval_samples_per_second": 12.825,
"eval_steps_per_second": 1.603,
"step": 640
},
{
"epoch": 0.04,
"grad_norm": 0.553597092628479,
"learning_rate": 9.609609609609608e-05,
"loss": 0.2436,
"step": 680
},
{
"epoch": 0.04,
"eval_loss": 0.27831366658210754,
"eval_runtime": 595.5956,
"eval_samples_per_second": 12.838,
"eval_steps_per_second": 1.605,
"step": 680
},
{
"epoch": 0.05,
"grad_norm": 0.46639448404312134,
"learning_rate": 8.408408408408409e-05,
"loss": 0.2537,
"step": 720
},
{
"epoch": 0.05,
"eval_loss": 0.27261102199554443,
"eval_runtime": 595.7096,
"eval_samples_per_second": 12.835,
"eval_steps_per_second": 1.605,
"step": 720
},
{
"epoch": 0.05,
"grad_norm": 0.3072221279144287,
"learning_rate": 7.207207207207206e-05,
"loss": 0.238,
"step": 760
},
{
"epoch": 0.05,
"eval_loss": 0.2690950334072113,
"eval_runtime": 595.7471,
"eval_samples_per_second": 12.834,
"eval_steps_per_second": 1.605,
"step": 760
},
{
"epoch": 0.05,
"grad_norm": 0.3728131055831909,
"learning_rate": 6.006006006006006e-05,
"loss": 0.2583,
"step": 800
},
{
"epoch": 0.05,
"eval_loss": 0.2644895017147064,
"eval_runtime": 595.5546,
"eval_samples_per_second": 12.838,
"eval_steps_per_second": 1.605,
"step": 800
},
{
"epoch": 0.05,
"grad_norm": 0.4398050010204315,
"learning_rate": 4.804804804804804e-05,
"loss": 0.2274,
"step": 840
},
{
"epoch": 0.05,
"eval_loss": 0.26183322072029114,
"eval_runtime": 595.4684,
"eval_samples_per_second": 12.84,
"eval_steps_per_second": 1.605,
"step": 840
},
{
"epoch": 0.06,
"grad_norm": 0.37743285298347473,
"learning_rate": 3.603603603603603e-05,
"loss": 0.2279,
"step": 880
},
{
"epoch": 0.06,
"eval_loss": 0.258406400680542,
"eval_runtime": 595.383,
"eval_samples_per_second": 12.842,
"eval_steps_per_second": 1.606,
"step": 880
},
{
"epoch": 0.06,
"grad_norm": 0.25567543506622314,
"learning_rate": 2.402402402402402e-05,
"loss": 0.2497,
"step": 920
},
{
"epoch": 0.06,
"eval_loss": 0.25550028681755066,
"eval_runtime": 595.2117,
"eval_samples_per_second": 12.846,
"eval_steps_per_second": 1.606,
"step": 920
},
{
"epoch": 0.06,
"grad_norm": 0.379567414522171,
"learning_rate": 1.201201201201201e-05,
"loss": 0.2599,
"step": 960
},
{
"epoch": 0.06,
"eval_loss": 0.25368040800094604,
"eval_runtime": 595.2349,
"eval_samples_per_second": 12.845,
"eval_steps_per_second": 1.606,
"step": 960
},
{
"epoch": 0.07,
"grad_norm": 0.3067691922187805,
"learning_rate": 0.0,
"loss": 0.2332,
"step": 1000
},
{
"epoch": 0.07,
"eval_loss": 0.25283440947532654,
"eval_runtime": 595.1198,
"eval_samples_per_second": 12.848,
"eval_steps_per_second": 1.606,
"step": 1000
}
],
"logging_steps": 40,
"max_steps": 1000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 40,
"total_flos": 3.1676545818624e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}