adapters-opt-bnb8-QLORA-super_glue-axg
/
trainer_state-opt-bnb8-QLORA-super_glue-axg-sequence_classification.json
{ | |
"best_metric": null, | |
"best_model_checkpoint": null, | |
"epoch": 10.0, | |
"eval_steps": 1, | |
"global_step": 90, | |
"is_hyper_param_search": false, | |
"is_local_process_zero": true, | |
"is_world_process_zero": true, | |
"log_history": [ | |
{ | |
"epoch": 0.1111111111111111, | |
"grad_norm": 3.634396553039551, | |
"learning_rate": 2.5e-05, | |
"loss": 0.7339, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.1111111111111111, | |
"eval_accuracy": 0.6666666666666666, | |
"eval_loss": 0.6577080488204956, | |
"eval_runtime": 1.3251, | |
"eval_samples_per_second": 54.338, | |
"eval_steps_per_second": 3.773, | |
"step": 1 | |
}, | |
{ | |
"epoch": 0.2222222222222222, | |
"grad_norm": 4.2958664894104, | |
"learning_rate": 5e-05, | |
"loss": 0.6651, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.2222222222222222, | |
"eval_accuracy": 0.625, | |
"eval_loss": 0.66876220703125, | |
"eval_runtime": 1.3312, | |
"eval_samples_per_second": 54.088, | |
"eval_steps_per_second": 3.756, | |
"step": 2 | |
}, | |
{ | |
"epoch": 0.3333333333333333, | |
"grad_norm": 11.94453239440918, | |
"learning_rate": 4.943181818181818e-05, | |
"loss": 0.8153, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.3333333333333333, | |
"eval_accuracy": 0.5833333333333334, | |
"eval_loss": 0.6917453408241272, | |
"eval_runtime": 1.3109, | |
"eval_samples_per_second": 54.922, | |
"eval_steps_per_second": 3.814, | |
"step": 3 | |
}, | |
{ | |
"epoch": 0.4444444444444444, | |
"grad_norm": 14.649009704589844, | |
"learning_rate": 4.886363636363637e-05, | |
"loss": 0.8472, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.4444444444444444, | |
"eval_accuracy": 0.5416666666666666, | |
"eval_loss": 0.7288275957107544, | |
"eval_runtime": 1.3394, | |
"eval_samples_per_second": 53.755, | |
"eval_steps_per_second": 3.733, | |
"step": 4 | |
}, | |
{ | |
"epoch": 0.5555555555555556, | |
"grad_norm": 6.4821929931640625, | |
"learning_rate": 4.829545454545455e-05, | |
"loss": 0.7116, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.5555555555555556, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.7855902910232544, | |
"eval_runtime": 1.315, | |
"eval_samples_per_second": 54.754, | |
"eval_steps_per_second": 3.802, | |
"step": 5 | |
}, | |
{ | |
"epoch": 0.6666666666666666, | |
"grad_norm": 11.31198787689209, | |
"learning_rate": 4.772727272727273e-05, | |
"loss": 0.7795, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.6666666666666666, | |
"eval_accuracy": 0.5, | |
"eval_loss": 0.7721625566482544, | |
"eval_runtime": 1.3044, | |
"eval_samples_per_second": 55.198, | |
"eval_steps_per_second": 3.833, | |
"step": 6 | |
}, | |
{ | |
"epoch": 0.7777777777777778, | |
"grad_norm": 12.5160551071167, | |
"learning_rate": 4.715909090909091e-05, | |
"loss": 0.6865, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.7777777777777778, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.7403971552848816, | |
"eval_runtime": 1.3043, | |
"eval_samples_per_second": 55.204, | |
"eval_steps_per_second": 3.834, | |
"step": 7 | |
}, | |
{ | |
"epoch": 0.8888888888888888, | |
"grad_norm": 9.578635215759277, | |
"learning_rate": 4.659090909090909e-05, | |
"loss": 0.7128, | |
"step": 8 | |
}, | |
{ | |
"epoch": 0.8888888888888888, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.7220730185508728, | |
"eval_runtime": 1.3069, | |
"eval_samples_per_second": 55.093, | |
"eval_steps_per_second": 3.826, | |
"step": 8 | |
}, | |
{ | |
"epoch": 1.0, | |
"grad_norm": 5.968580722808838, | |
"learning_rate": 4.602272727272727e-05, | |
"loss": 0.7407, | |
"step": 9 | |
}, | |
{ | |
"epoch": 1.0, | |
"eval_accuracy": 0.4722222222222222, | |
"eval_loss": 0.7479112148284912, | |
"eval_runtime": 1.3064, | |
"eval_samples_per_second": 55.112, | |
"eval_steps_per_second": 3.827, | |
"step": 9 | |
}, | |
{ | |
"epoch": 1.1111111111111112, | |
"grad_norm": 4.716536521911621, | |
"learning_rate": 4.545454545454546e-05, | |
"loss": 0.7308, | |
"step": 10 | |
}, | |
{ | |
"epoch": 1.1111111111111112, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.7470703125, | |
"eval_runtime": 1.3004, | |
"eval_samples_per_second": 55.366, | |
"eval_steps_per_second": 3.845, | |
"step": 10 | |
}, | |
{ | |
"epoch": 1.2222222222222223, | |
"grad_norm": 4.706182479858398, | |
"learning_rate": 4.488636363636364e-05, | |
"loss": 0.6675, | |
"step": 11 | |
}, | |
{ | |
"epoch": 1.2222222222222223, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.7646213173866272, | |
"eval_runtime": 1.2542, | |
"eval_samples_per_second": 57.408, | |
"eval_steps_per_second": 3.987, | |
"step": 11 | |
}, | |
{ | |
"epoch": 1.3333333333333333, | |
"grad_norm": 5.189148426055908, | |
"learning_rate": 4.431818181818182e-05, | |
"loss": 0.7677, | |
"step": 12 | |
}, | |
{ | |
"epoch": 1.3333333333333333, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.7891845703125, | |
"eval_runtime": 1.3273, | |
"eval_samples_per_second": 54.244, | |
"eval_steps_per_second": 3.767, | |
"step": 12 | |
}, | |
{ | |
"epoch": 1.4444444444444444, | |
"grad_norm": 6.091189861297607, | |
"learning_rate": 4.375e-05, | |
"loss": 0.6286, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.4444444444444444, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.81201171875, | |
"eval_runtime": 1.3227, | |
"eval_samples_per_second": 54.434, | |
"eval_steps_per_second": 3.78, | |
"step": 13 | |
}, | |
{ | |
"epoch": 1.5555555555555556, | |
"grad_norm": 11.45678997039795, | |
"learning_rate": 4.318181818181819e-05, | |
"loss": 0.7549, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.5555555555555556, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.8420138955116272, | |
"eval_runtime": 1.3297, | |
"eval_samples_per_second": 54.148, | |
"eval_steps_per_second": 3.76, | |
"step": 14 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"grad_norm": 7.684475421905518, | |
"learning_rate": 4.261363636363637e-05, | |
"loss": 0.7589, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.6666666666666665, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8681640625, | |
"eval_runtime": 1.3198, | |
"eval_samples_per_second": 54.553, | |
"eval_steps_per_second": 3.788, | |
"step": 15 | |
}, | |
{ | |
"epoch": 1.7777777777777777, | |
"grad_norm": 14.620720863342285, | |
"learning_rate": 4.204545454545455e-05, | |
"loss": 0.7893, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.7777777777777777, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8725178837776184, | |
"eval_runtime": 1.3319, | |
"eval_samples_per_second": 54.059, | |
"eval_steps_per_second": 3.754, | |
"step": 16 | |
}, | |
{ | |
"epoch": 1.8888888888888888, | |
"grad_norm": 13.67403507232666, | |
"learning_rate": 4.1477272727272734e-05, | |
"loss": 0.7441, | |
"step": 17 | |
}, | |
{ | |
"epoch": 1.8888888888888888, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8889024257659912, | |
"eval_runtime": 1.3268, | |
"eval_samples_per_second": 54.266, | |
"eval_steps_per_second": 3.768, | |
"step": 17 | |
}, | |
{ | |
"epoch": 2.0, | |
"grad_norm": 10.722838401794434, | |
"learning_rate": 4.0909090909090915e-05, | |
"loss": 0.7324, | |
"step": 18 | |
}, | |
{ | |
"epoch": 2.0, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8865559697151184, | |
"eval_runtime": 1.2973, | |
"eval_samples_per_second": 55.498, | |
"eval_steps_per_second": 3.854, | |
"step": 18 | |
}, | |
{ | |
"epoch": 2.111111111111111, | |
"grad_norm": 8.048421859741211, | |
"learning_rate": 4.034090909090909e-05, | |
"loss": 0.7708, | |
"step": 19 | |
}, | |
{ | |
"epoch": 2.111111111111111, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8744167685508728, | |
"eval_runtime": 1.315, | |
"eval_samples_per_second": 54.754, | |
"eval_steps_per_second": 3.802, | |
"step": 19 | |
}, | |
{ | |
"epoch": 2.2222222222222223, | |
"grad_norm": 9.907569885253906, | |
"learning_rate": 3.9772727272727275e-05, | |
"loss": 0.7127, | |
"step": 20 | |
}, | |
{ | |
"epoch": 2.2222222222222223, | |
"eval_accuracy": 0.3333333333333333, | |
"eval_loss": 0.8528510332107544, | |
"eval_runtime": 1.3369, | |
"eval_samples_per_second": 53.855, | |
"eval_steps_per_second": 3.74, | |
"step": 20 | |
}, | |
{ | |
"epoch": 2.3333333333333335, | |
"grad_norm": 9.420624732971191, | |
"learning_rate": 3.9204545454545456e-05, | |
"loss": 0.7355, | |
"step": 21 | |
}, | |
{ | |
"epoch": 2.3333333333333335, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.828125, | |
"eval_runtime": 1.3234, | |
"eval_samples_per_second": 54.407, | |
"eval_steps_per_second": 3.778, | |
"step": 21 | |
}, | |
{ | |
"epoch": 2.4444444444444446, | |
"grad_norm": 4.088751316070557, | |
"learning_rate": 3.8636363636363636e-05, | |
"loss": 0.6403, | |
"step": 22 | |
}, | |
{ | |
"epoch": 2.4444444444444446, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.7942436933517456, | |
"eval_runtime": 1.336, | |
"eval_samples_per_second": 53.894, | |
"eval_steps_per_second": 3.743, | |
"step": 22 | |
}, | |
{ | |
"epoch": 2.5555555555555554, | |
"grad_norm": 8.925795555114746, | |
"learning_rate": 3.8068181818181816e-05, | |
"loss": 0.7234, | |
"step": 23 | |
}, | |
{ | |
"epoch": 2.5555555555555554, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.7515190839767456, | |
"eval_runtime": 1.3269, | |
"eval_samples_per_second": 54.261, | |
"eval_steps_per_second": 3.768, | |
"step": 23 | |
}, | |
{ | |
"epoch": 2.6666666666666665, | |
"grad_norm": 8.9390869140625, | |
"learning_rate": 3.7500000000000003e-05, | |
"loss": 0.769, | |
"step": 24 | |
}, | |
{ | |
"epoch": 2.6666666666666665, | |
"eval_accuracy": 0.5555555555555556, | |
"eval_loss": 0.7111545205116272, | |
"eval_runtime": 1.3044, | |
"eval_samples_per_second": 55.197, | |
"eval_steps_per_second": 3.833, | |
"step": 24 | |
}, | |
{ | |
"epoch": 2.7777777777777777, | |
"grad_norm": 6.278990268707275, | |
"learning_rate": 3.6931818181818184e-05, | |
"loss": 0.6623, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.7777777777777777, | |
"eval_accuracy": 0.5833333333333334, | |
"eval_loss": 0.6884629726409912, | |
"eval_runtime": 1.3259, | |
"eval_samples_per_second": 54.302, | |
"eval_steps_per_second": 3.771, | |
"step": 25 | |
}, | |
{ | |
"epoch": 2.888888888888889, | |
"grad_norm": 10.197561264038086, | |
"learning_rate": 3.6363636363636364e-05, | |
"loss": 0.7684, | |
"step": 26 | |
}, | |
{ | |
"epoch": 2.888888888888889, | |
"eval_accuracy": 0.625, | |
"eval_loss": 0.6922810673713684, | |
"eval_runtime": 1.3394, | |
"eval_samples_per_second": 53.757, | |
"eval_steps_per_second": 3.733, | |
"step": 26 | |
}, | |
{ | |
"epoch": 3.0, | |
"grad_norm": 8.900382995605469, | |
"learning_rate": 3.579545454545455e-05, | |
"loss": 0.7966, | |
"step": 27 | |
}, | |
{ | |
"epoch": 3.0, | |
"eval_accuracy": 0.625, | |
"eval_loss": 0.6924845576286316, | |
"eval_runtime": 1.3319, | |
"eval_samples_per_second": 54.06, | |
"eval_steps_per_second": 3.754, | |
"step": 27 | |
}, | |
{ | |
"epoch": 3.111111111111111, | |
"grad_norm": 7.561734199523926, | |
"learning_rate": 3.522727272727273e-05, | |
"loss": 0.7325, | |
"step": 28 | |
}, | |
{ | |
"epoch": 3.111111111111111, | |
"eval_accuracy": 0.625, | |
"eval_loss": 0.6963433027267456, | |
"eval_runtime": 1.3255, | |
"eval_samples_per_second": 54.32, | |
"eval_steps_per_second": 3.772, | |
"step": 28 | |
}, | |
{ | |
"epoch": 3.2222222222222223, | |
"grad_norm": 8.331040382385254, | |
"learning_rate": 3.465909090909091e-05, | |
"loss": 0.6668, | |
"step": 29 | |
}, | |
{ | |
"epoch": 3.2222222222222223, | |
"eval_accuracy": 0.6111111111111112, | |
"eval_loss": 0.6986897587776184, | |
"eval_runtime": 1.3264, | |
"eval_samples_per_second": 54.283, | |
"eval_steps_per_second": 3.77, | |
"step": 29 | |
}, | |
{ | |
"epoch": 3.3333333333333335, | |
"grad_norm": 7.695322036743164, | |
"learning_rate": 3.409090909090909e-05, | |
"loss": 0.816, | |
"step": 30 | |
}, | |
{ | |
"epoch": 3.3333333333333335, | |
"eval_accuracy": 0.5694444444444444, | |
"eval_loss": 0.7047797441482544, | |
"eval_runtime": 1.3273, | |
"eval_samples_per_second": 54.247, | |
"eval_steps_per_second": 3.767, | |
"step": 30 | |
}, | |
{ | |
"epoch": 3.4444444444444446, | |
"grad_norm": 3.6831297874450684, | |
"learning_rate": 3.352272727272727e-05, | |
"loss": 0.6271, | |
"step": 31 | |
}, | |
{ | |
"epoch": 3.4444444444444446, | |
"eval_accuracy": 0.5833333333333334, | |
"eval_loss": 0.7084689736366272, | |
"eval_runtime": 1.3374, | |
"eval_samples_per_second": 53.835, | |
"eval_steps_per_second": 3.739, | |
"step": 31 | |
}, | |
{ | |
"epoch": 3.5555555555555554, | |
"grad_norm": 6.039569854736328, | |
"learning_rate": 3.295454545454545e-05, | |
"loss": 0.6925, | |
"step": 32 | |
}, | |
{ | |
"epoch": 3.5555555555555554, | |
"eval_accuracy": 0.5277777777777778, | |
"eval_loss": 0.7225205898284912, | |
"eval_runtime": 1.315, | |
"eval_samples_per_second": 54.753, | |
"eval_steps_per_second": 3.802, | |
"step": 32 | |
}, | |
{ | |
"epoch": 3.6666666666666665, | |
"grad_norm": 4.002073764801025, | |
"learning_rate": 3.238636363636364e-05, | |
"loss": 0.6644, | |
"step": 33 | |
}, | |
{ | |
"epoch": 3.6666666666666665, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.7347276210784912, | |
"eval_runtime": 1.2988, | |
"eval_samples_per_second": 55.436, | |
"eval_steps_per_second": 3.85, | |
"step": 33 | |
}, | |
{ | |
"epoch": 3.7777777777777777, | |
"grad_norm": 15.78573989868164, | |
"learning_rate": 3.181818181818182e-05, | |
"loss": 0.7701, | |
"step": 34 | |
}, | |
{ | |
"epoch": 3.7777777777777777, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.7496337890625, | |
"eval_runtime": 1.3057, | |
"eval_samples_per_second": 55.141, | |
"eval_steps_per_second": 3.829, | |
"step": 34 | |
}, | |
{ | |
"epoch": 3.888888888888889, | |
"grad_norm": 7.7502360343933105, | |
"learning_rate": 3.125e-05, | |
"loss": 0.6733, | |
"step": 35 | |
}, | |
{ | |
"epoch": 3.888888888888889, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.7713758945465088, | |
"eval_runtime": 1.2988, | |
"eval_samples_per_second": 55.437, | |
"eval_steps_per_second": 3.85, | |
"step": 35 | |
}, | |
{ | |
"epoch": 4.0, | |
"grad_norm": 10.381406784057617, | |
"learning_rate": 3.068181818181818e-05, | |
"loss": 0.662, | |
"step": 36 | |
}, | |
{ | |
"epoch": 4.0, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.7901747226715088, | |
"eval_runtime": 1.3034, | |
"eval_samples_per_second": 55.241, | |
"eval_steps_per_second": 3.836, | |
"step": 36 | |
}, | |
{ | |
"epoch": 4.111111111111111, | |
"grad_norm": 8.049551010131836, | |
"learning_rate": 3.0113636363636365e-05, | |
"loss": 0.7122, | |
"step": 37 | |
}, | |
{ | |
"epoch": 4.111111111111111, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.7994791865348816, | |
"eval_runtime": 1.2941, | |
"eval_samples_per_second": 55.636, | |
"eval_steps_per_second": 3.864, | |
"step": 37 | |
}, | |
{ | |
"epoch": 4.222222222222222, | |
"grad_norm": 10.217594146728516, | |
"learning_rate": 2.954545454545455e-05, | |
"loss": 0.6739, | |
"step": 38 | |
}, | |
{ | |
"epoch": 4.222222222222222, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.8046061396598816, | |
"eval_runtime": 1.3084, | |
"eval_samples_per_second": 55.028, | |
"eval_steps_per_second": 3.821, | |
"step": 38 | |
}, | |
{ | |
"epoch": 4.333333333333333, | |
"grad_norm": 8.423747062683105, | |
"learning_rate": 2.8977272727272732e-05, | |
"loss": 0.7023, | |
"step": 39 | |
}, | |
{ | |
"epoch": 4.333333333333333, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.798828125, | |
"eval_runtime": 1.3438, | |
"eval_samples_per_second": 53.58, | |
"eval_steps_per_second": 3.721, | |
"step": 39 | |
}, | |
{ | |
"epoch": 4.444444444444445, | |
"grad_norm": 5.674834728240967, | |
"learning_rate": 2.8409090909090912e-05, | |
"loss": 0.632, | |
"step": 40 | |
}, | |
{ | |
"epoch": 4.444444444444445, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8052436113357544, | |
"eval_runtime": 1.3156, | |
"eval_samples_per_second": 54.729, | |
"eval_steps_per_second": 3.801, | |
"step": 40 | |
}, | |
{ | |
"epoch": 4.555555555555555, | |
"grad_norm": 10.919861793518066, | |
"learning_rate": 2.784090909090909e-05, | |
"loss": 0.6727, | |
"step": 41 | |
}, | |
{ | |
"epoch": 4.555555555555555, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.8005913496017456, | |
"eval_runtime": 1.3225, | |
"eval_samples_per_second": 54.44, | |
"eval_steps_per_second": 3.781, | |
"step": 41 | |
}, | |
{ | |
"epoch": 4.666666666666667, | |
"grad_norm": 5.758547306060791, | |
"learning_rate": 2.7272727272727273e-05, | |
"loss": 0.7238, | |
"step": 42 | |
}, | |
{ | |
"epoch": 4.666666666666667, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.7941080927848816, | |
"eval_runtime": 1.3042, | |
"eval_samples_per_second": 55.208, | |
"eval_steps_per_second": 3.834, | |
"step": 42 | |
}, | |
{ | |
"epoch": 4.777777777777778, | |
"grad_norm": 7.344580173492432, | |
"learning_rate": 2.6704545454545453e-05, | |
"loss": 0.6473, | |
"step": 43 | |
}, | |
{ | |
"epoch": 4.777777777777778, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.7832980751991272, | |
"eval_runtime": 1.324, | |
"eval_samples_per_second": 54.383, | |
"eval_steps_per_second": 3.777, | |
"step": 43 | |
}, | |
{ | |
"epoch": 4.888888888888889, | |
"grad_norm": 3.755070686340332, | |
"learning_rate": 2.6136363636363637e-05, | |
"loss": 0.6757, | |
"step": 44 | |
}, | |
{ | |
"epoch": 4.888888888888889, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.7928873896598816, | |
"eval_runtime": 1.3091, | |
"eval_samples_per_second": 54.999, | |
"eval_steps_per_second": 3.819, | |
"step": 44 | |
}, | |
{ | |
"epoch": 5.0, | |
"grad_norm": 16.001384735107422, | |
"learning_rate": 2.5568181818181817e-05, | |
"loss": 0.7647, | |
"step": 45 | |
}, | |
{ | |
"epoch": 5.0, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.7873671054840088, | |
"eval_runtime": 1.3179, | |
"eval_samples_per_second": 54.632, | |
"eval_steps_per_second": 3.794, | |
"step": 45 | |
}, | |
{ | |
"epoch": 5.111111111111111, | |
"grad_norm": 6.816504001617432, | |
"learning_rate": 2.5e-05, | |
"loss": 0.5865, | |
"step": 46 | |
}, | |
{ | |
"epoch": 5.111111111111111, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7898491621017456, | |
"eval_runtime": 1.3205, | |
"eval_samples_per_second": 54.526, | |
"eval_steps_per_second": 3.787, | |
"step": 46 | |
}, | |
{ | |
"epoch": 5.222222222222222, | |
"grad_norm": 5.1409759521484375, | |
"learning_rate": 2.4431818181818185e-05, | |
"loss": 0.6982, | |
"step": 47 | |
}, | |
{ | |
"epoch": 5.222222222222222, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.7936469316482544, | |
"eval_runtime": 1.3208, | |
"eval_samples_per_second": 54.514, | |
"eval_steps_per_second": 3.786, | |
"step": 47 | |
}, | |
{ | |
"epoch": 5.333333333333333, | |
"grad_norm": 8.150239944458008, | |
"learning_rate": 2.3863636363636365e-05, | |
"loss": 0.6918, | |
"step": 48 | |
}, | |
{ | |
"epoch": 5.333333333333333, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.7925347089767456, | |
"eval_runtime": 1.2679, | |
"eval_samples_per_second": 56.788, | |
"eval_steps_per_second": 3.944, | |
"step": 48 | |
}, | |
{ | |
"epoch": 5.444444444444445, | |
"grad_norm": 5.917128086090088, | |
"learning_rate": 2.3295454545454546e-05, | |
"loss": 0.7026, | |
"step": 49 | |
}, | |
{ | |
"epoch": 5.444444444444445, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.7916395664215088, | |
"eval_runtime": 1.3199, | |
"eval_samples_per_second": 54.548, | |
"eval_steps_per_second": 3.788, | |
"step": 49 | |
}, | |
{ | |
"epoch": 5.555555555555555, | |
"grad_norm": 7.280149459838867, | |
"learning_rate": 2.272727272727273e-05, | |
"loss": 0.7423, | |
"step": 50 | |
}, | |
{ | |
"epoch": 5.555555555555555, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7977702021598816, | |
"eval_runtime": 1.3187, | |
"eval_samples_per_second": 54.6, | |
"eval_steps_per_second": 3.792, | |
"step": 50 | |
}, | |
{ | |
"epoch": 5.666666666666667, | |
"grad_norm": 5.515018939971924, | |
"learning_rate": 2.215909090909091e-05, | |
"loss": 0.6506, | |
"step": 51 | |
}, | |
{ | |
"epoch": 5.666666666666667, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.7934299111366272, | |
"eval_runtime": 1.3177, | |
"eval_samples_per_second": 54.642, | |
"eval_steps_per_second": 3.795, | |
"step": 51 | |
}, | |
{ | |
"epoch": 5.777777777777778, | |
"grad_norm": 12.648579597473145, | |
"learning_rate": 2.1590909090909093e-05, | |
"loss": 0.6749, | |
"step": 52 | |
}, | |
{ | |
"epoch": 5.777777777777778, | |
"eval_accuracy": 0.3194444444444444, | |
"eval_loss": 0.791259765625, | |
"eval_runtime": 1.299, | |
"eval_samples_per_second": 55.429, | |
"eval_steps_per_second": 3.849, | |
"step": 52 | |
}, | |
{ | |
"epoch": 5.888888888888889, | |
"grad_norm": 15.123281478881836, | |
"learning_rate": 2.1022727272727274e-05, | |
"loss": 0.6599, | |
"step": 53 | |
}, | |
{ | |
"epoch": 5.888888888888889, | |
"eval_accuracy": 0.2638888888888889, | |
"eval_loss": 0.7919921875, | |
"eval_runtime": 1.3101, | |
"eval_samples_per_second": 54.958, | |
"eval_steps_per_second": 3.817, | |
"step": 53 | |
}, | |
{ | |
"epoch": 6.0, | |
"grad_norm": 8.299882888793945, | |
"learning_rate": 2.0454545454545457e-05, | |
"loss": 0.7627, | |
"step": 54 | |
}, | |
{ | |
"epoch": 6.0, | |
"eval_accuracy": 0.3194444444444444, | |
"eval_loss": 0.7933756709098816, | |
"eval_runtime": 1.2997, | |
"eval_samples_per_second": 55.397, | |
"eval_steps_per_second": 3.847, | |
"step": 54 | |
}, | |
{ | |
"epoch": 6.111111111111111, | |
"grad_norm": 5.253805160522461, | |
"learning_rate": 1.9886363636363638e-05, | |
"loss": 0.7062, | |
"step": 55 | |
}, | |
{ | |
"epoch": 6.111111111111111, | |
"eval_accuracy": 0.3194444444444444, | |
"eval_loss": 0.7950575351715088, | |
"eval_runtime": 1.3002, | |
"eval_samples_per_second": 55.374, | |
"eval_steps_per_second": 3.845, | |
"step": 55 | |
}, | |
{ | |
"epoch": 6.222222222222222, | |
"grad_norm": 3.113704204559326, | |
"learning_rate": 1.9318181818181818e-05, | |
"loss": 0.6252, | |
"step": 56 | |
}, | |
{ | |
"epoch": 6.222222222222222, | |
"eval_accuracy": 0.3055555555555556, | |
"eval_loss": 0.8138291835784912, | |
"eval_runtime": 1.2974, | |
"eval_samples_per_second": 55.497, | |
"eval_steps_per_second": 3.854, | |
"step": 56 | |
}, | |
{ | |
"epoch": 6.333333333333333, | |
"grad_norm": 7.795933723449707, | |
"learning_rate": 1.8750000000000002e-05, | |
"loss": 0.6671, | |
"step": 57 | |
}, | |
{ | |
"epoch": 6.333333333333333, | |
"eval_accuracy": 0.3194444444444444, | |
"eval_loss": 0.8019205927848816, | |
"eval_runtime": 1.2994, | |
"eval_samples_per_second": 55.409, | |
"eval_steps_per_second": 3.848, | |
"step": 57 | |
}, | |
{ | |
"epoch": 6.444444444444445, | |
"grad_norm": 3.6579177379608154, | |
"learning_rate": 1.8181818181818182e-05, | |
"loss": 0.6658, | |
"step": 58 | |
}, | |
{ | |
"epoch": 6.444444444444445, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.8158366084098816, | |
"eval_runtime": 1.3038, | |
"eval_samples_per_second": 55.225, | |
"eval_steps_per_second": 3.835, | |
"step": 58 | |
}, | |
{ | |
"epoch": 6.555555555555555, | |
"grad_norm": 4.053081035614014, | |
"learning_rate": 1.7613636363636366e-05, | |
"loss": 0.6398, | |
"step": 59 | |
}, | |
{ | |
"epoch": 6.555555555555555, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.8124186396598816, | |
"eval_runtime": 1.3371, | |
"eval_samples_per_second": 53.846, | |
"eval_steps_per_second": 3.739, | |
"step": 59 | |
}, | |
{ | |
"epoch": 6.666666666666667, | |
"grad_norm": 6.540220737457275, | |
"learning_rate": 1.7045454545454546e-05, | |
"loss": 0.6659, | |
"step": 60 | |
}, | |
{ | |
"epoch": 6.666666666666667, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.817138671875, | |
"eval_runtime": 1.3017, | |
"eval_samples_per_second": 55.312, | |
"eval_steps_per_second": 3.841, | |
"step": 60 | |
}, | |
{ | |
"epoch": 6.777777777777778, | |
"grad_norm": 7.515323638916016, | |
"learning_rate": 1.6477272727272726e-05, | |
"loss": 0.6599, | |
"step": 61 | |
}, | |
{ | |
"epoch": 6.777777777777778, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8284233808517456, | |
"eval_runtime": 1.2963, | |
"eval_samples_per_second": 55.545, | |
"eval_steps_per_second": 3.857, | |
"step": 61 | |
}, | |
{ | |
"epoch": 6.888888888888889, | |
"grad_norm": 3.0841293334960938, | |
"learning_rate": 1.590909090909091e-05, | |
"loss": 0.5672, | |
"step": 62 | |
}, | |
{ | |
"epoch": 6.888888888888889, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.8299153447151184, | |
"eval_runtime": 1.3042, | |
"eval_samples_per_second": 55.208, | |
"eval_steps_per_second": 3.834, | |
"step": 62 | |
}, | |
{ | |
"epoch": 7.0, | |
"grad_norm": 4.5201215744018555, | |
"learning_rate": 1.534090909090909e-05, | |
"loss": 0.7332, | |
"step": 63 | |
}, | |
{ | |
"epoch": 7.0, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.8300510048866272, | |
"eval_runtime": 1.3348, | |
"eval_samples_per_second": 53.942, | |
"eval_steps_per_second": 3.746, | |
"step": 63 | |
}, | |
{ | |
"epoch": 7.111111111111111, | |
"grad_norm": 4.400593280792236, | |
"learning_rate": 1.4772727272727274e-05, | |
"loss": 0.6534, | |
"step": 64 | |
}, | |
{ | |
"epoch": 7.111111111111111, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.8352186679840088, | |
"eval_runtime": 1.3222, | |
"eval_samples_per_second": 54.455, | |
"eval_steps_per_second": 3.782, | |
"step": 64 | |
}, | |
{ | |
"epoch": 7.222222222222222, | |
"grad_norm": 7.905196666717529, | |
"learning_rate": 1.4204545454545456e-05, | |
"loss": 0.721, | |
"step": 65 | |
}, | |
{ | |
"epoch": 7.222222222222222, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.8336724042892456, | |
"eval_runtime": 1.3106, | |
"eval_samples_per_second": 54.936, | |
"eval_steps_per_second": 3.815, | |
"step": 65 | |
}, | |
{ | |
"epoch": 7.333333333333333, | |
"grad_norm": 6.187906742095947, | |
"learning_rate": 1.3636363636363637e-05, | |
"loss": 0.7052, | |
"step": 66 | |
}, | |
{ | |
"epoch": 7.333333333333333, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.8466660976409912, | |
"eval_runtime": 1.3111, | |
"eval_samples_per_second": 54.917, | |
"eval_steps_per_second": 3.814, | |
"step": 66 | |
}, | |
{ | |
"epoch": 7.444444444444445, | |
"grad_norm": 13.40283203125, | |
"learning_rate": 1.3068181818181819e-05, | |
"loss": 0.6568, | |
"step": 67 | |
}, | |
{ | |
"epoch": 7.444444444444445, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.8384060263633728, | |
"eval_runtime": 1.3134, | |
"eval_samples_per_second": 54.818, | |
"eval_steps_per_second": 3.807, | |
"step": 67 | |
}, | |
{ | |
"epoch": 7.555555555555555, | |
"grad_norm": 9.783227920532227, | |
"learning_rate": 1.25e-05, | |
"loss": 0.6376, | |
"step": 68 | |
}, | |
{ | |
"epoch": 7.555555555555555, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.8467475175857544, | |
"eval_runtime": 1.3144, | |
"eval_samples_per_second": 54.776, | |
"eval_steps_per_second": 3.804, | |
"step": 68 | |
}, | |
{ | |
"epoch": 7.666666666666667, | |
"grad_norm": 7.485418319702148, | |
"learning_rate": 1.1931818181818183e-05, | |
"loss": 0.6411, | |
"step": 69 | |
}, | |
{ | |
"epoch": 7.666666666666667, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8541395664215088, | |
"eval_runtime": 1.3386, | |
"eval_samples_per_second": 53.786, | |
"eval_steps_per_second": 3.735, | |
"step": 69 | |
}, | |
{ | |
"epoch": 7.777777777777778, | |
"grad_norm": 6.181356430053711, | |
"learning_rate": 1.1363636363636365e-05, | |
"loss": 0.6383, | |
"step": 70 | |
}, | |
{ | |
"epoch": 7.777777777777778, | |
"eval_accuracy": 0.4583333333333333, | |
"eval_loss": 0.8521457314491272, | |
"eval_runtime": 1.3191, | |
"eval_samples_per_second": 54.581, | |
"eval_steps_per_second": 3.79, | |
"step": 70 | |
}, | |
{ | |
"epoch": 7.888888888888889, | |
"grad_norm": 5.323921203613281, | |
"learning_rate": 1.0795454545454547e-05, | |
"loss": 0.6682, | |
"step": 71 | |
}, | |
{ | |
"epoch": 7.888888888888889, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8590766191482544, | |
"eval_runtime": 1.3329, | |
"eval_samples_per_second": 54.02, | |
"eval_steps_per_second": 3.751, | |
"step": 71 | |
}, | |
{ | |
"epoch": 8.0, | |
"grad_norm": 7.953044891357422, | |
"learning_rate": 1.0227272727272729e-05, | |
"loss": 0.6628, | |
"step": 72 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.870849609375, | |
"eval_runtime": 1.3445, | |
"eval_samples_per_second": 53.55, | |
"eval_steps_per_second": 3.719, | |
"step": 72 | |
}, | |
{ | |
"epoch": 8.11111111111111, | |
"grad_norm": 7.083126068115234, | |
"learning_rate": 9.659090909090909e-06, | |
"loss": 0.5482, | |
"step": 73 | |
}, | |
{ | |
"epoch": 8.11111111111111, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.8663601279258728, | |
"eval_runtime": 1.3094, | |
"eval_samples_per_second": 54.989, | |
"eval_steps_per_second": 3.819, | |
"step": 73 | |
}, | |
{ | |
"epoch": 8.222222222222221, | |
"grad_norm": 3.150639533996582, | |
"learning_rate": 9.090909090909091e-06, | |
"loss": 0.5797, | |
"step": 74 | |
}, | |
{ | |
"epoch": 8.222222222222221, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.8770887851715088, | |
"eval_runtime": 1.3221, | |
"eval_samples_per_second": 54.457, | |
"eval_steps_per_second": 3.782, | |
"step": 74 | |
}, | |
{ | |
"epoch": 8.333333333333334, | |
"grad_norm": 5.521983623504639, | |
"learning_rate": 8.522727272727273e-06, | |
"loss": 0.6201, | |
"step": 75 | |
}, | |
{ | |
"epoch": 8.333333333333334, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.8856879472732544, | |
"eval_runtime": 1.3509, | |
"eval_samples_per_second": 53.297, | |
"eval_steps_per_second": 3.701, | |
"step": 75 | |
}, | |
{ | |
"epoch": 8.444444444444445, | |
"grad_norm": 14.707962036132812, | |
"learning_rate": 7.954545454545455e-06, | |
"loss": 0.6884, | |
"step": 76 | |
}, | |
{ | |
"epoch": 8.444444444444445, | |
"eval_accuracy": 0.4444444444444444, | |
"eval_loss": 0.884765625, | |
"eval_runtime": 1.3114, | |
"eval_samples_per_second": 54.902, | |
"eval_steps_per_second": 3.813, | |
"step": 76 | |
}, | |
{ | |
"epoch": 8.555555555555555, | |
"grad_norm": 4.804405212402344, | |
"learning_rate": 7.386363636363637e-06, | |
"loss": 0.5927, | |
"step": 77 | |
}, | |
{ | |
"epoch": 8.555555555555555, | |
"eval_accuracy": 0.4166666666666667, | |
"eval_loss": 0.8981662392616272, | |
"eval_runtime": 1.3552, | |
"eval_samples_per_second": 53.129, | |
"eval_steps_per_second": 3.689, | |
"step": 77 | |
}, | |
{ | |
"epoch": 8.666666666666666, | |
"grad_norm": 6.568211555480957, | |
"learning_rate": 6.818181818181818e-06, | |
"loss": 0.6602, | |
"step": 78 | |
}, | |
{ | |
"epoch": 8.666666666666666, | |
"eval_accuracy": 0.4305555555555556, | |
"eval_loss": 0.9038899540901184, | |
"eval_runtime": 1.3273, | |
"eval_samples_per_second": 54.247, | |
"eval_steps_per_second": 3.767, | |
"step": 78 | |
}, | |
{ | |
"epoch": 8.777777777777779, | |
"grad_norm": 17.07306671142578, | |
"learning_rate": 6.25e-06, | |
"loss": 0.6417, | |
"step": 79 | |
}, | |
{ | |
"epoch": 8.777777777777779, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.9076063632965088, | |
"eval_runtime": 1.3502, | |
"eval_samples_per_second": 53.326, | |
"eval_steps_per_second": 3.703, | |
"step": 79 | |
}, | |
{ | |
"epoch": 8.88888888888889, | |
"grad_norm": 9.68088436126709, | |
"learning_rate": 5.681818181818182e-06, | |
"loss": 0.5976, | |
"step": 80 | |
}, | |
{ | |
"epoch": 8.88888888888889, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.9077419638633728, | |
"eval_runtime": 1.3561, | |
"eval_samples_per_second": 53.092, | |
"eval_steps_per_second": 3.687, | |
"step": 80 | |
}, | |
{ | |
"epoch": 9.0, | |
"grad_norm": 7.580165386199951, | |
"learning_rate": 5.113636363636364e-06, | |
"loss": 0.6813, | |
"step": 81 | |
}, | |
{ | |
"epoch": 9.0, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.9115126132965088, | |
"eval_runtime": 1.3425, | |
"eval_samples_per_second": 53.63, | |
"eval_steps_per_second": 3.724, | |
"step": 81 | |
}, | |
{ | |
"epoch": 9.11111111111111, | |
"grad_norm": 7.186085224151611, | |
"learning_rate": 4.5454545454545455e-06, | |
"loss": 0.6841, | |
"step": 82 | |
}, | |
{ | |
"epoch": 9.11111111111111, | |
"eval_accuracy": 0.375, | |
"eval_loss": 0.9128689169883728, | |
"eval_runtime": 1.2996, | |
"eval_samples_per_second": 55.4, | |
"eval_steps_per_second": 3.847, | |
"step": 82 | |
}, | |
{ | |
"epoch": 9.222222222222221, | |
"grad_norm": 3.475069284439087, | |
"learning_rate": 3.9772727272727275e-06, | |
"loss": 0.5897, | |
"step": 83 | |
}, | |
{ | |
"epoch": 9.222222222222221, | |
"eval_accuracy": 0.4027777777777778, | |
"eval_loss": 0.9119194746017456, | |
"eval_runtime": 1.3002, | |
"eval_samples_per_second": 55.375, | |
"eval_steps_per_second": 3.845, | |
"step": 83 | |
}, | |
{ | |
"epoch": 9.333333333333334, | |
"grad_norm": 9.563972473144531, | |
"learning_rate": 3.409090909090909e-06, | |
"loss": 0.6398, | |
"step": 84 | |
}, | |
{ | |
"epoch": 9.333333333333334, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.9182942509651184, | |
"eval_runtime": 1.301, | |
"eval_samples_per_second": 55.343, | |
"eval_steps_per_second": 3.843, | |
"step": 84 | |
}, | |
{ | |
"epoch": 9.444444444444445, | |
"grad_norm": 3.5905749797821045, | |
"learning_rate": 2.840909090909091e-06, | |
"loss": 0.642, | |
"step": 85 | |
}, | |
{ | |
"epoch": 9.444444444444445, | |
"eval_accuracy": 0.3472222222222222, | |
"eval_loss": 0.9127333164215088, | |
"eval_runtime": 1.303, | |
"eval_samples_per_second": 55.258, | |
"eval_steps_per_second": 3.837, | |
"step": 85 | |
}, | |
{ | |
"epoch": 9.555555555555555, | |
"grad_norm": 9.188106536865234, | |
"learning_rate": 2.2727272727272728e-06, | |
"loss": 0.7411, | |
"step": 86 | |
}, | |
{ | |
"epoch": 9.555555555555555, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.91796875, | |
"eval_runtime": 1.3079, | |
"eval_samples_per_second": 55.05, | |
"eval_steps_per_second": 3.823, | |
"step": 86 | |
}, | |
{ | |
"epoch": 9.666666666666666, | |
"grad_norm": 11.32044506072998, | |
"learning_rate": 1.7045454545454546e-06, | |
"loss": 0.6304, | |
"step": 87 | |
}, | |
{ | |
"epoch": 9.666666666666666, | |
"eval_accuracy": 0.3333333333333333, | |
"eval_loss": 0.9201117753982544, | |
"eval_runtime": 1.3167, | |
"eval_samples_per_second": 54.683, | |
"eval_steps_per_second": 3.797, | |
"step": 87 | |
}, | |
{ | |
"epoch": 9.777777777777779, | |
"grad_norm": 8.788740158081055, | |
"learning_rate": 1.1363636363636364e-06, | |
"loss": 0.6226, | |
"step": 88 | |
}, | |
{ | |
"epoch": 9.777777777777779, | |
"eval_accuracy": 0.3611111111111111, | |
"eval_loss": 0.916748046875, | |
"eval_runtime": 1.2517, | |
"eval_samples_per_second": 57.523, | |
"eval_steps_per_second": 3.995, | |
"step": 88 | |
}, | |
{ | |
"epoch": 9.88888888888889, | |
"grad_norm": 10.416966438293457, | |
"learning_rate": 5.681818181818182e-07, | |
"loss": 0.7727, | |
"step": 89 | |
}, | |
{ | |
"epoch": 9.88888888888889, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.9222276210784912, | |
"eval_runtime": 1.3026, | |
"eval_samples_per_second": 55.276, | |
"eval_steps_per_second": 3.839, | |
"step": 89 | |
}, | |
{ | |
"epoch": 10.0, | |
"grad_norm": 9.8905611038208, | |
"learning_rate": 0.0, | |
"loss": 0.6162, | |
"step": 90 | |
}, | |
{ | |
"epoch": 10.0, | |
"eval_accuracy": 0.3888888888888889, | |
"eval_loss": 0.9153916835784912, | |
"eval_runtime": 1.3045, | |
"eval_samples_per_second": 55.192, | |
"eval_steps_per_second": 3.833, | |
"step": 90 | |
}, | |
{ | |
"epoch": 10.0, | |
"step": 90, | |
"total_flos": 4413748701822976.0, | |
"train_loss": 0.6887566460503473, | |
"train_runtime": 297.7228, | |
"train_samples_per_second": 9.539, | |
"train_steps_per_second": 0.302 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 90, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 500, | |
"total_flos": 4413748701822976.0, | |
"train_batch_size": 4, | |
"trial_name": null, | |
"trial_params": null | |
} | |