{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.6,
  "eval_steps": 1,
  "global_step": 120,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 9.880268096923828,
      "learning_rate": 2.5e-05,
      "loss": 0.8184,
      "step": 1
    },
    {
      "epoch": 0.08,
      "eval_accuracy": 0.484,
      "eval_loss": 0.794564425945282,
      "eval_runtime": 2.0232,
      "eval_samples_per_second": 123.566,
      "eval_steps_per_second": 3.46,
      "step": 1
    },
    {
      "epoch": 0.16,
      "grad_norm": 13.567129135131836,
      "learning_rate": 5e-05,
      "loss": 0.8494,
      "step": 2
    },
    {
      "epoch": 0.16,
      "eval_accuracy": 0.484,
      "eval_loss": 0.7883964776992798,
      "eval_runtime": 2.0183,
      "eval_samples_per_second": 123.869,
      "eval_steps_per_second": 3.468,
      "step": 2
    },
    {
      "epoch": 0.24,
      "grad_norm": 15.441740989685059,
      "learning_rate": 4.957627118644068e-05,
      "loss": 0.8586,
      "step": 3
    },
    {
      "epoch": 0.24,
      "eval_accuracy": 0.484,
      "eval_loss": 0.763421893119812,
      "eval_runtime": 2.0194,
      "eval_samples_per_second": 123.8,
      "eval_steps_per_second": 3.466,
      "step": 3
    },
    {
      "epoch": 0.32,
      "grad_norm": 7.4458136558532715,
      "learning_rate": 4.915254237288136e-05,
      "loss": 0.7138,
      "step": 4
    },
    {
      "epoch": 0.32,
      "eval_accuracy": 0.484,
      "eval_loss": 0.7402032017707825,
      "eval_runtime": 2.0299,
      "eval_samples_per_second": 123.158,
      "eval_steps_per_second": 3.448,
      "step": 4
    },
    {
      "epoch": 0.4,
      "grad_norm": 8.256447792053223,
      "learning_rate": 4.8728813559322034e-05,
      "loss": 0.718,
      "step": 5
    },
    {
      "epoch": 0.4,
      "eval_accuracy": 0.496,
      "eval_loss": 0.71826171875,
      "eval_runtime": 2.0247,
      "eval_samples_per_second": 123.473,
      "eval_steps_per_second": 3.457,
      "step": 5
    },
    {
      "epoch": 0.48,
      "grad_norm": 9.767592430114746,
      "learning_rate": 4.8305084745762714e-05,
      "loss": 0.754,
      "step": 6
    },
    {
      "epoch": 0.48,
      "eval_accuracy": 0.504,
      "eval_loss": 0.6996328234672546,
      "eval_runtime": 2.0186,
      "eval_samples_per_second": 123.847,
      "eval_steps_per_second": 3.468,
      "step": 6
    },
    {
      "epoch": 0.56,
      "grad_norm": 5.398552417755127,
      "learning_rate": 4.788135593220339e-05,
      "loss": 0.712,
      "step": 7
    },
    {
      "epoch": 0.56,
      "eval_accuracy": 0.528,
      "eval_loss": 0.689703106880188,
      "eval_runtime": 2.0279,
      "eval_samples_per_second": 123.282,
      "eval_steps_per_second": 3.452,
      "step": 7
    },
    {
      "epoch": 0.64,
      "grad_norm": 3.870819330215454,
      "learning_rate": 4.745762711864407e-05,
      "loss": 0.7295,
      "step": 8
    },
    {
      "epoch": 0.64,
      "eval_accuracy": 0.492,
      "eval_loss": 0.6884413957595825,
      "eval_runtime": 2.03,
      "eval_samples_per_second": 123.154,
      "eval_steps_per_second": 3.448,
      "step": 8
    },
    {
      "epoch": 0.72,
      "grad_norm": 5.142106533050537,
      "learning_rate": 4.703389830508475e-05,
      "loss": 0.7051,
      "step": 9
    },
    {
      "epoch": 0.72,
      "eval_accuracy": 0.504,
      "eval_loss": 0.6892617344856262,
      "eval_runtime": 1.9705,
      "eval_samples_per_second": 126.874,
      "eval_steps_per_second": 3.552,
      "step": 9
    },
    {
      "epoch": 0.8,
      "grad_norm": 5.520991325378418,
      "learning_rate": 4.6610169491525425e-05,
      "loss": 0.8498,
      "step": 10
    },
    {
      "epoch": 0.8,
      "eval_accuracy": 0.492,
      "eval_loss": 0.6888476610183716,
      "eval_runtime": 2.0284,
      "eval_samples_per_second": 123.25,
      "eval_steps_per_second": 3.451,
      "step": 10
    },
    {
      "epoch": 0.88,
      "grad_norm": 4.853059768676758,
      "learning_rate": 4.6186440677966104e-05,
      "loss": 0.7578,
      "step": 11
    },
    {
      "epoch": 0.88,
      "eval_accuracy": 0.468,
      "eval_loss": 0.6881210803985596,
      "eval_runtime": 2.0199,
      "eval_samples_per_second": 123.766,
      "eval_steps_per_second": 3.465,
      "step": 11
    },
    {
      "epoch": 0.96,
      "grad_norm": 6.042579650878906,
      "learning_rate": 4.5762711864406784e-05,
      "loss": 0.7002,
      "step": 12
    },
    {
      "epoch": 0.96,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6887617111206055,
      "eval_runtime": 2.0231,
      "eval_samples_per_second": 123.57,
      "eval_steps_per_second": 3.46,
      "step": 12
    },
    {
      "epoch": 1.04,
      "grad_norm": 2.7558984756469727,
      "learning_rate": 4.533898305084746e-05,
      "loss": 0.6943,
      "step": 13
    },
    {
      "epoch": 1.04,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6887617111206055,
      "eval_runtime": 2.0283,
      "eval_samples_per_second": 123.257,
      "eval_steps_per_second": 3.451,
      "step": 13
    },
    {
      "epoch": 1.12,
      "grad_norm": 4.205049991607666,
      "learning_rate": 4.491525423728814e-05,
      "loss": 0.7401,
      "step": 14
    },
    {
      "epoch": 1.12,
      "eval_accuracy": 0.532,
      "eval_loss": 0.6896406412124634,
      "eval_runtime": 2.0318,
      "eval_samples_per_second": 123.045,
      "eval_steps_per_second": 3.445,
      "step": 14
    },
    {
      "epoch": 1.2,
      "grad_norm": 2.5496792793273926,
      "learning_rate": 4.4491525423728816e-05,
      "loss": 0.73,
      "step": 15
    },
    {
      "epoch": 1.2,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6907109618186951,
      "eval_runtime": 2.0241,
      "eval_samples_per_second": 123.512,
      "eval_steps_per_second": 3.458,
      "step": 15
    },
    {
      "epoch": 1.28,
      "grad_norm": 8.400224685668945,
      "learning_rate": 4.4067796610169495e-05,
      "loss": 0.6979,
      "step": 16
    },
    {
      "epoch": 1.28,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6910077929496765,
      "eval_runtime": 2.0229,
      "eval_samples_per_second": 123.583,
      "eval_steps_per_second": 3.46,
      "step": 16
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 2.9197959899902344,
      "learning_rate": 4.3644067796610175e-05,
      "loss": 0.7125,
      "step": 17
    },
    {
      "epoch": 1.3599999999999999,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6910234093666077,
      "eval_runtime": 2.0278,
      "eval_samples_per_second": 123.287,
      "eval_steps_per_second": 3.452,
      "step": 17
    },
    {
      "epoch": 1.44,
      "grad_norm": 2.210724115371704,
      "learning_rate": 4.3220338983050854e-05,
      "loss": 0.7068,
      "step": 18
    },
    {
      "epoch": 1.44,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6906171441078186,
      "eval_runtime": 2.0216,
      "eval_samples_per_second": 123.667,
      "eval_steps_per_second": 3.463,
      "step": 18
    },
    {
      "epoch": 1.52,
      "grad_norm": 9.887163162231445,
      "learning_rate": 4.279661016949153e-05,
      "loss": 0.7026,
      "step": 19
    },
    {
      "epoch": 1.52,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6904453039169312,
      "eval_runtime": 2.0246,
      "eval_samples_per_second": 123.482,
      "eval_steps_per_second": 3.458,
      "step": 19
    },
    {
      "epoch": 1.6,
      "grad_norm": 1.4493184089660645,
      "learning_rate": 4.2372881355932206e-05,
      "loss": 0.7118,
      "step": 20
    },
    {
      "epoch": 1.6,
      "eval_accuracy": 0.52,
      "eval_loss": 0.691031277179718,
      "eval_runtime": 2.0269,
      "eval_samples_per_second": 123.339,
      "eval_steps_per_second": 3.453,
      "step": 20
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 2.368281364440918,
      "learning_rate": 4.1949152542372886e-05,
      "loss": 0.7186,
      "step": 21
    },
    {
      "epoch": 1.6800000000000002,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6909297108650208,
      "eval_runtime": 2.021,
      "eval_samples_per_second": 123.701,
      "eval_steps_per_second": 3.464,
      "step": 21
    },
    {
      "epoch": 1.76,
      "grad_norm": 1.8369947671890259,
      "learning_rate": 4.152542372881356e-05,
      "loss": 0.7003,
      "step": 22
    },
    {
      "epoch": 1.76,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6904531121253967,
      "eval_runtime": 2.0203,
      "eval_samples_per_second": 123.741,
      "eval_steps_per_second": 3.465,
      "step": 22
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 3.725130796432495,
      "learning_rate": 4.110169491525424e-05,
      "loss": 0.6743,
      "step": 23
    },
    {
      "epoch": 1.8399999999999999,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6904218792915344,
      "eval_runtime": 2.0289,
      "eval_samples_per_second": 123.217,
      "eval_steps_per_second": 3.45,
      "step": 23
    },
    {
      "epoch": 1.92,
      "grad_norm": 10.2611665725708,
      "learning_rate": 4.067796610169492e-05,
      "loss": 0.7153,
      "step": 24
    },
    {
      "epoch": 1.92,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6903828382492065,
      "eval_runtime": 2.0289,
      "eval_samples_per_second": 123.222,
      "eval_steps_per_second": 3.45,
      "step": 24
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.8820903301239014,
      "learning_rate": 4.025423728813559e-05,
      "loss": 0.7439,
      "step": 25
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6901171803474426,
      "eval_runtime": 2.0222,
      "eval_samples_per_second": 123.628,
      "eval_steps_per_second": 3.462,
      "step": 25
    },
    {
      "epoch": 2.08,
      "grad_norm": 3.839857816696167,
      "learning_rate": 3.983050847457627e-05,
      "loss": 0.7569,
      "step": 26
    },
    {
      "epoch": 2.08,
      "eval_accuracy": 0.528,
      "eval_loss": 0.690011739730835,
      "eval_runtime": 2.0192,
      "eval_samples_per_second": 123.81,
      "eval_steps_per_second": 3.467,
      "step": 26
    },
    {
      "epoch": 2.16,
      "grad_norm": 2.944941520690918,
      "learning_rate": 3.940677966101695e-05,
      "loss": 0.7443,
      "step": 27
    },
    {
      "epoch": 2.16,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6898750066757202,
      "eval_runtime": 2.0288,
      "eval_samples_per_second": 123.224,
      "eval_steps_per_second": 3.45,
      "step": 27
    },
    {
      "epoch": 2.24,
      "grad_norm": 2.2883877754211426,
      "learning_rate": 3.898305084745763e-05,
      "loss": 0.7171,
      "step": 28
    },
    {
      "epoch": 2.24,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6903359293937683,
      "eval_runtime": 2.0189,
      "eval_samples_per_second": 123.832,
      "eval_steps_per_second": 3.467,
      "step": 28
    },
    {
      "epoch": 2.32,
      "grad_norm": 5.3827691078186035,
      "learning_rate": 3.855932203389831e-05,
      "loss": 0.755,
      "step": 29
    },
    {
      "epoch": 2.32,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6900156140327454,
      "eval_runtime": 2.0291,
      "eval_samples_per_second": 123.209,
      "eval_steps_per_second": 3.45,
      "step": 29
    },
    {
      "epoch": 2.4,
      "grad_norm": 2.5618762969970703,
      "learning_rate": 3.813559322033898e-05,
      "loss": 0.6827,
      "step": 30
    },
    {
      "epoch": 2.4,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6895468831062317,
      "eval_runtime": 2.0196,
      "eval_samples_per_second": 123.787,
      "eval_steps_per_second": 3.466,
      "step": 30
    },
    {
      "epoch": 2.48,
      "grad_norm": 3.690232276916504,
      "learning_rate": 3.771186440677966e-05,
      "loss": 0.7392,
      "step": 31
    },
    {
      "epoch": 2.48,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6898476481437683,
      "eval_runtime": 2.0266,
      "eval_samples_per_second": 123.361,
      "eval_steps_per_second": 3.454,
      "step": 31
    },
    {
      "epoch": 2.56,
      "grad_norm": 1.4257826805114746,
      "learning_rate": 3.728813559322034e-05,
      "loss": 0.7212,
      "step": 32
    },
    {
      "epoch": 2.56,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6890624761581421,
      "eval_runtime": 2.0204,
      "eval_samples_per_second": 123.738,
      "eval_steps_per_second": 3.465,
      "step": 32
    },
    {
      "epoch": 2.64,
      "grad_norm": 1.9220340251922607,
      "learning_rate": 3.686440677966102e-05,
      "loss": 0.7333,
      "step": 33
    },
    {
      "epoch": 2.64,
      "eval_accuracy": 0.528,
      "eval_loss": 0.689382791519165,
      "eval_runtime": 2.0254,
      "eval_samples_per_second": 123.43,
      "eval_steps_per_second": 3.456,
      "step": 33
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 7.450087070465088,
      "learning_rate": 3.644067796610169e-05,
      "loss": 0.7509,
      "step": 34
    },
    {
      "epoch": 2.7199999999999998,
      "eval_accuracy": 0.528,
      "eval_loss": 0.689257800579071,
      "eval_runtime": 2.0291,
      "eval_samples_per_second": 123.206,
      "eval_steps_per_second": 3.45,
      "step": 34
    },
    {
      "epoch": 2.8,
      "grad_norm": 8.32366943359375,
      "learning_rate": 3.601694915254237e-05,
      "loss": 0.7312,
      "step": 35
    },
    {
      "epoch": 2.8,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6896445155143738,
      "eval_runtime": 2.0268,
      "eval_samples_per_second": 123.345,
      "eval_steps_per_second": 3.454,
      "step": 35
    },
    {
      "epoch": 2.88,
      "grad_norm": 8.8251314163208,
      "learning_rate": 3.559322033898305e-05,
      "loss": 0.6624,
      "step": 36
    },
    {
      "epoch": 2.88,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6905156373977661,
      "eval_runtime": 2.0218,
      "eval_samples_per_second": 123.651,
      "eval_steps_per_second": 3.462,
      "step": 36
    },
    {
      "epoch": 2.96,
      "grad_norm": 4.100712299346924,
      "learning_rate": 3.516949152542373e-05,
      "loss": 0.7233,
      "step": 37
    },
    {
      "epoch": 2.96,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6898671984672546,
      "eval_runtime": 2.0212,
      "eval_samples_per_second": 123.69,
      "eval_steps_per_second": 3.463,
      "step": 37
    },
    {
      "epoch": 3.04,
      "grad_norm": 2.2732489109039307,
      "learning_rate": 3.474576271186441e-05,
      "loss": 0.6459,
      "step": 38
    },
    {
      "epoch": 3.04,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6897343993186951,
      "eval_runtime": 2.0299,
      "eval_samples_per_second": 123.161,
      "eval_steps_per_second": 3.449,
      "step": 38
    },
    {
      "epoch": 3.12,
      "grad_norm": 4.981455326080322,
      "learning_rate": 3.432203389830508e-05,
      "loss": 0.7077,
      "step": 39
    },
    {
      "epoch": 3.12,
      "eval_accuracy": 0.532,
      "eval_loss": 0.6902265548706055,
      "eval_runtime": 2.0257,
      "eval_samples_per_second": 123.414,
      "eval_steps_per_second": 3.456,
      "step": 39
    },
    {
      "epoch": 3.2,
      "grad_norm": 4.601206302642822,
      "learning_rate": 3.389830508474576e-05,
      "loss": 0.7111,
      "step": 40
    },
    {
      "epoch": 3.2,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6908008456230164,
      "eval_runtime": 2.0209,
      "eval_samples_per_second": 123.706,
      "eval_steps_per_second": 3.464,
      "step": 40
    },
    {
      "epoch": 3.2800000000000002,
      "grad_norm": 1.5413074493408203,
      "learning_rate": 3.347457627118644e-05,
      "loss": 0.6503,
      "step": 41
    },
    {
      "epoch": 3.2800000000000002,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6919609308242798,
      "eval_runtime": 2.0246,
      "eval_samples_per_second": 123.478,
      "eval_steps_per_second": 3.457,
      "step": 41
    },
    {
      "epoch": 3.36,
      "grad_norm": 4.281999111175537,
      "learning_rate": 3.305084745762712e-05,
      "loss": 0.7574,
      "step": 42
    },
    {
      "epoch": 3.36,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6926249861717224,
      "eval_runtime": 2.0304,
      "eval_samples_per_second": 123.13,
      "eval_steps_per_second": 3.448,
      "step": 42
    },
    {
      "epoch": 3.44,
      "grad_norm": 5.510625839233398,
      "learning_rate": 3.26271186440678e-05,
      "loss": 0.6575,
      "step": 43
    },
    {
      "epoch": 3.44,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6930938363075256,
      "eval_runtime": 2.0293,
      "eval_samples_per_second": 123.196,
      "eval_steps_per_second": 3.449,
      "step": 43
    },
    {
      "epoch": 3.52,
      "grad_norm": 4.354837417602539,
      "learning_rate": 3.2203389830508473e-05,
      "loss": 0.7324,
      "step": 44
    },
    {
      "epoch": 3.52,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6937890648841858,
      "eval_runtime": 2.0277,
      "eval_samples_per_second": 123.292,
      "eval_steps_per_second": 3.452,
      "step": 44
    },
    {
      "epoch": 3.6,
      "grad_norm": 5.1033148765563965,
      "learning_rate": 3.177966101694915e-05,
      "loss": 0.6588,
      "step": 45
    },
    {
      "epoch": 3.6,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6935625076293945,
      "eval_runtime": 2.0277,
      "eval_samples_per_second": 123.291,
      "eval_steps_per_second": 3.452,
      "step": 45
    },
    {
      "epoch": 3.68,
      "grad_norm": 7.331698894500732,
      "learning_rate": 3.135593220338983e-05,
      "loss": 0.8145,
      "step": 46
    },
    {
      "epoch": 3.68,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6932734251022339,
      "eval_runtime": 2.0207,
      "eval_samples_per_second": 123.72,
      "eval_steps_per_second": 3.464,
      "step": 46
    },
    {
      "epoch": 3.76,
      "grad_norm": 7.018001079559326,
      "learning_rate": 3.093220338983051e-05,
      "loss": 0.7016,
      "step": 47
    },
    {
      "epoch": 3.76,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6925547122955322,
      "eval_runtime": 2.0244,
      "eval_samples_per_second": 123.49,
      "eval_steps_per_second": 3.458,
      "step": 47
    },
    {
      "epoch": 3.84,
      "grad_norm": 3.4472386837005615,
      "learning_rate": 3.050847457627119e-05,
      "loss": 0.6726,
      "step": 48
    },
    {
      "epoch": 3.84,
      "eval_accuracy": 0.524,
      "eval_loss": 0.692257821559906,
      "eval_runtime": 2.0236,
      "eval_samples_per_second": 123.539,
      "eval_steps_per_second": 3.459,
      "step": 48
    },
    {
      "epoch": 3.92,
      "grad_norm": 7.608280181884766,
      "learning_rate": 3.0084745762711864e-05,
      "loss": 0.7211,
      "step": 49
    },
    {
      "epoch": 3.92,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6909765601158142,
      "eval_runtime": 2.0313,
      "eval_samples_per_second": 123.075,
      "eval_steps_per_second": 3.446,
      "step": 49
    },
    {
      "epoch": 4.0,
      "grad_norm": 2.3237762451171875,
      "learning_rate": 2.9661016949152544e-05,
      "loss": 0.7066,
      "step": 50
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6903671622276306,
      "eval_runtime": 2.025,
      "eval_samples_per_second": 123.457,
      "eval_steps_per_second": 3.457,
      "step": 50
    },
    {
      "epoch": 4.08,
      "grad_norm": 3.2661080360412598,
      "learning_rate": 2.9237288135593223e-05,
      "loss": 0.7416,
      "step": 51
    },
    {
      "epoch": 4.08,
      "eval_accuracy": 0.52,
      "eval_loss": 0.690011739730835,
      "eval_runtime": 2.0217,
      "eval_samples_per_second": 123.657,
      "eval_steps_per_second": 3.462,
      "step": 51
    },
    {
      "epoch": 4.16,
      "grad_norm": 6.197864532470703,
      "learning_rate": 2.88135593220339e-05,
      "loss": 0.6967,
      "step": 52
    },
    {
      "epoch": 4.16,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6896718740463257,
      "eval_runtime": 2.0228,
      "eval_samples_per_second": 123.592,
      "eval_steps_per_second": 3.461,
      "step": 52
    },
    {
      "epoch": 4.24,
      "grad_norm": 12.455964088439941,
      "learning_rate": 2.838983050847458e-05,
      "loss": 0.6901,
      "step": 53
    },
    {
      "epoch": 4.24,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6897343993186951,
      "eval_runtime": 2.0305,
      "eval_samples_per_second": 123.123,
      "eval_steps_per_second": 3.447,
      "step": 53
    },
    {
      "epoch": 4.32,
      "grad_norm": 4.135785102844238,
      "learning_rate": 2.7966101694915255e-05,
      "loss": 0.7262,
      "step": 54
    },
    {
      "epoch": 4.32,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6892734169960022,
      "eval_runtime": 2.022,
      "eval_samples_per_second": 123.642,
      "eval_steps_per_second": 3.462,
      "step": 54
    },
    {
      "epoch": 4.4,
      "grad_norm": 2.7662761211395264,
      "learning_rate": 2.754237288135593e-05,
      "loss": 0.7369,
      "step": 55
    },
    {
      "epoch": 4.4,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6887929439544678,
      "eval_runtime": 2.0254,
      "eval_samples_per_second": 123.434,
      "eval_steps_per_second": 3.456,
      "step": 55
    },
    {
      "epoch": 4.48,
      "grad_norm": 7.013899326324463,
      "learning_rate": 2.711864406779661e-05,
      "loss": 0.754,
      "step": 56
    },
    {
      "epoch": 4.48,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6891640424728394,
      "eval_runtime": 2.0296,
      "eval_samples_per_second": 123.174,
      "eval_steps_per_second": 3.449,
      "step": 56
    },
    {
      "epoch": 4.5600000000000005,
      "grad_norm": 4.8530144691467285,
      "learning_rate": 2.669491525423729e-05,
      "loss": 0.707,
      "step": 57
    },
    {
      "epoch": 4.5600000000000005,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6898828744888306,
      "eval_runtime": 2.0231,
      "eval_samples_per_second": 123.571,
      "eval_steps_per_second": 3.46,
      "step": 57
    },
    {
      "epoch": 4.64,
      "grad_norm": 3.5552706718444824,
      "learning_rate": 2.627118644067797e-05,
      "loss": 0.7089,
      "step": 58
    },
    {
      "epoch": 4.64,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6896093487739563,
      "eval_runtime": 2.0292,
      "eval_samples_per_second": 123.2,
      "eval_steps_per_second": 3.45,
      "step": 58
    },
    {
      "epoch": 4.72,
      "grad_norm": 2.477534532546997,
      "learning_rate": 2.5847457627118642e-05,
      "loss": 0.7266,
      "step": 59
    },
    {
      "epoch": 4.72,
      "eval_accuracy": 0.524,
      "eval_loss": 0.690679669380188,
      "eval_runtime": 2.0212,
      "eval_samples_per_second": 123.688,
      "eval_steps_per_second": 3.463,
      "step": 59
    },
    {
      "epoch": 4.8,
      "grad_norm": 7.140619277954102,
      "learning_rate": 2.5423728813559322e-05,
      "loss": 0.7071,
      "step": 60
    },
    {
      "epoch": 4.8,
      "eval_accuracy": 0.516,
      "eval_loss": 0.6908749938011169,
      "eval_runtime": 2.0308,
      "eval_samples_per_second": 123.105,
      "eval_steps_per_second": 3.447,
      "step": 60
    },
    {
      "epoch": 4.88,
      "grad_norm": 2.1332101821899414,
      "learning_rate": 2.5e-05,
      "loss": 0.6684,
      "step": 61
    },
    {
      "epoch": 4.88,
      "eval_accuracy": 0.512,
      "eval_loss": 0.6921796798706055,
      "eval_runtime": 2.0217,
      "eval_samples_per_second": 123.659,
      "eval_steps_per_second": 3.462,
      "step": 61
    },
    {
      "epoch": 4.96,
      "grad_norm": 6.111132621765137,
      "learning_rate": 2.457627118644068e-05,
      "loss": 0.7108,
      "step": 62
    },
    {
      "epoch": 4.96,
      "eval_accuracy": 0.512,
      "eval_loss": 0.6918125152587891,
      "eval_runtime": 2.0232,
      "eval_samples_per_second": 123.57,
      "eval_steps_per_second": 3.46,
      "step": 62
    },
    {
      "epoch": 5.04,
      "grad_norm": 2.674424171447754,
      "learning_rate": 2.4152542372881357e-05,
      "loss": 0.6958,
      "step": 63
    },
    {
      "epoch": 5.04,
      "eval_accuracy": 0.516,
      "eval_loss": 0.692257821559906,
      "eval_runtime": 2.0217,
      "eval_samples_per_second": 123.657,
      "eval_steps_per_second": 3.462,
      "step": 63
    },
    {
      "epoch": 5.12,
      "grad_norm": 4.979844093322754,
      "learning_rate": 2.3728813559322036e-05,
      "loss": 0.6996,
      "step": 64
    },
    {
      "epoch": 5.12,
      "eval_accuracy": 0.512,
      "eval_loss": 0.6926876306533813,
      "eval_runtime": 2.0292,
      "eval_samples_per_second": 123.198,
      "eval_steps_per_second": 3.45,
      "step": 64
    },
    {
      "epoch": 5.2,
      "grad_norm": 1.8670213222503662,
      "learning_rate": 2.3305084745762712e-05,
      "loss": 0.7389,
      "step": 65
    },
    {
      "epoch": 5.2,
      "eval_accuracy": 0.512,
      "eval_loss": 0.693109393119812,
      "eval_runtime": 2.022,
      "eval_samples_per_second": 123.639,
      "eval_steps_per_second": 3.462,
      "step": 65
    },
    {
      "epoch": 5.28,
      "grad_norm": 4.056942939758301,
      "learning_rate": 2.2881355932203392e-05,
      "loss": 0.7584,
      "step": 66
    },
    {
      "epoch": 5.28,
      "eval_accuracy": 0.512,
      "eval_loss": 0.6923593878746033,
      "eval_runtime": 2.0259,
      "eval_samples_per_second": 123.399,
      "eval_steps_per_second": 3.455,
      "step": 66
    },
    {
      "epoch": 5.36,
      "grad_norm": 5.482074737548828,
      "learning_rate": 2.245762711864407e-05,
      "loss": 0.6746,
      "step": 67
    },
    {
      "epoch": 5.36,
      "eval_accuracy": 0.512,
      "eval_loss": 0.6919531226158142,
      "eval_runtime": 2.026,
      "eval_samples_per_second": 123.394,
      "eval_steps_per_second": 3.455,
      "step": 67
    },
    {
      "epoch": 5.44,
      "grad_norm": 2.0411086082458496,
      "learning_rate": 2.2033898305084748e-05,
      "loss": 0.7089,
      "step": 68
    },
    {
      "epoch": 5.44,
      "eval_accuracy": 0.516,
      "eval_loss": 0.6914218664169312,
      "eval_runtime": 2.0245,
      "eval_samples_per_second": 123.488,
      "eval_steps_per_second": 3.458,
      "step": 68
    },
    {
      "epoch": 5.52,
      "grad_norm": 5.141605854034424,
      "learning_rate": 2.1610169491525427e-05,
      "loss": 0.6672,
      "step": 69
    },
    {
      "epoch": 5.52,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6912265419960022,
      "eval_runtime": 2.0218,
      "eval_samples_per_second": 123.65,
      "eval_steps_per_second": 3.462,
      "step": 69
    },
    {
      "epoch": 5.6,
      "grad_norm": 1.6354871988296509,
      "learning_rate": 2.1186440677966103e-05,
      "loss": 0.6658,
      "step": 70
    },
    {
      "epoch": 5.6,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6908749938011169,
      "eval_runtime": 2.0254,
      "eval_samples_per_second": 123.435,
      "eval_steps_per_second": 3.456,
      "step": 70
    },
    {
      "epoch": 5.68,
      "grad_norm": 2.918964385986328,
      "learning_rate": 2.076271186440678e-05,
      "loss": 0.7021,
      "step": 71
    },
    {
      "epoch": 5.68,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6909531354904175,
      "eval_runtime": 2.0283,
      "eval_samples_per_second": 123.256,
      "eval_steps_per_second": 3.451,
      "step": 71
    },
    {
      "epoch": 5.76,
      "grad_norm": 4.230044841766357,
      "learning_rate": 2.033898305084746e-05,
      "loss": 0.67,
      "step": 72
    },
    {
      "epoch": 5.76,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6901953220367432,
      "eval_runtime": 2.0208,
      "eval_samples_per_second": 123.712,
      "eval_steps_per_second": 3.464,
      "step": 72
    },
    {
      "epoch": 5.84,
      "grad_norm": 3.8107471466064453,
      "learning_rate": 1.9915254237288135e-05,
      "loss": 0.7184,
      "step": 73
    },
    {
      "epoch": 5.84,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6903281211853027,
      "eval_runtime": 2.0192,
      "eval_samples_per_second": 123.813,
      "eval_steps_per_second": 3.467,
      "step": 73
    },
    {
      "epoch": 5.92,
      "grad_norm": 8.891433715820312,
      "learning_rate": 1.9491525423728814e-05,
      "loss": 0.7887,
      "step": 74
    },
    {
      "epoch": 5.92,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6897187232971191,
      "eval_runtime": 2.0311,
      "eval_samples_per_second": 123.085,
      "eval_steps_per_second": 3.446,
      "step": 74
    },
    {
      "epoch": 6.0,
      "grad_norm": 3.761192798614502,
      "learning_rate": 1.906779661016949e-05,
      "loss": 0.6871,
      "step": 75
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.532,
      "eval_loss": 0.689648449420929,
      "eval_runtime": 2.0234,
      "eval_samples_per_second": 123.557,
      "eval_steps_per_second": 3.46,
      "step": 75
    },
    {
      "epoch": 6.08,
      "grad_norm": 5.231982231140137,
      "learning_rate": 1.864406779661017e-05,
      "loss": 0.6555,
      "step": 76
    },
    {
      "epoch": 6.08,
      "eval_accuracy": 0.532,
      "eval_loss": 0.689203143119812,
      "eval_runtime": 2.0301,
      "eval_samples_per_second": 123.147,
      "eval_steps_per_second": 3.448,
      "step": 76
    },
    {
      "epoch": 6.16,
      "grad_norm": 4.370417594909668,
      "learning_rate": 1.8220338983050846e-05,
      "loss": 0.7241,
      "step": 77
    },
    {
      "epoch": 6.16,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6886523365974426,
      "eval_runtime": 2.0236,
      "eval_samples_per_second": 123.543,
      "eval_steps_per_second": 3.459,
      "step": 77
    },
    {
      "epoch": 6.24,
      "grad_norm": 2.4579694271087646,
      "learning_rate": 1.7796610169491526e-05,
      "loss": 0.6802,
      "step": 78
    },
    {
      "epoch": 6.24,
      "eval_accuracy": 0.532,
      "eval_loss": 0.6887656450271606,
      "eval_runtime": 2.0193,
      "eval_samples_per_second": 123.808,
      "eval_steps_per_second": 3.467,
      "step": 78
    },
    {
      "epoch": 6.32,
      "grad_norm": 2.63549542427063,
      "learning_rate": 1.7372881355932205e-05,
      "loss": 0.6833,
      "step": 79
    },
    {
      "epoch": 6.32,
      "eval_accuracy": 0.532,
      "eval_loss": 0.6883828043937683,
      "eval_runtime": 2.0284,
      "eval_samples_per_second": 123.247,
      "eval_steps_per_second": 3.451,
      "step": 79
    },
    {
      "epoch": 6.4,
      "grad_norm": 4.547414779663086,
      "learning_rate": 1.694915254237288e-05,
      "loss": 0.7329,
      "step": 80
    },
    {
      "epoch": 6.4,
      "eval_accuracy": 0.532,
      "eval_loss": 0.6879101395606995,
      "eval_runtime": 2.0191,
      "eval_samples_per_second": 123.819,
      "eval_steps_per_second": 3.467,
      "step": 80
    },
    {
      "epoch": 6.48,
      "grad_norm": 3.964456558227539,
      "learning_rate": 1.652542372881356e-05,
      "loss": 0.6692,
      "step": 81
    },
    {
      "epoch": 6.48,
      "eval_accuracy": 0.532,
      "eval_loss": 0.6881679892539978,
      "eval_runtime": 2.0284,
      "eval_samples_per_second": 123.251,
      "eval_steps_per_second": 3.451,
      "step": 81
    },
    {
      "epoch": 6.5600000000000005,
      "grad_norm": 5.390689849853516,
      "learning_rate": 1.6101694915254237e-05,
      "loss": 0.6828,
      "step": 82
    },
    {
      "epoch": 6.5600000000000005,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6882968544960022,
      "eval_runtime": 2.0308,
      "eval_samples_per_second": 123.105,
      "eval_steps_per_second": 3.447,
      "step": 82
    },
    {
      "epoch": 6.64,
      "grad_norm": 5.355860710144043,
      "learning_rate": 1.5677966101694916e-05,
      "loss": 0.7421,
      "step": 83
    },
    {
      "epoch": 6.64,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6880898475646973,
      "eval_runtime": 2.0269,
      "eval_samples_per_second": 123.342,
      "eval_steps_per_second": 3.454,
      "step": 83
    },
    {
      "epoch": 6.72,
      "grad_norm": 4.701043128967285,
      "learning_rate": 1.5254237288135596e-05,
      "loss": 0.6674,
      "step": 84
    },
    {
      "epoch": 6.72,
      "eval_accuracy": 0.524,
      "eval_loss": 0.6879178881645203,
      "eval_runtime": 2.0224,
      "eval_samples_per_second": 123.617,
      "eval_steps_per_second": 3.461,
      "step": 84
    },
    {
      "epoch": 6.8,
      "grad_norm": 2.8967323303222656,
      "learning_rate": 1.4830508474576272e-05,
      "loss": 0.7777,
      "step": 85
    },
    {
      "epoch": 6.8,
      "eval_accuracy": 0.516,
      "eval_loss": 0.6875312328338623,
      "eval_runtime": 2.0275,
      "eval_samples_per_second": 123.307,
      "eval_steps_per_second": 3.453,
      "step": 85
    },
    {
      "epoch": 6.88,
      "grad_norm": 1.572066068649292,
      "learning_rate": 1.440677966101695e-05,
      "loss": 0.6839,
      "step": 86
    },
    {
      "epoch": 6.88,
      "eval_accuracy": 0.516,
      "eval_loss": 0.687863290309906,
      "eval_runtime": 2.0216,
      "eval_samples_per_second": 123.662,
      "eval_steps_per_second": 3.463,
      "step": 86
    },
    {
      "epoch": 6.96,
      "grad_norm": 3.870039224624634,
      "learning_rate": 1.3983050847457627e-05,
      "loss": 0.7349,
      "step": 87
    },
    {
      "epoch": 6.96,
      "eval_accuracy": 0.52,
      "eval_loss": 0.6874960660934448,
      "eval_runtime": 2.0258,
      "eval_samples_per_second": 123.408,
      "eval_steps_per_second": 3.455,
      "step": 87
    },
    {
      "epoch": 7.04,
      "grad_norm": 2.870877265930176,
      "learning_rate": 1.3559322033898305e-05,
      "loss": 0.6877,
      "step": 88
    },
    {
      "epoch": 7.04,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6879101395606995,
      "eval_runtime": 2.0295,
      "eval_samples_per_second": 123.183,
      "eval_steps_per_second": 3.449,
      "step": 88
    },
    {
      "epoch": 7.12,
      "grad_norm": 8.396225929260254,
      "learning_rate": 1.3135593220338985e-05,
      "loss": 0.7215,
      "step": 89
    },
    {
      "epoch": 7.12,
      "eval_accuracy": 0.528,
      "eval_loss": 0.6874336004257202,
      "eval_runtime": 1.9743,
      "eval_samples_per_second": 126.624,
      "eval_steps_per_second": 3.545,
      "step": 89
    },
    {
      "epoch": 7.2,
      "grad_norm": 5.108537197113037,
      "learning_rate": 1.2711864406779661e-05,
      "loss": 0.7067,
      "step": 90
    },
    {
      "epoch": 7.2,
      "eval_accuracy": 0.532,
      "eval_loss": 0.6867539286613464,
      "eval_runtime": 2.0219,
      "eval_samples_per_second": 123.646,
      "eval_steps_per_second": 3.462,
      "step": 90
    },
    {
      "epoch": 7.28,
      "grad_norm": 6.04660177230835,
      "learning_rate": 1.228813559322034e-05,
      "loss": 0.6296,
      "step": 91
    },
    {
      "epoch": 7.28,
      "eval_accuracy": 0.532,
      "eval_loss": 0.6873242259025574,
      "eval_runtime": 2.0208,
      "eval_samples_per_second": 123.711,
      "eval_steps_per_second": 3.464,
      "step": 91
    },
    {
      "epoch": 7.36,
      "grad_norm": 1.4922665357589722,
      "learning_rate": 1.1864406779661018e-05,
      "loss": 0.7286,
      "step": 92
    },
    {
      "epoch": 7.36,
      "eval_accuracy": 0.524,
      "eval_loss": 0.687429666519165,
      "eval_runtime": 2.0278,
      "eval_samples_per_second": 123.288,
      "eval_steps_per_second": 3.452,
      "step": 92
    },
    {
      "epoch": 7.44,
      "grad_norm": 2.887815475463867,
      "learning_rate": 1.1440677966101696e-05,
      "loss": 0.6758,
      "step": 93
    },
    {
      "epoch": 7.44,
"eval_accuracy": 0.524, | |
"eval_loss": 0.6870546936988831, | |
"eval_runtime": 2.024, | |
"eval_samples_per_second": 123.517, | |
"eval_steps_per_second": 3.458, | |
"step": 93 | |
}, | |
{ | |
"epoch": 7.52, | |
"grad_norm": 4.4572272300720215, | |
"learning_rate": 1.1016949152542374e-05, | |
"loss": 0.7017, | |
"step": 94 | |
}, | |
{ | |
"epoch": 7.52, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6872226595878601, | |
"eval_runtime": 2.0285, | |
"eval_samples_per_second": 123.243, | |
"eval_steps_per_second": 3.451, | |
"step": 94 | |
}, | |
{ | |
"epoch": 7.6, | |
"grad_norm": 3.5533223152160645, | |
"learning_rate": 1.0593220338983052e-05, | |
"loss": 0.7203, | |
"step": 95 | |
}, | |
{ | |
"epoch": 7.6, | |
"eval_accuracy": 0.54, | |
"eval_loss": 0.6870664358139038, | |
"eval_runtime": 2.0221, | |
"eval_samples_per_second": 123.636, | |
"eval_steps_per_second": 3.462, | |
"step": 95 | |
}, | |
{ | |
"epoch": 7.68, | |
"grad_norm": 2.765678882598877, | |
"learning_rate": 1.016949152542373e-05, | |
"loss": 0.7448, | |
"step": 96 | |
}, | |
{ | |
"epoch": 7.68, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6872734427452087, | |
"eval_runtime": 2.0298, | |
"eval_samples_per_second": 123.164, | |
"eval_steps_per_second": 3.449, | |
"step": 96 | |
}, | |
{ | |
"epoch": 7.76, | |
"grad_norm": 4.892889022827148, | |
"learning_rate": 9.745762711864407e-06, | |
"loss": 0.6589, | |
"step": 97 | |
}, | |
{ | |
"epoch": 7.76, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6874336004257202, | |
"eval_runtime": 2.0274, | |
"eval_samples_per_second": 123.309, | |
"eval_steps_per_second": 3.453, | |
"step": 97 | |
}, | |
{ | |
"epoch": 7.84, | |
"grad_norm": 9.976033210754395, | |
"learning_rate": 9.322033898305085e-06, | |
"loss": 0.7137, | |
"step": 98 | |
}, | |
{ | |
"epoch": 7.84, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.6868671774864197, | |
"eval_runtime": 2.0221, | |
"eval_samples_per_second": 123.633, | |
"eval_steps_per_second": 3.462, | |
"step": 98 | |
}, | |
{ | |
"epoch": 7.92, | |
"grad_norm": 3.153660297393799, | |
"learning_rate": 8.898305084745763e-06, | |
"loss": 0.73, | |
"step": 99 | |
}, | |
{ | |
"epoch": 7.92, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.686941385269165, | |
"eval_runtime": 2.0272, | |
"eval_samples_per_second": 123.325, | |
"eval_steps_per_second": 3.453, | |
"step": 99 | |
}, | |
{ | |
"epoch": 8.0, | |
"grad_norm": 6.274194240570068, | |
"learning_rate": 8.47457627118644e-06, | |
"loss": 0.6882, | |
"step": 100 | |
}, | |
{ | |
"epoch": 8.0, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6874140501022339, | |
"eval_runtime": 2.0308, | |
"eval_samples_per_second": 123.102, | |
"eval_steps_per_second": 3.447, | |
"step": 100 | |
}, | |
{ | |
"epoch": 8.08, | |
"grad_norm": 2.984788417816162, | |
"learning_rate": 8.050847457627118e-06, | |
"loss": 0.6798, | |
"step": 101 | |
}, | |
{ | |
"epoch": 8.08, | |
"eval_accuracy": 0.528, | |
"eval_loss": 0.6867109537124634, | |
"eval_runtime": 2.0263, | |
"eval_samples_per_second": 123.378, | |
"eval_steps_per_second": 3.455, | |
"step": 101 | |
}, | |
{ | |
"epoch": 8.16, | |
"grad_norm": 3.377530813217163, | |
"learning_rate": 7.627118644067798e-06, | |
"loss": 0.7054, | |
"step": 102 | |
}, | |
{ | |
"epoch": 8.16, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6870234608650208, | |
"eval_runtime": 2.0274, | |
"eval_samples_per_second": 123.309, | |
"eval_steps_per_second": 3.453, | |
"step": 102 | |
}, | |
{ | |
"epoch": 8.24, | |
"grad_norm": 7.250791072845459, | |
"learning_rate": 7.203389830508475e-06, | |
"loss": 0.6952, | |
"step": 103 | |
}, | |
{ | |
"epoch": 8.24, | |
"eval_accuracy": 0.544, | |
"eval_loss": 0.6866171956062317, | |
"eval_runtime": 2.0228, | |
"eval_samples_per_second": 123.592, | |
"eval_steps_per_second": 3.461, | |
"step": 103 | |
}, | |
{ | |
"epoch": 8.32, | |
"grad_norm": 5.269489765167236, | |
"learning_rate": 6.779661016949153e-06, | |
"loss": 0.7793, | |
"step": 104 | |
}, | |
{ | |
"epoch": 8.32, | |
"eval_accuracy": 0.544, | |
"eval_loss": 0.6874960660934448, | |
"eval_runtime": 2.0212, | |
"eval_samples_per_second": 123.691, | |
"eval_steps_per_second": 3.463, | |
"step": 104 | |
}, | |
{ | |
"epoch": 8.4, | |
"grad_norm": 4.358240127563477, | |
"learning_rate": 6.3559322033898304e-06, | |
"loss": 0.7286, | |
"step": 105 | |
}, | |
{ | |
"epoch": 8.4, | |
"eval_accuracy": 0.536, | |
"eval_loss": 0.6872695088386536, | |
"eval_runtime": 2.0305, | |
"eval_samples_per_second": 123.124, | |
"eval_steps_per_second": 3.447, | |
"step": 105 | |
}, | |
{ | |
"epoch": 8.48, | |
"grad_norm": 2.62113881111145, | |
"learning_rate": 5.932203389830509e-06, | |
"loss": 0.7209, | |
"step": 106 | |
}, | |
{ | |
"epoch": 8.48, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6873710751533508, | |
"eval_runtime": 2.0243, | |
"eval_samples_per_second": 123.499, | |
"eval_steps_per_second": 3.458, | |
"step": 106 | |
}, | |
{ | |
"epoch": 8.56, | |
"grad_norm": 4.648255348205566, | |
"learning_rate": 5.508474576271187e-06, | |
"loss": 0.6545, | |
"step": 107 | |
}, | |
{ | |
"epoch": 8.56, | |
"eval_accuracy": 0.532, | |
"eval_loss": 0.687738299369812, | |
"eval_runtime": 2.0205, | |
"eval_samples_per_second": 123.731, | |
"eval_steps_per_second": 3.464, | |
"step": 107 | |
}, | |
{ | |
"epoch": 8.64, | |
"grad_norm": 2.2081334590911865, | |
"learning_rate": 5.084745762711865e-06, | |
"loss": 0.7435, | |
"step": 108 | |
}, | |
{ | |
"epoch": 8.64, | |
"eval_accuracy": 0.516, | |
"eval_loss": 0.687386691570282, | |
"eval_runtime": 2.0291, | |
"eval_samples_per_second": 123.207, | |
"eval_steps_per_second": 3.45, | |
"step": 108 | |
}, | |
{ | |
"epoch": 8.72, | |
"grad_norm": 2.0511488914489746, | |
"learning_rate": 4.6610169491525425e-06, | |
"loss": 0.6881, | |
"step": 109 | |
}, | |
{ | |
"epoch": 8.72, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6879023313522339, | |
"eval_runtime": 2.0181, | |
"eval_samples_per_second": 123.877, | |
"eval_steps_per_second": 3.469, | |
"step": 109 | |
}, | |
{ | |
"epoch": 8.8, | |
"grad_norm": 2.262899160385132, | |
"learning_rate": 4.23728813559322e-06, | |
"loss": 0.7414, | |
"step": 110 | |
}, | |
{ | |
"epoch": 8.8, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.68798828125, | |
"eval_runtime": 2.0297, | |
"eval_samples_per_second": 123.17, | |
"eval_steps_per_second": 3.449, | |
"step": 110 | |
}, | |
{ | |
"epoch": 8.88, | |
"grad_norm": 2.8543484210968018, | |
"learning_rate": 3.813559322033899e-06, | |
"loss": 0.6834, | |
"step": 111 | |
}, | |
{ | |
"epoch": 8.88, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6876718997955322, | |
"eval_runtime": 2.0227, | |
"eval_samples_per_second": 123.595, | |
"eval_steps_per_second": 3.461, | |
"step": 111 | |
}, | |
{ | |
"epoch": 8.96, | |
"grad_norm": 7.8969855308532715, | |
"learning_rate": 3.3898305084745763e-06, | |
"loss": 0.7387, | |
"step": 112 | |
}, | |
{ | |
"epoch": 8.96, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6874452829360962, | |
"eval_runtime": 2.0222, | |
"eval_samples_per_second": 123.629, | |
"eval_steps_per_second": 3.462, | |
"step": 112 | |
}, | |
{ | |
"epoch": 9.04, | |
"grad_norm": 15.415665626525879, | |
"learning_rate": 2.9661016949152545e-06, | |
"loss": 0.6816, | |
"step": 113 | |
}, | |
{ | |
"epoch": 9.04, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6880937218666077, | |
"eval_runtime": 2.0198, | |
"eval_samples_per_second": 123.774, | |
"eval_steps_per_second": 3.466, | |
"step": 113 | |
}, | |
{ | |
"epoch": 9.12, | |
"grad_norm": 2.6060779094696045, | |
"learning_rate": 2.5423728813559323e-06, | |
"loss": 0.6739, | |
"step": 114 | |
}, | |
{ | |
"epoch": 9.12, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6881679892539978, | |
"eval_runtime": 2.0217, | |
"eval_samples_per_second": 123.661, | |
"eval_steps_per_second": 3.463, | |
"step": 114 | |
}, | |
{ | |
"epoch": 9.2, | |
"grad_norm": 3.591651201248169, | |
"learning_rate": 2.11864406779661e-06, | |
"loss": 0.6649, | |
"step": 115 | |
}, | |
{ | |
"epoch": 9.2, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6879257559776306, | |
"eval_runtime": 2.0224, | |
"eval_samples_per_second": 123.614, | |
"eval_steps_per_second": 3.461, | |
"step": 115 | |
}, | |
{ | |
"epoch": 9.28, | |
"grad_norm": 2.9647672176361084, | |
"learning_rate": 1.6949152542372882e-06, | |
"loss": 0.7469, | |
"step": 116 | |
}, | |
{ | |
"epoch": 9.28, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6880781054496765, | |
"eval_runtime": 2.0268, | |
"eval_samples_per_second": 123.347, | |
"eval_steps_per_second": 3.454, | |
"step": 116 | |
}, | |
{ | |
"epoch": 9.36, | |
"grad_norm": 4.3675031661987305, | |
"learning_rate": 1.2711864406779662e-06, | |
"loss": 0.7278, | |
"step": 117 | |
}, | |
{ | |
"epoch": 9.36, | |
"eval_accuracy": 0.524, | |
"eval_loss": 0.6877461075782776, | |
"eval_runtime": 2.0288, | |
"eval_samples_per_second": 123.227, | |
"eval_steps_per_second": 3.45, | |
"step": 117 | |
}, | |
{ | |
"epoch": 9.44, | |
"grad_norm": 5.043523788452148, | |
"learning_rate": 8.474576271186441e-07, | |
"loss": 0.6509, | |
"step": 118 | |
}, | |
{ | |
"epoch": 9.44, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6879922151565552, | |
"eval_runtime": 2.0252, | |
"eval_samples_per_second": 123.447, | |
"eval_steps_per_second": 3.457, | |
"step": 118 | |
}, | |
{ | |
"epoch": 9.52, | |
"grad_norm": 5.11408805847168, | |
"learning_rate": 4.2372881355932204e-07, | |
"loss": 0.6899, | |
"step": 119 | |
}, | |
{ | |
"epoch": 9.52, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.6880234479904175, | |
"eval_runtime": 2.0206, | |
"eval_samples_per_second": 123.725, | |
"eval_steps_per_second": 3.464, | |
"step": 119 | |
}, | |
{ | |
"epoch": 9.6, | |
"grad_norm": 2.0295844078063965, | |
"learning_rate": 0.0, | |
"loss": 0.7045, | |
"step": 120 | |
}, | |
{ | |
"epoch": 9.6, | |
"eval_accuracy": 0.52, | |
"eval_loss": 0.687429666519165, | |
"eval_runtime": 2.0271, | |
"eval_samples_per_second": 123.33, | |
"eval_steps_per_second": 3.453, | |
"step": 120 | |
}, | |
{ | |
"epoch": 9.6, | |
"step": 120, | |
"total_flos": 70235960737792.0, | |
"train_loss": 0.7138384501139323, | |
"train_runtime": 562.6296, | |
"train_samples_per_second": 17.774, | |
"train_steps_per_second": 0.213 | |
} | |
], | |
"logging_steps": 1, | |
"max_steps": 120, | |
"num_input_tokens_seen": 0, | |
"num_train_epochs": 10, | |
"save_steps": 500, | |
"total_flos": 70235960737792.0, | |
"train_batch_size": 10, | |
"trial_name": null, | |
"trial_params": null | |
} | |