{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "grad_norm": 23.256885528564453, "learning_rate": 2.5e-05, "loss": 0.9107, "step": 1 }, { "epoch": 0.2, "eval_accuracy": 0.57, "eval_loss": 0.8262696862220764, "eval_runtime": 0.8414, "eval_samples_per_second": 118.852, "eval_steps_per_second": 3.566, "step": 1 }, { "epoch": 0.4, "grad_norm": 23.631181716918945, "learning_rate": 5e-05, "loss": 0.9117, "step": 2 }, { "epoch": 0.4, "eval_accuracy": 0.57, "eval_loss": 0.8167089819908142, "eval_runtime": 0.8385, "eval_samples_per_second": 119.261, "eval_steps_per_second": 3.578, "step": 2 }, { "epoch": 0.6, "grad_norm": 28.508174896240234, "learning_rate": 4.8958333333333335e-05, "loss": 0.9991, "step": 3 }, { "epoch": 0.6, "eval_accuracy": 0.57, "eval_loss": 0.7878709435462952, "eval_runtime": 0.8413, "eval_samples_per_second": 118.87, "eval_steps_per_second": 3.566, "step": 3 }, { "epoch": 0.8, "grad_norm": 19.008262634277344, "learning_rate": 4.791666666666667e-05, "loss": 0.8288, "step": 4 }, { "epoch": 0.8, "eval_accuracy": 0.56, "eval_loss": 0.7578710913658142, "eval_runtime": 0.8391, "eval_samples_per_second": 119.176, "eval_steps_per_second": 3.575, "step": 4 }, { "epoch": 1.0, "grad_norm": 23.889429092407227, "learning_rate": 4.6875e-05, "loss": 0.8763, "step": 5 }, { "epoch": 1.0, "eval_accuracy": 0.53, "eval_loss": 0.7321679592132568, "eval_runtime": 0.8386, "eval_samples_per_second": 119.253, "eval_steps_per_second": 3.578, "step": 5 }, { "epoch": 1.2, "grad_norm": 9.946539878845215, "learning_rate": 4.5833333333333334e-05, "loss": 0.7244, "step": 6 }, { "epoch": 1.2, "eval_accuracy": 0.55, "eval_loss": 0.7117968797683716, "eval_runtime": 0.8351, "eval_samples_per_second": 119.747, "eval_steps_per_second": 3.592, "step": 6 }, { "epoch": 1.4, "grad_norm": 9.386951446533203, "learning_rate": 4.4791666666666673e-05, "loss": 0.7601, "step": 7 }, { "epoch": 1.4, "eval_accuracy": 0.54, "eval_loss": 0.6952930688858032, "eval_runtime": 0.8393, "eval_samples_per_second": 119.145, "eval_steps_per_second": 3.574, "step": 7 }, { "epoch": 1.6, "grad_norm": 20.157466888427734, "learning_rate": 4.375e-05, "loss": 0.8343, "step": 8 }, { "epoch": 1.6, "eval_accuracy": 0.54, "eval_loss": 0.6882422566413879, "eval_runtime": 0.8343, "eval_samples_per_second": 119.865, "eval_steps_per_second": 3.596, "step": 8 }, { "epoch": 1.8, "grad_norm": 6.170377731323242, "learning_rate": 4.270833333333333e-05, "loss": 0.6825, "step": 9 }, { "epoch": 1.8, "eval_accuracy": 0.53, "eval_loss": 0.6919335126876831, "eval_runtime": 0.8402, "eval_samples_per_second": 119.014, "eval_steps_per_second": 3.57, "step": 9 }, { "epoch": 2.0, "grad_norm": 3.945521831512451, "learning_rate": 4.166666666666667e-05, "loss": 0.7545, "step": 10 }, { "epoch": 2.0, "eval_accuracy": 0.45, "eval_loss": 0.6997266411781311, "eval_runtime": 0.8398, "eval_samples_per_second": 119.072, "eval_steps_per_second": 3.572, "step": 10 }, { "epoch": 2.2, "grad_norm": 8.659540176391602, "learning_rate": 4.0625000000000005e-05, "loss": 0.709, "step": 11 }, { "epoch": 2.2, "eval_accuracy": 0.45, "eval_loss": 0.7026562690734863, "eval_runtime": 0.8388, "eval_samples_per_second": 119.217, "eval_steps_per_second": 3.577, "step": 11 }, { "epoch": 2.4, "grad_norm": 3.4722447395324707, "learning_rate": 3.958333333333333e-05, "loss": 0.6914, "step": 12 }, { "epoch": 2.4, "eval_accuracy": 0.46, "eval_loss": 0.704160213470459, "eval_runtime": 0.839, "eval_samples_per_second": 119.192, "eval_steps_per_second": 3.576, "step": 12 }, { "epoch": 2.6, "grad_norm": 4.033767223358154, "learning_rate": 3.854166666666667e-05, "loss": 0.6999, "step": 13 }, { "epoch": 2.6, "eval_accuracy": 0.48, "eval_loss": 0.7015135884284973, "eval_runtime": 0.8381, "eval_samples_per_second": 119.324, "eval_steps_per_second": 3.58, "step": 13 }, { "epoch": 2.8, "grad_norm": 2.7851364612579346, "learning_rate": 3.7500000000000003e-05, "loss": 0.6829, "step": 14 }, { "epoch": 2.8, "eval_accuracy": 0.49, "eval_loss": 0.7031835913658142, "eval_runtime": 0.839, "eval_samples_per_second": 119.197, "eval_steps_per_second": 3.576, "step": 14 }, { "epoch": 3.0, "grad_norm": 2.5898079872131348, "learning_rate": 3.6458333333333336e-05, "loss": 0.6776, "step": 15 }, { "epoch": 3.0, "eval_accuracy": 0.47, "eval_loss": 0.7040430307388306, "eval_runtime": 0.8373, "eval_samples_per_second": 119.43, "eval_steps_per_second": 3.583, "step": 15 }, { "epoch": 3.2, "grad_norm": 7.040603160858154, "learning_rate": 3.541666666666667e-05, "loss": 0.7151, "step": 16 }, { "epoch": 3.2, "eval_accuracy": 0.46, "eval_loss": 0.7056737542152405, "eval_runtime": 0.8384, "eval_samples_per_second": 119.274, "eval_steps_per_second": 3.578, "step": 16 }, { "epoch": 3.4, "grad_norm": 6.8378705978393555, "learning_rate": 3.4375e-05, "loss": 0.7186, "step": 17 }, { "epoch": 3.4, "eval_accuracy": 0.46, "eval_loss": 0.7082374095916748, "eval_runtime": 0.8394, "eval_samples_per_second": 119.131, "eval_steps_per_second": 3.574, "step": 17 }, { "epoch": 3.6, "grad_norm": 4.7442545890808105, "learning_rate": 3.3333333333333335e-05, "loss": 0.7058, "step": 18 }, { "epoch": 3.6, "eval_accuracy": 0.46, "eval_loss": 0.7107178568840027, "eval_runtime": 0.7888, "eval_samples_per_second": 126.768, "eval_steps_per_second": 3.803, "step": 18 }, { "epoch": 3.8, "grad_norm": 5.4023237228393555, "learning_rate": 3.229166666666667e-05, "loss": 0.7207, "step": 19 }, { "epoch": 3.8, "eval_accuracy": 0.46, "eval_loss": 0.7103956341743469, "eval_runtime": 0.8395, "eval_samples_per_second": 119.117, "eval_steps_per_second": 3.573, "step": 19 }, { "epoch": 4.0, "grad_norm": 5.683109283447266, "learning_rate": 3.125e-05, "loss": 0.6839, "step": 20 }, { "epoch": 4.0, "eval_accuracy": 0.47, "eval_loss": 0.7080614566802979, "eval_runtime": 0.8405, "eval_samples_per_second": 118.971, "eval_steps_per_second": 3.569, "step": 20 }, { "epoch": 4.2, "grad_norm": 4.991653919219971, "learning_rate": 3.0208333333333334e-05, "loss": 0.6378, "step": 21 }, { "epoch": 4.2, "eval_accuracy": 0.49, "eval_loss": 0.7062841653823853, "eval_runtime": 0.839, "eval_samples_per_second": 119.188, "eval_steps_per_second": 3.576, "step": 21 }, { "epoch": 4.4, "grad_norm": 4.332605361938477, "learning_rate": 2.916666666666667e-05, "loss": 0.7266, "step": 22 }, { "epoch": 4.4, "eval_accuracy": 0.47, "eval_loss": 0.7057519555091858, "eval_runtime": 0.8392, "eval_samples_per_second": 119.161, "eval_steps_per_second": 3.575, "step": 22 }, { "epoch": 4.6, "grad_norm": 3.1174395084381104, "learning_rate": 2.8125000000000003e-05, "loss": 0.6729, "step": 23 }, { "epoch": 4.6, "eval_accuracy": 0.46, "eval_loss": 0.7046825885772705, "eval_runtime": 0.8377, "eval_samples_per_second": 119.373, "eval_steps_per_second": 3.581, "step": 23 }, { "epoch": 4.8, "grad_norm": 3.505262613296509, "learning_rate": 2.7083333333333332e-05, "loss": 0.7189, "step": 24 }, { "epoch": 4.8, "eval_accuracy": 0.46, "eval_loss": 0.702241063117981, "eval_runtime": 0.8383, "eval_samples_per_second": 119.288, "eval_steps_per_second": 3.579, "step": 24 }, { "epoch": 5.0, "grad_norm": 3.249929904937744, "learning_rate": 2.604166666666667e-05, "loss": 0.7092, "step": 25 }, { "epoch": 5.0, "eval_accuracy": 0.46, "eval_loss": 0.7012988328933716, "eval_runtime": 0.8369, "eval_samples_per_second": 119.494, "eval_steps_per_second": 3.585, "step": 25 }, { "epoch": 5.2, "grad_norm": 2.5304999351501465, "learning_rate": 2.5e-05, "loss": 0.7145, "step": 26 }, { "epoch": 5.2, "eval_accuracy": 0.47, "eval_loss": 0.7005956768989563, "eval_runtime": 0.836, "eval_samples_per_second": 119.623, "eval_steps_per_second": 3.589, "step": 26 }, { "epoch": 5.4, "grad_norm": 2.5304205417633057, "learning_rate": 2.3958333333333334e-05, "loss": 0.6946, "step": 27 }, { "epoch": 5.4, "eval_accuracy": 0.48, "eval_loss": 0.7012646198272705, "eval_runtime": 0.8366, "eval_samples_per_second": 119.526, "eval_steps_per_second": 3.586, "step": 27 }, { "epoch": 5.6, "grad_norm": 7.720176696777344, "learning_rate": 2.2916666666666667e-05, "loss": 0.7456, "step": 28 }, { "epoch": 5.6, "eval_accuracy": 0.46, "eval_loss": 0.7020359635353088, "eval_runtime": 0.8393, "eval_samples_per_second": 119.141, "eval_steps_per_second": 3.574, "step": 28 }, { "epoch": 5.8, "grad_norm": 4.178812026977539, "learning_rate": 2.1875e-05, "loss": 0.7231, "step": 29 }, { "epoch": 5.8, "eval_accuracy": 0.47, "eval_loss": 0.7020019292831421, "eval_runtime": 0.8397, "eval_samples_per_second": 119.086, "eval_steps_per_second": 3.573, "step": 29 }, { "epoch": 6.0, "grad_norm": 6.495377063751221, "learning_rate": 2.0833333333333336e-05, "loss": 0.6874, "step": 30 }, { "epoch": 6.0, "eval_accuracy": 0.46, "eval_loss": 0.7010400295257568, "eval_runtime": 0.7916, "eval_samples_per_second": 126.329, "eval_steps_per_second": 3.79, "step": 30 }, { "epoch": 6.2, "grad_norm": 1.7229987382888794, "learning_rate": 1.9791666666666665e-05, "loss": 0.7039, "step": 31 }, { "epoch": 6.2, "eval_accuracy": 0.45, "eval_loss": 0.7006199359893799, "eval_runtime": 0.8371, "eval_samples_per_second": 119.466, "eval_steps_per_second": 3.584, "step": 31 }, { "epoch": 6.4, "grad_norm": 4.806630611419678, "learning_rate": 1.8750000000000002e-05, "loss": 0.6858, "step": 32 }, { "epoch": 6.4, "eval_accuracy": 0.45, "eval_loss": 0.6990869045257568, "eval_runtime": 0.836, "eval_samples_per_second": 119.617, "eval_steps_per_second": 3.589, "step": 32 }, { "epoch": 6.6, "grad_norm": 2.4336676597595215, "learning_rate": 1.7708333333333335e-05, "loss": 0.6571, "step": 33 }, { "epoch": 6.6, "eval_accuracy": 0.46, "eval_loss": 0.6985548138618469, "eval_runtime": 0.8383, "eval_samples_per_second": 119.293, "eval_steps_per_second": 3.579, "step": 33 }, { "epoch": 6.8, "grad_norm": 2.346224784851074, "learning_rate": 1.6666666666666667e-05, "loss": 0.7069, "step": 34 }, { "epoch": 6.8, "eval_accuracy": 0.46, "eval_loss": 0.6970605254173279, "eval_runtime": 0.8399, "eval_samples_per_second": 119.069, "eval_steps_per_second": 3.572, "step": 34 }, { "epoch": 7.0, "grad_norm": 6.023119926452637, "learning_rate": 1.5625e-05, "loss": 0.6878, "step": 35 }, { "epoch": 7.0, "eval_accuracy": 0.46, "eval_loss": 0.6974707245826721, "eval_runtime": 0.8356, "eval_samples_per_second": 119.671, "eval_steps_per_second": 3.59, "step": 35 }, { "epoch": 7.2, "grad_norm": 2.5812861919403076, "learning_rate": 1.4583333333333335e-05, "loss": 0.7148, "step": 36 }, { "epoch": 7.2, "eval_accuracy": 0.46, "eval_loss": 0.6966310143470764, "eval_runtime": 0.8396, "eval_samples_per_second": 119.101, "eval_steps_per_second": 3.573, "step": 36 }, { "epoch": 7.4, "grad_norm": 6.9987263679504395, "learning_rate": 1.3541666666666666e-05, "loss": 0.6821, "step": 37 }, { "epoch": 7.4, "eval_accuracy": 0.46, "eval_loss": 0.6957228183746338, "eval_runtime": 0.838, "eval_samples_per_second": 119.326, "eval_steps_per_second": 3.58, "step": 37 }, { "epoch": 7.6, "grad_norm": 4.426673889160156, "learning_rate": 1.25e-05, "loss": 0.6866, "step": 38 }, { "epoch": 7.6, "eval_accuracy": 0.46, "eval_loss": 0.6958692073822021, "eval_runtime": 0.8381, "eval_samples_per_second": 119.321, "eval_steps_per_second": 3.58, "step": 38 }, { "epoch": 7.8, "grad_norm": 1.72467839717865, "learning_rate": 1.1458333333333333e-05, "loss": 0.6441, "step": 39 }, { "epoch": 7.8, "eval_accuracy": 0.46, "eval_loss": 0.6971582174301147, "eval_runtime": 0.8397, "eval_samples_per_second": 119.085, "eval_steps_per_second": 3.573, "step": 39 }, { "epoch": 8.0, "grad_norm": 10.352705001831055, "learning_rate": 1.0416666666666668e-05, "loss": 0.7264, "step": 40 }, { "epoch": 8.0, "eval_accuracy": 0.46, "eval_loss": 0.6962793469429016, "eval_runtime": 0.8388, "eval_samples_per_second": 119.219, "eval_steps_per_second": 3.577, "step": 40 }, { "epoch": 8.2, "grad_norm": 4.645693778991699, "learning_rate": 9.375000000000001e-06, "loss": 0.7164, "step": 41 }, { "epoch": 8.2, "eval_accuracy": 0.46, "eval_loss": 0.695371150970459, "eval_runtime": 0.839, "eval_samples_per_second": 119.183, "eval_steps_per_second": 3.575, "step": 41 }, { "epoch": 8.4, "grad_norm": 7.059908390045166, "learning_rate": 8.333333333333334e-06, "loss": 0.737, "step": 42 }, { "epoch": 8.4, "eval_accuracy": 0.46, "eval_loss": 0.6947849988937378, "eval_runtime": 0.8369, "eval_samples_per_second": 119.483, "eval_steps_per_second": 3.585, "step": 42 }, { "epoch": 8.6, "grad_norm": 1.3762595653533936, "learning_rate": 7.2916666666666674e-06, "loss": 0.7173, "step": 43 }, { "epoch": 8.6, "eval_accuracy": 0.47, "eval_loss": 0.6942089796066284, "eval_runtime": 0.8392, "eval_samples_per_second": 119.159, "eval_steps_per_second": 3.575, "step": 43 }, { "epoch": 8.8, "grad_norm": 4.145285129547119, "learning_rate": 6.25e-06, "loss": 0.6933, "step": 44 }, { "epoch": 8.8, "eval_accuracy": 0.47, "eval_loss": 0.6938575506210327, "eval_runtime": 0.8362, "eval_samples_per_second": 119.582, "eval_steps_per_second": 3.587, "step": 44 }, { "epoch": 9.0, "grad_norm": 3.7309446334838867, "learning_rate": 5.208333333333334e-06, "loss": 0.7204, "step": 45 }, { "epoch": 9.0, "eval_accuracy": 0.47, "eval_loss": 0.6939452886581421, "eval_runtime": 0.7873, "eval_samples_per_second": 127.008, "eval_steps_per_second": 3.81, "step": 45 }, { "epoch": 9.2, "grad_norm": 2.0658915042877197, "learning_rate": 4.166666666666667e-06, "loss": 0.707, "step": 46 }, { "epoch": 9.2, "eval_accuracy": 0.47, "eval_loss": 0.6938378810882568, "eval_runtime": 0.8363, "eval_samples_per_second": 119.577, "eval_steps_per_second": 3.587, "step": 46 }, { "epoch": 9.4, "grad_norm": 2.766176223754883, "learning_rate": 3.125e-06, "loss": 0.7343, "step": 47 }, { "epoch": 9.4, "eval_accuracy": 0.47, "eval_loss": 0.6941795945167542, "eval_runtime": 0.8392, "eval_samples_per_second": 119.163, "eval_steps_per_second": 3.575, "step": 47 }, { "epoch": 9.6, "grad_norm": 3.535402774810791, "learning_rate": 2.0833333333333334e-06, "loss": 0.7072, "step": 48 }, { "epoch": 9.6, "eval_accuracy": 0.48, "eval_loss": 0.6940819621086121, "eval_runtime": 0.79, "eval_samples_per_second": 126.575, "eval_steps_per_second": 3.797, "step": 48 }, { "epoch": 9.8, "grad_norm": 4.9612932205200195, "learning_rate": 1.0416666666666667e-06, "loss": 0.6949, "step": 49 }, { "epoch": 9.8, "eval_accuracy": 0.47, "eval_loss": 0.6937304139137268, "eval_runtime": 0.8383, "eval_samples_per_second": 119.288, "eval_steps_per_second": 3.579, "step": 49 }, { "epoch": 10.0, "grad_norm": 3.1242637634277344, "learning_rate": 0.0, "loss": 0.6862, "step": 50 }, { "epoch": 10.0, "eval_accuracy": 0.46, "eval_loss": 0.6938574910163879, "eval_runtime": 0.7902, "eval_samples_per_second": 126.558, "eval_steps_per_second": 3.797, "step": 50 }, { "epoch": 10.0, "step": 50, "total_flos": 24115496386560.0, "train_loss": 0.7265405237674714, "train_runtime": 165.0104, "train_samples_per_second": 24.241, "train_steps_per_second": 0.303 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 24115496386560.0, "train_batch_size": 10, "trial_name": null, "trial_params": null }