{
  "best_metric": 0.14178870618343353,
  "best_model_checkpoint": "t5/checkpoint-63492",
  "epoch": 22.0,
  "eval_steps": 500,
  "global_step": 107448,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "grad_norm": 0.4002791941165924,
      "learning_rate": 0.00019750000000000003,
      "loss": 0.3509,
      "step": 4884
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.05755138360609973,
      "eval_loss": 0.21836575865745544,
      "eval_runtime": 1442.1861,
      "eval_samples_per_second": 79.482,
      "eval_steps_per_second": 0.311,
      "step": 4884
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.3467308580875397,
      "learning_rate": 0.000195,
      "loss": 0.2125,
      "step": 9768
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.077276058205674,
      "eval_loss": 0.18824860453605652,
      "eval_runtime": 1440.8058,
      "eval_samples_per_second": 79.558,
      "eval_steps_per_second": 0.311,
      "step": 9768
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.3095360994338989,
      "learning_rate": 0.00019250000000000002,
      "loss": 0.1842,
      "step": 14652
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.09321457235579439,
      "eval_loss": 0.17374937236309052,
      "eval_runtime": 1440.3978,
      "eval_samples_per_second": 79.581,
      "eval_steps_per_second": 0.311,
      "step": 14652
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.26522552967071533,
      "learning_rate": 0.00019,
      "loss": 0.1669,
      "step": 19536
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.10587291063265519,
      "eval_loss": 0.16474950313568115,
      "eval_runtime": 1439.905,
      "eval_samples_per_second": 79.608,
      "eval_steps_per_second": 0.311,
      "step": 19536
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.22116659581661224,
      "learning_rate": 0.0001875,
      "loss": 0.1544,
      "step": 24420
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.11401228321178071,
      "eval_loss": 0.15819723904132843,
      "eval_runtime": 1441.587,
      "eval_samples_per_second": 79.515,
      "eval_steps_per_second": 0.311,
      "step": 24420
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.2789860963821411,
      "learning_rate": 0.00018500000000000002,
      "loss": 0.1444,
      "step": 29304
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.12168056670272534,
      "eval_loss": 0.15368323028087616,
      "eval_runtime": 1440.4412,
      "eval_samples_per_second": 79.578,
      "eval_steps_per_second": 0.311,
      "step": 29304
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.24488751590251923,
      "learning_rate": 0.0001825,
      "loss": 0.1359,
      "step": 34188
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.12852880622535506,
      "eval_loss": 0.15017201006412506,
      "eval_runtime": 1439.8603,
      "eval_samples_per_second": 79.611,
      "eval_steps_per_second": 0.311,
      "step": 34188
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.2019621729850769,
      "learning_rate": 0.00018,
      "loss": 0.1284,
      "step": 39072
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.1320968698747252,
      "eval_loss": 0.1474558413028717,
      "eval_runtime": 1440.5681,
      "eval_samples_per_second": 79.571,
      "eval_steps_per_second": 0.311,
      "step": 39072
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.24134604632854462,
      "learning_rate": 0.0001775,
      "loss": 0.1218,
      "step": 43956
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.13765397634085913,
      "eval_loss": 0.14563630521297455,
      "eval_runtime": 1439.521,
      "eval_samples_per_second": 79.629,
      "eval_steps_per_second": 0.311,
      "step": 43956
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.21248851716518402,
      "learning_rate": 0.000175,
      "loss": 0.1156,
      "step": 48840
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.14216421816659106,
      "eval_loss": 0.14398989081382751,
      "eval_runtime": 1439.8066,
      "eval_samples_per_second": 79.613,
      "eval_steps_per_second": 0.311,
      "step": 48840
    },
    {
      "epoch": 11.0,
      "grad_norm": 0.18349188566207886,
      "learning_rate": 0.00017250000000000002,
      "loss": 0.11,
      "step": 53724
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.14525246885577695,
      "eval_loss": 0.14398634433746338,
      "eval_runtime": 1441.3594,
      "eval_samples_per_second": 79.528,
      "eval_steps_per_second": 0.311,
      "step": 53724
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.19695980846881866,
      "learning_rate": 0.00017,
      "loss": 0.1049,
      "step": 58608
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.1470757581044771,
      "eval_loss": 0.14296095073223114,
      "eval_runtime": 1438.3626,
      "eval_samples_per_second": 79.693,
      "eval_steps_per_second": 0.311,
      "step": 58608
    },
    {
      "epoch": 13.0,
      "grad_norm": 0.19672971963882446,
      "learning_rate": 0.0001675,
      "loss": 0.1001,
      "step": 63492
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.14978888229751894,
      "eval_loss": 0.14178870618343353,
      "eval_runtime": 1440.3056,
      "eval_samples_per_second": 79.586,
      "eval_steps_per_second": 0.311,
      "step": 63492
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.192398801445961,
      "learning_rate": 0.000165,
      "loss": 0.0956,
      "step": 68376
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.15094915727396446,
      "eval_loss": 0.1433423012495041,
      "eval_runtime": 1438.7215,
      "eval_samples_per_second": 79.674,
      "eval_steps_per_second": 0.311,
      "step": 68376
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.17484577000141144,
      "learning_rate": 0.00016250000000000002,
      "loss": 0.0914,
      "step": 73260
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.15243221551453398,
      "eval_loss": 0.14287354052066803,
      "eval_runtime": 1440.6167,
      "eval_samples_per_second": 79.569,
      "eval_steps_per_second": 0.311,
      "step": 73260
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.16681896150112152,
      "learning_rate": 0.00016,
      "loss": 0.0874,
      "step": 78144
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.15390654988310012,
      "eval_loss": 0.14442311227321625,
      "eval_runtime": 1440.0503,
      "eval_samples_per_second": 79.6,
      "eval_steps_per_second": 0.311,
      "step": 78144
    },
    {
      "epoch": 17.0,
      "grad_norm": 0.19815337657928467,
      "learning_rate": 0.0001575,
      "loss": 0.0837,
      "step": 83028
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.1557036675157902,
      "eval_loss": 0.14355292916297913,
      "eval_runtime": 1439.7841,
      "eval_samples_per_second": 79.615,
      "eval_steps_per_second": 0.311,
      "step": 83028
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.17696212232112885,
      "learning_rate": 0.000155,
      "loss": 0.0802,
      "step": 87912
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.15677670377220226,
      "eval_loss": 0.1449592113494873,
      "eval_runtime": 1439.963,
      "eval_samples_per_second": 79.605,
      "eval_steps_per_second": 0.311,
      "step": 87912
    },
    {
      "epoch": 19.0,
      "grad_norm": 0.1683954894542694,
      "learning_rate": 0.0001525,
      "loss": 0.0769,
      "step": 92796
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.15593921205988065,
      "eval_loss": 0.14668723940849304,
      "eval_runtime": 1441.0988,
      "eval_samples_per_second": 79.542,
      "eval_steps_per_second": 0.311,
      "step": 92796
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.20112481713294983,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.0738,
      "step": 97680
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.15597410754789406,
      "eval_loss": 0.1498769223690033,
      "eval_runtime": 1441.1799,
      "eval_samples_per_second": 79.538,
      "eval_steps_per_second": 0.311,
      "step": 97680
    },
    {
      "epoch": 21.0,
      "grad_norm": 0.16424116492271423,
      "learning_rate": 0.0001475,
      "loss": 0.0709,
      "step": 102564
    },
    {
      "epoch": 21.0,
      "eval_accuracy": 0.1572041735003664,
      "eval_loss": 0.14935219287872314,
      "eval_runtime": 1439.3479,
      "eval_samples_per_second": 79.639,
      "eval_steps_per_second": 0.311,
      "step": 102564
    },
    {
      "epoch": 22.0,
      "grad_norm": 0.16410428285598755,
      "learning_rate": 0.000145,
      "loss": 0.068,
      "step": 107448
    },
    {
      "epoch": 22.0,
      "eval_accuracy": 0.157518232892487,
      "eval_loss": 0.15158186852931976,
      "eval_runtime": 1441.5687,
      "eval_samples_per_second": 79.516,
      "eval_steps_per_second": 0.311,
      "step": 107448
    }
  ],
  "logging_steps": 500,
  "max_steps": 390720,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 80,
  "save_steps": 500,
  "total_flos": 3.224876944629031e+18,
  "train_batch_size": 256,
  "trial_name": null,
  "trial_params": null
}