{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.699453551912568, "eval_steps": 25, "global_step": 44, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08743169398907104, "grad_norm": 22.347578048706055, "learning_rate": 0.00019545454545454548, "loss": 9.5025, "step": 1 }, { "epoch": 0.17486338797814208, "grad_norm": 25.239198684692383, "learning_rate": 0.00019090909090909092, "loss": 9.3395, "step": 2 }, { "epoch": 0.26229508196721313, "grad_norm": 32.5155143737793, "learning_rate": 0.00018636363636363636, "loss": 9.064, "step": 3 }, { "epoch": 0.34972677595628415, "grad_norm": 52.9838752746582, "learning_rate": 0.00018181818181818183, "loss": 8.4152, "step": 4 }, { "epoch": 0.4371584699453552, "grad_norm": 37.703086853027344, "learning_rate": 0.00017727272727272728, "loss": 8.2592, "step": 5 }, { "epoch": 0.5245901639344263, "grad_norm": 51.64411926269531, "learning_rate": 0.00017272727272727275, "loss": 7.5229, "step": 6 }, { "epoch": 0.6120218579234973, "grad_norm": 51.453033447265625, "learning_rate": 0.0001681818181818182, "loss": 7.8703, "step": 7 }, { "epoch": 0.6994535519125683, "grad_norm": 49.968597412109375, "learning_rate": 0.00016363636363636366, "loss": 7.4822, "step": 8 }, { "epoch": 0.7868852459016393, "grad_norm": 51.338375091552734, "learning_rate": 0.0001590909090909091, "loss": 7.1884, "step": 9 }, { "epoch": 0.8743169398907104, "grad_norm": 31.432743072509766, "learning_rate": 0.00015454545454545454, "loss": 6.8064, "step": 10 }, { "epoch": 0.9617486338797814, "grad_norm": 38.92168045043945, "learning_rate": 0.00015000000000000001, "loss": 7.1349, "step": 11 }, { "epoch": 1.0, "grad_norm": 29.082275390625, "learning_rate": 0.00014545454545454546, "loss": 2.7656, "step": 12 }, { "epoch": 1.0874316939890711, "grad_norm": 45.3143424987793, "learning_rate": 0.00014090909090909093, "loss": 6.6156, "step": 13 }, { "epoch": 1.174863387978142, "grad_norm": 21.88313865661621, "learning_rate": 0.00013636363636363637, "loss": 6.798, "step": 14 }, { "epoch": 1.2622950819672132, "grad_norm": 17.01368522644043, "learning_rate": 0.0001318181818181818, "loss": 6.6264, "step": 15 }, { "epoch": 1.349726775956284, "grad_norm": 34.124168395996094, "learning_rate": 0.00012727272727272728, "loss": 6.2059, "step": 16 }, { "epoch": 1.4371584699453552, "grad_norm": 13.495098114013672, "learning_rate": 0.00012272727272727272, "loss": 6.5543, "step": 17 }, { "epoch": 1.5245901639344264, "grad_norm": 15.081934928894043, "learning_rate": 0.0001181818181818182, "loss": 6.5956, "step": 18 }, { "epoch": 1.6120218579234973, "grad_norm": 16.291242599487305, "learning_rate": 0.00011363636363636365, "loss": 6.4254, "step": 19 }, { "epoch": 1.6994535519125682, "grad_norm": 26.84221839904785, "learning_rate": 0.00010909090909090909, "loss": 6.2627, "step": 20 }, { "epoch": 1.7868852459016393, "grad_norm": 22.350202560424805, "learning_rate": 0.00010454545454545455, "loss": 6.1832, "step": 21 }, { "epoch": 1.8743169398907105, "grad_norm": 16.34981346130371, "learning_rate": 0.0001, "loss": 6.6073, "step": 22 }, { "epoch": 1.9617486338797814, "grad_norm": 18.4270076751709, "learning_rate": 9.545454545454546e-05, "loss": 6.5258, "step": 23 }, { "epoch": 2.0, "grad_norm": 15.420706748962402, "learning_rate": 9.090909090909092e-05, "loss": 2.5959, "step": 24 }, { "epoch": 2.087431693989071, "grad_norm": 11.540351867675781, "learning_rate": 8.636363636363637e-05, "loss": 6.334, "step": 25 }, { "epoch": 2.087431693989071, "eval_clap": 0.18773917853832245, "eval_loss": 6.105246543884277, "eval_runtime": 83.5333, "eval_samples_per_second": 0.096, "eval_steps_per_second": 0.096, "step": 25 }, { "epoch": 2.1748633879781423, "grad_norm": 21.185924530029297, "learning_rate": 8.181818181818183e-05, "loss": 6.109, "step": 26 }, { "epoch": 2.262295081967213, "grad_norm": 13.807082176208496, "learning_rate": 7.727272727272727e-05, "loss": 6.4383, "step": 27 }, { "epoch": 2.349726775956284, "grad_norm": 17.374765396118164, "learning_rate": 7.272727272727273e-05, "loss": 6.0599, "step": 28 }, { "epoch": 2.4371584699453552, "grad_norm": 12.8378324508667, "learning_rate": 6.818181818181818e-05, "loss": 6.2779, "step": 29 }, { "epoch": 2.5245901639344264, "grad_norm": 16.53411865234375, "learning_rate": 6.363636363636364e-05, "loss": 6.3555, "step": 30 }, { "epoch": 2.612021857923497, "grad_norm": 13.256036758422852, "learning_rate": 5.90909090909091e-05, "loss": 6.1763, "step": 31 }, { "epoch": 2.699453551912568, "grad_norm": 19.911523818969727, "learning_rate": 5.4545454545454546e-05, "loss": 6.5248, "step": 32 }, { "epoch": 2.7868852459016393, "grad_norm": 12.554062843322754, "learning_rate": 5e-05, "loss": 6.0678, "step": 33 }, { "epoch": 2.8743169398907105, "grad_norm": 9.20517635345459, "learning_rate": 4.545454545454546e-05, "loss": 6.2365, "step": 34 }, { "epoch": 2.9617486338797816, "grad_norm": 19.72951316833496, "learning_rate": 4.0909090909090915e-05, "loss": 6.5335, "step": 35 }, { "epoch": 3.0, "grad_norm": 7.918391704559326, "learning_rate": 3.6363636363636364e-05, "loss": 2.8033, "step": 36 }, { "epoch": 3.087431693989071, "grad_norm": 10.007157325744629, "learning_rate": 3.181818181818182e-05, "loss": 6.2187, "step": 37 }, { "epoch": 3.1748633879781423, "grad_norm": 18.43683433532715, "learning_rate": 2.7272727272727273e-05, "loss": 6.1133, "step": 38 }, { "epoch": 3.262295081967213, "grad_norm": 11.159557342529297, "learning_rate": 2.272727272727273e-05, "loss": 6.1788, "step": 39 }, { "epoch": 3.349726775956284, "grad_norm": 15.116974830627441, "learning_rate": 1.8181818181818182e-05, "loss": 6.1412, "step": 40 }, { "epoch": 3.4371584699453552, "grad_norm": 14.885272026062012, "learning_rate": 1.3636363636363637e-05, "loss": 6.2777, "step": 41 }, { "epoch": 3.5245901639344264, "grad_norm": 9.910028457641602, "learning_rate": 9.090909090909091e-06, "loss": 6.4341, "step": 42 }, { "epoch": 3.612021857923497, "grad_norm": 15.322835922241211, "learning_rate": 4.5454545454545455e-06, "loss": 6.0193, "step": 43 }, { "epoch": 3.699453551912568, "grad_norm": 20.181869506835938, "learning_rate": 0.0, "loss": 6.1425, "step": 44 }, { "epoch": 3.699453551912568, "step": 44, "total_flos": 212119392152640.0, "train_loss": 6.517944124611941, "train_runtime": 2793.5932, "train_samples_per_second": 0.262, "train_steps_per_second": 0.016 } ], "logging_steps": 1.0, "max_steps": 44, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 212119392152640.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }