{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 500, "global_step": 4770, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9968553459119497, "grad_norm": 0.3008987009525299, "learning_rate": 0.00021943281418587574, "loss": 0.1836, "step": 317 }, { "epoch": 1.0, "eval_accuracy": 0.8977419354838709, "eval_loss": 0.048293426632881165, "eval_runtime": 5.4409, "eval_samples_per_second": 569.756, "eval_steps_per_second": 11.946, "step": 318 }, { "epoch": 1.9937106918238994, "grad_norm": 0.1929541975259781, "learning_rate": 0.000212311269269539, "loss": 0.0436, "step": 634 }, { "epoch": 2.0, "eval_accuracy": 0.9235483870967742, "eval_loss": 0.03785553574562073, "eval_runtime": 5.4055, "eval_samples_per_second": 573.489, "eval_steps_per_second": 12.025, "step": 636 }, { "epoch": 2.990566037735849, "grad_norm": 0.1676182746887207, "learning_rate": 0.00020078619740999677, "loss": 0.0318, "step": 951 }, { "epoch": 3.0, "eval_accuracy": 0.9332258064516129, "eval_loss": 0.029865220189094543, "eval_runtime": 5.3842, "eval_samples_per_second": 575.754, "eval_steps_per_second": 12.072, "step": 954 }, { "epoch": 3.9874213836477987, "grad_norm": 0.14221236109733582, "learning_rate": 0.00018535814809928703, "loss": 0.0243, "step": 1268 }, { "epoch": 4.0, "eval_accuracy": 0.9403225806451613, "eval_loss": 0.027224214747548103, "eval_runtime": 5.4527, "eval_samples_per_second": 568.529, "eval_steps_per_second": 11.921, "step": 1272 }, { "epoch": 4.984276729559748, "grad_norm": 0.08744881302118301, "learning_rate": 0.00016669718243028002, "loss": 0.0211, "step": 1585 }, { "epoch": 5.0, "eval_accuracy": 0.9432258064516129, "eval_loss": 0.023913592100143433, "eval_runtime": 5.4569, "eval_samples_per_second": 568.09, "eval_steps_per_second": 11.912, "step": 1590 }, { "epoch": 5.981132075471698, "grad_norm": 0.08495471626520157, "learning_rate": 0.00014561377143520847, "loss": 0.019, "step": 1902 }, { "epoch": 6.0, "eval_accuracy": 0.9419354838709677, "eval_loss": 0.023042848333716393, "eval_runtime": 5.4211, "eval_samples_per_second": 571.836, "eval_steps_per_second": 11.99, "step": 1908 }, { "epoch": 6.977987421383648, "grad_norm": 0.05824749171733856, "learning_rate": 0.00012302359623021862, "loss": 0.0177, "step": 2219 }, { "epoch": 7.0, "eval_accuracy": 0.94, "eval_loss": 0.022627023980021477, "eval_runtime": 5.3959, "eval_samples_per_second": 574.514, "eval_steps_per_second": 12.046, "step": 2226 }, { "epoch": 7.9748427672955975, "grad_norm": 0.07080487161874771, "learning_rate": 9.990777874337415e-05, "loss": 0.0165, "step": 2536 }, { "epoch": 8.0, "eval_accuracy": 0.9406451612903226, "eval_loss": 0.02114924229681492, "eval_runtime": 5.4349, "eval_samples_per_second": 570.39, "eval_steps_per_second": 11.96, "step": 2544 }, { "epoch": 8.971698113207546, "grad_norm": 0.06504567712545395, "learning_rate": 7.727027025949969e-05, "loss": 0.0154, "step": 2853 }, { "epoch": 9.0, "eval_accuracy": 0.9458064516129032, "eval_loss": 0.019661111757159233, "eval_runtime": 5.4444, "eval_samples_per_second": 569.397, "eval_steps_per_second": 11.939, "step": 2862 }, { "epoch": 9.968553459119496, "grad_norm": 0.05454040318727493, "learning_rate": 5.6094248455116685e-05, "loss": 0.0146, "step": 3170 }, { "epoch": 10.0, "eval_accuracy": 0.9483870967741935, "eval_loss": 0.01887706108391285, "eval_runtime": 5.4106, "eval_samples_per_second": 572.947, "eval_steps_per_second": 12.013, "step": 3180 }, { "epoch": 10.965408805031446, "grad_norm": 0.06293663382530212, "learning_rate": 3.7299416659345335e-05, "loss": 0.0137, "step": 3487 }, { "epoch": 11.0, "eval_accuracy": 0.9487096774193549, "eval_loss": 0.0184471495449543, "eval_runtime": 5.408, "eval_samples_per_second": 573.222, "eval_steps_per_second": 12.019, "step": 3498 }, { "epoch": 11.962264150943396, "grad_norm": 0.04655018821358681, "learning_rate": 2.170205989174698e-05, "loss": 0.0132, "step": 3804 }, { "epoch": 12.0, "eval_accuracy": 0.9480645161290323, "eval_loss": 0.01816466823220253, "eval_runtime": 5.4333, "eval_samples_per_second": 570.554, "eval_steps_per_second": 11.963, "step": 3816 }, { "epoch": 12.959119496855346, "grad_norm": 0.0534706749022007, "learning_rate": 9.979592497525249e-06, "loss": 0.0128, "step": 4121 }, { "epoch": 13.0, "eval_accuracy": 0.947741935483871, "eval_loss": 0.01782587543129921, "eval_runtime": 5.4477, "eval_samples_per_second": 569.044, "eval_steps_per_second": 11.932, "step": 4134 }, { "epoch": 13.955974842767295, "grad_norm": 0.04927356168627739, "learning_rate": 2.641137124340782e-06, "loss": 0.0126, "step": 4438 }, { "epoch": 14.0, "eval_accuracy": 0.9487096774193549, "eval_loss": 0.017633505165576935, "eval_runtime": 5.4206, "eval_samples_per_second": 571.896, "eval_steps_per_second": 11.991, "step": 4452 }, { "epoch": 14.952830188679245, "grad_norm": 0.049865659326314926, "learning_rate": 5.412835654343996e-09, "loss": 0.0125, "step": 4755 } ], "logging_steps": 317, "max_steps": 4770, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 1000000000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1236796014635076.0, "train_batch_size": 48, "trial_name": null, "trial_params": { "alpha": 0.18249968951489548, "learning_rate": 0.0002218415338106894, "lr_scheduler_type": "cosine", "num_train_epochs": 15, "temperature": 8.914270069493725, "weight_decay": 0.05600918336516608 } }