{ "best_metric": 0.7816455696202531, "best_model_checkpoint": "tsec_vit_model/checkpoint-276", "epoch": 9.873417721518987, "eval_steps": 500, "global_step": 390, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.25316455696202533, "grad_norm": 1.1729109287261963, "learning_rate": 1.282051282051282e-05, "loss": 0.5143, "step": 10 }, { "epoch": 0.5063291139240507, "grad_norm": 1.1752610206604004, "learning_rate": 2.564102564102564e-05, "loss": 0.559, "step": 20 }, { "epoch": 0.759493670886076, "grad_norm": 1.3603609800338745, "learning_rate": 3.846153846153846e-05, "loss": 0.4963, "step": 30 }, { "epoch": 0.9873417721518988, "eval_accuracy": 0.6724683544303798, "eval_loss": 0.5699456334114075, "eval_runtime": 4.299, "eval_samples_per_second": 147.012, "eval_steps_per_second": 9.305, "step": 39 }, { "epoch": 1.0126582278481013, "grad_norm": 2.1996357440948486, "learning_rate": 4.985754985754986e-05, "loss": 0.5522, "step": 40 }, { "epoch": 1.2658227848101267, "grad_norm": 1.0542365312576294, "learning_rate": 4.8433048433048433e-05, "loss": 0.4911, "step": 50 }, { "epoch": 1.518987341772152, "grad_norm": 1.4736473560333252, "learning_rate": 4.700854700854701e-05, "loss": 0.5448, "step": 60 }, { "epoch": 1.7721518987341773, "grad_norm": 1.3524911403656006, "learning_rate": 4.558404558404559e-05, "loss": 0.4959, "step": 70 }, { "epoch": 2.0, "eval_accuracy": 0.7151898734177216, "eval_loss": 0.5484605431556702, "eval_runtime": 4.1719, "eval_samples_per_second": 151.489, "eval_steps_per_second": 9.588, "step": 79 }, { "epoch": 2.0253164556962027, "grad_norm": 1.6714565753936768, "learning_rate": 4.415954415954416e-05, "loss": 0.4969, "step": 80 }, { "epoch": 2.278481012658228, "grad_norm": 1.8887039422988892, "learning_rate": 4.2735042735042735e-05, "loss": 0.4863, "step": 90 }, { "epoch": 2.5316455696202533, "grad_norm": 2.640259027481079, "learning_rate": 4.131054131054131e-05, "loss": 0.5075, "step": 100 }, { "epoch": 2.7848101265822782, "grad_norm": 1.167457103729248, "learning_rate": 3.988603988603989e-05, "loss": 0.4879, "step": 110 }, { "epoch": 2.9873417721518987, "eval_accuracy": 0.7689873417721519, "eval_loss": 0.48862743377685547, "eval_runtime": 4.3228, "eval_samples_per_second": 146.201, "eval_steps_per_second": 9.253, "step": 118 }, { "epoch": 3.037974683544304, "grad_norm": 1.0525858402252197, "learning_rate": 3.846153846153846e-05, "loss": 0.4946, "step": 120 }, { "epoch": 3.291139240506329, "grad_norm": 2.056792736053467, "learning_rate": 3.7037037037037037e-05, "loss": 0.4884, "step": 130 }, { "epoch": 3.5443037974683547, "grad_norm": 1.7059816122055054, "learning_rate": 3.561253561253561e-05, "loss": 0.4926, "step": 140 }, { "epoch": 3.7974683544303796, "grad_norm": 1.2786283493041992, "learning_rate": 3.418803418803419e-05, "loss": 0.5243, "step": 150 }, { "epoch": 4.0, "eval_accuracy": 0.7468354430379747, "eval_loss": 0.5133278965950012, "eval_runtime": 4.3787, "eval_samples_per_second": 144.335, "eval_steps_per_second": 9.135, "step": 158 }, { "epoch": 4.050632911392405, "grad_norm": 2.1460752487182617, "learning_rate": 3.2763532763532764e-05, "loss": 0.4983, "step": 160 }, { "epoch": 4.30379746835443, "grad_norm": 1.596248984336853, "learning_rate": 3.133903133903134e-05, "loss": 0.4936, "step": 170 }, { "epoch": 4.556962025316456, "grad_norm": 1.6535227298736572, "learning_rate": 2.9914529914529915e-05, "loss": 0.5049, "step": 180 }, { "epoch": 4.810126582278481, "grad_norm": 1.3324358463287354, "learning_rate": 2.8490028490028492e-05, "loss": 0.4654, "step": 190 }, { "epoch": 4.987341772151899, "eval_accuracy": 0.7515822784810127, "eval_loss": 0.49274376034736633, "eval_runtime": 4.3657, "eval_samples_per_second": 144.763, "eval_steps_per_second": 9.162, "step": 197 }, { "epoch": 5.063291139240507, "grad_norm": 1.212802529335022, "learning_rate": 2.706552706552707e-05, "loss": 0.4828, "step": 200 }, { "epoch": 5.3164556962025316, "grad_norm": 1.189599633216858, "learning_rate": 2.564102564102564e-05, "loss": 0.4816, "step": 210 }, { "epoch": 5.569620253164557, "grad_norm": 1.3671377897262573, "learning_rate": 2.4216524216524217e-05, "loss": 0.4709, "step": 220 }, { "epoch": 5.822784810126582, "grad_norm": 1.5688464641571045, "learning_rate": 2.2792022792022794e-05, "loss": 0.4776, "step": 230 }, { "epoch": 6.0, "eval_accuracy": 0.7642405063291139, "eval_loss": 0.4901277422904968, "eval_runtime": 4.2593, "eval_samples_per_second": 148.382, "eval_steps_per_second": 9.391, "step": 237 }, { "epoch": 6.075949367088608, "grad_norm": 1.6727008819580078, "learning_rate": 2.1367521367521368e-05, "loss": 0.4878, "step": 240 }, { "epoch": 6.329113924050633, "grad_norm": 1.2271509170532227, "learning_rate": 1.9943019943019945e-05, "loss": 0.4725, "step": 250 }, { "epoch": 6.582278481012658, "grad_norm": 1.5308541059494019, "learning_rate": 1.8518518518518518e-05, "loss": 0.444, "step": 260 }, { "epoch": 6.8354430379746836, "grad_norm": 1.6207118034362793, "learning_rate": 1.7094017094017095e-05, "loss": 0.4767, "step": 270 }, { "epoch": 6.987341772151899, "eval_accuracy": 0.7816455696202531, "eval_loss": 0.46520036458969116, "eval_runtime": 4.1881, "eval_samples_per_second": 150.905, "eval_steps_per_second": 9.551, "step": 276 }, { "epoch": 7.0886075949367084, "grad_norm": 1.3883824348449707, "learning_rate": 1.566951566951567e-05, "loss": 0.4649, "step": 280 }, { "epoch": 7.341772151898734, "grad_norm": 2.2651216983795166, "learning_rate": 1.4245014245014246e-05, "loss": 0.4644, "step": 290 }, { "epoch": 7.594936708860759, "grad_norm": 1.2660713195800781, "learning_rate": 1.282051282051282e-05, "loss": 0.4448, "step": 300 }, { "epoch": 7.848101265822785, "grad_norm": 1.8906290531158447, "learning_rate": 1.1396011396011397e-05, "loss": 0.4465, "step": 310 }, { "epoch": 8.0, "eval_accuracy": 0.7642405063291139, "eval_loss": 0.4795072674751282, "eval_runtime": 4.3585, "eval_samples_per_second": 145.004, "eval_steps_per_second": 9.177, "step": 316 }, { "epoch": 8.10126582278481, "grad_norm": 1.7846542596817017, "learning_rate": 9.971509971509972e-06, "loss": 0.4552, "step": 320 }, { "epoch": 8.354430379746836, "grad_norm": 1.2796516418457031, "learning_rate": 8.547008547008548e-06, "loss": 0.4284, "step": 330 }, { "epoch": 8.60759493670886, "grad_norm": 1.181015133857727, "learning_rate": 7.122507122507123e-06, "loss": 0.3985, "step": 340 }, { "epoch": 8.860759493670885, "grad_norm": 1.352704644203186, "learning_rate": 5.6980056980056985e-06, "loss": 0.467, "step": 350 }, { "epoch": 8.987341772151899, "eval_accuracy": 0.7484177215189873, "eval_loss": 0.4690525233745575, "eval_runtime": 4.2658, "eval_samples_per_second": 148.154, "eval_steps_per_second": 9.377, "step": 355 }, { "epoch": 9.113924050632912, "grad_norm": 1.3692480325698853, "learning_rate": 4.273504273504274e-06, "loss": 0.437, "step": 360 }, { "epoch": 9.367088607594937, "grad_norm": 2.6086838245391846, "learning_rate": 2.8490028490028492e-06, "loss": 0.4062, "step": 370 }, { "epoch": 9.620253164556962, "grad_norm": 1.352748155593872, "learning_rate": 1.4245014245014246e-06, "loss": 0.4365, "step": 380 }, { "epoch": 9.873417721518987, "grad_norm": 1.9576495885849, "learning_rate": 0.0, "loss": 0.4121, "step": 390 }, { "epoch": 9.873417721518987, "eval_accuracy": 0.7689873417721519, "eval_loss": 0.482129842042923, "eval_runtime": 4.3754, "eval_samples_per_second": 144.443, "eval_steps_per_second": 9.142, "step": 390 }, { "epoch": 9.873417721518987, "step": 390, "total_flos": 1.9321077770606223e+18, "train_loss": 0.4781974340096498, "train_runtime": 382.4209, "train_samples_per_second": 66.027, "train_steps_per_second": 1.02 } ], "logging_steps": 10, "max_steps": 390, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9321077770606223e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }