{ "best_metric": 1.38231360912323, "best_model_checkpoint": "miner_id_24/checkpoint-200", "epoch": 0.02160818950382195, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010804094751910974, "eval_loss": 1.7889565229415894, "eval_runtime": 537.8419, "eval_samples_per_second": 28.984, "eval_steps_per_second": 7.247, "step": 1 }, { "epoch": 0.00032412284255732923, "grad_norm": 0.8091393709182739, "learning_rate": 3e-05, "loss": 1.5699, "step": 3 }, { "epoch": 0.0006482456851146585, "grad_norm": 0.8223854303359985, "learning_rate": 6e-05, "loss": 1.7854, "step": 6 }, { "epoch": 0.0009723685276719876, "grad_norm": 0.8447192311286926, "learning_rate": 9e-05, "loss": 1.8246, "step": 9 }, { "epoch": 0.001296491370229317, "grad_norm": 0.8499614000320435, "learning_rate": 9.997266286704631e-05, "loss": 1.8264, "step": 12 }, { "epoch": 0.0016206142127866462, "grad_norm": 0.8196111917495728, "learning_rate": 9.98292246503335e-05, "loss": 1.7786, "step": 15 }, { "epoch": 0.0019447370553439753, "grad_norm": 0.7101804614067078, "learning_rate": 9.956320346634876e-05, "loss": 1.7327, "step": 18 }, { "epoch": 0.0022688598979013048, "grad_norm": 0.7729286551475525, "learning_rate": 9.917525374361912e-05, "loss": 1.7386, "step": 21 }, { "epoch": 0.002592982740458634, "grad_norm": 0.6755093336105347, "learning_rate": 9.86663298624003e-05, "loss": 1.4494, "step": 24 }, { "epoch": 0.002917105583015963, "grad_norm": 0.7253813743591309, "learning_rate": 9.803768380684242e-05, "loss": 1.4436, "step": 27 }, { "epoch": 0.0032412284255732924, "grad_norm": 0.5313703417778015, "learning_rate": 9.729086208503174e-05, "loss": 1.3522, "step": 30 }, { "epoch": 0.0035653512681306215, "grad_norm": 0.5184443593025208, "learning_rate": 9.642770192448536e-05, "loss": 1.2604, "step": 33 }, { "epoch": 0.0038894741106879505, "grad_norm": 0.6221433281898499, "learning_rate": 9.545032675245813e-05, "loss": 1.354, "step": 36 }, { "epoch": 0.00421359695324528, "grad_norm": 0.5940028429031372, "learning_rate": 9.43611409721806e-05, "loss": 1.3683, "step": 39 }, { "epoch": 0.0045377197958026095, "grad_norm": 0.5949783325195312, "learning_rate": 9.316282404787871e-05, "loss": 1.3312, "step": 42 }, { "epoch": 0.004861842638359939, "grad_norm": 0.5560756325721741, "learning_rate": 9.185832391312644e-05, "loss": 1.3331, "step": 45 }, { "epoch": 0.005185965480917268, "grad_norm": 0.5510199666023254, "learning_rate": 9.045084971874738e-05, "loss": 1.2569, "step": 48 }, { "epoch": 0.005402047375955487, "eval_loss": 1.4472901821136475, "eval_runtime": 542.7445, "eval_samples_per_second": 28.723, "eval_steps_per_second": 7.182, "step": 50 }, { "epoch": 0.005510088323474597, "grad_norm": 0.560280978679657, "learning_rate": 8.894386393810563e-05, "loss": 1.3376, "step": 51 }, { "epoch": 0.005834211166031926, "grad_norm": 0.4924287497997284, "learning_rate": 8.73410738492077e-05, "loss": 1.4859, "step": 54 }, { "epoch": 0.006158334008589256, "grad_norm": 0.5823892951011658, "learning_rate": 8.564642241456986e-05, "loss": 1.6142, "step": 57 }, { "epoch": 0.006482456851146585, "grad_norm": 0.4990837275981903, "learning_rate": 8.386407858128706e-05, "loss": 1.6329, "step": 60 }, { "epoch": 0.006806579693703914, "grad_norm": 0.5565041899681091, "learning_rate": 8.199842702516583e-05, "loss": 1.6867, "step": 63 }, { "epoch": 0.007130702536261243, "grad_norm": 0.5534407496452332, "learning_rate": 8.005405736415126e-05, "loss": 1.6361, "step": 66 }, { "epoch": 0.007454825378818572, "grad_norm": 0.6305408477783203, "learning_rate": 7.803575286758364e-05, "loss": 1.5459, "step": 69 }, { "epoch": 0.007778948221375901, "grad_norm": 0.6603249311447144, "learning_rate": 7.594847868906076e-05, "loss": 1.6251, "step": 72 }, { "epoch": 0.00810307106393323, "grad_norm": 0.6320300698280334, "learning_rate": 7.379736965185368e-05, "loss": 1.3576, "step": 75 }, { "epoch": 0.00842719390649056, "grad_norm": 0.5601158142089844, "learning_rate": 7.158771761692464e-05, "loss": 1.3429, "step": 78 }, { "epoch": 0.00875131674904789, "grad_norm": 0.5731490850448608, "learning_rate": 6.932495846462261e-05, "loss": 1.3739, "step": 81 }, { "epoch": 0.009075439591605219, "grad_norm": 0.5410912036895752, "learning_rate": 6.701465872208216e-05, "loss": 1.2813, "step": 84 }, { "epoch": 0.009399562434162548, "grad_norm": 0.5254283547401428, "learning_rate": 6.466250186922325e-05, "loss": 1.2508, "step": 87 }, { "epoch": 0.009723685276719877, "grad_norm": 0.5095140933990479, "learning_rate": 6.227427435703997e-05, "loss": 1.204, "step": 90 }, { "epoch": 0.010047808119277206, "grad_norm": 0.49006080627441406, "learning_rate": 5.985585137257401e-05, "loss": 1.2471, "step": 93 }, { "epoch": 0.010371930961834535, "grad_norm": 0.5014355778694153, "learning_rate": 5.74131823855921e-05, "loss": 1.2864, "step": 96 }, { "epoch": 0.010696053804391864, "grad_norm": 0.53614342212677, "learning_rate": 5.495227651252315e-05, "loss": 1.2349, "step": 99 }, { "epoch": 0.010804094751910975, "eval_loss": 1.4096521139144897, "eval_runtime": 542.4184, "eval_samples_per_second": 28.74, "eval_steps_per_second": 7.186, "step": 100 }, { "epoch": 0.011020176646949193, "grad_norm": 0.49735087156295776, "learning_rate": 5.247918773366112e-05, "loss": 1.3988, "step": 102 }, { "epoch": 0.011344299489506523, "grad_norm": 0.4606347382068634, "learning_rate": 5e-05, "loss": 1.5228, "step": 105 }, { "epoch": 0.011668422332063852, "grad_norm": 0.5091155171394348, "learning_rate": 4.7520812266338885e-05, "loss": 1.4814, "step": 108 }, { "epoch": 0.01199254517462118, "grad_norm": 0.5642185211181641, "learning_rate": 4.504772348747687e-05, "loss": 1.6132, "step": 111 }, { "epoch": 0.012316668017178511, "grad_norm": 0.5649376511573792, "learning_rate": 4.2586817614407895e-05, "loss": 1.5986, "step": 114 }, { "epoch": 0.01264079085973584, "grad_norm": 0.5556029081344604, "learning_rate": 4.0144148627425993e-05, "loss": 1.606, "step": 117 }, { "epoch": 0.01296491370229317, "grad_norm": 0.6111022233963013, "learning_rate": 3.772572564296005e-05, "loss": 1.5919, "step": 120 }, { "epoch": 0.013289036544850499, "grad_norm": 0.7231109142303467, "learning_rate": 3.533749813077677e-05, "loss": 1.45, "step": 123 }, { "epoch": 0.013613159387407828, "grad_norm": 0.570808470249176, "learning_rate": 3.298534127791785e-05, "loss": 1.3601, "step": 126 }, { "epoch": 0.013937282229965157, "grad_norm": 0.5434361696243286, "learning_rate": 3.0675041535377405e-05, "loss": 1.3497, "step": 129 }, { "epoch": 0.014261405072522486, "grad_norm": 0.4750564992427826, "learning_rate": 2.8412282383075363e-05, "loss": 1.2799, "step": 132 }, { "epoch": 0.014585527915079815, "grad_norm": 0.5212507843971252, "learning_rate": 2.6202630348146324e-05, "loss": 1.2695, "step": 135 }, { "epoch": 0.014909650757637144, "grad_norm": 0.5255863070487976, "learning_rate": 2.405152131093926e-05, "loss": 1.2256, "step": 138 }, { "epoch": 0.015233773600194473, "grad_norm": 0.5165453553199768, "learning_rate": 2.196424713241637e-05, "loss": 1.2671, "step": 141 }, { "epoch": 0.015557896442751802, "grad_norm": 0.5220006704330444, "learning_rate": 1.9945942635848748e-05, "loss": 1.2104, "step": 144 }, { "epoch": 0.01588201928530913, "grad_norm": 0.48809775710105896, "learning_rate": 1.800157297483417e-05, "loss": 1.206, "step": 147 }, { "epoch": 0.01620614212786646, "grad_norm": 0.6223570108413696, "learning_rate": 1.6135921418712956e-05, "loss": 1.1717, "step": 150 }, { "epoch": 0.01620614212786646, "eval_loss": 1.3886877298355103, "eval_runtime": 541.2859, "eval_samples_per_second": 28.8, "eval_steps_per_second": 7.201, "step": 150 }, { "epoch": 0.01653026497042379, "grad_norm": 0.5191346406936646, "learning_rate": 1.435357758543015e-05, "loss": 1.5168, "step": 153 }, { "epoch": 0.01685438781298112, "grad_norm": 0.5353146195411682, "learning_rate": 1.2658926150792322e-05, "loss": 1.4967, "step": 156 }, { "epoch": 0.017178510655538447, "grad_norm": 0.5311741828918457, "learning_rate": 1.1056136061894384e-05, "loss": 1.5718, "step": 159 }, { "epoch": 0.01750263349809578, "grad_norm": 0.5845088362693787, "learning_rate": 9.549150281252633e-06, "loss": 1.5607, "step": 162 }, { "epoch": 0.01782675634065311, "grad_norm": 0.5743259191513062, "learning_rate": 8.141676086873572e-06, "loss": 1.581, "step": 165 }, { "epoch": 0.018150879183210438, "grad_norm": 0.6573442220687866, "learning_rate": 6.837175952121306e-06, "loss": 1.6465, "step": 168 }, { "epoch": 0.018475002025767767, "grad_norm": 0.6954129934310913, "learning_rate": 5.6388590278194096e-06, "loss": 1.5841, "step": 171 }, { "epoch": 0.018799124868325096, "grad_norm": 0.5963082909584045, "learning_rate": 4.549673247541875e-06, "loss": 1.4765, "step": 174 }, { "epoch": 0.019123247710882425, "grad_norm": 0.44104716181755066, "learning_rate": 3.5722980755146517e-06, "loss": 1.2437, "step": 177 }, { "epoch": 0.019447370553439754, "grad_norm": 0.5068080425262451, "learning_rate": 2.7091379149682685e-06, "loss": 1.2188, "step": 180 }, { "epoch": 0.019771493395997083, "grad_norm": 0.4765148460865021, "learning_rate": 1.962316193157593e-06, "loss": 1.2455, "step": 183 }, { "epoch": 0.020095616238554413, "grad_norm": 0.5346724390983582, "learning_rate": 1.333670137599713e-06, "loss": 1.3474, "step": 186 }, { "epoch": 0.02041973908111174, "grad_norm": 0.5100587606430054, "learning_rate": 8.247462563808817e-07, "loss": 1.262, "step": 189 }, { "epoch": 0.02074386192366907, "grad_norm": 0.48295655846595764, "learning_rate": 4.367965336512403e-07, "loss": 1.3058, "step": 192 }, { "epoch": 0.0210679847662264, "grad_norm": 0.5347855687141418, "learning_rate": 1.7077534966650766e-07, "loss": 1.2393, "step": 195 }, { "epoch": 0.02139210760878373, "grad_norm": 0.5421356558799744, "learning_rate": 2.7337132953697554e-08, "loss": 1.2671, "step": 198 }, { "epoch": 0.02160818950382195, "eval_loss": 1.38231360912323, "eval_runtime": 542.45, "eval_samples_per_second": 28.738, "eval_steps_per_second": 7.186, "step": 200 } ], "logging_steps": 3, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.8039887118336e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }