{ "best_metric": 0.9532889723777771, "best_model_checkpoint": "miner_id_24/checkpoint-150", "epoch": 3.0093457943925235, "eval_steps": 50, "global_step": 161, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018691588785046728, "eval_loss": 1.7978484630584717, "eval_runtime": 1.7929, "eval_samples_per_second": 50.198, "eval_steps_per_second": 12.828, "step": 1 }, { "epoch": 0.056074766355140186, "grad_norm": 7.484181880950928, "learning_rate": 3e-05, "loss": 7.3462, "step": 3 }, { "epoch": 0.11214953271028037, "grad_norm": 5.32301139831543, "learning_rate": 6e-05, "loss": 7.0775, "step": 6 }, { "epoch": 0.16822429906542055, "grad_norm": 4.536668300628662, "learning_rate": 9e-05, "loss": 6.1639, "step": 9 }, { "epoch": 0.22429906542056074, "grad_norm": 4.83793830871582, "learning_rate": 9.995672040508655e-05, "loss": 5.9664, "step": 12 }, { "epoch": 0.2803738317757009, "grad_norm": 4.512387752532959, "learning_rate": 9.972970737805311e-05, "loss": 5.4463, "step": 15 }, { "epoch": 0.3364485981308411, "grad_norm": 3.3357937335968018, "learning_rate": 9.930902394260747e-05, "loss": 5.397, "step": 18 }, { "epoch": 0.3925233644859813, "grad_norm": 2.8627398014068604, "learning_rate": 9.86963084340033e-05, "loss": 5.2486, "step": 21 }, { "epoch": 0.4485981308411215, "grad_norm": 2.5798230171203613, "learning_rate": 9.789394704892364e-05, "loss": 5.1503, "step": 24 }, { "epoch": 0.5046728971962616, "grad_norm": 3.5169315338134766, "learning_rate": 9.690506455253073e-05, "loss": 5.0864, "step": 27 }, { "epoch": 0.5607476635514018, "grad_norm": 2.785013198852539, "learning_rate": 9.573351210918974e-05, "loss": 4.7364, "step": 30 }, { "epoch": 0.616822429906542, "grad_norm": 2.747760772705078, "learning_rate": 9.438385228425938e-05, "loss": 4.7173, "step": 33 }, { "epoch": 0.6728971962616822, "grad_norm": 2.9312195777893066, "learning_rate": 9.286134127535859e-05, "loss": 4.5421, "step": 36 }, { "epoch": 0.7289719626168224, "grad_norm": 2.8997135162353516, "learning_rate": 9.117190844230971e-05, "loss": 4.6737, "step": 39 }, { "epoch": 0.7850467289719626, "grad_norm": 2.88925838470459, "learning_rate": 8.93221332154777e-05, "loss": 4.7061, "step": 42 }, { "epoch": 0.8411214953271028, "grad_norm": 2.7651543617248535, "learning_rate": 8.731921947243469e-05, "loss": 4.4704, "step": 45 }, { "epoch": 0.897196261682243, "grad_norm": 2.9074134826660156, "learning_rate": 8.517096748273951e-05, "loss": 4.566, "step": 48 }, { "epoch": 0.9345794392523364, "eval_loss": 1.0549684762954712, "eval_runtime": 1.798, "eval_samples_per_second": 50.055, "eval_steps_per_second": 12.792, "step": 50 }, { "epoch": 0.9532710280373832, "grad_norm": 2.6287901401519775, "learning_rate": 8.288574353009164e-05, "loss": 4.5007, "step": 51 }, { "epoch": 1.0093457943925233, "grad_norm": 2.882777214050293, "learning_rate": 8.047244733016522e-05, "loss": 4.6791, "step": 54 }, { "epoch": 1.0654205607476634, "grad_norm": 2.7834925651550293, "learning_rate": 7.794047737101297e-05, "loss": 4.2182, "step": 57 }, { "epoch": 1.1214953271028036, "grad_norm": 2.7983498573303223, "learning_rate": 7.529969431102064e-05, "loss": 4.0828, "step": 60 }, { "epoch": 1.1775700934579438, "grad_norm": 3.095180034637451, "learning_rate": 7.256038257695687e-05, "loss": 4.0762, "step": 63 }, { "epoch": 1.233644859813084, "grad_norm": 3.032648801803589, "learning_rate": 6.973321031167383e-05, "loss": 4.3798, "step": 66 }, { "epoch": 1.2897196261682242, "grad_norm": 3.197693347930908, "learning_rate": 6.682918782744032e-05, "loss": 4.1759, "step": 69 }, { "epoch": 1.3457943925233644, "grad_norm": 3.1764252185821533, "learning_rate": 6.385962472670953e-05, "loss": 4.1947, "step": 72 }, { "epoch": 1.4018691588785046, "grad_norm": 2.944417715072632, "learning_rate": 6.083608585731283e-05, "loss": 4.1047, "step": 75 }, { "epoch": 1.4579439252336448, "grad_norm": 3.2092721462249756, "learning_rate": 5.7770346273610254e-05, "loss": 4.0432, "step": 78 }, { "epoch": 1.514018691588785, "grad_norm": 3.754558801651001, "learning_rate": 5.4674345379e-05, "loss": 4.2195, "step": 81 }, { "epoch": 1.5700934579439252, "grad_norm": 3.33128023147583, "learning_rate": 5.1560140428376956e-05, "loss": 3.8932, "step": 84 }, { "epoch": 1.6261682242990654, "grad_norm": 3.100324869155884, "learning_rate": 4.8439859571623035e-05, "loss": 4.0706, "step": 87 }, { "epoch": 1.6822429906542056, "grad_norm": 3.433746099472046, "learning_rate": 4.532565462099999e-05, "loss": 3.9536, "step": 90 }, { "epoch": 1.7383177570093458, "grad_norm": 3.482595920562744, "learning_rate": 4.2229653726389765e-05, "loss": 4.1327, "step": 93 }, { "epoch": 1.794392523364486, "grad_norm": 3.3688647747039795, "learning_rate": 3.9163914142687184e-05, "loss": 3.8966, "step": 96 }, { "epoch": 1.8504672897196262, "grad_norm": 3.2575838565826416, "learning_rate": 3.614037527329048e-05, "loss": 3.9248, "step": 99 }, { "epoch": 1.8691588785046729, "eval_loss": 0.9720526933670044, "eval_runtime": 1.8012, "eval_samples_per_second": 49.967, "eval_steps_per_second": 12.769, "step": 100 }, { "epoch": 1.9065420560747663, "grad_norm": 3.1702170372009277, "learning_rate": 3.31708121725597e-05, "loss": 3.8573, "step": 102 }, { "epoch": 1.9626168224299065, "grad_norm": 3.2316336631774902, "learning_rate": 3.0266789688326186e-05, "loss": 4.0095, "step": 105 }, { "epoch": 2.0186915887850465, "grad_norm": 4.608758449554443, "learning_rate": 2.7439617423043145e-05, "loss": 4.0982, "step": 108 }, { "epoch": 2.074766355140187, "grad_norm": 3.2477152347564697, "learning_rate": 2.470030568897938e-05, "loss": 3.6945, "step": 111 }, { "epoch": 2.130841121495327, "grad_norm": 3.651806116104126, "learning_rate": 2.205952262898704e-05, "loss": 3.6482, "step": 114 }, { "epoch": 2.1869158878504673, "grad_norm": 3.4141387939453125, "learning_rate": 1.9527552669834798e-05, "loss": 3.6739, "step": 117 }, { "epoch": 2.2429906542056073, "grad_norm": 3.712567090988159, "learning_rate": 1.711425646990838e-05, "loss": 3.9801, "step": 120 }, { "epoch": 2.2990654205607477, "grad_norm": 4.190512180328369, "learning_rate": 1.4829032517260489e-05, "loss": 3.67, "step": 123 }, { "epoch": 2.3551401869158877, "grad_norm": 3.286802053451538, "learning_rate": 1.268078052756531e-05, "loss": 3.6418, "step": 126 }, { "epoch": 2.411214953271028, "grad_norm": 3.205436944961548, "learning_rate": 1.0677866784522317e-05, "loss": 3.6929, "step": 129 }, { "epoch": 2.467289719626168, "grad_norm": 3.498103380203247, "learning_rate": 8.828091557690289e-06, "loss": 3.8541, "step": 132 }, { "epoch": 2.5233644859813085, "grad_norm": 3.3449270725250244, "learning_rate": 7.138658724641417e-06, "loss": 3.7771, "step": 135 }, { "epoch": 2.5794392523364484, "grad_norm": 3.2664265632629395, "learning_rate": 5.616147715740611e-06, "loss": 3.6785, "step": 138 }, { "epoch": 2.635514018691589, "grad_norm": 3.2811155319213867, "learning_rate": 4.266487890810256e-06, "loss": 3.7523, "step": 141 }, { "epoch": 2.691588785046729, "grad_norm": 3.3988234996795654, "learning_rate": 3.0949354474692937e-06, "loss": 3.6636, "step": 144 }, { "epoch": 2.7476635514018692, "grad_norm": 3.528200387954712, "learning_rate": 2.106052951076365e-06, "loss": 3.8593, "step": 147 }, { "epoch": 2.803738317757009, "grad_norm": 3.478485107421875, "learning_rate": 1.303691565996712e-06, "loss": 3.7467, "step": 150 }, { "epoch": 2.803738317757009, "eval_loss": 0.9532889723777771, "eval_runtime": 1.8036, "eval_samples_per_second": 49.901, "eval_steps_per_second": 12.753, "step": 150 }, { "epoch": 2.8598130841121496, "grad_norm": 3.1623222827911377, "learning_rate": 6.909760573925561e-07, "loss": 3.7121, "step": 153 }, { "epoch": 2.9158878504672896, "grad_norm": 3.4083008766174316, "learning_rate": 2.702926219468882e-07, "loss": 3.6461, "step": 156 }, { "epoch": 2.97196261682243, "grad_norm": 3.622955322265625, "learning_rate": 4.3279594913447905e-08, "loss": 3.818, "step": 159 } ], "logging_steps": 3, "max_steps": 161, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.000267220759347e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }