{ "best_metric": 0.2870234549045563, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.015524816419045845, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.104963283809169e-05, "eval_loss": 3.7327520847320557, "eval_runtime": 1160.7988, "eval_samples_per_second": 11.682, "eval_steps_per_second": 2.921, "step": 1 }, { "epoch": 0.0003104963283809169, "grad_norm": 3.784548759460449, "learning_rate": 4.08e-05, "loss": 3.065, "step": 10 }, { "epoch": 0.0006209926567618338, "grad_norm": 3.344172954559326, "learning_rate": 8.16e-05, "loss": 2.3919, "step": 20 }, { "epoch": 0.0009314889851427506, "grad_norm": 1.5794700384140015, "learning_rate": 0.0001224, "loss": 0.8608, "step": 30 }, { "epoch": 0.0012419853135236677, "grad_norm": 3.304838180541992, "learning_rate": 0.0001632, "loss": 0.4056, "step": 40 }, { "epoch": 0.0015524816419045845, "grad_norm": 0.6182494163513184, "learning_rate": 0.000204, "loss": 0.3099, "step": 50 }, { "epoch": 0.0015524816419045845, "eval_loss": 0.34165456891059875, "eval_runtime": 1161.9557, "eval_samples_per_second": 11.671, "eval_steps_per_second": 2.918, "step": 50 }, { "epoch": 0.0018629779702855013, "grad_norm": 0.9414771199226379, "learning_rate": 0.00020375153312650207, "loss": 0.3528, "step": 60 }, { "epoch": 0.0021734742986664183, "grad_norm": 3.154303789138794, "learning_rate": 0.00020300734301164017, "loss": 0.3092, "step": 70 }, { "epoch": 0.0024839706270473353, "grad_norm": 4.68408727645874, "learning_rate": 0.00020177105527484818, "loss": 0.5714, "step": 80 }, { "epoch": 0.002794466955428252, "grad_norm": 6.463703155517578, "learning_rate": 0.00020004869298570854, "loss": 0.3781, "step": 90 }, { "epoch": 0.003104963283809169, "grad_norm": 0.6307673454284668, "learning_rate": 0.00019784864732016265, "loss": 0.2635, "step": 100 }, { "epoch": 0.003104963283809169, "eval_loss": 0.36230921745300293, "eval_runtime": 1160.8503, "eval_samples_per_second": 11.682, "eval_steps_per_second": 2.921, "step": 100 }, { "epoch": 0.003415459612190086, "grad_norm": 3.080695390701294, "learning_rate": 0.00019518163667954527, "loss": 0.3737, "step": 110 }, { "epoch": 0.0037259559405710026, "grad_norm": 0.564507782459259, "learning_rate": 0.00019206065447161056, "loss": 0.3038, "step": 120 }, { "epoch": 0.00403645226895192, "grad_norm": 0.3009290099143982, "learning_rate": 0.00018850090580795544, "loss": 0.3419, "step": 130 }, { "epoch": 0.004346948597332837, "grad_norm": 7.988672733306885, "learning_rate": 0.00018451973342624464, "loss": 2.1628, "step": 140 }, { "epoch": 0.004657444925713753, "grad_norm": 3.4192960262298584, "learning_rate": 0.00018013653319813575, "loss": 0.8874, "step": 150 }, { "epoch": 0.004657444925713753, "eval_loss": 0.3869711458683014, "eval_runtime": 1161.6515, "eval_samples_per_second": 11.674, "eval_steps_per_second": 2.919, "step": 150 }, { "epoch": 0.004967941254094671, "grad_norm": 0.5543773770332336, "learning_rate": 0.0001753726596345424, "loss": 0.3118, "step": 160 }, { "epoch": 0.005278437582475587, "grad_norm": 3.5551018714904785, "learning_rate": 0.00017025132184860355, "loss": 0.4065, "step": 170 }, { "epoch": 0.005588933910856504, "grad_norm": 0.8357908725738525, "learning_rate": 0.00016479747048321714, "loss": 0.4147, "step": 180 }, { "epoch": 0.005899430239237421, "grad_norm": 0.5181112289428711, "learning_rate": 0.00015903767615401616, "loss": 0.2827, "step": 190 }, { "epoch": 0.006209926567618338, "grad_norm": 1.829988956451416, "learning_rate": 0.000153, "loss": 0.2747, "step": 200 }, { "epoch": 0.006209926567618338, "eval_loss": 0.31800615787506104, "eval_runtime": 1161.2828, "eval_samples_per_second": 11.678, "eval_steps_per_second": 2.92, "step": 200 }, { "epoch": 0.0065204228959992545, "grad_norm": 1.3195180892944336, "learning_rate": 0.0001467138569724859, "loss": 0.3177, "step": 210 }, { "epoch": 0.006830919224380172, "grad_norm": 2.611358642578125, "learning_rate": 0.00014020987252842305, "loss": 0.3129, "step": 220 }, { "epoch": 0.0071414155527610886, "grad_norm": 23.8262996673584, "learning_rate": 0.00013351973342624464, "loss": 0.5152, "step": 230 }, { "epoch": 0.007451911881142005, "grad_norm": 0.7884902954101562, "learning_rate": 0.00012667603335116609, "loss": 0.867, "step": 240 }, { "epoch": 0.007762408209522923, "grad_norm": 0.3505696654319763, "learning_rate": 0.00011971211412202691, "loss": 0.2918, "step": 250 }, { "epoch": 0.007762408209522923, "eval_loss": 0.3034397065639496, "eval_runtime": 1160.652, "eval_samples_per_second": 11.684, "eval_steps_per_second": 2.922, "step": 250 }, { "epoch": 0.00807290453790384, "grad_norm": 0.6287646889686584, "learning_rate": 0.00011266190325330066, "loss": 0.3212, "step": 260 }, { "epoch": 0.008383400866284757, "grad_norm": 1.5718376636505127, "learning_rate": 0.00010555974866365511, "loss": 0.2674, "step": 270 }, { "epoch": 0.008693897194665673, "grad_norm": 0.6371111869812012, "learning_rate": 9.844025133634492e-05, "loss": 0.4243, "step": 280 }, { "epoch": 0.00900439352304659, "grad_norm": 0.7290711402893066, "learning_rate": 9.133809674669937e-05, "loss": 0.2891, "step": 290 }, { "epoch": 0.009314889851427506, "grad_norm": 0.5713692307472229, "learning_rate": 8.428788587797311e-05, "loss": 0.2543, "step": 300 }, { "epoch": 0.009314889851427506, "eval_loss": 0.3538323938846588, "eval_runtime": 1160.2598, "eval_samples_per_second": 11.688, "eval_steps_per_second": 2.923, "step": 300 }, { "epoch": 0.009625386179808423, "grad_norm": 6.953015327453613, "learning_rate": 7.73239666488339e-05, "loss": 0.3133, "step": 310 }, { "epoch": 0.009935882508189341, "grad_norm": 2.1346802711486816, "learning_rate": 7.048026657375537e-05, "loss": 0.334, "step": 320 }, { "epoch": 0.010246378836570258, "grad_norm": 2.4181416034698486, "learning_rate": 6.379012747157697e-05, "loss": 0.3077, "step": 330 }, { "epoch": 0.010556875164951175, "grad_norm": 0.9341647624969482, "learning_rate": 5.7286143027514095e-05, "loss": 0.3673, "step": 340 }, { "epoch": 0.010867371493332091, "grad_norm": 0.5034169554710388, "learning_rate": 5.100000000000002e-05, "loss": 0.2662, "step": 350 }, { "epoch": 0.010867371493332091, "eval_loss": 0.2990635335445404, "eval_runtime": 1159.9666, "eval_samples_per_second": 11.691, "eval_steps_per_second": 2.923, "step": 350 }, { "epoch": 0.011177867821713008, "grad_norm": 0.6715922951698303, "learning_rate": 4.496232384598384e-05, "loss": 0.3294, "step": 360 }, { "epoch": 0.011488364150093924, "grad_norm": 0.6445696353912354, "learning_rate": 3.9202529516782854e-05, "loss": 0.2706, "step": 370 }, { "epoch": 0.011798860478474843, "grad_norm": 0.26978200674057007, "learning_rate": 3.374867815139649e-05, "loss": 0.3663, "step": 380 }, { "epoch": 0.01210935680685576, "grad_norm": 0.5984402298927307, "learning_rate": 2.8627340365457602e-05, "loss": 0.2697, "step": 390 }, { "epoch": 0.012419853135236676, "grad_norm": 0.7722185254096985, "learning_rate": 2.3863466801864254e-05, "loss": 0.2701, "step": 400 }, { "epoch": 0.012419853135236676, "eval_loss": 0.2882494032382965, "eval_runtime": 1161.5989, "eval_samples_per_second": 11.674, "eval_steps_per_second": 2.919, "step": 400 }, { "epoch": 0.012730349463617592, "grad_norm": 0.4869549572467804, "learning_rate": 1.9480266573755372e-05, "loss": 0.3046, "step": 410 }, { "epoch": 0.013040845791998509, "grad_norm": 1.2998945713043213, "learning_rate": 1.5499094192044554e-05, "loss": 0.2766, "step": 420 }, { "epoch": 0.013351342120379427, "grad_norm": 1.5833019018173218, "learning_rate": 1.1939345528389446e-05, "loss": 0.3613, "step": 430 }, { "epoch": 0.013661838448760344, "grad_norm": 0.3303629159927368, "learning_rate": 8.818363320454701e-06, "loss": 0.2713, "step": 440 }, { "epoch": 0.01397233477714126, "grad_norm": 0.5843662619590759, "learning_rate": 6.1513526798373514e-06, "loss": 0.2793, "step": 450 }, { "epoch": 0.01397233477714126, "eval_loss": 0.2871423065662384, "eval_runtime": 1162.0849, "eval_samples_per_second": 11.67, "eval_steps_per_second": 2.918, "step": 450 }, { "epoch": 0.014282831105522177, "grad_norm": 0.6200801134109497, "learning_rate": 3.9513070142914725e-06, "loss": 0.2882, "step": 460 }, { "epoch": 0.014593327433903094, "grad_norm": 1.349033236503601, "learning_rate": 2.2289447251518195e-06, "loss": 0.2813, "step": 470 }, { "epoch": 0.01490382376228401, "grad_norm": 0.3599369525909424, "learning_rate": 9.92656988359823e-07, "loss": 0.3312, "step": 480 }, { "epoch": 0.015214320090664929, "grad_norm": 0.32290154695510864, "learning_rate": 2.4846687349793185e-07, "loss": 0.2915, "step": 490 }, { "epoch": 0.015524816419045845, "grad_norm": 0.5347475409507751, "learning_rate": 0.0, "loss": 0.2675, "step": 500 }, { "epoch": 0.015524816419045845, "eval_loss": 0.2870234549045563, "eval_runtime": 1160.4192, "eval_samples_per_second": 11.686, "eval_steps_per_second": 2.922, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3954559915537203e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }