{ "best_metric": 1.1731915473937988, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.2841716396703609, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005683432793407218, "eval_loss": 1.5135992765426636, "eval_runtime": 20.6777, "eval_samples_per_second": 35.836, "eval_steps_per_second": 8.995, "step": 1 }, { "epoch": 0.005683432793407218, "grad_norm": 0.8190021514892578, "learning_rate": 4.1400000000000003e-05, "loss": 1.2435, "step": 10 }, { "epoch": 0.011366865586814436, "grad_norm": 0.3123916685581207, "learning_rate": 8.280000000000001e-05, "loss": 1.1829, "step": 20 }, { "epoch": 0.017050298380221655, "grad_norm": 0.5734857320785522, "learning_rate": 0.00012419999999999998, "loss": 1.2506, "step": 30 }, { "epoch": 0.022733731173628872, "grad_norm": 0.6435819864273071, "learning_rate": 0.00016560000000000001, "loss": 1.2011, "step": 40 }, { "epoch": 0.02841716396703609, "grad_norm": 1.421791911125183, "learning_rate": 0.000207, "loss": 1.4213, "step": 50 }, { "epoch": 0.02841716396703609, "eval_loss": 1.356560230255127, "eval_runtime": 20.6706, "eval_samples_per_second": 35.848, "eval_steps_per_second": 8.998, "step": 50 }, { "epoch": 0.03410059676044331, "grad_norm": 0.22179025411605835, "learning_rate": 0.00020674787920189178, "loss": 1.2039, "step": 60 }, { "epoch": 0.039784029553850526, "grad_norm": 0.27510905265808105, "learning_rate": 0.00020599274511475253, "loss": 1.1736, "step": 70 }, { "epoch": 0.045467462347257744, "grad_norm": 0.3930874168872833, "learning_rate": 0.00020473827667594888, "loss": 1.2343, "step": 80 }, { "epoch": 0.05115089514066496, "grad_norm": 0.5367498397827148, "learning_rate": 0.00020299058552961598, "loss": 1.1852, "step": 90 }, { "epoch": 0.05683432793407218, "grad_norm": 2.141770601272583, "learning_rate": 0.00020075818625134152, "loss": 1.2462, "step": 100 }, { "epoch": 0.05683432793407218, "eval_loss": 1.2812310457229614, "eval_runtime": 20.7342, "eval_samples_per_second": 35.738, "eval_steps_per_second": 8.971, "step": 100 }, { "epoch": 0.0625177607274794, "grad_norm": 0.20809020102024078, "learning_rate": 0.00019805195486600916, "loss": 1.2057, "step": 110 }, { "epoch": 0.06820119352088662, "grad_norm": 0.28819963335990906, "learning_rate": 0.00019488507586089894, "loss": 1.1406, "step": 120 }, { "epoch": 0.07388462631429384, "grad_norm": 0.3833145499229431, "learning_rate": 0.00019127297795219008, "loss": 1.1137, "step": 130 }, { "epoch": 0.07956805910770105, "grad_norm": 0.6189620494842529, "learning_rate": 0.00018723325891780706, "loss": 1.1794, "step": 140 }, { "epoch": 0.08525149190110827, "grad_norm": 1.7773363590240479, "learning_rate": 0.0001827855998628142, "loss": 1.1643, "step": 150 }, { "epoch": 0.08525149190110827, "eval_loss": 1.252834439277649, "eval_runtime": 20.6977, "eval_samples_per_second": 35.801, "eval_steps_per_second": 8.987, "step": 150 }, { "epoch": 0.09093492469451549, "grad_norm": 0.2051444947719574, "learning_rate": 0.0001779516693350504, "loss": 1.2162, "step": 160 }, { "epoch": 0.0966183574879227, "grad_norm": 0.2756437063217163, "learning_rate": 0.00017275501775814182, "loss": 1.1328, "step": 170 }, { "epoch": 0.10230179028132992, "grad_norm": 0.35582274198532104, "learning_rate": 0.00016722096269620562, "loss": 1.0878, "step": 180 }, { "epoch": 0.10798522307473714, "grad_norm": 0.501879096031189, "learning_rate": 0.00016137646550922228, "loss": 1.175, "step": 190 }, { "epoch": 0.11366865586814436, "grad_norm": 1.5594193935394287, "learning_rate": 0.00015525, "loss": 1.287, "step": 200 }, { "epoch": 0.11366865586814436, "eval_loss": 1.2266287803649902, "eval_runtime": 20.8034, "eval_samples_per_second": 35.619, "eval_steps_per_second": 8.941, "step": 200 }, { "epoch": 0.11935208866155157, "grad_norm": 0.21998944878578186, "learning_rate": 0.0001488714136926695, "loss": 1.18, "step": 210 }, { "epoch": 0.1250355214549588, "grad_norm": 0.2769666314125061, "learning_rate": 0.0001422717824185469, "loss": 1.1022, "step": 220 }, { "epoch": 0.13071895424836602, "grad_norm": 0.37277042865753174, "learning_rate": 0.00013548325891780705, "loss": 1.1071, "step": 230 }, { "epoch": 0.13640238704177324, "grad_norm": 0.6978037357330322, "learning_rate": 0.0001285389161945656, "loss": 1.1831, "step": 240 }, { "epoch": 0.14208581983518045, "grad_norm": 1.4473388195037842, "learning_rate": 0.0001214725863885273, "loss": 1.2392, "step": 250 }, { "epoch": 0.14208581983518045, "eval_loss": 1.2064555883407593, "eval_runtime": 20.7754, "eval_samples_per_second": 35.667, "eval_steps_per_second": 8.953, "step": 250 }, { "epoch": 0.14776925262858767, "grad_norm": 0.1963949352502823, "learning_rate": 0.00011431869594820213, "loss": 1.1858, "step": 260 }, { "epoch": 0.1534526854219949, "grad_norm": 0.2839827537536621, "learning_rate": 0.00010711209790870886, "loss": 1.1089, "step": 270 }, { "epoch": 0.1591361182154021, "grad_norm": 0.3557945191860199, "learning_rate": 9.988790209129117e-05, "loss": 1.1748, "step": 280 }, { "epoch": 0.16481955100880932, "grad_norm": 0.8184950351715088, "learning_rate": 9.268130405179787e-05, "loss": 1.1755, "step": 290 }, { "epoch": 0.17050298380221654, "grad_norm": 1.427021861076355, "learning_rate": 8.55274136114727e-05, "loss": 1.3114, "step": 300 }, { "epoch": 0.17050298380221654, "eval_loss": 1.1922367811203003, "eval_runtime": 20.7928, "eval_samples_per_second": 35.637, "eval_steps_per_second": 8.945, "step": 300 }, { "epoch": 0.17618641659562376, "grad_norm": 0.21867354214191437, "learning_rate": 7.84610838054344e-05, "loss": 1.134, "step": 310 }, { "epoch": 0.18186984938903097, "grad_norm": 0.2639070749282837, "learning_rate": 7.151674108219295e-05, "loss": 1.1421, "step": 320 }, { "epoch": 0.1875532821824382, "grad_norm": 0.4181519150733948, "learning_rate": 6.472821758145309e-05, "loss": 1.117, "step": 330 }, { "epoch": 0.1932367149758454, "grad_norm": 0.5369199514389038, "learning_rate": 5.8128586307330475e-05, "loss": 1.2034, "step": 340 }, { "epoch": 0.19892014776925263, "grad_norm": 1.7513502836227417, "learning_rate": 5.175000000000002e-05, "loss": 1.2398, "step": 350 }, { "epoch": 0.19892014776925263, "eval_loss": 1.1814223527908325, "eval_runtime": 20.8716, "eval_samples_per_second": 35.503, "eval_steps_per_second": 8.912, "step": 350 }, { "epoch": 0.20460358056265984, "grad_norm": 0.20256148278713226, "learning_rate": 4.5623534490777714e-05, "loss": 1.1515, "step": 360 }, { "epoch": 0.21028701335606706, "grad_norm": 0.2732991576194763, "learning_rate": 3.9779037303794365e-05, "loss": 1.0711, "step": 370 }, { "epoch": 0.21597044614947428, "grad_norm": 0.39095339179039, "learning_rate": 3.42449822418582e-05, "loss": 1.0953, "step": 380 }, { "epoch": 0.2216538789428815, "grad_norm": 0.5963056683540344, "learning_rate": 2.9048330664949622e-05, "loss": 1.0448, "step": 390 }, { "epoch": 0.2273373117362887, "grad_norm": 1.832690715789795, "learning_rate": 2.4214400137185785e-05, "loss": 1.2795, "step": 400 }, { "epoch": 0.2273373117362887, "eval_loss": 1.1731915473937988, "eval_runtime": 20.8501, "eval_samples_per_second": 35.539, "eval_steps_per_second": 8.921, "step": 400 }, { "epoch": 0.23302074452969593, "grad_norm": 0.2054441124200821, "learning_rate": 1.976674108219295e-05, "loss": 1.14, "step": 410 }, { "epoch": 0.23870417732310314, "grad_norm": 0.3580988645553589, "learning_rate": 1.572702204780991e-05, "loss": 1.0821, "step": 420 }, { "epoch": 0.24438761011651036, "grad_norm": 0.3528273105621338, "learning_rate": 1.2114924139101056e-05, "loss": 1.1813, "step": 430 }, { "epoch": 0.2500710429099176, "grad_norm": 0.5998412370681763, "learning_rate": 8.948045133990798e-06, "loss": 1.0501, "step": 440 }, { "epoch": 0.2557544757033248, "grad_norm": 1.1859114170074463, "learning_rate": 6.241813748658489e-06, "loss": 1.3292, "step": 450 }, { "epoch": 0.2557544757033248, "eval_loss": 1.1840455532073975, "eval_runtime": 20.8449, "eval_samples_per_second": 35.548, "eval_steps_per_second": 8.923, "step": 450 }, { "epoch": 0.26143790849673204, "grad_norm": 0.19075030088424683, "learning_rate": 4.009414470383994e-06, "loss": 1.1471, "step": 460 }, { "epoch": 0.26712134129013926, "grad_norm": 0.2625552713871002, "learning_rate": 2.261723324051111e-06, "loss": 1.1226, "step": 470 }, { "epoch": 0.2728047740835465, "grad_norm": 0.3403363525867462, "learning_rate": 1.0072548852474675e-06, "loss": 1.0545, "step": 480 }, { "epoch": 0.2784882068769537, "grad_norm": 0.5292404294013977, "learning_rate": 2.5212079810819554e-07, "loss": 1.2098, "step": 490 }, { "epoch": 0.2841716396703609, "grad_norm": 1.3367726802825928, "learning_rate": 0.0, "loss": 1.2654, "step": 500 }, { "epoch": 0.2841716396703609, "eval_loss": 1.1794259548187256, "eval_runtime": 20.8245, "eval_samples_per_second": 35.583, "eval_steps_per_second": 8.932, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4011222261170176e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }