{ "best_metric": 1.1736711263656616, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 0.2841716396703609, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005683432793407218, "eval_loss": 1.5135992765426636, "eval_runtime": 20.731, "eval_samples_per_second": 35.744, "eval_steps_per_second": 8.972, "step": 1 }, { "epoch": 0.005683432793407218, "grad_norm": 0.8103986978530884, "learning_rate": 4.2000000000000004e-05, "loss": 1.2436, "step": 10 }, { "epoch": 0.011366865586814436, "grad_norm": 0.31512126326560974, "learning_rate": 8.400000000000001e-05, "loss": 1.1831, "step": 20 }, { "epoch": 0.017050298380221655, "grad_norm": 0.5794227719306946, "learning_rate": 0.000126, "loss": 1.2505, "step": 30 }, { "epoch": 0.022733731173628872, "grad_norm": 0.6370463967323303, "learning_rate": 0.00016800000000000002, "loss": 1.2006, "step": 40 }, { "epoch": 0.02841716396703609, "grad_norm": 1.415936827659607, "learning_rate": 0.00021, "loss": 1.4222, "step": 50 }, { "epoch": 0.02841716396703609, "eval_loss": 1.355082392692566, "eval_runtime": 20.7459, "eval_samples_per_second": 35.718, "eval_steps_per_second": 8.966, "step": 50 }, { "epoch": 0.03410059676044331, "grad_norm": 0.22089043259620667, "learning_rate": 0.00020974422527728155, "loss": 1.2034, "step": 60 }, { "epoch": 0.039784029553850526, "grad_norm": 0.2758491635322571, "learning_rate": 0.0002089781472178649, "loss": 1.1731, "step": 70 }, { "epoch": 0.045467462347257744, "grad_norm": 0.38943949341773987, "learning_rate": 0.0002077054980770496, "loss": 1.2336, "step": 80 }, { "epoch": 0.05115089514066496, "grad_norm": 0.5280950665473938, "learning_rate": 0.00020593247807352348, "loss": 1.1845, "step": 90 }, { "epoch": 0.05683432793407218, "grad_norm": 2.150968074798584, "learning_rate": 0.00020366772518252038, "loss": 1.2446, "step": 100 }, { "epoch": 0.05683432793407218, "eval_loss": 1.2823526859283447, "eval_runtime": 20.8206, "eval_samples_per_second": 35.59, "eval_steps_per_second": 8.933, "step": 100 }, { "epoch": 0.0625177607274794, "grad_norm": 0.20953252911567688, "learning_rate": 0.0002009222730524731, "loss": 1.2066, "step": 110 }, { "epoch": 0.06820119352088662, "grad_norm": 0.28513628244400024, "learning_rate": 0.00019770949725018733, "loss": 1.1415, "step": 120 }, { "epoch": 0.07388462631429384, "grad_norm": 0.38282203674316406, "learning_rate": 0.00019404505009642473, "loss": 1.1126, "step": 130 }, { "epoch": 0.07956805910770105, "grad_norm": 0.6304633021354675, "learning_rate": 0.0001899467844093695, "loss": 1.1787, "step": 140 }, { "epoch": 0.08525149190110827, "grad_norm": 2.261017084121704, "learning_rate": 0.00018543466652749268, "loss": 1.1682, "step": 150 }, { "epoch": 0.08525149190110827, "eval_loss": 1.2519747018814087, "eval_runtime": 20.6467, "eval_samples_per_second": 35.89, "eval_steps_per_second": 9.009, "step": 150 }, { "epoch": 0.09093492469451549, "grad_norm": 0.20803095400333405, "learning_rate": 0.00018053067903555837, "loss": 1.215, "step": 160 }, { "epoch": 0.0966183574879227, "grad_norm": 0.2767871022224426, "learning_rate": 0.00017525871366768012, "loss": 1.1321, "step": 170 }, { "epoch": 0.10230179028132992, "grad_norm": 0.3595309257507324, "learning_rate": 0.00016964445490919413, "loss": 1.0872, "step": 180 }, { "epoch": 0.10798522307473714, "grad_norm": 0.4991762638092041, "learning_rate": 0.00016371525486442843, "loss": 1.1756, "step": 190 }, { "epoch": 0.11366865586814436, "grad_norm": 1.7282116413116455, "learning_rate": 0.0001575, "loss": 1.286, "step": 200 }, { "epoch": 0.11366865586814436, "eval_loss": 1.2275176048278809, "eval_runtime": 20.815, "eval_samples_per_second": 35.599, "eval_steps_per_second": 8.936, "step": 200 }, { "epoch": 0.11935208866155157, "grad_norm": 0.2192612886428833, "learning_rate": 0.00015102897041285315, "loss": 1.1807, "step": 210 }, { "epoch": 0.1250355214549588, "grad_norm": 0.2621805965900421, "learning_rate": 0.00014433369230867077, "loss": 1.1024, "step": 220 }, { "epoch": 0.13071895424836602, "grad_norm": 0.3690217137336731, "learning_rate": 0.0001374467844093695, "loss": 1.1082, "step": 230 }, { "epoch": 0.13640238704177324, "grad_norm": 0.713192880153656, "learning_rate": 0.0001304017990379651, "loss": 1.1818, "step": 240 }, { "epoch": 0.14208581983518045, "grad_norm": 1.5362468957901, "learning_rate": 0.0001232330586550277, "loss": 1.239, "step": 250 }, { "epoch": 0.14208581983518045, "eval_loss": 1.2070493698120117, "eval_runtime": 20.8556, "eval_samples_per_second": 35.53, "eval_steps_per_second": 8.918, "step": 250 }, { "epoch": 0.14776925262858767, "grad_norm": 0.2006831169128418, "learning_rate": 0.00011597548864310363, "loss": 1.1851, "step": 260 }, { "epoch": 0.1534526854219949, "grad_norm": 0.28160572052001953, "learning_rate": 0.00010866444715376263, "loss": 1.1084, "step": 270 }, { "epoch": 0.1591361182154021, "grad_norm": 0.35404494404792786, "learning_rate": 0.00010133555284623744, "loss": 1.1729, "step": 280 }, { "epoch": 0.16481955100880932, "grad_norm": 0.8237364292144775, "learning_rate": 9.402451135689641e-05, "loss": 1.1748, "step": 290 }, { "epoch": 0.17050298380221654, "grad_norm": 1.429248571395874, "learning_rate": 8.676694134497232e-05, "loss": 1.3167, "step": 300 }, { "epoch": 0.17050298380221654, "eval_loss": 1.1927704811096191, "eval_runtime": 20.9276, "eval_samples_per_second": 35.408, "eval_steps_per_second": 8.888, "step": 300 }, { "epoch": 0.17618641659562376, "grad_norm": 0.20985211431980133, "learning_rate": 7.95982009620349e-05, "loss": 1.1342, "step": 310 }, { "epoch": 0.18186984938903097, "grad_norm": 0.2655479609966278, "learning_rate": 7.255321559063053e-05, "loss": 1.1409, "step": 320 }, { "epoch": 0.1875532821824382, "grad_norm": 0.41280436515808105, "learning_rate": 6.566630769132923e-05, "loss": 1.1169, "step": 330 }, { "epoch": 0.1932367149758454, "grad_norm": 0.5316308736801147, "learning_rate": 5.897102958714686e-05, "loss": 1.2024, "step": 340 }, { "epoch": 0.19892014776925263, "grad_norm": 1.7537466287612915, "learning_rate": 5.250000000000002e-05, "loss": 1.2405, "step": 350 }, { "epoch": 0.19892014776925263, "eval_loss": 1.1823617219924927, "eval_runtime": 20.6991, "eval_samples_per_second": 35.799, "eval_steps_per_second": 8.986, "step": 350 }, { "epoch": 0.20460358056265984, "grad_norm": 0.20923574268817902, "learning_rate": 4.62847451355716e-05, "loss": 1.1516, "step": 360 }, { "epoch": 0.21028701335606706, "grad_norm": 0.2749004364013672, "learning_rate": 4.035554509080588e-05, "loss": 1.0708, "step": 370 }, { "epoch": 0.21597044614947428, "grad_norm": 0.3869272768497467, "learning_rate": 3.474128633231992e-05, "loss": 1.0946, "step": 380 }, { "epoch": 0.2216538789428815, "grad_norm": 0.6056302785873413, "learning_rate": 2.946932096444165e-05, "loss": 1.0444, "step": 390 }, { "epoch": 0.2273373117362887, "grad_norm": 1.8501702547073364, "learning_rate": 2.456533347250732e-05, "loss": 1.2803, "step": 400 }, { "epoch": 0.2273373117362887, "eval_loss": 1.1736711263656616, "eval_runtime": 20.689, "eval_samples_per_second": 35.816, "eval_steps_per_second": 8.99, "step": 400 }, { "epoch": 0.23302074452969593, "grad_norm": 0.20664860308170319, "learning_rate": 2.005321559063053e-05, "loss": 1.1395, "step": 410 }, { "epoch": 0.23870417732310314, "grad_norm": 0.3032397925853729, "learning_rate": 1.5954949903575276e-05, "loss": 1.0822, "step": 420 }, { "epoch": 0.24438761011651036, "grad_norm": 0.3599047064781189, "learning_rate": 1.2290502749812666e-05, "loss": 1.1818, "step": 430 }, { "epoch": 0.2500710429099176, "grad_norm": 0.6150122284889221, "learning_rate": 9.077726947526898e-06, "loss": 1.0494, "step": 440 }, { "epoch": 0.2557544757033248, "grad_norm": 1.1710139513015747, "learning_rate": 6.332274817479627e-06, "loss": 1.3281, "step": 450 }, { "epoch": 0.2557544757033248, "eval_loss": 1.1847209930419922, "eval_runtime": 20.8171, "eval_samples_per_second": 35.596, "eval_steps_per_second": 8.935, "step": 450 }, { "epoch": 0.26143790849673204, "grad_norm": 0.19276390969753265, "learning_rate": 4.067521926476516e-06, "loss": 1.1466, "step": 460 }, { "epoch": 0.26712134129013926, "grad_norm": 0.2616647779941559, "learning_rate": 2.294501922950403e-06, "loss": 1.1233, "step": 470 }, { "epoch": 0.2728047740835465, "grad_norm": 0.3395529091358185, "learning_rate": 1.021852782135112e-06, "loss": 1.0534, "step": 480 }, { "epoch": 0.2784882068769537, "grad_norm": 0.5176828503608704, "learning_rate": 2.5577472271845927e-07, "loss": 1.208, "step": 490 }, { "epoch": 0.2841716396703609, "grad_norm": 1.3167757987976074, "learning_rate": 0.0, "loss": 1.2601, "step": 500 }, { "epoch": 0.2841716396703609, "eval_loss": 1.180171251296997, "eval_runtime": 20.6202, "eval_samples_per_second": 35.936, "eval_steps_per_second": 9.02, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4011222261170176e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }