{ "best_metric": 1.97798752784729, "best_model_checkpoint": "miner_id_24/checkpoint-100", "epoch": 0.007456287514446557, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 7.456287514446557e-05, "grad_norm": 0.43125054240226746, "learning_rate": 3.3333333333333333e-06, "loss": 1.764, "step": 1 }, { "epoch": 7.456287514446557e-05, "eval_loss": 2.176044464111328, "eval_runtime": 1967.9509, "eval_samples_per_second": 11.478, "eval_steps_per_second": 5.739, "step": 1 }, { "epoch": 0.00014912575028893114, "grad_norm": 0.4944254457950592, "learning_rate": 6.666666666666667e-06, "loss": 2.0124, "step": 2 }, { "epoch": 0.0002236886254333967, "grad_norm": 0.5164911150932312, "learning_rate": 1e-05, "loss": 2.0545, "step": 3 }, { "epoch": 0.0002982515005778623, "grad_norm": 0.5371212363243103, "learning_rate": 1.3333333333333333e-05, "loss": 2.1354, "step": 4 }, { "epoch": 0.00037281437572232783, "grad_norm": 0.5677233338356018, "learning_rate": 1.6666666666666667e-05, "loss": 2.0822, "step": 5 }, { "epoch": 0.0004473772508667934, "grad_norm": 0.5842496156692505, "learning_rate": 2e-05, "loss": 2.1827, "step": 6 }, { "epoch": 0.000521940126011259, "grad_norm": 0.5665204524993896, "learning_rate": 2.3333333333333336e-05, "loss": 2.1415, "step": 7 }, { "epoch": 0.0005965030011557246, "grad_norm": 0.5413942337036133, "learning_rate": 2.6666666666666667e-05, "loss": 2.0118, "step": 8 }, { "epoch": 0.0006710658763001902, "grad_norm": 0.5594425201416016, "learning_rate": 3e-05, "loss": 2.0801, "step": 9 }, { "epoch": 0.0007456287514446557, "grad_norm": 0.553996741771698, "learning_rate": 3.3333333333333335e-05, "loss": 2.0225, "step": 10 }, { "epoch": 0.0008201916265891213, "grad_norm": 0.5754431486129761, "learning_rate": 3.6666666666666666e-05, "loss": 2.064, "step": 11 }, { "epoch": 0.0008947545017335869, "grad_norm": 0.6240922808647156, "learning_rate": 4e-05, "loss": 2.126, "step": 12 }, { "epoch": 0.0009693173768780525, "grad_norm": 0.5755719542503357, "learning_rate": 4.3333333333333334e-05, "loss": 2.0683, "step": 13 }, { "epoch": 0.001043880252022518, "grad_norm": 0.6122748851776123, "learning_rate": 4.666666666666667e-05, "loss": 2.0893, "step": 14 }, { "epoch": 0.0011184431271669835, "grad_norm": 0.6614731550216675, "learning_rate": 5e-05, "loss": 2.1551, "step": 15 }, { "epoch": 0.0011930060023114491, "grad_norm": 0.6139146685600281, "learning_rate": 5.333333333333333e-05, "loss": 1.959, "step": 16 }, { "epoch": 0.0012675688774559147, "grad_norm": 0.7065208554267883, "learning_rate": 5.666666666666667e-05, "loss": 2.015, "step": 17 }, { "epoch": 0.0013421317526003803, "grad_norm": 0.706400990486145, "learning_rate": 6e-05, "loss": 2.0351, "step": 18 }, { "epoch": 0.001416694627744846, "grad_norm": 0.6575025916099548, "learning_rate": 6.333333333333333e-05, "loss": 2.0483, "step": 19 }, { "epoch": 0.0014912575028893113, "grad_norm": 0.6933155059814453, "learning_rate": 6.666666666666667e-05, "loss": 2.0612, "step": 20 }, { "epoch": 0.001565820378033777, "grad_norm": 0.6782702207565308, "learning_rate": 7e-05, "loss": 2.0763, "step": 21 }, { "epoch": 0.0016403832531782425, "grad_norm": 0.671043872833252, "learning_rate": 7.333333333333333e-05, "loss": 2.076, "step": 22 }, { "epoch": 0.001714946128322708, "grad_norm": 0.6524346470832825, "learning_rate": 7.666666666666667e-05, "loss": 2.0047, "step": 23 }, { "epoch": 0.0017895090034671737, "grad_norm": 0.71756511926651, "learning_rate": 8e-05, "loss": 2.1351, "step": 24 }, { "epoch": 0.0018640718786116393, "grad_norm": 0.6905089020729065, "learning_rate": 8.333333333333334e-05, "loss": 2.0644, "step": 25 }, { "epoch": 0.001938634753756105, "grad_norm": 0.6620098352432251, "learning_rate": 8.666666666666667e-05, "loss": 2.1028, "step": 26 }, { "epoch": 0.0020131976289005705, "grad_norm": 0.6542001962661743, "learning_rate": 9e-05, "loss": 2.0178, "step": 27 }, { "epoch": 0.002087760504045036, "grad_norm": 0.725846529006958, "learning_rate": 9.333333333333334e-05, "loss": 2.1445, "step": 28 }, { "epoch": 0.0021623233791895017, "grad_norm": 0.6756733655929565, "learning_rate": 9.666666666666667e-05, "loss": 2.0946, "step": 29 }, { "epoch": 0.002236886254333967, "grad_norm": 0.6713683605194092, "learning_rate": 0.0001, "loss": 2.0383, "step": 30 }, { "epoch": 0.002311449129478433, "grad_norm": 0.8826857805252075, "learning_rate": 9.994965332706573e-05, "loss": 2.1636, "step": 31 }, { "epoch": 0.0023860120046228983, "grad_norm": 0.7072741985321045, "learning_rate": 9.979871469976196e-05, "loss": 2.0969, "step": 32 }, { "epoch": 0.0024605748797673637, "grad_norm": 0.6709433197975159, "learning_rate": 9.954748808839674e-05, "loss": 2.0807, "step": 33 }, { "epoch": 0.0025351377549118295, "grad_norm": 0.6410077810287476, "learning_rate": 9.919647942993148e-05, "loss": 1.9685, "step": 34 }, { "epoch": 0.002609700630056295, "grad_norm": 0.7483009696006775, "learning_rate": 9.874639560909117e-05, "loss": 2.1124, "step": 35 }, { "epoch": 0.0026842635052007607, "grad_norm": 0.667682409286499, "learning_rate": 9.819814303479267e-05, "loss": 1.9806, "step": 36 }, { "epoch": 0.002758826380345226, "grad_norm": 0.6942345499992371, "learning_rate": 9.755282581475769e-05, "loss": 2.0423, "step": 37 }, { "epoch": 0.002833389255489692, "grad_norm": 0.7064806222915649, "learning_rate": 9.681174353198687e-05, "loss": 2.1869, "step": 38 }, { "epoch": 0.0029079521306341572, "grad_norm": 0.6997663974761963, "learning_rate": 9.597638862757255e-05, "loss": 2.1582, "step": 39 }, { "epoch": 0.0029825150057786226, "grad_norm": 0.6768945455551147, "learning_rate": 9.504844339512095e-05, "loss": 2.1976, "step": 40 }, { "epoch": 0.0030570778809230884, "grad_norm": 0.7602648138999939, "learning_rate": 9.40297765928369e-05, "loss": 2.0887, "step": 41 }, { "epoch": 0.003131640756067554, "grad_norm": 0.6888823509216309, "learning_rate": 9.292243968009331e-05, "loss": 2.0271, "step": 42 }, { "epoch": 0.0032062036312120196, "grad_norm": 0.694747805595398, "learning_rate": 9.172866268606513e-05, "loss": 2.081, "step": 43 }, { "epoch": 0.003280766506356485, "grad_norm": 0.6592668890953064, "learning_rate": 9.045084971874738e-05, "loss": 1.9375, "step": 44 }, { "epoch": 0.003355329381500951, "grad_norm": 0.7744700312614441, "learning_rate": 8.90915741234015e-05, "loss": 2.1293, "step": 45 }, { "epoch": 0.003429892256645416, "grad_norm": 0.7216829657554626, "learning_rate": 8.765357330018056e-05, "loss": 2.0884, "step": 46 }, { "epoch": 0.003504455131789882, "grad_norm": 0.7232696413993835, "learning_rate": 8.613974319136958e-05, "loss": 2.0312, "step": 47 }, { "epoch": 0.0035790180069343474, "grad_norm": 0.964648425579071, "learning_rate": 8.455313244934324e-05, "loss": 1.9873, "step": 48 }, { "epoch": 0.003653580882078813, "grad_norm": 0.8183997869491577, "learning_rate": 8.289693629698564e-05, "loss": 2.0139, "step": 49 }, { "epoch": 0.0037281437572232786, "grad_norm": 0.970118522644043, "learning_rate": 8.117449009293668e-05, "loss": 2.1238, "step": 50 }, { "epoch": 0.0037281437572232786, "eval_loss": 2.0290143489837646, "eval_runtime": 1975.9168, "eval_samples_per_second": 11.432, "eval_steps_per_second": 5.716, "step": 50 }, { "epoch": 0.003802706632367744, "grad_norm": 0.5115548968315125, "learning_rate": 7.938926261462366e-05, "loss": 1.6365, "step": 51 }, { "epoch": 0.00387726950751221, "grad_norm": 0.7051359415054321, "learning_rate": 7.754484907260513e-05, "loss": 2.0046, "step": 52 }, { "epoch": 0.003951832382656675, "grad_norm": 0.46429726481437683, "learning_rate": 7.564496387029532e-05, "loss": 1.8634, "step": 53 }, { "epoch": 0.004026395257801141, "grad_norm": 0.40492793917655945, "learning_rate": 7.369343312364993e-05, "loss": 1.9598, "step": 54 }, { "epoch": 0.004100958132945607, "grad_norm": 0.39108383655548096, "learning_rate": 7.169418695587791e-05, "loss": 1.8478, "step": 55 }, { "epoch": 0.004175521008090072, "grad_norm": 0.42470911145210266, "learning_rate": 6.965125158269619e-05, "loss": 2.0799, "step": 56 }, { "epoch": 0.004250083883234538, "grad_norm": 0.45118194818496704, "learning_rate": 6.756874120406714e-05, "loss": 2.045, "step": 57 }, { "epoch": 0.004324646758379003, "grad_norm": 0.4153003990650177, "learning_rate": 6.545084971874738e-05, "loss": 1.8896, "step": 58 }, { "epoch": 0.004399209633523468, "grad_norm": 0.4770011305809021, "learning_rate": 6.330184227833376e-05, "loss": 2.0487, "step": 59 }, { "epoch": 0.004473772508667934, "grad_norm": 0.46756961941719055, "learning_rate": 6.112604669781572e-05, "loss": 1.9976, "step": 60 }, { "epoch": 0.0045483353838124, "grad_norm": 0.47102442383766174, "learning_rate": 5.8927844739931834e-05, "loss": 1.9983, "step": 61 }, { "epoch": 0.004622898258956866, "grad_norm": 0.5105215907096863, "learning_rate": 5.6711663290882776e-05, "loss": 2.0145, "step": 62 }, { "epoch": 0.004697461134101331, "grad_norm": 0.48268643021583557, "learning_rate": 5.448196544517168e-05, "loss": 2.0284, "step": 63 }, { "epoch": 0.0047720240092457965, "grad_norm": 0.49653950333595276, "learning_rate": 5.2243241517525754e-05, "loss": 2.0299, "step": 64 }, { "epoch": 0.004846586884390262, "grad_norm": 0.4842401444911957, "learning_rate": 5e-05, "loss": 1.9628, "step": 65 }, { "epoch": 0.004921149759534727, "grad_norm": 0.5133103728294373, "learning_rate": 4.775675848247427e-05, "loss": 1.9677, "step": 66 }, { "epoch": 0.004995712634679193, "grad_norm": 0.5002983808517456, "learning_rate": 4.551803455482833e-05, "loss": 2.0314, "step": 67 }, { "epoch": 0.005070275509823659, "grad_norm": 0.5741726756095886, "learning_rate": 4.328833670911724e-05, "loss": 2.007, "step": 68 }, { "epoch": 0.005144838384968125, "grad_norm": 0.5411697626113892, "learning_rate": 4.107215526006817e-05, "loss": 1.954, "step": 69 }, { "epoch": 0.00521940126011259, "grad_norm": 0.5465304255485535, "learning_rate": 3.887395330218429e-05, "loss": 1.9437, "step": 70 }, { "epoch": 0.0052939641352570555, "grad_norm": 0.5512676239013672, "learning_rate": 3.6698157721666246e-05, "loss": 1.8954, "step": 71 }, { "epoch": 0.005368527010401521, "grad_norm": 0.5813472270965576, "learning_rate": 3.4549150281252636e-05, "loss": 2.0268, "step": 72 }, { "epoch": 0.005443089885545986, "grad_norm": 0.5529889464378357, "learning_rate": 3.243125879593286e-05, "loss": 1.9832, "step": 73 }, { "epoch": 0.005517652760690452, "grad_norm": 0.5440077781677246, "learning_rate": 3.0348748417303823e-05, "loss": 1.9148, "step": 74 }, { "epoch": 0.005592215635834918, "grad_norm": 0.5623196363449097, "learning_rate": 2.8305813044122097e-05, "loss": 2.0762, "step": 75 }, { "epoch": 0.005666778510979384, "grad_norm": 0.5488706827163696, "learning_rate": 2.630656687635007e-05, "loss": 1.9115, "step": 76 }, { "epoch": 0.005741341386123849, "grad_norm": 0.5504707098007202, "learning_rate": 2.43550361297047e-05, "loss": 1.9987, "step": 77 }, { "epoch": 0.0058159042612683145, "grad_norm": 0.544782817363739, "learning_rate": 2.245515092739488e-05, "loss": 2.0636, "step": 78 }, { "epoch": 0.00589046713641278, "grad_norm": 0.5880534648895264, "learning_rate": 2.061073738537635e-05, "loss": 1.9786, "step": 79 }, { "epoch": 0.005965030011557245, "grad_norm": 0.526584267616272, "learning_rate": 1.8825509907063327e-05, "loss": 1.9431, "step": 80 }, { "epoch": 0.006039592886701711, "grad_norm": 0.5774157643318176, "learning_rate": 1.7103063703014372e-05, "loss": 2.0321, "step": 81 }, { "epoch": 0.006114155761846177, "grad_norm": 0.5706311464309692, "learning_rate": 1.544686755065677e-05, "loss": 2.0225, "step": 82 }, { "epoch": 0.006188718636990643, "grad_norm": 0.5908026099205017, "learning_rate": 1.3860256808630428e-05, "loss": 1.9953, "step": 83 }, { "epoch": 0.006263281512135108, "grad_norm": 0.6149851679801941, "learning_rate": 1.2346426699819458e-05, "loss": 1.9548, "step": 84 }, { "epoch": 0.0063378443872795735, "grad_norm": 0.6206725835800171, "learning_rate": 1.090842587659851e-05, "loss": 2.006, "step": 85 }, { "epoch": 0.006412407262424039, "grad_norm": 0.610331118106842, "learning_rate": 9.549150281252633e-06, "loss": 2.0609, "step": 86 }, { "epoch": 0.006486970137568504, "grad_norm": 0.5870972275733948, "learning_rate": 8.271337313934869e-06, "loss": 1.9884, "step": 87 }, { "epoch": 0.00656153301271297, "grad_norm": 0.641563892364502, "learning_rate": 7.077560319906695e-06, "loss": 2.1203, "step": 88 }, { "epoch": 0.006636095887857436, "grad_norm": 0.7119360566139221, "learning_rate": 5.9702234071631e-06, "loss": 2.0064, "step": 89 }, { "epoch": 0.006710658763001902, "grad_norm": 0.6424351930618286, "learning_rate": 4.951556604879048e-06, "loss": 2.1295, "step": 90 }, { "epoch": 0.006785221638146367, "grad_norm": 0.7995273470878601, "learning_rate": 4.023611372427471e-06, "loss": 1.9488, "step": 91 }, { "epoch": 0.006859784513290832, "grad_norm": 0.6758121848106384, "learning_rate": 3.18825646801314e-06, "loss": 2.0611, "step": 92 }, { "epoch": 0.006934347388435298, "grad_norm": 0.6477658152580261, "learning_rate": 2.4471741852423237e-06, "loss": 1.9928, "step": 93 }, { "epoch": 0.007008910263579764, "grad_norm": 0.6778663396835327, "learning_rate": 1.8018569652073381e-06, "loss": 2.0962, "step": 94 }, { "epoch": 0.007083473138724229, "grad_norm": 0.6519765853881836, "learning_rate": 1.2536043909088191e-06, "loss": 2.0006, "step": 95 }, { "epoch": 0.007158036013868695, "grad_norm": 0.7212676405906677, "learning_rate": 8.035205700685167e-07, "loss": 2.0475, "step": 96 }, { "epoch": 0.007232598889013161, "grad_norm": 0.7102315425872803, "learning_rate": 4.52511911603265e-07, "loss": 2.0604, "step": 97 }, { "epoch": 0.007307161764157626, "grad_norm": 0.7387847900390625, "learning_rate": 2.012853002380466e-07, "loss": 2.0218, "step": 98 }, { "epoch": 0.007381724639302091, "grad_norm": 0.8174800276756287, "learning_rate": 5.0346672934270534e-08, "loss": 2.0107, "step": 99 }, { "epoch": 0.007456287514446557, "grad_norm": 0.9243033528327942, "learning_rate": 0.0, "loss": 1.9478, "step": 100 }, { "epoch": 0.007456287514446557, "eval_loss": 1.97798752784729, "eval_runtime": 1976.5858, "eval_samples_per_second": 11.428, "eval_steps_per_second": 5.714, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.591481825576878e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }