{
"best_metric": 1.97798752784729,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.007456287514446557,
"eval_steps": 50,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 7.456287514446557e-05,
"grad_norm": 0.43125054240226746,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.764,
"step": 1
},
{
"epoch": 7.456287514446557e-05,
"eval_loss": 2.176044464111328,
"eval_runtime": 1967.9509,
"eval_samples_per_second": 11.478,
"eval_steps_per_second": 5.739,
"step": 1
},
{
"epoch": 0.00014912575028893114,
"grad_norm": 0.4944254457950592,
"learning_rate": 6.666666666666667e-06,
"loss": 2.0124,
"step": 2
},
{
"epoch": 0.0002236886254333967,
"grad_norm": 0.5164911150932312,
"learning_rate": 1e-05,
"loss": 2.0545,
"step": 3
},
{
"epoch": 0.0002982515005778623,
"grad_norm": 0.5371212363243103,
"learning_rate": 1.3333333333333333e-05,
"loss": 2.1354,
"step": 4
},
{
"epoch": 0.00037281437572232783,
"grad_norm": 0.5677233338356018,
"learning_rate": 1.6666666666666667e-05,
"loss": 2.0822,
"step": 5
},
{
"epoch": 0.0004473772508667934,
"grad_norm": 0.5842496156692505,
"learning_rate": 2e-05,
"loss": 2.1827,
"step": 6
},
{
"epoch": 0.000521940126011259,
"grad_norm": 0.5665204524993896,
"learning_rate": 2.3333333333333336e-05,
"loss": 2.1415,
"step": 7
},
{
"epoch": 0.0005965030011557246,
"grad_norm": 0.5413942337036133,
"learning_rate": 2.6666666666666667e-05,
"loss": 2.0118,
"step": 8
},
{
"epoch": 0.0006710658763001902,
"grad_norm": 0.5594425201416016,
"learning_rate": 3e-05,
"loss": 2.0801,
"step": 9
},
{
"epoch": 0.0007456287514446557,
"grad_norm": 0.553996741771698,
"learning_rate": 3.3333333333333335e-05,
"loss": 2.0225,
"step": 10
},
{
"epoch": 0.0008201916265891213,
"grad_norm": 0.5754431486129761,
"learning_rate": 3.6666666666666666e-05,
"loss": 2.064,
"step": 11
},
{
"epoch": 0.0008947545017335869,
"grad_norm": 0.6240922808647156,
"learning_rate": 4e-05,
"loss": 2.126,
"step": 12
},
{
"epoch": 0.0009693173768780525,
"grad_norm": 0.5755719542503357,
"learning_rate": 4.3333333333333334e-05,
"loss": 2.0683,
"step": 13
},
{
"epoch": 0.001043880252022518,
"grad_norm": 0.6122748851776123,
"learning_rate": 4.666666666666667e-05,
"loss": 2.0893,
"step": 14
},
{
"epoch": 0.0011184431271669835,
"grad_norm": 0.6614731550216675,
"learning_rate": 5e-05,
"loss": 2.1551,
"step": 15
},
{
"epoch": 0.0011930060023114491,
"grad_norm": 0.6139146685600281,
"learning_rate": 5.333333333333333e-05,
"loss": 1.959,
"step": 16
},
{
"epoch": 0.0012675688774559147,
"grad_norm": 0.7065208554267883,
"learning_rate": 5.666666666666667e-05,
"loss": 2.015,
"step": 17
},
{
"epoch": 0.0013421317526003803,
"grad_norm": 0.706400990486145,
"learning_rate": 6e-05,
"loss": 2.0351,
"step": 18
},
{
"epoch": 0.001416694627744846,
"grad_norm": 0.6575025916099548,
"learning_rate": 6.333333333333333e-05,
"loss": 2.0483,
"step": 19
},
{
"epoch": 0.0014912575028893113,
"grad_norm": 0.6933155059814453,
"learning_rate": 6.666666666666667e-05,
"loss": 2.0612,
"step": 20
},
{
"epoch": 0.001565820378033777,
"grad_norm": 0.6782702207565308,
"learning_rate": 7e-05,
"loss": 2.0763,
"step": 21
},
{
"epoch": 0.0016403832531782425,
"grad_norm": 0.671043872833252,
"learning_rate": 7.333333333333333e-05,
"loss": 2.076,
"step": 22
},
{
"epoch": 0.001714946128322708,
"grad_norm": 0.6524346470832825,
"learning_rate": 7.666666666666667e-05,
"loss": 2.0047,
"step": 23
},
{
"epoch": 0.0017895090034671737,
"grad_norm": 0.71756511926651,
"learning_rate": 8e-05,
"loss": 2.1351,
"step": 24
},
{
"epoch": 0.0018640718786116393,
"grad_norm": 0.6905089020729065,
"learning_rate": 8.333333333333334e-05,
"loss": 2.0644,
"step": 25
},
{
"epoch": 0.001938634753756105,
"grad_norm": 0.6620098352432251,
"learning_rate": 8.666666666666667e-05,
"loss": 2.1028,
"step": 26
},
{
"epoch": 0.0020131976289005705,
"grad_norm": 0.6542001962661743,
"learning_rate": 9e-05,
"loss": 2.0178,
"step": 27
},
{
"epoch": 0.002087760504045036,
"grad_norm": 0.725846529006958,
"learning_rate": 9.333333333333334e-05,
"loss": 2.1445,
"step": 28
},
{
"epoch": 0.0021623233791895017,
"grad_norm": 0.6756733655929565,
"learning_rate": 9.666666666666667e-05,
"loss": 2.0946,
"step": 29
},
{
"epoch": 0.002236886254333967,
"grad_norm": 0.6713683605194092,
"learning_rate": 0.0001,
"loss": 2.0383,
"step": 30
},
{
"epoch": 0.002311449129478433,
"grad_norm": 0.8826857805252075,
"learning_rate": 9.994965332706573e-05,
"loss": 2.1636,
"step": 31
},
{
"epoch": 0.0023860120046228983,
"grad_norm": 0.7072741985321045,
"learning_rate": 9.979871469976196e-05,
"loss": 2.0969,
"step": 32
},
{
"epoch": 0.0024605748797673637,
"grad_norm": 0.6709433197975159,
"learning_rate": 9.954748808839674e-05,
"loss": 2.0807,
"step": 33
},
{
"epoch": 0.0025351377549118295,
"grad_norm": 0.6410077810287476,
"learning_rate": 9.919647942993148e-05,
"loss": 1.9685,
"step": 34
},
{
"epoch": 0.002609700630056295,
"grad_norm": 0.7483009696006775,
"learning_rate": 9.874639560909117e-05,
"loss": 2.1124,
"step": 35
},
{
"epoch": 0.0026842635052007607,
"grad_norm": 0.667682409286499,
"learning_rate": 9.819814303479267e-05,
"loss": 1.9806,
"step": 36
},
{
"epoch": 0.002758826380345226,
"grad_norm": 0.6942345499992371,
"learning_rate": 9.755282581475769e-05,
"loss": 2.0423,
"step": 37
},
{
"epoch": 0.002833389255489692,
"grad_norm": 0.7064806222915649,
"learning_rate": 9.681174353198687e-05,
"loss": 2.1869,
"step": 38
},
{
"epoch": 0.0029079521306341572,
"grad_norm": 0.6997663974761963,
"learning_rate": 9.597638862757255e-05,
"loss": 2.1582,
"step": 39
},
{
"epoch": 0.0029825150057786226,
"grad_norm": 0.6768945455551147,
"learning_rate": 9.504844339512095e-05,
"loss": 2.1976,
"step": 40
},
{
"epoch": 0.0030570778809230884,
"grad_norm": 0.7602648138999939,
"learning_rate": 9.40297765928369e-05,
"loss": 2.0887,
"step": 41
},
{
"epoch": 0.003131640756067554,
"grad_norm": 0.6888823509216309,
"learning_rate": 9.292243968009331e-05,
"loss": 2.0271,
"step": 42
},
{
"epoch": 0.0032062036312120196,
"grad_norm": 0.694747805595398,
"learning_rate": 9.172866268606513e-05,
"loss": 2.081,
"step": 43
},
{
"epoch": 0.003280766506356485,
"grad_norm": 0.6592668890953064,
"learning_rate": 9.045084971874738e-05,
"loss": 1.9375,
"step": 44
},
{
"epoch": 0.003355329381500951,
"grad_norm": 0.7744700312614441,
"learning_rate": 8.90915741234015e-05,
"loss": 2.1293,
"step": 45
},
{
"epoch": 0.003429892256645416,
"grad_norm": 0.7216829657554626,
"learning_rate": 8.765357330018056e-05,
"loss": 2.0884,
"step": 46
},
{
"epoch": 0.003504455131789882,
"grad_norm": 0.7232696413993835,
"learning_rate": 8.613974319136958e-05,
"loss": 2.0312,
"step": 47
},
{
"epoch": 0.0035790180069343474,
"grad_norm": 0.964648425579071,
"learning_rate": 8.455313244934324e-05,
"loss": 1.9873,
"step": 48
},
{
"epoch": 0.003653580882078813,
"grad_norm": 0.8183997869491577,
"learning_rate": 8.289693629698564e-05,
"loss": 2.0139,
"step": 49
},
{
"epoch": 0.0037281437572232786,
"grad_norm": 0.970118522644043,
"learning_rate": 8.117449009293668e-05,
"loss": 2.1238,
"step": 50
},
{
"epoch": 0.0037281437572232786,
"eval_loss": 2.0290143489837646,
"eval_runtime": 1975.9168,
"eval_samples_per_second": 11.432,
"eval_steps_per_second": 5.716,
"step": 50
},
{
"epoch": 0.003802706632367744,
"grad_norm": 0.5115548968315125,
"learning_rate": 7.938926261462366e-05,
"loss": 1.6365,
"step": 51
},
{
"epoch": 0.00387726950751221,
"grad_norm": 0.7051359415054321,
"learning_rate": 7.754484907260513e-05,
"loss": 2.0046,
"step": 52
},
{
"epoch": 0.003951832382656675,
"grad_norm": 0.46429726481437683,
"learning_rate": 7.564496387029532e-05,
"loss": 1.8634,
"step": 53
},
{
"epoch": 0.004026395257801141,
"grad_norm": 0.40492793917655945,
"learning_rate": 7.369343312364993e-05,
"loss": 1.9598,
"step": 54
},
{
"epoch": 0.004100958132945607,
"grad_norm": 0.39108383655548096,
"learning_rate": 7.169418695587791e-05,
"loss": 1.8478,
"step": 55
},
{
"epoch": 0.004175521008090072,
"grad_norm": 0.42470911145210266,
"learning_rate": 6.965125158269619e-05,
"loss": 2.0799,
"step": 56
},
{
"epoch": 0.004250083883234538,
"grad_norm": 0.45118194818496704,
"learning_rate": 6.756874120406714e-05,
"loss": 2.045,
"step": 57
},
{
"epoch": 0.004324646758379003,
"grad_norm": 0.4153003990650177,
"learning_rate": 6.545084971874738e-05,
"loss": 1.8896,
"step": 58
},
{
"epoch": 0.004399209633523468,
"grad_norm": 0.4770011305809021,
"learning_rate": 6.330184227833376e-05,
"loss": 2.0487,
"step": 59
},
{
"epoch": 0.004473772508667934,
"grad_norm": 0.46756961941719055,
"learning_rate": 6.112604669781572e-05,
"loss": 1.9976,
"step": 60
},
{
"epoch": 0.0045483353838124,
"grad_norm": 0.47102442383766174,
"learning_rate": 5.8927844739931834e-05,
"loss": 1.9983,
"step": 61
},
{
"epoch": 0.004622898258956866,
"grad_norm": 0.5105215907096863,
"learning_rate": 5.6711663290882776e-05,
"loss": 2.0145,
"step": 62
},
{
"epoch": 0.004697461134101331,
"grad_norm": 0.48268643021583557,
"learning_rate": 5.448196544517168e-05,
"loss": 2.0284,
"step": 63
},
{
"epoch": 0.0047720240092457965,
"grad_norm": 0.49653950333595276,
"learning_rate": 5.2243241517525754e-05,
"loss": 2.0299,
"step": 64
},
{
"epoch": 0.004846586884390262,
"grad_norm": 0.4842401444911957,
"learning_rate": 5e-05,
"loss": 1.9628,
"step": 65
},
{
"epoch": 0.004921149759534727,
"grad_norm": 0.5133103728294373,
"learning_rate": 4.775675848247427e-05,
"loss": 1.9677,
"step": 66
},
{
"epoch": 0.004995712634679193,
"grad_norm": 0.5002983808517456,
"learning_rate": 4.551803455482833e-05,
"loss": 2.0314,
"step": 67
},
{
"epoch": 0.005070275509823659,
"grad_norm": 0.5741726756095886,
"learning_rate": 4.328833670911724e-05,
"loss": 2.007,
"step": 68
},
{
"epoch": 0.005144838384968125,
"grad_norm": 0.5411697626113892,
"learning_rate": 4.107215526006817e-05,
"loss": 1.954,
"step": 69
},
{
"epoch": 0.00521940126011259,
"grad_norm": 0.5465304255485535,
"learning_rate": 3.887395330218429e-05,
"loss": 1.9437,
"step": 70
},
{
"epoch": 0.0052939641352570555,
"grad_norm": 0.5512676239013672,
"learning_rate": 3.6698157721666246e-05,
"loss": 1.8954,
"step": 71
},
{
"epoch": 0.005368527010401521,
"grad_norm": 0.5813472270965576,
"learning_rate": 3.4549150281252636e-05,
"loss": 2.0268,
"step": 72
},
{
"epoch": 0.005443089885545986,
"grad_norm": 0.5529889464378357,
"learning_rate": 3.243125879593286e-05,
"loss": 1.9832,
"step": 73
},
{
"epoch": 0.005517652760690452,
"grad_norm": 0.5440077781677246,
"learning_rate": 3.0348748417303823e-05,
"loss": 1.9148,
"step": 74
},
{
"epoch": 0.005592215635834918,
"grad_norm": 0.5623196363449097,
"learning_rate": 2.8305813044122097e-05,
"loss": 2.0762,
"step": 75
},
{
"epoch": 0.005666778510979384,
"grad_norm": 0.5488706827163696,
"learning_rate": 2.630656687635007e-05,
"loss": 1.9115,
"step": 76
},
{
"epoch": 0.005741341386123849,
"grad_norm": 0.5504707098007202,
"learning_rate": 2.43550361297047e-05,
"loss": 1.9987,
"step": 77
},
{
"epoch": 0.0058159042612683145,
"grad_norm": 0.544782817363739,
"learning_rate": 2.245515092739488e-05,
"loss": 2.0636,
"step": 78
},
{
"epoch": 0.00589046713641278,
"grad_norm": 0.5880534648895264,
"learning_rate": 2.061073738537635e-05,
"loss": 1.9786,
"step": 79
},
{
"epoch": 0.005965030011557245,
"grad_norm": 0.526584267616272,
"learning_rate": 1.8825509907063327e-05,
"loss": 1.9431,
"step": 80
},
{
"epoch": 0.006039592886701711,
"grad_norm": 0.5774157643318176,
"learning_rate": 1.7103063703014372e-05,
"loss": 2.0321,
"step": 81
},
{
"epoch": 0.006114155761846177,
"grad_norm": 0.5706311464309692,
"learning_rate": 1.544686755065677e-05,
"loss": 2.0225,
"step": 82
},
{
"epoch": 0.006188718636990643,
"grad_norm": 0.5908026099205017,
"learning_rate": 1.3860256808630428e-05,
"loss": 1.9953,
"step": 83
},
{
"epoch": 0.006263281512135108,
"grad_norm": 0.6149851679801941,
"learning_rate": 1.2346426699819458e-05,
"loss": 1.9548,
"step": 84
},
{
"epoch": 0.0063378443872795735,
"grad_norm": 0.6206725835800171,
"learning_rate": 1.090842587659851e-05,
"loss": 2.006,
"step": 85
},
{
"epoch": 0.006412407262424039,
"grad_norm": 0.610331118106842,
"learning_rate": 9.549150281252633e-06,
"loss": 2.0609,
"step": 86
},
{
"epoch": 0.006486970137568504,
"grad_norm": 0.5870972275733948,
"learning_rate": 8.271337313934869e-06,
"loss": 1.9884,
"step": 87
},
{
"epoch": 0.00656153301271297,
"grad_norm": 0.641563892364502,
"learning_rate": 7.077560319906695e-06,
"loss": 2.1203,
"step": 88
},
{
"epoch": 0.006636095887857436,
"grad_norm": 0.7119360566139221,
"learning_rate": 5.9702234071631e-06,
"loss": 2.0064,
"step": 89
},
{
"epoch": 0.006710658763001902,
"grad_norm": 0.6424351930618286,
"learning_rate": 4.951556604879048e-06,
"loss": 2.1295,
"step": 90
},
{
"epoch": 0.006785221638146367,
"grad_norm": 0.7995273470878601,
"learning_rate": 4.023611372427471e-06,
"loss": 1.9488,
"step": 91
},
{
"epoch": 0.006859784513290832,
"grad_norm": 0.6758121848106384,
"learning_rate": 3.18825646801314e-06,
"loss": 2.0611,
"step": 92
},
{
"epoch": 0.006934347388435298,
"grad_norm": 0.6477658152580261,
"learning_rate": 2.4471741852423237e-06,
"loss": 1.9928,
"step": 93
},
{
"epoch": 0.007008910263579764,
"grad_norm": 0.6778663396835327,
"learning_rate": 1.8018569652073381e-06,
"loss": 2.0962,
"step": 94
},
{
"epoch": 0.007083473138724229,
"grad_norm": 0.6519765853881836,
"learning_rate": 1.2536043909088191e-06,
"loss": 2.0006,
"step": 95
},
{
"epoch": 0.007158036013868695,
"grad_norm": 0.7212676405906677,
"learning_rate": 8.035205700685167e-07,
"loss": 2.0475,
"step": 96
},
{
"epoch": 0.007232598889013161,
"grad_norm": 0.7102315425872803,
"learning_rate": 4.52511911603265e-07,
"loss": 2.0604,
"step": 97
},
{
"epoch": 0.007307161764157626,
"grad_norm": 0.7387847900390625,
"learning_rate": 2.012853002380466e-07,
"loss": 2.0218,
"step": 98
},
{
"epoch": 0.007381724639302091,
"grad_norm": 0.8174800276756287,
"learning_rate": 5.0346672934270534e-08,
"loss": 2.0107,
"step": 99
},
{
"epoch": 0.007456287514446557,
"grad_norm": 0.9243033528327942,
"learning_rate": 0.0,
"loss": 1.9478,
"step": 100
},
{
"epoch": 0.007456287514446557,
"eval_loss": 1.97798752784729,
"eval_runtime": 1976.5858,
"eval_samples_per_second": 11.428,
"eval_steps_per_second": 5.714,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.591481825576878e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
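
The state above follows the trainer_state.json layout written by the Hugging Face Transformers Trainer at a checkpoint. Below is a minimal, illustrative sketch of how such a file can be inspected; it is not part of the checkpoint itself, and the local filename "trainer_state.json" is an assumption.

# Sketch: load a Trainer state file like the one above and summarize the run.
# Assumes the JSON has been saved locally as "trainer_state.json" (hypothetical path).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training log entries carry "loss"; evaluation entries carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"global_step: {state['global_step']}, epoch: {state['epoch']:.6f}")
print(f"best_metric (eval_loss): {state['best_metric']}")
print(f"best checkpoint: {state['best_model_checkpoint']}")

# Training loss at the first and last logged steps.
print(f"train loss: {train_logs[0]['loss']} (step {train_logs[0]['step']}) -> "
      f"{train_logs[-1]['loss']} (step {train_logs[-1]['step']})")

# Eval loss at each eval_steps interval (every 50 steps in this run).
for e in eval_logs:
    print(f"step {e['step']}: eval_loss = {e['eval_loss']}")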