{
"best_metric": 0.5428289771080017,
"best_model_checkpoint": "miner_id_24/checkpoint-50",
"epoch": 3.0038910505836576,
"eval_steps": 50,
"global_step": 193,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01556420233463035,
"grad_norm": 1.7703475952148438,
"learning_rate": 5e-06,
"loss": 0.6598,
"step": 1
},
{
"epoch": 0.01556420233463035,
"eval_loss": 0.805687665939331,
"eval_runtime": 10.5307,
"eval_samples_per_second": 10.256,
"eval_steps_per_second": 5.128,
"step": 1
},
{
"epoch": 0.0311284046692607,
"grad_norm": 2.244293689727783,
"learning_rate": 1e-05,
"loss": 0.7386,
"step": 2
},
{
"epoch": 0.04669260700389105,
"grad_norm": 1.9753533601760864,
"learning_rate": 1.5e-05,
"loss": 0.709,
"step": 3
},
{
"epoch": 0.0622568093385214,
"grad_norm": 1.581067681312561,
"learning_rate": 2e-05,
"loss": 0.6545,
"step": 4
},
{
"epoch": 0.07782101167315175,
"grad_norm": 1.4357757568359375,
"learning_rate": 2.5e-05,
"loss": 0.6287,
"step": 5
},
{
"epoch": 0.0933852140077821,
"grad_norm": 1.279219627380371,
"learning_rate": 3e-05,
"loss": 0.6593,
"step": 6
},
{
"epoch": 0.10894941634241245,
"grad_norm": 1.0461534261703491,
"learning_rate": 3.5e-05,
"loss": 0.563,
"step": 7
},
{
"epoch": 0.1245136186770428,
"grad_norm": 0.9932262897491455,
"learning_rate": 4e-05,
"loss": 0.6083,
"step": 8
},
{
"epoch": 0.14007782101167315,
"grad_norm": 1.0002769231796265,
"learning_rate": 4.5e-05,
"loss": 0.575,
"step": 9
},
{
"epoch": 0.1556420233463035,
"grad_norm": 1.071124792098999,
"learning_rate": 5e-05,
"loss": 0.5916,
"step": 10
},
{
"epoch": 0.17120622568093385,
"grad_norm": 1.0537177324295044,
"learning_rate": 5.500000000000001e-05,
"loss": 0.5642,
"step": 11
},
{
"epoch": 0.1867704280155642,
"grad_norm": 0.9426125884056091,
"learning_rate": 6e-05,
"loss": 0.5533,
"step": 12
},
{
"epoch": 0.20233463035019456,
"grad_norm": 0.9893934726715088,
"learning_rate": 6.500000000000001e-05,
"loss": 0.5613,
"step": 13
},
{
"epoch": 0.2178988326848249,
"grad_norm": 0.9386663436889648,
"learning_rate": 7e-05,
"loss": 0.578,
"step": 14
},
{
"epoch": 0.23346303501945526,
"grad_norm": 0.9610932469367981,
"learning_rate": 7.500000000000001e-05,
"loss": 0.5623,
"step": 15
},
{
"epoch": 0.2490272373540856,
"grad_norm": 1.0002214908599854,
"learning_rate": 8e-05,
"loss": 0.6155,
"step": 16
},
{
"epoch": 0.26459143968871596,
"grad_norm": 0.7454622983932495,
"learning_rate": 8.5e-05,
"loss": 0.5654,
"step": 17
},
{
"epoch": 0.2801556420233463,
"grad_norm": 0.7215997576713562,
"learning_rate": 9e-05,
"loss": 0.5547,
"step": 18
},
{
"epoch": 0.29571984435797666,
"grad_norm": 0.6941145658493042,
"learning_rate": 9.5e-05,
"loss": 0.5509,
"step": 19
},
{
"epoch": 0.311284046692607,
"grad_norm": 0.7395036220550537,
"learning_rate": 0.0001,
"loss": 0.5265,
"step": 20
},
{
"epoch": 0.32684824902723736,
"grad_norm": 0.7585510015487671,
"learning_rate": 9.999175604498867e-05,
"loss": 0.584,
"step": 21
},
{
"epoch": 0.3424124513618677,
"grad_norm": 0.7440906167030334,
"learning_rate": 9.996702689846645e-05,
"loss": 0.5634,
"step": 22
},
{
"epoch": 0.35797665369649806,
"grad_norm": 0.7914589643478394,
"learning_rate": 9.992582071507216e-05,
"loss": 0.5744,
"step": 23
},
{
"epoch": 0.3735408560311284,
"grad_norm": 0.7631120681762695,
"learning_rate": 9.986815108288272e-05,
"loss": 0.5593,
"step": 24
},
{
"epoch": 0.38910505836575876,
"grad_norm": 0.68586665391922,
"learning_rate": 9.979403701893226e-05,
"loss": 0.5455,
"step": 25
},
{
"epoch": 0.4046692607003891,
"grad_norm": 0.7517482042312622,
"learning_rate": 9.970350296294113e-05,
"loss": 0.5978,
"step": 26
},
{
"epoch": 0.42023346303501946,
"grad_norm": 0.7497158646583557,
"learning_rate": 9.959657876925671e-05,
"loss": 0.5164,
"step": 27
},
{
"epoch": 0.4357976653696498,
"grad_norm": 0.6995195746421814,
"learning_rate": 9.94732996970087e-05,
"loss": 0.5031,
"step": 28
},
{
"epoch": 0.45136186770428016,
"grad_norm": 0.7487537264823914,
"learning_rate": 9.933370639848211e-05,
"loss": 0.5705,
"step": 29
},
{
"epoch": 0.4669260700389105,
"grad_norm": 0.7707574963569641,
"learning_rate": 9.917784490571187e-05,
"loss": 0.5422,
"step": 30
},
{
"epoch": 0.48249027237354086,
"grad_norm": 0.7618570327758789,
"learning_rate": 9.900576661530335e-05,
"loss": 0.4955,
"step": 31
},
{
"epoch": 0.4980544747081712,
"grad_norm": 0.8625350594520569,
"learning_rate": 9.88175282714839e-05,
"loss": 0.6098,
"step": 32
},
{
"epoch": 0.5136186770428015,
"grad_norm": 0.5663038492202759,
"learning_rate": 9.861319194739109e-05,
"loss": 0.5213,
"step": 33
},
{
"epoch": 0.5291828793774319,
"grad_norm": 0.6109879016876221,
"learning_rate": 9.83928250246034e-05,
"loss": 0.555,
"step": 34
},
{
"epoch": 0.5447470817120622,
"grad_norm": 0.5990862250328064,
"learning_rate": 9.815650017092077e-05,
"loss": 0.5395,
"step": 35
},
{
"epoch": 0.5603112840466926,
"grad_norm": 0.5716366767883301,
"learning_rate": 9.790429531640161e-05,
"loss": 0.5209,
"step": 36
},
{
"epoch": 0.5758754863813229,
"grad_norm": 0.6569287776947021,
"learning_rate": 9.763629362766496e-05,
"loss": 0.5587,
"step": 37
},
{
"epoch": 0.5914396887159533,
"grad_norm": 0.6579481363296509,
"learning_rate": 9.735258348046536e-05,
"loss": 0.5347,
"step": 38
},
{
"epoch": 0.6070038910505836,
"grad_norm": 0.6650751829147339,
"learning_rate": 9.705325843055045e-05,
"loss": 0.5646,
"step": 39
},
{
"epoch": 0.622568093385214,
"grad_norm": 0.6674900650978088,
"learning_rate": 9.673841718280999e-05,
"loss": 0.6048,
"step": 40
},
{
"epoch": 0.6381322957198443,
"grad_norm": 0.6020020246505737,
"learning_rate": 9.64081635587273e-05,
"loss": 0.5133,
"step": 41
},
{
"epoch": 0.6536964980544747,
"grad_norm": 0.6875168681144714,
"learning_rate": 9.606260646214313e-05,
"loss": 0.5529,
"step": 42
},
{
"epoch": 0.669260700389105,
"grad_norm": 0.6856683492660522,
"learning_rate": 9.570185984334383e-05,
"loss": 0.5606,
"step": 43
},
{
"epoch": 0.6848249027237354,
"grad_norm": 0.6488378047943115,
"learning_rate": 9.53260426614852e-05,
"loss": 0.5298,
"step": 44
},
{
"epoch": 0.7003891050583657,
"grad_norm": 0.6739886403083801,
"learning_rate": 9.493527884536486e-05,
"loss": 0.5314,
"step": 45
},
{
"epoch": 0.7159533073929961,
"grad_norm": 0.6689781546592712,
"learning_rate": 9.452969725255558e-05,
"loss": 0.5413,
"step": 46
},
{
"epoch": 0.7315175097276264,
"grad_norm": 0.7617279887199402,
"learning_rate": 9.410943162691359e-05,
"loss": 0.5513,
"step": 47
},
{
"epoch": 0.7470817120622568,
"grad_norm": 0.7101250886917114,
"learning_rate": 9.367462055447528e-05,
"loss": 0.4799,
"step": 48
},
{
"epoch": 0.7626459143968871,
"grad_norm": 0.5001528263092041,
"learning_rate": 9.322540741775744e-05,
"loss": 0.5058,
"step": 49
},
{
"epoch": 0.7782101167315175,
"grad_norm": 0.5632477402687073,
"learning_rate": 9.276194034847566e-05,
"loss": 0.5636,
"step": 50
},
{
"epoch": 0.7782101167315175,
"eval_loss": 0.5428289771080017,
"eval_runtime": 10.8958,
"eval_samples_per_second": 9.912,
"eval_steps_per_second": 4.956,
"step": 50
},
{
"epoch": 0.7937743190661478,
"grad_norm": 0.5578231811523438,
"learning_rate": 9.228437217869667e-05,
"loss": 0.5325,
"step": 51
},
{
"epoch": 0.8093385214007782,
"grad_norm": 0.5764113068580627,
"learning_rate": 9.179286039044073e-05,
"loss": 0.5326,
"step": 52
},
{
"epoch": 0.8249027237354085,
"grad_norm": 0.6034092903137207,
"learning_rate": 9.128756706375065e-05,
"loss": 0.5717,
"step": 53
},
{
"epoch": 0.8404669260700389,
"grad_norm": 0.6172077655792236,
"learning_rate": 9.076865882324452e-05,
"loss": 0.5724,
"step": 54
},
{
"epoch": 0.8560311284046692,
"grad_norm": 0.5496807098388672,
"learning_rate": 9.023630678316995e-05,
"loss": 0.5209,
"step": 55
},
{
"epoch": 0.8715953307392996,
"grad_norm": 0.583033561706543,
"learning_rate": 8.969068649097766e-05,
"loss": 0.5555,
"step": 56
},
{
"epoch": 0.8871595330739299,
"grad_norm": 0.5868954062461853,
"learning_rate": 8.913197786943336e-05,
"loss": 0.5197,
"step": 57
},
{
"epoch": 0.9027237354085603,
"grad_norm": 0.5880308747291565,
"learning_rate": 8.856036515728666e-05,
"loss": 0.5064,
"step": 58
},
{
"epoch": 0.9182879377431906,
"grad_norm": 0.689042866230011,
"learning_rate": 8.797603684851685e-05,
"loss": 0.5509,
"step": 59
},
{
"epoch": 0.933852140077821,
"grad_norm": 0.6428240537643433,
"learning_rate": 8.737918563017553e-05,
"loss": 0.5875,
"step": 60
},
{
"epoch": 0.9494163424124513,
"grad_norm": 0.5802725553512573,
"learning_rate": 8.677000831884638e-05,
"loss": 0.5031,
"step": 61
},
{
"epoch": 0.9649805447470817,
"grad_norm": 0.6728083491325378,
"learning_rate": 8.614870579574337e-05,
"loss": 0.5216,
"step": 62
},
{
"epoch": 0.980544747081712,
"grad_norm": 0.620725154876709,
"learning_rate": 8.551548294046843e-05,
"loss": 0.4774,
"step": 63
},
{
"epoch": 0.9961089494163424,
"grad_norm": 0.6825420260429382,
"learning_rate": 8.487054856345081e-05,
"loss": 0.5513,
"step": 64
},
{
"epoch": 1.0116731517509727,
"grad_norm": 1.8129425048828125,
"learning_rate": 8.421411533709009e-05,
"loss": 0.9706,
"step": 65
},
{
"epoch": 1.027237354085603,
"grad_norm": 0.5069100260734558,
"learning_rate": 8.35463997256257e-05,
"loss": 0.4212,
"step": 66
},
{
"epoch": 1.0428015564202335,
"grad_norm": 0.5340793132781982,
"learning_rate": 8.28676219137561e-05,
"loss": 0.4766,
"step": 67
},
{
"epoch": 1.0583657587548638,
"grad_norm": 0.5560380816459656,
"learning_rate": 8.217800573403105e-05,
"loss": 0.4478,
"step": 68
},
{
"epoch": 1.0739299610894941,
"grad_norm": 0.5705240368843079,
"learning_rate": 8.147777859304096e-05,
"loss": 0.395,
"step": 69
},
{
"epoch": 1.0894941634241244,
"grad_norm": 0.6902859807014465,
"learning_rate": 8.076717139642775e-05,
"loss": 0.4399,
"step": 70
},
{
"epoch": 1.105058365758755,
"grad_norm": 0.6529071927070618,
"learning_rate": 8.004641847274181e-05,
"loss": 0.3999,
"step": 71
},
{
"epoch": 1.1206225680933852,
"grad_norm": 0.5854980945587158,
"learning_rate": 7.931575749617026e-05,
"loss": 0.3876,
"step": 72
},
{
"epoch": 1.1361867704280155,
"grad_norm": 0.5517836809158325,
"learning_rate": 7.857542940816183e-05,
"loss": 0.3721,
"step": 73
},
{
"epoch": 1.1517509727626458,
"grad_norm": 0.6773819327354431,
"learning_rate": 7.782567833797457e-05,
"loss": 0.3864,
"step": 74
},
{
"epoch": 1.1673151750972763,
"grad_norm": 0.7298661470413208,
"learning_rate": 7.70667515221722e-05,
"loss": 0.4056,
"step": 75
},
{
"epoch": 1.1828793774319066,
"grad_norm": 0.6650790572166443,
"learning_rate": 7.629889922309577e-05,
"loss": 0.4164,
"step": 76
},
{
"epoch": 1.198443579766537,
"grad_norm": 0.7574925422668457,
"learning_rate": 7.552237464633761e-05,
"loss": 0.3818,
"step": 77
},
{
"epoch": 1.2140077821011672,
"grad_norm": 0.7199667692184448,
"learning_rate": 7.473743385724478e-05,
"loss": 0.3314,
"step": 78
},
{
"epoch": 1.2295719844357977,
"grad_norm": 0.7638531923294067,
"learning_rate": 7.394433569647934e-05,
"loss": 0.3508,
"step": 79
},
{
"epoch": 1.245136186770428,
"grad_norm": 0.8220713138580322,
"learning_rate": 7.31433416946636e-05,
"loss": 0.3309,
"step": 80
},
{
"epoch": 1.2607003891050583,
"grad_norm": 0.7258894443511963,
"learning_rate": 7.233471598613815e-05,
"loss": 0.4268,
"step": 81
},
{
"epoch": 1.2762645914396886,
"grad_norm": 0.680307149887085,
"learning_rate": 7.151872522186146e-05,
"loss": 0.4124,
"step": 82
},
{
"epoch": 1.2918287937743191,
"grad_norm": 0.6185837984085083,
"learning_rate": 7.069563848147956e-05,
"loss": 0.4174,
"step": 83
},
{
"epoch": 1.3073929961089494,
"grad_norm": 0.5700533390045166,
"learning_rate": 6.986572718459479e-05,
"loss": 0.4186,
"step": 84
},
{
"epoch": 1.3229571984435797,
"grad_norm": 0.6120938062667847,
"learning_rate": 6.902926500126292e-05,
"loss": 0.4139,
"step": 85
},
{
"epoch": 1.3385214007782102,
"grad_norm": 0.5696559548377991,
"learning_rate": 6.818652776174827e-05,
"loss": 0.3949,
"step": 86
},
{
"epoch": 1.3540856031128405,
"grad_norm": 0.6349061131477356,
"learning_rate": 6.733779336556642e-05,
"loss": 0.4209,
"step": 87
},
{
"epoch": 1.3696498054474708,
"grad_norm": 0.6833882331848145,
"learning_rate": 6.648334168984452e-05,
"loss": 0.4119,
"step": 88
},
{
"epoch": 1.3852140077821011,
"grad_norm": 0.6829484105110168,
"learning_rate": 6.562345449702951e-05,
"loss": 0.3991,
"step": 89
},
{
"epoch": 1.4007782101167314,
"grad_norm": 0.6942391991615295,
"learning_rate": 6.47584153419747e-05,
"loss": 0.4104,
"step": 90
},
{
"epoch": 1.416342412451362,
"grad_norm": 0.6838539242744446,
"learning_rate": 6.388850947843517e-05,
"loss": 0.3672,
"step": 91
},
{
"epoch": 1.4319066147859922,
"grad_norm": 0.7023414373397827,
"learning_rate": 6.301402376500304e-05,
"loss": 0.3579,
"step": 92
},
{
"epoch": 1.4474708171206225,
"grad_norm": 0.738709032535553,
"learning_rate": 6.213524657051353e-05,
"loss": 0.4006,
"step": 93
},
{
"epoch": 1.463035019455253,
"grad_norm": 0.724032461643219,
"learning_rate": 6.125246767895286e-05,
"loss": 0.328,
"step": 94
},
{
"epoch": 1.4785992217898833,
"grad_norm": 0.7786082029342651,
"learning_rate": 6.036597819389972e-05,
"loss": 0.3658,
"step": 95
},
{
"epoch": 1.4941634241245136,
"grad_norm": 0.9172019958496094,
"learning_rate": 5.947607044253142e-05,
"loss": 0.3407,
"step": 96
},
{
"epoch": 1.509727626459144,
"grad_norm": 0.6716464161872864,
"learning_rate": 5.858303787922663e-05,
"loss": 0.3548,
"step": 97
},
{
"epoch": 1.5252918287937742,
"grad_norm": 0.7111585736274719,
"learning_rate": 5.768717498879635e-05,
"loss": 0.4403,
"step": 98
},
{
"epoch": 1.5408560311284045,
"grad_norm": 0.6962729692459106,
"learning_rate": 5.67887771893752e-05,
"loss": 0.4766,
"step": 99
},
{
"epoch": 1.556420233463035,
"grad_norm": 0.5779573321342468,
"learning_rate": 5.5888140735004804e-05,
"loss": 0.3564,
"step": 100
},
{
"epoch": 1.556420233463035,
"eval_loss": 0.5569015145301819,
"eval_runtime": 10.8918,
"eval_samples_per_second": 9.916,
"eval_steps_per_second": 4.958,
"step": 100
},
{
"epoch": 1.5719844357976653,
"grad_norm": 0.6904802322387695,
"learning_rate": 5.498556261794161e-05,
"loss": 0.4492,
"step": 101
},
{
"epoch": 1.5875486381322959,
"grad_norm": 0.6333062648773193,
"learning_rate": 5.4081340470721284e-05,
"loss": 0.4208,
"step": 102
},
{
"epoch": 1.6031128404669261,
"grad_norm": 0.5850105881690979,
"learning_rate": 5.31757724680119e-05,
"loss": 0.3787,
"step": 103
},
{
"epoch": 1.6186770428015564,
"grad_norm": 0.628386914730072,
"learning_rate": 5.22691572282884e-05,
"loss": 0.4535,
"step": 104
},
{
"epoch": 1.6342412451361867,
"grad_norm": 0.5983573198318481,
"learning_rate": 5.136179371536076e-05,
"loss": 0.3768,
"step": 105
},
{
"epoch": 1.649805447470817,
"grad_norm": 0.6634320020675659,
"learning_rate": 5.045398113978817e-05,
"loss": 0.3814,
"step": 106
},
{
"epoch": 1.6653696498054473,
"grad_norm": 0.5900952219963074,
"learning_rate": 4.9546018860211844e-05,
"loss": 0.3388,
"step": 107
},
{
"epoch": 1.6809338521400778,
"grad_norm": 0.741165816783905,
"learning_rate": 4.863820628463925e-05,
"loss": 0.359,
"step": 108
},
{
"epoch": 1.6964980544747081,
"grad_norm": 0.7050174474716187,
"learning_rate": 4.773084277171161e-05,
"loss": 0.3425,
"step": 109
},
{
"epoch": 1.7120622568093387,
"grad_norm": 0.7681849598884583,
"learning_rate": 4.682422753198812e-05,
"loss": 0.3615,
"step": 110
},
{
"epoch": 1.727626459143969,
"grad_norm": 0.7492854595184326,
"learning_rate": 4.591865952927873e-05,
"loss": 0.3326,
"step": 111
},
{
"epoch": 1.7431906614785992,
"grad_norm": 0.8712835907936096,
"learning_rate": 4.501443738205841e-05,
"loss": 0.3441,
"step": 112
},
{
"epoch": 1.7587548638132295,
"grad_norm": 0.6653512120246887,
"learning_rate": 4.41118592649952e-05,
"loss": 0.4284,
"step": 113
},
{
"epoch": 1.7743190661478598,
"grad_norm": 0.6634366512298584,
"learning_rate": 4.321122281062481e-05,
"loss": 0.441,
"step": 114
},
{
"epoch": 1.7898832684824901,
"grad_norm": 0.6530463099479675,
"learning_rate": 4.231282501120366e-05,
"loss": 0.4032,
"step": 115
},
{
"epoch": 1.8054474708171206,
"grad_norm": 0.6511101126670837,
"learning_rate": 4.1416962120773396e-05,
"loss": 0.4025,
"step": 116
},
{
"epoch": 1.821011673151751,
"grad_norm": 0.6638808846473694,
"learning_rate": 4.0523929557468594e-05,
"loss": 0.4072,
"step": 117
},
{
"epoch": 1.8365758754863815,
"grad_norm": 0.6273797750473022,
"learning_rate": 3.9634021806100274e-05,
"loss": 0.424,
"step": 118
},
{
"epoch": 1.8521400778210118,
"grad_norm": 0.6482309103012085,
"learning_rate": 3.874753232104714e-05,
"loss": 0.4046,
"step": 119
},
{
"epoch": 1.867704280155642,
"grad_norm": 0.6184787750244141,
"learning_rate": 3.786475342948647e-05,
"loss": 0.3906,
"step": 120
},
{
"epoch": 1.8832684824902723,
"grad_norm": 0.655604362487793,
"learning_rate": 3.6985976234996954e-05,
"loss": 0.4001,
"step": 121
},
{
"epoch": 1.8988326848249026,
"grad_norm": 0.6891288757324219,
"learning_rate": 3.611149052156483e-05,
"loss": 0.4013,
"step": 122
},
{
"epoch": 1.914396887159533,
"grad_norm": 0.7344037890434265,
"learning_rate": 3.524158465802531e-05,
"loss": 0.3742,
"step": 123
},
{
"epoch": 1.9299610894941635,
"grad_norm": 0.669601559638977,
"learning_rate": 3.437654550297049e-05,
"loss": 0.3722,
"step": 124
},
{
"epoch": 1.9455252918287937,
"grad_norm": 0.7063509225845337,
"learning_rate": 3.351665831015549e-05,
"loss": 0.3521,
"step": 125
},
{
"epoch": 1.9610894941634243,
"grad_norm": 0.7894170880317688,
"learning_rate": 3.2662206634433576e-05,
"loss": 0.3634,
"step": 126
},
{
"epoch": 1.9766536964980546,
"grad_norm": 0.799424946308136,
"learning_rate": 3.181347223825174e-05,
"loss": 0.3357,
"step": 127
},
{
"epoch": 1.9922178988326849,
"grad_norm": 0.8893400430679321,
"learning_rate": 3.0970734998737095e-05,
"loss": 0.3503,
"step": 128
},
{
"epoch": 2.007782101167315,
"grad_norm": 2.1515607833862305,
"learning_rate": 3.013427281540523e-05,
"loss": 0.6816,
"step": 129
},
{
"epoch": 2.0233463035019454,
"grad_norm": 0.48720091581344604,
"learning_rate": 2.9304361518520445e-05,
"loss": 0.2903,
"step": 130
},
{
"epoch": 2.0389105058365757,
"grad_norm": 0.6416101455688477,
"learning_rate": 2.8481274778138567e-05,
"loss": 0.3466,
"step": 131
},
{
"epoch": 2.054474708171206,
"grad_norm": 0.6130954623222351,
"learning_rate": 2.766528401386187e-05,
"loss": 0.3311,
"step": 132
},
{
"epoch": 2.0700389105058368,
"grad_norm": 0.6637570858001709,
"learning_rate": 2.685665830533642e-05,
"loss": 0.2966,
"step": 133
},
{
"epoch": 2.085603112840467,
"grad_norm": 0.697539210319519,
"learning_rate": 2.6055664303520653e-05,
"loss": 0.2972,
"step": 134
},
{
"epoch": 2.1011673151750974,
"grad_norm": 0.692152202129364,
"learning_rate": 2.526256614275524e-05,
"loss": 0.3071,
"step": 135
},
{
"epoch": 2.1167315175097277,
"grad_norm": 0.704645574092865,
"learning_rate": 2.4477625353662398e-05,
"loss": 0.2674,
"step": 136
},
{
"epoch": 2.132295719844358,
"grad_norm": 0.7347646951675415,
"learning_rate": 2.370110077690425e-05,
"loss": 0.2777,
"step": 137
},
{
"epoch": 2.1478599221789882,
"grad_norm": 0.7842221260070801,
"learning_rate": 2.2933248477827813e-05,
"loss": 0.2354,
"step": 138
},
{
"epoch": 2.1634241245136185,
"grad_norm": 0.702087938785553,
"learning_rate": 2.2174321662025427e-05,
"loss": 0.2226,
"step": 139
},
{
"epoch": 2.178988326848249,
"grad_norm": 0.8129343390464783,
"learning_rate": 2.1424570591838183e-05,
"loss": 0.2312,
"step": 140
},
{
"epoch": 2.1945525291828796,
"grad_norm": 0.7870627045631409,
"learning_rate": 2.068424250382974e-05,
"loss": 0.2237,
"step": 141
},
{
"epoch": 2.21011673151751,
"grad_norm": 0.8365576267242432,
"learning_rate": 1.9953581527258182e-05,
"loss": 0.2173,
"step": 142
},
{
"epoch": 2.22568093385214,
"grad_norm": 0.8475888967514038,
"learning_rate": 1.9232828603572256e-05,
"loss": 0.2003,
"step": 143
},
{
"epoch": 2.2412451361867705,
"grad_norm": 0.9297432899475098,
"learning_rate": 1.852222140695906e-05,
"loss": 0.1835,
"step": 144
},
{
"epoch": 2.2568093385214008,
"grad_norm": 0.9988775253295898,
"learning_rate": 1.7821994265968962e-05,
"loss": 0.2748,
"step": 145
},
{
"epoch": 2.272373540856031,
"grad_norm": 0.8897411823272705,
"learning_rate": 1.7132378086243904e-05,
"loss": 0.2846,
"step": 146
},
{
"epoch": 2.2879377431906613,
"grad_norm": 1.0305852890014648,
"learning_rate": 1.6453600274374298e-05,
"loss": 0.337,
"step": 147
},
{
"epoch": 2.3035019455252916,
"grad_norm": 0.9018167853355408,
"learning_rate": 1.5785884662909916e-05,
"loss": 0.2914,
"step": 148
},
{
"epoch": 2.319066147859922,
"grad_norm": 0.9446598291397095,
"learning_rate": 1.5129451436549203e-05,
"loss": 0.2451,
"step": 149
},
{
"epoch": 2.3346303501945527,
"grad_norm": 0.8176512718200684,
"learning_rate": 1.4484517059531588e-05,
"loss": 0.2951,
"step": 150
},
{
"epoch": 2.3346303501945527,
"eval_loss": 0.5986903309822083,
"eval_runtime": 10.8903,
"eval_samples_per_second": 9.917,
"eval_steps_per_second": 4.959,
"step": 150
},
{
"epoch": 2.350194552529183,
"grad_norm": 0.7516515851020813,
"learning_rate": 1.3851294204256638e-05,
"loss": 0.2704,
"step": 151
},
{
"epoch": 2.3657587548638133,
"grad_norm": 0.7066805958747864,
"learning_rate": 1.322999168115363e-05,
"loss": 0.2575,
"step": 152
},
{
"epoch": 2.3813229571984436,
"grad_norm": 0.7425258755683899,
"learning_rate": 1.262081436982448e-05,
"loss": 0.2803,
"step": 153
},
{
"epoch": 2.396887159533074,
"grad_norm": 0.7812573909759521,
"learning_rate": 1.2023963151483165e-05,
"loss": 0.2676,
"step": 154
},
{
"epoch": 2.412451361867704,
"grad_norm": 0.7473729848861694,
"learning_rate": 1.143963484271337e-05,
"loss": 0.2523,
"step": 155
},
{
"epoch": 2.4280155642023344,
"grad_norm": 0.7576512694358826,
"learning_rate": 1.0868022130566651e-05,
"loss": 0.2431,
"step": 156
},
{
"epoch": 2.443579766536965,
"grad_norm": 0.7459601759910583,
"learning_rate": 1.0309313509022351e-05,
"loss": 0.2187,
"step": 157
},
{
"epoch": 2.4591439688715955,
"grad_norm": 0.7837074398994446,
"learning_rate": 9.763693216830055e-06,
"loss": 0.209,
"step": 158
},
{
"epoch": 2.4747081712062258,
"grad_norm": 0.7549136281013489,
"learning_rate": 9.231341176755488e-06,
"loss": 0.1914,
"step": 159
},
{
"epoch": 2.490272373540856,
"grad_norm": 0.826531708240509,
"learning_rate": 8.712432936249365e-06,
"loss": 0.1852,
"step": 160
},
{
"epoch": 2.5058365758754864,
"grad_norm": 0.778913140296936,
"learning_rate": 8.207139609559283e-06,
"loss": 0.2547,
"step": 161
},
{
"epoch": 2.5214007782101167,
"grad_norm": 0.7146987915039062,
"learning_rate": 7.715627821303339e-06,
"loss": 0.3109,
"step": 162
},
{
"epoch": 2.536964980544747,
"grad_norm": 0.7523741722106934,
"learning_rate": 7.238059651524354e-06,
"loss": 0.308,
"step": 163
},
{
"epoch": 2.5525291828793772,
"grad_norm": 0.7645745873451233,
"learning_rate": 6.774592582242567e-06,
"loss": 0.266,
"step": 164
},
{
"epoch": 2.5680933852140075,
"grad_norm": 0.7666687965393066,
"learning_rate": 6.325379445524731e-06,
"loss": 0.2654,
"step": 165
},
{
"epoch": 2.5836575875486383,
"grad_norm": 0.8015041351318359,
"learning_rate": 5.890568373086425e-06,
"loss": 0.27,
"step": 166
},
{
"epoch": 2.5992217898832686,
"grad_norm": 0.883911669254303,
"learning_rate": 5.470302747444428e-06,
"loss": 0.283,
"step": 167
},
{
"epoch": 2.614785992217899,
"grad_norm": 0.9192494750022888,
"learning_rate": 5.064721154635155e-06,
"loss": 0.2714,
"step": 168
},
{
"epoch": 2.630350194552529,
"grad_norm": 0.9188051819801331,
"learning_rate": 4.673957338514812e-06,
"loss": 0.2363,
"step": 169
},
{
"epoch": 2.6459143968871595,
"grad_norm": 0.9108679890632629,
"learning_rate": 4.298140156656178e-06,
"loss": 0.2704,
"step": 170
},
{
"epoch": 2.6614785992217898,
"grad_norm": 0.9353700876235962,
"learning_rate": 3.937393537856871e-06,
"loss": 0.259,
"step": 171
},
{
"epoch": 2.6770428015564205,
"grad_norm": 0.7926956415176392,
"learning_rate": 3.5918364412727e-06,
"loss": 0.2228,
"step": 172
},
{
"epoch": 2.692607003891051,
"grad_norm": 0.8170326948165894,
"learning_rate": 3.261582817190023e-06,
"loss": 0.2315,
"step": 173
},
{
"epoch": 2.708171206225681,
"grad_norm": 0.7552840709686279,
"learning_rate": 2.9467415694495627e-06,
"loss": 0.2082,
"step": 174
},
{
"epoch": 2.7237354085603114,
"grad_norm": 0.8596252799034119,
"learning_rate": 2.6474165195346346e-06,
"loss": 0.1993,
"step": 175
},
{
"epoch": 2.7392996108949417,
"grad_norm": 0.8098993301391602,
"learning_rate": 2.363706372335045e-06,
"loss": 0.1707,
"step": 176
},
{
"epoch": 2.754863813229572,
"grad_norm": 0.8145043253898621,
"learning_rate": 2.095704683598376e-06,
"loss": 0.2661,
"step": 177
},
{
"epoch": 2.7704280155642023,
"grad_norm": 0.7261046767234802,
"learning_rate": 1.843499829079237e-06,
"loss": 0.3408,
"step": 178
},
{
"epoch": 2.7859922178988326,
"grad_norm": 0.6709286570549011,
"learning_rate": 1.6071749753965914e-06,
"loss": 0.264,
"step": 179
},
{
"epoch": 2.801556420233463,
"grad_norm": 0.7309187650680542,
"learning_rate": 1.3868080526089178e-06,
"loss": 0.3079,
"step": 180
},
{
"epoch": 2.817120622568093,
"grad_norm": 0.729026198387146,
"learning_rate": 1.1824717285160991e-06,
"loss": 0.2465,
"step": 181
},
{
"epoch": 2.832684824902724,
"grad_norm": 0.847587525844574,
"learning_rate": 9.942333846966746e-07,
"loss": 0.302,
"step": 182
},
{
"epoch": 2.848249027237354,
"grad_norm": 0.8388762474060059,
"learning_rate": 8.221550942881406e-07,
"loss": 0.2598,
"step": 183
},
{
"epoch": 2.8638132295719845,
"grad_norm": 0.8012559413909912,
"learning_rate": 6.662936015178978e-07,
"loss": 0.2566,
"step": 184
},
{
"epoch": 2.8793774319066148,
"grad_norm": 0.749428391456604,
"learning_rate": 5.267003029913065e-07,
"loss": 0.2746,
"step": 185
},
{
"epoch": 2.894941634241245,
"grad_norm": 0.7644572257995605,
"learning_rate": 4.03421230743295e-07,
"loss": 0.222,
"step": 186
},
{
"epoch": 2.9105058365758754,
"grad_norm": 0.8482156991958618,
"learning_rate": 2.9649703705887375e-07,
"loss": 0.2298,
"step": 187
},
{
"epoch": 2.926070038910506,
"grad_norm": 0.7980473041534424,
"learning_rate": 2.0596298106774213e-07,
"loss": 0.2058,
"step": 188
},
{
"epoch": 2.9416342412451364,
"grad_norm": 0.8403069972991943,
"learning_rate": 1.3184891711727764e-07,
"loss": 0.2168,
"step": 189
},
{
"epoch": 2.9571984435797667,
"grad_norm": 0.8483723998069763,
"learning_rate": 7.417928492784443e-08,
"loss": 0.1897,
"step": 190
},
{
"epoch": 2.972762645914397,
"grad_norm": 0.9391508102416992,
"learning_rate": 3.2973101533567695e-08,
"loss": 0.2266,
"step": 191
},
{
"epoch": 2.9883268482490273,
"grad_norm": 0.7663131952285767,
"learning_rate": 8.243955011333349e-09,
"loss": 0.1504,
"step": 192
},
{
"epoch": 3.0038910505836576,
"grad_norm": 1.7181938886642456,
"learning_rate": 0.0,
"loss": 0.4475,
"step": 193
}
],
"logging_steps": 1,
"max_steps": 193,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 2
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.4379746513413734e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}