{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.000789265982636,
"eval_steps": 500,
"global_step": 1268,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007892659826361484,
"grad_norm": 2.738257646560669,
"learning_rate": 7.5e-07,
"loss": 2.0033,
"step": 1
},
{
"epoch": 0.0015785319652722968,
"grad_norm": 1.657310128211975,
"learning_rate": 1.5e-06,
"loss": 2.0098,
"step": 2
},
{
"epoch": 0.0023677979479084454,
"grad_norm": 3.086771011352539,
"learning_rate": 2.25e-06,
"loss": 2.0645,
"step": 3
},
{
"epoch": 0.0031570639305445935,
"grad_norm": 3.7529759407043457,
"learning_rate": 3e-06,
"loss": 1.9987,
"step": 4
},
{
"epoch": 0.003946329913180742,
"grad_norm": 2.473452091217041,
"learning_rate": 3.75e-06,
"loss": 2.042,
"step": 5
},
{
"epoch": 0.004735595895816891,
"grad_norm": 2.465566873550415,
"learning_rate": 4.5e-06,
"loss": 1.9511,
"step": 6
},
{
"epoch": 0.0055248618784530384,
"grad_norm": 1.4017177820205688,
"learning_rate": 5.25e-06,
"loss": 2.038,
"step": 7
},
{
"epoch": 0.006314127861089187,
"grad_norm": 1.7734719514846802,
"learning_rate": 6e-06,
"loss": 2.1121,
"step": 8
},
{
"epoch": 0.007103393843725336,
"grad_norm": 1.63801109790802,
"learning_rate": 6.750000000000001e-06,
"loss": 2.0107,
"step": 9
},
{
"epoch": 0.007892659826361484,
"grad_norm": 1.9717129468917847,
"learning_rate": 7.5e-06,
"loss": 2.0291,
"step": 10
},
{
"epoch": 0.008681925808997633,
"grad_norm": 1.5256599187850952,
"learning_rate": 8.25e-06,
"loss": 2.0407,
"step": 11
},
{
"epoch": 0.009471191791633781,
"grad_norm": 1.4094102382659912,
"learning_rate": 9e-06,
"loss": 2.0014,
"step": 12
},
{
"epoch": 0.010260457774269928,
"grad_norm": 2.010549306869507,
"learning_rate": 9.75e-06,
"loss": 2.0341,
"step": 13
},
{
"epoch": 0.011049723756906077,
"grad_norm": 3.010610342025757,
"learning_rate": 1.05e-05,
"loss": 2.0388,
"step": 14
},
{
"epoch": 0.011838989739542225,
"grad_norm": 2.2516043186187744,
"learning_rate": 1.125e-05,
"loss": 2.0162,
"step": 15
},
{
"epoch": 0.012628255722178374,
"grad_norm": 3.590932607650757,
"learning_rate": 1.2e-05,
"loss": 2.0371,
"step": 16
},
{
"epoch": 0.013417521704814523,
"grad_norm": 2.2385761737823486,
"learning_rate": 1.275e-05,
"loss": 2.1105,
"step": 17
},
{
"epoch": 0.014206787687450671,
"grad_norm": 4.703427314758301,
"learning_rate": 1.3500000000000001e-05,
"loss": 2.1296,
"step": 18
},
{
"epoch": 0.01499605367008682,
"grad_norm": 2.485727310180664,
"learning_rate": 1.4249999999999999e-05,
"loss": 2.0493,
"step": 19
},
{
"epoch": 0.01578531965272297,
"grad_norm": 3.6647562980651855,
"learning_rate": 1.5e-05,
"loss": 2.006,
"step": 20
},
{
"epoch": 0.016574585635359115,
"grad_norm": 2.9405100345611572,
"learning_rate": 1.575e-05,
"loss": 2.045,
"step": 21
},
{
"epoch": 0.017363851617995266,
"grad_norm": 5.117101192474365,
"learning_rate": 1.65e-05,
"loss": 2.0551,
"step": 22
},
{
"epoch": 0.018153117600631413,
"grad_norm": 4.730511665344238,
"learning_rate": 1.725e-05,
"loss": 2.0584,
"step": 23
},
{
"epoch": 0.018942383583267563,
"grad_norm": 3.1207115650177,
"learning_rate": 1.8e-05,
"loss": 2.029,
"step": 24
},
{
"epoch": 0.01973164956590371,
"grad_norm": 3.1241533756256104,
"learning_rate": 1.8750000000000002e-05,
"loss": 1.989,
"step": 25
},
{
"epoch": 0.020520915548539857,
"grad_norm": 3.6228599548339844,
"learning_rate": 1.95e-05,
"loss": 1.9951,
"step": 26
},
{
"epoch": 0.021310181531176007,
"grad_norm": 2.3428969383239746,
"learning_rate": 2.025e-05,
"loss": 2.0484,
"step": 27
},
{
"epoch": 0.022099447513812154,
"grad_norm": 5.09832239151001,
"learning_rate": 2.1e-05,
"loss": 2.1136,
"step": 28
},
{
"epoch": 0.022888713496448304,
"grad_norm": 114.6532211303711,
"learning_rate": 2.175e-05,
"loss": 1.9234,
"step": 29
},
{
"epoch": 0.02367797947908445,
"grad_norm": 4.587088108062744,
"learning_rate": 2.25e-05,
"loss": 1.9931,
"step": 30
},
{
"epoch": 0.0244672454617206,
"grad_norm": 24.365489959716797,
"learning_rate": 2.3250000000000003e-05,
"loss": 2.1458,
"step": 31
},
{
"epoch": 0.025256511444356748,
"grad_norm": 4.502379417419434,
"learning_rate": 2.4e-05,
"loss": 2.0772,
"step": 32
},
{
"epoch": 0.026045777426992895,
"grad_norm": 3.6276373863220215,
"learning_rate": 2.475e-05,
"loss": 2.0606,
"step": 33
},
{
"epoch": 0.026835043409629045,
"grad_norm": 5.00884485244751,
"learning_rate": 2.55e-05,
"loss": 2.1513,
"step": 34
},
{
"epoch": 0.027624309392265192,
"grad_norm": 3.615124225616455,
"learning_rate": 2.625e-05,
"loss": 2.0823,
"step": 35
},
{
"epoch": 0.028413575374901343,
"grad_norm": 2.078237771987915,
"learning_rate": 2.7000000000000002e-05,
"loss": 2.0344,
"step": 36
},
{
"epoch": 0.02920284135753749,
"grad_norm": 4.229931354522705,
"learning_rate": 2.7750000000000004e-05,
"loss": 2.0605,
"step": 37
},
{
"epoch": 0.02999210734017364,
"grad_norm": 4.0947113037109375,
"learning_rate": 2.8499999999999998e-05,
"loss": 2.1708,
"step": 38
},
{
"epoch": 0.030781373322809787,
"grad_norm": 12.784259796142578,
"learning_rate": 2.925e-05,
"loss": 2.2762,
"step": 39
},
{
"epoch": 0.03157063930544594,
"grad_norm": 3.9042775630950928,
"learning_rate": 3e-05,
"loss": 2.0888,
"step": 40
},
{
"epoch": 0.03235990528808208,
"grad_norm": 4.025945663452148,
"learning_rate": 2.999998809942206e-05,
"loss": 2.031,
"step": 41
},
{
"epoch": 0.03314917127071823,
"grad_norm": 4.266918659210205,
"learning_rate": 2.9999952397707115e-05,
"loss": 2.0641,
"step": 42
},
{
"epoch": 0.03393843725335438,
"grad_norm": 5.787127494812012,
"learning_rate": 2.9999892894911822e-05,
"loss": 2.0462,
"step": 43
},
{
"epoch": 0.03472770323599053,
"grad_norm": 8.19904613494873,
"learning_rate": 2.999980959113059e-05,
"loss": 2.1786,
"step": 44
},
{
"epoch": 0.035516969218626675,
"grad_norm": 5.060120582580566,
"learning_rate": 2.999970248649561e-05,
"loss": 2.1286,
"step": 45
},
{
"epoch": 0.036306235201262825,
"grad_norm": 3.672504425048828,
"learning_rate": 2.9999571581176817e-05,
"loss": 2.0609,
"step": 46
},
{
"epoch": 0.037095501183898975,
"grad_norm": 10.494200706481934,
"learning_rate": 2.999941687538193e-05,
"loss": 2.125,
"step": 47
},
{
"epoch": 0.037884767166535126,
"grad_norm": 10.003849029541016,
"learning_rate": 2.9999238369356434e-05,
"loss": 2.1308,
"step": 48
},
{
"epoch": 0.03867403314917127,
"grad_norm": 33.990047454833984,
"learning_rate": 2.9999036063383564e-05,
"loss": 2.1923,
"step": 49
},
{
"epoch": 0.03946329913180742,
"grad_norm": 4.710549831390381,
"learning_rate": 2.999880995778433e-05,
"loss": 2.1082,
"step": 50
},
{
"epoch": 0.04025256511444357,
"grad_norm": 7.869533061981201,
"learning_rate": 2.9998560052917504e-05,
"loss": 2.1733,
"step": 51
},
{
"epoch": 0.04104183109707971,
"grad_norm": 3.3192622661590576,
"learning_rate": 2.999828634917962e-05,
"loss": 2.1189,
"step": 52
},
{
"epoch": 0.041831097079715863,
"grad_norm": 4.055788516998291,
"learning_rate": 2.999798884700498e-05,
"loss": 2.1299,
"step": 53
},
{
"epoch": 0.042620363062352014,
"grad_norm": 4.032358169555664,
"learning_rate": 2.999766754686564e-05,
"loss": 2.1634,
"step": 54
},
{
"epoch": 0.043409629044988164,
"grad_norm": 2.2070775032043457,
"learning_rate": 2.9997322449271417e-05,
"loss": 2.0969,
"step": 55
},
{
"epoch": 0.04419889502762431,
"grad_norm": 3.4423580169677734,
"learning_rate": 2.99969535547699e-05,
"loss": 2.0635,
"step": 56
},
{
"epoch": 0.04498816101026046,
"grad_norm": 2.4554710388183594,
"learning_rate": 2.9996560863946424e-05,
"loss": 2.0655,
"step": 57
},
{
"epoch": 0.04577742699289661,
"grad_norm": 4.392270088195801,
"learning_rate": 2.999614437742409e-05,
"loss": 2.1193,
"step": 58
},
{
"epoch": 0.04656669297553275,
"grad_norm": 2.630579710006714,
"learning_rate": 2.999570409586376e-05,
"loss": 2.0359,
"step": 59
},
{
"epoch": 0.0473559589581689,
"grad_norm": 7.560144424438477,
"learning_rate": 2.9995240019964048e-05,
"loss": 2.0961,
"step": 60
},
{
"epoch": 0.04814522494080505,
"grad_norm": 3.4002232551574707,
"learning_rate": 2.9994752150461317e-05,
"loss": 2.089,
"step": 61
},
{
"epoch": 0.0489344909234412,
"grad_norm": 2.383335590362549,
"learning_rate": 2.9994240488129696e-05,
"loss": 2.0589,
"step": 62
},
{
"epoch": 0.049723756906077346,
"grad_norm": 2.5884249210357666,
"learning_rate": 2.9993705033781057e-05,
"loss": 2.0677,
"step": 63
},
{
"epoch": 0.050513022888713496,
"grad_norm": 6.4155120849609375,
"learning_rate": 2.9993145788265035e-05,
"loss": 2.1962,
"step": 64
},
{
"epoch": 0.05130228887134965,
"grad_norm": 3.5735418796539307,
"learning_rate": 2.9992562752469007e-05,
"loss": 2.1703,
"step": 65
},
{
"epoch": 0.05209155485398579,
"grad_norm": 2.7267649173736572,
"learning_rate": 2.9991955927318098e-05,
"loss": 2.084,
"step": 66
},
{
"epoch": 0.05288082083662194,
"grad_norm": 5.19821834564209,
"learning_rate": 2.9991325313775187e-05,
"loss": 2.0788,
"step": 67
},
{
"epoch": 0.05367008681925809,
"grad_norm": 4.282698631286621,
"learning_rate": 2.9990670912840894e-05,
"loss": 2.0828,
"step": 68
},
{
"epoch": 0.05445935280189424,
"grad_norm": 3.5214319229125977,
"learning_rate": 2.998999272555359e-05,
"loss": 2.0621,
"step": 69
},
{
"epoch": 0.055248618784530384,
"grad_norm": 2.759268283843994,
"learning_rate": 2.9989290752989383e-05,
"loss": 2.0133,
"step": 70
},
{
"epoch": 0.056037884767166535,
"grad_norm": 3.1185007095336914,
"learning_rate": 2.9988564996262122e-05,
"loss": 2.055,
"step": 71
},
{
"epoch": 0.056827150749802685,
"grad_norm": 2.4150500297546387,
"learning_rate": 2.9987815456523395e-05,
"loss": 2.0358,
"step": 72
},
{
"epoch": 0.05761641673243883,
"grad_norm": 2.4296348094940186,
"learning_rate": 2.9987042134962534e-05,
"loss": 2.0821,
"step": 73
},
{
"epoch": 0.05840568271507498,
"grad_norm": 4.594061851501465,
"learning_rate": 2.99862450328066e-05,
"loss": 2.0983,
"step": 74
},
{
"epoch": 0.05919494869771113,
"grad_norm": 3.1533656120300293,
"learning_rate": 2.9985424151320388e-05,
"loss": 2.0465,
"step": 75
},
{
"epoch": 0.05998421468034728,
"grad_norm": 3.550783634185791,
"learning_rate": 2.9984579491806428e-05,
"loss": 2.2023,
"step": 76
},
{
"epoch": 0.06077348066298342,
"grad_norm": 3.503725051879883,
"learning_rate": 2.998371105560498e-05,
"loss": 2.1149,
"step": 77
},
{
"epoch": 0.06156274664561957,
"grad_norm": 2.6297385692596436,
"learning_rate": 2.998281884409403e-05,
"loss": 2.1017,
"step": 78
},
{
"epoch": 0.062352012628255724,
"grad_norm": 2.355755090713501,
"learning_rate": 2.9981902858689287e-05,
"loss": 2.0615,
"step": 79
},
{
"epoch": 0.06314127861089187,
"grad_norm": 4.401884078979492,
"learning_rate": 2.9980963100844184e-05,
"loss": 2.0501,
"step": 80
},
{
"epoch": 0.06393054459352802,
"grad_norm": 2.54091215133667,
"learning_rate": 2.9979999572049876e-05,
"loss": 2.0138,
"step": 81
},
{
"epoch": 0.06471981057616416,
"grad_norm": 6.047756671905518,
"learning_rate": 2.9979012273835237e-05,
"loss": 2.082,
"step": 82
},
{
"epoch": 0.06550907655880031,
"grad_norm": 6.175260543823242,
"learning_rate": 2.9978001207766858e-05,
"loss": 2.2342,
"step": 83
},
{
"epoch": 0.06629834254143646,
"grad_norm": 2.3510050773620605,
"learning_rate": 2.997696637544904e-05,
"loss": 2.0092,
"step": 84
},
{
"epoch": 0.06708760852407261,
"grad_norm": 5.738100051879883,
"learning_rate": 2.9975907778523802e-05,
"loss": 2.0805,
"step": 85
},
{
"epoch": 0.06787687450670876,
"grad_norm": 3.1302692890167236,
"learning_rate": 2.9974825418670854e-05,
"loss": 2.08,
"step": 86
},
{
"epoch": 0.06866614048934491,
"grad_norm": 3.3828234672546387,
"learning_rate": 2.9973719297607634e-05,
"loss": 2.0314,
"step": 87
},
{
"epoch": 0.06945540647198106,
"grad_norm": 2.4826507568359375,
"learning_rate": 2.9972589417089267e-05,
"loss": 2.0798,
"step": 88
},
{
"epoch": 0.0702446724546172,
"grad_norm": 1.8656960725784302,
"learning_rate": 2.997143577890859e-05,
"loss": 2.005,
"step": 89
},
{
"epoch": 0.07103393843725335,
"grad_norm": 2.417743444442749,
"learning_rate": 2.9970258384896127e-05,
"loss": 2.0352,
"step": 90
},
{
"epoch": 0.0718232044198895,
"grad_norm": 5.928717613220215,
"learning_rate": 2.9969057236920102e-05,
"loss": 2.1684,
"step": 91
},
{
"epoch": 0.07261247040252565,
"grad_norm": 2.976854085922241,
"learning_rate": 2.9967832336886425e-05,
"loss": 2.0208,
"step": 92
},
{
"epoch": 0.0734017363851618,
"grad_norm": 2.003722906112671,
"learning_rate": 2.99665836867387e-05,
"loss": 2.008,
"step": 93
},
{
"epoch": 0.07419100236779795,
"grad_norm": 3.6728177070617676,
"learning_rate": 2.996531128845822e-05,
"loss": 2.0774,
"step": 94
},
{
"epoch": 0.0749802683504341,
"grad_norm": 2.2868685722351074,
"learning_rate": 2.996401514406395e-05,
"loss": 2.0518,
"step": 95
},
{
"epoch": 0.07576953433307025,
"grad_norm": 16.315637588500977,
"learning_rate": 2.996269525561254e-05,
"loss": 2.265,
"step": 96
},
{
"epoch": 0.07655880031570639,
"grad_norm": 6.387162685394287,
"learning_rate": 2.9961351625198315e-05,
"loss": 2.0423,
"step": 97
},
{
"epoch": 0.07734806629834254,
"grad_norm": 3.667564868927002,
"learning_rate": 2.9959984254953276e-05,
"loss": 2.0816,
"step": 98
},
{
"epoch": 0.07813733228097869,
"grad_norm": 5.106146812438965,
"learning_rate": 2.9958593147047084e-05,
"loss": 2.1309,
"step": 99
},
{
"epoch": 0.07892659826361484,
"grad_norm": 4.6215925216674805,
"learning_rate": 2.9957178303687066e-05,
"loss": 2.0565,
"step": 100
},
{
"epoch": 0.07971586424625099,
"grad_norm": 3.13112735748291,
"learning_rate": 2.9955739727118227e-05,
"loss": 2.1061,
"step": 101
},
{
"epoch": 0.08050513022888714,
"grad_norm": 2.501467227935791,
"learning_rate": 2.995427741962321e-05,
"loss": 1.9896,
"step": 102
},
{
"epoch": 0.08129439621152329,
"grad_norm": 2.8199145793914795,
"learning_rate": 2.9952791383522333e-05,
"loss": 2.0923,
"step": 103
},
{
"epoch": 0.08208366219415943,
"grad_norm": 4.113518714904785,
"learning_rate": 2.9951281621173547e-05,
"loss": 2.1615,
"step": 104
},
{
"epoch": 0.08287292817679558,
"grad_norm": 2.821820020675659,
"learning_rate": 2.9949748134972454e-05,
"loss": 2.0332,
"step": 105
},
{
"epoch": 0.08366219415943173,
"grad_norm": 1.9956532716751099,
"learning_rate": 2.9948190927352313e-05,
"loss": 2.0673,
"step": 106
},
{
"epoch": 0.08445146014206788,
"grad_norm": 2.073834180831909,
"learning_rate": 2.9946610000784006e-05,
"loss": 2.0746,
"step": 107
},
{
"epoch": 0.08524072612470403,
"grad_norm": 2.5564310550689697,
"learning_rate": 2.9945005357776064e-05,
"loss": 1.9765,
"step": 108
},
{
"epoch": 0.08602999210734018,
"grad_norm": 1.676325798034668,
"learning_rate": 2.9943377000874635e-05,
"loss": 1.9988,
"step": 109
},
{
"epoch": 0.08681925808997633,
"grad_norm": 2.4475619792938232,
"learning_rate": 2.9941724932663517e-05,
"loss": 1.9959,
"step": 110
},
{
"epoch": 0.08760852407261246,
"grad_norm": 5.736175060272217,
"learning_rate": 2.9940049155764106e-05,
"loss": 2.1744,
"step": 111
},
{
"epoch": 0.08839779005524862,
"grad_norm": 3.406623363494873,
"learning_rate": 2.9938349672835443e-05,
"loss": 2.1555,
"step": 112
},
{
"epoch": 0.08918705603788477,
"grad_norm": 2.378791332244873,
"learning_rate": 2.9936626486574165e-05,
"loss": 2.0615,
"step": 113
},
{
"epoch": 0.08997632202052092,
"grad_norm": 2.432135581970215,
"learning_rate": 2.9934879599714525e-05,
"loss": 2.0743,
"step": 114
},
{
"epoch": 0.09076558800315707,
"grad_norm": 2.3118343353271484,
"learning_rate": 2.9933109015028388e-05,
"loss": 2.0682,
"step": 115
},
{
"epoch": 0.09155485398579322,
"grad_norm": 1.8455097675323486,
"learning_rate": 2.993131473532522e-05,
"loss": 2.022,
"step": 116
},
{
"epoch": 0.09234411996842937,
"grad_norm": 1.897011637687683,
"learning_rate": 2.9929496763452077e-05,
"loss": 2.0156,
"step": 117
},
{
"epoch": 0.0931333859510655,
"grad_norm": 1.9447441101074219,
"learning_rate": 2.992765510229362e-05,
"loss": 2.021,
"step": 118
},
{
"epoch": 0.09392265193370165,
"grad_norm": 1.878902554512024,
"learning_rate": 2.9925789754772097e-05,
"loss": 2.0258,
"step": 119
},
{
"epoch": 0.0947119179163378,
"grad_norm": 1.7902443408966064,
"learning_rate": 2.9923900723847323e-05,
"loss": 2.0541,
"step": 120
},
{
"epoch": 0.09550118389897395,
"grad_norm": 2.5857603549957275,
"learning_rate": 2.9921988012516717e-05,
"loss": 2.0374,
"step": 121
},
{
"epoch": 0.0962904498816101,
"grad_norm": 1.7749550342559814,
"learning_rate": 2.992005162381526e-05,
"loss": 2.0192,
"step": 122
},
{
"epoch": 0.09707971586424625,
"grad_norm": 1.388109564781189,
"learning_rate": 2.99180915608155e-05,
"loss": 2.025,
"step": 123
},
{
"epoch": 0.0978689818468824,
"grad_norm": 2.3288400173187256,
"learning_rate": 2.9916107826627557e-05,
"loss": 1.9993,
"step": 124
},
{
"epoch": 0.09865824782951854,
"grad_norm": 1.9639314413070679,
"learning_rate": 2.9914100424399115e-05,
"loss": 2.0216,
"step": 125
},
{
"epoch": 0.09944751381215469,
"grad_norm": 5.6544108390808105,
"learning_rate": 2.9912069357315394e-05,
"loss": 2.158,
"step": 126
},
{
"epoch": 0.10023677979479084,
"grad_norm": 1.9935773611068726,
"learning_rate": 2.9910014628599188e-05,
"loss": 2.0447,
"step": 127
},
{
"epoch": 0.10102604577742699,
"grad_norm": 1.5992683172225952,
"learning_rate": 2.9907936241510822e-05,
"loss": 2.084,
"step": 128
},
{
"epoch": 0.10181531176006314,
"grad_norm": 2.9652304649353027,
"learning_rate": 2.9905834199348165e-05,
"loss": 2.0843,
"step": 129
},
{
"epoch": 0.1026045777426993,
"grad_norm": 1.8616968393325806,
"learning_rate": 2.9903708505446618e-05,
"loss": 2.0871,
"step": 130
},
{
"epoch": 0.10339384372533544,
"grad_norm": 1.7061188220977783,
"learning_rate": 2.9901559163179105e-05,
"loss": 2.0342,
"step": 131
},
{
"epoch": 0.10418310970797158,
"grad_norm": 2.443295955657959,
"learning_rate": 2.989938617595609e-05,
"loss": 2.0476,
"step": 132
},
{
"epoch": 0.10497237569060773,
"grad_norm": 2.450977087020874,
"learning_rate": 2.989718954722555e-05,
"loss": 2.0669,
"step": 133
},
{
"epoch": 0.10576164167324388,
"grad_norm": 1.9823421239852905,
"learning_rate": 2.9894969280472964e-05,
"loss": 2.0512,
"step": 134
},
{
"epoch": 0.10655090765588003,
"grad_norm": 2.8827438354492188,
"learning_rate": 2.9892725379221327e-05,
"loss": 2.0744,
"step": 135
},
{
"epoch": 0.10734017363851618,
"grad_norm": 3.418274164199829,
"learning_rate": 2.989045784703114e-05,
"loss": 2.0862,
"step": 136
},
{
"epoch": 0.10812943962115233,
"grad_norm": 3.35532808303833,
"learning_rate": 2.9888166687500397e-05,
"loss": 2.1452,
"step": 137
},
{
"epoch": 0.10891870560378848,
"grad_norm": 12.882590293884277,
"learning_rate": 2.988585190426457e-05,
"loss": 2.045,
"step": 138
},
{
"epoch": 0.10970797158642462,
"grad_norm": 7.744283676147461,
"learning_rate": 2.9883513500996636e-05,
"loss": 2.0628,
"step": 139
},
{
"epoch": 0.11049723756906077,
"grad_norm": 3.180764675140381,
"learning_rate": 2.988115148140704e-05,
"loss": 2.1304,
"step": 140
},
{
"epoch": 0.11128650355169692,
"grad_norm": 2.4699342250823975,
"learning_rate": 2.9878765849243697e-05,
"loss": 2.0432,
"step": 141
},
{
"epoch": 0.11207576953433307,
"grad_norm": 2.3269429206848145,
"learning_rate": 2.9876356608292002e-05,
"loss": 2.1071,
"step": 142
},
{
"epoch": 0.11286503551696922,
"grad_norm": 3.243727684020996,
"learning_rate": 2.9873923762374794e-05,
"loss": 2.0571,
"step": 143
},
{
"epoch": 0.11365430149960537,
"grad_norm": 1.7754957675933838,
"learning_rate": 2.9871467315352386e-05,
"loss": 2.0491,
"step": 144
},
{
"epoch": 0.11444356748224152,
"grad_norm": 3.8351848125457764,
"learning_rate": 2.9868987271122523e-05,
"loss": 1.9965,
"step": 145
},
{
"epoch": 0.11523283346487766,
"grad_norm": 1.8786120414733887,
"learning_rate": 2.9866483633620404e-05,
"loss": 2.0471,
"step": 146
},
{
"epoch": 0.11602209944751381,
"grad_norm": 2.9970719814300537,
"learning_rate": 2.9863956406818656e-05,
"loss": 2.0535,
"step": 147
},
{
"epoch": 0.11681136543014996,
"grad_norm": 2.0963001251220703,
"learning_rate": 2.986140559472734e-05,
"loss": 2.0334,
"step": 148
},
{
"epoch": 0.11760063141278611,
"grad_norm": 4.035261154174805,
"learning_rate": 2.9858831201393943e-05,
"loss": 2.0474,
"step": 149
},
{
"epoch": 0.11838989739542226,
"grad_norm": 2.755239963531494,
"learning_rate": 2.9856233230903368e-05,
"loss": 2.0366,
"step": 150
},
{
"epoch": 0.11917916337805841,
"grad_norm": 3.6331722736358643,
"learning_rate": 2.985361168737793e-05,
"loss": 2.0431,
"step": 151
},
{
"epoch": 0.11996842936069456,
"grad_norm": 2.390591859817505,
"learning_rate": 2.985096657497734e-05,
"loss": 2.0486,
"step": 152
},
{
"epoch": 0.12075769534333071,
"grad_norm": 5.158343315124512,
"learning_rate": 2.9848297897898724e-05,
"loss": 2.09,
"step": 153
},
{
"epoch": 0.12154696132596685,
"grad_norm": 5.338906764984131,
"learning_rate": 2.9845605660376577e-05,
"loss": 2.1369,
"step": 154
},
{
"epoch": 0.122336227308603,
"grad_norm": 2.3366825580596924,
"learning_rate": 2.98428898666828e-05,
"loss": 2.0233,
"step": 155
},
{
"epoch": 0.12312549329123915,
"grad_norm": 3.8773341178894043,
"learning_rate": 2.9840150521126656e-05,
"loss": 2.0668,
"step": 156
},
{
"epoch": 0.1239147592738753,
"grad_norm": 2.9581716060638428,
"learning_rate": 2.9837387628054782e-05,
"loss": 2.0564,
"step": 157
},
{
"epoch": 0.12470402525651145,
"grad_norm": 3.0984768867492676,
"learning_rate": 2.9834601191851187e-05,
"loss": 2.0288,
"step": 158
},
{
"epoch": 0.1254932912391476,
"grad_norm": 2.5730841159820557,
"learning_rate": 2.9831791216937227e-05,
"loss": 2.0555,
"step": 159
},
{
"epoch": 0.12628255722178375,
"grad_norm": 1.9970016479492188,
"learning_rate": 2.982895770777162e-05,
"loss": 2.0139,
"step": 160
},
{
"epoch": 0.1270718232044199,
"grad_norm": 3.020761728286743,
"learning_rate": 2.9826100668850408e-05,
"loss": 2.0301,
"step": 161
},
{
"epoch": 0.12786108918705605,
"grad_norm": 2.456298828125,
"learning_rate": 2.9823220104706988e-05,
"loss": 2.0326,
"step": 162
},
{
"epoch": 0.1286503551696922,
"grad_norm": 3.4278247356414795,
"learning_rate": 2.982031601991207e-05,
"loss": 2.0017,
"step": 163
},
{
"epoch": 0.12943962115232832,
"grad_norm": 3.299609899520874,
"learning_rate": 2.9817388419073695e-05,
"loss": 2.0219,
"step": 164
},
{
"epoch": 0.13022888713496447,
"grad_norm": 3.0248196125030518,
"learning_rate": 2.981443730683722e-05,
"loss": 2.1489,
"step": 165
},
{
"epoch": 0.13101815311760062,
"grad_norm": 3.821983575820923,
"learning_rate": 2.98114626878853e-05,
"loss": 2.0333,
"step": 166
},
{
"epoch": 0.13180741910023677,
"grad_norm": 2.1882872581481934,
"learning_rate": 2.9808464566937887e-05,
"loss": 2.0813,
"step": 167
},
{
"epoch": 0.13259668508287292,
"grad_norm": 4.968083381652832,
"learning_rate": 2.9805442948752242e-05,
"loss": 2.0832,
"step": 168
},
{
"epoch": 0.13338595106550907,
"grad_norm": 4.123311519622803,
"learning_rate": 2.9802397838122895e-05,
"loss": 2.0222,
"step": 169
},
{
"epoch": 0.13417521704814522,
"grad_norm": 2.42797589302063,
"learning_rate": 2.979932923988165e-05,
"loss": 2.0298,
"step": 170
},
{
"epoch": 0.13496448303078137,
"grad_norm": 2.7780134677886963,
"learning_rate": 2.979623715889759e-05,
"loss": 1.9963,
"step": 171
},
{
"epoch": 0.13575374901341752,
"grad_norm": 1.9025119543075562,
"learning_rate": 2.9793121600077058e-05,
"loss": 1.9975,
"step": 172
},
{
"epoch": 0.13654301499605367,
"grad_norm": 1.8044270277023315,
"learning_rate": 2.9789982568363643e-05,
"loss": 2.0171,
"step": 173
},
{
"epoch": 0.13733228097868982,
"grad_norm": 2.5506300926208496,
"learning_rate": 2.9786820068738186e-05,
"loss": 2.0043,
"step": 174
},
{
"epoch": 0.13812154696132597,
"grad_norm": 2.364445447921753,
"learning_rate": 2.978363410621877e-05,
"loss": 2.0384,
"step": 175
},
{
"epoch": 0.13891081294396213,
"grad_norm": 3.070680618286133,
"learning_rate": 2.9780424685860686e-05,
"loss": 2.0334,
"step": 176
},
{
"epoch": 0.13970007892659828,
"grad_norm": 2.494616746902466,
"learning_rate": 2.9777191812756474e-05,
"loss": 2.0816,
"step": 177
},
{
"epoch": 0.1404893449092344,
"grad_norm": 2.9060347080230713,
"learning_rate": 2.9773935492035868e-05,
"loss": 2.1325,
"step": 178
},
{
"epoch": 0.14127861089187055,
"grad_norm": 3.662177801132202,
"learning_rate": 2.977065572886582e-05,
"loss": 2.0774,
"step": 179
},
{
"epoch": 0.1420678768745067,
"grad_norm": 1.8172063827514648,
"learning_rate": 2.976735252845047e-05,
"loss": 1.9909,
"step": 180
},
{
"epoch": 0.14285714285714285,
"grad_norm": 2.4916176795959473,
"learning_rate": 2.9764025896031154e-05,
"loss": 2.0537,
"step": 181
},
{
"epoch": 0.143646408839779,
"grad_norm": 1.9986275434494019,
"learning_rate": 2.9760675836886383e-05,
"loss": 2.055,
"step": 182
},
{
"epoch": 0.14443567482241515,
"grad_norm": 1.7363250255584717,
"learning_rate": 2.975730235633184e-05,
"loss": 1.9759,
"step": 183
},
{
"epoch": 0.1452249408050513,
"grad_norm": 2.0071136951446533,
"learning_rate": 2.9753905459720373e-05,
"loss": 2.074,
"step": 184
},
{
"epoch": 0.14601420678768745,
"grad_norm": 1.985868215560913,
"learning_rate": 2.975048515244199e-05,
"loss": 1.9924,
"step": 185
},
{
"epoch": 0.1468034727703236,
"grad_norm": 1.816689133644104,
"learning_rate": 2.9747041439923848e-05,
"loss": 1.9802,
"step": 186
},
{
"epoch": 0.14759273875295975,
"grad_norm": 1.7806591987609863,
"learning_rate": 2.9743574327630223e-05,
"loss": 1.9926,
"step": 187
},
{
"epoch": 0.1483820047355959,
"grad_norm": 2.3675436973571777,
"learning_rate": 2.9740083821062548e-05,
"loss": 2.0365,
"step": 188
},
{
"epoch": 0.14917127071823205,
"grad_norm": 2.0345346927642822,
"learning_rate": 2.9736569925759348e-05,
"loss": 2.0975,
"step": 189
},
{
"epoch": 0.1499605367008682,
"grad_norm": 1.685162901878357,
"learning_rate": 2.9733032647296285e-05,
"loss": 2.036,
"step": 190
},
{
"epoch": 0.15074980268350435,
"grad_norm": 2.479959487915039,
"learning_rate": 2.9729471991286112e-05,
"loss": 2.0894,
"step": 191
},
{
"epoch": 0.1515390686661405,
"grad_norm": 1.6787084341049194,
"learning_rate": 2.972588796337867e-05,
"loss": 2.0565,
"step": 192
},
{
"epoch": 0.15232833464877663,
"grad_norm": 1.9295579195022583,
"learning_rate": 2.9722280569260903e-05,
"loss": 2.0168,
"step": 193
},
{
"epoch": 0.15311760063141278,
"grad_norm": 2.2043139934539795,
"learning_rate": 2.971864981465681e-05,
"loss": 2.0041,
"step": 194
},
{
"epoch": 0.15390686661404893,
"grad_norm": 1.823622703552246,
"learning_rate": 2.971499570532748e-05,
"loss": 2.032,
"step": 195
},
{
"epoch": 0.15469613259668508,
"grad_norm": 2.536930561065674,
"learning_rate": 2.971131824707104e-05,
"loss": 2.0314,
"step": 196
},
{
"epoch": 0.15548539857932123,
"grad_norm": 1.9996862411499023,
"learning_rate": 2.9707617445722675e-05,
"loss": 2.0447,
"step": 197
},
{
"epoch": 0.15627466456195738,
"grad_norm": 3.0000147819519043,
"learning_rate": 2.970389330715461e-05,
"loss": 2.005,
"step": 198
},
{
"epoch": 0.15706393054459353,
"grad_norm": 2.3367671966552734,
"learning_rate": 2.9700145837276104e-05,
"loss": 2.0679,
"step": 199
},
{
"epoch": 0.15785319652722968,
"grad_norm": 2.8640778064727783,
"learning_rate": 2.9696375042033418e-05,
"loss": 2.0307,
"step": 200
},
{
"epoch": 0.15864246250986583,
"grad_norm": 3.0491254329681396,
"learning_rate": 2.9692580927409845e-05,
"loss": 2.0275,
"step": 201
},
{
"epoch": 0.15943172849250198,
"grad_norm": 2.3943986892700195,
"learning_rate": 2.9688763499425674e-05,
"loss": 2.0167,
"step": 202
},
{
"epoch": 0.16022099447513813,
"grad_norm": 3.376763343811035,
"learning_rate": 2.9684922764138184e-05,
"loss": 2.0733,
"step": 203
},
{
"epoch": 0.16101026045777428,
"grad_norm": 1.7241394519805908,
"learning_rate": 2.9681058727641635e-05,
"loss": 2.0159,
"step": 204
},
{
"epoch": 0.16179952644041043,
"grad_norm": 2.2213003635406494,
"learning_rate": 2.9677171396067268e-05,
"loss": 2.0833,
"step": 205
},
{
"epoch": 0.16258879242304658,
"grad_norm": 2.832461357116699,
"learning_rate": 2.967326077558328e-05,
"loss": 2.0226,
"step": 206
},
{
"epoch": 0.1633780584056827,
"grad_norm": 1.7477656602859497,
"learning_rate": 2.9669326872394816e-05,
"loss": 2.0271,
"step": 207
},
{
"epoch": 0.16416732438831885,
"grad_norm": 2.8195369243621826,
"learning_rate": 2.9665369692743982e-05,
"loss": 2.0262,
"step": 208
},
{
"epoch": 0.164956590370955,
"grad_norm": 2.8796699047088623,
"learning_rate": 2.96613892429098e-05,
"loss": 2.0593,
"step": 209
},
{
"epoch": 0.16574585635359115,
"grad_norm": 2.3450050354003906,
"learning_rate": 2.9657385529208232e-05,
"loss": 2.0637,
"step": 210
},
{
"epoch": 0.1665351223362273,
"grad_norm": 1.9920148849487305,
"learning_rate": 2.9653358557992144e-05,
"loss": 2.0372,
"step": 211
},
{
"epoch": 0.16732438831886345,
"grad_norm": 2.1870229244232178,
"learning_rate": 2.96493083356513e-05,
"loss": 2.0295,
"step": 212
},
{
"epoch": 0.1681136543014996,
"grad_norm": 1.792230248451233,
"learning_rate": 2.964523486861237e-05,
"loss": 2.0277,
"step": 213
},
{
"epoch": 0.16890292028413575,
"grad_norm": 2.0910332202911377,
"learning_rate": 2.964113816333891e-05,
"loss": 1.9782,
"step": 214
},
{
"epoch": 0.1696921862667719,
"grad_norm": 1.8934677839279175,
"learning_rate": 2.9637018226331324e-05,
"loss": 2.02,
"step": 215
},
{
"epoch": 0.17048145224940806,
"grad_norm": 2.7041609287261963,
"learning_rate": 2.9632875064126913e-05,
"loss": 1.9993,
"step": 216
},
{
"epoch": 0.1712707182320442,
"grad_norm": 2.318772077560425,
"learning_rate": 2.9628708683299803e-05,
"loss": 2.0641,
"step": 217
},
{
"epoch": 0.17205998421468036,
"grad_norm": 2.512057304382324,
"learning_rate": 2.9624519090460977e-05,
"loss": 2.092,
"step": 218
},
{
"epoch": 0.1728492501973165,
"grad_norm": 4.763547897338867,
"learning_rate": 2.9620306292258244e-05,
"loss": 1.9952,
"step": 219
},
{
"epoch": 0.17363851617995266,
"grad_norm": 1.4289495944976807,
"learning_rate": 2.9616070295376236e-05,
"loss": 1.9942,
"step": 220
},
{
"epoch": 0.17442778216258878,
"grad_norm": 1.9937539100646973,
"learning_rate": 2.9611811106536392e-05,
"loss": 1.979,
"step": 221
},
{
"epoch": 0.17521704814522493,
"grad_norm": 3.0298221111297607,
"learning_rate": 2.9607528732496956e-05,
"loss": 2.0111,
"step": 222
},
{
"epoch": 0.17600631412786108,
"grad_norm": 1.8235526084899902,
"learning_rate": 2.9603223180052958e-05,
"loss": 2.023,
"step": 223
},
{
"epoch": 0.17679558011049723,
"grad_norm": 1.3632335662841797,
"learning_rate": 2.9598894456036202e-05,
"loss": 2.0136,
"step": 224
},
{
"epoch": 0.17758484609313338,
"grad_norm": 1.8402268886566162,
"learning_rate": 2.959454256731527e-05,
"loss": 1.9844,
"step": 225
},
{
"epoch": 0.17837411207576953,
"grad_norm": 1.385807991027832,
"learning_rate": 2.9590167520795487e-05,
"loss": 2.0579,
"step": 226
},
{
"epoch": 0.17916337805840568,
"grad_norm": 1.4608477354049683,
"learning_rate": 2.9585769323418944e-05,
"loss": 1.9635,
"step": 227
},
{
"epoch": 0.17995264404104183,
"grad_norm": 4.646909236907959,
"learning_rate": 2.9581347982164436e-05,
"loss": 2.0006,
"step": 228
},
{
"epoch": 0.18074191002367798,
"grad_norm": 4.385404586791992,
"learning_rate": 2.9576903504047507e-05,
"loss": 2.0464,
"step": 229
},
{
"epoch": 0.18153117600631413,
"grad_norm": 2.0274832248687744,
"learning_rate": 2.9572435896120408e-05,
"loss": 2.0775,
"step": 230
},
{
"epoch": 0.18232044198895028,
"grad_norm": 2.401803493499756,
"learning_rate": 2.9567945165472082e-05,
"loss": 2.0186,
"step": 231
},
{
"epoch": 0.18310970797158643,
"grad_norm": 2.278960943222046,
"learning_rate": 2.9563431319228168e-05,
"loss": 2.0229,
"step": 232
},
{
"epoch": 0.18389897395422258,
"grad_norm": 2.016186237335205,
"learning_rate": 2.955889436455099e-05,
"loss": 2.0598,
"step": 233
},
{
"epoch": 0.18468823993685873,
"grad_norm": 5.457252025604248,
"learning_rate": 2.955433430863952e-05,
"loss": 2.0805,
"step": 234
},
{
"epoch": 0.18547750591949486,
"grad_norm": 3.5198957920074463,
"learning_rate": 2.9549751158729413e-05,
"loss": 2.0738,
"step": 235
},
{
"epoch": 0.186266771902131,
"grad_norm": 1.9037076234817505,
"learning_rate": 2.954514492209294e-05,
"loss": 2.0021,
"step": 236
},
{
"epoch": 0.18705603788476716,
"grad_norm": 3.1855623722076416,
"learning_rate": 2.9540515606039027e-05,
"loss": 2.0158,
"step": 237
},
{
"epoch": 0.1878453038674033,
"grad_norm": 3.4484121799468994,
"learning_rate": 2.9535863217913207e-05,
"loss": 2.0114,
"step": 238
},
{
"epoch": 0.18863456985003946,
"grad_norm": 2.1158602237701416,
"learning_rate": 2.9531187765097628e-05,
"loss": 2.0541,
"step": 239
},
{
"epoch": 0.1894238358326756,
"grad_norm": 2.0095179080963135,
"learning_rate": 2.9526489255011045e-05,
"loss": 2.0216,
"step": 240
},
{
"epoch": 0.19021310181531176,
"grad_norm": 2.5604724884033203,
"learning_rate": 2.9521767695108774e-05,
"loss": 2.1093,
"step": 241
},
{
"epoch": 0.1910023677979479,
"grad_norm": 2.263054609298706,
"learning_rate": 2.951702309288273e-05,
"loss": 2.1129,
"step": 242
},
{
"epoch": 0.19179163378058406,
"grad_norm": 2.0415761470794678,
"learning_rate": 2.9512255455861378e-05,
"loss": 2.0314,
"step": 243
},
{
"epoch": 0.1925808997632202,
"grad_norm": 2.0080976486206055,
"learning_rate": 2.950746479160974e-05,
"loss": 2.0588,
"step": 244
},
{
"epoch": 0.19337016574585636,
"grad_norm": 2.1279237270355225,
"learning_rate": 2.9502651107729368e-05,
"loss": 2.0065,
"step": 245
},
{
"epoch": 0.1941594317284925,
"grad_norm": 1.4809678792953491,
"learning_rate": 2.9497814411858345e-05,
"loss": 2.0148,
"step": 246
},
{
"epoch": 0.19494869771112866,
"grad_norm": 1.6292145252227783,
"learning_rate": 2.949295471167127e-05,
"loss": 2.0296,
"step": 247
},
{
"epoch": 0.1957379636937648,
"grad_norm": 1.6641464233398438,
"learning_rate": 2.948807201487924e-05,
"loss": 2.0343,
"step": 248
},
{
"epoch": 0.19652722967640096,
"grad_norm": 1.5582904815673828,
"learning_rate": 2.9483166329229848e-05,
"loss": 2.0254,
"step": 249
},
{
"epoch": 0.19731649565903708,
"grad_norm": 1.716148853302002,
"learning_rate": 2.947823766250715e-05,
"loss": 2.0208,
"step": 250
},
{
"epoch": 0.19810576164167323,
"grad_norm": 1.9706841707229614,
"learning_rate": 2.947328602253169e-05,
"loss": 2.0787,
"step": 251
},
{
"epoch": 0.19889502762430938,
"grad_norm": 1.5452086925506592,
"learning_rate": 2.9468311417160437e-05,
"loss": 2.0004,
"step": 252
},
{
"epoch": 0.19968429360694553,
"grad_norm": 2.418027400970459,
"learning_rate": 2.9463313854286827e-05,
"loss": 2.1183,
"step": 253
},
{
"epoch": 0.20047355958958168,
"grad_norm": 1.6865825653076172,
"learning_rate": 2.9458293341840708e-05,
"loss": 2.0038,
"step": 254
},
{
"epoch": 0.20126282557221783,
"grad_norm": 2.1472463607788086,
"learning_rate": 2.9453249887788343e-05,
"loss": 2.0005,
"step": 255
},
{
"epoch": 0.20205209155485399,
"grad_norm": 1.4692081212997437,
"learning_rate": 2.9448183500132407e-05,
"loss": 2.0358,
"step": 256
},
{
"epoch": 0.20284135753749014,
"grad_norm": 1.3552985191345215,
"learning_rate": 2.9443094186911955e-05,
"loss": 2.0313,
"step": 257
},
{
"epoch": 0.20363062352012629,
"grad_norm": 1.4547805786132812,
"learning_rate": 2.9437981956202422e-05,
"loss": 2.0203,
"step": 258
},
{
"epoch": 0.20441988950276244,
"grad_norm": 1.7660293579101562,
"learning_rate": 2.9432846816115614e-05,
"loss": 2.0385,
"step": 259
},
{
"epoch": 0.2052091554853986,
"grad_norm": 1.439816951751709,
"learning_rate": 2.942768877479967e-05,
"loss": 1.9926,
"step": 260
},
{
"epoch": 0.20599842146803474,
"grad_norm": 1.6005187034606934,
"learning_rate": 2.9422507840439095e-05,
"loss": 1.973,
"step": 261
},
{
"epoch": 0.2067876874506709,
"grad_norm": 1.9543637037277222,
"learning_rate": 2.9417304021254697e-05,
"loss": 2.0519,
"step": 262
},
{
"epoch": 0.20757695343330704,
"grad_norm": 1.4007548093795776,
"learning_rate": 2.94120773255036e-05,
"loss": 2.0292,
"step": 263
},
{
"epoch": 0.20836621941594316,
"grad_norm": 1.6729865074157715,
"learning_rate": 2.9406827761479233e-05,
"loss": 1.9558,
"step": 264
},
{
"epoch": 0.2091554853985793,
"grad_norm": 1.6150462627410889,
"learning_rate": 2.940155533751131e-05,
"loss": 1.9898,
"step": 265
},
{
"epoch": 0.20994475138121546,
"grad_norm": 1.9057928323745728,
"learning_rate": 2.9396260061965816e-05,
"loss": 2.0723,
"step": 266
},
{
"epoch": 0.2107340173638516,
"grad_norm": 1.8108017444610596,
"learning_rate": 2.9390941943244996e-05,
"loss": 2.0714,
"step": 267
},
{
"epoch": 0.21152328334648776,
"grad_norm": 1.887980341911316,
"learning_rate": 2.9385600989787346e-05,
"loss": 2.015,
"step": 268
},
{
"epoch": 0.2123125493291239,
"grad_norm": 1.840850830078125,
"learning_rate": 2.938023721006758e-05,
"loss": 2.033,
"step": 269
},
{
"epoch": 0.21310181531176006,
"grad_norm": 2.9777164459228516,
"learning_rate": 2.9374850612596652e-05,
"loss": 2.0825,
"step": 270
},
{
"epoch": 0.2138910812943962,
"grad_norm": 3.170605421066284,
"learning_rate": 2.9369441205921708e-05,
"loss": 2.0824,
"step": 271
},
{
"epoch": 0.21468034727703236,
"grad_norm": 2.142885446548462,
"learning_rate": 2.936400899862609e-05,
"loss": 2.0829,
"step": 272
},
{
"epoch": 0.2154696132596685,
"grad_norm": 1.4462889432907104,
"learning_rate": 2.9358553999329317e-05,
"loss": 2.021,
"step": 273
},
{
"epoch": 0.21625887924230466,
"grad_norm": 1.5857114791870117,
"learning_rate": 2.935307621668708e-05,
"loss": 2.057,
"step": 274
},
{
"epoch": 0.2170481452249408,
"grad_norm": 1.7854455709457397,
"learning_rate": 2.934757565939121e-05,
"loss": 2.0408,
"step": 275
},
{
"epoch": 0.21783741120757696,
"grad_norm": 1.98285710811615,
"learning_rate": 2.9342052336169688e-05,
"loss": 2.0795,
"step": 276
},
{
"epoch": 0.21862667719021311,
"grad_norm": 1.760247826576233,
"learning_rate": 2.9336506255786605e-05,
"loss": 2.0655,
"step": 277
},
{
"epoch": 0.21941594317284924,
"grad_norm": 1.7804441452026367,
"learning_rate": 2.933093742704218e-05,
"loss": 2.0511,
"step": 278
},
{
"epoch": 0.2202052091554854,
"grad_norm": 2.0356266498565674,
"learning_rate": 2.9325345858772704e-05,
"loss": 2.0701,
"step": 279
},
{
"epoch": 0.22099447513812154,
"grad_norm": 1.364184021949768,
"learning_rate": 2.9319731559850575e-05,
"loss": 2.0427,
"step": 280
},
{
"epoch": 0.2217837411207577,
"grad_norm": 2.4935107231140137,
"learning_rate": 2.9314094539184238e-05,
"loss": 2.0072,
"step": 281
},
{
"epoch": 0.22257300710339384,
"grad_norm": 1.3386174440383911,
"learning_rate": 2.9308434805718202e-05,
"loss": 2.0346,
"step": 282
},
{
"epoch": 0.22336227308603,
"grad_norm": 2.427462577819824,
"learning_rate": 2.9302752368433012e-05,
"loss": 2.0328,
"step": 283
},
{
"epoch": 0.22415153906866614,
"grad_norm": 1.5349195003509521,
"learning_rate": 2.9297047236345248e-05,
"loss": 2.0586,
"step": 284
},
{
"epoch": 0.2249408050513023,
"grad_norm": 2.037698745727539,
"learning_rate": 2.9291319418507487e-05,
"loss": 2.0782,
"step": 285
},
{
"epoch": 0.22573007103393844,
"grad_norm": 1.759464979171753,
"learning_rate": 2.928556892400831e-05,
"loss": 2.018,
"step": 286
},
{
"epoch": 0.2265193370165746,
"grad_norm": 1.4675992727279663,
"learning_rate": 2.927979576197227e-05,
"loss": 2.0049,
"step": 287
},
{
"epoch": 0.22730860299921074,
"grad_norm": 2.1224958896636963,
"learning_rate": 2.92739999415599e-05,
"loss": 2.0442,
"step": 288
},
{
"epoch": 0.2280978689818469,
"grad_norm": 1.3020617961883545,
"learning_rate": 2.926818147196769e-05,
"loss": 2.0149,
"step": 289
},
{
"epoch": 0.22888713496448304,
"grad_norm": 2.0251049995422363,
"learning_rate": 2.926234036242805e-05,
"loss": 2.015,
"step": 290
},
{
"epoch": 0.2296764009471192,
"grad_norm": 1.5819907188415527,
"learning_rate": 2.925647662220933e-05,
"loss": 2.0674,
"step": 291
},
{
"epoch": 0.23046566692975531,
"grad_norm": 2.1385371685028076,
"learning_rate": 2.9250590260615782e-05,
"loss": 2.051,
"step": 292
},
{
"epoch": 0.23125493291239146,
"grad_norm": 1.6703637838363647,
"learning_rate": 2.924468128698755e-05,
"loss": 2.0661,
"step": 293
},
{
"epoch": 0.23204419889502761,
"grad_norm": 1.527028203010559,
"learning_rate": 2.923874971070066e-05,
"loss": 2.0273,
"step": 294
},
{
"epoch": 0.23283346487766376,
"grad_norm": 1.87993323802948,
"learning_rate": 2.9232795541167007e-05,
"loss": 2.035,
"step": 295
},
{
"epoch": 0.23362273086029992,
"grad_norm": 1.5972620248794556,
"learning_rate": 2.922681878783433e-05,
"loss": 2.052,
"step": 296
},
{
"epoch": 0.23441199684293607,
"grad_norm": 2.544999122619629,
"learning_rate": 2.9220819460186212e-05,
"loss": 2.0747,
"step": 297
},
{
"epoch": 0.23520126282557222,
"grad_norm": 1.865227222442627,
"learning_rate": 2.9214797567742036e-05,
"loss": 2.0921,
"step": 298
},
{
"epoch": 0.23599052880820837,
"grad_norm": 1.6884143352508545,
"learning_rate": 2.9208753120057012e-05,
"loss": 1.9842,
"step": 299
},
{
"epoch": 0.23677979479084452,
"grad_norm": 1.422402024269104,
"learning_rate": 2.920268612672213e-05,
"loss": 2.0283,
"step": 300
},
{
"epoch": 0.23756906077348067,
"grad_norm": 2.1607301235198975,
"learning_rate": 2.919659659736414e-05,
"loss": 2.0836,
"step": 301
},
{
"epoch": 0.23835832675611682,
"grad_norm": 2.1439242362976074,
"learning_rate": 2.919048454164558e-05,
"loss": 2.0337,
"step": 302
},
{
"epoch": 0.23914759273875297,
"grad_norm": 1.4744690656661987,
"learning_rate": 2.9184349969264713e-05,
"loss": 2.1089,
"step": 303
},
{
"epoch": 0.23993685872138912,
"grad_norm": 1.688265323638916,
"learning_rate": 2.9178192889955525e-05,
"loss": 1.9577,
"step": 304
},
{
"epoch": 0.24072612470402527,
"grad_norm": 1.8507063388824463,
"learning_rate": 2.917201331348773e-05,
"loss": 2.0491,
"step": 305
},
{
"epoch": 0.24151539068666142,
"grad_norm": 1.7672779560089111,
"learning_rate": 2.916581124966673e-05,
"loss": 1.9864,
"step": 306
},
{
"epoch": 0.24230465666929754,
"grad_norm": 1.3851258754730225,
"learning_rate": 2.915958670833361e-05,
"loss": 2.0329,
"step": 307
},
{
"epoch": 0.2430939226519337,
"grad_norm": 1.7702224254608154,
"learning_rate": 2.9153339699365127e-05,
"loss": 2.0583,
"step": 308
},
{
"epoch": 0.24388318863456984,
"grad_norm": 1.283464789390564,
"learning_rate": 2.9147070232673678e-05,
"loss": 2.0354,
"step": 309
},
{
"epoch": 0.244672454617206,
"grad_norm": 1.3770912885665894,
"learning_rate": 2.9140778318207304e-05,
"loss": 2.0359,
"step": 310
},
{
"epoch": 0.24546172059984214,
"grad_norm": 1.590747594833374,
"learning_rate": 2.9134463965949657e-05,
"loss": 2.0104,
"step": 311
},
{
"epoch": 0.2462509865824783,
"grad_norm": 1.3684974908828735,
"learning_rate": 2.9128127185919995e-05,
"loss": 1.9965,
"step": 312
},
{
"epoch": 0.24704025256511444,
"grad_norm": 1.4951163530349731,
"learning_rate": 2.9121767988173166e-05,
"loss": 2.0349,
"step": 313
},
{
"epoch": 0.2478295185477506,
"grad_norm": 1.373107671737671,
"learning_rate": 2.9115386382799594e-05,
"loss": 2.0083,
"step": 314
},
{
"epoch": 0.24861878453038674,
"grad_norm": 7.275319576263428,
"learning_rate": 2.9108982379925237e-05,
"loss": 2.0614,
"step": 315
},
{
"epoch": 0.2494080505130229,
"grad_norm": 1.5124423503875732,
"learning_rate": 2.9102555989711617e-05,
"loss": 2.0778,
"step": 316
},
{
"epoch": 0.250197316495659,
"grad_norm": 1.264237403869629,
"learning_rate": 2.9096107222355764e-05,
"loss": 2.0035,
"step": 317
},
{
"epoch": 0.2509865824782952,
"grad_norm": 1.6124038696289062,
"learning_rate": 2.9089636088090222e-05,
"loss": 1.9842,
"step": 318
},
{
"epoch": 0.2517758484609313,
"grad_norm": 1.5616995096206665,
"learning_rate": 2.908314259718302e-05,
"loss": 2.0289,
"step": 319
},
{
"epoch": 0.2525651144435675,
"grad_norm": 1.362492322921753,
"learning_rate": 2.9076626759937665e-05,
"loss": 2.0416,
"step": 320
},
{
"epoch": 0.2533543804262036,
"grad_norm": 2.4406826496124268,
"learning_rate": 2.9070088586693124e-05,
"loss": 2.1443,
"step": 321
},
{
"epoch": 0.2541436464088398,
"grad_norm": 1.6452536582946777,
"learning_rate": 2.9063528087823795e-05,
"loss": 2.0788,
"step": 322
},
{
"epoch": 0.2549329123914759,
"grad_norm": 1.5950769186019897,
"learning_rate": 2.9056945273739516e-05,
"loss": 2.0651,
"step": 323
},
{
"epoch": 0.2557221783741121,
"grad_norm": 1.3139842748641968,
"learning_rate": 2.9050340154885522e-05,
"loss": 1.9862,
"step": 324
},
{
"epoch": 0.2565114443567482,
"grad_norm": 1.4966543912887573,
"learning_rate": 2.9043712741742446e-05,
"loss": 2.0707,
"step": 325
},
{
"epoch": 0.2573007103393844,
"grad_norm": 1.750475525856018,
"learning_rate": 2.9037063044826287e-05,
"loss": 2.0497,
"step": 326
},
{
"epoch": 0.2580899763220205,
"grad_norm": 1.473083257675171,
"learning_rate": 2.9030391074688422e-05,
"loss": 2.0227,
"step": 327
},
{
"epoch": 0.25887924230465664,
"grad_norm": 1.3315284252166748,
"learning_rate": 2.9023696841915547e-05,
"loss": 2.0061,
"step": 328
},
{
"epoch": 0.2596685082872928,
"grad_norm": 1.4846017360687256,
"learning_rate": 2.90169803571297e-05,
"loss": 1.9782,
"step": 329
},
{
"epoch": 0.26045777426992894,
"grad_norm": 1.4215277433395386,
"learning_rate": 2.901024163098822e-05,
"loss": 2.0208,
"step": 330
},
{
"epoch": 0.2612470402525651,
"grad_norm": 1.4329077005386353,
"learning_rate": 2.900348067418374e-05,
"loss": 2.0733,
"step": 331
},
{
"epoch": 0.26203630623520124,
"grad_norm": 1.7334977388381958,
"learning_rate": 2.899669749744416e-05,
"loss": 2.0124,
"step": 332
},
{
"epoch": 0.2628255722178374,
"grad_norm": 1.5247516632080078,
"learning_rate": 2.8989892111532646e-05,
"loss": 2.0615,
"step": 333
},
{
"epoch": 0.26361483820047354,
"grad_norm": 1.699466347694397,
"learning_rate": 2.8983064527247603e-05,
"loss": 2.0213,
"step": 334
},
{
"epoch": 0.2644041041831097,
"grad_norm": 1.425671935081482,
"learning_rate": 2.897621475542266e-05,
"loss": 2.0026,
"step": 335
},
{
"epoch": 0.26519337016574585,
"grad_norm": 2.0017428398132324,
"learning_rate": 2.8969342806926644e-05,
"loss": 2.0526,
"step": 336
},
{
"epoch": 0.265982636148382,
"grad_norm": 1.9548540115356445,
"learning_rate": 2.8962448692663578e-05,
"loss": 2.0681,
"step": 337
},
{
"epoch": 0.26677190213101815,
"grad_norm": 1.8230706453323364,
"learning_rate": 2.8955532423572654e-05,
"loss": 2.0354,
"step": 338
},
{
"epoch": 0.2675611681136543,
"grad_norm": 5.8361287117004395,
"learning_rate": 2.8948594010628223e-05,
"loss": 2.0588,
"step": 339
},
{
"epoch": 0.26835043409629045,
"grad_norm": 1.7668081521987915,
"learning_rate": 2.894163346483976e-05,
"loss": 2.0865,
"step": 340
},
{
"epoch": 0.2691397000789266,
"grad_norm": 1.423452615737915,
"learning_rate": 2.8934650797251875e-05,
"loss": 1.9987,
"step": 341
},
{
"epoch": 0.26992896606156275,
"grad_norm": 1.6870286464691162,
"learning_rate": 2.8927646018944264e-05,
"loss": 2.0765,
"step": 342
},
{
"epoch": 0.27071823204419887,
"grad_norm": 1.5475367307662964,
"learning_rate": 2.8920619141031722e-05,
"loss": 1.9911,
"step": 343
},
{
"epoch": 0.27150749802683505,
"grad_norm": 1.4201185703277588,
"learning_rate": 2.8913570174664104e-05,
"loss": 2.0093,
"step": 344
},
{
"epoch": 0.27229676400947117,
"grad_norm": 1.6851767301559448,
"learning_rate": 2.8906499131026313e-05,
"loss": 2.019,
"step": 345
},
{
"epoch": 0.27308602999210735,
"grad_norm": 2.842083215713501,
"learning_rate": 2.8899406021338277e-05,
"loss": 2.0587,
"step": 346
},
{
"epoch": 0.27387529597474347,
"grad_norm": 2.324366569519043,
"learning_rate": 2.8892290856854945e-05,
"loss": 2.0381,
"step": 347
},
{
"epoch": 0.27466456195737965,
"grad_norm": 1.7157739400863647,
"learning_rate": 2.8885153648866266e-05,
"loss": 2.1044,
"step": 348
},
{
"epoch": 0.27545382794001577,
"grad_norm": 1.7006040811538696,
"learning_rate": 2.887799440869715e-05,
"loss": 2.0283,
"step": 349
},
{
"epoch": 0.27624309392265195,
"grad_norm": 1.27659010887146,
"learning_rate": 2.8870813147707486e-05,
"loss": 1.9939,
"step": 350
},
{
"epoch": 0.2770323599052881,
"grad_norm": 1.2692986726760864,
"learning_rate": 2.8863609877292095e-05,
"loss": 2.0046,
"step": 351
},
{
"epoch": 0.27782162588792425,
"grad_norm": 1.8094099760055542,
"learning_rate": 2.885638460888071e-05,
"loss": 2.0605,
"step": 352
},
{
"epoch": 0.2786108918705604,
"grad_norm": 1.6548364162445068,
"learning_rate": 2.884913735393799e-05,
"loss": 2.0143,
"step": 353
},
{
"epoch": 0.27940015785319655,
"grad_norm": 1.4649075269699097,
"learning_rate": 2.8841868123963467e-05,
"loss": 2.0126,
"step": 354
},
{
"epoch": 0.2801894238358327,
"grad_norm": 1.3697763681411743,
"learning_rate": 2.883457693049155e-05,
"loss": 2.0532,
"step": 355
},
{
"epoch": 0.2809786898184688,
"grad_norm": 1.5171440839767456,
"learning_rate": 2.882726378509149e-05,
"loss": 1.9809,
"step": 356
},
{
"epoch": 0.281767955801105,
"grad_norm": 1.9760620594024658,
"learning_rate": 2.881992869936738e-05,
"loss": 2.0904,
"step": 357
},
{
"epoch": 0.2825572217837411,
"grad_norm": 1.4889613389968872,
"learning_rate": 2.8812571684958112e-05,
"loss": 2.0099,
"step": 358
},
{
"epoch": 0.2833464877663773,
"grad_norm": 1.700632095336914,
"learning_rate": 2.880519275353739e-05,
"loss": 2.0635,
"step": 359
},
{
"epoch": 0.2841357537490134,
"grad_norm": 1.9366352558135986,
"learning_rate": 2.8797791916813693e-05,
"loss": 1.9718,
"step": 360
},
{
"epoch": 0.2849250197316496,
"grad_norm": 1.4939085245132446,
"learning_rate": 2.8790369186530234e-05,
"loss": 2.0218,
"step": 361
},
{
"epoch": 0.2857142857142857,
"grad_norm": 2.3025503158569336,
"learning_rate": 2.8782924574465003e-05,
"loss": 2.0402,
"step": 362
},
{
"epoch": 0.2865035516969219,
"grad_norm": 1.7214744091033936,
"learning_rate": 2.877545809243068e-05,
"loss": 1.9928,
"step": 363
},
{
"epoch": 0.287292817679558,
"grad_norm": 3.420003890991211,
"learning_rate": 2.8767969752274658e-05,
"loss": 2.0177,
"step": 364
},
{
"epoch": 0.2880820836621942,
"grad_norm": 2.178987741470337,
"learning_rate": 2.8760459565879024e-05,
"loss": 1.9814,
"step": 365
},
{
"epoch": 0.2888713496448303,
"grad_norm": 2.3021793365478516,
"learning_rate": 2.8752927545160506e-05,
"loss": 2.0564,
"step": 366
},
{
"epoch": 0.2896606156274665,
"grad_norm": 1.8379982709884644,
"learning_rate": 2.8745373702070503e-05,
"loss": 1.9817,
"step": 367
},
{
"epoch": 0.2904498816101026,
"grad_norm": 2.0152549743652344,
"learning_rate": 2.8737798048595014e-05,
"loss": 1.9944,
"step": 368
},
{
"epoch": 0.2912391475927388,
"grad_norm": 2.254018783569336,
"learning_rate": 2.8730200596754676e-05,
"loss": 2.0644,
"step": 369
},
{
"epoch": 0.2920284135753749,
"grad_norm": 2.5273802280426025,
"learning_rate": 2.8722581358604686e-05,
"loss": 1.9702,
"step": 370
},
{
"epoch": 0.292817679558011,
"grad_norm": 1.9010764360427856,
"learning_rate": 2.871494034623483e-05,
"loss": 2.0317,
"step": 371
},
{
"epoch": 0.2936069455406472,
"grad_norm": 3.991340160369873,
"learning_rate": 2.8707277571769427e-05,
"loss": 2.0036,
"step": 372
},
{
"epoch": 0.2943962115232833,
"grad_norm": 3.4760918617248535,
"learning_rate": 2.8699593047367346e-05,
"loss": 2.0028,
"step": 373
},
{
"epoch": 0.2951854775059195,
"grad_norm": 2.2568345069885254,
"learning_rate": 2.8691886785221945e-05,
"loss": 2.0038,
"step": 374
},
{
"epoch": 0.2959747434885556,
"grad_norm": 3.1988019943237305,
"learning_rate": 2.8684158797561108e-05,
"loss": 2.0642,
"step": 375
},
{
"epoch": 0.2967640094711918,
"grad_norm": 1.9911775588989258,
"learning_rate": 2.867640909664715e-05,
"loss": 2.0453,
"step": 376
},
{
"epoch": 0.2975532754538279,
"grad_norm": 2.7963943481445312,
"learning_rate": 2.866863769477687e-05,
"loss": 2.1297,
"step": 377
},
{
"epoch": 0.2983425414364641,
"grad_norm": 2.355445384979248,
"learning_rate": 2.8660844604281496e-05,
"loss": 1.9831,
"step": 378
},
{
"epoch": 0.2991318074191002,
"grad_norm": 1.601205587387085,
"learning_rate": 2.8653029837526657e-05,
"loss": 1.9842,
"step": 379
},
{
"epoch": 0.2999210734017364,
"grad_norm": 1.3466482162475586,
"learning_rate": 2.8645193406912384e-05,
"loss": 2.0044,
"step": 380
},
{
"epoch": 0.3007103393843725,
"grad_norm": 2.001037836074829,
"learning_rate": 2.8637335324873094e-05,
"loss": 2.0035,
"step": 381
},
{
"epoch": 0.3014996053670087,
"grad_norm": 1.4719334840774536,
"learning_rate": 2.8629455603877538e-05,
"loss": 1.9992,
"step": 382
},
{
"epoch": 0.3022888713496448,
"grad_norm": 2.8064234256744385,
"learning_rate": 2.862155425642882e-05,
"loss": 1.9626,
"step": 383
},
{
"epoch": 0.303078137332281,
"grad_norm": 2.7371597290039062,
"learning_rate": 2.861363129506436e-05,
"loss": 2.0108,
"step": 384
},
{
"epoch": 0.30386740331491713,
"grad_norm": 10.789750099182129,
"learning_rate": 2.8605686732355848e-05,
"loss": 2.1043,
"step": 385
},
{
"epoch": 0.30465666929755325,
"grad_norm": 3.1278905868530273,
"learning_rate": 2.859772058090929e-05,
"loss": 2.0117,
"step": 386
},
{
"epoch": 0.30544593528018943,
"grad_norm": 1.378149151802063,
"learning_rate": 2.8589732853364914e-05,
"loss": 2.0337,
"step": 387
},
{
"epoch": 0.30623520126282555,
"grad_norm": 2.791260004043579,
"learning_rate": 2.8581723562397203e-05,
"loss": 2.0247,
"step": 388
},
{
"epoch": 0.30702446724546173,
"grad_norm": 2.1987431049346924,
"learning_rate": 2.857369272071484e-05,
"loss": 2.029,
"step": 389
},
{
"epoch": 0.30781373322809785,
"grad_norm": 2.8841540813446045,
"learning_rate": 2.8565640341060726e-05,
"loss": 1.9728,
"step": 390
},
{
"epoch": 0.30860299921073403,
"grad_norm": 3.048370838165283,
"learning_rate": 2.8557566436211915e-05,
"loss": 2.0747,
"step": 391
},
{
"epoch": 0.30939226519337015,
"grad_norm": 1.3164457082748413,
"learning_rate": 2.8549471018979622e-05,
"loss": 2.0121,
"step": 392
},
{
"epoch": 0.31018153117600633,
"grad_norm": 2.2098472118377686,
"learning_rate": 2.854135410220921e-05,
"loss": 2.0156,
"step": 393
},
{
"epoch": 0.31097079715864245,
"grad_norm": 1.937860131263733,
"learning_rate": 2.853321569878014e-05,
"loss": 1.9367,
"step": 394
},
{
"epoch": 0.31176006314127863,
"grad_norm": 1.6437093019485474,
"learning_rate": 2.8525055821605966e-05,
"loss": 1.9702,
"step": 395
},
{
"epoch": 0.31254932912391475,
"grad_norm": 1.7104240655899048,
"learning_rate": 2.8516874483634336e-05,
"loss": 2.0901,
"step": 396
},
{
"epoch": 0.31333859510655093,
"grad_norm": 2.3748621940612793,
"learning_rate": 2.850867169784693e-05,
"loss": 2.0178,
"step": 397
},
{
"epoch": 0.31412786108918705,
"grad_norm": 1.400006651878357,
"learning_rate": 2.8500447477259458e-05,
"loss": 1.9689,
"step": 398
},
{
"epoch": 0.3149171270718232,
"grad_norm": 1.2704992294311523,
"learning_rate": 2.849220183492167e-05,
"loss": 1.9398,
"step": 399
},
{
"epoch": 0.31570639305445936,
"grad_norm": 1.4304686784744263,
"learning_rate": 2.8483934783917266e-05,
"loss": 2.0251,
"step": 400
},
{
"epoch": 0.3164956590370955,
"grad_norm": 2.9045751094818115,
"learning_rate": 2.847564633736395e-05,
"loss": 2.0429,
"step": 401
},
{
"epoch": 0.31728492501973166,
"grad_norm": 1.477043867111206,
"learning_rate": 2.8467336508413366e-05,
"loss": 1.9737,
"step": 402
},
{
"epoch": 0.3180741910023678,
"grad_norm": 1.5823172330856323,
"learning_rate": 2.845900531025107e-05,
"loss": 1.9746,
"step": 403
},
{
"epoch": 0.31886345698500396,
"grad_norm": 2.0735702514648438,
"learning_rate": 2.8450652756096544e-05,
"loss": 2.0304,
"step": 404
},
{
"epoch": 0.3196527229676401,
"grad_norm": 2.1452434062957764,
"learning_rate": 2.8442278859203154e-05,
"loss": 2.0198,
"step": 405
},
{
"epoch": 0.32044198895027626,
"grad_norm": 1.5755338668823242,
"learning_rate": 2.8433883632858125e-05,
"loss": 1.9722,
"step": 406
},
{
"epoch": 0.3212312549329124,
"grad_norm": 1.497015118598938,
"learning_rate": 2.8425467090382533e-05,
"loss": 2.0369,
"step": 407
},
{
"epoch": 0.32202052091554856,
"grad_norm": 1.4064276218414307,
"learning_rate": 2.8417029245131272e-05,
"loss": 1.9983,
"step": 408
},
{
"epoch": 0.3228097868981847,
"grad_norm": 2.0976450443267822,
"learning_rate": 2.8408570110493038e-05,
"loss": 1.9712,
"step": 409
},
{
"epoch": 0.32359905288082086,
"grad_norm": 1.6904927492141724,
"learning_rate": 2.8400089699890313e-05,
"loss": 2.0335,
"step": 410
},
{
"epoch": 0.324388318863457,
"grad_norm": 1.9519057273864746,
"learning_rate": 2.8391588026779334e-05,
"loss": 2.0334,
"step": 411
},
{
"epoch": 0.32517758484609316,
"grad_norm": 2.1449975967407227,
"learning_rate": 2.838306510465008e-05,
"loss": 2.0053,
"step": 412
},
{
"epoch": 0.3259668508287293,
"grad_norm": 2.0840375423431396,
"learning_rate": 2.837452094702624e-05,
"loss": 1.9542,
"step": 413
},
{
"epoch": 0.3267561168113654,
"grad_norm": 2.5114152431488037,
"learning_rate": 2.8365955567465203e-05,
"loss": 1.9939,
"step": 414
},
{
"epoch": 0.3275453827940016,
"grad_norm": 2.250431537628174,
"learning_rate": 2.8357368979558035e-05,
"loss": 2.0081,
"step": 415
},
{
"epoch": 0.3283346487766377,
"grad_norm": 3.218123197555542,
"learning_rate": 2.8348761196929443e-05,
"loss": 1.9852,
"step": 416
},
{
"epoch": 0.3291239147592739,
"grad_norm": 4.118273735046387,
"learning_rate": 2.8340132233237784e-05,
"loss": 2.0008,
"step": 417
},
{
"epoch": 0.32991318074191,
"grad_norm": 2.0143914222717285,
"learning_rate": 2.8331482102175e-05,
"loss": 1.9703,
"step": 418
},
{
"epoch": 0.3307024467245462,
"grad_norm": 2.4409642219543457,
"learning_rate": 2.832281081746664e-05,
"loss": 1.9678,
"step": 419
},
{
"epoch": 0.3314917127071823,
"grad_norm": 1.8680471181869507,
"learning_rate": 2.831411839287181e-05,
"loss": 1.988,
"step": 420
},
{
"epoch": 0.3322809786898185,
"grad_norm": 2.5271923542022705,
"learning_rate": 2.8305404842183154e-05,
"loss": 1.9601,
"step": 421
},
{
"epoch": 0.3330702446724546,
"grad_norm": 1.9228181838989258,
"learning_rate": 2.829667017922685e-05,
"loss": 1.9337,
"step": 422
},
{
"epoch": 0.3338595106550908,
"grad_norm": 2.778698444366455,
"learning_rate": 2.8287914417862565e-05,
"loss": 2.0357,
"step": 423
},
{
"epoch": 0.3346487766377269,
"grad_norm": 1.8243480920791626,
"learning_rate": 2.8279137571983456e-05,
"loss": 1.9876,
"step": 424
},
{
"epoch": 0.3354380426203631,
"grad_norm": 2.0867981910705566,
"learning_rate": 2.827033965551612e-05,
"loss": 2.0039,
"step": 425
},
{
"epoch": 0.3362273086029992,
"grad_norm": 2.3519058227539062,
"learning_rate": 2.82615206824206e-05,
"loss": 1.9389,
"step": 426
},
{
"epoch": 0.3370165745856354,
"grad_norm": 2.992839813232422,
"learning_rate": 2.8252680666690346e-05,
"loss": 2.0878,
"step": 427
},
{
"epoch": 0.3378058405682715,
"grad_norm": 3.436065435409546,
"learning_rate": 2.8243819622352197e-05,
"loss": 2.0401,
"step": 428
},
{
"epoch": 0.33859510655090763,
"grad_norm": 1.8708529472351074,
"learning_rate": 2.8234937563466355e-05,
"loss": 2.0056,
"step": 429
},
{
"epoch": 0.3393843725335438,
"grad_norm": 2.743220090866089,
"learning_rate": 2.822603450412638e-05,
"loss": 2.0001,
"step": 430
},
{
"epoch": 0.34017363851617993,
"grad_norm": 3.055884838104248,
"learning_rate": 2.8217110458459136e-05,
"loss": 2.0223,
"step": 431
},
{
"epoch": 0.3409629044988161,
"grad_norm": 1.3831731081008911,
"learning_rate": 2.8208165440624804e-05,
"loss": 2.0223,
"step": 432
},
{
"epoch": 0.34175217048145223,
"grad_norm": 2.483407735824585,
"learning_rate": 2.8199199464816834e-05,
"loss": 2.0198,
"step": 433
},
{
"epoch": 0.3425414364640884,
"grad_norm": 1.9653549194335938,
"learning_rate": 2.8190212545261928e-05,
"loss": 2.0058,
"step": 434
},
{
"epoch": 0.34333070244672453,
"grad_norm": 2.2429397106170654,
"learning_rate": 2.818120469622003e-05,
"loss": 1.9628,
"step": 435
},
{
"epoch": 0.3441199684293607,
"grad_norm": 2.5655837059020996,
"learning_rate": 2.8172175931984276e-05,
"loss": 2.0032,
"step": 436
},
{
"epoch": 0.34490923441199683,
"grad_norm": 1.482216238975525,
"learning_rate": 2.8163126266881012e-05,
"loss": 1.9787,
"step": 437
},
{
"epoch": 0.345698500394633,
"grad_norm": 2.1151742935180664,
"learning_rate": 2.815405571526974e-05,
"loss": 1.94,
"step": 438
},
{
"epoch": 0.34648776637726914,
"grad_norm": 1.2237361669540405,
"learning_rate": 2.8144964291543093e-05,
"loss": 2.0106,
"step": 439
},
{
"epoch": 0.3472770323599053,
"grad_norm": 2.514650821685791,
"learning_rate": 2.8135852010126836e-05,
"loss": 2.0033,
"step": 440
},
{
"epoch": 0.34806629834254144,
"grad_norm": 1.4608144760131836,
"learning_rate": 2.812671888547982e-05,
"loss": 1.9822,
"step": 441
},
{
"epoch": 0.34885556432517756,
"grad_norm": 2.3360090255737305,
"learning_rate": 2.811756493209397e-05,
"loss": 1.9659,
"step": 442
},
{
"epoch": 0.34964483030781374,
"grad_norm": 2.3090360164642334,
"learning_rate": 2.8108390164494272e-05,
"loss": 1.9949,
"step": 443
},
{
"epoch": 0.35043409629044986,
"grad_norm": 1.496638536453247,
"learning_rate": 2.8099194597238727e-05,
"loss": 1.992,
"step": 444
},
{
"epoch": 0.35122336227308604,
"grad_norm": 2.009535551071167,
"learning_rate": 2.8089978244918344e-05,
"loss": 2.0079,
"step": 445
},
{
"epoch": 0.35201262825572216,
"grad_norm": 1.2042880058288574,
"learning_rate": 2.8080741122157116e-05,
"loss": 2.0639,
"step": 446
},
{
"epoch": 0.35280189423835834,
"grad_norm": 2.214263916015625,
"learning_rate": 2.8071483243611982e-05,
"loss": 1.9812,
"step": 447
},
{
"epoch": 0.35359116022099446,
"grad_norm": 1.3645316362380981,
"learning_rate": 2.8062204623972826e-05,
"loss": 2.0229,
"step": 448
},
{
"epoch": 0.35438042620363064,
"grad_norm": 2.3980894088745117,
"learning_rate": 2.8052905277962445e-05,
"loss": 2.0767,
"step": 449
},
{
"epoch": 0.35516969218626676,
"grad_norm": 2.262523651123047,
"learning_rate": 2.804358522033651e-05,
"loss": 1.9719,
"step": 450
},
{
"epoch": 0.35595895816890294,
"grad_norm": 1.6269384622573853,
"learning_rate": 2.803424446588357e-05,
"loss": 1.9849,
"step": 451
},
{
"epoch": 0.35674822415153906,
"grad_norm": 1.989469051361084,
"learning_rate": 2.8024883029425007e-05,
"loss": 2.0425,
"step": 452
},
{
"epoch": 0.35753749013417524,
"grad_norm": 1.4726110696792603,
"learning_rate": 2.8015500925815014e-05,
"loss": 1.9886,
"step": 453
},
{
"epoch": 0.35832675611681136,
"grad_norm": 1.4802323579788208,
"learning_rate": 2.8006098169940594e-05,
"loss": 1.9858,
"step": 454
},
{
"epoch": 0.35911602209944754,
"grad_norm": 2.984384059906006,
"learning_rate": 2.799667477672151e-05,
"loss": 2.0638,
"step": 455
},
{
"epoch": 0.35990528808208366,
"grad_norm": 2.767064094543457,
"learning_rate": 2.7987230761110268e-05,
"loss": 2.0321,
"step": 456
},
{
"epoch": 0.3606945540647198,
"grad_norm": 1.6278806924819946,
"learning_rate": 2.7977766138092105e-05,
"loss": 1.9889,
"step": 457
},
{
"epoch": 0.36148382004735596,
"grad_norm": 2.8049097061157227,
"learning_rate": 2.796828092268495e-05,
"loss": 2.0226,
"step": 458
},
{
"epoch": 0.3622730860299921,
"grad_norm": 2.840355396270752,
"learning_rate": 2.7958775129939407e-05,
"loss": 1.9445,
"step": 459
},
{
"epoch": 0.36306235201262826,
"grad_norm": 1.337878942489624,
"learning_rate": 2.7949248774938735e-05,
"loss": 1.9669,
"step": 460
},
{
"epoch": 0.3638516179952644,
"grad_norm": 2.2922017574310303,
"learning_rate": 2.793970187279882e-05,
"loss": 1.9675,
"step": 461
},
{
"epoch": 0.36464088397790057,
"grad_norm": 1.5649480819702148,
"learning_rate": 2.7930134438668147e-05,
"loss": 1.9923,
"step": 462
},
{
"epoch": 0.3654301499605367,
"grad_norm": 1.7279510498046875,
"learning_rate": 2.7920546487727782e-05,
"loss": 1.9885,
"step": 463
},
{
"epoch": 0.36621941594317287,
"grad_norm": 1.4108428955078125,
"learning_rate": 2.7910938035191344e-05,
"loss": 2.0052,
"step": 464
},
{
"epoch": 0.367008681925809,
"grad_norm": 1.3832406997680664,
"learning_rate": 2.790130909630499e-05,
"loss": 1.9843,
"step": 465
},
{
"epoch": 0.36779794790844517,
"grad_norm": 1.4402744770050049,
"learning_rate": 2.7891659686347372e-05,
"loss": 1.9644,
"step": 466
},
{
"epoch": 0.3685872138910813,
"grad_norm": 1.1699187755584717,
"learning_rate": 2.7881989820629634e-05,
"loss": 1.9496,
"step": 467
},
{
"epoch": 0.36937647987371747,
"grad_norm": 1.7933238744735718,
"learning_rate": 2.787229951449538e-05,
"loss": 1.9949,
"step": 468
},
{
"epoch": 0.3701657458563536,
"grad_norm": 1.3118610382080078,
"learning_rate": 2.7862588783320634e-05,
"loss": 1.945,
"step": 469
},
{
"epoch": 0.3709550118389897,
"grad_norm": 1.6701432466506958,
"learning_rate": 2.7852857642513838e-05,
"loss": 1.9523,
"step": 470
},
{
"epoch": 0.3717442778216259,
"grad_norm": 1.4986763000488281,
"learning_rate": 2.7843106107515824e-05,
"loss": 1.9515,
"step": 471
},
{
"epoch": 0.372533543804262,
"grad_norm": 1.5882444381713867,
"learning_rate": 2.783333419379978e-05,
"loss": 2.0187,
"step": 472
},
{
"epoch": 0.3733228097868982,
"grad_norm": 1.8366268873214722,
"learning_rate": 2.782354191687122e-05,
"loss": 1.984,
"step": 473
},
{
"epoch": 0.3741120757695343,
"grad_norm": 1.6720727682113647,
"learning_rate": 2.7813729292267987e-05,
"loss": 1.9898,
"step": 474
},
{
"epoch": 0.3749013417521705,
"grad_norm": 1.5399422645568848,
"learning_rate": 2.78038963355602e-05,
"loss": 1.9989,
"step": 475
},
{
"epoch": 0.3756906077348066,
"grad_norm": 2.1057066917419434,
"learning_rate": 2.7794043062350234e-05,
"loss": 1.996,
"step": 476
},
{
"epoch": 0.3764798737174428,
"grad_norm": 1.6760990619659424,
"learning_rate": 2.7784169488272714e-05,
"loss": 2.0258,
"step": 477
},
{
"epoch": 0.3772691397000789,
"grad_norm": 1.8123892545700073,
"learning_rate": 2.7774275628994474e-05,
"loss": 1.9563,
"step": 478
},
{
"epoch": 0.3780584056827151,
"grad_norm": 1.5573099851608276,
"learning_rate": 2.776436150021453e-05,
"loss": 1.9674,
"step": 479
},
{
"epoch": 0.3788476716653512,
"grad_norm": 1.8921067714691162,
"learning_rate": 2.7754427117664064e-05,
"loss": 1.9627,
"step": 480
},
{
"epoch": 0.3796369376479874,
"grad_norm": 1.409118890762329,
"learning_rate": 2.7744472497106396e-05,
"loss": 1.9807,
"step": 481
},
{
"epoch": 0.3804262036306235,
"grad_norm": 1.9282482862472534,
"learning_rate": 2.7734497654336955e-05,
"loss": 2.0048,
"step": 482
},
{
"epoch": 0.3812154696132597,
"grad_norm": 1.6770762205123901,
"learning_rate": 2.7724502605183263e-05,
"loss": 2.0104,
"step": 483
},
{
"epoch": 0.3820047355958958,
"grad_norm": 1.8302911520004272,
"learning_rate": 2.7714487365504903e-05,
"loss": 1.9957,
"step": 484
},
{
"epoch": 0.38279400157853194,
"grad_norm": 1.6130921840667725,
"learning_rate": 2.770445195119349e-05,
"loss": 2.0331,
"step": 485
},
{
"epoch": 0.3835832675611681,
"grad_norm": 1.7150806188583374,
"learning_rate": 2.769439637817265e-05,
"loss": 1.9678,
"step": 486
},
{
"epoch": 0.38437253354380424,
"grad_norm": 1.2525075674057007,
"learning_rate": 2.7684320662398017e-05,
"loss": 1.9609,
"step": 487
},
{
"epoch": 0.3851617995264404,
"grad_norm": 1.7981503009796143,
"learning_rate": 2.7674224819857155e-05,
"loss": 1.9904,
"step": 488
},
{
"epoch": 0.38595106550907654,
"grad_norm": 1.3988957405090332,
"learning_rate": 2.7664108866569583e-05,
"loss": 2.0331,
"step": 489
},
{
"epoch": 0.3867403314917127,
"grad_norm": 1.6103547811508179,
"learning_rate": 2.7653972818586725e-05,
"loss": 1.9774,
"step": 490
},
{
"epoch": 0.38752959747434884,
"grad_norm": 1.465564489364624,
"learning_rate": 2.7643816691991896e-05,
"loss": 2.0019,
"step": 491
},
{
"epoch": 0.388318863456985,
"grad_norm": 1.4811826944351196,
"learning_rate": 2.7633640502900258e-05,
"loss": 2.0019,
"step": 492
},
{
"epoch": 0.38910812943962114,
"grad_norm": 1.9267323017120361,
"learning_rate": 2.762344426745883e-05,
"loss": 1.9857,
"step": 493
},
{
"epoch": 0.3898973954222573,
"grad_norm": 1.2511990070343018,
"learning_rate": 2.7613228001846408e-05,
"loss": 1.9663,
"step": 494
},
{
"epoch": 0.39068666140489344,
"grad_norm": 1.2942070960998535,
"learning_rate": 2.7602991722273595e-05,
"loss": 1.9845,
"step": 495
},
{
"epoch": 0.3914759273875296,
"grad_norm": 1.2999018430709839,
"learning_rate": 2.7592735444982745e-05,
"loss": 2.0299,
"step": 496
},
{
"epoch": 0.39226519337016574,
"grad_norm": 1.3434914350509644,
"learning_rate": 2.758245918624794e-05,
"loss": 1.9909,
"step": 497
},
{
"epoch": 0.3930544593528019,
"grad_norm": 1.5425822734832764,
"learning_rate": 2.757216296237496e-05,
"loss": 1.9483,
"step": 498
},
{
"epoch": 0.39384372533543804,
"grad_norm": 1.1178399324417114,
"learning_rate": 2.7561846789701295e-05,
"loss": 1.9843,
"step": 499
},
{
"epoch": 0.39463299131807417,
"grad_norm": 1.8598966598510742,
"learning_rate": 2.755151068459605e-05,
"loss": 1.9679,
"step": 500
},
{
"epoch": 0.39542225730071034,
"grad_norm": 1.3829669952392578,
"learning_rate": 2.7541154663459973e-05,
"loss": 2.0293,
"step": 501
},
{
"epoch": 0.39621152328334647,
"grad_norm": 1.8445639610290527,
"learning_rate": 2.7530778742725428e-05,
"loss": 1.9737,
"step": 502
},
{
"epoch": 0.39700078926598265,
"grad_norm": 1.229123830795288,
"learning_rate": 2.7520382938856332e-05,
"loss": 2.0013,
"step": 503
},
{
"epoch": 0.39779005524861877,
"grad_norm": 1.8316127061843872,
"learning_rate": 2.7509967268348168e-05,
"loss": 1.9645,
"step": 504
},
{
"epoch": 0.39857932123125495,
"grad_norm": 1.3551149368286133,
"learning_rate": 2.7499531747727938e-05,
"loss": 1.9771,
"step": 505
},
{
"epoch": 0.39936858721389107,
"grad_norm": 1.8307702541351318,
"learning_rate": 2.7489076393554128e-05,
"loss": 2.0686,
"step": 506
},
{
"epoch": 0.40015785319652725,
"grad_norm": 2.3857483863830566,
"learning_rate": 2.747860122241671e-05,
"loss": 2.0652,
"step": 507
},
{
"epoch": 0.40094711917916337,
"grad_norm": 5.453534126281738,
"learning_rate": 2.7468106250937104e-05,
"loss": 2.0154,
"step": 508
},
{
"epoch": 0.40173638516179955,
"grad_norm": 4.784702301025391,
"learning_rate": 2.745759149576813e-05,
"loss": 2.0359,
"step": 509
},
{
"epoch": 0.40252565114443567,
"grad_norm": 1.8123985528945923,
"learning_rate": 2.7447056973594018e-05,
"loss": 1.9937,
"step": 510
},
{
"epoch": 0.40331491712707185,
"grad_norm": 1.9731765985488892,
"learning_rate": 2.7436502701130346e-05,
"loss": 2.0277,
"step": 511
},
{
"epoch": 0.40410418310970797,
"grad_norm": 2.117807149887085,
"learning_rate": 2.742592869512405e-05,
"loss": 2.0225,
"step": 512
},
{
"epoch": 0.4048934490923441,
"grad_norm": 1.983472466468811,
"learning_rate": 2.741533497235336e-05,
"loss": 2.0592,
"step": 513
},
{
"epoch": 0.40568271507498027,
"grad_norm": 1.5282464027404785,
"learning_rate": 2.74047215496278e-05,
"loss": 2.0422,
"step": 514
},
{
"epoch": 0.4064719810576164,
"grad_norm": 1.679261326789856,
"learning_rate": 2.7394088443788154e-05,
"loss": 1.9958,
"step": 515
},
{
"epoch": 0.40726124704025257,
"grad_norm": 1.5584269762039185,
"learning_rate": 2.7383435671706433e-05,
"loss": 2.0147,
"step": 516
},
{
"epoch": 0.4080505130228887,
"grad_norm": 1.3049349784851074,
"learning_rate": 2.7372763250285865e-05,
"loss": 2.0012,
"step": 517
},
{
"epoch": 0.4088397790055249,
"grad_norm": 1.1305190324783325,
"learning_rate": 2.7362071196460833e-05,
"loss": 2.0246,
"step": 518
},
{
"epoch": 0.409629044988161,
"grad_norm": 1.8814990520477295,
"learning_rate": 2.73513595271969e-05,
"loss": 2.0355,
"step": 519
},
{
"epoch": 0.4104183109707972,
"grad_norm": 1.9675487279891968,
"learning_rate": 2.7340628259490732e-05,
"loss": 2.0102,
"step": 520
},
{
"epoch": 0.4112075769534333,
"grad_norm": 1.3380517959594727,
"learning_rate": 2.7329877410370103e-05,
"loss": 1.9709,
"step": 521
},
{
"epoch": 0.4119968429360695,
"grad_norm": 1.272178053855896,
"learning_rate": 2.731910699689386e-05,
"loss": 2.0224,
"step": 522
},
{
"epoch": 0.4127861089187056,
"grad_norm": 1.2764815092086792,
"learning_rate": 2.730831703615188e-05,
"loss": 1.9994,
"step": 523
},
{
"epoch": 0.4135753749013418,
"grad_norm": 2.1558752059936523,
"learning_rate": 2.729750754526507e-05,
"loss": 2.034,
"step": 524
},
{
"epoch": 0.4143646408839779,
"grad_norm": 1.6258021593093872,
"learning_rate": 2.7286678541385327e-05,
"loss": 1.9565,
"step": 525
},
{
"epoch": 0.4151539068666141,
"grad_norm": 1.502463698387146,
"learning_rate": 2.727583004169549e-05,
"loss": 1.9866,
"step": 526
},
{
"epoch": 0.4159431728492502,
"grad_norm": 1.258079171180725,
"learning_rate": 2.7264962063409368e-05,
"loss": 1.9524,
"step": 527
},
{
"epoch": 0.4167324388318863,
"grad_norm": 1.5579389333724976,
"learning_rate": 2.7254074623771643e-05,
"loss": 2.034,
"step": 528
},
{
"epoch": 0.4175217048145225,
"grad_norm": 1.9363354444503784,
"learning_rate": 2.7243167740057894e-05,
"loss": 1.952,
"step": 529
},
{
"epoch": 0.4183109707971586,
"grad_norm": 2.8109583854675293,
"learning_rate": 2.723224142957455e-05,
"loss": 1.9794,
"step": 530
},
{
"epoch": 0.4191002367797948,
"grad_norm": 1.8886324167251587,
"learning_rate": 2.7221295709658873e-05,
"loss": 2.0423,
"step": 531
},
{
"epoch": 0.4198895027624309,
"grad_norm": 1.2472280263900757,
"learning_rate": 2.721033059767891e-05,
"loss": 2.0076,
"step": 532
},
{
"epoch": 0.4206787687450671,
"grad_norm": 2.1861677169799805,
"learning_rate": 2.719934611103348e-05,
"loss": 2.0189,
"step": 533
},
{
"epoch": 0.4214680347277032,
"grad_norm": 1.2035226821899414,
"learning_rate": 2.7188342267152155e-05,
"loss": 1.9594,
"step": 534
},
{
"epoch": 0.4222573007103394,
"grad_norm": 2.6411736011505127,
"learning_rate": 2.7177319083495212e-05,
"loss": 1.9578,
"step": 535
},
{
"epoch": 0.4230465666929755,
"grad_norm": 63.83803939819336,
"learning_rate": 2.7166276577553612e-05,
"loss": 2.1051,
"step": 536
},
{
"epoch": 0.4238358326756117,
"grad_norm": 1.9330120086669922,
"learning_rate": 2.7155214766848996e-05,
"loss": 1.9559,
"step": 537
},
{
"epoch": 0.4246250986582478,
"grad_norm": 1.8951784372329712,
"learning_rate": 2.714413366893361e-05,
"loss": 1.9691,
"step": 538
},
{
"epoch": 0.425414364640884,
"grad_norm": 1.6028685569763184,
"learning_rate": 2.7133033301390328e-05,
"loss": 1.9484,
"step": 539
},
{
"epoch": 0.4262036306235201,
"grad_norm": 2.4249136447906494,
"learning_rate": 2.712191368183258e-05,
"loss": 2.1158,
"step": 540
},
{
"epoch": 0.42699289660615625,
"grad_norm": 3.1744580268859863,
"learning_rate": 2.711077482790435e-05,
"loss": 1.9602,
"step": 541
},
{
"epoch": 0.4277821625887924,
"grad_norm": 1.779587745666504,
"learning_rate": 2.7099616757280158e-05,
"loss": 2.037,
"step": 542
},
{
"epoch": 0.42857142857142855,
"grad_norm": 2.5144872665405273,
"learning_rate": 2.708843948766499e-05,
"loss": 2.0066,
"step": 543
},
{
"epoch": 0.4293606945540647,
"grad_norm": 3.0390021800994873,
"learning_rate": 2.707724303679431e-05,
"loss": 2.0017,
"step": 544
},
{
"epoch": 0.43014996053670085,
"grad_norm": 2.0000808238983154,
"learning_rate": 2.706602742243402e-05,
"loss": 1.9526,
"step": 545
},
{
"epoch": 0.430939226519337,
"grad_norm": 2.5513486862182617,
"learning_rate": 2.7054792662380427e-05,
"loss": 1.9689,
"step": 546
},
{
"epoch": 0.43172849250197315,
"grad_norm": 2.5275909900665283,
"learning_rate": 2.704353877446021e-05,
"loss": 1.9719,
"step": 547
},
{
"epoch": 0.4325177584846093,
"grad_norm": 1.781296968460083,
"learning_rate": 2.7032265776530414e-05,
"loss": 1.9811,
"step": 548
},
{
"epoch": 0.43330702446724545,
"grad_norm": 1.563276767730713,
"learning_rate": 2.7020973686478388e-05,
"loss": 2.0099,
"step": 549
},
{
"epoch": 0.4340962904498816,
"grad_norm": 2.2312963008880615,
"learning_rate": 2.700966252222179e-05,
"loss": 2.0063,
"step": 550
},
{
"epoch": 0.43488555643251775,
"grad_norm": 2.1669869422912598,
"learning_rate": 2.699833230170854e-05,
"loss": 1.9857,
"step": 551
},
{
"epoch": 0.43567482241515393,
"grad_norm": 2.051197052001953,
"learning_rate": 2.6986983042916792e-05,
"loss": 2.0003,
"step": 552
},
{
"epoch": 0.43646408839779005,
"grad_norm": 1.9057836532592773,
"learning_rate": 2.697561476385491e-05,
"loss": 2.0375,
"step": 553
},
{
"epoch": 0.43725335438042623,
"grad_norm": 1.7578641176223755,
"learning_rate": 2.6964227482561442e-05,
"loss": 2.0319,
"step": 554
},
{
"epoch": 0.43804262036306235,
"grad_norm": 2.087156295776367,
"learning_rate": 2.6952821217105086e-05,
"loss": 1.9948,
"step": 555
},
{
"epoch": 0.4388318863456985,
"grad_norm": 1.399621844291687,
"learning_rate": 2.6941395985584656e-05,
"loss": 1.9759,
"step": 556
},
{
"epoch": 0.43962115232833465,
"grad_norm": 1.4898017644882202,
"learning_rate": 2.6929951806129076e-05,
"loss": 1.9606,
"step": 557
},
{
"epoch": 0.4404104183109708,
"grad_norm": 1.5503097772598267,
"learning_rate": 2.6918488696897317e-05,
"loss": 1.9867,
"step": 558
},
{
"epoch": 0.44119968429360695,
"grad_norm": 1.4455583095550537,
"learning_rate": 2.69070066760784e-05,
"loss": 1.9556,
"step": 559
},
{
"epoch": 0.4419889502762431,
"grad_norm": 1.4497873783111572,
"learning_rate": 2.689550576189135e-05,
"loss": 1.9932,
"step": 560
},
{
"epoch": 0.44277821625887925,
"grad_norm": 1.2433416843414307,
"learning_rate": 2.688398597258517e-05,
"loss": 1.9857,
"step": 561
},
{
"epoch": 0.4435674822415154,
"grad_norm": 1.5041720867156982,
"learning_rate": 2.6872447326438813e-05,
"loss": 1.9872,
"step": 562
},
{
"epoch": 0.44435674822415155,
"grad_norm": 1.2008202075958252,
"learning_rate": 2.6860889841761152e-05,
"loss": 2.0249,
"step": 563
},
{
"epoch": 0.4451460142067877,
"grad_norm": 1.420989751815796,
"learning_rate": 2.6849313536890956e-05,
"loss": 1.9807,
"step": 564
},
{
"epoch": 0.44593528018942385,
"grad_norm": 1.4576897621154785,
"learning_rate": 2.6837718430196848e-05,
"loss": 1.9909,
"step": 565
},
{
"epoch": 0.44672454617206,
"grad_norm": 1.538425087928772,
"learning_rate": 2.68261045400773e-05,
"loss": 1.9566,
"step": 566
},
{
"epoch": 0.44751381215469616,
"grad_norm": 1.6612803936004639,
"learning_rate": 2.681447188496057e-05,
"loss": 1.9703,
"step": 567
},
{
"epoch": 0.4483030781373323,
"grad_norm": 1.5408464670181274,
"learning_rate": 2.6802820483304713e-05,
"loss": 1.9929,
"step": 568
},
{
"epoch": 0.44909234411996846,
"grad_norm": 1.468129277229309,
"learning_rate": 2.6791150353597507e-05,
"loss": 1.9623,
"step": 569
},
{
"epoch": 0.4498816101026046,
"grad_norm": 1.5219614505767822,
"learning_rate": 2.6779461514356454e-05,
"loss": 2.0299,
"step": 570
},
{
"epoch": 0.4506708760852407,
"grad_norm": 1.2168009281158447,
"learning_rate": 2.6767753984128756e-05,
"loss": 1.9743,
"step": 571
},
{
"epoch": 0.4514601420678769,
"grad_norm": 1.8264933824539185,
"learning_rate": 2.6756027781491262e-05,
"loss": 2.0069,
"step": 572
},
{
"epoch": 0.452249408050513,
"grad_norm": 1.409223198890686,
"learning_rate": 2.6744282925050443e-05,
"loss": 2.0129,
"step": 573
},
{
"epoch": 0.4530386740331492,
"grad_norm": 2.189690113067627,
"learning_rate": 2.6732519433442386e-05,
"loss": 1.9749,
"step": 574
},
{
"epoch": 0.4538279400157853,
"grad_norm": 1.3944450616836548,
"learning_rate": 2.672073732533273e-05,
"loss": 2.039,
"step": 575
},
{
"epoch": 0.4546172059984215,
"grad_norm": 2.574265718460083,
"learning_rate": 2.670893661941666e-05,
"loss": 2.0016,
"step": 576
},
{
"epoch": 0.4554064719810576,
"grad_norm": 2.220921039581299,
"learning_rate": 2.669711733441888e-05,
"loss": 2.0456,
"step": 577
},
{
"epoch": 0.4561957379636938,
"grad_norm": 1.492487907409668,
"learning_rate": 2.668527948909356e-05,
"loss": 1.9773,
"step": 578
},
{
"epoch": 0.4569850039463299,
"grad_norm": 2.086151123046875,
"learning_rate": 2.667342310222433e-05,
"loss": 2.0039,
"step": 579
},
{
"epoch": 0.4577742699289661,
"grad_norm": 1.9060444831848145,
"learning_rate": 2.6661548192624234e-05,
"loss": 1.9759,
"step": 580
},
{
"epoch": 0.4585635359116022,
"grad_norm": 15.723176956176758,
"learning_rate": 2.6649654779135715e-05,
"loss": 2.0215,
"step": 581
},
{
"epoch": 0.4593528018942384,
"grad_norm": 3.334221124649048,
"learning_rate": 2.663774288063057e-05,
"loss": 2.0489,
"step": 582
},
{
"epoch": 0.4601420678768745,
"grad_norm": 2.814065933227539,
"learning_rate": 2.6625812516009925e-05,
"loss": 2.0048,
"step": 583
},
{
"epoch": 0.46093133385951063,
"grad_norm": 3.551992177963257,
"learning_rate": 2.6613863704204217e-05,
"loss": 2.0969,
"step": 584
},
{
"epoch": 0.4617205998421468,
"grad_norm": 3.252483367919922,
"learning_rate": 2.660189646417315e-05,
"loss": 2.0006,
"step": 585
},
{
"epoch": 0.46250986582478293,
"grad_norm": 3.306950092315674,
"learning_rate": 2.658991081490566e-05,
"loss": 2.0138,
"step": 586
},
{
"epoch": 0.4632991318074191,
"grad_norm": 1.512024998664856,
"learning_rate": 2.65779067754199e-05,
"loss": 1.9825,
"step": 587
},
{
"epoch": 0.46408839779005523,
"grad_norm": 1.600062608718872,
"learning_rate": 2.6565884364763214e-05,
"loss": 2.013,
"step": 588
},
{
"epoch": 0.4648776637726914,
"grad_norm": 1.3948335647583008,
"learning_rate": 2.655384360201208e-05,
"loss": 1.9622,
"step": 589
},
{
"epoch": 0.46566692975532753,
"grad_norm": 1.922743558883667,
"learning_rate": 2.6541784506272108e-05,
"loss": 1.9972,
"step": 590
},
{
"epoch": 0.4664561957379637,
"grad_norm": 1.3690232038497925,
"learning_rate": 2.652970709667798e-05,
"loss": 1.9871,
"step": 591
},
{
"epoch": 0.46724546172059983,
"grad_norm": 1.273729681968689,
"learning_rate": 2.6517611392393463e-05,
"loss": 1.9675,
"step": 592
},
{
"epoch": 0.468034727703236,
"grad_norm": 1.565245509147644,
"learning_rate": 2.6505497412611335e-05,
"loss": 1.9785,
"step": 593
},
{
"epoch": 0.46882399368587213,
"grad_norm": 1.3492815494537354,
"learning_rate": 2.6493365176553376e-05,
"loss": 1.9781,
"step": 594
},
{
"epoch": 0.4696132596685083,
"grad_norm": 1.7030278444290161,
"learning_rate": 2.648121470347034e-05,
"loss": 1.9991,
"step": 595
},
{
"epoch": 0.47040252565114443,
"grad_norm": 1.3920458555221558,
"learning_rate": 2.646904601264191e-05,
"loss": 1.9711,
"step": 596
},
{
"epoch": 0.4711917916337806,
"grad_norm": 1.3776328563690186,
"learning_rate": 2.6456859123376676e-05,
"loss": 1.9506,
"step": 597
},
{
"epoch": 0.47198105761641673,
"grad_norm": 1.3518524169921875,
"learning_rate": 2.644465405501212e-05,
"loss": 1.9457,
"step": 598
},
{
"epoch": 0.47277032359905286,
"grad_norm": 1.43320631980896,
"learning_rate": 2.643243082691455e-05,
"loss": 1.9906,
"step": 599
},
{
"epoch": 0.47355958958168903,
"grad_norm": 1.2658172845840454,
"learning_rate": 2.6420189458479095e-05,
"loss": 1.9749,
"step": 600
},
{
"epoch": 0.47434885556432516,
"grad_norm": 1.543461799621582,
"learning_rate": 2.6407929969129668e-05,
"loss": 1.968,
"step": 601
},
{
"epoch": 0.47513812154696133,
"grad_norm": 1.238538384437561,
"learning_rate": 2.6395652378318945e-05,
"loss": 1.97,
"step": 602
},
{
"epoch": 0.47592738752959746,
"grad_norm": 1.790334701538086,
"learning_rate": 2.6383356705528306e-05,
"loss": 2.0287,
"step": 603
},
{
"epoch": 0.47671665351223363,
"grad_norm": 1.5136432647705078,
"learning_rate": 2.6371042970267846e-05,
"loss": 1.9737,
"step": 604
},
{
"epoch": 0.47750591949486976,
"grad_norm": 1.5719919204711914,
"learning_rate": 2.6358711192076294e-05,
"loss": 2.0797,
"step": 605
},
{
"epoch": 0.47829518547750594,
"grad_norm": 1.8503254652023315,
"learning_rate": 2.6346361390521026e-05,
"loss": 1.9932,
"step": 606
},
{
"epoch": 0.47908445146014206,
"grad_norm": 1.1544816493988037,
"learning_rate": 2.6333993585198014e-05,
"loss": 1.9213,
"step": 607
},
{
"epoch": 0.47987371744277824,
"grad_norm": 1.2639306783676147,
"learning_rate": 2.63216077957318e-05,
"loss": 1.9033,
"step": 608
},
{
"epoch": 0.48066298342541436,
"grad_norm": 1.110762357711792,
"learning_rate": 2.6309204041775444e-05,
"loss": 1.9829,
"step": 609
},
{
"epoch": 0.48145224940805054,
"grad_norm": 1.6563924551010132,
"learning_rate": 2.629678234301054e-05,
"loss": 2.078,
"step": 610
},
{
"epoch": 0.48224151539068666,
"grad_norm": 1.3371410369873047,
"learning_rate": 2.6284342719147134e-05,
"loss": 2.024,
"step": 611
},
{
"epoch": 0.48303078137332284,
"grad_norm": 1.093579649925232,
"learning_rate": 2.6271885189923714e-05,
"loss": 1.9387,
"step": 612
},
{
"epoch": 0.48382004735595896,
"grad_norm": 1.2362289428710938,
"learning_rate": 2.6259409775107198e-05,
"loss": 2.0065,
"step": 613
},
{
"epoch": 0.4846093133385951,
"grad_norm": 1.3334242105484009,
"learning_rate": 2.6246916494492866e-05,
"loss": 2.003,
"step": 614
},
{
"epoch": 0.48539857932123126,
"grad_norm": 1.247543454170227,
"learning_rate": 2.6234405367904354e-05,
"loss": 1.9803,
"step": 615
},
{
"epoch": 0.4861878453038674,
"grad_norm": 1.462421178817749,
"learning_rate": 2.622187641519361e-05,
"loss": 1.9861,
"step": 616
},
{
"epoch": 0.48697711128650356,
"grad_norm": 1.3130522966384888,
"learning_rate": 2.6209329656240883e-05,
"loss": 1.9314,
"step": 617
},
{
"epoch": 0.4877663772691397,
"grad_norm": 1.2004978656768799,
"learning_rate": 2.619676511095465e-05,
"loss": 2.0066,
"step": 618
},
{
"epoch": 0.48855564325177586,
"grad_norm": 1.11477530002594,
"learning_rate": 2.618418279927163e-05,
"loss": 1.9476,
"step": 619
},
{
"epoch": 0.489344909234412,
"grad_norm": 1.0705735683441162,
"learning_rate": 2.617158274115673e-05,
"loss": 1.9561,
"step": 620
},
{
"epoch": 0.49013417521704816,
"grad_norm": 1.1721019744873047,
"learning_rate": 2.6158964956603008e-05,
"loss": 1.9738,
"step": 621
},
{
"epoch": 0.4909234411996843,
"grad_norm": 1.3675673007965088,
"learning_rate": 2.6146329465631657e-05,
"loss": 1.9825,
"step": 622
},
{
"epoch": 0.49171270718232046,
"grad_norm": 1.2252568006515503,
"learning_rate": 2.6133676288291964e-05,
"loss": 1.9561,
"step": 623
},
{
"epoch": 0.4925019731649566,
"grad_norm": 1.335405707359314,
"learning_rate": 2.6121005444661275e-05,
"loss": 2.0124,
"step": 624
},
{
"epoch": 0.49329123914759276,
"grad_norm": 1.20765221118927,
"learning_rate": 2.610831695484498e-05,
"loss": 2.0189,
"step": 625
},
{
"epoch": 0.4940805051302289,
"grad_norm": 1.2233660221099854,
"learning_rate": 2.6095610838976453e-05,
"loss": 1.9865,
"step": 626
},
{
"epoch": 0.494869771112865,
"grad_norm": 1.3525431156158447,
"learning_rate": 2.608288711721704e-05,
"loss": 1.9831,
"step": 627
},
{
"epoch": 0.4956590370955012,
"grad_norm": 1.565442681312561,
"learning_rate": 2.6070145809756036e-05,
"loss": 2.0341,
"step": 628
},
{
"epoch": 0.4964483030781373,
"grad_norm": 1.7925246953964233,
"learning_rate": 2.6057386936810627e-05,
"loss": 2.0274,
"step": 629
},
{
"epoch": 0.4972375690607735,
"grad_norm": 2.0463998317718506,
"learning_rate": 2.6044610518625875e-05,
"loss": 2.1091,
"step": 630
},
{
"epoch": 0.4980268350434096,
"grad_norm": 1.641654133796692,
"learning_rate": 2.603181657547468e-05,
"loss": 2.0029,
"step": 631
},
{
"epoch": 0.4988161010260458,
"grad_norm": 1.214727759361267,
"learning_rate": 2.601900512765775e-05,
"loss": 1.9857,
"step": 632
},
{
"epoch": 0.4996053670086819,
"grad_norm": 1.7539560794830322,
"learning_rate": 2.6006176195503576e-05,
"loss": 1.9751,
"step": 633
},
{
"epoch": 0.500394632991318,
"grad_norm": 1.4065206050872803,
"learning_rate": 2.5993329799368377e-05,
"loss": 1.9431,
"step": 634
},
{
"epoch": 0.5011838989739542,
"grad_norm": 1.6131680011749268,
"learning_rate": 2.598046595963609e-05,
"loss": 1.9809,
"step": 635
},
{
"epoch": 0.5019731649565904,
"grad_norm": 1.5111154317855835,
"learning_rate": 2.5967584696718346e-05,
"loss": 1.9888,
"step": 636
},
{
"epoch": 0.5027624309392266,
"grad_norm": 1.5211158990859985,
"learning_rate": 2.5954686031054402e-05,
"loss": 1.9922,
"step": 637
},
{
"epoch": 0.5035516969218626,
"grad_norm": 3.4300498962402344,
"learning_rate": 2.5941769983111126e-05,
"loss": 1.9395,
"step": 638
},
{
"epoch": 0.5043409629044988,
"grad_norm": 1.8587327003479004,
"learning_rate": 2.5928836573382982e-05,
"loss": 2.012,
"step": 639
},
{
"epoch": 0.505130228887135,
"grad_norm": 3.757934808731079,
"learning_rate": 2.591588582239198e-05,
"loss": 2.038,
"step": 640
},
{
"epoch": 0.5059194948697711,
"grad_norm": 1.6540387868881226,
"learning_rate": 2.5902917750687637e-05,
"loss": 1.948,
"step": 641
},
{
"epoch": 0.5067087608524072,
"grad_norm": 1.5322803258895874,
"learning_rate": 2.5889932378846963e-05,
"loss": 1.9642,
"step": 642
},
{
"epoch": 0.5074980268350434,
"grad_norm": 1.324696660041809,
"learning_rate": 2.5876929727474415e-05,
"loss": 1.961,
"step": 643
},
{
"epoch": 0.5082872928176796,
"grad_norm": 1.3825550079345703,
"learning_rate": 2.586390981720187e-05,
"loss": 2.0007,
"step": 644
},
{
"epoch": 0.5090765588003157,
"grad_norm": 1.4527854919433594,
"learning_rate": 2.5850872668688585e-05,
"loss": 2.0226,
"step": 645
},
{
"epoch": 0.5098658247829518,
"grad_norm": 1.7354285717010498,
"learning_rate": 2.5837818302621177e-05,
"loss": 2.0154,
"step": 646
},
{
"epoch": 0.510655090765588,
"grad_norm": 1.1257487535476685,
"learning_rate": 2.5824746739713574e-05,
"loss": 1.951,
"step": 647
},
{
"epoch": 0.5114443567482242,
"grad_norm": 1.931472897529602,
"learning_rate": 2.5811658000707002e-05,
"loss": 1.9932,
"step": 648
},
{
"epoch": 0.5122336227308603,
"grad_norm": 1.536661148071289,
"learning_rate": 2.579855210636994e-05,
"loss": 2.0213,
"step": 649
},
{
"epoch": 0.5130228887134964,
"grad_norm": 1.3281984329223633,
"learning_rate": 2.578542907749807e-05,
"loss": 2.0358,
"step": 650
},
{
"epoch": 0.5138121546961326,
"grad_norm": 1.228049874305725,
"learning_rate": 2.5772288934914287e-05,
"loss": 1.9683,
"step": 651
},
{
"epoch": 0.5146014206787688,
"grad_norm": 1.1916272640228271,
"learning_rate": 2.5759131699468624e-05,
"loss": 1.9604,
"step": 652
},
{
"epoch": 0.5153906866614049,
"grad_norm": 1.534188985824585,
"learning_rate": 2.5745957392038252e-05,
"loss": 1.9949,
"step": 653
},
{
"epoch": 0.516179952644041,
"grad_norm": 1.3599672317504883,
"learning_rate": 2.5732766033527403e-05,
"loss": 2.0305,
"step": 654
},
{
"epoch": 0.5169692186266772,
"grad_norm": 1.546063780784607,
"learning_rate": 2.5719557644867395e-05,
"loss": 1.9581,
"step": 655
},
{
"epoch": 0.5177584846093133,
"grad_norm": 1.4883053302764893,
"learning_rate": 2.570633224701655e-05,
"loss": 1.9556,
"step": 656
},
{
"epoch": 0.5185477505919495,
"grad_norm": 1.2917836904525757,
"learning_rate": 2.569308986096019e-05,
"loss": 1.973,
"step": 657
},
{
"epoch": 0.5193370165745856,
"grad_norm": 1.249537467956543,
"learning_rate": 2.5679830507710586e-05,
"loss": 2.0075,
"step": 658
},
{
"epoch": 0.5201262825572218,
"grad_norm": 1.1445908546447754,
"learning_rate": 2.5666554208306933e-05,
"loss": 1.9672,
"step": 659
},
{
"epoch": 0.5209155485398579,
"grad_norm": 1.1478815078735352,
"learning_rate": 2.565326098381532e-05,
"loss": 1.9705,
"step": 660
},
{
"epoch": 0.5217048145224941,
"grad_norm": 1.1105977296829224,
"learning_rate": 2.5639950855328678e-05,
"loss": 1.9644,
"step": 661
},
{
"epoch": 0.5224940805051302,
"grad_norm": 1.1826235055923462,
"learning_rate": 2.562662384396678e-05,
"loss": 1.925,
"step": 662
},
{
"epoch": 0.5232833464877664,
"grad_norm": 1.4168332815170288,
"learning_rate": 2.561327997087617e-05,
"loss": 1.9569,
"step": 663
},
{
"epoch": 0.5240726124704025,
"grad_norm": 1.261547327041626,
"learning_rate": 2.5599919257230158e-05,
"loss": 2.0187,
"step": 664
},
{
"epoch": 0.5248618784530387,
"grad_norm": 1.1711071729660034,
"learning_rate": 2.5586541724228774e-05,
"loss": 1.9609,
"step": 665
},
{
"epoch": 0.5256511444356748,
"grad_norm": 1.1934360265731812,
"learning_rate": 2.5573147393098734e-05,
"loss": 1.955,
"step": 666
},
{
"epoch": 0.526440410418311,
"grad_norm": 1.6360708475112915,
"learning_rate": 2.5559736285093408e-05,
"loss": 1.975,
"step": 667
},
{
"epoch": 0.5272296764009471,
"grad_norm": 1.4871546030044556,
"learning_rate": 2.5546308421492785e-05,
"loss": 2.01,
"step": 668
},
{
"epoch": 0.5280189423835833,
"grad_norm": 1.4333875179290771,
"learning_rate": 2.5532863823603446e-05,
"loss": 1.9847,
"step": 669
},
{
"epoch": 0.5288082083662194,
"grad_norm": 1.2392849922180176,
"learning_rate": 2.5519402512758524e-05,
"loss": 1.9323,
"step": 670
},
{
"epoch": 0.5295974743488555,
"grad_norm": 1.5046405792236328,
"learning_rate": 2.550592451031767e-05,
"loss": 1.9297,
"step": 671
},
{
"epoch": 0.5303867403314917,
"grad_norm": 1.231242299079895,
"learning_rate": 2.549242983766702e-05,
"loss": 1.9706,
"step": 672
},
{
"epoch": 0.5311760063141279,
"grad_norm": 1.2985905408859253,
"learning_rate": 2.547891851621915e-05,
"loss": 1.9668,
"step": 673
},
{
"epoch": 0.531965272296764,
"grad_norm": 1.5686601400375366,
"learning_rate": 2.5465390567413078e-05,
"loss": 1.9397,
"step": 674
},
{
"epoch": 0.5327545382794001,
"grad_norm": 2.6767473220825195,
"learning_rate": 2.5451846012714186e-05,
"loss": 1.9895,
"step": 675
},
{
"epoch": 0.5335438042620363,
"grad_norm": 1.4398545026779175,
"learning_rate": 2.543828487361421e-05,
"loss": 2.0005,
"step": 676
},
{
"epoch": 0.5343330702446725,
"grad_norm": 1.6226674318313599,
"learning_rate": 2.5424707171631206e-05,
"loss": 1.9832,
"step": 677
},
{
"epoch": 0.5351223362273086,
"grad_norm": 1.2014738321304321,
"learning_rate": 2.541111292830951e-05,
"loss": 2.0015,
"step": 678
},
{
"epoch": 0.5359116022099447,
"grad_norm": 1.2680522203445435,
"learning_rate": 2.5397502165219696e-05,
"loss": 1.9777,
"step": 679
},
{
"epoch": 0.5367008681925809,
"grad_norm": 1.573599100112915,
"learning_rate": 2.5383874903958557e-05,
"loss": 1.9867,
"step": 680
},
{
"epoch": 0.5374901341752171,
"grad_norm": 1.1822348833084106,
"learning_rate": 2.537023116614907e-05,
"loss": 1.9619,
"step": 681
},
{
"epoch": 0.5382794001578532,
"grad_norm": 1.0724636316299438,
"learning_rate": 2.5356570973440348e-05,
"loss": 1.9544,
"step": 682
},
{
"epoch": 0.5390686661404893,
"grad_norm": 1.2558928728103638,
"learning_rate": 2.5342894347507614e-05,
"loss": 1.9375,
"step": 683
},
{
"epoch": 0.5398579321231255,
"grad_norm": 64.77701568603516,
"learning_rate": 2.5329201310052162e-05,
"loss": 2.4849,
"step": 684
},
{
"epoch": 0.5406471981057617,
"grad_norm": 1.963006615638733,
"learning_rate": 2.531549188280135e-05,
"loss": 1.9229,
"step": 685
},
{
"epoch": 0.5414364640883977,
"grad_norm": 1.3791204690933228,
"learning_rate": 2.5301766087508515e-05,
"loss": 1.97,
"step": 686
},
{
"epoch": 0.5422257300710339,
"grad_norm": 1.607014775276184,
"learning_rate": 2.5288023945952974e-05,
"loss": 1.9664,
"step": 687
},
{
"epoch": 0.5430149960536701,
"grad_norm": 1.2975430488586426,
"learning_rate": 2.527426547993999e-05,
"loss": 1.98,
"step": 688
},
{
"epoch": 0.5438042620363063,
"grad_norm": 1.4744203090667725,
"learning_rate": 2.5260490711300724e-05,
"loss": 2.021,
"step": 689
},
{
"epoch": 0.5445935280189423,
"grad_norm": 1.405918836593628,
"learning_rate": 2.5246699661892193e-05,
"loss": 1.9456,
"step": 690
},
{
"epoch": 0.5453827940015785,
"grad_norm": 1.3272600173950195,
"learning_rate": 2.5232892353597273e-05,
"loss": 1.9648,
"step": 691
},
{
"epoch": 0.5461720599842147,
"grad_norm": 1.346197247505188,
"learning_rate": 2.5219068808324612e-05,
"loss": 1.9528,
"step": 692
},
{
"epoch": 0.5469613259668509,
"grad_norm": 1.2085739374160767,
"learning_rate": 2.5205229048008635e-05,
"loss": 1.9736,
"step": 693
},
{
"epoch": 0.5477505919494869,
"grad_norm": 1.3460921049118042,
"learning_rate": 2.5191373094609505e-05,
"loss": 1.9744,
"step": 694
},
{
"epoch": 0.5485398579321231,
"grad_norm": 1.324518084526062,
"learning_rate": 2.517750097011306e-05,
"loss": 1.9659,
"step": 695
},
{
"epoch": 0.5493291239147593,
"grad_norm": 1.448301911354065,
"learning_rate": 2.5163612696530805e-05,
"loss": 1.9793,
"step": 696
},
{
"epoch": 0.5501183898973955,
"grad_norm": 1.9733283519744873,
"learning_rate": 2.5149708295899873e-05,
"loss": 2.0423,
"step": 697
},
{
"epoch": 0.5509076558800315,
"grad_norm": 1.3468515872955322,
"learning_rate": 2.5135787790282992e-05,
"loss": 1.9799,
"step": 698
},
{
"epoch": 0.5516969218626677,
"grad_norm": 2.0482213497161865,
"learning_rate": 2.5121851201768425e-05,
"loss": 2.0022,
"step": 699
},
{
"epoch": 0.5524861878453039,
"grad_norm": 1.2711023092269897,
"learning_rate": 2.5107898552469974e-05,
"loss": 1.9457,
"step": 700
},
{
"epoch": 0.55327545382794,
"grad_norm": 1.7975988388061523,
"learning_rate": 2.5093929864526915e-05,
"loss": 2.0417,
"step": 701
},
{
"epoch": 0.5540647198105761,
"grad_norm": 1.7747515439987183,
"learning_rate": 2.507994516010398e-05,
"loss": 1.9492,
"step": 702
},
{
"epoch": 0.5548539857932123,
"grad_norm": 1.7052737474441528,
"learning_rate": 2.5065944461391305e-05,
"loss": 1.9589,
"step": 703
},
{
"epoch": 0.5556432517758485,
"grad_norm": 2.9147486686706543,
"learning_rate": 2.5051927790604412e-05,
"loss": 2.0123,
"step": 704
},
{
"epoch": 0.5564325177584846,
"grad_norm": 1.5917088985443115,
"learning_rate": 2.5037895169984174e-05,
"loss": 1.9582,
"step": 705
},
{
"epoch": 0.5572217837411207,
"grad_norm": 1.5023882389068604,
"learning_rate": 2.502384662179675e-05,
"loss": 1.9572,
"step": 706
},
{
"epoch": 0.5580110497237569,
"grad_norm": 1.551519513130188,
"learning_rate": 2.500978216833359e-05,
"loss": 1.9601,
"step": 707
},
{
"epoch": 0.5588003157063931,
"grad_norm": 1.5883748531341553,
"learning_rate": 2.4995701831911388e-05,
"loss": 2.0028,
"step": 708
},
{
"epoch": 0.5595895816890292,
"grad_norm": 2.114210605621338,
"learning_rate": 2.4981605634872013e-05,
"loss": 1.9387,
"step": 709
},
{
"epoch": 0.5603788476716653,
"grad_norm": 1.8715765476226807,
"learning_rate": 2.496749359958253e-05,
"loss": 2.0478,
"step": 710
},
{
"epoch": 0.5611681136543015,
"grad_norm": 1.5651837587356567,
"learning_rate": 2.495336574843512e-05,
"loss": 1.9964,
"step": 711
},
{
"epoch": 0.5619573796369376,
"grad_norm": 1.7899484634399414,
"learning_rate": 2.4939222103847056e-05,
"loss": 1.9748,
"step": 712
},
{
"epoch": 0.5627466456195738,
"grad_norm": 1.461633324623108,
"learning_rate": 2.4925062688260683e-05,
"loss": 1.9682,
"step": 713
},
{
"epoch": 0.56353591160221,
"grad_norm": 2.1834540367126465,
"learning_rate": 2.4910887524143364e-05,
"loss": 1.9991,
"step": 714
},
{
"epoch": 0.5643251775848461,
"grad_norm": 1.5680309534072876,
"learning_rate": 2.4896696633987448e-05,
"loss": 1.9627,
"step": 715
},
{
"epoch": 0.5651144435674822,
"grad_norm": 1.121997356414795,
"learning_rate": 2.4882490040310244e-05,
"loss": 1.9605,
"step": 716
},
{
"epoch": 0.5659037095501184,
"grad_norm": 1.4992661476135254,
"learning_rate": 2.4868267765653976e-05,
"loss": 1.9297,
"step": 717
},
{
"epoch": 0.5666929755327546,
"grad_norm": 1.1993900537490845,
"learning_rate": 2.485402983258575e-05,
"loss": 1.9219,
"step": 718
},
{
"epoch": 0.5674822415153907,
"grad_norm": 2.7265751361846924,
"learning_rate": 2.4839776263697514e-05,
"loss": 1.9954,
"step": 719
},
{
"epoch": 0.5682715074980268,
"grad_norm": 2.0000016689300537,
"learning_rate": 2.482550708160603e-05,
"loss": 1.9929,
"step": 720
},
{
"epoch": 0.569060773480663,
"grad_norm": 1.3767764568328857,
"learning_rate": 2.4811222308952836e-05,
"loss": 2.0236,
"step": 721
},
{
"epoch": 0.5698500394632992,
"grad_norm": 2.6863558292388916,
"learning_rate": 2.4796921968404204e-05,
"loss": 2.0154,
"step": 722
},
{
"epoch": 0.5706393054459353,
"grad_norm": 1.9830318689346313,
"learning_rate": 2.4782606082651102e-05,
"loss": 1.9738,
"step": 723
},
{
"epoch": 0.5714285714285714,
"grad_norm": 1.2655317783355713,
"learning_rate": 2.4768274674409178e-05,
"loss": 1.9938,
"step": 724
},
{
"epoch": 0.5722178374112076,
"grad_norm": 1.693976879119873,
"learning_rate": 2.4753927766418708e-05,
"loss": 2.0487,
"step": 725
},
{
"epoch": 0.5730071033938438,
"grad_norm": 1.6992710828781128,
"learning_rate": 2.4739565381444554e-05,
"loss": 2.0051,
"step": 726
},
{
"epoch": 0.5737963693764798,
"grad_norm": 1.0462723970413208,
"learning_rate": 2.4725187542276144e-05,
"loss": 1.9466,
"step": 727
},
{
"epoch": 0.574585635359116,
"grad_norm": 1.8844130039215088,
"learning_rate": 2.4710794271727415e-05,
"loss": 1.9626,
"step": 728
},
{
"epoch": 0.5753749013417522,
"grad_norm": 1.1489008665084839,
"learning_rate": 2.469638559263681e-05,
"loss": 1.9644,
"step": 729
},
{
"epoch": 0.5761641673243884,
"grad_norm": 1.4816726446151733,
"learning_rate": 2.46819615278672e-05,
"loss": 1.9771,
"step": 730
},
{
"epoch": 0.5769534333070244,
"grad_norm": 1.660891056060791,
"learning_rate": 2.4667522100305886e-05,
"loss": 1.9484,
"step": 731
},
{
"epoch": 0.5777426992896606,
"grad_norm": 1.0999447107315063,
"learning_rate": 2.4653067332864537e-05,
"loss": 1.9751,
"step": 732
},
{
"epoch": 0.5785319652722968,
"grad_norm": 1.3727487325668335,
"learning_rate": 2.4638597248479165e-05,
"loss": 1.8958,
"step": 733
},
{
"epoch": 0.579321231254933,
"grad_norm": 1.0751103162765503,
"learning_rate": 2.462411187011009e-05,
"loss": 1.933,
"step": 734
},
{
"epoch": 0.580110497237569,
"grad_norm": 1.3762593269348145,
"learning_rate": 2.4609611220741884e-05,
"loss": 1.9646,
"step": 735
},
{
"epoch": 0.5808997632202052,
"grad_norm": 1.5363467931747437,
"learning_rate": 2.459509532338337e-05,
"loss": 1.9666,
"step": 736
},
{
"epoch": 0.5816890292028414,
"grad_norm": 1.1099787950515747,
"learning_rate": 2.4580564201067557e-05,
"loss": 1.9915,
"step": 737
},
{
"epoch": 0.5824782951854776,
"grad_norm": 1.2934099435806274,
"learning_rate": 2.4566017876851605e-05,
"loss": 1.9321,
"step": 738
},
{
"epoch": 0.5832675611681136,
"grad_norm": 2.3133106231689453,
"learning_rate": 2.4551456373816815e-05,
"loss": 1.9448,
"step": 739
},
{
"epoch": 0.5840568271507498,
"grad_norm": 1.6514102220535278,
"learning_rate": 2.4536879715068546e-05,
"loss": 2.0637,
"step": 740
},
{
"epoch": 0.584846093133386,
"grad_norm": 1.7410539388656616,
"learning_rate": 2.452228792373623e-05,
"loss": 1.9651,
"step": 741
},
{
"epoch": 0.585635359116022,
"grad_norm": 1.1819558143615723,
"learning_rate": 2.45076810229733e-05,
"loss": 1.9527,
"step": 742
},
{
"epoch": 0.5864246250986582,
"grad_norm": 1.6729192733764648,
"learning_rate": 2.4493059035957164e-05,
"loss": 2.0146,
"step": 743
},
{
"epoch": 0.5872138910812944,
"grad_norm": 1.0809391736984253,
"learning_rate": 2.4478421985889162e-05,
"loss": 1.9934,
"step": 744
},
{
"epoch": 0.5880031570639306,
"grad_norm": 1.5663741827011108,
"learning_rate": 2.4463769895994545e-05,
"loss": 1.8878,
"step": 745
},
{
"epoch": 0.5887924230465666,
"grad_norm": 1.1649593114852905,
"learning_rate": 2.4449102789522427e-05,
"loss": 1.9876,
"step": 746
},
{
"epoch": 0.5895816890292028,
"grad_norm": 1.8039592504501343,
"learning_rate": 2.443442068974574e-05,
"loss": 1.9676,
"step": 747
},
{
"epoch": 0.590370955011839,
"grad_norm": 1.341096043586731,
"learning_rate": 2.441972361996122e-05,
"loss": 1.9812,
"step": 748
},
{
"epoch": 0.5911602209944752,
"grad_norm": 1.758004903793335,
"learning_rate": 2.4405011603489353e-05,
"loss": 1.9761,
"step": 749
},
{
"epoch": 0.5919494869771112,
"grad_norm": 1.4644911289215088,
"learning_rate": 2.439028466367433e-05,
"loss": 1.9381,
"step": 750
},
{
"epoch": 0.5927387529597474,
"grad_norm": 1.7480524778366089,
"learning_rate": 2.4375542823884044e-05,
"loss": 1.9949,
"step": 751
},
{
"epoch": 0.5935280189423836,
"grad_norm": 2.019632577896118,
"learning_rate": 2.4360786107510003e-05,
"loss": 1.9843,
"step": 752
},
{
"epoch": 0.5943172849250198,
"grad_norm": 1.6516532897949219,
"learning_rate": 2.434601453796734e-05,
"loss": 1.9856,
"step": 753
},
{
"epoch": 0.5951065509076559,
"grad_norm": 1.9035701751708984,
"learning_rate": 2.433122813869475e-05,
"loss": 1.9736,
"step": 754
},
{
"epoch": 0.595895816890292,
"grad_norm": 1.3820042610168457,
"learning_rate": 2.4316426933154457e-05,
"loss": 1.9113,
"step": 755
},
{
"epoch": 0.5966850828729282,
"grad_norm": 32.268516540527344,
"learning_rate": 2.430161094483218e-05,
"loss": 2.0811,
"step": 756
},
{
"epoch": 0.5974743488555643,
"grad_norm": 1.958722472190857,
"learning_rate": 2.4286780197237098e-05,
"loss": 1.9285,
"step": 757
},
{
"epoch": 0.5982636148382005,
"grad_norm": 1.1933417320251465,
"learning_rate": 2.427193471390181e-05,
"loss": 1.9512,
"step": 758
},
{
"epoch": 0.5990528808208366,
"grad_norm": 3.102102756500244,
"learning_rate": 2.425707451838229e-05,
"loss": 1.9804,
"step": 759
},
{
"epoch": 0.5998421468034728,
"grad_norm": 1.4609273672103882,
"learning_rate": 2.4242199634257865e-05,
"loss": 1.9406,
"step": 760
},
{
"epoch": 0.6006314127861089,
"grad_norm": 1.467287302017212,
"learning_rate": 2.4227310085131156e-05,
"loss": 1.9203,
"step": 761
},
{
"epoch": 0.601420678768745,
"grad_norm": 1.8098467588424683,
"learning_rate": 2.4212405894628063e-05,
"loss": 1.9826,
"step": 762
},
{
"epoch": 0.6022099447513812,
"grad_norm": 1.479901671409607,
"learning_rate": 2.4197487086397724e-05,
"loss": 2.0035,
"step": 763
},
{
"epoch": 0.6029992107340174,
"grad_norm": 1.6941622495651245,
"learning_rate": 2.4182553684112454e-05,
"loss": 1.9921,
"step": 764
},
{
"epoch": 0.6037884767166535,
"grad_norm": 1.103091835975647,
"learning_rate": 2.416760571146774e-05,
"loss": 1.9229,
"step": 765
},
{
"epoch": 0.6045777426992897,
"grad_norm": 1.2702820301055908,
"learning_rate": 2.4152643192182188e-05,
"loss": 1.9755,
"step": 766
},
{
"epoch": 0.6053670086819258,
"grad_norm": 1.2228517532348633,
"learning_rate": 2.4137666149997478e-05,
"loss": 1.9695,
"step": 767
},
{
"epoch": 0.606156274664562,
"grad_norm": 1.2769513130187988,
"learning_rate": 2.4122674608678334e-05,
"loss": 1.9965,
"step": 768
},
{
"epoch": 0.6069455406471981,
"grad_norm": 1.25937020778656,
"learning_rate": 2.4107668592012498e-05,
"loss": 1.9219,
"step": 769
},
{
"epoch": 0.6077348066298343,
"grad_norm": 1.0595743656158447,
"learning_rate": 2.409264812381067e-05,
"loss": 1.988,
"step": 770
},
{
"epoch": 0.6085240726124704,
"grad_norm": 1.3666781187057495,
"learning_rate": 2.407761322790648e-05,
"loss": 1.9581,
"step": 771
},
{
"epoch": 0.6093133385951065,
"grad_norm": 1.167434573173523,
"learning_rate": 2.4062563928156455e-05,
"loss": 1.9685,
"step": 772
},
{
"epoch": 0.6101026045777427,
"grad_norm": 1.627456784248352,
"learning_rate": 2.404750024843998e-05,
"loss": 1.9174,
"step": 773
},
{
"epoch": 0.6108918705603789,
"grad_norm": 1.2988723516464233,
"learning_rate": 2.4032422212659257e-05,
"loss": 1.971,
"step": 774
},
{
"epoch": 0.611681136543015,
"grad_norm": 1.6529747247695923,
"learning_rate": 2.4017329844739257e-05,
"loss": 1.9778,
"step": 775
},
{
"epoch": 0.6124704025256511,
"grad_norm": 1.8538501262664795,
"learning_rate": 2.4002223168627707e-05,
"loss": 1.9594,
"step": 776
},
{
"epoch": 0.6132596685082873,
"grad_norm": 1.302612066268921,
"learning_rate": 2.398710220829503e-05,
"loss": 1.9616,
"step": 777
},
{
"epoch": 0.6140489344909235,
"grad_norm": 1.9736380577087402,
"learning_rate": 2.3971966987734306e-05,
"loss": 1.9303,
"step": 778
},
{
"epoch": 0.6148382004735596,
"grad_norm": 1.284517765045166,
"learning_rate": 2.3956817530961262e-05,
"loss": 1.9755,
"step": 779
},
{
"epoch": 0.6156274664561957,
"grad_norm": 1.2649601697921753,
"learning_rate": 2.3941653862014195e-05,
"loss": 1.9127,
"step": 780
},
{
"epoch": 0.6164167324388319,
"grad_norm": 1.1102463006973267,
"learning_rate": 2.392647600495397e-05,
"loss": 1.9198,
"step": 781
},
{
"epoch": 0.6172059984214681,
"grad_norm": 1.0401090383529663,
"learning_rate": 2.3911283983863953e-05,
"loss": 1.9438,
"step": 782
},
{
"epoch": 0.6179952644041041,
"grad_norm": 1.1567270755767822,
"learning_rate": 2.3896077822849984e-05,
"loss": 1.9812,
"step": 783
},
{
"epoch": 0.6187845303867403,
"grad_norm": 1.1251745223999023,
"learning_rate": 2.3880857546040355e-05,
"loss": 1.9033,
"step": 784
},
{
"epoch": 0.6195737963693765,
"grad_norm": 1.1831730604171753,
"learning_rate": 2.386562317758573e-05,
"loss": 1.9404,
"step": 785
},
{
"epoch": 0.6203630623520127,
"grad_norm": 4.819545269012451,
"learning_rate": 2.3850374741659155e-05,
"loss": 1.9528,
"step": 786
},
{
"epoch": 0.6211523283346487,
"grad_norm": 1.254148244857788,
"learning_rate": 2.3835112262455992e-05,
"loss": 1.9311,
"step": 787
},
{
"epoch": 0.6219415943172849,
"grad_norm": 3.181715250015259,
"learning_rate": 2.3819835764193875e-05,
"loss": 2.0101,
"step": 788
},
{
"epoch": 0.6227308602999211,
"grad_norm": 1.117749810218811,
"learning_rate": 2.38045452711127e-05,
"loss": 1.9697,
"step": 789
},
{
"epoch": 0.6235201262825573,
"grad_norm": 1.4008678197860718,
"learning_rate": 2.3789240807474566e-05,
"loss": 1.9163,
"step": 790
},
{
"epoch": 0.6243093922651933,
"grad_norm": 1.0686087608337402,
"learning_rate": 2.3773922397563723e-05,
"loss": 1.9319,
"step": 791
},
{
"epoch": 0.6250986582478295,
"grad_norm": 1.0253478288650513,
"learning_rate": 2.3758590065686567e-05,
"loss": 1.9838,
"step": 792
},
{
"epoch": 0.6258879242304657,
"grad_norm": 1.427983045578003,
"learning_rate": 2.3743243836171577e-05,
"loss": 1.9409,
"step": 793
},
{
"epoch": 0.6266771902131019,
"grad_norm": 1.0589593648910522,
"learning_rate": 2.3727883733369295e-05,
"loss": 1.9825,
"step": 794
},
{
"epoch": 0.6274664561957379,
"grad_norm": 2.2683448791503906,
"learning_rate": 2.3712509781652258e-05,
"loss": 2.007,
"step": 795
},
{
"epoch": 0.6282557221783741,
"grad_norm": 1.4350131750106812,
"learning_rate": 2.3697122005414987e-05,
"loss": 1.969,
"step": 796
},
{
"epoch": 0.6290449881610103,
"grad_norm": 1.2323224544525146,
"learning_rate": 2.3681720429073947e-05,
"loss": 1.9373,
"step": 797
},
{
"epoch": 0.6298342541436464,
"grad_norm": 0.9967265129089355,
"learning_rate": 2.3666305077067487e-05,
"loss": 1.9147,
"step": 798
},
{
"epoch": 0.6306235201262825,
"grad_norm": 1.2962182760238647,
"learning_rate": 2.3650875973855825e-05,
"loss": 1.9588,
"step": 799
},
{
"epoch": 0.6314127861089187,
"grad_norm": 1.6102293729782104,
"learning_rate": 2.3635433143920985e-05,
"loss": 2.0478,
"step": 800
},
{
"epoch": 0.6322020520915549,
"grad_norm": 1.213399052619934,
"learning_rate": 2.3619976611766793e-05,
"loss": 1.9225,
"step": 801
},
{
"epoch": 0.632991318074191,
"grad_norm": 1.4079943895339966,
"learning_rate": 2.360450640191879e-05,
"loss": 1.9589,
"step": 802
},
{
"epoch": 0.6337805840568271,
"grad_norm": 1.3271418809890747,
"learning_rate": 2.3589022538924246e-05,
"loss": 2.0268,
"step": 803
},
{
"epoch": 0.6345698500394633,
"grad_norm": 1.1346262693405151,
"learning_rate": 2.3573525047352078e-05,
"loss": 1.9397,
"step": 804
},
{
"epoch": 0.6353591160220995,
"grad_norm": 1.3805230855941772,
"learning_rate": 2.3558013951792836e-05,
"loss": 1.9674,
"step": 805
},
{
"epoch": 0.6361483820047356,
"grad_norm": 1.2992908954620361,
"learning_rate": 2.3542489276858655e-05,
"loss": 1.9824,
"step": 806
},
{
"epoch": 0.6369376479873717,
"grad_norm": 1.2167001962661743,
"learning_rate": 2.3526951047183208e-05,
"loss": 1.9646,
"step": 807
},
{
"epoch": 0.6377269139700079,
"grad_norm": 1.2849534749984741,
"learning_rate": 2.3511399287421688e-05,
"loss": 1.9261,
"step": 808
},
{
"epoch": 0.6385161799526441,
"grad_norm": 1.2633917331695557,
"learning_rate": 2.3495834022250752e-05,
"loss": 1.9408,
"step": 809
},
{
"epoch": 0.6393054459352802,
"grad_norm": 1.0912851095199585,
"learning_rate": 2.3480255276368493e-05,
"loss": 1.9653,
"step": 810
},
{
"epoch": 0.6400947119179163,
"grad_norm": 1.1520500183105469,
"learning_rate": 2.346466307449438e-05,
"loss": 1.9434,
"step": 811
},
{
"epoch": 0.6408839779005525,
"grad_norm": 1.21257483959198,
"learning_rate": 2.3449057441369243e-05,
"loss": 2.0161,
"step": 812
},
{
"epoch": 0.6416732438831886,
"grad_norm": 1.5208064317703247,
"learning_rate": 2.3433438401755224e-05,
"loss": 1.915,
"step": 813
},
{
"epoch": 0.6424625098658248,
"grad_norm": 1.200332760810852,
"learning_rate": 2.341780598043574e-05,
"loss": 1.9753,
"step": 814
},
{
"epoch": 0.6432517758484609,
"grad_norm": 1.5574405193328857,
"learning_rate": 2.3402160202215426e-05,
"loss": 1.9294,
"step": 815
},
{
"epoch": 0.6440410418310971,
"grad_norm": 1.2709983587265015,
"learning_rate": 2.3386501091920134e-05,
"loss": 1.9696,
"step": 816
},
{
"epoch": 0.6448303078137332,
"grad_norm": 1.9312561750411987,
"learning_rate": 2.3370828674396855e-05,
"loss": 2.034,
"step": 817
},
{
"epoch": 0.6456195737963694,
"grad_norm": 1.6993128061294556,
"learning_rate": 2.3355142974513694e-05,
"loss": 1.9364,
"step": 818
},
{
"epoch": 0.6464088397790055,
"grad_norm": 1.7830700874328613,
"learning_rate": 2.3339444017159847e-05,
"loss": 1.9691,
"step": 819
},
{
"epoch": 0.6471981057616417,
"grad_norm": 1.1403906345367432,
"learning_rate": 2.3323731827245526e-05,
"loss": 1.952,
"step": 820
},
{
"epoch": 0.6479873717442778,
"grad_norm": 1.5226829051971436,
"learning_rate": 2.3308006429701956e-05,
"loss": 1.9802,
"step": 821
},
{
"epoch": 0.648776637726914,
"grad_norm": 1.1285881996154785,
"learning_rate": 2.3292267849481313e-05,
"loss": 1.9184,
"step": 822
},
{
"epoch": 0.6495659037095501,
"grad_norm": 1.2501252889633179,
"learning_rate": 2.327651611155669e-05,
"loss": 1.8888,
"step": 823
},
{
"epoch": 0.6503551696921863,
"grad_norm": 1.576758623123169,
"learning_rate": 2.3260751240922054e-05,
"loss": 1.9909,
"step": 824
},
{
"epoch": 0.6511444356748224,
"grad_norm": 1.2548635005950928,
"learning_rate": 2.324497326259222e-05,
"loss": 1.9063,
"step": 825
},
{
"epoch": 0.6519337016574586,
"grad_norm": 1.1014522314071655,
"learning_rate": 2.322918220160279e-05,
"loss": 1.9507,
"step": 826
},
{
"epoch": 0.6527229676400947,
"grad_norm": 1.3672009706497192,
"learning_rate": 2.321337808301014e-05,
"loss": 1.9708,
"step": 827
},
{
"epoch": 0.6535122336227308,
"grad_norm": 1.2893126010894775,
"learning_rate": 2.3197560931891347e-05,
"loss": 1.951,
"step": 828
},
{
"epoch": 0.654301499605367,
"grad_norm": 1.176769733428955,
"learning_rate": 2.3181730773344182e-05,
"loss": 1.9732,
"step": 829
},
{
"epoch": 0.6550907655880032,
"grad_norm": 1.6181094646453857,
"learning_rate": 2.3165887632487046e-05,
"loss": 1.8668,
"step": 830
},
{
"epoch": 0.6558800315706393,
"grad_norm": 1.608091950416565,
"learning_rate": 2.3150031534458947e-05,
"loss": 1.9172,
"step": 831
},
{
"epoch": 0.6566692975532754,
"grad_norm": 1.127637267112732,
"learning_rate": 2.313416250441945e-05,
"loss": 1.9291,
"step": 832
},
{
"epoch": 0.6574585635359116,
"grad_norm": 1.0388165712356567,
"learning_rate": 2.3118280567548633e-05,
"loss": 1.9392,
"step": 833
},
{
"epoch": 0.6582478295185478,
"grad_norm": 1.3837047815322876,
"learning_rate": 2.3102385749047058e-05,
"loss": 1.9586,
"step": 834
},
{
"epoch": 0.659037095501184,
"grad_norm": 1.0598629713058472,
"learning_rate": 2.3086478074135742e-05,
"loss": 1.9177,
"step": 835
},
{
"epoch": 0.65982636148382,
"grad_norm": 1.442895770072937,
"learning_rate": 2.307055756805607e-05,
"loss": 1.9223,
"step": 836
},
{
"epoch": 0.6606156274664562,
"grad_norm": 1.247636079788208,
"learning_rate": 2.3054624256069824e-05,
"loss": 1.9878,
"step": 837
},
{
"epoch": 0.6614048934490924,
"grad_norm": 1.8692328929901123,
"learning_rate": 2.303867816345907e-05,
"loss": 1.9418,
"step": 838
},
{
"epoch": 0.6621941594317285,
"grad_norm": 1.4947094917297363,
"learning_rate": 2.3022719315526184e-05,
"loss": 1.9157,
"step": 839
},
{
"epoch": 0.6629834254143646,
"grad_norm": 1.2936433553695679,
"learning_rate": 2.3006747737593756e-05,
"loss": 1.9789,
"step": 840
},
{
"epoch": 0.6637726913970008,
"grad_norm": 1.081805944442749,
"learning_rate": 2.2990763455004597e-05,
"loss": 1.8891,
"step": 841
},
{
"epoch": 0.664561957379637,
"grad_norm": 10.924898147583008,
"learning_rate": 2.2974766493121666e-05,
"loss": 2.0674,
"step": 842
},
{
"epoch": 0.665351223362273,
"grad_norm": 1.4154882431030273,
"learning_rate": 2.2958756877328037e-05,
"loss": 1.966,
"step": 843
},
{
"epoch": 0.6661404893449092,
"grad_norm": 1.1723241806030273,
"learning_rate": 2.2942734633026864e-05,
"loss": 1.966,
"step": 844
},
{
"epoch": 0.6669297553275454,
"grad_norm": 1.3635941743850708,
"learning_rate": 2.292669978564135e-05,
"loss": 1.9216,
"step": 845
},
{
"epoch": 0.6677190213101816,
"grad_norm": 2.8145909309387207,
"learning_rate": 2.2910652360614688e-05,
"loss": 1.9837,
"step": 846
},
{
"epoch": 0.6685082872928176,
"grad_norm": 1.623931646347046,
"learning_rate": 2.2894592383410027e-05,
"loss": 1.9557,
"step": 847
},
{
"epoch": 0.6692975532754538,
"grad_norm": 1.248794436454773,
"learning_rate": 2.2878519879510437e-05,
"loss": 1.9235,
"step": 848
},
{
"epoch": 0.67008681925809,
"grad_norm": 2.0067548751831055,
"learning_rate": 2.2862434874418857e-05,
"loss": 1.9316,
"step": 849
},
{
"epoch": 0.6708760852407262,
"grad_norm": 1.3988443613052368,
"learning_rate": 2.2846337393658074e-05,
"loss": 1.944,
"step": 850
},
{
"epoch": 0.6716653512233622,
"grad_norm": 1.6866941452026367,
"learning_rate": 2.2830227462770665e-05,
"loss": 2.0047,
"step": 851
},
{
"epoch": 0.6724546172059984,
"grad_norm": 1.9285333156585693,
"learning_rate": 2.2814105107318955e-05,
"loss": 2.0428,
"step": 852
},
{
"epoch": 0.6732438831886346,
"grad_norm": 1.3415141105651855,
"learning_rate": 2.2797970352884997e-05,
"loss": 1.9088,
"step": 853
},
{
"epoch": 0.6740331491712708,
"grad_norm": 1.4724236726760864,
"learning_rate": 2.2781823225070507e-05,
"loss": 1.9685,
"step": 854
},
{
"epoch": 0.6748224151539068,
"grad_norm": 1.3407270908355713,
"learning_rate": 2.2765663749496846e-05,
"loss": 1.9887,
"step": 855
},
{
"epoch": 0.675611681136543,
"grad_norm": 1.1044732332229614,
"learning_rate": 2.274949195180495e-05,
"loss": 1.8941,
"step": 856
},
{
"epoch": 0.6764009471191792,
"grad_norm": 1.3121356964111328,
"learning_rate": 2.2733307857655327e-05,
"loss": 1.9777,
"step": 857
},
{
"epoch": 0.6771902131018153,
"grad_norm": 1.1786476373672485,
"learning_rate": 2.271711149272798e-05,
"loss": 1.923,
"step": 858
},
{
"epoch": 0.6779794790844514,
"grad_norm": 1.3217613697052002,
"learning_rate": 2.2700902882722396e-05,
"loss": 1.8686,
"step": 859
},
{
"epoch": 0.6787687450670876,
"grad_norm": 1.3290356397628784,
"learning_rate": 2.2684682053357472e-05,
"loss": 1.9165,
"step": 860
},
{
"epoch": 0.6795580110497238,
"grad_norm": 1.4889332056045532,
"learning_rate": 2.2668449030371527e-05,
"loss": 1.9085,
"step": 861
},
{
"epoch": 0.6803472770323599,
"grad_norm": 1.156880259513855,
"learning_rate": 2.2652203839522196e-05,
"loss": 1.9228,
"step": 862
},
{
"epoch": 0.681136543014996,
"grad_norm": 1.52714204788208,
"learning_rate": 2.2635946506586435e-05,
"loss": 1.9908,
"step": 863
},
{
"epoch": 0.6819258089976322,
"grad_norm": 1.362430453300476,
"learning_rate": 2.261967705736046e-05,
"loss": 1.9262,
"step": 864
},
{
"epoch": 0.6827150749802684,
"grad_norm": 1.495282769203186,
"learning_rate": 2.2603395517659728e-05,
"loss": 1.9529,
"step": 865
},
{
"epoch": 0.6835043409629045,
"grad_norm": 1.3992247581481934,
"learning_rate": 2.2587101913318864e-05,
"loss": 1.9274,
"step": 866
},
{
"epoch": 0.6842936069455406,
"grad_norm": 1.3318471908569336,
"learning_rate": 2.257079627019164e-05,
"loss": 1.9572,
"step": 867
},
{
"epoch": 0.6850828729281768,
"grad_norm": 1.420928716659546,
"learning_rate": 2.255447861415094e-05,
"loss": 1.9143,
"step": 868
},
{
"epoch": 0.6858721389108129,
"grad_norm": 1.0273611545562744,
"learning_rate": 2.25381489710887e-05,
"loss": 1.9147,
"step": 869
},
{
"epoch": 0.6866614048934491,
"grad_norm": 1.3935787677764893,
"learning_rate": 2.2521807366915876e-05,
"loss": 1.9353,
"step": 870
},
{
"epoch": 0.6874506708760852,
"grad_norm": 1.21060049533844,
"learning_rate": 2.250545382756241e-05,
"loss": 2.0201,
"step": 871
},
{
"epoch": 0.6882399368587214,
"grad_norm": 10.238750457763672,
"learning_rate": 2.2489088378977176e-05,
"loss": 2.0379,
"step": 872
},
{
"epoch": 0.6890292028413575,
"grad_norm": 4.480075359344482,
"learning_rate": 2.247271104712794e-05,
"loss": 2.0273,
"step": 873
},
{
"epoch": 0.6898184688239937,
"grad_norm": 2.047750234603882,
"learning_rate": 2.245632185800134e-05,
"loss": 1.9601,
"step": 874
},
{
"epoch": 0.6906077348066298,
"grad_norm": 1.5088602304458618,
"learning_rate": 2.2439920837602817e-05,
"loss": 1.9402,
"step": 875
},
{
"epoch": 0.691397000789266,
"grad_norm": 1.2437899112701416,
"learning_rate": 2.2423508011956583e-05,
"loss": 1.9443,
"step": 876
},
{
"epoch": 0.6921862667719021,
"grad_norm": 7.469998836517334,
"learning_rate": 2.240708340710559e-05,
"loss": 1.971,
"step": 877
},
{
"epoch": 0.6929755327545383,
"grad_norm": 2.0380475521087646,
"learning_rate": 2.2390647049111472e-05,
"loss": 1.9815,
"step": 878
},
{
"epoch": 0.6937647987371744,
"grad_norm": 2.9355788230895996,
"learning_rate": 2.237419896405453e-05,
"loss": 1.8744,
"step": 879
},
{
"epoch": 0.6945540647198106,
"grad_norm": 1.66250479221344,
"learning_rate": 2.2357739178033645e-05,
"loss": 1.9304,
"step": 880
},
{
"epoch": 0.6953433307024467,
"grad_norm": 1.2899961471557617,
"learning_rate": 2.2341267717166285e-05,
"loss": 1.9682,
"step": 881
},
{
"epoch": 0.6961325966850829,
"grad_norm": 1.6746995449066162,
"learning_rate": 2.2324784607588432e-05,
"loss": 1.9546,
"step": 882
},
{
"epoch": 0.696921862667719,
"grad_norm": 2.286348342895508,
"learning_rate": 2.2308289875454573e-05,
"loss": 1.9209,
"step": 883
},
{
"epoch": 0.6977111286503551,
"grad_norm": 1.3030370473861694,
"learning_rate": 2.2291783546937596e-05,
"loss": 1.9305,
"step": 884
},
{
"epoch": 0.6985003946329913,
"grad_norm": 1.2656025886535645,
"learning_rate": 2.2275265648228833e-05,
"loss": 1.9644,
"step": 885
},
{
"epoch": 0.6992896606156275,
"grad_norm": 1.3782883882522583,
"learning_rate": 2.2258736205537955e-05,
"loss": 1.9385,
"step": 886
},
{
"epoch": 0.7000789265982637,
"grad_norm": 1.1452478170394897,
"learning_rate": 2.2242195245092942e-05,
"loss": 1.9486,
"step": 887
},
{
"epoch": 0.7008681925808997,
"grad_norm": 1.3117746114730835,
"learning_rate": 2.2225642793140067e-05,
"loss": 1.9841,
"step": 888
},
{
"epoch": 0.7016574585635359,
"grad_norm": 1.22527015209198,
"learning_rate": 2.2209078875943822e-05,
"loss": 1.9265,
"step": 889
},
{
"epoch": 0.7024467245461721,
"grad_norm": 1.0896904468536377,
"learning_rate": 2.219250351978691e-05,
"loss": 1.9565,
"step": 890
},
{
"epoch": 0.7032359905288083,
"grad_norm": 1.1967089176177979,
"learning_rate": 2.2175916750970164e-05,
"loss": 1.9371,
"step": 891
},
{
"epoch": 0.7040252565114443,
"grad_norm": 1.3110548257827759,
"learning_rate": 2.2159318595812532e-05,
"loss": 1.9225,
"step": 892
},
{
"epoch": 0.7048145224940805,
"grad_norm": 1.1737967729568481,
"learning_rate": 2.2142709080651047e-05,
"loss": 1.9572,
"step": 893
},
{
"epoch": 0.7056037884767167,
"grad_norm": 1.9293705224990845,
"learning_rate": 2.212608823184074e-05,
"loss": 1.9495,
"step": 894
},
{
"epoch": 0.7063930544593529,
"grad_norm": 1.6178711652755737,
"learning_rate": 2.2109456075754644e-05,
"loss": 1.9028,
"step": 895
},
{
"epoch": 0.7071823204419889,
"grad_norm": 1.4091240167617798,
"learning_rate": 2.2092812638783723e-05,
"loss": 1.8941,
"step": 896
},
{
"epoch": 0.7079715864246251,
"grad_norm": 1.9126784801483154,
"learning_rate": 2.207615794733686e-05,
"loss": 2.013,
"step": 897
},
{
"epoch": 0.7087608524072613,
"grad_norm": 1.1694210767745972,
"learning_rate": 2.2059492027840766e-05,
"loss": 1.9768,
"step": 898
},
{
"epoch": 0.7095501183898973,
"grad_norm": 3.8965656757354736,
"learning_rate": 2.204281490673999e-05,
"loss": 2.0802,
"step": 899
},
{
"epoch": 0.7103393843725335,
"grad_norm": 1.5217310190200806,
"learning_rate": 2.2026126610496852e-05,
"loss": 1.9425,
"step": 900
},
{
"epoch": 0.7111286503551697,
"grad_norm": 1.3671518564224243,
"learning_rate": 2.20094271655914e-05,
"loss": 1.9806,
"step": 901
},
{
"epoch": 0.7119179163378059,
"grad_norm": 1.2386888265609741,
"learning_rate": 2.1992716598521372e-05,
"loss": 1.9273,
"step": 902
},
{
"epoch": 0.712707182320442,
"grad_norm": 1.5315154790878296,
"learning_rate": 2.197599493580216e-05,
"loss": 1.8953,
"step": 903
},
{
"epoch": 0.7134964483030781,
"grad_norm": 1.662078857421875,
"learning_rate": 2.1959262203966748e-05,
"loss": 1.9719,
"step": 904
},
{
"epoch": 0.7142857142857143,
"grad_norm": 1.7234045267105103,
"learning_rate": 2.1942518429565703e-05,
"loss": 1.9798,
"step": 905
},
{
"epoch": 0.7150749802683505,
"grad_norm": 1.38892662525177,
"learning_rate": 2.19257636391671e-05,
"loss": 1.9861,
"step": 906
},
{
"epoch": 0.7158642462509865,
"grad_norm": 1.0747907161712646,
"learning_rate": 2.1908997859356496e-05,
"loss": 1.9242,
"step": 907
},
{
"epoch": 0.7166535122336227,
"grad_norm": 1.7632391452789307,
"learning_rate": 2.189222111673689e-05,
"loss": 1.9238,
"step": 908
},
{
"epoch": 0.7174427782162589,
"grad_norm": 1.0350416898727417,
"learning_rate": 2.1875433437928666e-05,
"loss": 1.8935,
"step": 909
},
{
"epoch": 0.7182320441988951,
"grad_norm": 1.5194156169891357,
"learning_rate": 2.1858634849569578e-05,
"loss": 1.9061,
"step": 910
},
{
"epoch": 0.7190213101815311,
"grad_norm": 1.0854952335357666,
"learning_rate": 2.184182537831468e-05,
"loss": 1.8827,
"step": 911
},
{
"epoch": 0.7198105761641673,
"grad_norm": 1.9698389768600464,
"learning_rate": 2.1825005050836284e-05,
"loss": 1.9542,
"step": 912
},
{
"epoch": 0.7205998421468035,
"grad_norm": 1.6458585262298584,
"learning_rate": 2.180817389382395e-05,
"loss": 1.9505,
"step": 913
},
{
"epoch": 0.7213891081294396,
"grad_norm": 1.779136061668396,
"learning_rate": 2.1791331933984407e-05,
"loss": 1.9921,
"step": 914
},
{
"epoch": 0.7221783741120757,
"grad_norm": 2.018765687942505,
"learning_rate": 2.1774479198041526e-05,
"loss": 1.9579,
"step": 915
},
{
"epoch": 0.7229676400947119,
"grad_norm": 1.0813268423080444,
"learning_rate": 2.1757615712736284e-05,
"loss": 1.9442,
"step": 916
},
{
"epoch": 0.7237569060773481,
"grad_norm": 2.2704761028289795,
"learning_rate": 2.174074150482672e-05,
"loss": 1.9334,
"step": 917
},
{
"epoch": 0.7245461720599842,
"grad_norm": 4.316535949707031,
"learning_rate": 2.1723856601087854e-05,
"loss": 1.9386,
"step": 918
},
{
"epoch": 0.7253354380426204,
"grad_norm": 2.0900955200195312,
"learning_rate": 2.170696102831172e-05,
"loss": 1.9106,
"step": 919
},
{
"epoch": 0.7261247040252565,
"grad_norm": 2.0656304359436035,
"learning_rate": 2.1690054813307255e-05,
"loss": 1.9515,
"step": 920
},
{
"epoch": 0.7269139700078927,
"grad_norm": 2.1848411560058594,
"learning_rate": 2.1673137982900297e-05,
"loss": 1.9959,
"step": 921
},
{
"epoch": 0.7277032359905288,
"grad_norm": 1.9023263454437256,
"learning_rate": 2.1656210563933508e-05,
"loss": 1.9947,
"step": 922
},
{
"epoch": 0.728492501973165,
"grad_norm": 1.5430853366851807,
"learning_rate": 2.163927258326637e-05,
"loss": 1.9767,
"step": 923
},
{
"epoch": 0.7292817679558011,
"grad_norm": 1.293797254562378,
"learning_rate": 2.1622324067775118e-05,
"loss": 1.903,
"step": 924
},
{
"epoch": 0.7300710339384373,
"grad_norm": 1.5131195783615112,
"learning_rate": 2.1605365044352704e-05,
"loss": 1.9292,
"step": 925
},
{
"epoch": 0.7308602999210734,
"grad_norm": 1.0771164894104004,
"learning_rate": 2.1588395539908753e-05,
"loss": 1.9476,
"step": 926
},
{
"epoch": 0.7316495659037096,
"grad_norm": 1.6526238918304443,
"learning_rate": 2.1571415581369516e-05,
"loss": 1.94,
"step": 927
},
{
"epoch": 0.7324388318863457,
"grad_norm": 1.312951683998108,
"learning_rate": 2.1554425195677838e-05,
"loss": 1.9687,
"step": 928
},
{
"epoch": 0.7332280978689818,
"grad_norm": 1.599605679512024,
"learning_rate": 2.1537424409793104e-05,
"loss": 1.9032,
"step": 929
},
{
"epoch": 0.734017363851618,
"grad_norm": 1.2115885019302368,
"learning_rate": 2.1520413250691202e-05,
"loss": 1.94,
"step": 930
},
{
"epoch": 0.7348066298342542,
"grad_norm": 1.6560498476028442,
"learning_rate": 2.1503391745364494e-05,
"loss": 1.9307,
"step": 931
},
{
"epoch": 0.7355958958168903,
"grad_norm": 2.2198646068573,
"learning_rate": 2.148635992082173e-05,
"loss": 1.9424,
"step": 932
},
{
"epoch": 0.7363851617995264,
"grad_norm": 1.0756328105926514,
"learning_rate": 2.1469317804088066e-05,
"loss": 1.9332,
"step": 933
},
{
"epoch": 0.7371744277821626,
"grad_norm": 1.542237401008606,
"learning_rate": 2.1452265422204967e-05,
"loss": 1.9586,
"step": 934
},
{
"epoch": 0.7379636937647988,
"grad_norm": 1.2590047121047974,
"learning_rate": 2.1435202802230193e-05,
"loss": 1.937,
"step": 935
},
{
"epoch": 0.7387529597474349,
"grad_norm": 1.3695781230926514,
"learning_rate": 2.141812997123775e-05,
"loss": 1.9052,
"step": 936
},
{
"epoch": 0.739542225730071,
"grad_norm": 1.2480250597000122,
"learning_rate": 2.1401046956317843e-05,
"loss": 1.996,
"step": 937
},
{
"epoch": 0.7403314917127072,
"grad_norm": 1.5121519565582275,
"learning_rate": 2.1383953784576843e-05,
"loss": 1.9314,
"step": 938
},
{
"epoch": 0.7411207576953434,
"grad_norm": 1.3273252248764038,
"learning_rate": 2.136685048313723e-05,
"loss": 1.9594,
"step": 939
},
{
"epoch": 0.7419100236779794,
"grad_norm": 1.2657073736190796,
"learning_rate": 2.1349737079137554e-05,
"loss": 1.9398,
"step": 940
},
{
"epoch": 0.7426992896606156,
"grad_norm": 1.3078733682632446,
"learning_rate": 2.133261359973242e-05,
"loss": 1.9175,
"step": 941
},
{
"epoch": 0.7434885556432518,
"grad_norm": 1.0810556411743164,
"learning_rate": 2.1315480072092385e-05,
"loss": 1.9058,
"step": 942
},
{
"epoch": 0.744277821625888,
"grad_norm": 1.3791708946228027,
"learning_rate": 2.129833652340397e-05,
"loss": 1.9271,
"step": 943
},
{
"epoch": 0.745067087608524,
"grad_norm": 1.2780756950378418,
"learning_rate": 2.1281182980869594e-05,
"loss": 1.9071,
"step": 944
},
{
"epoch": 0.7458563535911602,
"grad_norm": 1.1226168870925903,
"learning_rate": 2.126401947170754e-05,
"loss": 1.9827,
"step": 945
},
{
"epoch": 0.7466456195737964,
"grad_norm": 1.133967638015747,
"learning_rate": 2.1246846023151888e-05,
"loss": 1.8706,
"step": 946
},
{
"epoch": 0.7474348855564326,
"grad_norm": 1.0483019351959229,
"learning_rate": 2.12296626624525e-05,
"loss": 1.8822,
"step": 947
},
{
"epoch": 0.7482241515390686,
"grad_norm": 2.100027322769165,
"learning_rate": 2.1212469416874972e-05,
"loss": 1.9556,
"step": 948
},
{
"epoch": 0.7490134175217048,
"grad_norm": 1.4822384119033813,
"learning_rate": 2.119526631370058e-05,
"loss": 1.9661,
"step": 949
},
{
"epoch": 0.749802683504341,
"grad_norm": 1.2308375835418701,
"learning_rate": 2.1178053380226234e-05,
"loss": 1.9075,
"step": 950
},
{
"epoch": 0.7505919494869772,
"grad_norm": 1.7181750535964966,
"learning_rate": 2.1160830643764448e-05,
"loss": 1.9816,
"step": 951
},
{
"epoch": 0.7513812154696132,
"grad_norm": 1.7420463562011719,
"learning_rate": 2.11435981316433e-05,
"loss": 1.9134,
"step": 952
},
{
"epoch": 0.7521704814522494,
"grad_norm": 1.654775619506836,
"learning_rate": 2.1126355871206358e-05,
"loss": 1.9577,
"step": 953
},
{
"epoch": 0.7529597474348856,
"grad_norm": 1.5406017303466797,
"learning_rate": 2.110910388981268e-05,
"loss": 1.9192,
"step": 954
},
{
"epoch": 0.7537490134175217,
"grad_norm": 1.4977563619613647,
"learning_rate": 2.1091842214836736e-05,
"loss": 1.9376,
"step": 955
},
{
"epoch": 0.7545382794001578,
"grad_norm": 1.0576728582382202,
"learning_rate": 2.1074570873668374e-05,
"loss": 1.8936,
"step": 956
},
{
"epoch": 0.755327545382794,
"grad_norm": 1.1977719068527222,
"learning_rate": 2.1057289893712796e-05,
"loss": 1.8901,
"step": 957
},
{
"epoch": 0.7561168113654302,
"grad_norm": 1.399145483970642,
"learning_rate": 2.103999930239049e-05,
"loss": 1.8795,
"step": 958
},
{
"epoch": 0.7569060773480663,
"grad_norm": 1.3279935121536255,
"learning_rate": 2.1022699127137184e-05,
"loss": 1.9639,
"step": 959
},
{
"epoch": 0.7576953433307024,
"grad_norm": 1.220153570175171,
"learning_rate": 2.1005389395403827e-05,
"loss": 1.9882,
"step": 960
},
{
"epoch": 0.7584846093133386,
"grad_norm": 1.3342796564102173,
"learning_rate": 2.0988070134656525e-05,
"loss": 1.9771,
"step": 961
},
{
"epoch": 0.7592738752959748,
"grad_norm": 1.1898157596588135,
"learning_rate": 2.097074137237651e-05,
"loss": 1.9551,
"step": 962
},
{
"epoch": 0.7600631412786109,
"grad_norm": 1.3458317518234253,
"learning_rate": 2.0953403136060088e-05,
"loss": 2.0118,
"step": 963
},
{
"epoch": 0.760852407261247,
"grad_norm": 1.055283546447754,
"learning_rate": 2.093605545321859e-05,
"loss": 1.9307,
"step": 964
},
{
"epoch": 0.7616416732438832,
"grad_norm": 1.1726508140563965,
"learning_rate": 2.091869835137835e-05,
"loss": 1.9662,
"step": 965
},
{
"epoch": 0.7624309392265194,
"grad_norm": 1.4072753190994263,
"learning_rate": 2.0901331858080633e-05,
"loss": 1.9281,
"step": 966
},
{
"epoch": 0.7632202052091555,
"grad_norm": 1.1238325834274292,
"learning_rate": 2.088395600088162e-05,
"loss": 1.8911,
"step": 967
},
{
"epoch": 0.7640094711917916,
"grad_norm": 1.2030211687088013,
"learning_rate": 2.086657080735234e-05,
"loss": 1.8815,
"step": 968
},
{
"epoch": 0.7647987371744278,
"grad_norm": 1.2055410146713257,
"learning_rate": 2.0849176305078646e-05,
"loss": 1.9201,
"step": 969
},
{
"epoch": 0.7655880031570639,
"grad_norm": 1.2860301733016968,
"learning_rate": 2.083177252166114e-05,
"loss": 1.9055,
"step": 970
},
{
"epoch": 0.7663772691397001,
"grad_norm": 1.2509900331497192,
"learning_rate": 2.0814359484715183e-05,
"loss": 1.957,
"step": 971
},
{
"epoch": 0.7671665351223362,
"grad_norm": 1.172758936882019,
"learning_rate": 2.0796937221870792e-05,
"loss": 1.9532,
"step": 972
},
{
"epoch": 0.7679558011049724,
"grad_norm": 1.1523761749267578,
"learning_rate": 2.077950576077264e-05,
"loss": 1.92,
"step": 973
},
{
"epoch": 0.7687450670876085,
"grad_norm": 1.1627498865127563,
"learning_rate": 2.076206512907998e-05,
"loss": 1.927,
"step": 974
},
{
"epoch": 0.7695343330702447,
"grad_norm": 1.4067708253860474,
"learning_rate": 2.074461535446663e-05,
"loss": 1.934,
"step": 975
},
{
"epoch": 0.7703235990528808,
"grad_norm": 1.0437569618225098,
"learning_rate": 2.072715646462092e-05,
"loss": 1.9641,
"step": 976
},
{
"epoch": 0.771112865035517,
"grad_norm": 1.206112265586853,
"learning_rate": 2.0709688487245616e-05,
"loss": 1.9312,
"step": 977
},
{
"epoch": 0.7719021310181531,
"grad_norm": 1.1898462772369385,
"learning_rate": 2.069221145005793e-05,
"loss": 1.9562,
"step": 978
},
{
"epoch": 0.7726913970007893,
"grad_norm": 1.2786809206008911,
"learning_rate": 2.0674725380789444e-05,
"loss": 1.9559,
"step": 979
},
{
"epoch": 0.7734806629834254,
"grad_norm": 1.164592981338501,
"learning_rate": 2.065723030718606e-05,
"loss": 1.9825,
"step": 980
},
{
"epoch": 0.7742699289660616,
"grad_norm": 1.3167165517807007,
"learning_rate": 2.0639726257007986e-05,
"loss": 1.9216,
"step": 981
},
{
"epoch": 0.7750591949486977,
"grad_norm": 1.2257906198501587,
"learning_rate": 2.0622213258029657e-05,
"loss": 1.9491,
"step": 982
},
{
"epoch": 0.7758484609313339,
"grad_norm": 1.0711567401885986,
"learning_rate": 2.060469133803972e-05,
"loss": 1.9191,
"step": 983
},
{
"epoch": 0.77663772691397,
"grad_norm": 1.8117008209228516,
"learning_rate": 2.058716052484097e-05,
"loss": 1.9311,
"step": 984
},
{
"epoch": 0.7774269928966061,
"grad_norm": 1.137128233909607,
"learning_rate": 2.056962084625031e-05,
"loss": 1.8654,
"step": 985
},
{
"epoch": 0.7782162588792423,
"grad_norm": 1.1593658924102783,
"learning_rate": 2.055207233009872e-05,
"loss": 1.9392,
"step": 986
},
{
"epoch": 0.7790055248618785,
"grad_norm": 1.0210576057434082,
"learning_rate": 2.0534515004231193e-05,
"loss": 1.9445,
"step": 987
},
{
"epoch": 0.7797947908445146,
"grad_norm": 1.2282181978225708,
"learning_rate": 2.0516948896506706e-05,
"loss": 1.8879,
"step": 988
},
{
"epoch": 0.7805840568271507,
"grad_norm": 1.1221119165420532,
"learning_rate": 2.049937403479818e-05,
"loss": 1.9226,
"step": 989
},
{
"epoch": 0.7813733228097869,
"grad_norm": 1.6811386346817017,
"learning_rate": 2.0481790446992405e-05,
"loss": 1.9722,
"step": 990
},
{
"epoch": 0.7821625887924231,
"grad_norm": 1.2378782033920288,
"learning_rate": 2.0464198160990034e-05,
"loss": 1.9014,
"step": 991
},
{
"epoch": 0.7829518547750592,
"grad_norm": 1.1285789012908936,
"learning_rate": 2.044659720470552e-05,
"loss": 1.9549,
"step": 992
},
{
"epoch": 0.7837411207576953,
"grad_norm": 1.1845968961715698,
"learning_rate": 2.042898760606706e-05,
"loss": 1.9145,
"step": 993
},
{
"epoch": 0.7845303867403315,
"grad_norm": 98.27835083007812,
"learning_rate": 2.0411369393016583e-05,
"loss": 2.1377,
"step": 994
},
{
"epoch": 0.7853196527229677,
"grad_norm": 1.2735936641693115,
"learning_rate": 2.0393742593509673e-05,
"loss": 1.9253,
"step": 995
},
{
"epoch": 0.7861089187056038,
"grad_norm": 1.335058569908142,
"learning_rate": 2.0376107235515545e-05,
"loss": 1.9287,
"step": 996
},
{
"epoch": 0.7868981846882399,
"grad_norm": 1.3353217840194702,
"learning_rate": 2.035846334701699e-05,
"loss": 1.9171,
"step": 997
},
{
"epoch": 0.7876874506708761,
"grad_norm": 1.3583861589431763,
"learning_rate": 2.0340810956010347e-05,
"loss": 1.8695,
"step": 998
},
{
"epoch": 0.7884767166535123,
"grad_norm": 1.0094035863876343,
"learning_rate": 2.0323150090505425e-05,
"loss": 1.959,
"step": 999
},
{
"epoch": 0.7892659826361483,
"grad_norm": 1.0646247863769531,
"learning_rate": 2.03054807785255e-05,
"loss": 1.9456,
"step": 1000
},
{
"epoch": 0.7900552486187845,
"grad_norm": 1.4316010475158691,
"learning_rate": 2.0287803048107237e-05,
"loss": 1.9598,
"step": 1001
},
{
"epoch": 0.7908445146014207,
"grad_norm": 1.2731982469558716,
"learning_rate": 2.027011692730066e-05,
"loss": 1.8874,
"step": 1002
},
{
"epoch": 0.7916337805840569,
"grad_norm": 0.9664216041564941,
"learning_rate": 2.0252422444169122e-05,
"loss": 1.9212,
"step": 1003
},
{
"epoch": 0.7924230465666929,
"grad_norm": 1.229513168334961,
"learning_rate": 2.0234719626789222e-05,
"loss": 1.9094,
"step": 1004
},
{
"epoch": 0.7932123125493291,
"grad_norm": 1.2635388374328613,
"learning_rate": 2.0217008503250804e-05,
"loss": 1.9462,
"step": 1005
},
{
"epoch": 0.7940015785319653,
"grad_norm": 1.0137258768081665,
"learning_rate": 2.019928910165687e-05,
"loss": 1.9129,
"step": 1006
},
{
"epoch": 0.7947908445146015,
"grad_norm": 4.261168003082275,
"learning_rate": 2.0181561450123584e-05,
"loss": 1.9242,
"step": 1007
},
{
"epoch": 0.7955801104972375,
"grad_norm": 1.5557130575180054,
"learning_rate": 2.0163825576780177e-05,
"loss": 1.9141,
"step": 1008
},
{
"epoch": 0.7963693764798737,
"grad_norm": 1.5184749364852905,
"learning_rate": 2.0146081509768932e-05,
"loss": 1.9849,
"step": 1009
},
{
"epoch": 0.7971586424625099,
"grad_norm": 1.0676449537277222,
"learning_rate": 2.0128329277245147e-05,
"loss": 1.9777,
"step": 1010
},
{
"epoch": 0.797947908445146,
"grad_norm": 1.1058017015457153,
"learning_rate": 2.011056890737705e-05,
"loss": 1.9017,
"step": 1011
},
{
"epoch": 0.7987371744277821,
"grad_norm": 3.1086857318878174,
"learning_rate": 2.0092800428345803e-05,
"loss": 1.9944,
"step": 1012
},
{
"epoch": 0.7995264404104183,
"grad_norm": 2.2823331356048584,
"learning_rate": 2.007502386834544e-05,
"loss": 1.9451,
"step": 1013
},
{
"epoch": 0.8003157063930545,
"grad_norm": 1.4383108615875244,
"learning_rate": 2.005723925558279e-05,
"loss": 1.8993,
"step": 1014
},
{
"epoch": 0.8011049723756906,
"grad_norm": 1.7305094003677368,
"learning_rate": 2.0039446618277486e-05,
"loss": 1.9778,
"step": 1015
},
{
"epoch": 0.8018942383583267,
"grad_norm": 1.3104729652404785,
"learning_rate": 2.0021645984661877e-05,
"loss": 1.9218,
"step": 1016
},
{
"epoch": 0.8026835043409629,
"grad_norm": 1.3720124959945679,
"learning_rate": 2.000383738298101e-05,
"loss": 1.9605,
"step": 1017
},
{
"epoch": 0.8034727703235991,
"grad_norm": 1.67930006980896,
"learning_rate": 1.9986020841492575e-05,
"loss": 1.9081,
"step": 1018
},
{
"epoch": 0.8042620363062352,
"grad_norm": 1.313557505607605,
"learning_rate": 1.9968196388466852e-05,
"loss": 2.0005,
"step": 1019
},
{
"epoch": 0.8050513022888713,
"grad_norm": 1.0877718925476074,
"learning_rate": 1.9950364052186682e-05,
"loss": 1.9576,
"step": 1020
},
{
"epoch": 0.8058405682715075,
"grad_norm": 1.4061955213546753,
"learning_rate": 1.993252386094741e-05,
"loss": 1.9535,
"step": 1021
},
{
"epoch": 0.8066298342541437,
"grad_norm": 1.2338870763778687,
"learning_rate": 1.9914675843056855e-05,
"loss": 1.9022,
"step": 1022
},
{
"epoch": 0.8074191002367798,
"grad_norm": 1.0897296667099,
"learning_rate": 1.9896820026835237e-05,
"loss": 1.9264,
"step": 1023
},
{
"epoch": 0.8082083662194159,
"grad_norm": 1.21879243850708,
"learning_rate": 1.9878956440615172e-05,
"loss": 1.9245,
"step": 1024
},
{
"epoch": 0.8089976322020521,
"grad_norm": 1.0368714332580566,
"learning_rate": 1.9861085112741587e-05,
"loss": 1.905,
"step": 1025
},
{
"epoch": 0.8097868981846882,
"grad_norm": 1.2078007459640503,
"learning_rate": 1.9843206071571692e-05,
"loss": 1.9068,
"step": 1026
},
{
"epoch": 0.8105761641673244,
"grad_norm": 1.3901704549789429,
"learning_rate": 1.982531934547496e-05,
"loss": 1.9295,
"step": 1027
},
{
"epoch": 0.8113654301499605,
"grad_norm": 1.098552942276001,
"learning_rate": 1.980742496283303e-05,
"loss": 1.9386,
"step": 1028
},
{
"epoch": 0.8121546961325967,
"grad_norm": 1.073913812637329,
"learning_rate": 1.9789522952039697e-05,
"loss": 1.9274,
"step": 1029
},
{
"epoch": 0.8129439621152328,
"grad_norm": 1.0683954954147339,
"learning_rate": 1.977161334150088e-05,
"loss": 1.9,
"step": 1030
},
{
"epoch": 0.813733228097869,
"grad_norm": 0.9743616580963135,
"learning_rate": 1.9753696159634532e-05,
"loss": 1.9312,
"step": 1031
},
{
"epoch": 0.8145224940805051,
"grad_norm": 2.691880226135254,
"learning_rate": 1.9735771434870624e-05,
"loss": 1.8925,
"step": 1032
},
{
"epoch": 0.8153117600631413,
"grad_norm": 1.1001895666122437,
"learning_rate": 1.9717839195651112e-05,
"loss": 1.9242,
"step": 1033
},
{
"epoch": 0.8161010260457774,
"grad_norm": 1.2172091007232666,
"learning_rate": 1.9699899470429852e-05,
"loss": 1.9218,
"step": 1034
},
{
"epoch": 0.8168902920284136,
"grad_norm": 1.0749343633651733,
"learning_rate": 1.9681952287672603e-05,
"loss": 1.9229,
"step": 1035
},
{
"epoch": 0.8176795580110497,
"grad_norm": 1.0054171085357666,
"learning_rate": 1.9663997675856928e-05,
"loss": 1.9917,
"step": 1036
},
{
"epoch": 0.8184688239936859,
"grad_norm": 1.0160495042800903,
"learning_rate": 1.964603566347221e-05,
"loss": 1.9582,
"step": 1037
},
{
"epoch": 0.819258089976322,
"grad_norm": 1.068723201751709,
"learning_rate": 1.9628066279019557e-05,
"loss": 1.9772,
"step": 1038
},
{
"epoch": 0.8200473559589582,
"grad_norm": 4.0177507400512695,
"learning_rate": 1.961008955101177e-05,
"loss": 1.9647,
"step": 1039
},
{
"epoch": 0.8208366219415943,
"grad_norm": 1.0878604650497437,
"learning_rate": 1.959210550797331e-05,
"loss": 1.9027,
"step": 1040
},
{
"epoch": 0.8216258879242304,
"grad_norm": 1.042733073234558,
"learning_rate": 1.9574114178440258e-05,
"loss": 1.8878,
"step": 1041
},
{
"epoch": 0.8224151539068666,
"grad_norm": 0.9671216011047363,
"learning_rate": 1.955611559096023e-05,
"loss": 1.9175,
"step": 1042
},
{
"epoch": 0.8232044198895028,
"grad_norm": 1.0640370845794678,
"learning_rate": 1.953810977409237e-05,
"loss": 1.9226,
"step": 1043
},
{
"epoch": 0.823993685872139,
"grad_norm": 0.9828730821609497,
"learning_rate": 1.9520096756407302e-05,
"loss": 1.9383,
"step": 1044
},
{
"epoch": 0.824782951854775,
"grad_norm": 0.9962606430053711,
"learning_rate": 1.950207656648707e-05,
"loss": 1.8699,
"step": 1045
},
{
"epoch": 0.8255722178374112,
"grad_norm": 1.1136631965637207,
"learning_rate": 1.948404923292509e-05,
"loss": 1.9337,
"step": 1046
},
{
"epoch": 0.8263614838200474,
"grad_norm": 0.9540908932685852,
"learning_rate": 1.9466014784326124e-05,
"loss": 1.8999,
"step": 1047
},
{
"epoch": 0.8271507498026835,
"grad_norm": 1.3623814582824707,
"learning_rate": 1.9447973249306225e-05,
"loss": 1.9309,
"step": 1048
},
{
"epoch": 0.8279400157853196,
"grad_norm": 1.2193188667297363,
"learning_rate": 1.942992465649268e-05,
"loss": 1.9221,
"step": 1049
},
{
"epoch": 0.8287292817679558,
"grad_norm": 1.0682237148284912,
"learning_rate": 1.9411869034523977e-05,
"loss": 1.9563,
"step": 1050
},
{
"epoch": 0.829518547750592,
"grad_norm": 1.0715112686157227,
"learning_rate": 1.9393806412049765e-05,
"loss": 1.889,
"step": 1051
},
{
"epoch": 0.8303078137332282,
"grad_norm": 1.0543572902679443,
"learning_rate": 1.93757368177308e-05,
"loss": 1.8968,
"step": 1052
},
{
"epoch": 0.8310970797158642,
"grad_norm": 1.1826039552688599,
"learning_rate": 1.9357660280238896e-05,
"loss": 1.9357,
"step": 1053
},
{
"epoch": 0.8318863456985004,
"grad_norm": 1.1282274723052979,
"learning_rate": 1.933957682825688e-05,
"loss": 1.9127,
"step": 1054
},
{
"epoch": 0.8326756116811366,
"grad_norm": 0.9678381085395813,
"learning_rate": 1.9321486490478565e-05,
"loss": 1.9063,
"step": 1055
},
{
"epoch": 0.8334648776637726,
"grad_norm": 1.0463242530822754,
"learning_rate": 1.9303389295608677e-05,
"loss": 1.9163,
"step": 1056
},
{
"epoch": 0.8342541436464088,
"grad_norm": 1.168262243270874,
"learning_rate": 1.9285285272362816e-05,
"loss": 1.8283,
"step": 1057
},
{
"epoch": 0.835043409629045,
"grad_norm": 1.1833221912384033,
"learning_rate": 1.9267174449467442e-05,
"loss": 1.9377,
"step": 1058
},
{
"epoch": 0.8358326756116812,
"grad_norm": 1.2744213342666626,
"learning_rate": 1.924905685565979e-05,
"loss": 1.9625,
"step": 1059
},
{
"epoch": 0.8366219415943172,
"grad_norm": 1.243308663368225,
"learning_rate": 1.9230932519687822e-05,
"loss": 1.8508,
"step": 1060
},
{
"epoch": 0.8374112075769534,
"grad_norm": 1.043876051902771,
"learning_rate": 1.921280147031023e-05,
"loss": 1.911,
"step": 1061
},
{
"epoch": 0.8382004735595896,
"grad_norm": 1.0375399589538574,
"learning_rate": 1.919466373629634e-05,
"loss": 1.9219,
"step": 1062
},
{
"epoch": 0.8389897395422258,
"grad_norm": 0.9551234245300293,
"learning_rate": 1.9176519346426084e-05,
"loss": 1.8883,
"step": 1063
},
{
"epoch": 0.8397790055248618,
"grad_norm": 1.1786881685256958,
"learning_rate": 1.9158368329489957e-05,
"loss": 1.9367,
"step": 1064
},
{
"epoch": 0.840568271507498,
"grad_norm": 1.134186029434204,
"learning_rate": 1.914021071428898e-05,
"loss": 1.9078,
"step": 1065
},
{
"epoch": 0.8413575374901342,
"grad_norm": 0.9656745791435242,
"learning_rate": 1.9122046529634625e-05,
"loss": 1.9052,
"step": 1066
},
{
"epoch": 0.8421468034727704,
"grad_norm": 0.9540627002716064,
"learning_rate": 1.9103875804348806e-05,
"loss": 1.8895,
"step": 1067
},
{
"epoch": 0.8429360694554064,
"grad_norm": 1.0714752674102783,
"learning_rate": 1.90856985672638e-05,
"loss": 1.9444,
"step": 1068
},
{
"epoch": 0.8437253354380426,
"grad_norm": 1.54597806930542,
"learning_rate": 1.9067514847222227e-05,
"loss": 1.965,
"step": 1069
},
{
"epoch": 0.8445146014206788,
"grad_norm": 1.068199634552002,
"learning_rate": 1.9049324673076994e-05,
"loss": 1.9616,
"step": 1070
},
{
"epoch": 0.8453038674033149,
"grad_norm": 1.2419066429138184,
"learning_rate": 1.903112807369124e-05,
"loss": 1.9696,
"step": 1071
},
{
"epoch": 0.846093133385951,
"grad_norm": 0.986475944519043,
"learning_rate": 1.9012925077938318e-05,
"loss": 1.9167,
"step": 1072
},
{
"epoch": 0.8468823993685872,
"grad_norm": 0.9953616261482239,
"learning_rate": 1.89947157147017e-05,
"loss": 1.9293,
"step": 1073
},
{
"epoch": 0.8476716653512234,
"grad_norm": 1.0164732933044434,
"learning_rate": 1.897650001287498e-05,
"loss": 1.9196,
"step": 1074
},
{
"epoch": 0.8484609313338595,
"grad_norm": 2.1091668605804443,
"learning_rate": 1.8958278001361823e-05,
"loss": 1.9055,
"step": 1075
},
{
"epoch": 0.8492501973164956,
"grad_norm": 2.535614252090454,
"learning_rate": 1.8940049709075877e-05,
"loss": 1.9882,
"step": 1076
},
{
"epoch": 0.8500394632991318,
"grad_norm": 1.2041168212890625,
"learning_rate": 1.8921815164940784e-05,
"loss": 1.9206,
"step": 1077
},
{
"epoch": 0.850828729281768,
"grad_norm": 1.0966860055923462,
"learning_rate": 1.890357439789008e-05,
"loss": 1.9412,
"step": 1078
},
{
"epoch": 0.8516179952644041,
"grad_norm": 1.3406330347061157,
"learning_rate": 1.8885327436867194e-05,
"loss": 1.9086,
"step": 1079
},
{
"epoch": 0.8524072612470402,
"grad_norm": 2.0100197792053223,
"learning_rate": 1.8867074310825377e-05,
"loss": 1.9412,
"step": 1080
},
{
"epoch": 0.8531965272296764,
"grad_norm": 1.0935620069503784,
"learning_rate": 1.884881504872766e-05,
"loss": 1.9451,
"step": 1081
},
{
"epoch": 0.8539857932123125,
"grad_norm": 1.2751333713531494,
"learning_rate": 1.883054967954681e-05,
"loss": 1.9423,
"step": 1082
},
{
"epoch": 0.8547750591949487,
"grad_norm": 1.2702161073684692,
"learning_rate": 1.8812278232265297e-05,
"loss": 1.9101,
"step": 1083
},
{
"epoch": 0.8555643251775849,
"grad_norm": 1.0920089483261108,
"learning_rate": 1.879400073587521e-05,
"loss": 1.9152,
"step": 1084
},
{
"epoch": 0.856353591160221,
"grad_norm": 1.0169867277145386,
"learning_rate": 1.8775717219378264e-05,
"loss": 1.892,
"step": 1085
},
{
"epoch": 0.8571428571428571,
"grad_norm": 1.1083533763885498,
"learning_rate": 1.8757427711785714e-05,
"loss": 1.943,
"step": 1086
},
{
"epoch": 0.8579321231254933,
"grad_norm": 0.9779300093650818,
"learning_rate": 1.873913224211832e-05,
"loss": 1.93,
"step": 1087
},
{
"epoch": 0.8587213891081295,
"grad_norm": 1.2874062061309814,
"learning_rate": 1.8720830839406294e-05,
"loss": 1.9395,
"step": 1088
},
{
"epoch": 0.8595106550907656,
"grad_norm": 1.7089831829071045,
"learning_rate": 1.8702523532689282e-05,
"loss": 1.938,
"step": 1089
},
{
"epoch": 0.8602999210734017,
"grad_norm": 1.2932785749435425,
"learning_rate": 1.8684210351016288e-05,
"loss": 1.9323,
"step": 1090
},
{
"epoch": 0.8610891870560379,
"grad_norm": 1.0572917461395264,
"learning_rate": 1.8665891323445635e-05,
"loss": 1.934,
"step": 1091
},
{
"epoch": 0.861878453038674,
"grad_norm": 0.9761425256729126,
"learning_rate": 1.864756647904492e-05,
"loss": 1.8805,
"step": 1092
},
{
"epoch": 0.8626677190213102,
"grad_norm": 0.97571861743927,
"learning_rate": 1.862923584689099e-05,
"loss": 1.891,
"step": 1093
},
{
"epoch": 0.8634569850039463,
"grad_norm": 1.1124482154846191,
"learning_rate": 1.8610899456069846e-05,
"loss": 1.9309,
"step": 1094
},
{
"epoch": 0.8642462509865825,
"grad_norm": 1.0374202728271484,
"learning_rate": 1.8592557335676648e-05,
"loss": 1.9369,
"step": 1095
},
{
"epoch": 0.8650355169692187,
"grad_norm": 1.0762258768081665,
"learning_rate": 1.857420951481564e-05,
"loss": 1.916,
"step": 1096
},
{
"epoch": 0.8658247829518547,
"grad_norm": 1.2466862201690674,
"learning_rate": 1.8555856022600105e-05,
"loss": 1.9623,
"step": 1097
},
{
"epoch": 0.8666140489344909,
"grad_norm": 1.0643996000289917,
"learning_rate": 1.853749688815234e-05,
"loss": 1.9485,
"step": 1098
},
{
"epoch": 0.8674033149171271,
"grad_norm": 1.2198410034179688,
"learning_rate": 1.8519132140603584e-05,
"loss": 1.9576,
"step": 1099
},
{
"epoch": 0.8681925808997633,
"grad_norm": 1.1695075035095215,
"learning_rate": 1.8500761809093983e-05,
"loss": 1.8975,
"step": 1100
},
{
"epoch": 0.8689818468823993,
"grad_norm": 1.2702025175094604,
"learning_rate": 1.848238592277255e-05,
"loss": 1.9415,
"step": 1101
},
{
"epoch": 0.8697711128650355,
"grad_norm": 1.2971569299697876,
"learning_rate": 1.84640045107971e-05,
"loss": 1.9521,
"step": 1102
},
{
"epoch": 0.8705603788476717,
"grad_norm": 1.3112415075302124,
"learning_rate": 1.8445617602334228e-05,
"loss": 1.9009,
"step": 1103
},
{
"epoch": 0.8713496448303079,
"grad_norm": 1.192368507385254,
"learning_rate": 1.8427225226559247e-05,
"loss": 1.9095,
"step": 1104
},
{
"epoch": 0.8721389108129439,
"grad_norm": 1.4217482805252075,
"learning_rate": 1.840882741265614e-05,
"loss": 1.9417,
"step": 1105
},
{
"epoch": 0.8729281767955801,
"grad_norm": 1.1348249912261963,
"learning_rate": 1.839042418981752e-05,
"loss": 1.8749,
"step": 1106
},
{
"epoch": 0.8737174427782163,
"grad_norm": 1.2523006200790405,
"learning_rate": 1.8372015587244596e-05,
"loss": 1.9132,
"step": 1107
},
{
"epoch": 0.8745067087608525,
"grad_norm": 1.1433618068695068,
"learning_rate": 1.8353601634147092e-05,
"loss": 1.9525,
"step": 1108
},
{
"epoch": 0.8752959747434885,
"grad_norm": 1.5293651819229126,
"learning_rate": 1.833518235974324e-05,
"loss": 1.8907,
"step": 1109
},
{
"epoch": 0.8760852407261247,
"grad_norm": 1.7137699127197266,
"learning_rate": 1.8316757793259704e-05,
"loss": 1.9531,
"step": 1110
},
{
"epoch": 0.8768745067087609,
"grad_norm": 1.1961606740951538,
"learning_rate": 1.829832796393155e-05,
"loss": 1.9241,
"step": 1111
},
{
"epoch": 0.877663772691397,
"grad_norm": 1.5045583248138428,
"learning_rate": 1.8279892901002193e-05,
"loss": 1.9374,
"step": 1112
},
{
"epoch": 0.8784530386740331,
"grad_norm": 1.1014037132263184,
"learning_rate": 1.8261452633723356e-05,
"loss": 1.8816,
"step": 1113
},
{
"epoch": 0.8792423046566693,
"grad_norm": 1.487324595451355,
"learning_rate": 1.824300719135502e-05,
"loss": 1.8671,
"step": 1114
},
{
"epoch": 0.8800315706393055,
"grad_norm": 1.1038920879364014,
"learning_rate": 1.8224556603165363e-05,
"loss": 1.9133,
"step": 1115
},
{
"epoch": 0.8808208366219415,
"grad_norm": 1.3335046768188477,
"learning_rate": 1.820610089843075e-05,
"loss": 1.9142,
"step": 1116
},
{
"epoch": 0.8816101026045777,
"grad_norm": 1.2794814109802246,
"learning_rate": 1.8187640106435654e-05,
"loss": 1.8914,
"step": 1117
},
{
"epoch": 0.8823993685872139,
"grad_norm": 1.3214670419692993,
"learning_rate": 1.8169174256472623e-05,
"loss": 1.9615,
"step": 1118
},
{
"epoch": 0.8831886345698501,
"grad_norm": 1.5310771465301514,
"learning_rate": 1.815070337784222e-05,
"loss": 1.8994,
"step": 1119
},
{
"epoch": 0.8839779005524862,
"grad_norm": 1.1699209213256836,
"learning_rate": 1.8132227499853003e-05,
"loss": 1.9175,
"step": 1120
},
{
"epoch": 0.8847671665351223,
"grad_norm": 1.2050460577011108,
"learning_rate": 1.8113746651821457e-05,
"loss": 1.8937,
"step": 1121
},
{
"epoch": 0.8855564325177585,
"grad_norm": 1.040279746055603,
"learning_rate": 1.8095260863071943e-05,
"loss": 1.9171,
"step": 1122
},
{
"epoch": 0.8863456985003947,
"grad_norm": 1.1379011869430542,
"learning_rate": 1.8076770162936678e-05,
"loss": 1.9004,
"step": 1123
},
{
"epoch": 0.8871349644830308,
"grad_norm": 1.071140170097351,
"learning_rate": 1.805827458075566e-05,
"loss": 1.8959,
"step": 1124
},
{
"epoch": 0.8879242304656669,
"grad_norm": 0.9981568455696106,
"learning_rate": 1.8039774145876643e-05,
"loss": 1.8838,
"step": 1125
},
{
"epoch": 0.8887134964483031,
"grad_norm": 1.1366488933563232,
"learning_rate": 1.802126888765507e-05,
"loss": 1.8907,
"step": 1126
},
{
"epoch": 0.8895027624309392,
"grad_norm": 1.379737377166748,
"learning_rate": 1.8002758835454046e-05,
"loss": 1.9165,
"step": 1127
},
{
"epoch": 0.8902920284135754,
"grad_norm": 0.9969932436943054,
"learning_rate": 1.7984244018644283e-05,
"loss": 1.9267,
"step": 1128
},
{
"epoch": 0.8910812943962115,
"grad_norm": 1.143640398979187,
"learning_rate": 1.7965724466604046e-05,
"loss": 1.8801,
"step": 1129
},
{
"epoch": 0.8918705603788477,
"grad_norm": 1.122267723083496,
"learning_rate": 1.794720020871912e-05,
"loss": 1.9291,
"step": 1130
},
{
"epoch": 0.8926598263614838,
"grad_norm": 1.3195860385894775,
"learning_rate": 1.7928671274382754e-05,
"loss": 1.922,
"step": 1131
},
{
"epoch": 0.89344909234412,
"grad_norm": 1.3526853322982788,
"learning_rate": 1.7910137692995616e-05,
"loss": 1.947,
"step": 1132
},
{
"epoch": 0.8942383583267561,
"grad_norm": 1.3031686544418335,
"learning_rate": 1.7891599493965756e-05,
"loss": 1.9207,
"step": 1133
},
{
"epoch": 0.8950276243093923,
"grad_norm": 0.967303991317749,
"learning_rate": 1.7873056706708546e-05,
"loss": 1.9416,
"step": 1134
},
{
"epoch": 0.8958168902920284,
"grad_norm": 1.5189329385757446,
"learning_rate": 1.7854509360646627e-05,
"loss": 1.9273,
"step": 1135
},
{
"epoch": 0.8966061562746646,
"grad_norm": 1.0588527917861938,
"learning_rate": 1.7835957485209894e-05,
"loss": 1.8705,
"step": 1136
},
{
"epoch": 0.8973954222573007,
"grad_norm": 1.061260461807251,
"learning_rate": 1.7817401109835412e-05,
"loss": 1.9345,
"step": 1137
},
{
"epoch": 0.8981846882399369,
"grad_norm": 1.0034102201461792,
"learning_rate": 1.7798840263967405e-05,
"loss": 1.9364,
"step": 1138
},
{
"epoch": 0.898973954222573,
"grad_norm": 1.059441089630127,
"learning_rate": 1.7780274977057162e-05,
"loss": 1.8858,
"step": 1139
},
{
"epoch": 0.8997632202052092,
"grad_norm": 1.0713084936141968,
"learning_rate": 1.776170527856304e-05,
"loss": 1.9348,
"step": 1140
},
{
"epoch": 0.9005524861878453,
"grad_norm": 1.074656367301941,
"learning_rate": 1.7743131197950405e-05,
"loss": 1.878,
"step": 1141
},
{
"epoch": 0.9013417521704814,
"grad_norm": 1.2053810358047485,
"learning_rate": 1.7724552764691545e-05,
"loss": 1.9581,
"step": 1142
},
{
"epoch": 0.9021310181531176,
"grad_norm": 1.0136345624923706,
"learning_rate": 1.7705970008265687e-05,
"loss": 1.8901,
"step": 1143
},
{
"epoch": 0.9029202841357538,
"grad_norm": 1.1384106874465942,
"learning_rate": 1.7687382958158893e-05,
"loss": 1.8547,
"step": 1144
},
{
"epoch": 0.9037095501183899,
"grad_norm": 1.0804816484451294,
"learning_rate": 1.7668791643864056e-05,
"loss": 1.9461,
"step": 1145
},
{
"epoch": 0.904498816101026,
"grad_norm": 1.1892673969268799,
"learning_rate": 1.7650196094880817e-05,
"loss": 1.9242,
"step": 1146
},
{
"epoch": 0.9052880820836622,
"grad_norm": 1.0078619718551636,
"learning_rate": 1.763159634071556e-05,
"loss": 1.9495,
"step": 1147
},
{
"epoch": 0.9060773480662984,
"grad_norm": 0.978377103805542,
"learning_rate": 1.7612992410881314e-05,
"loss": 1.9139,
"step": 1148
},
{
"epoch": 0.9068666140489345,
"grad_norm": 1.0232102870941162,
"learning_rate": 1.7594384334897757e-05,
"loss": 1.8801,
"step": 1149
},
{
"epoch": 0.9076558800315706,
"grad_norm": 1.1658730506896973,
"learning_rate": 1.7575772142291136e-05,
"loss": 1.9321,
"step": 1150
},
{
"epoch": 0.9084451460142068,
"grad_norm": 0.961025059223175,
"learning_rate": 1.7557155862594232e-05,
"loss": 1.9078,
"step": 1151
},
{
"epoch": 0.909234411996843,
"grad_norm": 1.1790615320205688,
"learning_rate": 1.7538535525346304e-05,
"loss": 1.9617,
"step": 1152
},
{
"epoch": 0.9100236779794791,
"grad_norm": 0.976462721824646,
"learning_rate": 1.751991116009306e-05,
"loss": 1.8995,
"step": 1153
},
{
"epoch": 0.9108129439621152,
"grad_norm": 0.9908099174499512,
"learning_rate": 1.7501282796386593e-05,
"loss": 1.9021,
"step": 1154
},
{
"epoch": 0.9116022099447514,
"grad_norm": 1.0563790798187256,
"learning_rate": 1.748265046378535e-05,
"loss": 1.9061,
"step": 1155
},
{
"epoch": 0.9123914759273876,
"grad_norm": 1.0215762853622437,
"learning_rate": 1.7464014191854046e-05,
"loss": 1.8833,
"step": 1156
},
{
"epoch": 0.9131807419100236,
"grad_norm": 1.1506677865982056,
"learning_rate": 1.744537401016369e-05,
"loss": 1.8939,
"step": 1157
},
{
"epoch": 0.9139700078926598,
"grad_norm": 1.1882095336914062,
"learning_rate": 1.7426729948291474e-05,
"loss": 1.9077,
"step": 1158
},
{
"epoch": 0.914759273875296,
"grad_norm": 0.929111897945404,
"learning_rate": 1.7408082035820733e-05,
"loss": 1.854,
"step": 1159
},
{
"epoch": 0.9155485398579322,
"grad_norm": 1.0913232564926147,
"learning_rate": 1.7389430302340928e-05,
"loss": 1.8809,
"step": 1160
},
{
"epoch": 0.9163378058405682,
"grad_norm": 0.9732852578163147,
"learning_rate": 1.7370774777447583e-05,
"loss": 1.8555,
"step": 1161
},
{
"epoch": 0.9171270718232044,
"grad_norm": 1.0545657873153687,
"learning_rate": 1.7352115490742243e-05,
"loss": 1.9186,
"step": 1162
},
{
"epoch": 0.9179163378058406,
"grad_norm": 1.105906367301941,
"learning_rate": 1.7333452471832403e-05,
"loss": 1.9243,
"step": 1163
},
{
"epoch": 0.9187056037884768,
"grad_norm": 1.3180484771728516,
"learning_rate": 1.7314785750331486e-05,
"loss": 1.9367,
"step": 1164
},
{
"epoch": 0.9194948697711128,
"grad_norm": 0.9373801350593567,
"learning_rate": 1.7296115355858812e-05,
"loss": 1.8631,
"step": 1165
},
{
"epoch": 0.920284135753749,
"grad_norm": 1.2233669757843018,
"learning_rate": 1.7277441318039503e-05,
"loss": 1.9653,
"step": 1166
},
{
"epoch": 0.9210734017363852,
"grad_norm": 1.0992927551269531,
"learning_rate": 1.725876366650447e-05,
"loss": 1.8945,
"step": 1167
},
{
"epoch": 0.9218626677190213,
"grad_norm": 1.4309226274490356,
"learning_rate": 1.724008243089036e-05,
"loss": 1.892,
"step": 1168
},
{
"epoch": 0.9226519337016574,
"grad_norm": 1.1195532083511353,
"learning_rate": 1.7221397640839516e-05,
"loss": 1.9198,
"step": 1169
},
{
"epoch": 0.9234411996842936,
"grad_norm": 1.1537940502166748,
"learning_rate": 1.7202709325999893e-05,
"loss": 1.9336,
"step": 1170
},
{
"epoch": 0.9242304656669298,
"grad_norm": 1.222401738166809,
"learning_rate": 1.7184017516025075e-05,
"loss": 1.878,
"step": 1171
},
{
"epoch": 0.9250197316495659,
"grad_norm": 1.4775243997573853,
"learning_rate": 1.7165322240574162e-05,
"loss": 1.9012,
"step": 1172
},
{
"epoch": 0.925808997632202,
"grad_norm": 1.2868552207946777,
"learning_rate": 1.7146623529311772e-05,
"loss": 1.9462,
"step": 1173
},
{
"epoch": 0.9265982636148382,
"grad_norm": 1.0581059455871582,
"learning_rate": 1.7127921411907965e-05,
"loss": 1.9898,
"step": 1174
},
{
"epoch": 0.9273875295974744,
"grad_norm": 1.020804762840271,
"learning_rate": 1.710921591803821e-05,
"loss": 1.8327,
"step": 1175
},
{
"epoch": 0.9281767955801105,
"grad_norm": 1.3169434070587158,
"learning_rate": 1.7090507077383332e-05,
"loss": 1.9644,
"step": 1176
},
{
"epoch": 0.9289660615627466,
"grad_norm": 0.9919707179069519,
"learning_rate": 1.7071794919629466e-05,
"loss": 1.8822,
"step": 1177
},
{
"epoch": 0.9297553275453828,
"grad_norm": 1.044602870941162,
"learning_rate": 1.7053079474468006e-05,
"loss": 1.941,
"step": 1178
},
{
"epoch": 0.930544593528019,
"grad_norm": 0.9796439409255981,
"learning_rate": 1.703436077159558e-05,
"loss": 1.8714,
"step": 1179
},
{
"epoch": 0.9313338595106551,
"grad_norm": 1.1889114379882812,
"learning_rate": 1.7015638840713954e-05,
"loss": 1.9272,
"step": 1180
},
{
"epoch": 0.9321231254932912,
"grad_norm": 0.9672757983207703,
"learning_rate": 1.699691371153005e-05,
"loss": 1.9113,
"step": 1181
},
{
"epoch": 0.9329123914759274,
"grad_norm": 1.0609270334243774,
"learning_rate": 1.6978185413755844e-05,
"loss": 1.9508,
"step": 1182
},
{
"epoch": 0.9337016574585635,
"grad_norm": 1.0816676616668701,
"learning_rate": 1.6959453977108345e-05,
"loss": 1.8926,
"step": 1183
},
{
"epoch": 0.9344909234411997,
"grad_norm": 1.1348642110824585,
"learning_rate": 1.694071943130954e-05,
"loss": 1.9565,
"step": 1184
},
{
"epoch": 0.9352801894238358,
"grad_norm": 0.9974650740623474,
"learning_rate": 1.6921981806086354e-05,
"loss": 1.969,
"step": 1185
},
{
"epoch": 0.936069455406472,
"grad_norm": 1.0185261964797974,
"learning_rate": 1.6903241131170597e-05,
"loss": 1.8792,
"step": 1186
},
{
"epoch": 0.9368587213891081,
"grad_norm": 1.1831624507904053,
"learning_rate": 1.6884497436298918e-05,
"loss": 1.9001,
"step": 1187
},
{
"epoch": 0.9376479873717443,
"grad_norm": 1.4525943994522095,
"learning_rate": 1.6865750751212752e-05,
"loss": 1.9121,
"step": 1188
},
{
"epoch": 0.9384372533543804,
"grad_norm": 1.0227642059326172,
"learning_rate": 1.6847001105658296e-05,
"loss": 1.9067,
"step": 1189
},
{
"epoch": 0.9392265193370166,
"grad_norm": 1.0105981826782227,
"learning_rate": 1.6828248529386418e-05,
"loss": 1.9006,
"step": 1190
},
{
"epoch": 0.9400157853196527,
"grad_norm": 1.0613151788711548,
"learning_rate": 1.6809493052152655e-05,
"loss": 1.942,
"step": 1191
},
{
"epoch": 0.9408050513022889,
"grad_norm": 1.254150390625,
"learning_rate": 1.6790734703717153e-05,
"loss": 1.8991,
"step": 1192
},
{
"epoch": 0.941594317284925,
"grad_norm": 1.0217347145080566,
"learning_rate": 1.677197351384459e-05,
"loss": 1.9152,
"step": 1193
},
{
"epoch": 0.9423835832675612,
"grad_norm": 1.1719932556152344,
"learning_rate": 1.6753209512304174e-05,
"loss": 1.9262,
"step": 1194
},
{
"epoch": 0.9431728492501973,
"grad_norm": 1.062974452972412,
"learning_rate": 1.6734442728869566e-05,
"loss": 1.8823,
"step": 1195
},
{
"epoch": 0.9439621152328335,
"grad_norm": 1.2222181558609009,
"learning_rate": 1.6715673193318834e-05,
"loss": 2.0189,
"step": 1196
},
{
"epoch": 0.9447513812154696,
"grad_norm": 1.1024019718170166,
"learning_rate": 1.669690093543443e-05,
"loss": 1.909,
"step": 1197
},
{
"epoch": 0.9455406471981057,
"grad_norm": 1.2238271236419678,
"learning_rate": 1.667812598500312e-05,
"loss": 1.9175,
"step": 1198
},
{
"epoch": 0.9463299131807419,
"grad_norm": 1.004940152168274,
"learning_rate": 1.6659348371815927e-05,
"loss": 1.9056,
"step": 1199
},
{
"epoch": 0.9471191791633781,
"grad_norm": 0.9510829448699951,
"learning_rate": 1.664056812566812e-05,
"loss": 1.9269,
"step": 1200
},
{
"epoch": 0.9479084451460142,
"grad_norm": 0.9175992608070374,
"learning_rate": 1.662178527635913e-05,
"loss": 1.9337,
"step": 1201
},
{
"epoch": 0.9486977111286503,
"grad_norm": 0.9384323358535767,
"learning_rate": 1.6602999853692528e-05,
"loss": 1.8387,
"step": 1202
},
{
"epoch": 0.9494869771112865,
"grad_norm": 1.3229904174804688,
"learning_rate": 1.6584211887475968e-05,
"loss": 1.9395,
"step": 1203
},
{
"epoch": 0.9502762430939227,
"grad_norm": 0.9942592978477478,
"learning_rate": 1.6565421407521134e-05,
"loss": 1.9253,
"step": 1204
},
{
"epoch": 0.9510655090765588,
"grad_norm": 1.01997709274292,
"learning_rate": 1.65466284436437e-05,
"loss": 1.9165,
"step": 1205
},
{
"epoch": 0.9518547750591949,
"grad_norm": 0.9606868028640747,
"learning_rate": 1.6527833025663294e-05,
"loss": 1.895,
"step": 1206
},
{
"epoch": 0.9526440410418311,
"grad_norm": 0.9341292381286621,
"learning_rate": 1.650903518340342e-05,
"loss": 1.9,
"step": 1207
},
{
"epoch": 0.9534333070244673,
"grad_norm": 1.058000922203064,
"learning_rate": 1.6490234946691435e-05,
"loss": 1.8823,
"step": 1208
},
{
"epoch": 0.9542225730071034,
"grad_norm": 0.9709818959236145,
"learning_rate": 1.6471432345358498e-05,
"loss": 1.9495,
"step": 1209
},
{
"epoch": 0.9550118389897395,
"grad_norm": 0.9561473727226257,
"learning_rate": 1.6452627409239523e-05,
"loss": 1.8852,
"step": 1210
},
{
"epoch": 0.9558011049723757,
"grad_norm": 0.9571905732154846,
"learning_rate": 1.6433820168173116e-05,
"loss": 1.93,
"step": 1211
},
{
"epoch": 0.9565903709550119,
"grad_norm": 1.0493736267089844,
"learning_rate": 1.6415010652001553e-05,
"loss": 1.9129,
"step": 1212
},
{
"epoch": 0.9573796369376479,
"grad_norm": 0.975974977016449,
"learning_rate": 1.6396198890570724e-05,
"loss": 1.8683,
"step": 1213
},
{
"epoch": 0.9581689029202841,
"grad_norm": 1.1953823566436768,
"learning_rate": 1.637738491373006e-05,
"loss": 1.929,
"step": 1214
},
{
"epoch": 0.9589581689029203,
"grad_norm": 0.9741194248199463,
"learning_rate": 1.6358568751332524e-05,
"loss": 1.9201,
"step": 1215
},
{
"epoch": 0.9597474348855565,
"grad_norm": 1.1271497011184692,
"learning_rate": 1.633975043323455e-05,
"loss": 1.8898,
"step": 1216
},
{
"epoch": 0.9605367008681925,
"grad_norm": 1.1122384071350098,
"learning_rate": 1.632092998929598e-05,
"loss": 1.9652,
"step": 1217
},
{
"epoch": 0.9613259668508287,
"grad_norm": 0.9283862113952637,
"learning_rate": 1.6302107449380042e-05,
"loss": 1.917,
"step": 1218
},
{
"epoch": 0.9621152328334649,
"grad_norm": 1.0600271224975586,
"learning_rate": 1.628328284335327e-05,
"loss": 1.9492,
"step": 1219
},
{
"epoch": 0.9629044988161011,
"grad_norm": 1.1159858703613281,
"learning_rate": 1.6264456201085506e-05,
"loss": 1.9413,
"step": 1220
},
{
"epoch": 0.9636937647987371,
"grad_norm": 1.1544772386550903,
"learning_rate": 1.6245627552449796e-05,
"loss": 1.939,
"step": 1221
},
{
"epoch": 0.9644830307813733,
"grad_norm": 0.9870274662971497,
"learning_rate": 1.622679692732238e-05,
"loss": 1.926,
"step": 1222
},
{
"epoch": 0.9652722967640095,
"grad_norm": 0.9417735934257507,
"learning_rate": 1.620796435558264e-05,
"loss": 1.8375,
"step": 1223
},
{
"epoch": 0.9660615627466457,
"grad_norm": 1.059479832649231,
"learning_rate": 1.618912986711304e-05,
"loss": 1.913,
"step": 1224
},
{
"epoch": 0.9668508287292817,
"grad_norm": 0.9993748068809509,
"learning_rate": 1.6170293491799083e-05,
"loss": 1.836,
"step": 1225
},
{
"epoch": 0.9676400947119179,
"grad_norm": 1.0149083137512207,
"learning_rate": 1.615145525952927e-05,
"loss": 1.9319,
"step": 1226
},
{
"epoch": 0.9684293606945541,
"grad_norm": 1.4480969905853271,
"learning_rate": 1.6132615200195044e-05,
"loss": 1.9768,
"step": 1227
},
{
"epoch": 0.9692186266771902,
"grad_norm": 1.1640135049819946,
"learning_rate": 1.611377334369076e-05,
"loss": 1.8763,
"step": 1228
},
{
"epoch": 0.9700078926598263,
"grad_norm": 1.0086321830749512,
"learning_rate": 1.6094929719913614e-05,
"loss": 1.8939,
"step": 1229
},
{
"epoch": 0.9707971586424625,
"grad_norm": 1.4108259677886963,
"learning_rate": 1.60760843587636e-05,
"loss": 1.8972,
"step": 1230
},
{
"epoch": 0.9715864246250987,
"grad_norm": 1.1569288969039917,
"learning_rate": 1.605723729014349e-05,
"loss": 1.899,
"step": 1231
},
{
"epoch": 0.9723756906077348,
"grad_norm": 1.085387110710144,
"learning_rate": 1.6038388543958734e-05,
"loss": 1.91,
"step": 1232
},
{
"epoch": 0.9731649565903709,
"grad_norm": 6.412627696990967,
"learning_rate": 1.6019538150117473e-05,
"loss": 1.8456,
"step": 1233
},
{
"epoch": 0.9739542225730071,
"grad_norm": 1.1839845180511475,
"learning_rate": 1.6000686138530452e-05,
"loss": 1.8995,
"step": 1234
},
{
"epoch": 0.9747434885556433,
"grad_norm": 1.0736435651779175,
"learning_rate": 1.598183253911098e-05,
"loss": 1.8954,
"step": 1235
},
{
"epoch": 0.9755327545382794,
"grad_norm": 1.1705999374389648,
"learning_rate": 1.5962977381774883e-05,
"loss": 1.8938,
"step": 1236
},
{
"epoch": 0.9763220205209155,
"grad_norm": 0.9498980045318604,
"learning_rate": 1.5944120696440467e-05,
"loss": 1.8688,
"step": 1237
},
{
"epoch": 0.9771112865035517,
"grad_norm": 1.0730433464050293,
"learning_rate": 1.5925262513028463e-05,
"loss": 1.9343,
"step": 1238
},
{
"epoch": 0.9779005524861878,
"grad_norm": 1.2047873735427856,
"learning_rate": 1.590640286146197e-05,
"loss": 1.8976,
"step": 1239
},
{
"epoch": 0.978689818468824,
"grad_norm": 1.1192723512649536,
"learning_rate": 1.5887541771666424e-05,
"loss": 1.8691,
"step": 1240
},
{
"epoch": 0.9794790844514601,
"grad_norm": 1.0943949222564697,
"learning_rate": 1.5868679273569543e-05,
"loss": 1.8912,
"step": 1241
},
{
"epoch": 0.9802683504340963,
"grad_norm": 1.2639052867889404,
"learning_rate": 1.5849815397101276e-05,
"loss": 1.9344,
"step": 1242
},
{
"epoch": 0.9810576164167324,
"grad_norm": 0.9903395175933838,
"learning_rate": 1.5830950172193756e-05,
"loss": 1.8876,
"step": 1243
},
{
"epoch": 0.9818468823993686,
"grad_norm": 1.1683999300003052,
"learning_rate": 1.5812083628781265e-05,
"loss": 1.8467,
"step": 1244
},
{
"epoch": 0.9826361483820047,
"grad_norm": 0.9717577695846558,
"learning_rate": 1.5793215796800167e-05,
"loss": 1.9227,
"step": 1245
},
{
"epoch": 0.9834254143646409,
"grad_norm": 1.2117033004760742,
"learning_rate": 1.5774346706188886e-05,
"loss": 1.9106,
"step": 1246
},
{
"epoch": 0.984214680347277,
"grad_norm": 1.0700875520706177,
"learning_rate": 1.5755476386887828e-05,
"loss": 1.9613,
"step": 1247
},
{
"epoch": 0.9850039463299132,
"grad_norm": 0.9804224967956543,
"learning_rate": 1.5736604868839355e-05,
"loss": 1.8936,
"step": 1248
},
{
"epoch": 0.9857932123125493,
"grad_norm": 0.9192711710929871,
"learning_rate": 1.5717732181987723e-05,
"loss": 1.9023,
"step": 1249
},
{
"epoch": 0.9865824782951855,
"grad_norm": 1.2049267292022705,
"learning_rate": 1.5698858356279057e-05,
"loss": 1.9268,
"step": 1250
},
{
"epoch": 0.9873717442778216,
"grad_norm": 1.061391830444336,
"learning_rate": 1.5679983421661277e-05,
"loss": 1.8722,
"step": 1251
},
{
"epoch": 0.9881610102604578,
"grad_norm": 1.2003884315490723,
"learning_rate": 1.5661107408084073e-05,
"loss": 1.8623,
"step": 1252
},
{
"epoch": 0.988950276243094,
"grad_norm": 1.0538649559020996,
"learning_rate": 1.564223034549883e-05,
"loss": 1.8866,
"step": 1253
},
{
"epoch": 0.98973954222573,
"grad_norm": 0.9783664345741272,
"learning_rate": 1.5623352263858622e-05,
"loss": 1.8931,
"step": 1254
},
{
"epoch": 0.9905288082083662,
"grad_norm": 1.0133979320526123,
"learning_rate": 1.5604473193118124e-05,
"loss": 1.8642,
"step": 1255
},
{
"epoch": 0.9913180741910024,
"grad_norm": 0.913181722164154,
"learning_rate": 1.5585593163233572e-05,
"loss": 1.8951,
"step": 1256
},
{
"epoch": 0.9921073401736386,
"grad_norm": 0.8568845987319946,
"learning_rate": 1.5566712204162744e-05,
"loss": 1.8658,
"step": 1257
},
{
"epoch": 0.9928966061562746,
"grad_norm": 0.9925752282142639,
"learning_rate": 1.5547830345864887e-05,
"loss": 1.9071,
"step": 1258
},
{
"epoch": 0.9936858721389108,
"grad_norm": 1.0075455904006958,
"learning_rate": 1.552894761830066e-05,
"loss": 1.8626,
"step": 1259
},
{
"epoch": 0.994475138121547,
"grad_norm": 0.9527983665466309,
"learning_rate": 1.551006405143212e-05,
"loss": 1.8863,
"step": 1260
},
{
"epoch": 0.9952644041041832,
"grad_norm": 0.9452120065689087,
"learning_rate": 1.5491179675222645e-05,
"loss": 1.8764,
"step": 1261
},
{
"epoch": 0.9960536700868192,
"grad_norm": 1.0155888795852661,
"learning_rate": 1.5472294519636906e-05,
"loss": 1.9361,
"step": 1262
},
{
"epoch": 0.9968429360694554,
"grad_norm": 0.955237627029419,
"learning_rate": 1.54534086146408e-05,
"loss": 1.8533,
"step": 1263
},
{
"epoch": 0.9976322020520916,
"grad_norm": 1.4082919359207153,
"learning_rate": 1.5434521990201417e-05,
"loss": 1.8902,
"step": 1264
},
{
"epoch": 0.9984214680347278,
"grad_norm": 0.9780258536338806,
"learning_rate": 1.5415634676287e-05,
"loss": 1.8814,
"step": 1265
},
{
"epoch": 0.9992107340173638,
"grad_norm": 1.0558334589004517,
"learning_rate": 1.5396746702866863e-05,
"loss": 1.9069,
"step": 1266
},
{
"epoch": 1.0,
"grad_norm": 1.1238291263580322,
"learning_rate": 1.5377858099911384e-05,
"loss": 1.9228,
"step": 1267
},
{
"epoch": 1.000789265982636,
"grad_norm": 1.2115720510482788,
"learning_rate": 1.5358968897391935e-05,
"loss": 2.6076,
"step": 1268
}
],
"logging_steps": 1,
"max_steps": 2534,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 634,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.619513039696586e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}