sage-8b / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9121164846593866,
"eval_steps": 500,
"global_step": 7000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020800832033281333,
"grad_norm": 2.962590217590332,
"learning_rate": 2.5000000000000004e-07,
"loss": 3.7411,
"step": 5
},
{
"epoch": 0.004160166406656267,
"grad_norm": 2.7364747524261475,
"learning_rate": 5.000000000000001e-07,
"loss": 3.8199,
"step": 10
},
{
"epoch": 0.0062402496099844,
"grad_norm": 2.7037243843078613,
"learning_rate": 7.5e-07,
"loss": 3.6922,
"step": 15
},
{
"epoch": 0.008320332813312533,
"grad_norm": 2.5150601863861084,
"learning_rate": 1.0000000000000002e-06,
"loss": 3.6965,
"step": 20
},
{
"epoch": 0.010400416016640665,
"grad_norm": 2.7693450450897217,
"learning_rate": 1.25e-06,
"loss": 3.7636,
"step": 25
},
{
"epoch": 0.0124804992199688,
"grad_norm": 2.946901321411133,
"learning_rate": 1.5e-06,
"loss": 3.5892,
"step": 30
},
{
"epoch": 0.014560582423296931,
"grad_norm": 2.705421209335327,
"learning_rate": 1.7500000000000002e-06,
"loss": 3.6004,
"step": 35
},
{
"epoch": 0.016640665626625067,
"grad_norm": 2.718595504760742,
"learning_rate": 2.0000000000000003e-06,
"loss": 3.4766,
"step": 40
},
{
"epoch": 0.0187207488299532,
"grad_norm": 3.268495798110962,
"learning_rate": 2.25e-06,
"loss": 3.4046,
"step": 45
},
{
"epoch": 0.02080083203328133,
"grad_norm": 3.0189549922943115,
"learning_rate": 2.5e-06,
"loss": 3.4256,
"step": 50
},
{
"epoch": 0.022880915236609463,
"grad_norm": 3.067143201828003,
"learning_rate": 2.7500000000000004e-06,
"loss": 3.4206,
"step": 55
},
{
"epoch": 0.0249609984399376,
"grad_norm": 2.973191261291504,
"learning_rate": 3e-06,
"loss": 3.2975,
"step": 60
},
{
"epoch": 0.02704108164326573,
"grad_norm": 3.4896562099456787,
"learning_rate": 3.2500000000000002e-06,
"loss": 3.1883,
"step": 65
},
{
"epoch": 0.029121164846593862,
"grad_norm": 2.4497926235198975,
"learning_rate": 3.5000000000000004e-06,
"loss": 3.0682,
"step": 70
},
{
"epoch": 0.031201248049921998,
"grad_norm": 3.044771909713745,
"learning_rate": 3.75e-06,
"loss": 2.9224,
"step": 75
},
{
"epoch": 0.033281331253250133,
"grad_norm": 2.6263840198516846,
"learning_rate": 4.000000000000001e-06,
"loss": 2.9386,
"step": 80
},
{
"epoch": 0.03536141445657826,
"grad_norm": 1.6827579736709595,
"learning_rate": 4.250000000000001e-06,
"loss": 2.8187,
"step": 85
},
{
"epoch": 0.0374414976599064,
"grad_norm": 1.493446707725525,
"learning_rate": 4.5e-06,
"loss": 2.7529,
"step": 90
},
{
"epoch": 0.039521580863234526,
"grad_norm": 1.1336227655410767,
"learning_rate": 4.75e-06,
"loss": 2.7076,
"step": 95
},
{
"epoch": 0.04160166406656266,
"grad_norm": 0.8525938391685486,
"learning_rate": 5e-06,
"loss": 2.678,
"step": 100
},
{
"epoch": 0.0436817472698908,
"grad_norm": 0.9276126027107239,
"learning_rate": 5.25e-06,
"loss": 2.6097,
"step": 105
},
{
"epoch": 0.045761830473218926,
"grad_norm": 0.7816782593727112,
"learning_rate": 5.500000000000001e-06,
"loss": 2.614,
"step": 110
},
{
"epoch": 0.04784191367654706,
"grad_norm": 0.8164133429527283,
"learning_rate": 5.750000000000001e-06,
"loss": 2.6019,
"step": 115
},
{
"epoch": 0.0499219968798752,
"grad_norm": 0.555113673210144,
"learning_rate": 6e-06,
"loss": 2.5088,
"step": 120
},
{
"epoch": 0.052002080083203325,
"grad_norm": 0.45174235105514526,
"learning_rate": 6.25e-06,
"loss": 2.5117,
"step": 125
},
{
"epoch": 0.05408216328653146,
"grad_norm": 0.7130635380744934,
"learning_rate": 6.5000000000000004e-06,
"loss": 2.5214,
"step": 130
},
{
"epoch": 0.056162246489859596,
"grad_norm": 0.5437763333320618,
"learning_rate": 6.750000000000001e-06,
"loss": 2.5454,
"step": 135
},
{
"epoch": 0.058242329693187725,
"grad_norm": 0.48792022466659546,
"learning_rate": 7.000000000000001e-06,
"loss": 2.5159,
"step": 140
},
{
"epoch": 0.06032241289651586,
"grad_norm": 0.7019992470741272,
"learning_rate": 7.25e-06,
"loss": 2.5082,
"step": 145
},
{
"epoch": 0.062402496099843996,
"grad_norm": 0.5933384895324707,
"learning_rate": 7.5e-06,
"loss": 2.5135,
"step": 150
},
{
"epoch": 0.06448257930317212,
"grad_norm": 0.4854763150215149,
"learning_rate": 7.75e-06,
"loss": 2.4585,
"step": 155
},
{
"epoch": 0.06656266250650027,
"grad_norm": 0.4506765902042389,
"learning_rate": 8.000000000000001e-06,
"loss": 2.5187,
"step": 160
},
{
"epoch": 0.0686427457098284,
"grad_norm": 0.6778927445411682,
"learning_rate": 8.25e-06,
"loss": 2.4645,
"step": 165
},
{
"epoch": 0.07072282891315652,
"grad_norm": 0.6200412511825562,
"learning_rate": 8.500000000000002e-06,
"loss": 2.5131,
"step": 170
},
{
"epoch": 0.07280291211648465,
"grad_norm": 0.6752357482910156,
"learning_rate": 8.75e-06,
"loss": 2.5049,
"step": 175
},
{
"epoch": 0.0748829953198128,
"grad_norm": 0.5805301070213318,
"learning_rate": 9e-06,
"loss": 2.5014,
"step": 180
},
{
"epoch": 0.07696307852314092,
"grad_norm": 1.1979331970214844,
"learning_rate": 9.25e-06,
"loss": 2.4334,
"step": 185
},
{
"epoch": 0.07904316172646905,
"grad_norm": 0.8396961688995361,
"learning_rate": 9.5e-06,
"loss": 2.4531,
"step": 190
},
{
"epoch": 0.0811232449297972,
"grad_norm": 0.6947128772735596,
"learning_rate": 9.750000000000002e-06,
"loss": 2.4758,
"step": 195
},
{
"epoch": 0.08320332813312532,
"grad_norm": 0.46556374430656433,
"learning_rate": 1e-05,
"loss": 2.4426,
"step": 200
},
{
"epoch": 0.08528341133645345,
"grad_norm": 0.6042707562446594,
"learning_rate": 1.025e-05,
"loss": 2.3977,
"step": 205
},
{
"epoch": 0.0873634945397816,
"grad_norm": 0.5161399245262146,
"learning_rate": 1.05e-05,
"loss": 2.4229,
"step": 210
},
{
"epoch": 0.08944357774310972,
"grad_norm": 0.9716496467590332,
"learning_rate": 1.075e-05,
"loss": 2.4166,
"step": 215
},
{
"epoch": 0.09152366094643785,
"grad_norm": 0.754511296749115,
"learning_rate": 1.1000000000000001e-05,
"loss": 2.4372,
"step": 220
},
{
"epoch": 0.093603744149766,
"grad_norm": 0.6887955665588379,
"learning_rate": 1.125e-05,
"loss": 2.4147,
"step": 225
},
{
"epoch": 0.09568382735309412,
"grad_norm": 0.7467107772827148,
"learning_rate": 1.1500000000000002e-05,
"loss": 2.4222,
"step": 230
},
{
"epoch": 0.09776391055642225,
"grad_norm": 0.945798397064209,
"learning_rate": 1.175e-05,
"loss": 2.4285,
"step": 235
},
{
"epoch": 0.0998439937597504,
"grad_norm": 0.6666924953460693,
"learning_rate": 1.2e-05,
"loss": 2.3843,
"step": 240
},
{
"epoch": 0.10192407696307852,
"grad_norm": 0.6966888308525085,
"learning_rate": 1.225e-05,
"loss": 2.4212,
"step": 245
},
{
"epoch": 0.10400416016640665,
"grad_norm": 0.6751601099967957,
"learning_rate": 1.25e-05,
"loss": 2.3562,
"step": 250
},
{
"epoch": 0.1060842433697348,
"grad_norm": 0.7203898429870605,
"learning_rate": 1.2750000000000002e-05,
"loss": 2.3661,
"step": 255
},
{
"epoch": 0.10816432657306292,
"grad_norm": 0.8724287748336792,
"learning_rate": 1.3000000000000001e-05,
"loss": 2.4047,
"step": 260
},
{
"epoch": 0.11024440977639105,
"grad_norm": 0.8295998573303223,
"learning_rate": 1.3250000000000002e-05,
"loss": 2.387,
"step": 265
},
{
"epoch": 0.11232449297971919,
"grad_norm": 0.7787670493125916,
"learning_rate": 1.3500000000000001e-05,
"loss": 2.3892,
"step": 270
},
{
"epoch": 0.11440457618304732,
"grad_norm": 0.5952211022377014,
"learning_rate": 1.3750000000000002e-05,
"loss": 2.3992,
"step": 275
},
{
"epoch": 0.11648465938637545,
"grad_norm": 0.8523284792900085,
"learning_rate": 1.4000000000000001e-05,
"loss": 2.3621,
"step": 280
},
{
"epoch": 0.11856474258970359,
"grad_norm": 0.7084488868713379,
"learning_rate": 1.4249999999999999e-05,
"loss": 2.3823,
"step": 285
},
{
"epoch": 0.12064482579303172,
"grad_norm": 0.7749157547950745,
"learning_rate": 1.45e-05,
"loss": 2.3772,
"step": 290
},
{
"epoch": 0.12272490899635985,
"grad_norm": 0.6760996580123901,
"learning_rate": 1.475e-05,
"loss": 2.3897,
"step": 295
},
{
"epoch": 0.12480499219968799,
"grad_norm": 0.7566614151000977,
"learning_rate": 1.5e-05,
"loss": 2.3939,
"step": 300
},
{
"epoch": 0.12688507540301613,
"grad_norm": 0.7718506455421448,
"learning_rate": 1.525e-05,
"loss": 2.4124,
"step": 305
},
{
"epoch": 0.12896515860634425,
"grad_norm": 0.6160978078842163,
"learning_rate": 1.55e-05,
"loss": 2.4009,
"step": 310
},
{
"epoch": 0.1310452418096724,
"grad_norm": 1.0341984033584595,
"learning_rate": 1.575e-05,
"loss": 2.3489,
"step": 315
},
{
"epoch": 0.13312532501300053,
"grad_norm": 0.7184290289878845,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.3588,
"step": 320
},
{
"epoch": 0.13520540821632865,
"grad_norm": 0.6868571639060974,
"learning_rate": 1.6250000000000002e-05,
"loss": 2.427,
"step": 325
},
{
"epoch": 0.1372854914196568,
"grad_norm": 0.837578535079956,
"learning_rate": 1.65e-05,
"loss": 2.3661,
"step": 330
},
{
"epoch": 0.1393655746229849,
"grad_norm": 0.6868174076080322,
"learning_rate": 1.675e-05,
"loss": 2.3912,
"step": 335
},
{
"epoch": 0.14144565782631305,
"grad_norm": 0.625311017036438,
"learning_rate": 1.7000000000000003e-05,
"loss": 2.3164,
"step": 340
},
{
"epoch": 0.1435257410296412,
"grad_norm": 0.6201218962669373,
"learning_rate": 1.725e-05,
"loss": 2.3786,
"step": 345
},
{
"epoch": 0.1456058242329693,
"grad_norm": 0.7219041585922241,
"learning_rate": 1.75e-05,
"loss": 2.3528,
"step": 350
},
{
"epoch": 0.14768590743629745,
"grad_norm": 0.6239330172538757,
"learning_rate": 1.775e-05,
"loss": 2.3686,
"step": 355
},
{
"epoch": 0.1497659906396256,
"grad_norm": 0.7319772839546204,
"learning_rate": 1.8e-05,
"loss": 2.3694,
"step": 360
},
{
"epoch": 0.1518460738429537,
"grad_norm": 0.6200202703475952,
"learning_rate": 1.825e-05,
"loss": 2.3627,
"step": 365
},
{
"epoch": 0.15392615704628185,
"grad_norm": 0.8407759666442871,
"learning_rate": 1.85e-05,
"loss": 2.3931,
"step": 370
},
{
"epoch": 0.15600624024961,
"grad_norm": 1.3796571493148804,
"learning_rate": 1.8750000000000002e-05,
"loss": 2.3787,
"step": 375
},
{
"epoch": 0.1580863234529381,
"grad_norm": 0.6906828880310059,
"learning_rate": 1.9e-05,
"loss": 2.3771,
"step": 380
},
{
"epoch": 0.16016640665626625,
"grad_norm": 0.6497045159339905,
"learning_rate": 1.925e-05,
"loss": 2.3581,
"step": 385
},
{
"epoch": 0.1622464898595944,
"grad_norm": 0.5141230821609497,
"learning_rate": 1.9500000000000003e-05,
"loss": 2.3419,
"step": 390
},
{
"epoch": 0.1643265730629225,
"grad_norm": 0.9040182828903198,
"learning_rate": 1.9750000000000002e-05,
"loss": 2.3513,
"step": 395
},
{
"epoch": 0.16640665626625065,
"grad_norm": 0.7217531204223633,
"learning_rate": 2e-05,
"loss": 2.3448,
"step": 400
},
{
"epoch": 0.1684867394695788,
"grad_norm": 0.8354098200798035,
"learning_rate": 2.025e-05,
"loss": 2.3951,
"step": 405
},
{
"epoch": 0.1705668226729069,
"grad_norm": 0.5832729935646057,
"learning_rate": 2.05e-05,
"loss": 2.323,
"step": 410
},
{
"epoch": 0.17264690587623505,
"grad_norm": 0.6963520050048828,
"learning_rate": 2.075e-05,
"loss": 2.3438,
"step": 415
},
{
"epoch": 0.1747269890795632,
"grad_norm": 1.1836349964141846,
"learning_rate": 2.1e-05,
"loss": 2.3125,
"step": 420
},
{
"epoch": 0.1768070722828913,
"grad_norm": 0.7848880887031555,
"learning_rate": 2.125e-05,
"loss": 2.401,
"step": 425
},
{
"epoch": 0.17888715548621945,
"grad_norm": 0.757739782333374,
"learning_rate": 2.15e-05,
"loss": 2.3487,
"step": 430
},
{
"epoch": 0.1809672386895476,
"grad_norm": 0.5723095536231995,
"learning_rate": 2.175e-05,
"loss": 2.346,
"step": 435
},
{
"epoch": 0.1830473218928757,
"grad_norm": 0.7125130891799927,
"learning_rate": 2.2000000000000003e-05,
"loss": 2.3236,
"step": 440
},
{
"epoch": 0.18512740509620385,
"grad_norm": 0.6598831415176392,
"learning_rate": 2.2250000000000002e-05,
"loss": 2.3828,
"step": 445
},
{
"epoch": 0.187207488299532,
"grad_norm": 0.6140534281730652,
"learning_rate": 2.25e-05,
"loss": 2.3335,
"step": 450
},
{
"epoch": 0.1892875715028601,
"grad_norm": 0.6711081266403198,
"learning_rate": 2.275e-05,
"loss": 2.3336,
"step": 455
},
{
"epoch": 0.19136765470618824,
"grad_norm": 0.6882185339927673,
"learning_rate": 2.3000000000000003e-05,
"loss": 2.3423,
"step": 460
},
{
"epoch": 0.1934477379095164,
"grad_norm": 0.7247514724731445,
"learning_rate": 2.3250000000000003e-05,
"loss": 2.3326,
"step": 465
},
{
"epoch": 0.1955278211128445,
"grad_norm": 0.6698940992355347,
"learning_rate": 2.35e-05,
"loss": 2.2953,
"step": 470
},
{
"epoch": 0.19760790431617264,
"grad_norm": 0.7808359265327454,
"learning_rate": 2.375e-05,
"loss": 2.3261,
"step": 475
},
{
"epoch": 0.1996879875195008,
"grad_norm": 0.7009552717208862,
"learning_rate": 2.4e-05,
"loss": 2.3248,
"step": 480
},
{
"epoch": 0.2017680707228289,
"grad_norm": 0.6713016033172607,
"learning_rate": 2.425e-05,
"loss": 2.3566,
"step": 485
},
{
"epoch": 0.20384815392615704,
"grad_norm": 0.784695565700531,
"learning_rate": 2.45e-05,
"loss": 2.3474,
"step": 490
},
{
"epoch": 0.2059282371294852,
"grad_norm": 0.7487632632255554,
"learning_rate": 2.4750000000000002e-05,
"loss": 2.3279,
"step": 495
},
{
"epoch": 0.2080083203328133,
"grad_norm": 0.5700154304504395,
"learning_rate": 2.5e-05,
"loss": 2.3325,
"step": 500
},
{
"epoch": 0.21008840353614144,
"grad_norm": 0.7952355146408081,
"learning_rate": 2.525e-05,
"loss": 2.3383,
"step": 505
},
{
"epoch": 0.2121684867394696,
"grad_norm": 0.6064152717590332,
"learning_rate": 2.5500000000000003e-05,
"loss": 2.2919,
"step": 510
},
{
"epoch": 0.2142485699427977,
"grad_norm": 0.6273530721664429,
"learning_rate": 2.5750000000000002e-05,
"loss": 2.3059,
"step": 515
},
{
"epoch": 0.21632865314612584,
"grad_norm": 0.683093786239624,
"learning_rate": 2.6000000000000002e-05,
"loss": 2.2865,
"step": 520
},
{
"epoch": 0.21840873634945399,
"grad_norm": 0.8195337653160095,
"learning_rate": 2.625e-05,
"loss": 2.3474,
"step": 525
},
{
"epoch": 0.2204888195527821,
"grad_norm": 0.7622310519218445,
"learning_rate": 2.6500000000000004e-05,
"loss": 2.3112,
"step": 530
},
{
"epoch": 0.22256890275611024,
"grad_norm": 0.7957525253295898,
"learning_rate": 2.6750000000000003e-05,
"loss": 2.3254,
"step": 535
},
{
"epoch": 0.22464898595943839,
"grad_norm": 0.5769463181495667,
"learning_rate": 2.7000000000000002e-05,
"loss": 2.3433,
"step": 540
},
{
"epoch": 0.2267290691627665,
"grad_norm": 0.6178082823753357,
"learning_rate": 2.725e-05,
"loss": 2.3502,
"step": 545
},
{
"epoch": 0.22880915236609464,
"grad_norm": 0.633885383605957,
"learning_rate": 2.7500000000000004e-05,
"loss": 2.3422,
"step": 550
},
{
"epoch": 0.23088923556942278,
"grad_norm": 0.48240986466407776,
"learning_rate": 2.7750000000000004e-05,
"loss": 2.2952,
"step": 555
},
{
"epoch": 0.2329693187727509,
"grad_norm": 0.7513511180877686,
"learning_rate": 2.8000000000000003e-05,
"loss": 2.2972,
"step": 560
},
{
"epoch": 0.23504940197607904,
"grad_norm": 0.5911456942558289,
"learning_rate": 2.825e-05,
"loss": 2.3214,
"step": 565
},
{
"epoch": 0.23712948517940718,
"grad_norm": 0.6111375689506531,
"learning_rate": 2.8499999999999998e-05,
"loss": 2.3611,
"step": 570
},
{
"epoch": 0.2392095683827353,
"grad_norm": 0.6953846216201782,
"learning_rate": 2.8749999999999997e-05,
"loss": 2.2539,
"step": 575
},
{
"epoch": 0.24128965158606344,
"grad_norm": 0.5785839557647705,
"learning_rate": 2.9e-05,
"loss": 2.3212,
"step": 580
},
{
"epoch": 0.24336973478939158,
"grad_norm": 0.6091140508651733,
"learning_rate": 2.925e-05,
"loss": 2.2971,
"step": 585
},
{
"epoch": 0.2454498179927197,
"grad_norm": 0.5193526744842529,
"learning_rate": 2.95e-05,
"loss": 2.293,
"step": 590
},
{
"epoch": 0.24752990119604784,
"grad_norm": 0.7062333226203918,
"learning_rate": 2.975e-05,
"loss": 2.3222,
"step": 595
},
{
"epoch": 0.24960998439937598,
"grad_norm": 1.2192779779434204,
"learning_rate": 3e-05,
"loss": 2.3591,
"step": 600
},
{
"epoch": 0.2516900676027041,
"grad_norm": 0.6763813495635986,
"learning_rate": 3.025e-05,
"loss": 2.2771,
"step": 605
},
{
"epoch": 0.25377015080603227,
"grad_norm": 0.6813860535621643,
"learning_rate": 3.05e-05,
"loss": 2.3141,
"step": 610
},
{
"epoch": 0.25585023400936036,
"grad_norm": 0.8562334179878235,
"learning_rate": 3.075e-05,
"loss": 2.3094,
"step": 615
},
{
"epoch": 0.2579303172126885,
"grad_norm": 0.7040572762489319,
"learning_rate": 3.1e-05,
"loss": 2.3033,
"step": 620
},
{
"epoch": 0.26001040041601664,
"grad_norm": 0.6882712841033936,
"learning_rate": 3.125e-05,
"loss": 2.2834,
"step": 625
},
{
"epoch": 0.2620904836193448,
"grad_norm": 0.8077874779701233,
"learning_rate": 3.15e-05,
"loss": 2.3071,
"step": 630
},
{
"epoch": 0.2641705668226729,
"grad_norm": 0.6731362342834473,
"learning_rate": 3.175e-05,
"loss": 2.3127,
"step": 635
},
{
"epoch": 0.26625065002600107,
"grad_norm": 0.5459744334220886,
"learning_rate": 3.2000000000000005e-05,
"loss": 2.3146,
"step": 640
},
{
"epoch": 0.26833073322932915,
"grad_norm": 0.6492711901664734,
"learning_rate": 3.2250000000000005e-05,
"loss": 2.3144,
"step": 645
},
{
"epoch": 0.2704108164326573,
"grad_norm": 1.2923798561096191,
"learning_rate": 3.2500000000000004e-05,
"loss": 2.3319,
"step": 650
},
{
"epoch": 0.27249089963598544,
"grad_norm": 0.6995902061462402,
"learning_rate": 3.275e-05,
"loss": 2.3251,
"step": 655
},
{
"epoch": 0.2745709828393136,
"grad_norm": 0.598090410232544,
"learning_rate": 3.3e-05,
"loss": 2.3024,
"step": 660
},
{
"epoch": 0.2766510660426417,
"grad_norm": 0.5931279063224792,
"learning_rate": 3.325e-05,
"loss": 2.289,
"step": 665
},
{
"epoch": 0.2787311492459698,
"grad_norm": 0.5426341891288757,
"learning_rate": 3.35e-05,
"loss": 2.3031,
"step": 670
},
{
"epoch": 0.28081123244929795,
"grad_norm": 0.6066926717758179,
"learning_rate": 3.375000000000001e-05,
"loss": 2.3116,
"step": 675
},
{
"epoch": 0.2828913156526261,
"grad_norm": 0.7575869560241699,
"learning_rate": 3.4000000000000007e-05,
"loss": 2.256,
"step": 680
},
{
"epoch": 0.28497139885595424,
"grad_norm": 0.6038545370101929,
"learning_rate": 3.4250000000000006e-05,
"loss": 2.3257,
"step": 685
},
{
"epoch": 0.2870514820592824,
"grad_norm": 0.5261275768280029,
"learning_rate": 3.45e-05,
"loss": 2.287,
"step": 690
},
{
"epoch": 0.2891315652626105,
"grad_norm": 0.6376118659973145,
"learning_rate": 3.475e-05,
"loss": 2.2707,
"step": 695
},
{
"epoch": 0.2912116484659386,
"grad_norm": 0.6691327095031738,
"learning_rate": 3.5e-05,
"loss": 2.3172,
"step": 700
},
{
"epoch": 0.29329173166926675,
"grad_norm": 0.8731220364570618,
"learning_rate": 3.525e-05,
"loss": 2.2722,
"step": 705
},
{
"epoch": 0.2953718148725949,
"grad_norm": 0.6707152724266052,
"learning_rate": 3.55e-05,
"loss": 2.3535,
"step": 710
},
{
"epoch": 0.29745189807592304,
"grad_norm": 0.6515153646469116,
"learning_rate": 3.575e-05,
"loss": 2.297,
"step": 715
},
{
"epoch": 0.2995319812792512,
"grad_norm": 0.5436397790908813,
"learning_rate": 3.6e-05,
"loss": 2.3256,
"step": 720
},
{
"epoch": 0.3016120644825793,
"grad_norm": 0.4850907027721405,
"learning_rate": 3.625e-05,
"loss": 2.2774,
"step": 725
},
{
"epoch": 0.3036921476859074,
"grad_norm": 0.559877872467041,
"learning_rate": 3.65e-05,
"loss": 2.2895,
"step": 730
},
{
"epoch": 0.30577223088923555,
"grad_norm": 0.6224697232246399,
"learning_rate": 3.675e-05,
"loss": 2.2715,
"step": 735
},
{
"epoch": 0.3078523140925637,
"grad_norm": 0.5158293843269348,
"learning_rate": 3.7e-05,
"loss": 2.2788,
"step": 740
},
{
"epoch": 0.30993239729589184,
"grad_norm": 0.6136394143104553,
"learning_rate": 3.7250000000000004e-05,
"loss": 2.3017,
"step": 745
},
{
"epoch": 0.31201248049922,
"grad_norm": 0.6287189722061157,
"learning_rate": 3.7500000000000003e-05,
"loss": 2.2602,
"step": 750
},
{
"epoch": 0.3140925637025481,
"grad_norm": 0.6049214601516724,
"learning_rate": 3.775e-05,
"loss": 2.2554,
"step": 755
},
{
"epoch": 0.3161726469058762,
"grad_norm": 0.7491621375083923,
"learning_rate": 3.8e-05,
"loss": 2.2689,
"step": 760
},
{
"epoch": 0.31825273010920435,
"grad_norm": 0.6048611402511597,
"learning_rate": 3.825e-05,
"loss": 2.2535,
"step": 765
},
{
"epoch": 0.3203328133125325,
"grad_norm": 0.9862955808639526,
"learning_rate": 3.85e-05,
"loss": 2.2708,
"step": 770
},
{
"epoch": 0.32241289651586064,
"grad_norm": 0.7605366706848145,
"learning_rate": 3.875e-05,
"loss": 2.2885,
"step": 775
},
{
"epoch": 0.3244929797191888,
"grad_norm": 1.3617628812789917,
"learning_rate": 3.9000000000000006e-05,
"loss": 2.2877,
"step": 780
},
{
"epoch": 0.3265730629225169,
"grad_norm": 0.7338688373565674,
"learning_rate": 3.9250000000000005e-05,
"loss": 2.3489,
"step": 785
},
{
"epoch": 0.328653146125845,
"grad_norm": 0.8271191716194153,
"learning_rate": 3.9500000000000005e-05,
"loss": 2.2867,
"step": 790
},
{
"epoch": 0.33073322932917315,
"grad_norm": 0.6508281230926514,
"learning_rate": 3.9750000000000004e-05,
"loss": 2.2863,
"step": 795
},
{
"epoch": 0.3328133125325013,
"grad_norm": 0.635067880153656,
"learning_rate": 4e-05,
"loss": 2.2428,
"step": 800
},
{
"epoch": 0.33489339573582944,
"grad_norm": 0.6569282412528992,
"learning_rate": 4.025e-05,
"loss": 2.2202,
"step": 805
},
{
"epoch": 0.3369734789391576,
"grad_norm": 0.529431164264679,
"learning_rate": 4.05e-05,
"loss": 2.2532,
"step": 810
},
{
"epoch": 0.3390535621424857,
"grad_norm": 0.5580635070800781,
"learning_rate": 4.075e-05,
"loss": 2.2774,
"step": 815
},
{
"epoch": 0.3411336453458138,
"grad_norm": 0.794660210609436,
"learning_rate": 4.1e-05,
"loss": 2.3053,
"step": 820
},
{
"epoch": 0.34321372854914195,
"grad_norm": 0.5378262996673584,
"learning_rate": 4.125e-05,
"loss": 2.3062,
"step": 825
},
{
"epoch": 0.3452938117524701,
"grad_norm": 0.660877525806427,
"learning_rate": 4.15e-05,
"loss": 2.2977,
"step": 830
},
{
"epoch": 0.34737389495579823,
"grad_norm": 0.6711246371269226,
"learning_rate": 4.175e-05,
"loss": 2.2724,
"step": 835
},
{
"epoch": 0.3494539781591264,
"grad_norm": 0.5555285215377808,
"learning_rate": 4.2e-05,
"loss": 2.2536,
"step": 840
},
{
"epoch": 0.3515340613624545,
"grad_norm": 0.5838858485221863,
"learning_rate": 4.2250000000000004e-05,
"loss": 2.2838,
"step": 845
},
{
"epoch": 0.3536141445657826,
"grad_norm": 0.8371697068214417,
"learning_rate": 4.25e-05,
"loss": 2.2882,
"step": 850
},
{
"epoch": 0.35569422776911075,
"grad_norm": 0.6019457578659058,
"learning_rate": 4.275e-05,
"loss": 2.2534,
"step": 855
},
{
"epoch": 0.3577743109724389,
"grad_norm": 0.5931807160377502,
"learning_rate": 4.3e-05,
"loss": 2.2513,
"step": 860
},
{
"epoch": 0.35985439417576703,
"grad_norm": 0.6282745003700256,
"learning_rate": 4.325e-05,
"loss": 2.2739,
"step": 865
},
{
"epoch": 0.3619344773790952,
"grad_norm": 0.605859100818634,
"learning_rate": 4.35e-05,
"loss": 2.2923,
"step": 870
},
{
"epoch": 0.3640145605824233,
"grad_norm": 0.5225040912628174,
"learning_rate": 4.375e-05,
"loss": 2.2917,
"step": 875
},
{
"epoch": 0.3660946437857514,
"grad_norm": 0.638031005859375,
"learning_rate": 4.4000000000000006e-05,
"loss": 2.3202,
"step": 880
},
{
"epoch": 0.36817472698907955,
"grad_norm": 0.5370813608169556,
"learning_rate": 4.4250000000000005e-05,
"loss": 2.2645,
"step": 885
},
{
"epoch": 0.3702548101924077,
"grad_norm": 0.5657123327255249,
"learning_rate": 4.4500000000000004e-05,
"loss": 2.2923,
"step": 890
},
{
"epoch": 0.37233489339573583,
"grad_norm": 0.7133671045303345,
"learning_rate": 4.4750000000000004e-05,
"loss": 2.2233,
"step": 895
},
{
"epoch": 0.374414976599064,
"grad_norm": 0.7067397236824036,
"learning_rate": 4.5e-05,
"loss": 2.2615,
"step": 900
},
{
"epoch": 0.3764950598023921,
"grad_norm": 0.5847836136817932,
"learning_rate": 4.525e-05,
"loss": 2.2535,
"step": 905
},
{
"epoch": 0.3785751430057202,
"grad_norm": 0.616258442401886,
"learning_rate": 4.55e-05,
"loss": 2.2308,
"step": 910
},
{
"epoch": 0.38065522620904835,
"grad_norm": 0.5688422918319702,
"learning_rate": 4.575e-05,
"loss": 2.2858,
"step": 915
},
{
"epoch": 0.3827353094123765,
"grad_norm": 0.7827875018119812,
"learning_rate": 4.600000000000001e-05,
"loss": 2.2429,
"step": 920
},
{
"epoch": 0.38481539261570463,
"grad_norm": 0.6340644359588623,
"learning_rate": 4.6250000000000006e-05,
"loss": 2.2596,
"step": 925
},
{
"epoch": 0.3868954758190328,
"grad_norm": 0.6311989426612854,
"learning_rate": 4.6500000000000005e-05,
"loss": 2.2857,
"step": 930
},
{
"epoch": 0.3889755590223609,
"grad_norm": 0.6470209956169128,
"learning_rate": 4.6750000000000005e-05,
"loss": 2.252,
"step": 935
},
{
"epoch": 0.391055642225689,
"grad_norm": 0.7288528084754944,
"learning_rate": 4.7e-05,
"loss": 2.2766,
"step": 940
},
{
"epoch": 0.39313572542901715,
"grad_norm": 0.5812355875968933,
"learning_rate": 4.7249999999999997e-05,
"loss": 2.3149,
"step": 945
},
{
"epoch": 0.3952158086323453,
"grad_norm": 0.6074076294898987,
"learning_rate": 4.75e-05,
"loss": 2.2789,
"step": 950
},
{
"epoch": 0.39729589183567343,
"grad_norm": 0.6139565110206604,
"learning_rate": 4.775e-05,
"loss": 2.3189,
"step": 955
},
{
"epoch": 0.3993759750390016,
"grad_norm": 0.7194942235946655,
"learning_rate": 4.8e-05,
"loss": 2.2049,
"step": 960
},
{
"epoch": 0.4014560582423297,
"grad_norm": 0.5972425937652588,
"learning_rate": 4.825e-05,
"loss": 2.2476,
"step": 965
},
{
"epoch": 0.4035361414456578,
"grad_norm": 0.557567298412323,
"learning_rate": 4.85e-05,
"loss": 2.2489,
"step": 970
},
{
"epoch": 0.40561622464898595,
"grad_norm": 0.5463237762451172,
"learning_rate": 4.875e-05,
"loss": 2.2416,
"step": 975
},
{
"epoch": 0.4076963078523141,
"grad_norm": 0.5343475341796875,
"learning_rate": 4.9e-05,
"loss": 2.2684,
"step": 980
},
{
"epoch": 0.40977639105564223,
"grad_norm": 0.5676887035369873,
"learning_rate": 4.9250000000000004e-05,
"loss": 2.2401,
"step": 985
},
{
"epoch": 0.4118564742589704,
"grad_norm": 0.6001689434051514,
"learning_rate": 4.9500000000000004e-05,
"loss": 2.2077,
"step": 990
},
{
"epoch": 0.4139365574622985,
"grad_norm": 0.6760561466217041,
"learning_rate": 4.975e-05,
"loss": 2.3,
"step": 995
},
{
"epoch": 0.4160166406656266,
"grad_norm": 0.7530073523521423,
"learning_rate": 5e-05,
"loss": 2.2847,
"step": 1000
},
{
"epoch": 0.41809672386895474,
"grad_norm": 0.6120262742042542,
"learning_rate": 4.9999919997010506e-05,
"loss": 2.2774,
"step": 1005
},
{
"epoch": 0.4201768070722829,
"grad_norm": 0.6917073130607605,
"learning_rate": 4.9999679988554024e-05,
"loss": 2.2252,
"step": 1010
},
{
"epoch": 0.42225689027561103,
"grad_norm": 0.6386430859565735,
"learning_rate": 4.999927997616671e-05,
"loss": 2.2213,
"step": 1015
},
{
"epoch": 0.4243369734789392,
"grad_norm": 0.7094517946243286,
"learning_rate": 4.99987199624087e-05,
"loss": 2.245,
"step": 1020
},
{
"epoch": 0.4264170566822673,
"grad_norm": 0.7361143231391907,
"learning_rate": 4.999799995086424e-05,
"loss": 2.2545,
"step": 1025
},
{
"epoch": 0.4284971398855954,
"grad_norm": 0.6875327825546265,
"learning_rate": 4.999711994614157e-05,
"loss": 2.2491,
"step": 1030
},
{
"epoch": 0.43057722308892354,
"grad_norm": 0.6336541175842285,
"learning_rate": 4.999607995387292e-05,
"loss": 2.2433,
"step": 1035
},
{
"epoch": 0.4326573062922517,
"grad_norm": 0.6160376667976379,
"learning_rate": 4.9994879980714507e-05,
"loss": 2.2845,
"step": 1040
},
{
"epoch": 0.43473738949557983,
"grad_norm": 0.5946584939956665,
"learning_rate": 4.999352003434643e-05,
"loss": 2.2479,
"step": 1045
},
{
"epoch": 0.43681747269890797,
"grad_norm": 0.6029415130615234,
"learning_rate": 4.9992000123472676e-05,
"loss": 2.2685,
"step": 1050
},
{
"epoch": 0.4388975559022361,
"grad_norm": 0.7253440618515015,
"learning_rate": 4.999032025782104e-05,
"loss": 2.2398,
"step": 1055
},
{
"epoch": 0.4409776391055642,
"grad_norm": 0.6144683957099915,
"learning_rate": 4.998848044814307e-05,
"loss": 2.2668,
"step": 1060
},
{
"epoch": 0.44305772230889234,
"grad_norm": 0.5762149095535278,
"learning_rate": 4.998648070621398e-05,
"loss": 2.2581,
"step": 1065
},
{
"epoch": 0.4451378055122205,
"grad_norm": 0.5672757625579834,
"learning_rate": 4.9984321044832606e-05,
"loss": 2.2518,
"step": 1070
},
{
"epoch": 0.44721788871554863,
"grad_norm": 0.7117498517036438,
"learning_rate": 4.998200147782128e-05,
"loss": 2.2842,
"step": 1075
},
{
"epoch": 0.44929797191887677,
"grad_norm": 0.6527178883552551,
"learning_rate": 4.9979522020025795e-05,
"loss": 2.2687,
"step": 1080
},
{
"epoch": 0.4513780551222049,
"grad_norm": 0.7282842397689819,
"learning_rate": 4.997688268731528e-05,
"loss": 2.2443,
"step": 1085
},
{
"epoch": 0.453458138325533,
"grad_norm": 0.7003041505813599,
"learning_rate": 4.997408349658209e-05,
"loss": 2.2616,
"step": 1090
},
{
"epoch": 0.45553822152886114,
"grad_norm": 0.58209228515625,
"learning_rate": 4.9971124465741716e-05,
"loss": 2.2741,
"step": 1095
},
{
"epoch": 0.4576183047321893,
"grad_norm": 0.5157697200775146,
"learning_rate": 4.996800561373266e-05,
"loss": 2.2557,
"step": 1100
},
{
"epoch": 0.4596983879355174,
"grad_norm": 0.7559351325035095,
"learning_rate": 4.996472696051632e-05,
"loss": 2.2787,
"step": 1105
},
{
"epoch": 0.46177847113884557,
"grad_norm": 0.5898042917251587,
"learning_rate": 4.996128852707687e-05,
"loss": 2.2248,
"step": 1110
},
{
"epoch": 0.4638585543421737,
"grad_norm": 0.6440080404281616,
"learning_rate": 4.9957690335421094e-05,
"loss": 2.2747,
"step": 1115
},
{
"epoch": 0.4659386375455018,
"grad_norm": 0.5132443308830261,
"learning_rate": 4.9953932408578286e-05,
"loss": 2.2839,
"step": 1120
},
{
"epoch": 0.46801872074882994,
"grad_norm": 0.688789963722229,
"learning_rate": 4.9950014770600075e-05,
"loss": 2.2402,
"step": 1125
},
{
"epoch": 0.4700988039521581,
"grad_norm": 0.6494508385658264,
"learning_rate": 4.994593744656029e-05,
"loss": 2.2473,
"step": 1130
},
{
"epoch": 0.4721788871554862,
"grad_norm": 0.5445535778999329,
"learning_rate": 4.994170046255476e-05,
"loss": 2.2316,
"step": 1135
},
{
"epoch": 0.47425897035881437,
"grad_norm": 0.6383847594261169,
"learning_rate": 4.993730384570121e-05,
"loss": 2.2671,
"step": 1140
},
{
"epoch": 0.4763390535621425,
"grad_norm": 0.6905817985534668,
"learning_rate": 4.9932747624139045e-05,
"loss": 2.2444,
"step": 1145
},
{
"epoch": 0.4784191367654706,
"grad_norm": 0.6420992612838745,
"learning_rate": 4.992803182702916e-05,
"loss": 2.2843,
"step": 1150
},
{
"epoch": 0.48049921996879874,
"grad_norm": 0.619663655757904,
"learning_rate": 4.992315648455379e-05,
"loss": 2.2571,
"step": 1155
},
{
"epoch": 0.4825793031721269,
"grad_norm": 0.6715114116668701,
"learning_rate": 4.9918121627916294e-05,
"loss": 2.2266,
"step": 1160
},
{
"epoch": 0.484659386375455,
"grad_norm": 0.611705482006073,
"learning_rate": 4.991292728934095e-05,
"loss": 2.2844,
"step": 1165
},
{
"epoch": 0.48673946957878317,
"grad_norm": 0.5942493677139282,
"learning_rate": 4.990757350207278e-05,
"loss": 2.2116,
"step": 1170
},
{
"epoch": 0.4888195527821113,
"grad_norm": 0.7864916920661926,
"learning_rate": 4.990206030037729e-05,
"loss": 2.262,
"step": 1175
},
{
"epoch": 0.4908996359854394,
"grad_norm": 0.7583130598068237,
"learning_rate": 4.98963877195403e-05,
"loss": 2.2963,
"step": 1180
},
{
"epoch": 0.49297971918876754,
"grad_norm": 0.5617021322250366,
"learning_rate": 4.9890555795867675e-05,
"loss": 2.2711,
"step": 1185
},
{
"epoch": 0.4950598023920957,
"grad_norm": 0.7314319610595703,
"learning_rate": 4.9884564566685135e-05,
"loss": 2.2533,
"step": 1190
},
{
"epoch": 0.4971398855954238,
"grad_norm": 0.5248362421989441,
"learning_rate": 4.9878414070337967e-05,
"loss": 2.2582,
"step": 1195
},
{
"epoch": 0.49921996879875197,
"grad_norm": 0.5933838486671448,
"learning_rate": 4.9872104346190826e-05,
"loss": 2.2481,
"step": 1200
},
{
"epoch": 0.50130005200208,
"grad_norm": 0.6278464794158936,
"learning_rate": 4.986563543462745e-05,
"loss": 2.288,
"step": 1205
},
{
"epoch": 0.5033801352054083,
"grad_norm": 0.6378714442253113,
"learning_rate": 4.985900737705041e-05,
"loss": 2.2828,
"step": 1210
},
{
"epoch": 0.5054602184087363,
"grad_norm": 0.575777530670166,
"learning_rate": 4.9852220215880893e-05,
"loss": 2.2452,
"step": 1215
},
{
"epoch": 0.5075403016120645,
"grad_norm": 0.6294274926185608,
"learning_rate": 4.984527399455832e-05,
"loss": 2.254,
"step": 1220
},
{
"epoch": 0.5096203848153926,
"grad_norm": 0.8891189098358154,
"learning_rate": 4.983816875754018e-05,
"loss": 2.2727,
"step": 1225
},
{
"epoch": 0.5117004680187207,
"grad_norm": 0.6461197137832642,
"learning_rate": 4.9830904550301695e-05,
"loss": 2.2339,
"step": 1230
},
{
"epoch": 0.5137805512220489,
"grad_norm": 0.6093956232070923,
"learning_rate": 4.982348141933553e-05,
"loss": 2.2483,
"step": 1235
},
{
"epoch": 0.515860634425377,
"grad_norm": 0.6348010897636414,
"learning_rate": 4.9815899412151476e-05,
"loss": 2.2093,
"step": 1240
},
{
"epoch": 0.5179407176287052,
"grad_norm": 0.7893829941749573,
"learning_rate": 4.9808158577276224e-05,
"loss": 2.2995,
"step": 1245
},
{
"epoch": 0.5200208008320333,
"grad_norm": 0.528293788433075,
"learning_rate": 4.9800258964252946e-05,
"loss": 2.2726,
"step": 1250
},
{
"epoch": 0.5221008840353614,
"grad_norm": 0.6920540928840637,
"learning_rate": 4.9792200623641066e-05,
"loss": 2.2444,
"step": 1255
},
{
"epoch": 0.5241809672386896,
"grad_norm": 0.7353582978248596,
"learning_rate": 4.9783983607015885e-05,
"loss": 2.2409,
"step": 1260
},
{
"epoch": 0.5262610504420177,
"grad_norm": 0.7552638649940491,
"learning_rate": 4.977560796696828e-05,
"loss": 2.2735,
"step": 1265
},
{
"epoch": 0.5283411336453459,
"grad_norm": 0.6445308327674866,
"learning_rate": 4.9767073757104346e-05,
"loss": 2.2176,
"step": 1270
},
{
"epoch": 0.5304212168486739,
"grad_norm": 0.5857995748519897,
"learning_rate": 4.975838103204506e-05,
"loss": 2.2663,
"step": 1275
},
{
"epoch": 0.5325013000520021,
"grad_norm": 0.6020021438598633,
"learning_rate": 4.974952984742596e-05,
"loss": 2.2685,
"step": 1280
},
{
"epoch": 0.5345813832553302,
"grad_norm": 0.9829360246658325,
"learning_rate": 4.974052025989673e-05,
"loss": 2.1992,
"step": 1285
},
{
"epoch": 0.5366614664586583,
"grad_norm": 0.6377243995666504,
"learning_rate": 4.9731352327120883e-05,
"loss": 2.2451,
"step": 1290
},
{
"epoch": 0.5387415496619865,
"grad_norm": 0.5337876081466675,
"learning_rate": 4.97220261077754e-05,
"loss": 2.2365,
"step": 1295
},
{
"epoch": 0.5408216328653146,
"grad_norm": 0.6845943927764893,
"learning_rate": 4.97125416615503e-05,
"loss": 2.2317,
"step": 1300
},
{
"epoch": 0.5429017160686428,
"grad_norm": 0.6941018104553223,
"learning_rate": 4.97028990491483e-05,
"loss": 2.2123,
"step": 1305
},
{
"epoch": 0.5449817992719709,
"grad_norm": 0.626502275466919,
"learning_rate": 4.969309833228444e-05,
"loss": 2.2336,
"step": 1310
},
{
"epoch": 0.547061882475299,
"grad_norm": 0.5843268036842346,
"learning_rate": 4.968313957368564e-05,
"loss": 2.2062,
"step": 1315
},
{
"epoch": 0.5491419656786272,
"grad_norm": 0.687279999256134,
"learning_rate": 4.967302283709036e-05,
"loss": 2.2564,
"step": 1320
},
{
"epoch": 0.5512220488819553,
"grad_norm": 0.5501798987388611,
"learning_rate": 4.966274818724811e-05,
"loss": 2.2667,
"step": 1325
},
{
"epoch": 0.5533021320852834,
"grad_norm": 0.6118011474609375,
"learning_rate": 4.9652315689919117e-05,
"loss": 2.2369,
"step": 1330
},
{
"epoch": 0.5553822152886115,
"grad_norm": 0.6296612024307251,
"learning_rate": 4.9641725411873854e-05,
"loss": 2.2542,
"step": 1335
},
{
"epoch": 0.5574622984919396,
"grad_norm": 0.7094495296478271,
"learning_rate": 4.963097742089263e-05,
"loss": 2.2562,
"step": 1340
},
{
"epoch": 0.5595423816952678,
"grad_norm": 0.6220189929008484,
"learning_rate": 4.962007178576517e-05,
"loss": 2.2373,
"step": 1345
},
{
"epoch": 0.5616224648985959,
"grad_norm": 0.6659913659095764,
"learning_rate": 4.9609008576290135e-05,
"loss": 2.2629,
"step": 1350
},
{
"epoch": 0.5637025481019241,
"grad_norm": 0.5892766714096069,
"learning_rate": 4.9597787863274715e-05,
"loss": 2.2143,
"step": 1355
},
{
"epoch": 0.5657826313052522,
"grad_norm": 0.5834632515907288,
"learning_rate": 4.958640971853417e-05,
"loss": 2.219,
"step": 1360
},
{
"epoch": 0.5678627145085804,
"grad_norm": 0.7557886242866516,
"learning_rate": 4.957487421489132e-05,
"loss": 2.2199,
"step": 1365
},
{
"epoch": 0.5699427977119085,
"grad_norm": 0.7835619449615479,
"learning_rate": 4.956318142617617e-05,
"loss": 2.2092,
"step": 1370
},
{
"epoch": 0.5720228809152366,
"grad_norm": 0.6275124549865723,
"learning_rate": 4.955133142722536e-05,
"loss": 2.2569,
"step": 1375
},
{
"epoch": 0.5741029641185648,
"grad_norm": 0.6475622057914734,
"learning_rate": 4.953932429388171e-05,
"loss": 2.2476,
"step": 1380
},
{
"epoch": 0.5761830473218928,
"grad_norm": 0.6012747287750244,
"learning_rate": 4.952716010299375e-05,
"loss": 2.1796,
"step": 1385
},
{
"epoch": 0.578263130525221,
"grad_norm": 0.5755728483200073,
"learning_rate": 4.9514838932415216e-05,
"loss": 2.2479,
"step": 1390
},
{
"epoch": 0.5803432137285491,
"grad_norm": 0.5810622572898865,
"learning_rate": 4.950236086100454e-05,
"loss": 2.2458,
"step": 1395
},
{
"epoch": 0.5824232969318772,
"grad_norm": 0.5510530471801758,
"learning_rate": 4.9489725968624354e-05,
"loss": 2.2085,
"step": 1400
},
{
"epoch": 0.5845033801352054,
"grad_norm": 0.5717112421989441,
"learning_rate": 4.9476934336141014e-05,
"loss": 2.2497,
"step": 1405
},
{
"epoch": 0.5865834633385335,
"grad_norm": 0.5643584132194519,
"learning_rate": 4.9463986045424006e-05,
"loss": 2.2368,
"step": 1410
},
{
"epoch": 0.5886635465418617,
"grad_norm": 0.7527519464492798,
"learning_rate": 4.94508811793455e-05,
"loss": 2.2424,
"step": 1415
},
{
"epoch": 0.5907436297451898,
"grad_norm": 0.7746401429176331,
"learning_rate": 4.9437619821779766e-05,
"loss": 2.2678,
"step": 1420
},
{
"epoch": 0.592823712948518,
"grad_norm": 0.6275284886360168,
"learning_rate": 4.9424202057602664e-05,
"loss": 2.2083,
"step": 1425
},
{
"epoch": 0.5949037961518461,
"grad_norm": 0.5806483626365662,
"learning_rate": 4.94106279726911e-05,
"loss": 2.2478,
"step": 1430
},
{
"epoch": 0.5969838793551742,
"grad_norm": 0.6382482647895813,
"learning_rate": 4.939689765392246e-05,
"loss": 2.225,
"step": 1435
},
{
"epoch": 0.5990639625585024,
"grad_norm": 0.712761402130127,
"learning_rate": 4.938301118917407e-05,
"loss": 2.2336,
"step": 1440
},
{
"epoch": 0.6011440457618304,
"grad_norm": 0.6290826201438904,
"learning_rate": 4.936896866732262e-05,
"loss": 2.2524,
"step": 1445
},
{
"epoch": 0.6032241289651586,
"grad_norm": 0.676964521408081,
"learning_rate": 4.935477017824361e-05,
"loss": 2.203,
"step": 1450
},
{
"epoch": 0.6053042121684867,
"grad_norm": 0.648561418056488,
"learning_rate": 4.934041581281078e-05,
"loss": 2.2604,
"step": 1455
},
{
"epoch": 0.6073842953718148,
"grad_norm": 0.6181101202964783,
"learning_rate": 4.9325905662895474e-05,
"loss": 2.2018,
"step": 1460
},
{
"epoch": 0.609464378575143,
"grad_norm": 0.654924750328064,
"learning_rate": 4.931123982136615e-05,
"loss": 2.243,
"step": 1465
},
{
"epoch": 0.6115444617784711,
"grad_norm": 0.6459677219390869,
"learning_rate": 4.929641838208768e-05,
"loss": 2.1945,
"step": 1470
},
{
"epoch": 0.6136245449817993,
"grad_norm": 0.7053624391555786,
"learning_rate": 4.928144143992083e-05,
"loss": 2.2052,
"step": 1475
},
{
"epoch": 0.6157046281851274,
"grad_norm": 0.7262877225875854,
"learning_rate": 4.926630909072161e-05,
"loss": 2.2281,
"step": 1480
},
{
"epoch": 0.6177847113884556,
"grad_norm": 0.6927151083946228,
"learning_rate": 4.925102143134068e-05,
"loss": 2.2062,
"step": 1485
},
{
"epoch": 0.6198647945917837,
"grad_norm": 0.623932957649231,
"learning_rate": 4.92355785596227e-05,
"loss": 2.2255,
"step": 1490
},
{
"epoch": 0.6219448777951118,
"grad_norm": 0.706231415271759,
"learning_rate": 4.921998057440576e-05,
"loss": 2.2341,
"step": 1495
},
{
"epoch": 0.62402496099844,
"grad_norm": 0.737194299697876,
"learning_rate": 4.920422757552069e-05,
"loss": 2.2169,
"step": 1500
},
{
"epoch": 0.626105044201768,
"grad_norm": 0.9163033962249756,
"learning_rate": 4.918831966379044e-05,
"loss": 2.2055,
"step": 1505
},
{
"epoch": 0.6281851274050962,
"grad_norm": 0.5982114672660828,
"learning_rate": 4.917225694102947e-05,
"loss": 2.2223,
"step": 1510
},
{
"epoch": 0.6302652106084243,
"grad_norm": 0.6721086502075195,
"learning_rate": 4.9156039510043025e-05,
"loss": 2.2271,
"step": 1515
},
{
"epoch": 0.6323452938117524,
"grad_norm": 0.6458872556686401,
"learning_rate": 4.913966747462656e-05,
"loss": 2.2341,
"step": 1520
},
{
"epoch": 0.6344253770150806,
"grad_norm": 0.8973822593688965,
"learning_rate": 4.9123140939565e-05,
"loss": 2.2489,
"step": 1525
},
{
"epoch": 0.6365054602184087,
"grad_norm": 0.6235558390617371,
"learning_rate": 4.9106460010632146e-05,
"loss": 2.2126,
"step": 1530
},
{
"epoch": 0.6385855434217369,
"grad_norm": 0.6284027695655823,
"learning_rate": 4.908962479458991e-05,
"loss": 2.2565,
"step": 1535
},
{
"epoch": 0.640665626625065,
"grad_norm": 0.663979172706604,
"learning_rate": 4.907263539918771e-05,
"loss": 2.2357,
"step": 1540
},
{
"epoch": 0.6427457098283932,
"grad_norm": 0.7080653309822083,
"learning_rate": 4.905549193316174e-05,
"loss": 2.2386,
"step": 1545
},
{
"epoch": 0.6448257930317213,
"grad_norm": 0.5746604800224304,
"learning_rate": 4.903819450623428e-05,
"loss": 2.218,
"step": 1550
},
{
"epoch": 0.6469058762350494,
"grad_norm": 0.5668492913246155,
"learning_rate": 4.9020743229113e-05,
"loss": 2.1871,
"step": 1555
},
{
"epoch": 0.6489859594383776,
"grad_norm": 0.8596884608268738,
"learning_rate": 4.900313821349025e-05,
"loss": 2.2023,
"step": 1560
},
{
"epoch": 0.6510660426417056,
"grad_norm": 0.6699584722518921,
"learning_rate": 4.898537957204234e-05,
"loss": 2.2124,
"step": 1565
},
{
"epoch": 0.6531461258450338,
"grad_norm": 0.7021228075027466,
"learning_rate": 4.8967467418428826e-05,
"loss": 2.218,
"step": 1570
},
{
"epoch": 0.6552262090483619,
"grad_norm": 0.6994513869285583,
"learning_rate": 4.894940186729176e-05,
"loss": 2.2321,
"step": 1575
},
{
"epoch": 0.65730629225169,
"grad_norm": 0.6525283455848694,
"learning_rate": 4.8931183034255e-05,
"loss": 2.2323,
"step": 1580
},
{
"epoch": 0.6593863754550182,
"grad_norm": 0.6755690574645996,
"learning_rate": 4.891281103592344e-05,
"loss": 2.2491,
"step": 1585
},
{
"epoch": 0.6614664586583463,
"grad_norm": 0.7468230724334717,
"learning_rate": 4.889428598988226e-05,
"loss": 2.2817,
"step": 1590
},
{
"epoch": 0.6635465418616745,
"grad_norm": 0.6302746534347534,
"learning_rate": 4.887560801469617e-05,
"loss": 2.2178,
"step": 1595
},
{
"epoch": 0.6656266250650026,
"grad_norm": 0.5519952774047852,
"learning_rate": 4.88567772299087e-05,
"loss": 2.2314,
"step": 1600
},
{
"epoch": 0.6677067082683308,
"grad_norm": 0.706723153591156,
"learning_rate": 4.8837793756041364e-05,
"loss": 2.2246,
"step": 1605
},
{
"epoch": 0.6697867914716589,
"grad_norm": 0.6226012110710144,
"learning_rate": 4.881865771459294e-05,
"loss": 2.2303,
"step": 1610
},
{
"epoch": 0.671866874674987,
"grad_norm": 0.6458874344825745,
"learning_rate": 4.879936922803867e-05,
"loss": 2.2443,
"step": 1615
},
{
"epoch": 0.6739469578783152,
"grad_norm": 0.6402535438537598,
"learning_rate": 4.8779928419829475e-05,
"loss": 2.2227,
"step": 1620
},
{
"epoch": 0.6760270410816432,
"grad_norm": 0.6376984715461731,
"learning_rate": 4.876033541439118e-05,
"loss": 2.2644,
"step": 1625
},
{
"epoch": 0.6781071242849714,
"grad_norm": 0.7197051048278809,
"learning_rate": 4.874059033712371e-05,
"loss": 2.225,
"step": 1630
},
{
"epoch": 0.6801872074882995,
"grad_norm": 0.6593906283378601,
"learning_rate": 4.872069331440028e-05,
"loss": 2.224,
"step": 1635
},
{
"epoch": 0.6822672906916276,
"grad_norm": 0.6552969217300415,
"learning_rate": 4.870064447356658e-05,
"loss": 2.2376,
"step": 1640
},
{
"epoch": 0.6843473738949558,
"grad_norm": 0.665170431137085,
"learning_rate": 4.8680443942940014e-05,
"loss": 2.2294,
"step": 1645
},
{
"epoch": 0.6864274570982839,
"grad_norm": 0.6637657880783081,
"learning_rate": 4.8660091851808784e-05,
"loss": 2.2462,
"step": 1650
},
{
"epoch": 0.6885075403016121,
"grad_norm": 0.5620794892311096,
"learning_rate": 4.863958833043115e-05,
"loss": 2.2032,
"step": 1655
},
{
"epoch": 0.6905876235049402,
"grad_norm": 0.7143023014068604,
"learning_rate": 4.861893351003456e-05,
"loss": 2.2158,
"step": 1660
},
{
"epoch": 0.6926677067082684,
"grad_norm": 0.6956245303153992,
"learning_rate": 4.859812752281479e-05,
"loss": 2.185,
"step": 1665
},
{
"epoch": 0.6947477899115965,
"grad_norm": 0.6536686420440674,
"learning_rate": 4.857717050193514e-05,
"loss": 2.2095,
"step": 1670
},
{
"epoch": 0.6968278731149246,
"grad_norm": 0.728190004825592,
"learning_rate": 4.855606258152556e-05,
"loss": 2.2691,
"step": 1675
},
{
"epoch": 0.6989079563182528,
"grad_norm": 0.7523576617240906,
"learning_rate": 4.853480389668179e-05,
"loss": 2.2348,
"step": 1680
},
{
"epoch": 0.7009880395215808,
"grad_norm": 0.6006982922554016,
"learning_rate": 4.851339458346449e-05,
"loss": 2.2242,
"step": 1685
},
{
"epoch": 0.703068122724909,
"grad_norm": 0.6585814356803894,
"learning_rate": 4.8491834778898385e-05,
"loss": 2.2263,
"step": 1690
},
{
"epoch": 0.7051482059282371,
"grad_norm": 0.7236796617507935,
"learning_rate": 4.847012462097139e-05,
"loss": 2.2498,
"step": 1695
},
{
"epoch": 0.7072282891315652,
"grad_norm": 0.8331758379936218,
"learning_rate": 4.84482642486337e-05,
"loss": 2.2006,
"step": 1700
},
{
"epoch": 0.7093083723348934,
"grad_norm": 0.7638906836509705,
"learning_rate": 4.8426253801796914e-05,
"loss": 2.2025,
"step": 1705
},
{
"epoch": 0.7113884555382215,
"grad_norm": 0.6874154210090637,
"learning_rate": 4.840409342133318e-05,
"loss": 2.2119,
"step": 1710
},
{
"epoch": 0.7134685387415497,
"grad_norm": 0.7299637198448181,
"learning_rate": 4.8381783249074224e-05,
"loss": 2.2233,
"step": 1715
},
{
"epoch": 0.7155486219448778,
"grad_norm": 0.8418067097663879,
"learning_rate": 4.8359323427810476e-05,
"loss": 2.2407,
"step": 1720
},
{
"epoch": 0.717628705148206,
"grad_norm": 0.6470432281494141,
"learning_rate": 4.833671410129018e-05,
"loss": 2.2548,
"step": 1725
},
{
"epoch": 0.7197087883515341,
"grad_norm": 0.6883252263069153,
"learning_rate": 4.831395541421841e-05,
"loss": 2.2434,
"step": 1730
},
{
"epoch": 0.7217888715548622,
"grad_norm": 0.7531642317771912,
"learning_rate": 4.8291047512256223e-05,
"loss": 2.2151,
"step": 1735
},
{
"epoch": 0.7238689547581904,
"grad_norm": 0.6417682766914368,
"learning_rate": 4.826799054201967e-05,
"loss": 2.2081,
"step": 1740
},
{
"epoch": 0.7259490379615184,
"grad_norm": 0.8133646845817566,
"learning_rate": 4.824478465107887e-05,
"loss": 2.2138,
"step": 1745
},
{
"epoch": 0.7280291211648466,
"grad_norm": 0.7256624698638916,
"learning_rate": 4.8221429987957076e-05,
"loss": 2.2553,
"step": 1750
},
{
"epoch": 0.7301092043681747,
"grad_norm": 0.7030817866325378,
"learning_rate": 4.819792670212971e-05,
"loss": 2.2055,
"step": 1755
},
{
"epoch": 0.7321892875715028,
"grad_norm": 0.6028294563293457,
"learning_rate": 4.817427494402344e-05,
"loss": 2.2026,
"step": 1760
},
{
"epoch": 0.734269370774831,
"grad_norm": 0.628900945186615,
"learning_rate": 4.815047486501515e-05,
"loss": 2.2114,
"step": 1765
},
{
"epoch": 0.7363494539781591,
"grad_norm": 0.6979473233222961,
"learning_rate": 4.8126526617431065e-05,
"loss": 2.2334,
"step": 1770
},
{
"epoch": 0.7384295371814873,
"grad_norm": 0.6751022934913635,
"learning_rate": 4.810243035454568e-05,
"loss": 2.2157,
"step": 1775
},
{
"epoch": 0.7405096203848154,
"grad_norm": 0.6779219508171082,
"learning_rate": 4.8078186230580845e-05,
"loss": 2.2429,
"step": 1780
},
{
"epoch": 0.7425897035881436,
"grad_norm": 0.7016996741294861,
"learning_rate": 4.805379440070475e-05,
"loss": 2.2412,
"step": 1785
},
{
"epoch": 0.7446697867914717,
"grad_norm": 0.6258619427680969,
"learning_rate": 4.802925502103094e-05,
"loss": 2.2488,
"step": 1790
},
{
"epoch": 0.7467498699947998,
"grad_norm": 0.622600793838501,
"learning_rate": 4.800456824861731e-05,
"loss": 2.2435,
"step": 1795
},
{
"epoch": 0.748829953198128,
"grad_norm": 0.5709562301635742,
"learning_rate": 4.797973424146512e-05,
"loss": 2.1844,
"step": 1800
},
{
"epoch": 0.750910036401456,
"grad_norm": 0.7474341988563538,
"learning_rate": 4.795475315851795e-05,
"loss": 2.2236,
"step": 1805
},
{
"epoch": 0.7529901196047842,
"grad_norm": 0.6307352781295776,
"learning_rate": 4.7929625159660694e-05,
"loss": 2.2427,
"step": 1810
},
{
"epoch": 0.7550702028081123,
"grad_norm": 0.792239248752594,
"learning_rate": 4.7904350405718555e-05,
"loss": 2.23,
"step": 1815
},
{
"epoch": 0.7571502860114404,
"grad_norm": 0.5931370258331299,
"learning_rate": 4.7878929058456027e-05,
"loss": 2.2158,
"step": 1820
},
{
"epoch": 0.7592303692147686,
"grad_norm": 0.6610358357429504,
"learning_rate": 4.7853361280575786e-05,
"loss": 2.1798,
"step": 1825
},
{
"epoch": 0.7613104524180967,
"grad_norm": 0.7434505224227905,
"learning_rate": 4.782764723571774e-05,
"loss": 2.2291,
"step": 1830
},
{
"epoch": 0.7633905356214249,
"grad_norm": 0.6113109588623047,
"learning_rate": 4.780178708845792e-05,
"loss": 2.2246,
"step": 1835
},
{
"epoch": 0.765470618824753,
"grad_norm": 0.6270626783370972,
"learning_rate": 4.7775781004307446e-05,
"loss": 2.2527,
"step": 1840
},
{
"epoch": 0.7675507020280812,
"grad_norm": 0.8917273879051208,
"learning_rate": 4.7749629149711495e-05,
"loss": 2.2343,
"step": 1845
},
{
"epoch": 0.7696307852314093,
"grad_norm": 0.6628624200820923,
"learning_rate": 4.7723331692048174e-05,
"loss": 2.2066,
"step": 1850
},
{
"epoch": 0.7717108684347374,
"grad_norm": 0.7599259614944458,
"learning_rate": 4.76968887996275e-05,
"loss": 2.2003,
"step": 1855
},
{
"epoch": 0.7737909516380655,
"grad_norm": 0.7547385692596436,
"learning_rate": 4.767030064169034e-05,
"loss": 2.209,
"step": 1860
},
{
"epoch": 0.7758710348413936,
"grad_norm": 0.5945778489112854,
"learning_rate": 4.764356738840722e-05,
"loss": 2.1807,
"step": 1865
},
{
"epoch": 0.7779511180447218,
"grad_norm": 0.6278342008590698,
"learning_rate": 4.7616689210877374e-05,
"loss": 2.2512,
"step": 1870
},
{
"epoch": 0.7800312012480499,
"grad_norm": 0.6003873944282532,
"learning_rate": 4.7589666281127575e-05,
"loss": 2.2352,
"step": 1875
},
{
"epoch": 0.782111284451378,
"grad_norm": 0.6956392526626587,
"learning_rate": 4.756249877211102e-05,
"loss": 2.2109,
"step": 1880
},
{
"epoch": 0.7841913676547062,
"grad_norm": 0.724892795085907,
"learning_rate": 4.7535186857706274e-05,
"loss": 2.2142,
"step": 1885
},
{
"epoch": 0.7862714508580343,
"grad_norm": 0.6417645215988159,
"learning_rate": 4.750773071271612e-05,
"loss": 2.2437,
"step": 1890
},
{
"epoch": 0.7883515340613625,
"grad_norm": 0.7107153534889221,
"learning_rate": 4.748013051286646e-05,
"loss": 2.2092,
"step": 1895
},
{
"epoch": 0.7904316172646906,
"grad_norm": 0.7158809304237366,
"learning_rate": 4.7452386434805154e-05,
"loss": 2.1911,
"step": 1900
},
{
"epoch": 0.7925117004680188,
"grad_norm": 0.5407947301864624,
"learning_rate": 4.7424498656100954e-05,
"loss": 2.2399,
"step": 1905
},
{
"epoch": 0.7945917836713469,
"grad_norm": 0.638559103012085,
"learning_rate": 4.73964673552423e-05,
"loss": 2.2175,
"step": 1910
},
{
"epoch": 0.796671866874675,
"grad_norm": 0.6588721871376038,
"learning_rate": 4.736829271163624e-05,
"loss": 2.1737,
"step": 1915
},
{
"epoch": 0.7987519500780031,
"grad_norm": 1.2813347578048706,
"learning_rate": 4.7339974905607206e-05,
"loss": 2.214,
"step": 1920
},
{
"epoch": 0.8008320332813312,
"grad_norm": 0.6472169756889343,
"learning_rate": 4.731151411839596e-05,
"loss": 2.2123,
"step": 1925
},
{
"epoch": 0.8029121164846594,
"grad_norm": 0.6989557147026062,
"learning_rate": 4.728291053215832e-05,
"loss": 2.266,
"step": 1930
},
{
"epoch": 0.8049921996879875,
"grad_norm": 0.5853009223937988,
"learning_rate": 4.725416432996409e-05,
"loss": 2.2408,
"step": 1935
},
{
"epoch": 0.8070722828913156,
"grad_norm": 0.6867830157279968,
"learning_rate": 4.722527569579584e-05,
"loss": 2.2089,
"step": 1940
},
{
"epoch": 0.8091523660946438,
"grad_norm": 0.5697247385978699,
"learning_rate": 4.719624481454773e-05,
"loss": 2.2268,
"step": 1945
},
{
"epoch": 0.8112324492979719,
"grad_norm": 0.7628327012062073,
"learning_rate": 4.716707187202436e-05,
"loss": 2.2137,
"step": 1950
},
{
"epoch": 0.8133125325013001,
"grad_norm": 0.6538941264152527,
"learning_rate": 4.7137757054939516e-05,
"loss": 2.2173,
"step": 1955
},
{
"epoch": 0.8153926157046282,
"grad_norm": 0.66108638048172,
"learning_rate": 4.710830055091506e-05,
"loss": 2.2273,
"step": 1960
},
{
"epoch": 0.8174726989079563,
"grad_norm": 0.8627687692642212,
"learning_rate": 4.707870254847965e-05,
"loss": 2.2459,
"step": 1965
},
{
"epoch": 0.8195527821112845,
"grad_norm": 1.0859020948410034,
"learning_rate": 4.7048963237067576e-05,
"loss": 2.2171,
"step": 1970
},
{
"epoch": 0.8216328653146125,
"grad_norm": 0.5735368132591248,
"learning_rate": 4.7019082807017555e-05,
"loss": 2.201,
"step": 1975
},
{
"epoch": 0.8237129485179407,
"grad_norm": 0.7044029831886292,
"learning_rate": 4.698906144957148e-05,
"loss": 2.1652,
"step": 1980
},
{
"epoch": 0.8257930317212688,
"grad_norm": 0.6482775211334229,
"learning_rate": 4.695889935687322e-05,
"loss": 2.227,
"step": 1985
},
{
"epoch": 0.827873114924597,
"grad_norm": 0.6682475209236145,
"learning_rate": 4.692859672196738e-05,
"loss": 2.1841,
"step": 1990
},
{
"epoch": 0.8299531981279251,
"grad_norm": 0.925491988658905,
"learning_rate": 4.689815373879808e-05,
"loss": 2.2046,
"step": 1995
},
{
"epoch": 0.8320332813312532,
"grad_norm": 0.6961370706558228,
"learning_rate": 4.686757060220768e-05,
"loss": 2.1795,
"step": 2000
},
{
"epoch": 0.8341133645345814,
"grad_norm": 0.669620931148529,
"learning_rate": 4.6836847507935566e-05,
"loss": 2.2046,
"step": 2005
},
{
"epoch": 0.8361934477379095,
"grad_norm": 1.242880940437317,
"learning_rate": 4.6805984652616905e-05,
"loss": 2.2444,
"step": 2010
},
{
"epoch": 0.8382735309412377,
"grad_norm": 0.6134272217750549,
"learning_rate": 4.677498223378134e-05,
"loss": 2.2067,
"step": 2015
},
{
"epoch": 0.8403536141445658,
"grad_norm": 0.6093041896820068,
"learning_rate": 4.674384044985177e-05,
"loss": 2.2191,
"step": 2020
},
{
"epoch": 0.8424336973478939,
"grad_norm": 0.7351178526878357,
"learning_rate": 4.6712559500143064e-05,
"loss": 2.197,
"step": 2025
},
{
"epoch": 0.8445137805512221,
"grad_norm": 0.7967379093170166,
"learning_rate": 4.668113958486077e-05,
"loss": 2.1565,
"step": 2030
},
{
"epoch": 0.8465938637545501,
"grad_norm": 0.7329113483428955,
"learning_rate": 4.6649580905099875e-05,
"loss": 2.2587,
"step": 2035
},
{
"epoch": 0.8486739469578783,
"grad_norm": 0.6913650631904602,
"learning_rate": 4.6617883662843464e-05,
"loss": 2.1888,
"step": 2040
},
{
"epoch": 0.8507540301612064,
"grad_norm": 0.6449567675590515,
"learning_rate": 4.658604806096147e-05,
"loss": 2.1799,
"step": 2045
},
{
"epoch": 0.8528341133645346,
"grad_norm": 0.6242998838424683,
"learning_rate": 4.655407430320937e-05,
"loss": 2.2185,
"step": 2050
},
{
"epoch": 0.8549141965678627,
"grad_norm": 0.7295005917549133,
"learning_rate": 4.652196259422685e-05,
"loss": 2.1757,
"step": 2055
},
{
"epoch": 0.8569942797711908,
"grad_norm": 0.6830917000770569,
"learning_rate": 4.648971313953654e-05,
"loss": 2.216,
"step": 2060
},
{
"epoch": 0.859074362974519,
"grad_norm": 0.6306335926055908,
"learning_rate": 4.645732614554264e-05,
"loss": 2.1863,
"step": 2065
},
{
"epoch": 0.8611544461778471,
"grad_norm": 0.7190198302268982,
"learning_rate": 4.642480181952967e-05,
"loss": 2.1922,
"step": 2070
},
{
"epoch": 0.8632345293811753,
"grad_norm": 0.6659998893737793,
"learning_rate": 4.6392140369661104e-05,
"loss": 2.2265,
"step": 2075
},
{
"epoch": 0.8653146125845034,
"grad_norm": 0.8210431337356567,
"learning_rate": 4.6359342004978016e-05,
"loss": 2.2259,
"step": 2080
},
{
"epoch": 0.8673946957878315,
"grad_norm": 0.7920681834220886,
"learning_rate": 4.6326406935397797e-05,
"loss": 2.222,
"step": 2085
},
{
"epoch": 0.8694747789911597,
"grad_norm": 0.7032017707824707,
"learning_rate": 4.629333537171277e-05,
"loss": 2.2289,
"step": 2090
},
{
"epoch": 0.8715548621944877,
"grad_norm": 0.648613691329956,
"learning_rate": 4.6260127525588824e-05,
"loss": 2.2021,
"step": 2095
},
{
"epoch": 0.8736349453978159,
"grad_norm": 0.6248600482940674,
"learning_rate": 4.622678360956415e-05,
"loss": 2.2162,
"step": 2100
},
{
"epoch": 0.875715028601144,
"grad_norm": 0.6035733222961426,
"learning_rate": 4.619330383704778e-05,
"loss": 2.2385,
"step": 2105
},
{
"epoch": 0.8777951118044722,
"grad_norm": 0.6163062453269958,
"learning_rate": 4.615968842231825e-05,
"loss": 2.222,
"step": 2110
},
{
"epoch": 0.8798751950078003,
"grad_norm": 0.6982393860816956,
"learning_rate": 4.612593758052227e-05,
"loss": 2.191,
"step": 2115
},
{
"epoch": 0.8819552782111284,
"grad_norm": 0.9503993391990662,
"learning_rate": 4.609205152767329e-05,
"loss": 2.1852,
"step": 2120
},
{
"epoch": 0.8840353614144566,
"grad_norm": 0.6817579865455627,
"learning_rate": 4.605803048065014e-05,
"loss": 2.2085,
"step": 2125
},
{
"epoch": 0.8861154446177847,
"grad_norm": 0.621665358543396,
"learning_rate": 4.6023874657195686e-05,
"loss": 2.2261,
"step": 2130
},
{
"epoch": 0.8881955278211129,
"grad_norm": 0.7342547178268433,
"learning_rate": 4.5989584275915345e-05,
"loss": 2.2056,
"step": 2135
},
{
"epoch": 0.890275611024441,
"grad_norm": 0.7103343605995178,
"learning_rate": 4.595515955627576e-05,
"loss": 2.1456,
"step": 2140
},
{
"epoch": 0.8923556942277691,
"grad_norm": 0.7423024773597717,
"learning_rate": 4.592060071860339e-05,
"loss": 2.2315,
"step": 2145
},
{
"epoch": 0.8944357774310973,
"grad_norm": 0.6223635077476501,
"learning_rate": 4.5885907984083034e-05,
"loss": 2.2524,
"step": 2150
},
{
"epoch": 0.8965158606344253,
"grad_norm": 0.6376893520355225,
"learning_rate": 4.5851081574756504e-05,
"loss": 2.2102,
"step": 2155
},
{
"epoch": 0.8985959438377535,
"grad_norm": 0.6602868437767029,
"learning_rate": 4.5816121713521155e-05,
"loss": 2.2622,
"step": 2160
},
{
"epoch": 0.9006760270410816,
"grad_norm": 0.5940150022506714,
"learning_rate": 4.578102862412844e-05,
"loss": 2.241,
"step": 2165
},
{
"epoch": 0.9027561102444098,
"grad_norm": 0.7278428673744202,
"learning_rate": 4.5745802531182544e-05,
"loss": 2.1557,
"step": 2170
},
{
"epoch": 0.9048361934477379,
"grad_norm": 0.6823807954788208,
"learning_rate": 4.5710443660138874e-05,
"loss": 2.1973,
"step": 2175
},
{
"epoch": 0.906916276651066,
"grad_norm": 0.6403205990791321,
"learning_rate": 4.5674952237302664e-05,
"loss": 2.2092,
"step": 2180
},
{
"epoch": 0.9089963598543942,
"grad_norm": 0.8217492699623108,
"learning_rate": 4.563932848982752e-05,
"loss": 2.1835,
"step": 2185
},
{
"epoch": 0.9110764430577223,
"grad_norm": 0.6273100972175598,
"learning_rate": 4.560357264571392e-05,
"loss": 2.1914,
"step": 2190
},
{
"epoch": 0.9131565262610505,
"grad_norm": 0.7473320364952087,
"learning_rate": 4.5567684933807844e-05,
"loss": 2.2302,
"step": 2195
},
{
"epoch": 0.9152366094643786,
"grad_norm": 0.6911905407905579,
"learning_rate": 4.553166558379922e-05,
"loss": 2.1685,
"step": 2200
},
{
"epoch": 0.9173166926677067,
"grad_norm": 0.817228376865387,
"learning_rate": 4.54955148262205e-05,
"loss": 2.1914,
"step": 2205
},
{
"epoch": 0.9193967758710349,
"grad_norm": 0.934036910533905,
"learning_rate": 4.545923289244517e-05,
"loss": 2.2018,
"step": 2210
},
{
"epoch": 0.9214768590743629,
"grad_norm": 0.6456049680709839,
"learning_rate": 4.542282001468631e-05,
"loss": 2.2071,
"step": 2215
},
{
"epoch": 0.9235569422776911,
"grad_norm": 0.9103360772132874,
"learning_rate": 4.5386276425995025e-05,
"loss": 2.2058,
"step": 2220
},
{
"epoch": 0.9256370254810192,
"grad_norm": 0.7046015858650208,
"learning_rate": 4.5349602360259026e-05,
"loss": 2.231,
"step": 2225
},
{
"epoch": 0.9277171086843474,
"grad_norm": 0.7220612168312073,
"learning_rate": 4.531279805220111e-05,
"loss": 2.2151,
"step": 2230
},
{
"epoch": 0.9297971918876755,
"grad_norm": 0.759607195854187,
"learning_rate": 4.5275863737377644e-05,
"loss": 2.2082,
"step": 2235
},
{
"epoch": 0.9318772750910036,
"grad_norm": 0.6310170888900757,
"learning_rate": 4.523879965217708e-05,
"loss": 2.2005,
"step": 2240
},
{
"epoch": 0.9339573582943318,
"grad_norm": 0.8374559879302979,
"learning_rate": 4.520160603381842e-05,
"loss": 2.2175,
"step": 2245
},
{
"epoch": 0.9360374414976599,
"grad_norm": 0.6279659271240234,
"learning_rate": 4.516428312034974e-05,
"loss": 2.1931,
"step": 2250
},
{
"epoch": 0.9381175247009881,
"grad_norm": 0.6658344864845276,
"learning_rate": 4.512683115064658e-05,
"loss": 2.2423,
"step": 2255
},
{
"epoch": 0.9401976079043162,
"grad_norm": 0.6160857081413269,
"learning_rate": 4.508925036441053e-05,
"loss": 2.1999,
"step": 2260
},
{
"epoch": 0.9422776911076443,
"grad_norm": 0.685691773891449,
"learning_rate": 4.505154100216759e-05,
"loss": 2.2019,
"step": 2265
},
{
"epoch": 0.9443577743109725,
"grad_norm": 0.5475696325302124,
"learning_rate": 4.501370330526671e-05,
"loss": 2.1944,
"step": 2270
},
{
"epoch": 0.9464378575143005,
"grad_norm": 0.6216719746589661,
"learning_rate": 4.497573751587819e-05,
"loss": 2.2182,
"step": 2275
},
{
"epoch": 0.9485179407176287,
"grad_norm": 0.708109438419342,
"learning_rate": 4.4937643876992176e-05,
"loss": 2.1936,
"step": 2280
},
{
"epoch": 0.9505980239209568,
"grad_norm": 0.8892818093299866,
"learning_rate": 4.489942263241705e-05,
"loss": 2.194,
"step": 2285
},
{
"epoch": 0.952678107124285,
"grad_norm": 0.6160807013511658,
"learning_rate": 4.4861074026777936e-05,
"loss": 2.2116,
"step": 2290
},
{
"epoch": 0.9547581903276131,
"grad_norm": 0.7309338450431824,
"learning_rate": 4.482259830551507e-05,
"loss": 2.2298,
"step": 2295
},
{
"epoch": 0.9568382735309412,
"grad_norm": 0.6691438555717468,
"learning_rate": 4.4783995714882265e-05,
"loss": 2.2271,
"step": 2300
},
{
"epoch": 0.9589183567342694,
"grad_norm": 0.7534273266792297,
"learning_rate": 4.474526650194535e-05,
"loss": 2.195,
"step": 2305
},
{
"epoch": 0.9609984399375975,
"grad_norm": 0.7900533676147461,
"learning_rate": 4.4706410914580535e-05,
"loss": 2.1975,
"step": 2310
},
{
"epoch": 0.9630785231409257,
"grad_norm": 0.8033674359321594,
"learning_rate": 4.4667429201472876e-05,
"loss": 2.2036,
"step": 2315
},
{
"epoch": 0.9651586063442538,
"grad_norm": 0.6255626678466797,
"learning_rate": 4.4628321612114666e-05,
"loss": 2.2145,
"step": 2320
},
{
"epoch": 0.9672386895475819,
"grad_norm": 0.7248416543006897,
"learning_rate": 4.458908839680382e-05,
"loss": 2.2367,
"step": 2325
},
{
"epoch": 0.96931877275091,
"grad_norm": 0.5884549021720886,
"learning_rate": 4.454972980664231e-05,
"loss": 2.2555,
"step": 2330
},
{
"epoch": 0.9713988559542381,
"grad_norm": 0.7502307891845703,
"learning_rate": 4.451024609353451e-05,
"loss": 2.2157,
"step": 2335
},
{
"epoch": 0.9734789391575663,
"grad_norm": 0.6624306440353394,
"learning_rate": 4.447063751018565e-05,
"loss": 2.2068,
"step": 2340
},
{
"epoch": 0.9755590223608944,
"grad_norm": 0.8079151511192322,
"learning_rate": 4.4430904310100117e-05,
"loss": 2.1948,
"step": 2345
},
{
"epoch": 0.9776391055642226,
"grad_norm": 0.6674748659133911,
"learning_rate": 4.4391046747579903e-05,
"loss": 2.2253,
"step": 2350
},
{
"epoch": 0.9797191887675507,
"grad_norm": 0.644355297088623,
"learning_rate": 4.435106507772294e-05,
"loss": 2.2232,
"step": 2355
},
{
"epoch": 0.9817992719708788,
"grad_norm": 0.7964586615562439,
"learning_rate": 4.431095955642147e-05,
"loss": 2.1854,
"step": 2360
},
{
"epoch": 0.983879355174207,
"grad_norm": 0.6387479305267334,
"learning_rate": 4.4270730440360434e-05,
"loss": 2.1848,
"step": 2365
},
{
"epoch": 0.9859594383775351,
"grad_norm": 0.7305927872657776,
"learning_rate": 4.4230377987015773e-05,
"loss": 2.1617,
"step": 2370
},
{
"epoch": 0.9880395215808633,
"grad_norm": 0.7234925031661987,
"learning_rate": 4.418990245465286e-05,
"loss": 2.2035,
"step": 2375
},
{
"epoch": 0.9901196047841914,
"grad_norm": 0.9234485030174255,
"learning_rate": 4.4149304102324784e-05,
"loss": 2.1871,
"step": 2380
},
{
"epoch": 0.9921996879875195,
"grad_norm": 0.6665201783180237,
"learning_rate": 4.41085831898707e-05,
"loss": 2.2183,
"step": 2385
},
{
"epoch": 0.9942797711908476,
"grad_norm": 0.6990284323692322,
"learning_rate": 4.406773997791418e-05,
"loss": 2.1597,
"step": 2390
},
{
"epoch": 0.9963598543941757,
"grad_norm": 0.7203196883201599,
"learning_rate": 4.402677472786156e-05,
"loss": 2.203,
"step": 2395
},
{
"epoch": 0.9984399375975039,
"grad_norm": 0.9164974093437195,
"learning_rate": 4.398568770190025e-05,
"loss": 2.2222,
"step": 2400
},
{
"epoch": 1.000520020800832,
"grad_norm": 0.655780553817749,
"learning_rate": 4.394447916299701e-05,
"loss": 2.188,
"step": 2405
},
{
"epoch": 1.00260010400416,
"grad_norm": 0.7037012577056885,
"learning_rate": 4.3903149374896366e-05,
"loss": 2.1934,
"step": 2410
},
{
"epoch": 1.0046801872074882,
"grad_norm": 0.6656583547592163,
"learning_rate": 4.386169860211884e-05,
"loss": 2.2366,
"step": 2415
},
{
"epoch": 1.0067602704108165,
"grad_norm": 0.6743558049201965,
"learning_rate": 4.3820127109959294e-05,
"loss": 2.1893,
"step": 2420
},
{
"epoch": 1.0088403536141446,
"grad_norm": 0.6413071751594543,
"learning_rate": 4.3778435164485216e-05,
"loss": 2.1776,
"step": 2425
},
{
"epoch": 1.0109204368174727,
"grad_norm": 0.7319051623344421,
"learning_rate": 4.373662303253504e-05,
"loss": 2.2236,
"step": 2430
},
{
"epoch": 1.0130005200208008,
"grad_norm": 0.7512479424476624,
"learning_rate": 4.369469098171639e-05,
"loss": 2.1662,
"step": 2435
},
{
"epoch": 1.015080603224129,
"grad_norm": 0.961911678314209,
"learning_rate": 4.365263928040444e-05,
"loss": 2.1793,
"step": 2440
},
{
"epoch": 1.0171606864274572,
"grad_norm": 0.7037209272384644,
"learning_rate": 4.361046819774012e-05,
"loss": 2.2268,
"step": 2445
},
{
"epoch": 1.0192407696307852,
"grad_norm": 0.6789669394493103,
"learning_rate": 4.356817800362846e-05,
"loss": 2.1593,
"step": 2450
},
{
"epoch": 1.0213208528341133,
"grad_norm": 0.7595046758651733,
"learning_rate": 4.35257689687368e-05,
"loss": 2.1979,
"step": 2455
},
{
"epoch": 1.0234009360374414,
"grad_norm": 0.6408656239509583,
"learning_rate": 4.348324136449311e-05,
"loss": 2.1928,
"step": 2460
},
{
"epoch": 1.0254810192407697,
"grad_norm": 0.7805280685424805,
"learning_rate": 4.344059546308424e-05,
"loss": 2.2062,
"step": 2465
},
{
"epoch": 1.0275611024440978,
"grad_norm": 0.6017510294914246,
"learning_rate": 4.3397831537454146e-05,
"loss": 2.1883,
"step": 2470
},
{
"epoch": 1.029641185647426,
"grad_norm": 0.6209552884101868,
"learning_rate": 4.335494986130219e-05,
"loss": 2.2042,
"step": 2475
},
{
"epoch": 1.031721268850754,
"grad_norm": 0.641849935054779,
"learning_rate": 4.331195070908134e-05,
"loss": 2.2132,
"step": 2480
},
{
"epoch": 1.033801352054082,
"grad_norm": 0.6367346048355103,
"learning_rate": 4.326883435599646e-05,
"loss": 2.1853,
"step": 2485
},
{
"epoch": 1.0358814352574104,
"grad_norm": 0.745715856552124,
"learning_rate": 4.322560107800253e-05,
"loss": 2.1998,
"step": 2490
},
{
"epoch": 1.0379615184607385,
"grad_norm": 0.9358944296836853,
"learning_rate": 4.318225115180287e-05,
"loss": 2.2148,
"step": 2495
},
{
"epoch": 1.0400416016640666,
"grad_norm": 0.771369457244873,
"learning_rate": 4.313878485484735e-05,
"loss": 2.2095,
"step": 2500
},
{
"epoch": 1.0421216848673946,
"grad_norm": 0.8903271555900574,
"learning_rate": 4.3095202465330695e-05,
"loss": 2.1885,
"step": 2505
},
{
"epoch": 1.0442017680707227,
"grad_norm": 0.83650141954422,
"learning_rate": 4.305150426219061e-05,
"loss": 2.1791,
"step": 2510
},
{
"epoch": 1.046281851274051,
"grad_norm": 0.8761057257652283,
"learning_rate": 4.300769052510604e-05,
"loss": 2.2408,
"step": 2515
},
{
"epoch": 1.0483619344773791,
"grad_norm": 0.8988538384437561,
"learning_rate": 4.296376153449539e-05,
"loss": 2.2054,
"step": 2520
},
{
"epoch": 1.0504420176807072,
"grad_norm": 0.7735057473182678,
"learning_rate": 4.29197175715147e-05,
"loss": 2.2135,
"step": 2525
},
{
"epoch": 1.0525221008840353,
"grad_norm": 0.7575799226760864,
"learning_rate": 4.287555891805587e-05,
"loss": 2.1696,
"step": 2530
},
{
"epoch": 1.0546021840873634,
"grad_norm": 0.8359115719795227,
"learning_rate": 4.283128585674485e-05,
"loss": 2.2099,
"step": 2535
},
{
"epoch": 1.0566822672906917,
"grad_norm": 0.8799722194671631,
"learning_rate": 4.27868986709398e-05,
"loss": 2.173,
"step": 2540
},
{
"epoch": 1.0587623504940198,
"grad_norm": 0.6700918078422546,
"learning_rate": 4.274239764472935e-05,
"loss": 2.2171,
"step": 2545
},
{
"epoch": 1.0608424336973479,
"grad_norm": 0.6470414400100708,
"learning_rate": 4.269778306293068e-05,
"loss": 2.2247,
"step": 2550
},
{
"epoch": 1.062922516900676,
"grad_norm": 0.7337380051612854,
"learning_rate": 4.2653055211087824e-05,
"loss": 2.19,
"step": 2555
},
{
"epoch": 1.065002600104004,
"grad_norm": 0.6618953347206116,
"learning_rate": 4.26082143754697e-05,
"loss": 2.2051,
"step": 2560
},
{
"epoch": 1.0670826833073324,
"grad_norm": 0.6239338517189026,
"learning_rate": 4.256326084306839e-05,
"loss": 2.2373,
"step": 2565
},
{
"epoch": 1.0691627665106604,
"grad_norm": 0.6498297452926636,
"learning_rate": 4.2518194901597244e-05,
"loss": 2.155,
"step": 2570
},
{
"epoch": 1.0712428497139885,
"grad_norm": 0.7128412127494812,
"learning_rate": 4.2473016839489084e-05,
"loss": 2.2039,
"step": 2575
},
{
"epoch": 1.0733229329173166,
"grad_norm": 0.63873291015625,
"learning_rate": 4.2427726945894294e-05,
"loss": 2.2305,
"step": 2580
},
{
"epoch": 1.075403016120645,
"grad_norm": 0.6624719500541687,
"learning_rate": 4.2382325510679034e-05,
"loss": 2.1742,
"step": 2585
},
{
"epoch": 1.077483099323973,
"grad_norm": 0.7572752833366394,
"learning_rate": 4.2336812824423345e-05,
"loss": 2.2121,
"step": 2590
},
{
"epoch": 1.079563182527301,
"grad_norm": 0.7270757555961609,
"learning_rate": 4.229118917841931e-05,
"loss": 2.201,
"step": 2595
},
{
"epoch": 1.0816432657306292,
"grad_norm": 0.7370373606681824,
"learning_rate": 4.224545486466916e-05,
"loss": 2.1859,
"step": 2600
},
{
"epoch": 1.0837233489339573,
"grad_norm": 0.7897046208381653,
"learning_rate": 4.219961017588345e-05,
"loss": 2.1436,
"step": 2605
},
{
"epoch": 1.0858034321372856,
"grad_norm": 0.7892904281616211,
"learning_rate": 4.215365540547916e-05,
"loss": 2.2309,
"step": 2610
},
{
"epoch": 1.0878835153406137,
"grad_norm": 0.67086261510849,
"learning_rate": 4.2107590847577795e-05,
"loss": 2.2064,
"step": 2615
},
{
"epoch": 1.0899635985439418,
"grad_norm": 0.7036137580871582,
"learning_rate": 4.2061416797003563e-05,
"loss": 2.1563,
"step": 2620
},
{
"epoch": 1.0920436817472698,
"grad_norm": 0.678588330745697,
"learning_rate": 4.2015133549281405e-05,
"loss": 2.2293,
"step": 2625
},
{
"epoch": 1.094123764950598,
"grad_norm": 0.6367912888526917,
"learning_rate": 4.196874140063519e-05,
"loss": 2.1538,
"step": 2630
},
{
"epoch": 1.0962038481539262,
"grad_norm": 0.8265711665153503,
"learning_rate": 4.192224064798577e-05,
"loss": 2.2269,
"step": 2635
},
{
"epoch": 1.0982839313572543,
"grad_norm": 0.7148616909980774,
"learning_rate": 4.187563158894907e-05,
"loss": 2.1895,
"step": 2640
},
{
"epoch": 1.1003640145605824,
"grad_norm": 0.7453672289848328,
"learning_rate": 4.182891452183423e-05,
"loss": 2.2019,
"step": 2645
},
{
"epoch": 1.1024440977639105,
"grad_norm": 0.7494534254074097,
"learning_rate": 4.178208974564164e-05,
"loss": 2.1909,
"step": 2650
},
{
"epoch": 1.1045241809672386,
"grad_norm": 0.7747659683227539,
"learning_rate": 4.173515756006107e-05,
"loss": 2.1631,
"step": 2655
},
{
"epoch": 1.106604264170567,
"grad_norm": 0.7446553707122803,
"learning_rate": 4.168811826546972e-05,
"loss": 2.2373,
"step": 2660
},
{
"epoch": 1.108684347373895,
"grad_norm": 1.0015227794647217,
"learning_rate": 4.164097216293035e-05,
"loss": 2.1514,
"step": 2665
},
{
"epoch": 1.110764430577223,
"grad_norm": 0.7084265351295471,
"learning_rate": 4.159371955418928e-05,
"loss": 2.1887,
"step": 2670
},
{
"epoch": 1.1128445137805512,
"grad_norm": 0.8212253451347351,
"learning_rate": 4.15463607416745e-05,
"loss": 2.1795,
"step": 2675
},
{
"epoch": 1.1149245969838795,
"grad_norm": 0.6273921728134155,
"learning_rate": 4.149889602849375e-05,
"loss": 2.2028,
"step": 2680
},
{
"epoch": 1.1170046801872076,
"grad_norm": 0.7842991948127747,
"learning_rate": 4.145132571843253e-05,
"loss": 2.2196,
"step": 2685
},
{
"epoch": 1.1190847633905356,
"grad_norm": 1.2877519130706787,
"learning_rate": 4.140365011595222e-05,
"loss": 2.1534,
"step": 2690
},
{
"epoch": 1.1211648465938637,
"grad_norm": 0.8273409008979797,
"learning_rate": 4.1355869526188065e-05,
"loss": 2.2098,
"step": 2695
},
{
"epoch": 1.1232449297971918,
"grad_norm": 0.7299179434776306,
"learning_rate": 4.130798425494726e-05,
"loss": 2.1747,
"step": 2700
},
{
"epoch": 1.12532501300052,
"grad_norm": 0.6302483677864075,
"learning_rate": 4.125999460870701e-05,
"loss": 2.1866,
"step": 2705
},
{
"epoch": 1.1274050962038482,
"grad_norm": 0.6543394327163696,
"learning_rate": 4.121190089461252e-05,
"loss": 2.2179,
"step": 2710
},
{
"epoch": 1.1294851794071763,
"grad_norm": 0.6725994944572449,
"learning_rate": 4.1163703420475065e-05,
"loss": 2.1915,
"step": 2715
},
{
"epoch": 1.1315652626105044,
"grad_norm": 0.6935145854949951,
"learning_rate": 4.111540249476999e-05,
"loss": 2.133,
"step": 2720
},
{
"epoch": 1.1336453458138325,
"grad_norm": 0.6305291652679443,
"learning_rate": 4.106699842663481e-05,
"loss": 2.2123,
"step": 2725
},
{
"epoch": 1.1357254290171608,
"grad_norm": 0.6931216716766357,
"learning_rate": 4.10184915258671e-05,
"loss": 2.2118,
"step": 2730
},
{
"epoch": 1.1378055122204889,
"grad_norm": 0.6725122928619385,
"learning_rate": 4.096988210292264e-05,
"loss": 2.1921,
"step": 2735
},
{
"epoch": 1.139885595423817,
"grad_norm": 0.6598119139671326,
"learning_rate": 4.092117046891336e-05,
"loss": 2.2118,
"step": 2740
},
{
"epoch": 1.141965678627145,
"grad_norm": 0.8654290437698364,
"learning_rate": 4.0872356935605365e-05,
"loss": 2.1693,
"step": 2745
},
{
"epoch": 1.1440457618304731,
"grad_norm": 0.7804104089736938,
"learning_rate": 4.082344181541695e-05,
"loss": 2.1692,
"step": 2750
},
{
"epoch": 1.1461258450338014,
"grad_norm": 0.725069522857666,
"learning_rate": 4.0774425421416586e-05,
"loss": 2.2162,
"step": 2755
},
{
"epoch": 1.1482059282371295,
"grad_norm": 0.8145197629928589,
"learning_rate": 4.07253080673209e-05,
"loss": 2.1679,
"step": 2760
},
{
"epoch": 1.1502860114404576,
"grad_norm": 0.785518229007721,
"learning_rate": 4.0676090067492725e-05,
"loss": 2.2055,
"step": 2765
},
{
"epoch": 1.1523660946437857,
"grad_norm": 0.8365340232849121,
"learning_rate": 4.062677173693901e-05,
"loss": 2.1862,
"step": 2770
},
{
"epoch": 1.154446177847114,
"grad_norm": 0.79314786195755,
"learning_rate": 4.057735339130888e-05,
"loss": 2.2076,
"step": 2775
},
{
"epoch": 1.156526261050442,
"grad_norm": 0.8674628138542175,
"learning_rate": 4.052783534689157e-05,
"loss": 2.1995,
"step": 2780
},
{
"epoch": 1.1586063442537702,
"grad_norm": 0.8610255122184753,
"learning_rate": 4.047821792061439e-05,
"loss": 2.1898,
"step": 2785
},
{
"epoch": 1.1606864274570983,
"grad_norm": 0.6575633883476257,
"learning_rate": 4.042850143004075e-05,
"loss": 2.1737,
"step": 2790
},
{
"epoch": 1.1627665106604264,
"grad_norm": 0.6737598776817322,
"learning_rate": 4.0378686193368076e-05,
"loss": 2.2065,
"step": 2795
},
{
"epoch": 1.1648465938637544,
"grad_norm": 0.6962065100669861,
"learning_rate": 4.0328772529425796e-05,
"loss": 2.1664,
"step": 2800
},
{
"epoch": 1.1669266770670828,
"grad_norm": 0.6436595320701599,
"learning_rate": 4.027876075767329e-05,
"loss": 2.171,
"step": 2805
},
{
"epoch": 1.1690067602704108,
"grad_norm": 0.6421999335289001,
"learning_rate": 4.0228651198197865e-05,
"loss": 2.1698,
"step": 2810
},
{
"epoch": 1.171086843473739,
"grad_norm": 0.8119395971298218,
"learning_rate": 4.017844417171269e-05,
"loss": 2.2039,
"step": 2815
},
{
"epoch": 1.173166926677067,
"grad_norm": 0.7124528884887695,
"learning_rate": 4.012813999955473e-05,
"loss": 2.1838,
"step": 2820
},
{
"epoch": 1.1752470098803953,
"grad_norm": 0.7611419558525085,
"learning_rate": 4.007773900368272e-05,
"loss": 2.2004,
"step": 2825
},
{
"epoch": 1.1773270930837234,
"grad_norm": 0.7459845542907715,
"learning_rate": 4.002724150667509e-05,
"loss": 2.248,
"step": 2830
},
{
"epoch": 1.1794071762870515,
"grad_norm": 0.6019890904426575,
"learning_rate": 3.997664783172792e-05,
"loss": 2.1439,
"step": 2835
},
{
"epoch": 1.1814872594903796,
"grad_norm": 0.7191320061683655,
"learning_rate": 3.992595830265279e-05,
"loss": 2.1846,
"step": 2840
},
{
"epoch": 1.1835673426937077,
"grad_norm": 0.7781062722206116,
"learning_rate": 3.987517324387483e-05,
"loss": 2.1967,
"step": 2845
},
{
"epoch": 1.185647425897036,
"grad_norm": 0.7874257564544678,
"learning_rate": 3.982429298043057e-05,
"loss": 2.206,
"step": 2850
},
{
"epoch": 1.187727509100364,
"grad_norm": 0.6833536624908447,
"learning_rate": 3.977331783796584e-05,
"loss": 2.1726,
"step": 2855
},
{
"epoch": 1.1898075923036922,
"grad_norm": 0.7719831466674805,
"learning_rate": 3.972224814273377e-05,
"loss": 2.1507,
"step": 2860
},
{
"epoch": 1.1918876755070202,
"grad_norm": 0.784296989440918,
"learning_rate": 3.9671084221592604e-05,
"loss": 2.1744,
"step": 2865
},
{
"epoch": 1.1939677587103483,
"grad_norm": 0.8013231754302979,
"learning_rate": 3.961982640200368e-05,
"loss": 2.2045,
"step": 2870
},
{
"epoch": 1.1960478419136766,
"grad_norm": 0.648163914680481,
"learning_rate": 3.95684750120293e-05,
"loss": 2.1638,
"step": 2875
},
{
"epoch": 1.1981279251170047,
"grad_norm": 0.7477614879608154,
"learning_rate": 3.951703038033066e-05,
"loss": 2.1993,
"step": 2880
},
{
"epoch": 1.2002080083203328,
"grad_norm": 0.6471365094184875,
"learning_rate": 3.9465492836165665e-05,
"loss": 2.1837,
"step": 2885
},
{
"epoch": 1.202288091523661,
"grad_norm": 0.708446204662323,
"learning_rate": 3.9413862709386964e-05,
"loss": 2.2097,
"step": 2890
},
{
"epoch": 1.204368174726989,
"grad_norm": 0.5960673689842224,
"learning_rate": 3.93621403304397e-05,
"loss": 2.2038,
"step": 2895
},
{
"epoch": 1.2064482579303173,
"grad_norm": 0.7760262489318848,
"learning_rate": 3.931032603035947e-05,
"loss": 2.2158,
"step": 2900
},
{
"epoch": 1.2085283411336454,
"grad_norm": 0.833264172077179,
"learning_rate": 3.925842014077018e-05,
"loss": 2.1335,
"step": 2905
},
{
"epoch": 1.2106084243369735,
"grad_norm": 0.6717672944068909,
"learning_rate": 3.920642299388194e-05,
"loss": 2.1679,
"step": 2910
},
{
"epoch": 1.2126885075403016,
"grad_norm": 0.7024863362312317,
"learning_rate": 3.915433492248894e-05,
"loss": 2.2027,
"step": 2915
},
{
"epoch": 1.2147685907436299,
"grad_norm": 0.6785237789154053,
"learning_rate": 3.910215625996727e-05,
"loss": 2.1512,
"step": 2920
},
{
"epoch": 1.216848673946958,
"grad_norm": 0.6357817649841309,
"learning_rate": 3.904988734027287e-05,
"loss": 2.1536,
"step": 2925
},
{
"epoch": 1.218928757150286,
"grad_norm": 0.7310034036636353,
"learning_rate": 3.899752849793932e-05,
"loss": 2.156,
"step": 2930
},
{
"epoch": 1.2210088403536141,
"grad_norm": 0.8692020773887634,
"learning_rate": 3.8945080068075726e-05,
"loss": 2.2005,
"step": 2935
},
{
"epoch": 1.2230889235569422,
"grad_norm": 0.7720586657524109,
"learning_rate": 3.8892542386364594e-05,
"loss": 2.2019,
"step": 2940
},
{
"epoch": 1.2251690067602703,
"grad_norm": 0.8210708498954773,
"learning_rate": 3.8839915789059636e-05,
"loss": 2.1664,
"step": 2945
},
{
"epoch": 1.2272490899635986,
"grad_norm": 0.813205361366272,
"learning_rate": 3.8787200612983683e-05,
"loss": 2.2265,
"step": 2950
},
{
"epoch": 1.2293291731669267,
"grad_norm": 0.7682729363441467,
"learning_rate": 3.873439719552645e-05,
"loss": 2.1928,
"step": 2955
},
{
"epoch": 1.2314092563702548,
"grad_norm": 0.8033297061920166,
"learning_rate": 3.8681505874642446e-05,
"loss": 2.1884,
"step": 2960
},
{
"epoch": 1.2334893395735829,
"grad_norm": 0.7240457534790039,
"learning_rate": 3.8628526988848776e-05,
"loss": 2.1819,
"step": 2965
},
{
"epoch": 1.2355694227769112,
"grad_norm": 0.7186746597290039,
"learning_rate": 3.857546087722297e-05,
"loss": 2.184,
"step": 2970
},
{
"epoch": 1.2376495059802393,
"grad_norm": 0.8010230660438538,
"learning_rate": 3.8522307879400835e-05,
"loss": 2.1917,
"step": 2975
},
{
"epoch": 1.2397295891835673,
"grad_norm": 0.8483126759529114,
"learning_rate": 3.846906833557429e-05,
"loss": 2.1907,
"step": 2980
},
{
"epoch": 1.2418096723868954,
"grad_norm": 0.8604680895805359,
"learning_rate": 3.841574258648912e-05,
"loss": 2.1904,
"step": 2985
},
{
"epoch": 1.2438897555902235,
"grad_norm": 0.7877135276794434,
"learning_rate": 3.836233097344288e-05,
"loss": 2.1713,
"step": 2990
},
{
"epoch": 1.2459698387935518,
"grad_norm": 0.7193028926849365,
"learning_rate": 3.8308833838282696e-05,
"loss": 2.1867,
"step": 2995
},
{
"epoch": 1.24804992199688,
"grad_norm": 0.7264684438705444,
"learning_rate": 3.825525152340298e-05,
"loss": 2.1526,
"step": 3000
},
{
"epoch": 1.250130005200208,
"grad_norm": 0.6914860606193542,
"learning_rate": 3.82015843717434e-05,
"loss": 2.1867,
"step": 3005
},
{
"epoch": 1.252210088403536,
"grad_norm": 0.6980414986610413,
"learning_rate": 3.814783272678654e-05,
"loss": 2.1812,
"step": 3010
},
{
"epoch": 1.2542901716068644,
"grad_norm": 0.6947855949401855,
"learning_rate": 3.809399693255579e-05,
"loss": 2.1888,
"step": 3015
},
{
"epoch": 1.2563702548101925,
"grad_norm": 0.8331231474876404,
"learning_rate": 3.8040077333613114e-05,
"loss": 2.1927,
"step": 3020
},
{
"epoch": 1.2584503380135206,
"grad_norm": 0.6281008720397949,
"learning_rate": 3.798607427505684e-05,
"loss": 2.1284,
"step": 3025
},
{
"epoch": 1.2605304212168487,
"grad_norm": 0.6286195516586304,
"learning_rate": 3.7931988102519436e-05,
"loss": 2.1813,
"step": 3030
},
{
"epoch": 1.2626105044201767,
"grad_norm": 0.6835711002349854,
"learning_rate": 3.7877819162165376e-05,
"loss": 2.1774,
"step": 3035
},
{
"epoch": 1.2646905876235048,
"grad_norm": 0.7094857692718506,
"learning_rate": 3.7823567800688805e-05,
"loss": 2.1944,
"step": 3040
},
{
"epoch": 1.2667706708268331,
"grad_norm": 0.7638453841209412,
"learning_rate": 3.776923436531142e-05,
"loss": 2.1965,
"step": 3045
},
{
"epoch": 1.2688507540301612,
"grad_norm": 0.8362658023834229,
"learning_rate": 3.7714819203780215e-05,
"loss": 2.2046,
"step": 3050
},
{
"epoch": 1.2709308372334893,
"grad_norm": 0.7839365601539612,
"learning_rate": 3.7660322664365226e-05,
"loss": 2.1612,
"step": 3055
},
{
"epoch": 1.2730109204368174,
"grad_norm": 0.7160779237747192,
"learning_rate": 3.760574509585734e-05,
"loss": 2.1517,
"step": 3060
},
{
"epoch": 1.2750910036401457,
"grad_norm": 0.6370715498924255,
"learning_rate": 3.7551086847566045e-05,
"loss": 2.1783,
"step": 3065
},
{
"epoch": 1.2771710868434738,
"grad_norm": 0.7071606516838074,
"learning_rate": 3.74963482693172e-05,
"loss": 2.2303,
"step": 3070
},
{
"epoch": 1.2792511700468019,
"grad_norm": 0.6695096492767334,
"learning_rate": 3.744152971145081e-05,
"loss": 2.2031,
"step": 3075
},
{
"epoch": 1.28133125325013,
"grad_norm": 0.8058128356933594,
"learning_rate": 3.738663152481875e-05,
"loss": 2.1971,
"step": 3080
},
{
"epoch": 1.283411336453458,
"grad_norm": 0.7660072445869446,
"learning_rate": 3.733165406078254e-05,
"loss": 2.1417,
"step": 3085
},
{
"epoch": 1.2854914196567861,
"grad_norm": 0.639193058013916,
"learning_rate": 3.727659767121109e-05,
"loss": 2.2019,
"step": 3090
},
{
"epoch": 1.2875715028601145,
"grad_norm": 0.7742004990577698,
"learning_rate": 3.722146270847848e-05,
"loss": 2.1982,
"step": 3095
},
{
"epoch": 1.2896515860634425,
"grad_norm": 0.7002267241477966,
"learning_rate": 3.716624952546166e-05,
"loss": 2.1751,
"step": 3100
},
{
"epoch": 1.2917316692667706,
"grad_norm": 0.7309245467185974,
"learning_rate": 3.711095847553817e-05,
"loss": 2.1215,
"step": 3105
},
{
"epoch": 1.293811752470099,
"grad_norm": 0.7643809914588928,
"learning_rate": 3.7055589912583995e-05,
"loss": 2.1208,
"step": 3110
},
{
"epoch": 1.295891835673427,
"grad_norm": 0.6457206606864929,
"learning_rate": 3.700014419097115e-05,
"loss": 2.112,
"step": 3115
},
{
"epoch": 1.2979719188767551,
"grad_norm": 0.7436161041259766,
"learning_rate": 3.694462166556554e-05,
"loss": 2.1684,
"step": 3120
},
{
"epoch": 1.3000520020800832,
"grad_norm": 0.8125039339065552,
"learning_rate": 3.688902269172458e-05,
"loss": 2.1971,
"step": 3125
},
{
"epoch": 1.3021320852834113,
"grad_norm": 0.7858961224555969,
"learning_rate": 3.6833347625295016e-05,
"loss": 2.1598,
"step": 3130
},
{
"epoch": 1.3042121684867394,
"grad_norm": 0.7834795117378235,
"learning_rate": 3.677759682261058e-05,
"loss": 2.1854,
"step": 3135
},
{
"epoch": 1.3062922516900677,
"grad_norm": 0.6971118450164795,
"learning_rate": 3.672177064048976e-05,
"loss": 2.1651,
"step": 3140
},
{
"epoch": 1.3083723348933958,
"grad_norm": 0.7360151410102844,
"learning_rate": 3.6665869436233446e-05,
"loss": 2.2039,
"step": 3145
},
{
"epoch": 1.3104524180967239,
"grad_norm": 0.7403413653373718,
"learning_rate": 3.6609893567622735e-05,
"loss": 2.1726,
"step": 3150
},
{
"epoch": 1.312532501300052,
"grad_norm": 0.6509247422218323,
"learning_rate": 3.655384339291657e-05,
"loss": 2.1739,
"step": 3155
},
{
"epoch": 1.3146125845033803,
"grad_norm": 0.7621864080429077,
"learning_rate": 3.6497719270849464e-05,
"loss": 2.2427,
"step": 3160
},
{
"epoch": 1.3166926677067083,
"grad_norm": 0.851795494556427,
"learning_rate": 3.6441521560629225e-05,
"loss": 2.1947,
"step": 3165
},
{
"epoch": 1.3187727509100364,
"grad_norm": 0.7552398443222046,
"learning_rate": 3.6385250621934655e-05,
"loss": 2.1688,
"step": 3170
},
{
"epoch": 1.3208528341133645,
"grad_norm": 0.824918270111084,
"learning_rate": 3.6328906814913194e-05,
"loss": 2.149,
"step": 3175
},
{
"epoch": 1.3229329173166926,
"grad_norm": 1.407629370689392,
"learning_rate": 3.62724905001787e-05,
"loss": 2.1911,
"step": 3180
},
{
"epoch": 1.3250130005200207,
"grad_norm": 0.7478744983673096,
"learning_rate": 3.621600203880907e-05,
"loss": 2.1778,
"step": 3185
},
{
"epoch": 1.327093083723349,
"grad_norm": 0.7519647479057312,
"learning_rate": 3.615944179234397e-05,
"loss": 2.1899,
"step": 3190
},
{
"epoch": 1.329173166926677,
"grad_norm": 0.8431859612464905,
"learning_rate": 3.610281012278252e-05,
"loss": 2.1521,
"step": 3195
},
{
"epoch": 1.3312532501300052,
"grad_norm": 0.6993555426597595,
"learning_rate": 3.604610739258091e-05,
"loss": 2.1763,
"step": 3200
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.7812565565109253,
"learning_rate": 3.5989333964650216e-05,
"loss": 2.1523,
"step": 3205
},
{
"epoch": 1.3354134165366616,
"grad_norm": 0.6882584691047668,
"learning_rate": 3.593249020235393e-05,
"loss": 2.1866,
"step": 3210
},
{
"epoch": 1.3374934997399897,
"grad_norm": 0.7424536943435669,
"learning_rate": 3.5875576469505735e-05,
"loss": 2.2083,
"step": 3215
},
{
"epoch": 1.3395735829433177,
"grad_norm": 0.7519948482513428,
"learning_rate": 3.581859313036712e-05,
"loss": 2.1977,
"step": 3220
},
{
"epoch": 1.3416536661466458,
"grad_norm": 0.759537398815155,
"learning_rate": 3.576154054964511e-05,
"loss": 2.1824,
"step": 3225
},
{
"epoch": 1.343733749349974,
"grad_norm": 0.8406732678413391,
"learning_rate": 3.570441909248984e-05,
"loss": 2.1667,
"step": 3230
},
{
"epoch": 1.345813832553302,
"grad_norm": 0.7489810585975647,
"learning_rate": 3.564722912449231e-05,
"loss": 2.1611,
"step": 3235
},
{
"epoch": 1.3478939157566303,
"grad_norm": 0.6325241327285767,
"learning_rate": 3.558997101168199e-05,
"loss": 2.1867,
"step": 3240
},
{
"epoch": 1.3499739989599584,
"grad_norm": 1.2815579175949097,
"learning_rate": 3.553264512052449e-05,
"loss": 2.2166,
"step": 3245
},
{
"epoch": 1.3520540821632865,
"grad_norm": 0.7271225452423096,
"learning_rate": 3.5475251817919234e-05,
"loss": 2.1725,
"step": 3250
},
{
"epoch": 1.3541341653666148,
"grad_norm": 0.7447043657302856,
"learning_rate": 3.5417791471197083e-05,
"loss": 2.1804,
"step": 3255
},
{
"epoch": 1.3562142485699429,
"grad_norm": 0.8641957640647888,
"learning_rate": 3.5360264448117986e-05,
"loss": 2.2064,
"step": 3260
},
{
"epoch": 1.358294331773271,
"grad_norm": 0.8609848618507385,
"learning_rate": 3.530267111686867e-05,
"loss": 2.2272,
"step": 3265
},
{
"epoch": 1.360374414976599,
"grad_norm": 0.7370724678039551,
"learning_rate": 3.52450118460602e-05,
"loss": 2.1857,
"step": 3270
},
{
"epoch": 1.3624544981799271,
"grad_norm": 0.7609270215034485,
"learning_rate": 3.518728700472573e-05,
"loss": 2.189,
"step": 3275
},
{
"epoch": 1.3645345813832552,
"grad_norm": 0.6914607286453247,
"learning_rate": 3.512949696231804e-05,
"loss": 2.18,
"step": 3280
},
{
"epoch": 1.3666146645865835,
"grad_norm": 0.8150199055671692,
"learning_rate": 3.507164208870721e-05,
"loss": 2.1867,
"step": 3285
},
{
"epoch": 1.3686947477899116,
"grad_norm": 0.6963294148445129,
"learning_rate": 3.501372275417828e-05,
"loss": 2.1847,
"step": 3290
},
{
"epoch": 1.3707748309932397,
"grad_norm": 0.8265756368637085,
"learning_rate": 3.495573932942884e-05,
"loss": 2.1786,
"step": 3295
},
{
"epoch": 1.3728549141965678,
"grad_norm": 0.7163086533546448,
"learning_rate": 3.489769218556667e-05,
"loss": 2.2068,
"step": 3300
},
{
"epoch": 1.374934997399896,
"grad_norm": 0.7726351618766785,
"learning_rate": 3.483958169410738e-05,
"loss": 2.1578,
"step": 3305
},
{
"epoch": 1.3770150806032242,
"grad_norm": 0.9150016903877258,
"learning_rate": 3.478140822697202e-05,
"loss": 2.1928,
"step": 3310
},
{
"epoch": 1.3790951638065523,
"grad_norm": 0.7856032848358154,
"learning_rate": 3.472317215648467e-05,
"loss": 2.1963,
"step": 3315
},
{
"epoch": 1.3811752470098804,
"grad_norm": 0.8869948387145996,
"learning_rate": 3.466487385537013e-05,
"loss": 2.1658,
"step": 3320
},
{
"epoch": 1.3832553302132085,
"grad_norm": 0.7349131107330322,
"learning_rate": 3.460651369675147e-05,
"loss": 2.1698,
"step": 3325
},
{
"epoch": 1.3853354134165365,
"grad_norm": 0.6802077889442444,
"learning_rate": 3.4548092054147645e-05,
"loss": 2.1982,
"step": 3330
},
{
"epoch": 1.3874154966198649,
"grad_norm": 0.6826719045639038,
"learning_rate": 3.448960930147115e-05,
"loss": 2.1806,
"step": 3335
},
{
"epoch": 1.389495579823193,
"grad_norm": 0.7698726654052734,
"learning_rate": 3.44310658130256e-05,
"loss": 2.1971,
"step": 3340
},
{
"epoch": 1.391575663026521,
"grad_norm": 0.7256355285644531,
"learning_rate": 3.4372461963503294e-05,
"loss": 2.1719,
"step": 3345
},
{
"epoch": 1.3936557462298491,
"grad_norm": 0.8313025832176208,
"learning_rate": 3.431379812798291e-05,
"loss": 2.1915,
"step": 3350
},
{
"epoch": 1.3957358294331774,
"grad_norm": 0.8327659368515015,
"learning_rate": 3.425507468192702e-05,
"loss": 2.1955,
"step": 3355
},
{
"epoch": 1.3978159126365055,
"grad_norm": 0.6724772453308105,
"learning_rate": 3.419629200117972e-05,
"loss": 2.1378,
"step": 3360
},
{
"epoch": 1.3998959958398336,
"grad_norm": 0.6691730618476868,
"learning_rate": 3.4137450461964213e-05,
"loss": 2.2271,
"step": 3365
},
{
"epoch": 1.4019760790431617,
"grad_norm": 0.8396449685096741,
"learning_rate": 3.407855044088045e-05,
"loss": 2.2151,
"step": 3370
},
{
"epoch": 1.4040561622464898,
"grad_norm": 0.9861184358596802,
"learning_rate": 3.401959231490263e-05,
"loss": 2.1944,
"step": 3375
},
{
"epoch": 1.4061362454498179,
"grad_norm": 0.6781579256057739,
"learning_rate": 3.396057646137687e-05,
"loss": 2.1622,
"step": 3380
},
{
"epoch": 1.4082163286531462,
"grad_norm": 0.7667004466056824,
"learning_rate": 3.390150325801874e-05,
"loss": 2.2198,
"step": 3385
},
{
"epoch": 1.4102964118564743,
"grad_norm": 0.7576628923416138,
"learning_rate": 3.3842373082910884e-05,
"loss": 2.1466,
"step": 3390
},
{
"epoch": 1.4123764950598023,
"grad_norm": 0.7718074917793274,
"learning_rate": 3.378318631450055e-05,
"loss": 2.2033,
"step": 3395
},
{
"epoch": 1.4144565782631306,
"grad_norm": 0.5833465456962585,
"learning_rate": 3.3723943331597205e-05,
"loss": 2.2106,
"step": 3400
},
{
"epoch": 1.4165366614664587,
"grad_norm": 0.7592945098876953,
"learning_rate": 3.366464451337012e-05,
"loss": 2.1829,
"step": 3405
},
{
"epoch": 1.4186167446697868,
"grad_norm": 0.8660997748374939,
"learning_rate": 3.360529023934592e-05,
"loss": 2.1769,
"step": 3410
},
{
"epoch": 1.420696827873115,
"grad_norm": 0.6722576022148132,
"learning_rate": 3.354588088940614e-05,
"loss": 2.1786,
"step": 3415
},
{
"epoch": 1.422776911076443,
"grad_norm": 0.689155638217926,
"learning_rate": 3.348641684378483e-05,
"loss": 2.1651,
"step": 3420
},
{
"epoch": 1.424856994279771,
"grad_norm": 0.8105888366699219,
"learning_rate": 3.342689848306611e-05,
"loss": 2.1733,
"step": 3425
},
{
"epoch": 1.4269370774830994,
"grad_norm": 0.8982015252113342,
"learning_rate": 3.3367326188181725e-05,
"loss": 2.1921,
"step": 3430
},
{
"epoch": 1.4290171606864275,
"grad_norm": 0.7339545488357544,
"learning_rate": 3.3307700340408596e-05,
"loss": 2.1979,
"step": 3435
},
{
"epoch": 1.4310972438897556,
"grad_norm": 0.8030118942260742,
"learning_rate": 3.324802132136642e-05,
"loss": 2.2051,
"step": 3440
},
{
"epoch": 1.4331773270930837,
"grad_norm": 0.7789422869682312,
"learning_rate": 3.318828951301519e-05,
"loss": 2.1387,
"step": 3445
},
{
"epoch": 1.435257410296412,
"grad_norm": 0.7332764863967896,
"learning_rate": 3.3128505297652765e-05,
"loss": 2.1851,
"step": 3450
},
{
"epoch": 1.43733749349974,
"grad_norm": 0.7092201709747314,
"learning_rate": 3.306866905791242e-05,
"loss": 2.201,
"step": 3455
},
{
"epoch": 1.4394175767030681,
"grad_norm": 0.7219560146331787,
"learning_rate": 3.30087811767604e-05,
"loss": 2.1875,
"step": 3460
},
{
"epoch": 1.4414976599063962,
"grad_norm": 0.7875995635986328,
"learning_rate": 3.2948842037493466e-05,
"loss": 2.1303,
"step": 3465
},
{
"epoch": 1.4435777431097243,
"grad_norm": 1.0195878744125366,
"learning_rate": 3.288885202373644e-05,
"loss": 2.189,
"step": 3470
},
{
"epoch": 1.4456578263130524,
"grad_norm": 0.7329500913619995,
"learning_rate": 3.282881151943977e-05,
"loss": 2.2193,
"step": 3475
},
{
"epoch": 1.4477379095163807,
"grad_norm": 0.6737660765647888,
"learning_rate": 3.276872090887702e-05,
"loss": 2.2346,
"step": 3480
},
{
"epoch": 1.4498179927197088,
"grad_norm": 0.8194296360015869,
"learning_rate": 3.270858057664251e-05,
"loss": 2.177,
"step": 3485
},
{
"epoch": 1.4518980759230369,
"grad_norm": 0.696855366230011,
"learning_rate": 3.264839090764871e-05,
"loss": 2.1669,
"step": 3490
},
{
"epoch": 1.4539781591263652,
"grad_norm": 0.7700757384300232,
"learning_rate": 3.2588152287123904e-05,
"loss": 2.1862,
"step": 3495
},
{
"epoch": 1.4560582423296933,
"grad_norm": 0.6540327668190002,
"learning_rate": 3.252786510060969e-05,
"loss": 2.2274,
"step": 3500
},
{
"epoch": 1.4581383255330214,
"grad_norm": 0.5958786606788635,
"learning_rate": 3.246752973395846e-05,
"loss": 2.1997,
"step": 3505
},
{
"epoch": 1.4602184087363494,
"grad_norm": 0.7021071314811707,
"learning_rate": 3.2407146573331e-05,
"loss": 2.1759,
"step": 3510
},
{
"epoch": 1.4622984919396775,
"grad_norm": 0.6807276606559753,
"learning_rate": 3.234671600519397e-05,
"loss": 2.2311,
"step": 3515
},
{
"epoch": 1.4643785751430056,
"grad_norm": 0.7907546162605286,
"learning_rate": 3.228623841631747e-05,
"loss": 2.2092,
"step": 3520
},
{
"epoch": 1.466458658346334,
"grad_norm": 0.8120606541633606,
"learning_rate": 3.2225714193772526e-05,
"loss": 2.1309,
"step": 3525
},
{
"epoch": 1.468538741549662,
"grad_norm": 0.7305984497070312,
"learning_rate": 3.216514372492864e-05,
"loss": 2.1823,
"step": 3530
},
{
"epoch": 1.47061882475299,
"grad_norm": 0.737861156463623,
"learning_rate": 3.210452739745129e-05,
"loss": 2.1817,
"step": 3535
},
{
"epoch": 1.4726989079563182,
"grad_norm": 0.6479620933532715,
"learning_rate": 3.2043865599299484e-05,
"loss": 2.1553,
"step": 3540
},
{
"epoch": 1.4747789911596465,
"grad_norm": 0.8242900371551514,
"learning_rate": 3.1983158718723225e-05,
"loss": 2.1756,
"step": 3545
},
{
"epoch": 1.4768590743629746,
"grad_norm": 1.2284351587295532,
"learning_rate": 3.192240714426108e-05,
"loss": 2.1861,
"step": 3550
},
{
"epoch": 1.4789391575663027,
"grad_norm": 0.8496856093406677,
"learning_rate": 3.1861611264737644e-05,
"loss": 2.1947,
"step": 3555
},
{
"epoch": 1.4810192407696308,
"grad_norm": 0.6714478135108948,
"learning_rate": 3.180077146926109e-05,
"loss": 2.1857,
"step": 3560
},
{
"epoch": 1.4830993239729588,
"grad_norm": 0.7734602689743042,
"learning_rate": 3.173988814722065e-05,
"loss": 2.1711,
"step": 3565
},
{
"epoch": 1.485179407176287,
"grad_norm": 0.6525964736938477,
"learning_rate": 3.167896168828417e-05,
"loss": 2.1995,
"step": 3570
},
{
"epoch": 1.4872594903796152,
"grad_norm": 0.9553260803222656,
"learning_rate": 3.161799248239553e-05,
"loss": 2.1814,
"step": 3575
},
{
"epoch": 1.4893395735829433,
"grad_norm": 0.8433247804641724,
"learning_rate": 3.155698091977224e-05,
"loss": 2.1783,
"step": 3580
},
{
"epoch": 1.4914196567862714,
"grad_norm": 0.706148087978363,
"learning_rate": 3.1495927390902905e-05,
"loss": 2.1987,
"step": 3585
},
{
"epoch": 1.4934997399895995,
"grad_norm": 0.7170814275741577,
"learning_rate": 3.14348322865447e-05,
"loss": 2.152,
"step": 3590
},
{
"epoch": 1.4955798231929278,
"grad_norm": 0.726800799369812,
"learning_rate": 3.1373695997720895e-05,
"loss": 2.1854,
"step": 3595
},
{
"epoch": 1.497659906396256,
"grad_norm": 0.7688819169998169,
"learning_rate": 3.131251891571839e-05,
"loss": 2.1488,
"step": 3600
},
{
"epoch": 1.499739989599584,
"grad_norm": 0.7639352679252625,
"learning_rate": 3.1251301432085106e-05,
"loss": 2.1583,
"step": 3605
},
{
"epoch": 1.501820072802912,
"grad_norm": 0.6791670322418213,
"learning_rate": 3.11900439386276e-05,
"loss": 2.1783,
"step": 3610
},
{
"epoch": 1.5039001560062402,
"grad_norm": 0.6605120897293091,
"learning_rate": 3.112874682740847e-05,
"loss": 2.1583,
"step": 3615
},
{
"epoch": 1.5059802392095682,
"grad_norm": 0.7585316896438599,
"learning_rate": 3.10674104907439e-05,
"loss": 2.1527,
"step": 3620
},
{
"epoch": 1.5080603224128966,
"grad_norm": 0.7344695329666138,
"learning_rate": 3.10060353212011e-05,
"loss": 2.1664,
"step": 3625
},
{
"epoch": 1.5101404056162246,
"grad_norm": 0.7378405332565308,
"learning_rate": 3.094462171159584e-05,
"loss": 2.1505,
"step": 3630
},
{
"epoch": 1.5122204888195527,
"grad_norm": 0.7696133255958557,
"learning_rate": 3.088317005498991e-05,
"loss": 2.1686,
"step": 3635
},
{
"epoch": 1.514300572022881,
"grad_norm": 0.6149158477783203,
"learning_rate": 3.082168074468861e-05,
"loss": 2.17,
"step": 3640
},
{
"epoch": 1.5163806552262091,
"grad_norm": 0.7041980028152466,
"learning_rate": 3.0760154174238226e-05,
"loss": 2.1967,
"step": 3645
},
{
"epoch": 1.5184607384295372,
"grad_norm": 0.8049356937408447,
"learning_rate": 3.069859073742352e-05,
"loss": 2.1727,
"step": 3650
},
{
"epoch": 1.5205408216328653,
"grad_norm": 0.75400710105896,
"learning_rate": 3.0636990828265236e-05,
"loss": 2.1826,
"step": 3655
},
{
"epoch": 1.5226209048361934,
"grad_norm": 0.8219313621520996,
"learning_rate": 3.0575354841017495e-05,
"loss": 2.1961,
"step": 3660
},
{
"epoch": 1.5247009880395215,
"grad_norm": 0.8475666046142578,
"learning_rate": 3.051368317016537e-05,
"loss": 2.1641,
"step": 3665
},
{
"epoch": 1.5267810712428496,
"grad_norm": 0.7787706851959229,
"learning_rate": 3.0451976210422307e-05,
"loss": 2.17,
"step": 3670
},
{
"epoch": 1.5288611544461779,
"grad_norm": 0.8453835844993591,
"learning_rate": 3.03902343567276e-05,
"loss": 2.2105,
"step": 3675
},
{
"epoch": 1.530941237649506,
"grad_norm": 0.7797660231590271,
"learning_rate": 3.0328458004243877e-05,
"loss": 2.1681,
"step": 3680
},
{
"epoch": 1.5330213208528343,
"grad_norm": 0.7047582864761353,
"learning_rate": 3.0266647548354576e-05,
"loss": 2.1242,
"step": 3685
},
{
"epoch": 1.5351014040561624,
"grad_norm": 0.6521434187889099,
"learning_rate": 3.0204803384661386e-05,
"loss": 2.2045,
"step": 3690
},
{
"epoch": 1.5371814872594904,
"grad_norm": 0.7803774476051331,
"learning_rate": 3.0142925908981756e-05,
"loss": 2.1639,
"step": 3695
},
{
"epoch": 1.5392615704628185,
"grad_norm": 0.7463592886924744,
"learning_rate": 3.0081015517346328e-05,
"loss": 2.1969,
"step": 3700
},
{
"epoch": 1.5413416536661466,
"grad_norm": 0.689757764339447,
"learning_rate": 3.0019072605996412e-05,
"loss": 2.2282,
"step": 3705
},
{
"epoch": 1.5434217368694747,
"grad_norm": 0.9387602806091309,
"learning_rate": 2.9957097571381453e-05,
"loss": 2.1989,
"step": 3710
},
{
"epoch": 1.5455018200728028,
"grad_norm": 0.749153196811676,
"learning_rate": 2.98950908101565e-05,
"loss": 2.1492,
"step": 3715
},
{
"epoch": 1.547581903276131,
"grad_norm": 0.7032657265663147,
"learning_rate": 2.983305271917965e-05,
"loss": 2.1923,
"step": 3720
},
{
"epoch": 1.5496619864794592,
"grad_norm": 0.640876054763794,
"learning_rate": 2.9770983695509517e-05,
"loss": 2.1863,
"step": 3725
},
{
"epoch": 1.5517420696827873,
"grad_norm": 0.6808728575706482,
"learning_rate": 2.9708884136402715e-05,
"loss": 2.1861,
"step": 3730
},
{
"epoch": 1.5538221528861156,
"grad_norm": 0.6913876533508301,
"learning_rate": 2.9646754439311252e-05,
"loss": 2.1534,
"step": 3735
},
{
"epoch": 1.5559022360894437,
"grad_norm": 0.8387054800987244,
"learning_rate": 2.9584595001880065e-05,
"loss": 2.1821,
"step": 3740
},
{
"epoch": 1.5579823192927718,
"grad_norm": 0.7485213279724121,
"learning_rate": 2.9522406221944415e-05,
"loss": 2.1615,
"step": 3745
},
{
"epoch": 1.5600624024960998,
"grad_norm": 0.6283704042434692,
"learning_rate": 2.9460188497527363e-05,
"loss": 2.11,
"step": 3750
},
{
"epoch": 1.562142485699428,
"grad_norm": 0.8199918270111084,
"learning_rate": 2.9397942226837222e-05,
"loss": 2.182,
"step": 3755
},
{
"epoch": 1.564222568902756,
"grad_norm": 0.7533789873123169,
"learning_rate": 2.9335667808265023e-05,
"loss": 2.1962,
"step": 3760
},
{
"epoch": 1.566302652106084,
"grad_norm": 1.0285567045211792,
"learning_rate": 2.9273365640381924e-05,
"loss": 2.1573,
"step": 3765
},
{
"epoch": 1.5683827353094124,
"grad_norm": 0.7705976366996765,
"learning_rate": 2.921103612193672e-05,
"loss": 2.1847,
"step": 3770
},
{
"epoch": 1.5704628185127405,
"grad_norm": 0.7277804613113403,
"learning_rate": 2.9148679651853212e-05,
"loss": 2.1516,
"step": 3775
},
{
"epoch": 1.5725429017160688,
"grad_norm": 0.7203589677810669,
"learning_rate": 2.9086296629227738e-05,
"loss": 2.1295,
"step": 3780
},
{
"epoch": 1.574622984919397,
"grad_norm": 0.7318481802940369,
"learning_rate": 2.9023887453326554e-05,
"loss": 2.1474,
"step": 3785
},
{
"epoch": 1.576703068122725,
"grad_norm": 0.7423680424690247,
"learning_rate": 2.8961452523583322e-05,
"loss": 2.2318,
"step": 3790
},
{
"epoch": 1.578783151326053,
"grad_norm": 0.6626203060150146,
"learning_rate": 2.8898992239596507e-05,
"loss": 2.1331,
"step": 3795
},
{
"epoch": 1.5808632345293812,
"grad_norm": 0.6937516331672668,
"learning_rate": 2.883650700112689e-05,
"loss": 2.1473,
"step": 3800
},
{
"epoch": 1.5829433177327092,
"grad_norm": 0.7485554814338684,
"learning_rate": 2.8773997208094912e-05,
"loss": 2.1692,
"step": 3805
},
{
"epoch": 1.5850234009360373,
"grad_norm": 0.7731395363807678,
"learning_rate": 2.8711463260578214e-05,
"loss": 2.2108,
"step": 3810
},
{
"epoch": 1.5871034841393654,
"grad_norm": 0.6718974709510803,
"learning_rate": 2.864890555880902e-05,
"loss": 2.1694,
"step": 3815
},
{
"epoch": 1.5891835673426937,
"grad_norm": 0.7455317974090576,
"learning_rate": 2.8586324503171574e-05,
"loss": 2.1801,
"step": 3820
},
{
"epoch": 1.5912636505460218,
"grad_norm": 0.6923863887786865,
"learning_rate": 2.8523720494199595e-05,
"loss": 2.2099,
"step": 3825
},
{
"epoch": 1.5933437337493501,
"grad_norm": 0.7777950167655945,
"learning_rate": 2.8461093932573736e-05,
"loss": 2.1788,
"step": 3830
},
{
"epoch": 1.5954238169526782,
"grad_norm": 0.7518746256828308,
"learning_rate": 2.8398445219118935e-05,
"loss": 2.1768,
"step": 3835
},
{
"epoch": 1.5975039001560063,
"grad_norm": 0.6466406583786011,
"learning_rate": 2.8335774754801965e-05,
"loss": 2.1786,
"step": 3840
},
{
"epoch": 1.5995839833593344,
"grad_norm": 0.9300071001052856,
"learning_rate": 2.8273082940728784e-05,
"loss": 2.13,
"step": 3845
},
{
"epoch": 1.6016640665626625,
"grad_norm": 0.668032169342041,
"learning_rate": 2.8210370178141987e-05,
"loss": 2.1955,
"step": 3850
},
{
"epoch": 1.6037441497659906,
"grad_norm": 0.713634192943573,
"learning_rate": 2.814763686841826e-05,
"loss": 2.2051,
"step": 3855
},
{
"epoch": 1.6058242329693186,
"grad_norm": 0.7603687644004822,
"learning_rate": 2.808488341306578e-05,
"loss": 2.1569,
"step": 3860
},
{
"epoch": 1.607904316172647,
"grad_norm": 0.7058757543563843,
"learning_rate": 2.8022110213721688e-05,
"loss": 2.1072,
"step": 3865
},
{
"epoch": 1.609984399375975,
"grad_norm": 0.649702787399292,
"learning_rate": 2.7959317672149444e-05,
"loss": 2.1538,
"step": 3870
},
{
"epoch": 1.6120644825793031,
"grad_norm": 0.7990389466285706,
"learning_rate": 2.789650619023636e-05,
"loss": 2.1701,
"step": 3875
},
{
"epoch": 1.6141445657826314,
"grad_norm": 0.814145028591156,
"learning_rate": 2.783367616999092e-05,
"loss": 2.1796,
"step": 3880
},
{
"epoch": 1.6162246489859595,
"grad_norm": 0.8015204071998596,
"learning_rate": 2.7770828013540294e-05,
"loss": 2.1958,
"step": 3885
},
{
"epoch": 1.6183047321892876,
"grad_norm": 0.7484777569770813,
"learning_rate": 2.7707962123127707e-05,
"loss": 2.156,
"step": 3890
},
{
"epoch": 1.6203848153926157,
"grad_norm": 1.0027931928634644,
"learning_rate": 2.7645078901109893e-05,
"loss": 2.1935,
"step": 3895
},
{
"epoch": 1.6224648985959438,
"grad_norm": 0.9040629863739014,
"learning_rate": 2.7582178749954523e-05,
"loss": 2.1512,
"step": 3900
},
{
"epoch": 1.6245449817992719,
"grad_norm": 0.6937136054039001,
"learning_rate": 2.7519262072237594e-05,
"loss": 2.1613,
"step": 3905
},
{
"epoch": 1.6266250650026,
"grad_norm": 0.7338537573814392,
"learning_rate": 2.745632927064089e-05,
"loss": 2.1566,
"step": 3910
},
{
"epoch": 1.6287051482059283,
"grad_norm": 0.7064099907875061,
"learning_rate": 2.739338074794941e-05,
"loss": 2.1996,
"step": 3915
},
{
"epoch": 1.6307852314092564,
"grad_norm": 0.7720285654067993,
"learning_rate": 2.7330416907048727e-05,
"loss": 2.1751,
"step": 3920
},
{
"epoch": 1.6328653146125847,
"grad_norm": 1.0123454332351685,
"learning_rate": 2.7267438150922508e-05,
"loss": 2.1707,
"step": 3925
},
{
"epoch": 1.6349453978159127,
"grad_norm": 0.7546709179878235,
"learning_rate": 2.720444488264984e-05,
"loss": 2.1563,
"step": 3930
},
{
"epoch": 1.6370254810192408,
"grad_norm": 0.6726992726325989,
"learning_rate": 2.7141437505402705e-05,
"loss": 2.2035,
"step": 3935
},
{
"epoch": 1.639105564222569,
"grad_norm": 0.8486371040344238,
"learning_rate": 2.7078416422443386e-05,
"loss": 2.1483,
"step": 3940
},
{
"epoch": 1.641185647425897,
"grad_norm": 0.8228679299354553,
"learning_rate": 2.7015382037121896e-05,
"loss": 2.1901,
"step": 3945
},
{
"epoch": 1.643265730629225,
"grad_norm": 0.7839644551277161,
"learning_rate": 2.695233475287336e-05,
"loss": 2.1339,
"step": 3950
},
{
"epoch": 1.6453458138325532,
"grad_norm": 0.8633984923362732,
"learning_rate": 2.6889274973215495e-05,
"loss": 2.2077,
"step": 3955
},
{
"epoch": 1.6474258970358813,
"grad_norm": 0.8162491321563721,
"learning_rate": 2.6826203101745956e-05,
"loss": 2.1532,
"step": 3960
},
{
"epoch": 1.6495059802392096,
"grad_norm": 0.7576460242271423,
"learning_rate": 2.6763119542139813e-05,
"loss": 2.1764,
"step": 3965
},
{
"epoch": 1.6515860634425377,
"grad_norm": 0.7479214668273926,
"learning_rate": 2.670002469814693e-05,
"loss": 2.1551,
"step": 3970
},
{
"epoch": 1.653666146645866,
"grad_norm": 0.6720142960548401,
"learning_rate": 2.6636918973589402e-05,
"loss": 2.1667,
"step": 3975
},
{
"epoch": 1.655746229849194,
"grad_norm": 0.8261317014694214,
"learning_rate": 2.6573802772358965e-05,
"loss": 2.1691,
"step": 3980
},
{
"epoch": 1.6578263130525221,
"grad_norm": 0.7510902881622314,
"learning_rate": 2.6510676498414377e-05,
"loss": 2.2111,
"step": 3985
},
{
"epoch": 1.6599063962558502,
"grad_norm": 0.7204596996307373,
"learning_rate": 2.644754055577892e-05,
"loss": 2.168,
"step": 3990
},
{
"epoch": 1.6619864794591783,
"grad_norm": 0.6878061890602112,
"learning_rate": 2.6384395348537704e-05,
"loss": 2.1643,
"step": 3995
},
{
"epoch": 1.6640665626625064,
"grad_norm": 0.6784379482269287,
"learning_rate": 2.6321241280835173e-05,
"loss": 2.1648,
"step": 4000
},
{
"epoch": 1.6661466458658345,
"grad_norm": 0.6045475006103516,
"learning_rate": 2.6258078756872445e-05,
"loss": 2.1654,
"step": 4005
},
{
"epoch": 1.6682267290691628,
"grad_norm": 0.7655680775642395,
"learning_rate": 2.6194908180904798e-05,
"loss": 2.1774,
"step": 4010
},
{
"epoch": 1.670306812272491,
"grad_norm": 0.9217324256896973,
"learning_rate": 2.613172995723902e-05,
"loss": 2.1726,
"step": 4015
},
{
"epoch": 1.672386895475819,
"grad_norm": 0.8260428309440613,
"learning_rate": 2.6068544490230852e-05,
"loss": 2.2147,
"step": 4020
},
{
"epoch": 1.6744669786791473,
"grad_norm": 0.7907747030258179,
"learning_rate": 2.6005352184282384e-05,
"loss": 2.2013,
"step": 4025
},
{
"epoch": 1.6765470618824754,
"grad_norm": 0.8157804012298584,
"learning_rate": 2.5942153443839506e-05,
"loss": 2.1584,
"step": 4030
},
{
"epoch": 1.6786271450858035,
"grad_norm": 0.9395479559898376,
"learning_rate": 2.5878948673389254e-05,
"loss": 2.1244,
"step": 4035
},
{
"epoch": 1.6807072282891315,
"grad_norm": 0.7687943577766418,
"learning_rate": 2.5815738277457285e-05,
"loss": 2.1902,
"step": 4040
},
{
"epoch": 1.6827873114924596,
"grad_norm": 0.6989742517471313,
"learning_rate": 2.575252266060525e-05,
"loss": 2.208,
"step": 4045
},
{
"epoch": 1.6848673946957877,
"grad_norm": 1.0736007690429688,
"learning_rate": 2.5689302227428215e-05,
"loss": 2.1795,
"step": 4050
},
{
"epoch": 1.6869474778991158,
"grad_norm": 0.7165335416793823,
"learning_rate": 2.5626077382552072e-05,
"loss": 2.1777,
"step": 4055
},
{
"epoch": 1.6890275611024441,
"grad_norm": 0.6504753828048706,
"learning_rate": 2.5562848530630945e-05,
"loss": 2.1933,
"step": 4060
},
{
"epoch": 1.6911076443057722,
"grad_norm": 1.0563914775848389,
"learning_rate": 2.5499616076344607e-05,
"loss": 2.15,
"step": 4065
},
{
"epoch": 1.6931877275091005,
"grad_norm": 0.8301395177841187,
"learning_rate": 2.5436380424395895e-05,
"loss": 2.1229,
"step": 4070
},
{
"epoch": 1.6952678107124286,
"grad_norm": 0.697247326374054,
"learning_rate": 2.5373141979508102e-05,
"loss": 2.1382,
"step": 4075
},
{
"epoch": 1.6973478939157567,
"grad_norm": 0.6969221830368042,
"learning_rate": 2.5309901146422404e-05,
"loss": 2.1861,
"step": 4080
},
{
"epoch": 1.6994279771190848,
"grad_norm": 0.7223391532897949,
"learning_rate": 2.5246658329895252e-05,
"loss": 2.1625,
"step": 4085
},
{
"epoch": 1.7015080603224129,
"grad_norm": 0.7388386726379395,
"learning_rate": 2.5183413934695794e-05,
"loss": 2.1558,
"step": 4090
},
{
"epoch": 1.703588143525741,
"grad_norm": 0.8030023574829102,
"learning_rate": 2.5120168365603292e-05,
"loss": 2.1696,
"step": 4095
},
{
"epoch": 1.705668226729069,
"grad_norm": 0.7399486899375916,
"learning_rate": 2.50569220274045e-05,
"loss": 2.192,
"step": 4100
},
{
"epoch": 1.7077483099323973,
"grad_norm": 0.7578115463256836,
"learning_rate": 2.4993675324891135e-05,
"loss": 2.1734,
"step": 4105
},
{
"epoch": 1.7098283931357254,
"grad_norm": 0.8409146070480347,
"learning_rate": 2.493042866285719e-05,
"loss": 2.2099,
"step": 4110
},
{
"epoch": 1.7119084763390535,
"grad_norm": 0.756033718585968,
"learning_rate": 2.486718244609645e-05,
"loss": 2.1975,
"step": 4115
},
{
"epoch": 1.7139885595423818,
"grad_norm": 0.7186440229415894,
"learning_rate": 2.480393707939981e-05,
"loss": 2.1507,
"step": 4120
},
{
"epoch": 1.71606864274571,
"grad_norm": 0.7971068620681763,
"learning_rate": 2.4740692967552773e-05,
"loss": 2.1812,
"step": 4125
},
{
"epoch": 1.718148725949038,
"grad_norm": 0.7994965314865112,
"learning_rate": 2.467745051533274e-05,
"loss": 2.1485,
"step": 4130
},
{
"epoch": 1.720228809152366,
"grad_norm": 0.868439257144928,
"learning_rate": 2.4614210127506556e-05,
"loss": 2.1645,
"step": 4135
},
{
"epoch": 1.7223088923556942,
"grad_norm": 0.7531114220619202,
"learning_rate": 2.4550972208827817e-05,
"loss": 2.1822,
"step": 4140
},
{
"epoch": 1.7243889755590223,
"grad_norm": 0.7823972702026367,
"learning_rate": 2.4487737164034338e-05,
"loss": 2.1444,
"step": 4145
},
{
"epoch": 1.7264690587623504,
"grad_norm": 0.6993070840835571,
"learning_rate": 2.4424505397845517e-05,
"loss": 2.1499,
"step": 4150
},
{
"epoch": 1.7285491419656787,
"grad_norm": 0.9332249164581299,
"learning_rate": 2.4361277314959796e-05,
"loss": 2.169,
"step": 4155
},
{
"epoch": 1.7306292251690067,
"grad_norm": 0.8146462440490723,
"learning_rate": 2.4298053320052004e-05,
"loss": 2.1949,
"step": 4160
},
{
"epoch": 1.732709308372335,
"grad_norm": 0.7273747324943542,
"learning_rate": 2.4234833817770846e-05,
"loss": 2.1759,
"step": 4165
},
{
"epoch": 1.7347893915756631,
"grad_norm": 0.7499719858169556,
"learning_rate": 2.417161921273625e-05,
"loss": 2.2312,
"step": 4170
},
{
"epoch": 1.7368694747789912,
"grad_norm": 0.6487704515457153,
"learning_rate": 2.4108409909536805e-05,
"loss": 2.167,
"step": 4175
},
{
"epoch": 1.7389495579823193,
"grad_norm": 0.6822001338005066,
"learning_rate": 2.4045206312727184e-05,
"loss": 2.1986,
"step": 4180
},
{
"epoch": 1.7410296411856474,
"grad_norm": 0.8364559412002563,
"learning_rate": 2.3982008826825503e-05,
"loss": 2.2144,
"step": 4185
},
{
"epoch": 1.7431097243889755,
"grad_norm": 0.6755366921424866,
"learning_rate": 2.3918817856310786e-05,
"loss": 2.1672,
"step": 4190
},
{
"epoch": 1.7451898075923036,
"grad_norm": 0.6534518003463745,
"learning_rate": 2.3855633805620374e-05,
"loss": 2.1611,
"step": 4195
},
{
"epoch": 1.7472698907956317,
"grad_norm": 0.8979050517082214,
"learning_rate": 2.3792457079147286e-05,
"loss": 2.2152,
"step": 4200
},
{
"epoch": 1.74934997399896,
"grad_norm": 0.8764121532440186,
"learning_rate": 2.3729288081237687e-05,
"loss": 2.1696,
"step": 4205
},
{
"epoch": 1.751430057202288,
"grad_norm": 0.7482860088348389,
"learning_rate": 2.3666127216188284e-05,
"loss": 2.1707,
"step": 4210
},
{
"epoch": 1.7535101404056164,
"grad_norm": 0.6165511608123779,
"learning_rate": 2.36029748882437e-05,
"loss": 2.1893,
"step": 4215
},
{
"epoch": 1.7555902236089445,
"grad_norm": 0.7991209626197815,
"learning_rate": 2.3539831501593944e-05,
"loss": 2.1379,
"step": 4220
},
{
"epoch": 1.7576703068122725,
"grad_norm": 0.7029572129249573,
"learning_rate": 2.3476697460371785e-05,
"loss": 2.1528,
"step": 4225
},
{
"epoch": 1.7597503900156006,
"grad_norm": 0.7116166353225708,
"learning_rate": 2.3413573168650198e-05,
"loss": 2.1759,
"step": 4230
},
{
"epoch": 1.7618304732189287,
"grad_norm": 0.8789666295051575,
"learning_rate": 2.335045903043974e-05,
"loss": 2.1882,
"step": 4235
},
{
"epoch": 1.7639105564222568,
"grad_norm": 1.1269465684890747,
"learning_rate": 2.3287355449686004e-05,
"loss": 2.1826,
"step": 4240
},
{
"epoch": 1.765990639625585,
"grad_norm": 0.744796097278595,
"learning_rate": 2.322426283026697e-05,
"loss": 2.1916,
"step": 4245
},
{
"epoch": 1.7680707228289132,
"grad_norm": 0.7507933378219604,
"learning_rate": 2.3161181575990518e-05,
"loss": 2.1924,
"step": 4250
},
{
"epoch": 1.7701508060322413,
"grad_norm": 0.9487372636795044,
"learning_rate": 2.3098112090591744e-05,
"loss": 2.163,
"step": 4255
},
{
"epoch": 1.7722308892355694,
"grad_norm": 0.6878231763839722,
"learning_rate": 2.303505477773045e-05,
"loss": 2.1803,
"step": 4260
},
{
"epoch": 1.7743109724388977,
"grad_norm": 0.686570405960083,
"learning_rate": 2.2972010040988518e-05,
"loss": 2.1865,
"step": 4265
},
{
"epoch": 1.7763910556422258,
"grad_norm": 0.8939157724380493,
"learning_rate": 2.290897828386734e-05,
"loss": 2.1612,
"step": 4270
},
{
"epoch": 1.7784711388455539,
"grad_norm": 0.7810360193252563,
"learning_rate": 2.2845959909785226e-05,
"loss": 2.1517,
"step": 4275
},
{
"epoch": 1.780551222048882,
"grad_norm": 0.8274974822998047,
"learning_rate": 2.2782955322074855e-05,
"loss": 2.1616,
"step": 4280
},
{
"epoch": 1.78263130525221,
"grad_norm": 0.7559753060340881,
"learning_rate": 2.2719964923980653e-05,
"loss": 2.1183,
"step": 4285
},
{
"epoch": 1.7847113884555381,
"grad_norm": 0.8210632801055908,
"learning_rate": 2.2656989118656224e-05,
"loss": 2.1911,
"step": 4290
},
{
"epoch": 1.7867914716588662,
"grad_norm": 0.7994166016578674,
"learning_rate": 2.2594028309161802e-05,
"loss": 2.1927,
"step": 4295
},
{
"epoch": 1.7888715548621945,
"grad_norm": 0.805383026599884,
"learning_rate": 2.253108289846161e-05,
"loss": 2.186,
"step": 4300
},
{
"epoch": 1.7909516380655226,
"grad_norm": 0.872366189956665,
"learning_rate": 2.246815328942133e-05,
"loss": 2.1307,
"step": 4305
},
{
"epoch": 1.793031721268851,
"grad_norm": 0.7008072137832642,
"learning_rate": 2.240523988480551e-05,
"loss": 2.2187,
"step": 4310
},
{
"epoch": 1.795111804472179,
"grad_norm": 0.799379289150238,
"learning_rate": 2.2342343087275e-05,
"loss": 2.1588,
"step": 4315
},
{
"epoch": 1.797191887675507,
"grad_norm": 1.337504267692566,
"learning_rate": 2.227946329938433e-05,
"loss": 2.1518,
"step": 4320
},
{
"epoch": 1.7992719708788352,
"grad_norm": 0.7516017556190491,
"learning_rate": 2.2216600923579196e-05,
"loss": 2.149,
"step": 4325
},
{
"epoch": 1.8013520540821633,
"grad_norm": 0.6863098740577698,
"learning_rate": 2.2153756362193827e-05,
"loss": 2.1886,
"step": 4330
},
{
"epoch": 1.8034321372854913,
"grad_norm": 0.6174860596656799,
"learning_rate": 2.209093001744845e-05,
"loss": 2.181,
"step": 4335
},
{
"epoch": 1.8055122204888194,
"grad_norm": 0.7695885300636292,
"learning_rate": 2.2028122291446687e-05,
"loss": 2.1137,
"step": 4340
},
{
"epoch": 1.8075923036921477,
"grad_norm": 0.8168510794639587,
"learning_rate": 2.1965333586173022e-05,
"loss": 2.1785,
"step": 4345
},
{
"epoch": 1.8096723868954758,
"grad_norm": 0.7163233757019043,
"learning_rate": 2.1902564303490168e-05,
"loss": 2.1422,
"step": 4350
},
{
"epoch": 1.811752470098804,
"grad_norm": 0.8038668036460876,
"learning_rate": 2.183981484513657e-05,
"loss": 2.2291,
"step": 4355
},
{
"epoch": 1.8138325533021322,
"grad_norm": 0.8842986226081848,
"learning_rate": 2.177708561272374e-05,
"loss": 2.1491,
"step": 4360
},
{
"epoch": 1.8159126365054603,
"grad_norm": 0.7526798844337463,
"learning_rate": 2.1714377007733787e-05,
"loss": 2.1554,
"step": 4365
},
{
"epoch": 1.8179927197087884,
"grad_norm": 0.8559210896492004,
"learning_rate": 2.165168943151677e-05,
"loss": 2.1444,
"step": 4370
},
{
"epoch": 1.8200728029121165,
"grad_norm": 0.6576410531997681,
"learning_rate": 2.1589023285288177e-05,
"loss": 2.1322,
"step": 4375
},
{
"epoch": 1.8221528861154446,
"grad_norm": 0.9053552746772766,
"learning_rate": 2.152637897012633e-05,
"loss": 2.2095,
"step": 4380
},
{
"epoch": 1.8242329693187727,
"grad_norm": 0.7630800008773804,
"learning_rate": 2.1463756886969828e-05,
"loss": 2.1478,
"step": 4385
},
{
"epoch": 1.8263130525221007,
"grad_norm": 0.6651878952980042,
"learning_rate": 2.140115743661497e-05,
"loss": 2.1831,
"step": 4390
},
{
"epoch": 1.828393135725429,
"grad_norm": 0.7167788147926331,
"learning_rate": 2.1338581019713225e-05,
"loss": 2.1871,
"step": 4395
},
{
"epoch": 1.8304732189287571,
"grad_norm": 0.7219182252883911,
"learning_rate": 2.1276028036768617e-05,
"loss": 2.1765,
"step": 4400
},
{
"epoch": 1.8325533021320854,
"grad_norm": 0.911848783493042,
"learning_rate": 2.121349888813519e-05,
"loss": 2.1678,
"step": 4405
},
{
"epoch": 1.8346333853354135,
"grad_norm": 0.7422452569007874,
"learning_rate": 2.1150993974014477e-05,
"loss": 2.1797,
"step": 4410
},
{
"epoch": 1.8367134685387416,
"grad_norm": 0.7973139882087708,
"learning_rate": 2.1088513694452852e-05,
"loss": 2.1423,
"step": 4415
},
{
"epoch": 1.8387935517420697,
"grad_norm": 0.6837089657783508,
"learning_rate": 2.1026058449339053e-05,
"loss": 2.1601,
"step": 4420
},
{
"epoch": 1.8408736349453978,
"grad_norm": 0.7220534086227417,
"learning_rate": 2.0963628638401584e-05,
"loss": 2.1673,
"step": 4425
},
{
"epoch": 1.8429537181487259,
"grad_norm": 0.7278773188591003,
"learning_rate": 2.090122466120617e-05,
"loss": 2.1831,
"step": 4430
},
{
"epoch": 1.845033801352054,
"grad_norm": 0.7825272679328918,
"learning_rate": 2.0838846917153184e-05,
"loss": 2.1568,
"step": 4435
},
{
"epoch": 1.847113884555382,
"grad_norm": 0.9204423427581787,
"learning_rate": 2.0776495805475125e-05,
"loss": 2.187,
"step": 4440
},
{
"epoch": 1.8491939677587104,
"grad_norm": 0.7809643149375916,
"learning_rate": 2.0714171725233993e-05,
"loss": 2.1516,
"step": 4445
},
{
"epoch": 1.8512740509620385,
"grad_norm": 0.7957209944725037,
"learning_rate": 2.0651875075318823e-05,
"loss": 2.1706,
"step": 4450
},
{
"epoch": 1.8533541341653668,
"grad_norm": 0.751357913017273,
"learning_rate": 2.0589606254443066e-05,
"loss": 2.1797,
"step": 4455
},
{
"epoch": 1.8554342173686948,
"grad_norm": 0.7734564542770386,
"learning_rate": 2.0527365661142074e-05,
"loss": 2.1572,
"step": 4460
},
{
"epoch": 1.857514300572023,
"grad_norm": 0.8376662135124207,
"learning_rate": 2.046515369377052e-05,
"loss": 2.1688,
"step": 4465
},
{
"epoch": 1.859594383775351,
"grad_norm": 0.7141360640525818,
"learning_rate": 2.04029707504999e-05,
"loss": 2.1694,
"step": 4470
},
{
"epoch": 1.861674466978679,
"grad_norm": 0.8017005920410156,
"learning_rate": 2.0340817229315888e-05,
"loss": 2.1641,
"step": 4475
},
{
"epoch": 1.8637545501820072,
"grad_norm": 0.8669304847717285,
"learning_rate": 2.02786935280159e-05,
"loss": 2.1787,
"step": 4480
},
{
"epoch": 1.8658346333853353,
"grad_norm": 0.8036349415779114,
"learning_rate": 2.021660004420648e-05,
"loss": 2.198,
"step": 4485
},
{
"epoch": 1.8679147165886636,
"grad_norm": 0.7105252146720886,
"learning_rate": 2.015453717530078e-05,
"loss": 2.1836,
"step": 4490
},
{
"epoch": 1.8699947997919917,
"grad_norm": 1.4521474838256836,
"learning_rate": 2.0092505318515998e-05,
"loss": 2.1859,
"step": 4495
},
{
"epoch": 1.8720748829953198,
"grad_norm": 0.8009501099586487,
"learning_rate": 2.003050487087086e-05,
"loss": 2.1799,
"step": 4500
},
{
"epoch": 1.874154966198648,
"grad_norm": 0.7517442107200623,
"learning_rate": 1.9968536229183045e-05,
"loss": 2.1736,
"step": 4505
},
{
"epoch": 1.8762350494019762,
"grad_norm": 0.7923987507820129,
"learning_rate": 1.9906599790066696e-05,
"loss": 2.1947,
"step": 4510
},
{
"epoch": 1.8783151326053042,
"grad_norm": 0.8453426957130432,
"learning_rate": 1.9844695949929825e-05,
"loss": 2.1611,
"step": 4515
},
{
"epoch": 1.8803952158086323,
"grad_norm": 1.034262776374817,
"learning_rate": 1.9782825104971815e-05,
"loss": 2.1851,
"step": 4520
},
{
"epoch": 1.8824752990119604,
"grad_norm": 0.7277210354804993,
"learning_rate": 1.9720987651180886e-05,
"loss": 2.142,
"step": 4525
},
{
"epoch": 1.8845553822152885,
"grad_norm": 0.7940502166748047,
"learning_rate": 1.9659183984331513e-05,
"loss": 2.1891,
"step": 4530
},
{
"epoch": 1.8866354654186166,
"grad_norm": 0.7815389633178711,
"learning_rate": 1.959741449998195e-05,
"loss": 2.1806,
"step": 4535
},
{
"epoch": 1.888715548621945,
"grad_norm": 0.759210467338562,
"learning_rate": 1.9535679593471665e-05,
"loss": 2.1939,
"step": 4540
},
{
"epoch": 1.890795631825273,
"grad_norm": 0.6869644522666931,
"learning_rate": 1.9473979659918835e-05,
"loss": 2.1908,
"step": 4545
},
{
"epoch": 1.8928757150286013,
"grad_norm": 0.782366156578064,
"learning_rate": 1.941231509421778e-05,
"loss": 2.1411,
"step": 4550
},
{
"epoch": 1.8949557982319294,
"grad_norm": 0.7898021340370178,
"learning_rate": 1.935068629103649e-05,
"loss": 2.1908,
"step": 4555
},
{
"epoch": 1.8970358814352575,
"grad_norm": 0.6880964636802673,
"learning_rate": 1.9289093644814015e-05,
"loss": 2.1845,
"step": 4560
},
{
"epoch": 1.8991159646385856,
"grad_norm": 0.6600199937820435,
"learning_rate": 1.9227537549758037e-05,
"loss": 2.1671,
"step": 4565
},
{
"epoch": 1.9011960478419136,
"grad_norm": 0.7955625057220459,
"learning_rate": 1.9166018399842277e-05,
"loss": 2.1635,
"step": 4570
},
{
"epoch": 1.9032761310452417,
"grad_norm": 0.9278563261032104,
"learning_rate": 1.910453658880402e-05,
"loss": 2.2131,
"step": 4575
},
{
"epoch": 1.9053562142485698,
"grad_norm": 0.6783947348594666,
"learning_rate": 1.904309251014156e-05,
"loss": 2.1891,
"step": 4580
},
{
"epoch": 1.907436297451898,
"grad_norm": 0.7580657005310059,
"learning_rate": 1.8981686557111696e-05,
"loss": 2.1862,
"step": 4585
},
{
"epoch": 1.9095163806552262,
"grad_norm": 0.7643154263496399,
"learning_rate": 1.892031912272719e-05,
"loss": 2.1516,
"step": 4590
},
{
"epoch": 1.9115964638585543,
"grad_norm": 0.7217808365821838,
"learning_rate": 1.8858990599754326e-05,
"loss": 2.1513,
"step": 4595
},
{
"epoch": 1.9136765470618826,
"grad_norm": 0.8804484605789185,
"learning_rate": 1.87977013807103e-05,
"loss": 2.1691,
"step": 4600
},
{
"epoch": 1.9157566302652107,
"grad_norm": 0.9107383489608765,
"learning_rate": 1.8736451857860788e-05,
"loss": 2.1448,
"step": 4605
},
{
"epoch": 1.9178367134685388,
"grad_norm": 0.7312045097351074,
"learning_rate": 1.8675242423217375e-05,
"loss": 2.1895,
"step": 4610
},
{
"epoch": 1.9199167966718669,
"grad_norm": 0.71600341796875,
"learning_rate": 1.8614073468535094e-05,
"loss": 2.1567,
"step": 4615
},
{
"epoch": 1.921996879875195,
"grad_norm": 0.8909045457839966,
"learning_rate": 1.855294538530986e-05,
"loss": 2.178,
"step": 4620
},
{
"epoch": 1.924076963078523,
"grad_norm": 0.7625308632850647,
"learning_rate": 1.8491858564776043e-05,
"loss": 2.1696,
"step": 4625
},
{
"epoch": 1.9261570462818511,
"grad_norm": 0.7876774668693542,
"learning_rate": 1.84308133979039e-05,
"loss": 2.1827,
"step": 4630
},
{
"epoch": 1.9282371294851794,
"grad_norm": 0.768500804901123,
"learning_rate": 1.836981027539709e-05,
"loss": 2.1463,
"step": 4635
},
{
"epoch": 1.9303172126885075,
"grad_norm": 0.7600011229515076,
"learning_rate": 1.8308849587690213e-05,
"loss": 2.18,
"step": 4640
},
{
"epoch": 1.9323972958918356,
"grad_norm": 0.8618823885917664,
"learning_rate": 1.8247931724946223e-05,
"loss": 2.1846,
"step": 4645
},
{
"epoch": 1.934477379095164,
"grad_norm": 0.8330581784248352,
"learning_rate": 1.818705707705402e-05,
"loss": 2.1882,
"step": 4650
},
{
"epoch": 1.936557462298492,
"grad_norm": 0.7751420140266418,
"learning_rate": 1.81262260336259e-05,
"loss": 2.1909,
"step": 4655
},
{
"epoch": 1.93863754550182,
"grad_norm": 1.0599454641342163,
"learning_rate": 1.8065438983995107e-05,
"loss": 2.1608,
"step": 4660
},
{
"epoch": 1.9407176287051482,
"grad_norm": 0.7828746438026428,
"learning_rate": 1.8004696317213283e-05,
"loss": 2.1639,
"step": 4665
},
{
"epoch": 1.9427977119084763,
"grad_norm": 0.7516394257545471,
"learning_rate": 1.7943998422048038e-05,
"loss": 2.1625,
"step": 4670
},
{
"epoch": 1.9448777951118044,
"grad_norm": 0.6707669496536255,
"learning_rate": 1.7883345686980392e-05,
"loss": 2.1772,
"step": 4675
},
{
"epoch": 1.9469578783151325,
"grad_norm": 0.7721826434135437,
"learning_rate": 1.782273850020238e-05,
"loss": 2.1463,
"step": 4680
},
{
"epoch": 1.9490379615184608,
"grad_norm": 0.6671859622001648,
"learning_rate": 1.776217724961447e-05,
"loss": 2.1573,
"step": 4685
},
{
"epoch": 1.9511180447217888,
"grad_norm": 0.8338425159454346,
"learning_rate": 1.7701662322823172e-05,
"loss": 2.1525,
"step": 4690
},
{
"epoch": 1.9531981279251172,
"grad_norm": 0.7207419276237488,
"learning_rate": 1.7641194107138477e-05,
"loss": 2.1281,
"step": 4695
},
{
"epoch": 1.9552782111284452,
"grad_norm": 0.8658615350723267,
"learning_rate": 1.7580772989571434e-05,
"loss": 2.1775,
"step": 4700
},
{
"epoch": 1.9573582943317733,
"grad_norm": 1.124961495399475,
"learning_rate": 1.7520399356831636e-05,
"loss": 2.1508,
"step": 4705
},
{
"epoch": 1.9594383775351014,
"grad_norm": 0.7705496549606323,
"learning_rate": 1.7460073595324776e-05,
"loss": 2.1657,
"step": 4710
},
{
"epoch": 1.9615184607384295,
"grad_norm": 0.80336594581604,
"learning_rate": 1.7399796091150155e-05,
"loss": 2.1883,
"step": 4715
},
{
"epoch": 1.9635985439417576,
"grad_norm": 0.904358446598053,
"learning_rate": 1.733956723009822e-05,
"loss": 2.2069,
"step": 4720
},
{
"epoch": 1.9656786271450857,
"grad_norm": 0.8221344947814941,
"learning_rate": 1.7279387397648084e-05,
"loss": 2.1593,
"step": 4725
},
{
"epoch": 1.967758710348414,
"grad_norm": 0.6962605714797974,
"learning_rate": 1.721925697896507e-05,
"loss": 2.1805,
"step": 4730
},
{
"epoch": 1.969838793551742,
"grad_norm": 0.7350068092346191,
"learning_rate": 1.715917635889823e-05,
"loss": 2.1726,
"step": 4735
},
{
"epoch": 1.9719188767550702,
"grad_norm": 0.8693460822105408,
"learning_rate": 1.7099145921977904e-05,
"loss": 2.121,
"step": 4740
},
{
"epoch": 1.9739989599583985,
"grad_norm": 0.9144797325134277,
"learning_rate": 1.703916605241325e-05,
"loss": 2.168,
"step": 4745
},
{
"epoch": 1.9760790431617266,
"grad_norm": 0.7593632340431213,
"learning_rate": 1.697923713408977e-05,
"loss": 2.1659,
"step": 4750
},
{
"epoch": 1.9781591263650546,
"grad_norm": 0.7203252911567688,
"learning_rate": 1.6919359550566886e-05,
"loss": 2.1566,
"step": 4755
},
{
"epoch": 1.9802392095683827,
"grad_norm": 0.6899543404579163,
"learning_rate": 1.6859533685075447e-05,
"loss": 2.1703,
"step": 4760
},
{
"epoch": 1.9823192927717108,
"grad_norm": 0.7112452387809753,
"learning_rate": 1.6799759920515294e-05,
"loss": 2.1541,
"step": 4765
},
{
"epoch": 1.984399375975039,
"grad_norm": 0.7511719465255737,
"learning_rate": 1.6740038639452822e-05,
"loss": 2.1698,
"step": 4770
},
{
"epoch": 1.986479459178367,
"grad_norm": 0.9203821420669556,
"learning_rate": 1.668037022411851e-05,
"loss": 2.1095,
"step": 4775
},
{
"epoch": 1.9885595423816953,
"grad_norm": 0.7733403444290161,
"learning_rate": 1.6620755056404485e-05,
"loss": 2.191,
"step": 4780
},
{
"epoch": 1.9906396255850234,
"grad_norm": 1.0877577066421509,
"learning_rate": 1.6561193517862097e-05,
"loss": 2.167,
"step": 4785
},
{
"epoch": 1.9927197087883517,
"grad_norm": 0.7318416237831116,
"learning_rate": 1.6501685989699405e-05,
"loss": 2.1868,
"step": 4790
},
{
"epoch": 1.9947997919916798,
"grad_norm": 0.7766571640968323,
"learning_rate": 1.6442232852778843e-05,
"loss": 2.1417,
"step": 4795
},
{
"epoch": 1.9968798751950079,
"grad_norm": 0.8204246163368225,
"learning_rate": 1.6382834487614694e-05,
"loss": 2.0969,
"step": 4800
},
{
"epoch": 1.998959958398336,
"grad_norm": 0.7147572040557861,
"learning_rate": 1.632349127437072e-05,
"loss": 2.1594,
"step": 4805
},
{
"epoch": 2.001040041601664,
"grad_norm": 0.8010472655296326,
"learning_rate": 1.6264203592857656e-05,
"loss": 2.1423,
"step": 4810
},
{
"epoch": 2.003120124804992,
"grad_norm": 0.7142998576164246,
"learning_rate": 1.6204971822530858e-05,
"loss": 2.1746,
"step": 4815
},
{
"epoch": 2.00520020800832,
"grad_norm": 0.8411352634429932,
"learning_rate": 1.614579634248781e-05,
"loss": 2.1706,
"step": 4820
},
{
"epoch": 2.0072802912116483,
"grad_norm": 0.8868602514266968,
"learning_rate": 1.6086677531465747e-05,
"loss": 2.1658,
"step": 4825
},
{
"epoch": 2.0093603744149764,
"grad_norm": 0.6709080934524536,
"learning_rate": 1.6027615767839195e-05,
"loss": 2.1805,
"step": 4830
},
{
"epoch": 2.011440457618305,
"grad_norm": 0.8166576027870178,
"learning_rate": 1.596861142961756e-05,
"loss": 2.1314,
"step": 4835
},
{
"epoch": 2.013520540821633,
"grad_norm": 0.7754188179969788,
"learning_rate": 1.590966489444273e-05,
"loss": 2.154,
"step": 4840
},
{
"epoch": 2.015600624024961,
"grad_norm": 0.7570802569389343,
"learning_rate": 1.5850776539586627e-05,
"loss": 2.1362,
"step": 4845
},
{
"epoch": 2.017680707228289,
"grad_norm": 0.7763701677322388,
"learning_rate": 1.579194674194879e-05,
"loss": 2.141,
"step": 4850
},
{
"epoch": 2.0197607904316173,
"grad_norm": 0.8779370784759521,
"learning_rate": 1.573317587805401e-05,
"loss": 2.1578,
"step": 4855
},
{
"epoch": 2.0218408736349454,
"grad_norm": 0.6604492664337158,
"learning_rate": 1.5674464324049864e-05,
"loss": 2.1501,
"step": 4860
},
{
"epoch": 2.0239209568382734,
"grad_norm": 0.8826847672462463,
"learning_rate": 1.561581245570434e-05,
"loss": 2.2047,
"step": 4865
},
{
"epoch": 2.0260010400416015,
"grad_norm": 0.6858493685722351,
"learning_rate": 1.5557220648403432e-05,
"loss": 2.1161,
"step": 4870
},
{
"epoch": 2.0280811232449296,
"grad_norm": 0.8504419326782227,
"learning_rate": 1.5498689277148704e-05,
"loss": 2.1241,
"step": 4875
},
{
"epoch": 2.030161206448258,
"grad_norm": 0.857164740562439,
"learning_rate": 1.544021871655494e-05,
"loss": 2.141,
"step": 4880
},
{
"epoch": 2.0322412896515862,
"grad_norm": 0.8137455582618713,
"learning_rate": 1.53818093408477e-05,
"loss": 2.1258,
"step": 4885
},
{
"epoch": 2.0343213728549143,
"grad_norm": 0.7521177530288696,
"learning_rate": 1.5323461523860977e-05,
"loss": 2.1515,
"step": 4890
},
{
"epoch": 2.0364014560582424,
"grad_norm": 0.8833959698677063,
"learning_rate": 1.5265175639034736e-05,
"loss": 2.1538,
"step": 4895
},
{
"epoch": 2.0384815392615705,
"grad_norm": 0.8551697134971619,
"learning_rate": 1.5206952059412604e-05,
"loss": 2.1609,
"step": 4900
},
{
"epoch": 2.0405616224648986,
"grad_norm": 0.9351072311401367,
"learning_rate": 1.5148791157639386e-05,
"loss": 2.1616,
"step": 4905
},
{
"epoch": 2.0426417056682267,
"grad_norm": 0.7209817171096802,
"learning_rate": 1.5090693305958779e-05,
"loss": 2.1216,
"step": 4910
},
{
"epoch": 2.0447217888715548,
"grad_norm": 0.7822704911231995,
"learning_rate": 1.503265887621092e-05,
"loss": 2.1041,
"step": 4915
},
{
"epoch": 2.046801872074883,
"grad_norm": 0.7391334772109985,
"learning_rate": 1.497468823983005e-05,
"loss": 2.1683,
"step": 4920
},
{
"epoch": 2.048881955278211,
"grad_norm": 0.8245681524276733,
"learning_rate": 1.4916781767842103e-05,
"loss": 2.1609,
"step": 4925
},
{
"epoch": 2.0509620384815395,
"grad_norm": 0.7062039375305176,
"learning_rate": 1.4858939830862347e-05,
"loss": 2.1479,
"step": 4930
},
{
"epoch": 2.0530421216848675,
"grad_norm": 0.9019640684127808,
"learning_rate": 1.4801162799093004e-05,
"loss": 2.141,
"step": 4935
},
{
"epoch": 2.0551222048881956,
"grad_norm": 0.7071494460105896,
"learning_rate": 1.4743451042320905e-05,
"loss": 2.1558,
"step": 4940
},
{
"epoch": 2.0572022880915237,
"grad_norm": 0.7502083778381348,
"learning_rate": 1.4685804929915098e-05,
"loss": 2.1755,
"step": 4945
},
{
"epoch": 2.059282371294852,
"grad_norm": 0.8539254665374756,
"learning_rate": 1.4628224830824478e-05,
"loss": 2.1479,
"step": 4950
},
{
"epoch": 2.06136245449818,
"grad_norm": 0.8018076419830322,
"learning_rate": 1.4570711113575457e-05,
"loss": 2.1345,
"step": 4955
},
{
"epoch": 2.063442537701508,
"grad_norm": 0.7207697629928589,
"learning_rate": 1.451326414626959e-05,
"loss": 2.1527,
"step": 4960
},
{
"epoch": 2.065522620904836,
"grad_norm": 0.8118232488632202,
"learning_rate": 1.4455884296581185e-05,
"loss": 2.1186,
"step": 4965
},
{
"epoch": 2.067602704108164,
"grad_norm": 0.8361445665359497,
"learning_rate": 1.4398571931755023e-05,
"loss": 2.148,
"step": 4970
},
{
"epoch": 2.0696827873114922,
"grad_norm": 0.9550055861473083,
"learning_rate": 1.4341327418603931e-05,
"loss": 2.1479,
"step": 4975
},
{
"epoch": 2.0717628705148208,
"grad_norm": 0.8788464069366455,
"learning_rate": 1.428415112350649e-05,
"loss": 2.1226,
"step": 4980
},
{
"epoch": 2.073842953718149,
"grad_norm": 0.7504823803901672,
"learning_rate": 1.4227043412404669e-05,
"loss": 2.1977,
"step": 4985
},
{
"epoch": 2.075923036921477,
"grad_norm": 0.9366262555122375,
"learning_rate": 1.4170004650801472e-05,
"loss": 2.147,
"step": 4990
},
{
"epoch": 2.078003120124805,
"grad_norm": 0.9149728417396545,
"learning_rate": 1.4113035203758606e-05,
"loss": 2.1187,
"step": 4995
},
{
"epoch": 2.080083203328133,
"grad_norm": 0.7827330827713013,
"learning_rate": 1.4056135435894163e-05,
"loss": 2.1989,
"step": 5000
},
{
"epoch": 2.082163286531461,
"grad_norm": 0.7674335837364197,
"learning_rate": 1.3999305711380267e-05,
"loss": 2.1156,
"step": 5005
},
{
"epoch": 2.0842433697347893,
"grad_norm": 0.7534267902374268,
"learning_rate": 1.3942546393940758e-05,
"loss": 2.1608,
"step": 5010
},
{
"epoch": 2.0863234529381174,
"grad_norm": 1.0169003009796143,
"learning_rate": 1.3885857846848829e-05,
"loss": 2.1333,
"step": 5015
},
{
"epoch": 2.0884035361414455,
"grad_norm": 0.7536617517471313,
"learning_rate": 1.3829240432924734e-05,
"loss": 2.2094,
"step": 5020
},
{
"epoch": 2.090483619344774,
"grad_norm": 0.8216843605041504,
"learning_rate": 1.3772694514533464e-05,
"loss": 2.1178,
"step": 5025
},
{
"epoch": 2.092563702548102,
"grad_norm": 0.8296759724617004,
"learning_rate": 1.371622045358244e-05,
"loss": 2.1405,
"step": 5030
},
{
"epoch": 2.09464378575143,
"grad_norm": 0.7530525326728821,
"learning_rate": 1.3659818611519131e-05,
"loss": 2.1602,
"step": 5035
},
{
"epoch": 2.0967238689547583,
"grad_norm": 0.7887389659881592,
"learning_rate": 1.360348934932883e-05,
"loss": 2.1249,
"step": 5040
},
{
"epoch": 2.0988039521580864,
"grad_norm": 0.8775638341903687,
"learning_rate": 1.3547233027532291e-05,
"loss": 2.1087,
"step": 5045
},
{
"epoch": 2.1008840353614144,
"grad_norm": 0.7218084335327148,
"learning_rate": 1.3491050006183425e-05,
"loss": 2.1855,
"step": 5050
},
{
"epoch": 2.1029641185647425,
"grad_norm": 0.7829450368881226,
"learning_rate": 1.3434940644866994e-05,
"loss": 2.1544,
"step": 5055
},
{
"epoch": 2.1050442017680706,
"grad_norm": 0.9921479225158691,
"learning_rate": 1.3378905302696338e-05,
"loss": 2.1401,
"step": 5060
},
{
"epoch": 2.1071242849713987,
"grad_norm": 0.7223407030105591,
"learning_rate": 1.3322944338311056e-05,
"loss": 2.1623,
"step": 5065
},
{
"epoch": 2.109204368174727,
"grad_norm": 0.786841869354248,
"learning_rate": 1.3267058109874683e-05,
"loss": 2.127,
"step": 5070
},
{
"epoch": 2.1112844513780553,
"grad_norm": 0.8238504528999329,
"learning_rate": 1.3211246975072473e-05,
"loss": 2.1636,
"step": 5075
},
{
"epoch": 2.1133645345813834,
"grad_norm": 0.8730514645576477,
"learning_rate": 1.3155511291109013e-05,
"loss": 2.1809,
"step": 5080
},
{
"epoch": 2.1154446177847115,
"grad_norm": 0.7120848894119263,
"learning_rate": 1.3099851414706027e-05,
"loss": 2.1496,
"step": 5085
},
{
"epoch": 2.1175247009880396,
"grad_norm": 0.8902744650840759,
"learning_rate": 1.304426770210002e-05,
"loss": 2.1899,
"step": 5090
},
{
"epoch": 2.1196047841913677,
"grad_norm": 0.7903064489364624,
"learning_rate": 1.2988760509040058e-05,
"loss": 2.1446,
"step": 5095
},
{
"epoch": 2.1216848673946958,
"grad_norm": 0.8039788603782654,
"learning_rate": 1.2933330190785444e-05,
"loss": 2.1314,
"step": 5100
},
{
"epoch": 2.123764950598024,
"grad_norm": 0.8553371429443359,
"learning_rate": 1.28779771021035e-05,
"loss": 2.1604,
"step": 5105
},
{
"epoch": 2.125845033801352,
"grad_norm": 0.7643943428993225,
"learning_rate": 1.2822701597267185e-05,
"loss": 2.1628,
"step": 5110
},
{
"epoch": 2.12792511700468,
"grad_norm": 0.8471932411193848,
"learning_rate": 1.2767504030052973e-05,
"loss": 2.1831,
"step": 5115
},
{
"epoch": 2.130005200208008,
"grad_norm": 0.9007987380027771,
"learning_rate": 1.2712384753738499e-05,
"loss": 2.1761,
"step": 5120
},
{
"epoch": 2.1320852834113366,
"grad_norm": 0.7271192073822021,
"learning_rate": 1.2657344121100314e-05,
"loss": 2.173,
"step": 5125
},
{
"epoch": 2.1341653666146647,
"grad_norm": 0.761782705783844,
"learning_rate": 1.260238248441163e-05,
"loss": 2.1227,
"step": 5130
},
{
"epoch": 2.136245449817993,
"grad_norm": 0.8150344491004944,
"learning_rate": 1.2547500195440049e-05,
"loss": 2.1529,
"step": 5135
},
{
"epoch": 2.138325533021321,
"grad_norm": 0.7628713250160217,
"learning_rate": 1.2492697605445361e-05,
"loss": 2.1491,
"step": 5140
},
{
"epoch": 2.140405616224649,
"grad_norm": 0.7385218739509583,
"learning_rate": 1.2437975065177258e-05,
"loss": 2.1381,
"step": 5145
},
{
"epoch": 2.142485699427977,
"grad_norm": 0.8682646155357361,
"learning_rate": 1.2383332924873062e-05,
"loss": 2.151,
"step": 5150
},
{
"epoch": 2.144565782631305,
"grad_norm": 0.9769818782806396,
"learning_rate": 1.232877153425555e-05,
"loss": 2.2263,
"step": 5155
},
{
"epoch": 2.1466458658346332,
"grad_norm": 0.6793572902679443,
"learning_rate": 1.2274291242530685e-05,
"loss": 2.1157,
"step": 5160
},
{
"epoch": 2.1487259490379613,
"grad_norm": 0.7732495069503784,
"learning_rate": 1.2219892398385351e-05,
"loss": 2.1697,
"step": 5165
},
{
"epoch": 2.15080603224129,
"grad_norm": 0.8827196359634399,
"learning_rate": 1.2165575349985151e-05,
"loss": 2.1536,
"step": 5170
},
{
"epoch": 2.152886115444618,
"grad_norm": 0.7778597474098206,
"learning_rate": 1.2111340444972194e-05,
"loss": 2.1682,
"step": 5175
},
{
"epoch": 2.154966198647946,
"grad_norm": 0.7099940776824951,
"learning_rate": 1.2057188030462851e-05,
"loss": 2.1749,
"step": 5180
},
{
"epoch": 2.157046281851274,
"grad_norm": 0.864341139793396,
"learning_rate": 1.2003118453045512e-05,
"loss": 2.1742,
"step": 5185
},
{
"epoch": 2.159126365054602,
"grad_norm": 0.7807241082191467,
"learning_rate": 1.194913205877842e-05,
"loss": 2.1381,
"step": 5190
},
{
"epoch": 2.1612064482579303,
"grad_norm": 0.8154935240745544,
"learning_rate": 1.1895229193187387e-05,
"loss": 2.1717,
"step": 5195
},
{
"epoch": 2.1632865314612584,
"grad_norm": 0.745764970779419,
"learning_rate": 1.184141020126367e-05,
"loss": 2.1242,
"step": 5200
},
{
"epoch": 2.1653666146645865,
"grad_norm": 0.8805513978004456,
"learning_rate": 1.1787675427461664e-05,
"loss": 2.1491,
"step": 5205
},
{
"epoch": 2.1674466978679146,
"grad_norm": 0.7791693806648254,
"learning_rate": 1.1734025215696784e-05,
"loss": 2.1188,
"step": 5210
},
{
"epoch": 2.1695267810712426,
"grad_norm": 0.902431309223175,
"learning_rate": 1.1680459909343219e-05,
"loss": 2.1735,
"step": 5215
},
{
"epoch": 2.171606864274571,
"grad_norm": 0.728276789188385,
"learning_rate": 1.1626979851231756e-05,
"loss": 2.1625,
"step": 5220
},
{
"epoch": 2.1736869474778993,
"grad_norm": 0.9273778796195984,
"learning_rate": 1.157358538364752e-05,
"loss": 2.126,
"step": 5225
},
{
"epoch": 2.1757670306812273,
"grad_norm": 0.7193277478218079,
"learning_rate": 1.1520276848327893e-05,
"loss": 2.1437,
"step": 5230
},
{
"epoch": 2.1778471138845554,
"grad_norm": 0.8120283484458923,
"learning_rate": 1.1467054586460249e-05,
"loss": 2.1432,
"step": 5235
},
{
"epoch": 2.1799271970878835,
"grad_norm": 0.8220019340515137,
"learning_rate": 1.1413918938679805e-05,
"loss": 2.1332,
"step": 5240
},
{
"epoch": 2.1820072802912116,
"grad_norm": 0.8082548379898071,
"learning_rate": 1.13608702450674e-05,
"loss": 2.1843,
"step": 5245
},
{
"epoch": 2.1840873634945397,
"grad_norm": 0.8674708604812622,
"learning_rate": 1.1307908845147358e-05,
"loss": 2.1804,
"step": 5250
},
{
"epoch": 2.1861674466978678,
"grad_norm": 0.7203066349029541,
"learning_rate": 1.1255035077885307e-05,
"loss": 2.1674,
"step": 5255
},
{
"epoch": 2.188247529901196,
"grad_norm": 0.9036044478416443,
"learning_rate": 1.1202249281686018e-05,
"loss": 2.1792,
"step": 5260
},
{
"epoch": 2.1903276131045244,
"grad_norm": 0.9539422988891602,
"learning_rate": 1.1149551794391186e-05,
"loss": 2.1372,
"step": 5265
},
{
"epoch": 2.1924076963078525,
"grad_norm": 0.8710148930549622,
"learning_rate": 1.1096942953277347e-05,
"loss": 2.1819,
"step": 5270
},
{
"epoch": 2.1944877795111806,
"grad_norm": 0.6914278864860535,
"learning_rate": 1.1044423095053677e-05,
"loss": 2.1202,
"step": 5275
},
{
"epoch": 2.1965678627145087,
"grad_norm": 0.6613463163375854,
"learning_rate": 1.0991992555859814e-05,
"loss": 2.1623,
"step": 5280
},
{
"epoch": 2.1986479459178367,
"grad_norm": 0.6368273496627808,
"learning_rate": 1.0939651671263745e-05,
"loss": 2.1809,
"step": 5285
},
{
"epoch": 2.200728029121165,
"grad_norm": 0.7360798716545105,
"learning_rate": 1.0887400776259655e-05,
"loss": 2.1576,
"step": 5290
},
{
"epoch": 2.202808112324493,
"grad_norm": 0.8459822535514832,
"learning_rate": 1.0835240205265775e-05,
"loss": 2.1608,
"step": 5295
},
{
"epoch": 2.204888195527821,
"grad_norm": 0.686772882938385,
"learning_rate": 1.0783170292122222e-05,
"loss": 2.1849,
"step": 5300
},
{
"epoch": 2.206968278731149,
"grad_norm": 0.797569751739502,
"learning_rate": 1.0731191370088905e-05,
"loss": 2.1912,
"step": 5305
},
{
"epoch": 2.209048361934477,
"grad_norm": 0.7473933100700378,
"learning_rate": 1.0679303771843343e-05,
"loss": 2.1224,
"step": 5310
},
{
"epoch": 2.2111284451378057,
"grad_norm": 0.7206450700759888,
"learning_rate": 1.0627507829478595e-05,
"loss": 2.1904,
"step": 5315
},
{
"epoch": 2.213208528341134,
"grad_norm": 0.774996817111969,
"learning_rate": 1.0575803874501053e-05,
"loss": 2.157,
"step": 5320
},
{
"epoch": 2.215288611544462,
"grad_norm": 0.880527138710022,
"learning_rate": 1.0524192237828406e-05,
"loss": 2.1832,
"step": 5325
},
{
"epoch": 2.21736869474779,
"grad_norm": 0.8387317061424255,
"learning_rate": 1.0472673249787477e-05,
"loss": 2.1494,
"step": 5330
},
{
"epoch": 2.219448777951118,
"grad_norm": 0.7557438611984253,
"learning_rate": 1.0421247240112126e-05,
"loss": 2.1711,
"step": 5335
},
{
"epoch": 2.221528861154446,
"grad_norm": 0.8698292970657349,
"learning_rate": 1.0369914537941076e-05,
"loss": 2.1497,
"step": 5340
},
{
"epoch": 2.2236089443577742,
"grad_norm": 0.7102160453796387,
"learning_rate": 1.031867547181592e-05,
"loss": 2.1101,
"step": 5345
},
{
"epoch": 2.2256890275611023,
"grad_norm": 0.852909505367279,
"learning_rate": 1.0267530369678929e-05,
"loss": 2.1418,
"step": 5350
},
{
"epoch": 2.2277691107644304,
"grad_norm": 0.871283233165741,
"learning_rate": 1.0216479558871004e-05,
"loss": 2.196,
"step": 5355
},
{
"epoch": 2.229849193967759,
"grad_norm": 0.7697460055351257,
"learning_rate": 1.0165523366129531e-05,
"loss": 2.1592,
"step": 5360
},
{
"epoch": 2.231929277171087,
"grad_norm": 0.8391153216362,
"learning_rate": 1.0114662117586321e-05,
"loss": 2.1135,
"step": 5365
},
{
"epoch": 2.234009360374415,
"grad_norm": 0.739166259765625,
"learning_rate": 1.0063896138765541e-05,
"loss": 2.161,
"step": 5370
},
{
"epoch": 2.236089443577743,
"grad_norm": 0.9063655138015747,
"learning_rate": 1.0013225754581601e-05,
"loss": 2.155,
"step": 5375
},
{
"epoch": 2.2381695267810713,
"grad_norm": 0.9477112293243408,
"learning_rate": 9.962651289337063e-06,
"loss": 2.1512,
"step": 5380
},
{
"epoch": 2.2402496099843994,
"grad_norm": 0.7602328062057495,
"learning_rate": 9.91217306672061e-06,
"loss": 2.1378,
"step": 5385
},
{
"epoch": 2.2423296931877275,
"grad_norm": 0.9798824191093445,
"learning_rate": 9.861791409804946e-06,
"loss": 2.1466,
"step": 5390
},
{
"epoch": 2.2444097763910555,
"grad_norm": 0.9144386053085327,
"learning_rate": 9.811506641044715e-06,
"loss": 2.1672,
"step": 5395
},
{
"epoch": 2.2464898595943836,
"grad_norm": 0.7295798659324646,
"learning_rate": 9.761319082274456e-06,
"loss": 2.1446,
"step": 5400
},
{
"epoch": 2.2485699427977117,
"grad_norm": 1.134780764579773,
"learning_rate": 9.711229054706558e-06,
"loss": 2.1593,
"step": 5405
},
{
"epoch": 2.25065002600104,
"grad_norm": 0.7548600435256958,
"learning_rate": 9.661236878929184e-06,
"loss": 2.1742,
"step": 5410
},
{
"epoch": 2.2527301092043683,
"grad_norm": 0.8188340663909912,
"learning_rate": 9.611342874904194e-06,
"loss": 2.1521,
"step": 5415
},
{
"epoch": 2.2548101924076964,
"grad_norm": 0.7602256536483765,
"learning_rate": 9.561547361965173e-06,
"loss": 2.1661,
"step": 5420
},
{
"epoch": 2.2568902756110245,
"grad_norm": 0.7601512670516968,
"learning_rate": 9.511850658815285e-06,
"loss": 2.1678,
"step": 5425
},
{
"epoch": 2.2589703588143526,
"grad_norm": 0.7207624912261963,
"learning_rate": 9.46225308352534e-06,
"loss": 2.1191,
"step": 5430
},
{
"epoch": 2.2610504420176807,
"grad_norm": 0.6282790899276733,
"learning_rate": 9.412754953531663e-06,
"loss": 2.1543,
"step": 5435
},
{
"epoch": 2.2631305252210088,
"grad_norm": 0.7994824647903442,
"learning_rate": 9.363356585634133e-06,
"loss": 2.1642,
"step": 5440
},
{
"epoch": 2.265210608424337,
"grad_norm": 0.7656511068344116,
"learning_rate": 9.314058295994116e-06,
"loss": 2.1403,
"step": 5445
},
{
"epoch": 2.267290691627665,
"grad_norm": 0.8570970296859741,
"learning_rate": 9.264860400132475e-06,
"loss": 2.1625,
"step": 5450
},
{
"epoch": 2.2693707748309935,
"grad_norm": 0.788067102432251,
"learning_rate": 9.215763212927476e-06,
"loss": 2.1396,
"step": 5455
},
{
"epoch": 2.2714508580343216,
"grad_norm": 0.9106987714767456,
"learning_rate": 9.166767048612872e-06,
"loss": 2.1419,
"step": 5460
},
{
"epoch": 2.2735309412376496,
"grad_norm": 0.7237809300422668,
"learning_rate": 9.117872220775839e-06,
"loss": 2.1413,
"step": 5465
},
{
"epoch": 2.2756110244409777,
"grad_norm": 0.8435239791870117,
"learning_rate": 9.069079042354975e-06,
"loss": 2.1533,
"step": 5470
},
{
"epoch": 2.277691107644306,
"grad_norm": 1.157760500907898,
"learning_rate": 9.02038782563828e-06,
"loss": 2.1467,
"step": 5475
},
{
"epoch": 2.279771190847634,
"grad_norm": 0.8838199973106384,
"learning_rate": 8.971798882261182e-06,
"loss": 2.1843,
"step": 5480
},
{
"epoch": 2.281851274050962,
"grad_norm": 0.7448551058769226,
"learning_rate": 8.923312523204541e-06,
"loss": 2.1238,
"step": 5485
},
{
"epoch": 2.28393135725429,
"grad_norm": 0.8447023630142212,
"learning_rate": 8.874929058792667e-06,
"loss": 2.1628,
"step": 5490
},
{
"epoch": 2.286011440457618,
"grad_norm": 0.7749513983726501,
"learning_rate": 8.826648798691284e-06,
"loss": 2.1367,
"step": 5495
},
{
"epoch": 2.2880915236609463,
"grad_norm": 0.7845672965049744,
"learning_rate": 8.778472051905609e-06,
"loss": 2.1882,
"step": 5500
},
{
"epoch": 2.2901716068642743,
"grad_norm": 0.7977719306945801,
"learning_rate": 8.730399126778355e-06,
"loss": 2.1734,
"step": 5505
},
{
"epoch": 2.292251690067603,
"grad_norm": 0.8288791179656982,
"learning_rate": 8.682430330987732e-06,
"loss": 2.1488,
"step": 5510
},
{
"epoch": 2.294331773270931,
"grad_norm": 0.7984702587127686,
"learning_rate": 8.63456597154549e-06,
"loss": 2.1762,
"step": 5515
},
{
"epoch": 2.296411856474259,
"grad_norm": 0.88454270362854,
"learning_rate": 8.586806354794997e-06,
"loss": 2.1475,
"step": 5520
},
{
"epoch": 2.298491939677587,
"grad_norm": 0.8620187044143677,
"learning_rate": 8.539151786409223e-06,
"loss": 2.1913,
"step": 5525
},
{
"epoch": 2.3005720228809152,
"grad_norm": 1.0124173164367676,
"learning_rate": 8.491602571388784e-06,
"loss": 2.1124,
"step": 5530
},
{
"epoch": 2.3026521060842433,
"grad_norm": 0.8168820738792419,
"learning_rate": 8.444159014060051e-06,
"loss": 2.1298,
"step": 5535
},
{
"epoch": 2.3047321892875714,
"grad_norm": 0.8259533643722534,
"learning_rate": 8.396821418073118e-06,
"loss": 2.1422,
"step": 5540
},
{
"epoch": 2.3068122724908995,
"grad_norm": 0.8243906497955322,
"learning_rate": 8.349590086399934e-06,
"loss": 2.1251,
"step": 5545
},
{
"epoch": 2.308892355694228,
"grad_norm": 0.8051894903182983,
"learning_rate": 8.302465321332306e-06,
"loss": 2.1547,
"step": 5550
},
{
"epoch": 2.310972438897556,
"grad_norm": 0.8569920063018799,
"learning_rate": 8.255447424480007e-06,
"loss": 2.107,
"step": 5555
},
{
"epoch": 2.313052522100884,
"grad_norm": 0.9085841178894043,
"learning_rate": 8.208536696768823e-06,
"loss": 2.1564,
"step": 5560
},
{
"epoch": 2.3151326053042123,
"grad_norm": 0.7567054033279419,
"learning_rate": 8.161733438438643e-06,
"loss": 2.1805,
"step": 5565
},
{
"epoch": 2.3172126885075404,
"grad_norm": 1.1547774076461792,
"learning_rate": 8.115037949041488e-06,
"loss": 2.175,
"step": 5570
},
{
"epoch": 2.3192927717108685,
"grad_norm": 0.7055802941322327,
"learning_rate": 8.068450527439667e-06,
"loss": 2.1464,
"step": 5575
},
{
"epoch": 2.3213728549141965,
"grad_norm": 0.8332855105400085,
"learning_rate": 8.02197147180382e-06,
"loss": 2.138,
"step": 5580
},
{
"epoch": 2.3234529381175246,
"grad_norm": 0.8682128190994263,
"learning_rate": 7.975601079611036e-06,
"loss": 2.1292,
"step": 5585
},
{
"epoch": 2.3255330213208527,
"grad_norm": 0.8856552839279175,
"learning_rate": 7.929339647642898e-06,
"loss": 2.2076,
"step": 5590
},
{
"epoch": 2.327613104524181,
"grad_norm": 0.9237544536590576,
"learning_rate": 7.88318747198363e-06,
"loss": 2.1674,
"step": 5595
},
{
"epoch": 2.329693187727509,
"grad_norm": 0.8059023022651672,
"learning_rate": 7.837144848018203e-06,
"loss": 2.1626,
"step": 5600
},
{
"epoch": 2.3317732709308374,
"grad_norm": 0.792470395565033,
"learning_rate": 7.791212070430426e-06,
"loss": 2.12,
"step": 5605
},
{
"epoch": 2.3338533541341655,
"grad_norm": 0.8884658813476562,
"learning_rate": 7.745389433201047e-06,
"loss": 2.1297,
"step": 5610
},
{
"epoch": 2.3359334373374936,
"grad_norm": 0.8813104033470154,
"learning_rate": 7.699677229605914e-06,
"loss": 2.1471,
"step": 5615
},
{
"epoch": 2.3380135205408217,
"grad_norm": 0.746537983417511,
"learning_rate": 7.654075752214065e-06,
"loss": 2.1614,
"step": 5620
},
{
"epoch": 2.3400936037441498,
"grad_norm": 1.192823886871338,
"learning_rate": 7.608585292885862e-06,
"loss": 2.1381,
"step": 5625
},
{
"epoch": 2.342173686947478,
"grad_norm": 0.6864091157913208,
"learning_rate": 7.563206142771106e-06,
"loss": 2.1509,
"step": 5630
},
{
"epoch": 2.344253770150806,
"grad_norm": 0.7745160460472107,
"learning_rate": 7.517938592307225e-06,
"loss": 2.1405,
"step": 5635
},
{
"epoch": 2.346333853354134,
"grad_norm": 0.6769249439239502,
"learning_rate": 7.472782931217373e-06,
"loss": 2.1317,
"step": 5640
},
{
"epoch": 2.348413936557462,
"grad_norm": 0.8352982997894287,
"learning_rate": 7.427739448508566e-06,
"loss": 2.1467,
"step": 5645
},
{
"epoch": 2.3504940197607906,
"grad_norm": 0.9535309672355652,
"learning_rate": 7.382808432469885e-06,
"loss": 2.162,
"step": 5650
},
{
"epoch": 2.3525741029641187,
"grad_norm": 0.8944627046585083,
"learning_rate": 7.337990170670556e-06,
"loss": 2.1402,
"step": 5655
},
{
"epoch": 2.354654186167447,
"grad_norm": 0.7682763338088989,
"learning_rate": 7.293284949958193e-06,
"loss": 2.1346,
"step": 5660
},
{
"epoch": 2.356734269370775,
"grad_norm": 0.6998905539512634,
"learning_rate": 7.248693056456882e-06,
"loss": 2.1127,
"step": 5665
},
{
"epoch": 2.358814352574103,
"grad_norm": 0.8731864094734192,
"learning_rate": 7.2042147755654185e-06,
"loss": 2.1329,
"step": 5670
},
{
"epoch": 2.360894435777431,
"grad_norm": 0.6891990900039673,
"learning_rate": 7.159850391955441e-06,
"loss": 2.1495,
"step": 5675
},
{
"epoch": 2.362974518980759,
"grad_norm": 0.7515982389450073,
"learning_rate": 7.11560018956961e-06,
"loss": 2.1439,
"step": 5680
},
{
"epoch": 2.3650546021840873,
"grad_norm": 0.7395572662353516,
"learning_rate": 7.071464451619794e-06,
"loss": 2.1448,
"step": 5685
},
{
"epoch": 2.3671346853874153,
"grad_norm": 0.7845259308815002,
"learning_rate": 7.027443460585278e-06,
"loss": 2.1667,
"step": 5690
},
{
"epoch": 2.3692147685907434,
"grad_norm": 0.8673036098480225,
"learning_rate": 6.983537498210938e-06,
"loss": 2.1466,
"step": 5695
},
{
"epoch": 2.371294851794072,
"grad_norm": 0.8610036969184875,
"learning_rate": 6.939746845505435e-06,
"loss": 2.1419,
"step": 5700
},
{
"epoch": 2.3733749349974,
"grad_norm": 0.7803443074226379,
"learning_rate": 6.896071782739416e-06,
"loss": 2.1728,
"step": 5705
},
{
"epoch": 2.375455018200728,
"grad_norm": 0.8815957903862,
"learning_rate": 6.852512589443719e-06,
"loss": 2.1333,
"step": 5710
},
{
"epoch": 2.377535101404056,
"grad_norm": 0.7982692718505859,
"learning_rate": 6.8090695444076035e-06,
"loss": 2.1605,
"step": 5715
},
{
"epoch": 2.3796151846073843,
"grad_norm": 0.889788806438446,
"learning_rate": 6.76574292567696e-06,
"loss": 2.161,
"step": 5720
},
{
"epoch": 2.3816952678107124,
"grad_norm": 0.7610637545585632,
"learning_rate": 6.722533010552492e-06,
"loss": 2.153,
"step": 5725
},
{
"epoch": 2.3837753510140405,
"grad_norm": 0.876675009727478,
"learning_rate": 6.679440075588001e-06,
"loss": 2.1666,
"step": 5730
},
{
"epoch": 2.3858554342173686,
"grad_norm": 0.8992761969566345,
"learning_rate": 6.636464396588582e-06,
"loss": 2.1821,
"step": 5735
},
{
"epoch": 2.3879355174206967,
"grad_norm": 0.779009997844696,
"learning_rate": 6.5936062486088495e-06,
"loss": 2.1368,
"step": 5740
},
{
"epoch": 2.390015600624025,
"grad_norm": 0.7470064163208008,
"learning_rate": 6.550865905951198e-06,
"loss": 2.1409,
"step": 5745
},
{
"epoch": 2.3920956838273533,
"grad_norm": 0.7569335699081421,
"learning_rate": 6.508243642164044e-06,
"loss": 2.1626,
"step": 5750
},
{
"epoch": 2.3941757670306814,
"grad_norm": 0.8971595764160156,
"learning_rate": 6.465739730040082e-06,
"loss": 2.2096,
"step": 5755
},
{
"epoch": 2.3962558502340094,
"grad_norm": 0.7085235118865967,
"learning_rate": 6.423354441614496e-06,
"loss": 2.1093,
"step": 5760
},
{
"epoch": 2.3983359334373375,
"grad_norm": 0.8055812120437622,
"learning_rate": 6.381088048163286e-06,
"loss": 2.1378,
"step": 5765
},
{
"epoch": 2.4004160166406656,
"grad_norm": 0.743766725063324,
"learning_rate": 6.338940820201464e-06,
"loss": 2.1661,
"step": 5770
},
{
"epoch": 2.4024960998439937,
"grad_norm": 0.7603965997695923,
"learning_rate": 6.2969130274813796e-06,
"loss": 2.1367,
"step": 5775
},
{
"epoch": 2.404576183047322,
"grad_norm": 0.7308275103569031,
"learning_rate": 6.255004938990949e-06,
"loss": 2.1867,
"step": 5780
},
{
"epoch": 2.40665626625065,
"grad_norm": 0.690366268157959,
"learning_rate": 6.2132168229519646e-06,
"loss": 2.1444,
"step": 5785
},
{
"epoch": 2.408736349453978,
"grad_norm": 0.8872700929641724,
"learning_rate": 6.17154894681837e-06,
"loss": 2.1508,
"step": 5790
},
{
"epoch": 2.410816432657306,
"grad_norm": 0.7977089285850525,
"learning_rate": 6.1300015772745326e-06,
"loss": 2.1729,
"step": 5795
},
{
"epoch": 2.4128965158606346,
"grad_norm": 0.8401610851287842,
"learning_rate": 6.088574980233546e-06,
"loss": 2.1363,
"step": 5800
},
{
"epoch": 2.4149765990639627,
"grad_norm": 0.8002891540527344,
"learning_rate": 6.0472694208355465e-06,
"loss": 2.1188,
"step": 5805
},
{
"epoch": 2.4170566822672908,
"grad_norm": 0.7504671216011047,
"learning_rate": 6.006085163445993e-06,
"loss": 2.1852,
"step": 5810
},
{
"epoch": 2.419136765470619,
"grad_norm": 0.7341333627700806,
"learning_rate": 5.965022471653989e-06,
"loss": 2.1733,
"step": 5815
},
{
"epoch": 2.421216848673947,
"grad_norm": 0.7916009426116943,
"learning_rate": 5.924081608270574e-06,
"loss": 2.1516,
"step": 5820
},
{
"epoch": 2.423296931877275,
"grad_norm": 0.7159552574157715,
"learning_rate": 5.883262835327058e-06,
"loss": 2.1762,
"step": 5825
},
{
"epoch": 2.425377015080603,
"grad_norm": 0.8512945175170898,
"learning_rate": 5.842566414073361e-06,
"loss": 2.1495,
"step": 5830
},
{
"epoch": 2.427457098283931,
"grad_norm": 0.8425369262695312,
"learning_rate": 5.801992604976317e-06,
"loss": 2.1454,
"step": 5835
},
{
"epoch": 2.4295371814872597,
"grad_norm": 0.7385132312774658,
"learning_rate": 5.76154166771799e-06,
"loss": 2.1219,
"step": 5840
},
{
"epoch": 2.431617264690588,
"grad_norm": 0.7568185925483704,
"learning_rate": 5.721213861194066e-06,
"loss": 2.1488,
"step": 5845
},
{
"epoch": 2.433697347893916,
"grad_norm": 0.7648590803146362,
"learning_rate": 5.681009443512156e-06,
"loss": 2.1374,
"step": 5850
},
{
"epoch": 2.435777431097244,
"grad_norm": 0.7592807412147522,
"learning_rate": 5.640928671990139e-06,
"loss": 2.1475,
"step": 5855
},
{
"epoch": 2.437857514300572,
"grad_norm": 0.7430097460746765,
"learning_rate": 5.600971803154534e-06,
"loss": 2.1509,
"step": 5860
},
{
"epoch": 2.4399375975039,
"grad_norm": 0.7668802738189697,
"learning_rate": 5.561139092738865e-06,
"loss": 2.1284,
"step": 5865
},
{
"epoch": 2.4420176807072282,
"grad_norm": 1.1163736581802368,
"learning_rate": 5.521430795682012e-06,
"loss": 2.1686,
"step": 5870
},
{
"epoch": 2.4440977639105563,
"grad_norm": 0.7597137093544006,
"learning_rate": 5.481847166126555e-06,
"loss": 2.1846,
"step": 5875
},
{
"epoch": 2.4461778471138844,
"grad_norm": 0.7747199535369873,
"learning_rate": 5.442388457417211e-06,
"loss": 2.1709,
"step": 5880
},
{
"epoch": 2.4482579303172125,
"grad_norm": 0.8134779334068298,
"learning_rate": 5.403054922099132e-06,
"loss": 2.1754,
"step": 5885
},
{
"epoch": 2.4503380135205406,
"grad_norm": 0.8847607970237732,
"learning_rate": 5.3638468119163675e-06,
"loss": 2.1396,
"step": 5890
},
{
"epoch": 2.452418096723869,
"grad_norm": 0.8032739758491516,
"learning_rate": 5.324764377810187e-06,
"loss": 2.1771,
"step": 5895
},
{
"epoch": 2.454498179927197,
"grad_norm": 0.768173336982727,
"learning_rate": 5.285807869917522e-06,
"loss": 2.1521,
"step": 5900
},
{
"epoch": 2.4565782631305253,
"grad_norm": 0.9260585904121399,
"learning_rate": 5.246977537569345e-06,
"loss": 2.125,
"step": 5905
},
{
"epoch": 2.4586583463338534,
"grad_norm": 0.7803197503089905,
"learning_rate": 5.208273629289065e-06,
"loss": 2.1739,
"step": 5910
},
{
"epoch": 2.4607384295371815,
"grad_norm": 0.7892303466796875,
"learning_rate": 5.169696392790946e-06,
"loss": 2.1638,
"step": 5915
},
{
"epoch": 2.4628185127405096,
"grad_norm": 0.719467043876648,
"learning_rate": 5.13124607497853e-06,
"loss": 2.1767,
"step": 5920
},
{
"epoch": 2.4648985959438376,
"grad_norm": 0.8412065505981445,
"learning_rate": 5.0929229219430556e-06,
"loss": 2.1636,
"step": 5925
},
{
"epoch": 2.4669786791471657,
"grad_norm": 0.8725780844688416,
"learning_rate": 5.054727178961854e-06,
"loss": 2.1908,
"step": 5930
},
{
"epoch": 2.4690587623504943,
"grad_norm": 0.9785857796669006,
"learning_rate": 5.016659090496833e-06,
"loss": 2.1268,
"step": 5935
},
{
"epoch": 2.4711388455538223,
"grad_norm": 0.7336744070053101,
"learning_rate": 4.978718900192841e-06,
"loss": 2.1662,
"step": 5940
},
{
"epoch": 2.4732189287571504,
"grad_norm": 0.819547176361084,
"learning_rate": 4.940906850876184e-06,
"loss": 2.136,
"step": 5945
},
{
"epoch": 2.4752990119604785,
"grad_norm": 0.8629989624023438,
"learning_rate": 4.903223184553027e-06,
"loss": 2.1465,
"step": 5950
},
{
"epoch": 2.4773790951638066,
"grad_norm": 0.9366332292556763,
"learning_rate": 4.865668142407828e-06,
"loss": 2.1505,
"step": 5955
},
{
"epoch": 2.4794591783671347,
"grad_norm": 0.8467020988464355,
"learning_rate": 4.828241964801847e-06,
"loss": 2.1357,
"step": 5960
},
{
"epoch": 2.481539261570463,
"grad_norm": 0.9429585337638855,
"learning_rate": 4.790944891271581e-06,
"loss": 2.1352,
"step": 5965
},
{
"epoch": 2.483619344773791,
"grad_norm": 0.7886870503425598,
"learning_rate": 4.753777160527215e-06,
"loss": 2.1286,
"step": 5970
},
{
"epoch": 2.485699427977119,
"grad_norm": 0.9287928342819214,
"learning_rate": 4.716739010451102e-06,
"loss": 2.1432,
"step": 5975
},
{
"epoch": 2.487779511180447,
"grad_norm": 0.7071095705032349,
"learning_rate": 4.679830678096272e-06,
"loss": 2.1476,
"step": 5980
},
{
"epoch": 2.489859594383775,
"grad_norm": 0.7225685715675354,
"learning_rate": 4.643052399684886e-06,
"loss": 2.1121,
"step": 5985
},
{
"epoch": 2.4919396775871037,
"grad_norm": 0.7947995066642761,
"learning_rate": 4.6064044106067045e-06,
"loss": 2.1618,
"step": 5990
},
{
"epoch": 2.4940197607904318,
"grad_norm": 0.7835599780082703,
"learning_rate": 4.569886945417639e-06,
"loss": 2.1221,
"step": 5995
},
{
"epoch": 2.49609984399376,
"grad_norm": 0.7121095061302185,
"learning_rate": 4.533500237838187e-06,
"loss": 2.1202,
"step": 6000
},
{
"epoch": 2.498179927197088,
"grad_norm": 0.9667412042617798,
"learning_rate": 4.4972445207519895e-06,
"loss": 2.1949,
"step": 6005
},
{
"epoch": 2.500260010400416,
"grad_norm": 0.7887002825737,
"learning_rate": 4.461120026204299e-06,
"loss": 2.1333,
"step": 6010
},
{
"epoch": 2.502340093603744,
"grad_norm": 0.7729286551475525,
"learning_rate": 4.425126985400521e-06,
"loss": 2.1112,
"step": 6015
},
{
"epoch": 2.504420176807072,
"grad_norm": 0.8352200388908386,
"learning_rate": 4.389265628704734e-06,
"loss": 2.1195,
"step": 6020
},
{
"epoch": 2.5065002600104003,
"grad_norm": 0.7960278987884521,
"learning_rate": 4.353536185638188e-06,
"loss": 2.1827,
"step": 6025
},
{
"epoch": 2.508580343213729,
"grad_norm": 0.8067042827606201,
"learning_rate": 4.317938884877862e-06,
"loss": 2.1796,
"step": 6030
},
{
"epoch": 2.510660426417057,
"grad_norm": 0.7853888273239136,
"learning_rate": 4.282473954255e-06,
"loss": 2.1338,
"step": 6035
},
{
"epoch": 2.512740509620385,
"grad_norm": 0.7720993161201477,
"learning_rate": 4.247141620753642e-06,
"loss": 2.1328,
"step": 6040
},
{
"epoch": 2.514820592823713,
"grad_norm": 0.7566680908203125,
"learning_rate": 4.211942110509165e-06,
"loss": 2.1656,
"step": 6045
},
{
"epoch": 2.516900676027041,
"grad_norm": 0.7984460592269897,
"learning_rate": 4.17687564880686e-06,
"loss": 2.1554,
"step": 6050
},
{
"epoch": 2.5189807592303692,
"grad_norm": 0.7508406639099121,
"learning_rate": 4.141942460080461e-06,
"loss": 2.1899,
"step": 6055
},
{
"epoch": 2.5210608424336973,
"grad_norm": 0.8357878923416138,
"learning_rate": 4.107142767910741e-06,
"loss": 2.1807,
"step": 6060
},
{
"epoch": 2.5231409256370254,
"grad_norm": 0.7380987405776978,
"learning_rate": 4.0724767950240415e-06,
"loss": 2.1567,
"step": 6065
},
{
"epoch": 2.5252210088403535,
"grad_norm": 0.8467985391616821,
"learning_rate": 4.037944763290879e-06,
"loss": 2.1115,
"step": 6070
},
{
"epoch": 2.5273010920436816,
"grad_norm": 0.7928602695465088,
"learning_rate": 4.0035468937245245e-06,
"loss": 2.1322,
"step": 6075
},
{
"epoch": 2.5293811752470097,
"grad_norm": 0.9464954733848572,
"learning_rate": 3.9692834064795735e-06,
"loss": 2.1607,
"step": 6080
},
{
"epoch": 2.5314612584503378,
"grad_norm": 0.7425093054771423,
"learning_rate": 3.935154520850529e-06,
"loss": 2.1538,
"step": 6085
},
{
"epoch": 2.5335413416536663,
"grad_norm": 0.8105461001396179,
"learning_rate": 3.901160455270416e-06,
"loss": 2.1784,
"step": 6090
},
{
"epoch": 2.5356214248569944,
"grad_norm": 0.8401845097541809,
"learning_rate": 3.8673014273093945e-06,
"loss": 2.1461,
"step": 6095
},
{
"epoch": 2.5377015080603225,
"grad_norm": 0.9266782402992249,
"learning_rate": 3.833577653673346e-06,
"loss": 2.1359,
"step": 6100
},
{
"epoch": 2.5397815912636506,
"grad_norm": 0.754927933216095,
"learning_rate": 3.7999893502024707e-06,
"loss": 2.0896,
"step": 6105
},
{
"epoch": 2.5418616744669786,
"grad_norm": 0.7852912545204163,
"learning_rate": 3.7665367318699602e-06,
"loss": 2.1487,
"step": 6110
},
{
"epoch": 2.5439417576703067,
"grad_norm": 0.8186183571815491,
"learning_rate": 3.7332200127805585e-06,
"loss": 2.1314,
"step": 6115
},
{
"epoch": 2.546021840873635,
"grad_norm": 0.9804696440696716,
"learning_rate": 3.700039406169248e-06,
"loss": 2.1519,
"step": 6120
},
{
"epoch": 2.5481019240769633,
"grad_norm": 0.7977464199066162,
"learning_rate": 3.666995124399836e-06,
"loss": 2.1604,
"step": 6125
},
{
"epoch": 2.5501820072802914,
"grad_norm": 0.802650511264801,
"learning_rate": 3.63408737896363e-06,
"loss": 2.1328,
"step": 6130
},
{
"epoch": 2.5522620904836195,
"grad_norm": 0.8455849289894104,
"learning_rate": 3.6013163804780843e-06,
"loss": 2.1521,
"step": 6135
},
{
"epoch": 2.5543421736869476,
"grad_norm": 0.8368895649909973,
"learning_rate": 3.568682338685414e-06,
"loss": 2.1753,
"step": 6140
},
{
"epoch": 2.5564222568902757,
"grad_norm": 0.7585736513137817,
"learning_rate": 3.5361854624512912e-06,
"loss": 2.1695,
"step": 6145
},
{
"epoch": 2.5585023400936038,
"grad_norm": 0.8306118845939636,
"learning_rate": 3.503825959763496e-06,
"loss": 2.1526,
"step": 6150
},
{
"epoch": 2.560582423296932,
"grad_norm": 0.9081049561500549,
"learning_rate": 3.4716040377305944e-06,
"loss": 2.1503,
"step": 6155
},
{
"epoch": 2.56266250650026,
"grad_norm": 0.8425130844116211,
"learning_rate": 3.439519902580582e-06,
"loss": 2.1075,
"step": 6160
},
{
"epoch": 2.564742589703588,
"grad_norm": 0.8468633890151978,
"learning_rate": 3.4075737596596074e-06,
"loss": 2.1245,
"step": 6165
},
{
"epoch": 2.566822672906916,
"grad_norm": 1.2079002857208252,
"learning_rate": 3.375765813430612e-06,
"loss": 2.1524,
"step": 6170
},
{
"epoch": 2.568902756110244,
"grad_norm": 0.7111079096794128,
"learning_rate": 3.3440962674720743e-06,
"loss": 2.1511,
"step": 6175
},
{
"epoch": 2.5709828393135723,
"grad_norm": 1.0822231769561768,
"learning_rate": 3.312565324476649e-06,
"loss": 2.1155,
"step": 6180
},
{
"epoch": 2.573062922516901,
"grad_norm": 0.8851024508476257,
"learning_rate": 3.2811731862499166e-06,
"loss": 2.1426,
"step": 6185
},
{
"epoch": 2.575143005720229,
"grad_norm": 0.7527185678482056,
"learning_rate": 3.249920053709074e-06,
"loss": 2.1514,
"step": 6190
},
{
"epoch": 2.577223088923557,
"grad_norm": 0.7749047875404358,
"learning_rate": 3.218806126881643e-06,
"loss": 2.1163,
"step": 6195
},
{
"epoch": 2.579303172126885,
"grad_norm": 0.6856330037117004,
"learning_rate": 3.1878316049041984e-06,
"loss": 2.1971,
"step": 6200
},
{
"epoch": 2.581383255330213,
"grad_norm": 0.8586908578872681,
"learning_rate": 3.156996686021077e-06,
"loss": 2.1033,
"step": 6205
},
{
"epoch": 2.5834633385335413,
"grad_norm": 0.7209339141845703,
"learning_rate": 3.1263015675831427e-06,
"loss": 2.1671,
"step": 6210
},
{
"epoch": 2.5855434217368694,
"grad_norm": 0.8630194664001465,
"learning_rate": 3.095746446046499e-06,
"loss": 2.1427,
"step": 6215
},
{
"epoch": 2.587623504940198,
"grad_norm": 0.727620005607605,
"learning_rate": 3.0653315169712203e-06,
"loss": 2.1219,
"step": 6220
},
{
"epoch": 2.589703588143526,
"grad_norm": 0.7987605333328247,
"learning_rate": 3.0350569750201368e-06,
"loss": 2.1473,
"step": 6225
},
{
"epoch": 2.591783671346854,
"grad_norm": 0.8267928957939148,
"learning_rate": 3.00492301395755e-06,
"loss": 2.1306,
"step": 6230
},
{
"epoch": 2.593863754550182,
"grad_norm": 0.7957183718681335,
"learning_rate": 2.9749298266480264e-06,
"loss": 2.105,
"step": 6235
},
{
"epoch": 2.5959438377535102,
"grad_norm": 0.7759562730789185,
"learning_rate": 2.945077605055127e-06,
"loss": 2.1411,
"step": 6240
},
{
"epoch": 2.5980239209568383,
"grad_norm": 0.8441088795661926,
"learning_rate": 2.9153665402402137e-06,
"loss": 2.126,
"step": 6245
},
{
"epoch": 2.6001040041601664,
"grad_norm": 0.798534631729126,
"learning_rate": 2.8857968223612143e-06,
"loss": 2.1506,
"step": 6250
},
{
"epoch": 2.6021840873634945,
"grad_norm": 0.8515554070472717,
"learning_rate": 2.8563686406713863e-06,
"loss": 2.1549,
"step": 6255
},
{
"epoch": 2.6042641705668226,
"grad_norm": 1.0475796461105347,
"learning_rate": 2.8270821835181316e-06,
"loss": 2.1054,
"step": 6260
},
{
"epoch": 2.6063442537701507,
"grad_norm": 0.8114381432533264,
"learning_rate": 2.7979376383417798e-06,
"loss": 2.1381,
"step": 6265
},
{
"epoch": 2.6084243369734788,
"grad_norm": 0.7577459216117859,
"learning_rate": 2.768935191674396e-06,
"loss": 2.1364,
"step": 6270
},
{
"epoch": 2.610504420176807,
"grad_norm": 0.7933220267295837,
"learning_rate": 2.7400750291385697e-06,
"loss": 2.1323,
"step": 6275
},
{
"epoch": 2.6125845033801354,
"grad_norm": 0.7705368399620056,
"learning_rate": 2.711357335446246e-06,
"loss": 2.1612,
"step": 6280
},
{
"epoch": 2.6146645865834635,
"grad_norm": 0.8709501028060913,
"learning_rate": 2.682782294397529e-06,
"loss": 2.1322,
"step": 6285
},
{
"epoch": 2.6167446697867915,
"grad_norm": 0.9047673344612122,
"learning_rate": 2.654350088879523e-06,
"loss": 2.1423,
"step": 6290
},
{
"epoch": 2.6188247529901196,
"grad_norm": 0.8506908416748047,
"learning_rate": 2.626060900865132e-06,
"loss": 2.1844,
"step": 6295
},
{
"epoch": 2.6209048361934477,
"grad_norm": 0.7736124992370605,
"learning_rate": 2.5979149114119334e-06,
"loss": 2.1991,
"step": 6300
},
{
"epoch": 2.622984919396776,
"grad_norm": 0.8622713088989258,
"learning_rate": 2.569912300660987e-06,
"loss": 2.1664,
"step": 6305
},
{
"epoch": 2.625065002600104,
"grad_norm": 0.821495771408081,
"learning_rate": 2.54205324783571e-06,
"loss": 2.1621,
"step": 6310
},
{
"epoch": 2.627145085803432,
"grad_norm": 0.7039251923561096,
"learning_rate": 2.5143379312406847e-06,
"loss": 2.1118,
"step": 6315
},
{
"epoch": 2.6292251690067605,
"grad_norm": 1.277297019958496,
"learning_rate": 2.4867665282605755e-06,
"loss": 2.1417,
"step": 6320
},
{
"epoch": 2.6313052522100886,
"grad_norm": 0.711395263671875,
"learning_rate": 2.459339215358955e-06,
"loss": 2.0946,
"step": 6325
},
{
"epoch": 2.6333853354134167,
"grad_norm": 0.7947670817375183,
"learning_rate": 2.4320561680771874e-06,
"loss": 2.1806,
"step": 6330
},
{
"epoch": 2.6354654186167448,
"grad_norm": 0.7751787900924683,
"learning_rate": 2.4049175610332957e-06,
"loss": 2.1688,
"step": 6335
},
{
"epoch": 2.637545501820073,
"grad_norm": 0.978768527507782,
"learning_rate": 2.377923567920862e-06,
"loss": 2.1478,
"step": 6340
},
{
"epoch": 2.639625585023401,
"grad_norm": 0.7635921239852905,
"learning_rate": 2.351074361507888e-06,
"loss": 2.1733,
"step": 6345
},
{
"epoch": 2.641705668226729,
"grad_norm": 0.7769209146499634,
"learning_rate": 2.3243701136357266e-06,
"loss": 2.1009,
"step": 6350
},
{
"epoch": 2.643785751430057,
"grad_norm": 0.9034749865531921,
"learning_rate": 2.2978109952179416e-06,
"loss": 2.1271,
"step": 6355
},
{
"epoch": 2.645865834633385,
"grad_norm": 0.9482102990150452,
"learning_rate": 2.2713971762392456e-06,
"loss": 2.1541,
"step": 6360
},
{
"epoch": 2.6479459178367133,
"grad_norm": 0.8525805473327637,
"learning_rate": 2.245128825754406e-06,
"loss": 2.1331,
"step": 6365
},
{
"epoch": 2.6500260010400414,
"grad_norm": 0.7677263021469116,
"learning_rate": 2.2190061118871396e-06,
"loss": 2.177,
"step": 6370
},
{
"epoch": 2.6521060842433695,
"grad_norm": 0.7640902400016785,
"learning_rate": 2.193029201829061e-06,
"loss": 2.1332,
"step": 6375
},
{
"epoch": 2.654186167446698,
"grad_norm": 0.9157296419143677,
"learning_rate": 2.1671982618386098e-06,
"loss": 2.126,
"step": 6380
},
{
"epoch": 2.656266250650026,
"grad_norm": 0.8202362656593323,
"learning_rate": 2.1415134572399824e-06,
"loss": 2.1443,
"step": 6385
},
{
"epoch": 2.658346333853354,
"grad_norm": 0.881324052810669,
"learning_rate": 2.115974952422067e-06,
"loss": 2.1435,
"step": 6390
},
{
"epoch": 2.6604264170566823,
"grad_norm": 0.7640511393547058,
"learning_rate": 2.0905829108374077e-06,
"loss": 2.1499,
"step": 6395
},
{
"epoch": 2.6625065002600103,
"grad_norm": 0.6942290663719177,
"learning_rate": 2.065337495001135e-06,
"loss": 2.1699,
"step": 6400
},
{
"epoch": 2.6645865834633384,
"grad_norm": 0.7413431406021118,
"learning_rate": 2.0402388664899574e-06,
"loss": 2.1568,
"step": 6405
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.7595916986465454,
"learning_rate": 2.015287185941089e-06,
"loss": 2.1116,
"step": 6410
},
{
"epoch": 2.668746749869995,
"grad_norm": 0.7836700081825256,
"learning_rate": 1.9904826130512618e-06,
"loss": 2.1544,
"step": 6415
},
{
"epoch": 2.670826833073323,
"grad_norm": 0.8121596574783325,
"learning_rate": 1.9658253065756694e-06,
"loss": 2.1396,
"step": 6420
},
{
"epoch": 2.6729069162766512,
"grad_norm": 0.7615113854408264,
"learning_rate": 1.941315424326984e-06,
"loss": 2.1623,
"step": 6425
},
{
"epoch": 2.6749869994799793,
"grad_norm": 0.6900429725646973,
"learning_rate": 1.9169531231742892e-06,
"loss": 2.1518,
"step": 6430
},
{
"epoch": 2.6770670826833074,
"grad_norm": 0.7657844424247742,
"learning_rate": 1.8927385590421565e-06,
"loss": 2.1401,
"step": 6435
},
{
"epoch": 2.6791471658866355,
"grad_norm": 0.7979406714439392,
"learning_rate": 1.8686718869095815e-06,
"loss": 2.1908,
"step": 6440
},
{
"epoch": 2.6812272490899636,
"grad_norm": 0.6649172306060791,
"learning_rate": 1.8447532608090261e-06,
"loss": 2.1519,
"step": 6445
},
{
"epoch": 2.6833073322932917,
"grad_norm": 0.7565729022026062,
"learning_rate": 1.8209828338254132e-06,
"loss": 2.1739,
"step": 6450
},
{
"epoch": 2.6853874154966197,
"grad_norm": 0.8752434849739075,
"learning_rate": 1.797360758095165e-06,
"loss": 2.11,
"step": 6455
},
{
"epoch": 2.687467498699948,
"grad_norm": 0.6268664598464966,
"learning_rate": 1.7738871848052092e-06,
"loss": 2.1259,
"step": 6460
},
{
"epoch": 2.689547581903276,
"grad_norm": 0.7275916337966919,
"learning_rate": 1.750562264192035e-06,
"loss": 2.1678,
"step": 6465
},
{
"epoch": 2.691627665106604,
"grad_norm": 0.9467329382896423,
"learning_rate": 1.7273861455407075e-06,
"loss": 2.1566,
"step": 6470
},
{
"epoch": 2.6937077483099325,
"grad_norm": 0.7651445865631104,
"learning_rate": 1.7043589771839314e-06,
"loss": 2.1423,
"step": 6475
},
{
"epoch": 2.6957878315132606,
"grad_norm": 0.7837608456611633,
"learning_rate": 1.6814809065010927e-06,
"loss": 2.1355,
"step": 6480
},
{
"epoch": 2.6978679147165887,
"grad_norm": 0.7050347328186035,
"learning_rate": 1.6587520799173168e-06,
"loss": 2.1632,
"step": 6485
},
{
"epoch": 2.699947997919917,
"grad_norm": 0.8150919079780579,
"learning_rate": 1.6361726429025227e-06,
"loss": 2.1455,
"step": 6490
},
{
"epoch": 2.702028081123245,
"grad_norm": 0.7656033039093018,
"learning_rate": 1.6137427399705113e-06,
"loss": 2.1248,
"step": 6495
},
{
"epoch": 2.704108164326573,
"grad_norm": 0.8316398859024048,
"learning_rate": 1.5914625146780299e-06,
"loss": 2.1594,
"step": 6500
},
{
"epoch": 2.706188247529901,
"grad_norm": 0.7980731725692749,
"learning_rate": 1.569332109623839e-06,
"loss": 2.1578,
"step": 6505
},
{
"epoch": 2.7082683307332296,
"grad_norm": 0.8120521903038025,
"learning_rate": 1.5473516664478354e-06,
"loss": 2.1358,
"step": 6510
},
{
"epoch": 2.7103484139365577,
"grad_norm": 0.6915965676307678,
"learning_rate": 1.5255213258301037e-06,
"loss": 2.1685,
"step": 6515
},
{
"epoch": 2.7124284971398858,
"grad_norm": 0.8055347800254822,
"learning_rate": 1.5038412274900493e-06,
"loss": 2.1332,
"step": 6520
},
{
"epoch": 2.714508580343214,
"grad_norm": 1.0255202054977417,
"learning_rate": 1.4823115101854829e-06,
"loss": 2.1319,
"step": 6525
},
{
"epoch": 2.716588663546542,
"grad_norm": 0.7742552757263184,
"learning_rate": 1.4609323117117434e-06,
"loss": 2.1698,
"step": 6530
},
{
"epoch": 2.71866874674987,
"grad_norm": 0.9451241493225098,
"learning_rate": 1.4397037689008186e-06,
"loss": 2.133,
"step": 6535
},
{
"epoch": 2.720748829953198,
"grad_norm": 0.8512309193611145,
"learning_rate": 1.4186260176204668e-06,
"loss": 2.1614,
"step": 6540
},
{
"epoch": 2.722828913156526,
"grad_norm": 0.7536253929138184,
"learning_rate": 1.397699192773319e-06,
"loss": 2.0921,
"step": 6545
},
{
"epoch": 2.7249089963598543,
"grad_norm": 0.8096660375595093,
"learning_rate": 1.3769234282960702e-06,
"loss": 2.1722,
"step": 6550
},
{
"epoch": 2.7269890795631824,
"grad_norm": 0.7829006910324097,
"learning_rate": 1.3562988571585777e-06,
"loss": 2.1508,
"step": 6555
},
{
"epoch": 2.7290691627665105,
"grad_norm": 0.7823526263237,
"learning_rate": 1.3358256113630369e-06,
"loss": 2.1571,
"step": 6560
},
{
"epoch": 2.7311492459698385,
"grad_norm": 0.6584709882736206,
"learning_rate": 1.3155038219431065e-06,
"loss": 2.1743,
"step": 6565
},
{
"epoch": 2.733229329173167,
"grad_norm": 0.7424419522285461,
"learning_rate": 1.2953336189631098e-06,
"loss": 2.1726,
"step": 6570
},
{
"epoch": 2.735309412376495,
"grad_norm": 1.0195063352584839,
"learning_rate": 1.2753151315171602e-06,
"loss": 2.1497,
"step": 6575
},
{
"epoch": 2.7373894955798233,
"grad_norm": 0.7543803453445435,
"learning_rate": 1.2554484877283724e-06,
"loss": 2.1554,
"step": 6580
},
{
"epoch": 2.7394695787831513,
"grad_norm": 0.7631281614303589,
"learning_rate": 1.2357338147480107e-06,
"loss": 2.1328,
"step": 6585
},
{
"epoch": 2.7415496619864794,
"grad_norm": 0.7183612585067749,
"learning_rate": 1.2161712387547014e-06,
"loss": 2.1774,
"step": 6590
},
{
"epoch": 2.7436297451898075,
"grad_norm": 0.7182380557060242,
"learning_rate": 1.1967608849536127e-06,
"loss": 2.2034,
"step": 6595
},
{
"epoch": 2.7457098283931356,
"grad_norm": 0.9577491283416748,
"learning_rate": 1.177502877575648e-06,
"loss": 2.2012,
"step": 6600
},
{
"epoch": 2.747789911596464,
"grad_norm": 0.7523165941238403,
"learning_rate": 1.1583973398766573e-06,
"loss": 2.1192,
"step": 6605
},
{
"epoch": 2.749869994799792,
"grad_norm": 0.9832723140716553,
"learning_rate": 1.1394443941366518e-06,
"loss": 2.1164,
"step": 6610
},
{
"epoch": 2.7519500780031203,
"grad_norm": 0.6826837658882141,
"learning_rate": 1.1206441616590235e-06,
"loss": 2.1478,
"step": 6615
},
{
"epoch": 2.7540301612064484,
"grad_norm": 0.8534629344940186,
"learning_rate": 1.1019967627697498e-06,
"loss": 2.1572,
"step": 6620
},
{
"epoch": 2.7561102444097765,
"grad_norm": 0.7205258011817932,
"learning_rate": 1.0835023168166452e-06,
"loss": 2.1686,
"step": 6625
},
{
"epoch": 2.7581903276131046,
"grad_norm": 0.8041960597038269,
"learning_rate": 1.065160942168586e-06,
"loss": 2.1548,
"step": 6630
},
{
"epoch": 2.7602704108164327,
"grad_norm": 0.8386279940605164,
"learning_rate": 1.046972756214762e-06,
"loss": 2.1106,
"step": 6635
},
{
"epoch": 2.7623504940197607,
"grad_norm": 0.8722918629646301,
"learning_rate": 1.0289378753639055e-06,
"loss": 2.1529,
"step": 6640
},
{
"epoch": 2.764430577223089,
"grad_norm": 0.7601779103279114,
"learning_rate": 1.0110564150435709e-06,
"loss": 2.1476,
"step": 6645
},
{
"epoch": 2.766510660426417,
"grad_norm": 0.7956823706626892,
"learning_rate": 9.93328489699377e-07,
"loss": 2.1767,
"step": 6650
},
{
"epoch": 2.768590743629745,
"grad_norm": 0.7255099415779114,
"learning_rate": 9.757542127942998e-07,
"loss": 2.1429,
"step": 6655
},
{
"epoch": 2.770670826833073,
"grad_norm": 0.9191991686820984,
"learning_rate": 9.583336968078948e-07,
"loss": 2.1431,
"step": 6660
},
{
"epoch": 2.7727509100364016,
"grad_norm": 0.7821683287620544,
"learning_rate": 9.410670532356419e-07,
"loss": 2.0939,
"step": 6665
},
{
"epoch": 2.7748309932397297,
"grad_norm": 0.7056489586830139,
"learning_rate": 9.23954392588186e-07,
"loss": 2.1588,
"step": 6670
},
{
"epoch": 2.776911076443058,
"grad_norm": 0.6995697021484375,
"learning_rate": 9.069958243906524e-07,
"loss": 2.1238,
"step": 6675
},
{
"epoch": 2.778991159646386,
"grad_norm": 0.7750363945960999,
"learning_rate": 8.901914571819298e-07,
"loss": 2.156,
"step": 6680
},
{
"epoch": 2.781071242849714,
"grad_norm": 0.8692283034324646,
"learning_rate": 8.735413985139884e-07,
"loss": 2.1586,
"step": 6685
},
{
"epoch": 2.783151326053042,
"grad_norm": 0.8263838291168213,
"learning_rate": 8.570457549511802e-07,
"loss": 2.1698,
"step": 6690
},
{
"epoch": 2.78523140925637,
"grad_norm": 0.7581605911254883,
"learning_rate": 8.407046320695805e-07,
"loss": 2.1863,
"step": 6695
},
{
"epoch": 2.7873114924596982,
"grad_norm": 0.7336562871932983,
"learning_rate": 8.24518134456273e-07,
"loss": 2.1528,
"step": 6700
},
{
"epoch": 2.7893915756630268,
"grad_norm": 0.7941420078277588,
"learning_rate": 8.084863657087189e-07,
"loss": 2.1556,
"step": 6705
},
{
"epoch": 2.791471658866355,
"grad_norm": 0.7126220464706421,
"learning_rate": 7.926094284340713e-07,
"loss": 2.1559,
"step": 6710
},
{
"epoch": 2.793551742069683,
"grad_norm": 0.6795457601547241,
"learning_rate": 7.768874242485291e-07,
"loss": 2.1256,
"step": 6715
},
{
"epoch": 2.795631825273011,
"grad_norm": 0.7890847325325012,
"learning_rate": 7.613204537766704e-07,
"loss": 2.1345,
"step": 6720
},
{
"epoch": 2.797711908476339,
"grad_norm": 0.8842664957046509,
"learning_rate": 7.459086166508367e-07,
"loss": 2.1474,
"step": 6725
},
{
"epoch": 2.799791991679667,
"grad_norm": 0.9438645243644714,
"learning_rate": 7.306520115104743e-07,
"loss": 2.1808,
"step": 6730
},
{
"epoch": 2.8018720748829953,
"grad_norm": 0.8111101388931274,
"learning_rate": 7.155507360014941e-07,
"loss": 2.167,
"step": 6735
},
{
"epoch": 2.8039521580863234,
"grad_norm": 0.7519661784172058,
"learning_rate": 7.006048867756798e-07,
"loss": 2.1375,
"step": 6740
},
{
"epoch": 2.8060322412896515,
"grad_norm": 0.776961088180542,
"learning_rate": 6.858145594900389e-07,
"loss": 2.146,
"step": 6745
},
{
"epoch": 2.8081123244929795,
"grad_norm": 0.8284921050071716,
"learning_rate": 6.711798488062027e-07,
"loss": 2.1459,
"step": 6750
},
{
"epoch": 2.8101924076963076,
"grad_norm": 0.7575844526290894,
"learning_rate": 6.567008483898185e-07,
"loss": 2.1788,
"step": 6755
},
{
"epoch": 2.8122724908996357,
"grad_norm": 0.7377430200576782,
"learning_rate": 6.423776509099505e-07,
"loss": 2.1239,
"step": 6760
},
{
"epoch": 2.8143525741029642,
"grad_norm": 0.8331411480903625,
"learning_rate": 6.28210348038491e-07,
"loss": 2.1618,
"step": 6765
},
{
"epoch": 2.8164326573062923,
"grad_norm": 0.7900028824806213,
"learning_rate": 6.14199030449572e-07,
"loss": 2.142,
"step": 6770
},
{
"epoch": 2.8185127405096204,
"grad_norm": 0.7936934232711792,
"learning_rate": 6.003437878189661e-07,
"loss": 2.1309,
"step": 6775
},
{
"epoch": 2.8205928237129485,
"grad_norm": 0.7084915041923523,
"learning_rate": 5.866447088235444e-07,
"loss": 2.1436,
"step": 6780
},
{
"epoch": 2.8226729069162766,
"grad_norm": 0.894076406955719,
"learning_rate": 5.731018811406891e-07,
"loss": 2.1469,
"step": 6785
},
{
"epoch": 2.8247529901196047,
"grad_norm": 0.8683717250823975,
"learning_rate": 5.597153914477376e-07,
"loss": 2.174,
"step": 6790
},
{
"epoch": 2.8268330733229328,
"grad_norm": 0.6982574462890625,
"learning_rate": 5.464853254214225e-07,
"loss": 2.1291,
"step": 6795
},
{
"epoch": 2.8289131565262613,
"grad_norm": 0.7772095799446106,
"learning_rate": 5.334117677373352e-07,
"loss": 2.1343,
"step": 6800
},
{
"epoch": 2.8309932397295894,
"grad_norm": 0.893601655960083,
"learning_rate": 5.204948020693657e-07,
"loss": 2.1457,
"step": 6805
},
{
"epoch": 2.8330733229329175,
"grad_norm": 0.7480981349945068,
"learning_rate": 5.07734511089189e-07,
"loss": 2.1251,
"step": 6810
},
{
"epoch": 2.8351534061362456,
"grad_norm": 0.7743918299674988,
"learning_rate": 4.951309764657131e-07,
"loss": 2.1248,
"step": 6815
},
{
"epoch": 2.8372334893395736,
"grad_norm": 0.7804460525512695,
"learning_rate": 4.826842788645758e-07,
"loss": 2.1898,
"step": 6820
},
{
"epoch": 2.8393135725429017,
"grad_norm": 0.8208061456680298,
"learning_rate": 4.703944979476238e-07,
"loss": 2.1471,
"step": 6825
},
{
"epoch": 2.84139365574623,
"grad_norm": 0.9219505786895752,
"learning_rate": 4.5826171237239035e-07,
"loss": 2.1078,
"step": 6830
},
{
"epoch": 2.843473738949558,
"grad_norm": 0.838716447353363,
"learning_rate": 4.4628599979160136e-07,
"loss": 2.175,
"step": 6835
},
{
"epoch": 2.845553822152886,
"grad_norm": 0.7236716151237488,
"learning_rate": 4.344674368526841e-07,
"loss": 2.1288,
"step": 6840
},
{
"epoch": 2.847633905356214,
"grad_norm": 0.7392603158950806,
"learning_rate": 4.2280609919727323e-07,
"loss": 2.1579,
"step": 6845
},
{
"epoch": 2.849713988559542,
"grad_norm": 0.6753377914428711,
"learning_rate": 4.1130206146071106e-07,
"loss": 2.1519,
"step": 6850
},
{
"epoch": 2.8517940717628703,
"grad_norm": 0.9331068396568298,
"learning_rate": 3.999553972715925e-07,
"loss": 2.1456,
"step": 6855
},
{
"epoch": 2.853874154966199,
"grad_norm": 0.7977923154830933,
"learning_rate": 3.887661792512848e-07,
"loss": 2.1992,
"step": 6860
},
{
"epoch": 2.855954238169527,
"grad_norm": 0.8902707099914551,
"learning_rate": 3.777344790134585e-07,
"loss": 2.1761,
"step": 6865
},
{
"epoch": 2.858034321372855,
"grad_norm": 0.8167480230331421,
"learning_rate": 3.668603671636295e-07,
"loss": 2.1186,
"step": 6870
},
{
"epoch": 2.860114404576183,
"grad_norm": 0.7843384742736816,
"learning_rate": 3.5614391329871487e-07,
"loss": 2.1964,
"step": 6875
},
{
"epoch": 2.862194487779511,
"grad_norm": 0.9206196665763855,
"learning_rate": 3.4558518600658893e-07,
"loss": 2.1679,
"step": 6880
},
{
"epoch": 2.864274570982839,
"grad_norm": 0.7699995636940002,
"learning_rate": 3.3518425286562795e-07,
"loss": 2.158,
"step": 6885
},
{
"epoch": 2.8663546541861673,
"grad_norm": 0.860309898853302,
"learning_rate": 3.249411804442881e-07,
"loss": 2.1557,
"step": 6890
},
{
"epoch": 2.868434737389496,
"grad_norm": 0.8029336333274841,
"learning_rate": 3.1485603430068676e-07,
"loss": 2.1117,
"step": 6895
},
{
"epoch": 2.870514820592824,
"grad_norm": 0.7837362885475159,
"learning_rate": 3.049288789821664e-07,
"loss": 2.1301,
"step": 6900
},
{
"epoch": 2.872594903796152,
"grad_norm": 0.7675833106040955,
"learning_rate": 2.9515977802490324e-07,
"loss": 2.1545,
"step": 6905
},
{
"epoch": 2.87467498699948,
"grad_norm": 0.820162296295166,
"learning_rate": 2.8554879395347177e-07,
"loss": 2.1776,
"step": 6910
},
{
"epoch": 2.876755070202808,
"grad_norm": 1.1358041763305664,
"learning_rate": 2.760959882804753e-07,
"loss": 2.1434,
"step": 6915
},
{
"epoch": 2.8788351534061363,
"grad_norm": 0.8005056381225586,
"learning_rate": 2.668014215061243e-07,
"loss": 2.1697,
"step": 6920
},
{
"epoch": 2.8809152366094644,
"grad_norm": 0.81009840965271,
"learning_rate": 2.576651531178725e-07,
"loss": 2.1597,
"step": 6925
},
{
"epoch": 2.8829953198127924,
"grad_norm": 0.9089166522026062,
"learning_rate": 2.4868724159002323e-07,
"loss": 2.1189,
"step": 6930
},
{
"epoch": 2.8850754030161205,
"grad_norm": 0.9949043989181519,
"learning_rate": 2.398677443833569e-07,
"loss": 2.1673,
"step": 6935
},
{
"epoch": 2.8871554862194486,
"grad_norm": 0.7992036938667297,
"learning_rate": 2.3120671794476522e-07,
"loss": 2.1761,
"step": 6940
},
{
"epoch": 2.8892355694227767,
"grad_norm": 0.8436097502708435,
"learning_rate": 2.2270421770688722e-07,
"loss": 2.1527,
"step": 6945
},
{
"epoch": 2.891315652626105,
"grad_norm": 0.8040410280227661,
"learning_rate": 2.143602980877596e-07,
"loss": 2.1215,
"step": 6950
},
{
"epoch": 2.8933957358294333,
"grad_norm": 0.9088870882987976,
"learning_rate": 2.0617501249046156e-07,
"loss": 2.1204,
"step": 6955
},
{
"epoch": 2.8954758190327614,
"grad_norm": 0.6987228989601135,
"learning_rate": 1.9814841330277889e-07,
"loss": 2.1617,
"step": 6960
},
{
"epoch": 2.8975559022360895,
"grad_norm": 1.0931040048599243,
"learning_rate": 1.902805518968681e-07,
"loss": 2.145,
"step": 6965
},
{
"epoch": 2.8996359854394176,
"grad_norm": 0.8285526633262634,
"learning_rate": 1.8257147862892065e-07,
"loss": 2.1255,
"step": 6970
},
{
"epoch": 2.9017160686427457,
"grad_norm": 0.7968695163726807,
"learning_rate": 1.7502124283885478e-07,
"loss": 2.0997,
"step": 6975
},
{
"epoch": 2.9037961518460738,
"grad_norm": 0.8100786805152893,
"learning_rate": 1.6762989284997975e-07,
"loss": 2.1215,
"step": 6980
},
{
"epoch": 2.905876235049402,
"grad_norm": 0.7216346859931946,
"learning_rate": 1.6039747596870437e-07,
"loss": 2.1376,
"step": 6985
},
{
"epoch": 2.9079563182527304,
"grad_norm": 0.7840524911880493,
"learning_rate": 1.5332403848422606e-07,
"loss": 2.1328,
"step": 6990
},
{
"epoch": 2.9100364014560585,
"grad_norm": 0.9314194917678833,
"learning_rate": 1.464096256682368e-07,
"loss": 2.1194,
"step": 6995
},
{
"epoch": 2.9121164846593866,
"grad_norm": 0.6695131063461304,
"learning_rate": 1.3965428177463712e-07,
"loss": 2.1324,
"step": 7000
}
],
"logging_steps": 5,
"max_steps": 7209,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.7817058483577553e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}