{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9998492272898606,
"eval_steps": 500,
"global_step": 4421,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011307953260459858,
"grad_norm": 0.205332413315773,
"learning_rate": 2.5e-06,
"loss": 0.9778,
"step": 5
},
{
"epoch": 0.0022615906520919715,
"grad_norm": 0.2380959391593933,
"learning_rate": 5e-06,
"loss": 0.9816,
"step": 10
},
{
"epoch": 0.003392385978137957,
"grad_norm": 0.22828762233257294,
"learning_rate": 7.5e-06,
"loss": 1.0123,
"step": 15
},
{
"epoch": 0.004523181304183943,
"grad_norm": 0.1957542896270752,
"learning_rate": 1e-05,
"loss": 0.9404,
"step": 20
},
{
"epoch": 0.005653976630229929,
"grad_norm": 0.2502771019935608,
"learning_rate": 1.25e-05,
"loss": 0.9604,
"step": 25
},
{
"epoch": 0.006784771956275914,
"grad_norm": 0.24806493520736694,
"learning_rate": 1.5e-05,
"loss": 1.0407,
"step": 30
},
{
"epoch": 0.0079155672823219,
"grad_norm": 0.28463977575302124,
"learning_rate": 1.75e-05,
"loss": 1.0461,
"step": 35
},
{
"epoch": 0.009046362608367886,
"grad_norm": 0.2142462134361267,
"learning_rate": 2e-05,
"loss": 0.9104,
"step": 40
},
{
"epoch": 0.010177157934413872,
"grad_norm": 0.21732334792613983,
"learning_rate": 2.25e-05,
"loss": 0.8991,
"step": 45
},
{
"epoch": 0.011307953260459858,
"grad_norm": 0.2227325588464737,
"learning_rate": 2.5e-05,
"loss": 0.8901,
"step": 50
},
{
"epoch": 0.012438748586505842,
"grad_norm": 0.19881105422973633,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.8378,
"step": 55
},
{
"epoch": 0.013569543912551827,
"grad_norm": 0.21935518085956573,
"learning_rate": 3e-05,
"loss": 0.8743,
"step": 60
},
{
"epoch": 0.014700339238597813,
"grad_norm": 0.21730449795722961,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.8588,
"step": 65
},
{
"epoch": 0.0158311345646438,
"grad_norm": 0.23200418055057526,
"learning_rate": 3.5e-05,
"loss": 0.7527,
"step": 70
},
{
"epoch": 0.016961929890689786,
"grad_norm": 0.20900775492191315,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.8365,
"step": 75
},
{
"epoch": 0.018092725216735772,
"grad_norm": 0.31192561984062195,
"learning_rate": 4e-05,
"loss": 0.7791,
"step": 80
},
{
"epoch": 0.019223520542781758,
"grad_norm": 0.25915804505348206,
"learning_rate": 4.25e-05,
"loss": 0.8506,
"step": 85
},
{
"epoch": 0.020354315868827744,
"grad_norm": 0.20527321100234985,
"learning_rate": 4.5e-05,
"loss": 0.8062,
"step": 90
},
{
"epoch": 0.02148511119487373,
"grad_norm": 0.2385016530752182,
"learning_rate": 4.75e-05,
"loss": 0.7525,
"step": 95
},
{
"epoch": 0.022615906520919715,
"grad_norm": 0.2394818663597107,
"learning_rate": 5e-05,
"loss": 0.7416,
"step": 100
},
{
"epoch": 0.023746701846965697,
"grad_norm": 0.269607275724411,
"learning_rate": 4.999983481113995e-05,
"loss": 0.7653,
"step": 105
},
{
"epoch": 0.024877497173011683,
"grad_norm": 0.21368731558322906,
"learning_rate": 4.9999339246742786e-05,
"loss": 0.75,
"step": 110
},
{
"epoch": 0.02600829249905767,
"grad_norm": 0.25945496559143066,
"learning_rate": 4.9998513313357435e-05,
"loss": 0.7693,
"step": 115
},
{
"epoch": 0.027139087825103655,
"grad_norm": 0.2617523968219757,
"learning_rate": 4.999735702189871e-05,
"loss": 0.7995,
"step": 120
},
{
"epoch": 0.02826988315114964,
"grad_norm": 0.26992905139923096,
"learning_rate": 4.999587038764713e-05,
"loss": 0.7784,
"step": 125
},
{
"epoch": 0.029400678477195626,
"grad_norm": 0.23823940753936768,
"learning_rate": 4.999405343024871e-05,
"loss": 0.7316,
"step": 130
},
{
"epoch": 0.030531473803241612,
"grad_norm": 0.2858569920063019,
"learning_rate": 4.9991906173714756e-05,
"loss": 0.7796,
"step": 135
},
{
"epoch": 0.0316622691292876,
"grad_norm": 0.25298023223876953,
"learning_rate": 4.99894286464215e-05,
"loss": 0.7169,
"step": 140
},
{
"epoch": 0.03279306445533359,
"grad_norm": 0.35693949460983276,
"learning_rate": 4.998662088110972e-05,
"loss": 0.8062,
"step": 145
},
{
"epoch": 0.03392385978137957,
"grad_norm": 0.42634308338165283,
"learning_rate": 4.998348291488435e-05,
"loss": 0.7035,
"step": 150
},
{
"epoch": 0.03505465510742556,
"grad_norm": 0.34167715907096863,
"learning_rate": 4.998001478921395e-05,
"loss": 0.7683,
"step": 155
},
{
"epoch": 0.036185450433471544,
"grad_norm": 0.2687824070453644,
"learning_rate": 4.997621654993018e-05,
"loss": 0.7816,
"step": 160
},
{
"epoch": 0.03731624575951753,
"grad_norm": 0.2919199764728546,
"learning_rate": 4.997208824722719e-05,
"loss": 0.7392,
"step": 165
},
{
"epoch": 0.038447041085563516,
"grad_norm": 0.24317045509815216,
"learning_rate": 4.9967629935660944e-05,
"loss": 0.6972,
"step": 170
},
{
"epoch": 0.0395778364116095,
"grad_norm": 0.2556512951850891,
"learning_rate": 4.9962841674148516e-05,
"loss": 0.7431,
"step": 175
},
{
"epoch": 0.04070863173765549,
"grad_norm": 0.35918310284614563,
"learning_rate": 4.99577235259673e-05,
"loss": 0.78,
"step": 180
},
{
"epoch": 0.04183942706370147,
"grad_norm": 0.28553536534309387,
"learning_rate": 4.9952275558754185e-05,
"loss": 0.7467,
"step": 185
},
{
"epoch": 0.04297022238974746,
"grad_norm": 0.25147977471351624,
"learning_rate": 4.994649784450465e-05,
"loss": 0.7579,
"step": 190
},
{
"epoch": 0.044101017715793445,
"grad_norm": 0.3088456690311432,
"learning_rate": 4.994039045957182e-05,
"loss": 0.752,
"step": 195
},
{
"epoch": 0.04523181304183943,
"grad_norm": 0.32329487800598145,
"learning_rate": 4.993395348466544e-05,
"loss": 0.7012,
"step": 200
},
{
"epoch": 0.046362608367885416,
"grad_norm": 0.28732138872146606,
"learning_rate": 4.992718700485085e-05,
"loss": 0.7247,
"step": 205
},
{
"epoch": 0.047493403693931395,
"grad_norm": 0.2657299339771271,
"learning_rate": 4.99200911095478e-05,
"loss": 0.7247,
"step": 210
},
{
"epoch": 0.04862419901997738,
"grad_norm": 0.30124104022979736,
"learning_rate": 4.991266589252933e-05,
"loss": 0.7001,
"step": 215
},
{
"epoch": 0.049754994346023367,
"grad_norm": 0.3533799946308136,
"learning_rate": 4.990491145192049e-05,
"loss": 0.7714,
"step": 220
},
{
"epoch": 0.05088578967206935,
"grad_norm": 0.29441332817077637,
"learning_rate": 4.989682789019706e-05,
"loss": 0.7338,
"step": 225
},
{
"epoch": 0.05201658499811534,
"grad_norm": 0.2670339345932007,
"learning_rate": 4.988841531418418e-05,
"loss": 0.719,
"step": 230
},
{
"epoch": 0.053147380324161324,
"grad_norm": 0.44572877883911133,
"learning_rate": 4.9879673835054955e-05,
"loss": 0.7315,
"step": 235
},
{
"epoch": 0.05427817565020731,
"grad_norm": 0.29553067684173584,
"learning_rate": 4.9870603568328985e-05,
"loss": 0.7495,
"step": 240
},
{
"epoch": 0.055408970976253295,
"grad_norm": 0.26393231749534607,
"learning_rate": 4.986120463387084e-05,
"loss": 0.6637,
"step": 245
},
{
"epoch": 0.05653976630229928,
"grad_norm": 0.35982418060302734,
"learning_rate": 4.985147715588845e-05,
"loss": 0.7571,
"step": 250
},
{
"epoch": 0.05767056162834527,
"grad_norm": 0.38977113366127014,
"learning_rate": 4.9841421262931506e-05,
"loss": 0.7551,
"step": 255
},
{
"epoch": 0.05880135695439125,
"grad_norm": 0.28935956954956055,
"learning_rate": 4.983103708788972e-05,
"loss": 0.7863,
"step": 260
},
{
"epoch": 0.05993215228043724,
"grad_norm": 0.34443530440330505,
"learning_rate": 4.98203247679911e-05,
"loss": 0.8106,
"step": 265
},
{
"epoch": 0.061062947606483224,
"grad_norm": 0.4763427674770355,
"learning_rate": 4.980928444480011e-05,
"loss": 0.7729,
"step": 270
},
{
"epoch": 0.06219374293252921,
"grad_norm": 0.2860422730445862,
"learning_rate": 4.9797916264215824e-05,
"loss": 0.7593,
"step": 275
},
{
"epoch": 0.0633245382585752,
"grad_norm": 0.28870680928230286,
"learning_rate": 4.978622037647e-05,
"loss": 0.7574,
"step": 280
},
{
"epoch": 0.06445533358462119,
"grad_norm": 0.40277180075645447,
"learning_rate": 4.9774196936125056e-05,
"loss": 0.799,
"step": 285
},
{
"epoch": 0.06558612891066717,
"grad_norm": 0.3290288746356964,
"learning_rate": 4.9761846102072065e-05,
"loss": 0.7519,
"step": 290
},
{
"epoch": 0.06671692423671316,
"grad_norm": 0.3139791190624237,
"learning_rate": 4.9749168037528635e-05,
"loss": 0.6837,
"step": 295
},
{
"epoch": 0.06784771956275915,
"grad_norm": 0.30802035331726074,
"learning_rate": 4.9736162910036785e-05,
"loss": 0.7662,
"step": 300
},
{
"epoch": 0.06897851488880513,
"grad_norm": 0.34561124444007874,
"learning_rate": 4.972283089146067e-05,
"loss": 0.6897,
"step": 305
},
{
"epoch": 0.07010931021485112,
"grad_norm": 0.3372039198875427,
"learning_rate": 4.970917215798438e-05,
"loss": 0.7344,
"step": 310
},
{
"epoch": 0.0712401055408971,
"grad_norm": 0.41160914301872253,
"learning_rate": 4.9695186890109567e-05,
"loss": 0.832,
"step": 315
},
{
"epoch": 0.07237090086694309,
"grad_norm": 0.2914057672023773,
"learning_rate": 4.968087527265306e-05,
"loss": 0.7113,
"step": 320
},
{
"epoch": 0.07350169619298907,
"grad_norm": 0.3247675597667694,
"learning_rate": 4.966623749474445e-05,
"loss": 0.6996,
"step": 325
},
{
"epoch": 0.07463249151903506,
"grad_norm": 0.435735285282135,
"learning_rate": 4.9651273749823546e-05,
"loss": 0.8236,
"step": 330
},
{
"epoch": 0.07576328684508105,
"grad_norm": 0.3213053047657013,
"learning_rate": 4.963598423563788e-05,
"loss": 0.7012,
"step": 335
},
{
"epoch": 0.07689408217112703,
"grad_norm": 0.3745056390762329,
"learning_rate": 4.962036915424004e-05,
"loss": 0.7018,
"step": 340
},
{
"epoch": 0.07802487749717302,
"grad_norm": 0.28368842601776123,
"learning_rate": 4.960442871198503e-05,
"loss": 0.7084,
"step": 345
},
{
"epoch": 0.079155672823219,
"grad_norm": 0.2621799409389496,
"learning_rate": 4.958816311952752e-05,
"loss": 0.7217,
"step": 350
},
{
"epoch": 0.08028646814926499,
"grad_norm": 0.25561287999153137,
"learning_rate": 4.95715725918191e-05,
"loss": 0.7616,
"step": 355
},
{
"epoch": 0.08141726347531097,
"grad_norm": 0.3495071828365326,
"learning_rate": 4.9554657348105385e-05,
"loss": 0.7061,
"step": 360
},
{
"epoch": 0.08254805880135696,
"grad_norm": 0.3490068018436432,
"learning_rate": 4.953741761192317e-05,
"loss": 0.7809,
"step": 365
},
{
"epoch": 0.08367885412740295,
"grad_norm": 0.39416739344596863,
"learning_rate": 4.9519853611097434e-05,
"loss": 0.7282,
"step": 370
},
{
"epoch": 0.08480964945344893,
"grad_norm": 0.2763444185256958,
"learning_rate": 4.950196557773837e-05,
"loss": 0.7262,
"step": 375
},
{
"epoch": 0.08594044477949492,
"grad_norm": 0.29107871651649475,
"learning_rate": 4.948375374823828e-05,
"loss": 0.7346,
"step": 380
},
{
"epoch": 0.0870712401055409,
"grad_norm": 0.28965339064598083,
"learning_rate": 4.946521836326847e-05,
"loss": 0.6768,
"step": 385
},
{
"epoch": 0.08820203543158689,
"grad_norm": 0.31072792410850525,
"learning_rate": 4.9446359667776065e-05,
"loss": 0.7277,
"step": 390
},
{
"epoch": 0.08933283075763288,
"grad_norm": 0.2789427936077118,
"learning_rate": 4.9427177910980794e-05,
"loss": 0.7481,
"step": 395
},
{
"epoch": 0.09046362608367886,
"grad_norm": 0.2573710083961487,
"learning_rate": 4.9407673346371644e-05,
"loss": 0.7077,
"step": 400
},
{
"epoch": 0.09159442140972485,
"grad_norm": 0.4152914881706238,
"learning_rate": 4.938784623170357e-05,
"loss": 0.7233,
"step": 405
},
{
"epoch": 0.09272521673577083,
"grad_norm": 0.30680012702941895,
"learning_rate": 4.936769682899404e-05,
"loss": 0.7353,
"step": 410
},
{
"epoch": 0.0938560120618168,
"grad_norm": 0.30145958065986633,
"learning_rate": 4.934722540451961e-05,
"loss": 0.7001,
"step": 415
},
{
"epoch": 0.09498680738786279,
"grad_norm": 0.31772518157958984,
"learning_rate": 4.932643222881238e-05,
"loss": 0.7183,
"step": 420
},
{
"epoch": 0.09611760271390878,
"grad_norm": 0.3001084327697754,
"learning_rate": 4.930531757665643e-05,
"loss": 0.6898,
"step": 425
},
{
"epoch": 0.09724839803995476,
"grad_norm": 0.2780250012874603,
"learning_rate": 4.928388172708418e-05,
"loss": 0.7782,
"step": 430
},
{
"epoch": 0.09837919336600075,
"grad_norm": 0.28147390484809875,
"learning_rate": 4.926212496337272e-05,
"loss": 0.7311,
"step": 435
},
{
"epoch": 0.09950998869204673,
"grad_norm": 0.4945797324180603,
"learning_rate": 4.924004757304005e-05,
"loss": 0.8001,
"step": 440
},
{
"epoch": 0.10064078401809272,
"grad_norm": 0.3075043857097626,
"learning_rate": 4.921764984784128e-05,
"loss": 0.7233,
"step": 445
},
{
"epoch": 0.1017715793441387,
"grad_norm": 0.3451552093029022,
"learning_rate": 4.919493208376479e-05,
"loss": 0.6629,
"step": 450
},
{
"epoch": 0.10290237467018469,
"grad_norm": 0.28970155119895935,
"learning_rate": 4.917189458102831e-05,
"loss": 0.7793,
"step": 455
},
{
"epoch": 0.10403316999623068,
"grad_norm": 0.2446502447128296,
"learning_rate": 4.9148537644074936e-05,
"loss": 0.6899,
"step": 460
},
{
"epoch": 0.10516396532227666,
"grad_norm": 0.2791134715080261,
"learning_rate": 4.912486158156912e-05,
"loss": 0.69,
"step": 465
},
{
"epoch": 0.10629476064832265,
"grad_norm": 0.35021790862083435,
"learning_rate": 4.910086670639264e-05,
"loss": 0.7497,
"step": 470
},
{
"epoch": 0.10742555597436863,
"grad_norm": 0.27730756998062134,
"learning_rate": 4.907655333564035e-05,
"loss": 0.6799,
"step": 475
},
{
"epoch": 0.10855635130041462,
"grad_norm": 0.3183215856552124,
"learning_rate": 4.9051921790616095e-05,
"loss": 0.723,
"step": 480
},
{
"epoch": 0.1096871466264606,
"grad_norm": 0.31501445174217224,
"learning_rate": 4.902697239682844e-05,
"loss": 0.7611,
"step": 485
},
{
"epoch": 0.11081794195250659,
"grad_norm": 0.30429741740226746,
"learning_rate": 4.9001705483986314e-05,
"loss": 0.7909,
"step": 490
},
{
"epoch": 0.11194873727855258,
"grad_norm": 0.27980148792266846,
"learning_rate": 4.8976121385994735e-05,
"loss": 0.7085,
"step": 495
},
{
"epoch": 0.11307953260459856,
"grad_norm": 0.2850303649902344,
"learning_rate": 4.895022044095034e-05,
"loss": 0.751,
"step": 500
},
{
"epoch": 0.11421032793064455,
"grad_norm": 0.30970653891563416,
"learning_rate": 4.892400299113693e-05,
"loss": 0.6766,
"step": 505
},
{
"epoch": 0.11534112325669053,
"grad_norm": 0.4121417999267578,
"learning_rate": 4.8897469383020966e-05,
"loss": 0.6824,
"step": 510
},
{
"epoch": 0.11647191858273652,
"grad_norm": 0.3178861737251282,
"learning_rate": 4.887061996724696e-05,
"loss": 0.6798,
"step": 515
},
{
"epoch": 0.1176027139087825,
"grad_norm": 0.3267967700958252,
"learning_rate": 4.884345509863286e-05,
"loss": 0.7661,
"step": 520
},
{
"epoch": 0.11873350923482849,
"grad_norm": 0.3270506262779236,
"learning_rate": 4.881597513616536e-05,
"loss": 0.7321,
"step": 525
},
{
"epoch": 0.11986430456087448,
"grad_norm": 0.3873696029186249,
"learning_rate": 4.878818044299517e-05,
"loss": 0.7278,
"step": 530
},
{
"epoch": 0.12099509988692046,
"grad_norm": 0.3305418789386749,
"learning_rate": 4.876007138643216e-05,
"loss": 0.7304,
"step": 535
},
{
"epoch": 0.12212589521296645,
"grad_norm": 0.26419228315353394,
"learning_rate": 4.873164833794059e-05,
"loss": 0.7248,
"step": 540
},
{
"epoch": 0.12325669053901243,
"grad_norm": 0.3038617968559265,
"learning_rate": 4.870291167313413e-05,
"loss": 0.6681,
"step": 545
},
{
"epoch": 0.12438748586505842,
"grad_norm": 0.2820129692554474,
"learning_rate": 4.8673861771770934e-05,
"loss": 0.7434,
"step": 550
},
{
"epoch": 0.12551828119110442,
"grad_norm": 0.3421660363674164,
"learning_rate": 4.8644499017748615e-05,
"loss": 0.7266,
"step": 555
},
{
"epoch": 0.1266490765171504,
"grad_norm": 0.3642486035823822,
"learning_rate": 4.861482379909914e-05,
"loss": 0.7421,
"step": 560
},
{
"epoch": 0.1277798718431964,
"grad_norm": 0.35517194867134094,
"learning_rate": 4.8584836507983786e-05,
"loss": 0.7432,
"step": 565
},
{
"epoch": 0.12891066716924238,
"grad_norm": 0.3161648213863373,
"learning_rate": 4.855453754068784e-05,
"loss": 0.7098,
"step": 570
},
{
"epoch": 0.13004146249528836,
"grad_norm": 0.296561598777771,
"learning_rate": 4.852392729761547e-05,
"loss": 0.6641,
"step": 575
},
{
"epoch": 0.13117225782133435,
"grad_norm": 0.323515921831131,
"learning_rate": 4.849300618328435e-05,
"loss": 0.7522,
"step": 580
},
{
"epoch": 0.13230305314738033,
"grad_norm": 0.34789595007896423,
"learning_rate": 4.8461774606320386e-05,
"loss": 0.7712,
"step": 585
},
{
"epoch": 0.13343384847342632,
"grad_norm": 0.3661488890647888,
"learning_rate": 4.843023297945226e-05,
"loss": 0.6862,
"step": 590
},
{
"epoch": 0.1345646437994723,
"grad_norm": 0.43650659918785095,
"learning_rate": 4.8398381719506e-05,
"loss": 0.7003,
"step": 595
},
{
"epoch": 0.1356954391255183,
"grad_norm": 0.38563141226768494,
"learning_rate": 4.836622124739948e-05,
"loss": 0.7094,
"step": 600
},
{
"epoch": 0.13682623445156428,
"grad_norm": 0.30190715193748474,
"learning_rate": 4.833375198813683e-05,
"loss": 0.6664,
"step": 605
},
{
"epoch": 0.13795702977761026,
"grad_norm": 0.35016635060310364,
"learning_rate": 4.8300974370802855e-05,
"loss": 0.6657,
"step": 610
},
{
"epoch": 0.13908782510365625,
"grad_norm": 0.3495071530342102,
"learning_rate": 4.8267888828557315e-05,
"loss": 0.7689,
"step": 615
},
{
"epoch": 0.14021862042970223,
"grad_norm": 0.2628171145915985,
"learning_rate": 4.823449579862927e-05,
"loss": 0.7278,
"step": 620
},
{
"epoch": 0.14134941575574822,
"grad_norm": 0.3362691104412079,
"learning_rate": 4.820079572231123e-05,
"loss": 0.6934,
"step": 625
},
{
"epoch": 0.1424802110817942,
"grad_norm": 0.32949429750442505,
"learning_rate": 4.8166789044953385e-05,
"loss": 0.6363,
"step": 630
},
{
"epoch": 0.1436110064078402,
"grad_norm": 0.3482156991958618,
"learning_rate": 4.813247621595766e-05,
"loss": 0.6735,
"step": 635
},
{
"epoch": 0.14474180173388618,
"grad_norm": 0.27361541986465454,
"learning_rate": 4.809785768877183e-05,
"loss": 0.6783,
"step": 640
},
{
"epoch": 0.14587259705993216,
"grad_norm": 0.29385972023010254,
"learning_rate": 4.80629339208835e-05,
"loss": 0.6947,
"step": 645
},
{
"epoch": 0.14700339238597815,
"grad_norm": 0.2907145023345947,
"learning_rate": 4.802770537381407e-05,
"loss": 0.6583,
"step": 650
},
{
"epoch": 0.14813418771202413,
"grad_norm": 0.3557474613189697,
"learning_rate": 4.799217251311261e-05,
"loss": 0.6196,
"step": 655
},
{
"epoch": 0.14926498303807012,
"grad_norm": 0.3381137251853943,
"learning_rate": 4.795633580834974e-05,
"loss": 0.6959,
"step": 660
},
{
"epoch": 0.1503957783641161,
"grad_norm": 0.3507809042930603,
"learning_rate": 4.792019573311142e-05,
"loss": 0.7787,
"step": 665
},
{
"epoch": 0.1515265736901621,
"grad_norm": 0.3603408634662628,
"learning_rate": 4.7883752764992676e-05,
"loss": 0.6956,
"step": 670
},
{
"epoch": 0.15265736901620808,
"grad_norm": 0.3778272867202759,
"learning_rate": 4.7847007385591295e-05,
"loss": 0.6352,
"step": 675
},
{
"epoch": 0.15378816434225406,
"grad_norm": 0.3363897502422333,
"learning_rate": 4.7809960080501464e-05,
"loss": 0.6615,
"step": 680
},
{
"epoch": 0.15491895966830005,
"grad_norm": 0.32491081953048706,
"learning_rate": 4.777261133930735e-05,
"loss": 0.7499,
"step": 685
},
{
"epoch": 0.15604975499434603,
"grad_norm": 0.318862646818161,
"learning_rate": 4.773496165557663e-05,
"loss": 0.725,
"step": 690
},
{
"epoch": 0.15718055032039202,
"grad_norm": 0.45129063725471497,
"learning_rate": 4.7697011526853976e-05,
"loss": 0.7582,
"step": 695
},
{
"epoch": 0.158311345646438,
"grad_norm": 0.3082630932331085,
"learning_rate": 4.7658761454654454e-05,
"loss": 0.834,
"step": 700
},
{
"epoch": 0.159442140972484,
"grad_norm": 0.29232099652290344,
"learning_rate": 4.762021194445695e-05,
"loss": 0.688,
"step": 705
},
{
"epoch": 0.16057293629852998,
"grad_norm": 0.304189532995224,
"learning_rate": 4.758136350569743e-05,
"loss": 0.6758,
"step": 710
},
{
"epoch": 0.16170373162457596,
"grad_norm": 0.3389667570590973,
"learning_rate": 4.754221665176223e-05,
"loss": 0.6746,
"step": 715
},
{
"epoch": 0.16283452695062195,
"grad_norm": 0.5311838388442993,
"learning_rate": 4.7502771899981284e-05,
"loss": 0.8003,
"step": 720
},
{
"epoch": 0.16396532227666794,
"grad_norm": 0.26352110505104065,
"learning_rate": 4.7463029771621294e-05,
"loss": 0.6647,
"step": 725
},
{
"epoch": 0.16509611760271392,
"grad_norm": 0.3928554058074951,
"learning_rate": 4.74229907918788e-05,
"loss": 0.7258,
"step": 730
},
{
"epoch": 0.1662269129287599,
"grad_norm": 0.4840872883796692,
"learning_rate": 4.738265548987327e-05,
"loss": 0.7886,
"step": 735
},
{
"epoch": 0.1673577082548059,
"grad_norm": 0.324370414018631,
"learning_rate": 4.734202439864012e-05,
"loss": 0.7031,
"step": 740
},
{
"epoch": 0.16848850358085188,
"grad_norm": 0.30743566155433655,
"learning_rate": 4.730109805512363e-05,
"loss": 0.7228,
"step": 745
},
{
"epoch": 0.16961929890689786,
"grad_norm": 0.3641277551651001,
"learning_rate": 4.7259877000169896e-05,
"loss": 0.7265,
"step": 750
},
{
"epoch": 0.17075009423294385,
"grad_norm": 0.40837985277175903,
"learning_rate": 4.721836177851963e-05,
"loss": 0.7128,
"step": 755
},
{
"epoch": 0.17188088955898984,
"grad_norm": 0.28167346119880676,
"learning_rate": 4.717655293880102e-05,
"loss": 0.6837,
"step": 760
},
{
"epoch": 0.17301168488503582,
"grad_norm": 0.37647080421447754,
"learning_rate": 4.713445103352241e-05,
"loss": 0.7493,
"step": 765
},
{
"epoch": 0.1741424802110818,
"grad_norm": 0.3222416043281555,
"learning_rate": 4.7092056619065084e-05,
"loss": 0.6314,
"step": 770
},
{
"epoch": 0.1752732755371278,
"grad_norm": 0.29139477014541626,
"learning_rate": 4.704937025567582e-05,
"loss": 0.7274,
"step": 775
},
{
"epoch": 0.17640407086317378,
"grad_norm": 0.3189648687839508,
"learning_rate": 4.700639250745957e-05,
"loss": 0.7202,
"step": 780
},
{
"epoch": 0.17753486618921976,
"grad_norm": 0.26070472598075867,
"learning_rate": 4.696312394237195e-05,
"loss": 0.7426,
"step": 785
},
{
"epoch": 0.17866566151526575,
"grad_norm": 0.384833961725235,
"learning_rate": 4.691956513221174e-05,
"loss": 0.7669,
"step": 790
},
{
"epoch": 0.17979645684131174,
"grad_norm": 0.3161134421825409,
"learning_rate": 4.6875716652613366e-05,
"loss": 0.7224,
"step": 795
},
{
"epoch": 0.18092725216735772,
"grad_norm": 0.40663212537765503,
"learning_rate": 4.6831579083039265e-05,
"loss": 0.7176,
"step": 800
},
{
"epoch": 0.1820580474934037,
"grad_norm": 0.4073905646800995,
"learning_rate": 4.6787153006772214e-05,
"loss": 0.7454,
"step": 805
},
{
"epoch": 0.1831888428194497,
"grad_norm": 0.36114805936813354,
"learning_rate": 4.6742439010907645e-05,
"loss": 0.7271,
"step": 810
},
{
"epoch": 0.18431963814549568,
"grad_norm": 0.35414162278175354,
"learning_rate": 4.6697437686345883e-05,
"loss": 0.8134,
"step": 815
},
{
"epoch": 0.18545043347154166,
"grad_norm": 0.3441600799560547,
"learning_rate": 4.6652149627784324e-05,
"loss": 0.7259,
"step": 820
},
{
"epoch": 0.18658122879758765,
"grad_norm": 0.34488874673843384,
"learning_rate": 4.660657543370958e-05,
"loss": 0.7541,
"step": 825
},
{
"epoch": 0.1877120241236336,
"grad_norm": 0.3300029933452606,
"learning_rate": 4.65607157063896e-05,
"loss": 0.7123,
"step": 830
},
{
"epoch": 0.1888428194496796,
"grad_norm": 0.39021798968315125,
"learning_rate": 4.651457105186566e-05,
"loss": 0.7049,
"step": 835
},
{
"epoch": 0.18997361477572558,
"grad_norm": 0.3784525394439697,
"learning_rate": 4.646814207994441e-05,
"loss": 0.7892,
"step": 840
},
{
"epoch": 0.19110441010177157,
"grad_norm": 0.3650527000427246,
"learning_rate": 4.642142940418973e-05,
"loss": 0.7315,
"step": 845
},
{
"epoch": 0.19223520542781755,
"grad_norm": 0.36192572116851807,
"learning_rate": 4.637443364191474e-05,
"loss": 0.6201,
"step": 850
},
{
"epoch": 0.19336600075386354,
"grad_norm": 0.3428821265697479,
"learning_rate": 4.6327155414173554e-05,
"loss": 0.7248,
"step": 855
},
{
"epoch": 0.19449679607990952,
"grad_norm": 0.2692446708679199,
"learning_rate": 4.627959534575307e-05,
"loss": 0.6986,
"step": 860
},
{
"epoch": 0.1956275914059555,
"grad_norm": 0.33562323451042175,
"learning_rate": 4.623175406516479e-05,
"loss": 0.7553,
"step": 865
},
{
"epoch": 0.1967583867320015,
"grad_norm": 0.332381010055542,
"learning_rate": 4.618363220463644e-05,
"loss": 0.7021,
"step": 870
},
{
"epoch": 0.19788918205804748,
"grad_norm": 0.3331127166748047,
"learning_rate": 4.6135230400103636e-05,
"loss": 0.7278,
"step": 875
},
{
"epoch": 0.19901997738409347,
"grad_norm": 0.32819780707359314,
"learning_rate": 4.6086549291201485e-05,
"loss": 0.7189,
"step": 880
},
{
"epoch": 0.20015077271013945,
"grad_norm": 0.31646525859832764,
"learning_rate": 4.603758952125615e-05,
"loss": 0.6949,
"step": 885
},
{
"epoch": 0.20128156803618544,
"grad_norm": 0.3622991740703583,
"learning_rate": 4.5988351737276316e-05,
"loss": 0.7193,
"step": 890
},
{
"epoch": 0.20241236336223142,
"grad_norm": 0.3097212016582489,
"learning_rate": 4.593883658994466e-05,
"loss": 0.6913,
"step": 895
},
{
"epoch": 0.2035431586882774,
"grad_norm": 0.3757197856903076,
"learning_rate": 4.588904473360923e-05,
"loss": 0.6859,
"step": 900
},
{
"epoch": 0.2046739540143234,
"grad_norm": 0.3894336223602295,
"learning_rate": 4.5838976826274826e-05,
"loss": 0.7495,
"step": 905
},
{
"epoch": 0.20580474934036938,
"grad_norm": 0.2777577042579651,
"learning_rate": 4.578863352959429e-05,
"loss": 0.7305,
"step": 910
},
{
"epoch": 0.20693554466641537,
"grad_norm": 0.30092760920524597,
"learning_rate": 4.573801550885979e-05,
"loss": 0.6952,
"step": 915
},
{
"epoch": 0.20806633999246135,
"grad_norm": 0.31918197870254517,
"learning_rate": 4.568712343299394e-05,
"loss": 0.6309,
"step": 920
},
{
"epoch": 0.20919713531850734,
"grad_norm": 0.3190583884716034,
"learning_rate": 4.563595797454109e-05,
"loss": 0.6932,
"step": 925
},
{
"epoch": 0.21032793064455332,
"grad_norm": 0.4575042128562927,
"learning_rate": 4.558451980965832e-05,
"loss": 0.7446,
"step": 930
},
{
"epoch": 0.2114587259705993,
"grad_norm": 0.3298736810684204,
"learning_rate": 4.553280961810658e-05,
"loss": 0.7434,
"step": 935
},
{
"epoch": 0.2125895212966453,
"grad_norm": 0.2681873142719269,
"learning_rate": 4.548082808324169e-05,
"loss": 0.7609,
"step": 940
},
{
"epoch": 0.21372031662269128,
"grad_norm": 0.32544100284576416,
"learning_rate": 4.542857589200527e-05,
"loss": 0.7076,
"step": 945
},
{
"epoch": 0.21485111194873727,
"grad_norm": 0.3351302444934845,
"learning_rate": 4.537605373491573e-05,
"loss": 0.7442,
"step": 950
},
{
"epoch": 0.21598190727478325,
"grad_norm": 0.3408782482147217,
"learning_rate": 4.532326230605908e-05,
"loss": 0.6697,
"step": 955
},
{
"epoch": 0.21711270260082924,
"grad_norm": 0.31308743357658386,
"learning_rate": 4.52702023030798e-05,
"loss": 0.6795,
"step": 960
},
{
"epoch": 0.21824349792687522,
"grad_norm": 0.31887832283973694,
"learning_rate": 4.521687442717161e-05,
"loss": 0.6907,
"step": 965
},
{
"epoch": 0.2193742932529212,
"grad_norm": 0.28720954060554504,
"learning_rate": 4.516327938306818e-05,
"loss": 0.6951,
"step": 970
},
{
"epoch": 0.2205050885789672,
"grad_norm": 0.35572728514671326,
"learning_rate": 4.510941787903385e-05,
"loss": 0.6731,
"step": 975
},
{
"epoch": 0.22163588390501318,
"grad_norm": 0.32665789127349854,
"learning_rate": 4.505529062685426e-05,
"loss": 0.6859,
"step": 980
},
{
"epoch": 0.22276667923105917,
"grad_norm": 0.425155907869339,
"learning_rate": 4.5000898341826935e-05,
"loss": 0.7611,
"step": 985
},
{
"epoch": 0.22389747455710515,
"grad_norm": 0.3223753273487091,
"learning_rate": 4.494624174275185e-05,
"loss": 0.6784,
"step": 990
},
{
"epoch": 0.22502826988315114,
"grad_norm": 0.29629823565483093,
"learning_rate": 4.48913215519219e-05,
"loss": 0.7528,
"step": 995
},
{
"epoch": 0.22615906520919712,
"grad_norm": 0.45501330494880676,
"learning_rate": 4.483613849511337e-05,
"loss": 0.7412,
"step": 1000
},
{
"epoch": 0.2272898605352431,
"grad_norm": 0.47708141803741455,
"learning_rate": 4.478069330157638e-05,
"loss": 0.7186,
"step": 1005
},
{
"epoch": 0.2284206558612891,
"grad_norm": 0.46172332763671875,
"learning_rate": 4.472498670402519e-05,
"loss": 0.7429,
"step": 1010
},
{
"epoch": 0.22955145118733508,
"grad_norm": 0.2885262966156006,
"learning_rate": 4.4669019438628545e-05,
"loss": 0.6749,
"step": 1015
},
{
"epoch": 0.23068224651338107,
"grad_norm": 0.3848798871040344,
"learning_rate": 4.461279224499995e-05,
"loss": 0.6889,
"step": 1020
},
{
"epoch": 0.23181304183942705,
"grad_norm": 0.3475760519504547,
"learning_rate": 4.455630586618788e-05,
"loss": 0.7423,
"step": 1025
},
{
"epoch": 0.23294383716547304,
"grad_norm": 0.3690018653869629,
"learning_rate": 4.449956104866597e-05,
"loss": 0.6995,
"step": 1030
},
{
"epoch": 0.23407463249151902,
"grad_norm": 0.4979022741317749,
"learning_rate": 4.444255854232318e-05,
"loss": 0.7137,
"step": 1035
},
{
"epoch": 0.235205427817565,
"grad_norm": 0.3002910017967224,
"learning_rate": 4.438529910045381e-05,
"loss": 0.6342,
"step": 1040
},
{
"epoch": 0.236336223143611,
"grad_norm": 0.2860986292362213,
"learning_rate": 4.432778347974764e-05,
"loss": 0.6486,
"step": 1045
},
{
"epoch": 0.23746701846965698,
"grad_norm": 0.3187776207923889,
"learning_rate": 4.427001244027984e-05,
"loss": 0.6935,
"step": 1050
},
{
"epoch": 0.23859781379570297,
"grad_norm": 0.436594694852829,
"learning_rate": 4.4211986745500976e-05,
"loss": 0.7125,
"step": 1055
},
{
"epoch": 0.23972860912174895,
"grad_norm": 0.25989067554473877,
"learning_rate": 4.415370716222693e-05,
"loss": 0.6699,
"step": 1060
},
{
"epoch": 0.24085940444779494,
"grad_norm": 0.30455416440963745,
"learning_rate": 4.4095174460628734e-05,
"loss": 0.7244,
"step": 1065
},
{
"epoch": 0.24199019977384092,
"grad_norm": 0.2574412226676941,
"learning_rate": 4.40363894142224e-05,
"loss": 0.6719,
"step": 1070
},
{
"epoch": 0.2431209950998869,
"grad_norm": 0.2614154815673828,
"learning_rate": 4.397735279985873e-05,
"loss": 0.7,
"step": 1075
},
{
"epoch": 0.2442517904259329,
"grad_norm": 0.32729870080947876,
"learning_rate": 4.3918065397712983e-05,
"loss": 0.6669,
"step": 1080
},
{
"epoch": 0.24538258575197888,
"grad_norm": 0.5149984359741211,
"learning_rate": 4.385852799127464e-05,
"loss": 0.7371,
"step": 1085
},
{
"epoch": 0.24651338107802487,
"grad_norm": 0.322007417678833,
"learning_rate": 4.379874136733702e-05,
"loss": 0.7595,
"step": 1090
},
{
"epoch": 0.24764417640407085,
"grad_norm": 0.38709428906440735,
"learning_rate": 4.373870631598683e-05,
"loss": 0.7662,
"step": 1095
},
{
"epoch": 0.24877497173011684,
"grad_norm": 0.3887243866920471,
"learning_rate": 4.367842363059383e-05,
"loss": 0.6608,
"step": 1100
},
{
"epoch": 0.24990576705616283,
"grad_norm": 0.343573659658432,
"learning_rate": 4.3617894107800275e-05,
"loss": 0.7364,
"step": 1105
},
{
"epoch": 0.25103656238220884,
"grad_norm": 0.3381284773349762,
"learning_rate": 4.355711854751037e-05,
"loss": 0.6939,
"step": 1110
},
{
"epoch": 0.2521673577082548,
"grad_norm": 0.428345650434494,
"learning_rate": 4.3496097752879764e-05,
"loss": 0.7322,
"step": 1115
},
{
"epoch": 0.2532981530343008,
"grad_norm": 0.3029363453388214,
"learning_rate": 4.3434832530304906e-05,
"loss": 0.6434,
"step": 1120
},
{
"epoch": 0.2544289483603468,
"grad_norm": 0.32285043597221375,
"learning_rate": 4.337332368941237e-05,
"loss": 0.686,
"step": 1125
},
{
"epoch": 0.2555597436863928,
"grad_norm": 0.2844852805137634,
"learning_rate": 4.331157204304819e-05,
"loss": 0.6786,
"step": 1130
},
{
"epoch": 0.25669053901243877,
"grad_norm": 0.38639211654663086,
"learning_rate": 4.324957840726708e-05,
"loss": 0.669,
"step": 1135
},
{
"epoch": 0.25782133433848475,
"grad_norm": 0.29250484704971313,
"learning_rate": 4.3187343601321696e-05,
"loss": 0.684,
"step": 1140
},
{
"epoch": 0.25895212966453074,
"grad_norm": 0.3040000796318054,
"learning_rate": 4.312486844765175e-05,
"loss": 0.6721,
"step": 1145
},
{
"epoch": 0.2600829249905767,
"grad_norm": 0.3095468580722809,
"learning_rate": 4.3062153771873214e-05,
"loss": 0.8026,
"step": 1150
},
{
"epoch": 0.2612137203166227,
"grad_norm": 0.3532247543334961,
"learning_rate": 4.299920040276735e-05,
"loss": 0.7338,
"step": 1155
},
{
"epoch": 0.2623445156426687,
"grad_norm": 0.3691394627094269,
"learning_rate": 4.2936009172269766e-05,
"loss": 0.6489,
"step": 1160
},
{
"epoch": 0.2634753109687147,
"grad_norm": 0.3503078520298004,
"learning_rate": 4.287258091545946e-05,
"loss": 0.6705,
"step": 1165
},
{
"epoch": 0.26460610629476067,
"grad_norm": 0.31756189465522766,
"learning_rate": 4.280891647054775e-05,
"loss": 0.6642,
"step": 1170
},
{
"epoch": 0.26573690162080665,
"grad_norm": 0.27942630648612976,
"learning_rate": 4.274501667886718e-05,
"loss": 0.7139,
"step": 1175
},
{
"epoch": 0.26686769694685264,
"grad_norm": 0.35604235529899597,
"learning_rate": 4.268088238486048e-05,
"loss": 0.8335,
"step": 1180
},
{
"epoch": 0.2679984922728986,
"grad_norm": 0.3140622675418854,
"learning_rate": 4.261651443606931e-05,
"loss": 0.8127,
"step": 1185
},
{
"epoch": 0.2691292875989446,
"grad_norm": 0.327470988035202,
"learning_rate": 4.255191368312311e-05,
"loss": 0.7311,
"step": 1190
},
{
"epoch": 0.2702600829249906,
"grad_norm": 0.3089313805103302,
"learning_rate": 4.2487080979727876e-05,
"loss": 0.733,
"step": 1195
},
{
"epoch": 0.2713908782510366,
"grad_norm": 0.3237866163253784,
"learning_rate": 4.242201718265483e-05,
"loss": 0.6754,
"step": 1200
},
{
"epoch": 0.27252167357708257,
"grad_norm": 0.3597028851509094,
"learning_rate": 4.235672315172912e-05,
"loss": 0.741,
"step": 1205
},
{
"epoch": 0.27365246890312855,
"grad_norm": 0.30509960651397705,
"learning_rate": 4.229119974981848e-05,
"loss": 0.7098,
"step": 1210
},
{
"epoch": 0.27478326422917454,
"grad_norm": 0.37183189392089844,
"learning_rate": 4.222544784282178e-05,
"loss": 0.7037,
"step": 1215
},
{
"epoch": 0.2759140595552205,
"grad_norm": 0.35368862748146057,
"learning_rate": 4.2159468299657645e-05,
"loss": 0.654,
"step": 1220
},
{
"epoch": 0.2770448548812665,
"grad_norm": 0.3120376765727997,
"learning_rate": 4.209326199225291e-05,
"loss": 0.6845,
"step": 1225
},
{
"epoch": 0.2781756502073125,
"grad_norm": 0.3322497308254242,
"learning_rate": 4.202682979553112e-05,
"loss": 0.738,
"step": 1230
},
{
"epoch": 0.2793064455333585,
"grad_norm": 0.39859551191329956,
"learning_rate": 4.1960172587401007e-05,
"loss": 0.7208,
"step": 1235
},
{
"epoch": 0.28043724085940447,
"grad_norm": 0.304196298122406,
"learning_rate": 4.1893291248744794e-05,
"loss": 0.6701,
"step": 1240
},
{
"epoch": 0.28156803618545045,
"grad_norm": 0.30052655935287476,
"learning_rate": 4.1826186663406685e-05,
"loss": 0.7255,
"step": 1245
},
{
"epoch": 0.28269883151149644,
"grad_norm": 0.3247777223587036,
"learning_rate": 4.1758859718181054e-05,
"loss": 0.7067,
"step": 1250
},
{
"epoch": 0.2838296268375424,
"grad_norm": 0.39652687311172485,
"learning_rate": 4.169131130280081e-05,
"loss": 0.8056,
"step": 1255
},
{
"epoch": 0.2849604221635884,
"grad_norm": 0.299211710691452,
"learning_rate": 4.162354230992562e-05,
"loss": 0.7158,
"step": 1260
},
{
"epoch": 0.2860912174896344,
"grad_norm": 0.34312811493873596,
"learning_rate": 4.155555363513009e-05,
"loss": 0.6555,
"step": 1265
},
{
"epoch": 0.2872220128156804,
"grad_norm": 0.34061411023139954,
"learning_rate": 4.148734617689196e-05,
"loss": 0.6973,
"step": 1270
},
{
"epoch": 0.28835280814172637,
"grad_norm": 0.32622766494750977,
"learning_rate": 4.1418920836580214e-05,
"loss": 0.7034,
"step": 1275
},
{
"epoch": 0.28948360346777235,
"grad_norm": 0.31413719058036804,
"learning_rate": 4.135027851844316e-05,
"loss": 0.6874,
"step": 1280
},
{
"epoch": 0.29061439879381834,
"grad_norm": 0.3852449357509613,
"learning_rate": 4.1281420129596504e-05,
"loss": 0.6937,
"step": 1285
},
{
"epoch": 0.2917451941198643,
"grad_norm": 0.25905337929725647,
"learning_rate": 4.121234658001135e-05,
"loss": 0.7273,
"step": 1290
},
{
"epoch": 0.2928759894459103,
"grad_norm": 0.33746325969696045,
"learning_rate": 4.114305878250218e-05,
"loss": 0.6815,
"step": 1295
},
{
"epoch": 0.2940067847719563,
"grad_norm": 0.36523139476776123,
"learning_rate": 4.1073557652714755e-05,
"loss": 0.6763,
"step": 1300
},
{
"epoch": 0.2951375800980023,
"grad_norm": 0.4286907911300659,
"learning_rate": 4.100384410911409e-05,
"loss": 0.7807,
"step": 1305
},
{
"epoch": 0.29626837542404827,
"grad_norm": 0.27938035130500793,
"learning_rate": 4.0933919072972224e-05,
"loss": 0.6515,
"step": 1310
},
{
"epoch": 0.29739917075009425,
"grad_norm": 0.28958678245544434,
"learning_rate": 4.086378346835614e-05,
"loss": 0.6303,
"step": 1315
},
{
"epoch": 0.29852996607614024,
"grad_norm": 0.31973332166671753,
"learning_rate": 4.0793438222115477e-05,
"loss": 0.733,
"step": 1320
},
{
"epoch": 0.2996607614021862,
"grad_norm": 0.302673876285553,
"learning_rate": 4.072288426387032e-05,
"loss": 0.6551,
"step": 1325
},
{
"epoch": 0.3007915567282322,
"grad_norm": 0.3454115092754364,
"learning_rate": 4.065212252599889e-05,
"loss": 0.6847,
"step": 1330
},
{
"epoch": 0.3019223520542782,
"grad_norm": 0.32197806239128113,
"learning_rate": 4.0581153943625266e-05,
"loss": 0.7283,
"step": 1335
},
{
"epoch": 0.3030531473803242,
"grad_norm": 0.2939291000366211,
"learning_rate": 4.050997945460699e-05,
"loss": 0.6519,
"step": 1340
},
{
"epoch": 0.30418394270637017,
"grad_norm": 0.34127116203308105,
"learning_rate": 4.043859999952266e-05,
"loss": 0.7041,
"step": 1345
},
{
"epoch": 0.30531473803241616,
"grad_norm": 0.3606717586517334,
"learning_rate": 4.0367016521659564e-05,
"loss": 0.6745,
"step": 1350
},
{
"epoch": 0.30644553335846214,
"grad_norm": 0.3977923095226288,
"learning_rate": 4.029522996700112e-05,
"loss": 0.6635,
"step": 1355
},
{
"epoch": 0.3075763286845081,
"grad_norm": 0.27561894059181213,
"learning_rate": 4.0223241284214496e-05,
"loss": 0.6661,
"step": 1360
},
{
"epoch": 0.3087071240105541,
"grad_norm": 0.31549111008644104,
"learning_rate": 4.015105142463794e-05,
"loss": 0.6659,
"step": 1365
},
{
"epoch": 0.3098379193366001,
"grad_norm": 0.32156458497047424,
"learning_rate": 4.0078661342268314e-05,
"loss": 0.6656,
"step": 1370
},
{
"epoch": 0.3109687146626461,
"grad_norm": 0.33597517013549805,
"learning_rate": 4.000607199374843e-05,
"loss": 0.6291,
"step": 1375
},
{
"epoch": 0.31209950998869207,
"grad_norm": 0.2836547791957855,
"learning_rate": 3.9933284338354415e-05,
"loss": 0.6936,
"step": 1380
},
{
"epoch": 0.31323030531473806,
"grad_norm": 0.3355998396873474,
"learning_rate": 3.986029933798308e-05,
"loss": 0.6578,
"step": 1385
},
{
"epoch": 0.31436110064078404,
"grad_norm": 0.3303869962692261,
"learning_rate": 3.9787117957139116e-05,
"loss": 0.6859,
"step": 1390
},
{
"epoch": 0.31549189596683,
"grad_norm": 0.3788108825683594,
"learning_rate": 3.9713741162922455e-05,
"loss": 0.6997,
"step": 1395
},
{
"epoch": 0.316622691292876,
"grad_norm": 0.33582428097724915,
"learning_rate": 3.964016992501541e-05,
"loss": 0.689,
"step": 1400
},
{
"epoch": 0.317753486618922,
"grad_norm": 0.35693231225013733,
"learning_rate": 3.956640521566989e-05,
"loss": 0.676,
"step": 1405
},
{
"epoch": 0.318884281944968,
"grad_norm": 0.3589436709880829,
"learning_rate": 3.949244800969456e-05,
"loss": 0.7545,
"step": 1410
},
{
"epoch": 0.32001507727101397,
"grad_norm": 0.3047327399253845,
"learning_rate": 3.941829928444194e-05,
"loss": 0.6391,
"step": 1415
},
{
"epoch": 0.32114587259705996,
"grad_norm": 0.292953759431839,
"learning_rate": 3.9343960019795525e-05,
"loss": 0.6886,
"step": 1420
},
{
"epoch": 0.32227666792310594,
"grad_norm": 0.3644665777683258,
"learning_rate": 3.926943119815675e-05,
"loss": 0.7283,
"step": 1425
},
{
"epoch": 0.3234074632491519,
"grad_norm": 0.3624630570411682,
"learning_rate": 3.919471380443212e-05,
"loss": 0.6566,
"step": 1430
},
{
"epoch": 0.3245382585751979,
"grad_norm": 0.48623165488243103,
"learning_rate": 3.911980882602011e-05,
"loss": 0.8311,
"step": 1435
},
{
"epoch": 0.3256690539012439,
"grad_norm": 0.3244991600513458,
"learning_rate": 3.904471725279818e-05,
"loss": 0.7087,
"step": 1440
},
{
"epoch": 0.3267998492272899,
"grad_norm": 0.3399847149848938,
"learning_rate": 3.8969440077109634e-05,
"loss": 0.6146,
"step": 1445
},
{
"epoch": 0.32793064455333587,
"grad_norm": 0.3181338310241699,
"learning_rate": 3.889397829375052e-05,
"loss": 0.7608,
"step": 1450
},
{
"epoch": 0.32906143987938186,
"grad_norm": 0.5128947496414185,
"learning_rate": 3.881833289995654e-05,
"loss": 0.7225,
"step": 1455
},
{
"epoch": 0.33019223520542784,
"grad_norm": 0.3176124095916748,
"learning_rate": 3.874250489538981e-05,
"loss": 0.7225,
"step": 1460
},
{
"epoch": 0.33132303053147383,
"grad_norm": 0.3748844563961029,
"learning_rate": 3.866649528212563e-05,
"loss": 0.7188,
"step": 1465
},
{
"epoch": 0.3324538258575198,
"grad_norm": 0.974604606628418,
"learning_rate": 3.859030506463932e-05,
"loss": 0.7509,
"step": 1470
},
{
"epoch": 0.3335846211835658,
"grad_norm": 0.3221200704574585,
"learning_rate": 3.851393524979291e-05,
"loss": 0.6781,
"step": 1475
},
{
"epoch": 0.3347154165096118,
"grad_norm": 0.33971571922302246,
"learning_rate": 3.84373868468218e-05,
"loss": 0.6711,
"step": 1480
},
{
"epoch": 0.33584621183565777,
"grad_norm": 0.3183509409427643,
"learning_rate": 3.836066086732145e-05,
"loss": 0.6808,
"step": 1485
},
{
"epoch": 0.33697700716170376,
"grad_norm": 0.2814907729625702,
"learning_rate": 3.828375832523407e-05,
"loss": 0.7171,
"step": 1490
},
{
"epoch": 0.33810780248774974,
"grad_norm": 0.2738807797431946,
"learning_rate": 3.820668023683507e-05,
"loss": 0.7934,
"step": 1495
},
{
"epoch": 0.33923859781379573,
"grad_norm": 0.3376060128211975,
"learning_rate": 3.812942762071981e-05,
"loss": 0.6045,
"step": 1500
},
{
"epoch": 0.3403693931398417,
"grad_norm": 0.3851218819618225,
"learning_rate": 3.8052001497790005e-05,
"loss": 0.7214,
"step": 1505
},
{
"epoch": 0.3415001884658877,
"grad_norm": 0.2853710949420929,
"learning_rate": 3.7974402891240294e-05,
"loss": 0.7312,
"step": 1510
},
{
"epoch": 0.3426309837919337,
"grad_norm": 0.34209561347961426,
"learning_rate": 3.78966328265447e-05,
"loss": 0.66,
"step": 1515
},
{
"epoch": 0.34376177911797967,
"grad_norm": 0.2967279851436615,
"learning_rate": 3.7818692331443093e-05,
"loss": 0.7354,
"step": 1520
},
{
"epoch": 0.34489257444402566,
"grad_norm": 0.31301623582839966,
"learning_rate": 3.7740582435927614e-05,
"loss": 0.6634,
"step": 1525
},
{
"epoch": 0.34602336977007164,
"grad_norm": 0.287758469581604,
"learning_rate": 3.766230417222901e-05,
"loss": 0.7688,
"step": 1530
},
{
"epoch": 0.34715416509611763,
"grad_norm": 0.34585824608802795,
"learning_rate": 3.7583858574803046e-05,
"loss": 0.6542,
"step": 1535
},
{
"epoch": 0.3482849604221636,
"grad_norm": 0.32640525698661804,
"learning_rate": 3.7505246680316853e-05,
"loss": 0.71,
"step": 1540
},
{
"epoch": 0.3494157557482096,
"grad_norm": 0.2845459580421448,
"learning_rate": 3.742646952763515e-05,
"loss": 0.6233,
"step": 1545
},
{
"epoch": 0.3505465510742556,
"grad_norm": 0.30241382122039795,
"learning_rate": 3.7347528157806586e-05,
"loss": 0.6739,
"step": 1550
},
{
"epoch": 0.35167734640030157,
"grad_norm": 0.35119229555130005,
"learning_rate": 3.726842361404996e-05,
"loss": 0.72,
"step": 1555
},
{
"epoch": 0.35280814172634756,
"grad_norm": 0.3631749153137207,
"learning_rate": 3.718915694174042e-05,
"loss": 0.6596,
"step": 1560
},
{
"epoch": 0.35393893705239354,
"grad_norm": 0.258357971906662,
"learning_rate": 3.7109729188395666e-05,
"loss": 0.7037,
"step": 1565
},
{
"epoch": 0.35506973237843953,
"grad_norm": 0.2907659113407135,
"learning_rate": 3.703014140366209e-05,
"loss": 0.6494,
"step": 1570
},
{
"epoch": 0.3562005277044855,
"grad_norm": 0.309076189994812,
"learning_rate": 3.695039463930093e-05,
"loss": 0.6668,
"step": 1575
},
{
"epoch": 0.3573313230305315,
"grad_norm": 0.33287695050239563,
"learning_rate": 3.687048994917437e-05,
"loss": 0.7215,
"step": 1580
},
{
"epoch": 0.3584621183565775,
"grad_norm": 0.2877466082572937,
"learning_rate": 3.679042838923157e-05,
"loss": 0.6261,
"step": 1585
},
{
"epoch": 0.35959291368262347,
"grad_norm": 0.26237618923187256,
"learning_rate": 3.671021101749476e-05,
"loss": 0.6966,
"step": 1590
},
{
"epoch": 0.36072370900866946,
"grad_norm": 0.34308937191963196,
"learning_rate": 3.6629838894045224e-05,
"loss": 0.662,
"step": 1595
},
{
"epoch": 0.36185450433471544,
"grad_norm": 0.337215393781662,
"learning_rate": 3.654931308100934e-05,
"loss": 0.7402,
"step": 1600
},
{
"epoch": 0.36298529966076143,
"grad_norm": 0.4486747980117798,
"learning_rate": 3.646863464254447e-05,
"loss": 0.7111,
"step": 1605
},
{
"epoch": 0.3641160949868074,
"grad_norm": 0.37535396218299866,
"learning_rate": 3.638780464482497e-05,
"loss": 0.7322,
"step": 1610
},
{
"epoch": 0.3652468903128534,
"grad_norm": 0.4385060966014862,
"learning_rate": 3.630682415602804e-05,
"loss": 0.6517,
"step": 1615
},
{
"epoch": 0.3663776856388994,
"grad_norm": 0.29366278648376465,
"learning_rate": 3.6225694246319666e-05,
"loss": 0.636,
"step": 1620
},
{
"epoch": 0.36750848096494537,
"grad_norm": 0.3330417573451996,
"learning_rate": 3.614441598784042e-05,
"loss": 0.727,
"step": 1625
},
{
"epoch": 0.36863927629099136,
"grad_norm": 0.3851955831050873,
"learning_rate": 3.6062990454691334e-05,
"loss": 0.7019,
"step": 1630
},
{
"epoch": 0.36977007161703734,
"grad_norm": 0.4180035889148712,
"learning_rate": 3.598141872291969e-05,
"loss": 0.7318,
"step": 1635
},
{
"epoch": 0.37090086694308333,
"grad_norm": 0.28281131386756897,
"learning_rate": 3.589970187050481e-05,
"loss": 0.7143,
"step": 1640
},
{
"epoch": 0.3720316622691293,
"grad_norm": 0.35991495847702026,
"learning_rate": 3.581784097734376e-05,
"loss": 0.7144,
"step": 1645
},
{
"epoch": 0.3731624575951753,
"grad_norm": 0.3908022940158844,
"learning_rate": 3.5735837125237174e-05,
"loss": 0.6779,
"step": 1650
},
{
"epoch": 0.3742932529212213,
"grad_norm": 0.3579081594944,
"learning_rate": 3.565369139787488e-05,
"loss": 0.6774,
"step": 1655
},
{
"epoch": 0.3754240482472672,
"grad_norm": 0.37918293476104736,
"learning_rate": 3.5571404880821594e-05,
"loss": 0.7551,
"step": 1660
},
{
"epoch": 0.3765548435733132,
"grad_norm": 0.372585654258728,
"learning_rate": 3.548897866150259e-05,
"loss": 0.7081,
"step": 1665
},
{
"epoch": 0.3776856388993592,
"grad_norm": 0.38565728068351746,
"learning_rate": 3.540641382918934e-05,
"loss": 0.6547,
"step": 1670
},
{
"epoch": 0.3788164342254052,
"grad_norm": 0.3910474479198456,
"learning_rate": 3.532371147498507e-05,
"loss": 0.6847,
"step": 1675
},
{
"epoch": 0.37994722955145116,
"grad_norm": 0.3123336732387543,
"learning_rate": 3.524087269181039e-05,
"loss": 0.6692,
"step": 1680
},
{
"epoch": 0.38107802487749715,
"grad_norm": 0.3222855031490326,
"learning_rate": 3.515789857438885e-05,
"loss": 0.7101,
"step": 1685
},
{
"epoch": 0.38220882020354313,
"grad_norm": 0.3308558762073517,
"learning_rate": 3.507479021923241e-05,
"loss": 0.7193,
"step": 1690
},
{
"epoch": 0.3833396155295891,
"grad_norm": 0.36425960063934326,
"learning_rate": 3.4991548724627054e-05,
"loss": 0.6698,
"step": 1695
},
{
"epoch": 0.3844704108556351,
"grad_norm": 0.3454649746417999,
"learning_rate": 3.490817519061819e-05,
"loss": 0.6996,
"step": 1700
},
{
"epoch": 0.3856012061816811,
"grad_norm": 0.39363983273506165,
"learning_rate": 3.4824670718996114e-05,
"loss": 0.7256,
"step": 1705
},
{
"epoch": 0.3867320015077271,
"grad_norm": 0.29884523153305054,
"learning_rate": 3.4741036413281534e-05,
"loss": 0.706,
"step": 1710
},
{
"epoch": 0.38786279683377306,
"grad_norm": 0.6705525517463684,
"learning_rate": 3.4657273378710874e-05,
"loss": 0.7508,
"step": 1715
},
{
"epoch": 0.38899359215981905,
"grad_norm": 0.31176072359085083,
"learning_rate": 3.4573382722221776e-05,
"loss": 0.6792,
"step": 1720
},
{
"epoch": 0.39012438748586503,
"grad_norm": 0.37332355976104736,
"learning_rate": 3.448936555243837e-05,
"loss": 0.6805,
"step": 1725
},
{
"epoch": 0.391255182811911,
"grad_norm": 0.4867086112499237,
"learning_rate": 3.440522297965671e-05,
"loss": 0.6306,
"step": 1730
},
{
"epoch": 0.392385978137957,
"grad_norm": 0.32693204283714294,
"learning_rate": 3.4320956115830046e-05,
"loss": 0.719,
"step": 1735
},
{
"epoch": 0.393516773464003,
"grad_norm": 0.2943226993083954,
"learning_rate": 3.4236566074554157e-05,
"loss": 0.7405,
"step": 1740
},
{
"epoch": 0.394647568790049,
"grad_norm": 0.3139977753162384,
"learning_rate": 3.415205397105261e-05,
"loss": 0.7152,
"step": 1745
},
{
"epoch": 0.39577836411609496,
"grad_norm": 0.33439525961875916,
"learning_rate": 3.406742092216206e-05,
"loss": 0.7017,
"step": 1750
},
{
"epoch": 0.39690915944214095,
"grad_norm": 0.3081996440887451,
"learning_rate": 3.398266804631744e-05,
"loss": 0.6647,
"step": 1755
},
{
"epoch": 0.39803995476818693,
"grad_norm": 0.3134262263774872,
"learning_rate": 3.389779646353724e-05,
"loss": 0.7313,
"step": 1760
},
{
"epoch": 0.3991707500942329,
"grad_norm": 0.3375689685344696,
"learning_rate": 3.381280729540866e-05,
"loss": 0.6829,
"step": 1765
},
{
"epoch": 0.4003015454202789,
"grad_norm": 0.38416242599487305,
"learning_rate": 3.37277016650728e-05,
"loss": 0.7534,
"step": 1770
},
{
"epoch": 0.4014323407463249,
"grad_norm": 0.3711940050125122,
"learning_rate": 3.364248069720982e-05,
"loss": 0.6618,
"step": 1775
},
{
"epoch": 0.4025631360723709,
"grad_norm": 0.338777631521225,
"learning_rate": 3.3557145518024094e-05,
"loss": 0.6692,
"step": 1780
},
{
"epoch": 0.40369393139841686,
"grad_norm": 0.2786078155040741,
"learning_rate": 3.3471697255229294e-05,
"loss": 0.7504,
"step": 1785
},
{
"epoch": 0.40482472672446285,
"grad_norm": 0.33004823327064514,
"learning_rate": 3.338613703803351e-05,
"loss": 0.7056,
"step": 1790
},
{
"epoch": 0.40595552205050883,
"grad_norm": 0.3257131278514862,
"learning_rate": 3.330046599712432e-05,
"loss": 0.7102,
"step": 1795
},
{
"epoch": 0.4070863173765548,
"grad_norm": 0.3138837516307831,
"learning_rate": 3.321468526465386e-05,
"loss": 0.6638,
"step": 1800
},
{
"epoch": 0.4082171127026008,
"grad_norm": 0.3327350914478302,
"learning_rate": 3.312879597422383e-05,
"loss": 0.7355,
"step": 1805
},
{
"epoch": 0.4093479080286468,
"grad_norm": 0.2875402569770813,
"learning_rate": 3.304279926087055e-05,
"loss": 0.7113,
"step": 1810
},
{
"epoch": 0.4104787033546928,
"grad_norm": 0.5153040289878845,
"learning_rate": 3.295669626104995e-05,
"loss": 0.7401,
"step": 1815
},
{
"epoch": 0.41160949868073876,
"grad_norm": 0.3518928587436676,
"learning_rate": 3.287048811262254e-05,
"loss": 0.6864,
"step": 1820
},
{
"epoch": 0.41274029400678475,
"grad_norm": 0.3488028049468994,
"learning_rate": 3.2784175954838376e-05,
"loss": 0.6401,
"step": 1825
},
{
"epoch": 0.41387108933283073,
"grad_norm": 0.37360239028930664,
"learning_rate": 3.2697760928322016e-05,
"loss": 0.7004,
"step": 1830
},
{
"epoch": 0.4150018846588767,
"grad_norm": 0.3383936285972595,
"learning_rate": 3.261124417505745e-05,
"loss": 0.6563,
"step": 1835
},
{
"epoch": 0.4161326799849227,
"grad_norm": 0.36131277680397034,
"learning_rate": 3.252462683837297e-05,
"loss": 0.6737,
"step": 1840
},
{
"epoch": 0.4172634753109687,
"grad_norm": 0.3024144768714905,
"learning_rate": 3.2437910062926116e-05,
"loss": 0.6466,
"step": 1845
},
{
"epoch": 0.4183942706370147,
"grad_norm": 0.6971142888069153,
"learning_rate": 3.235109499468849e-05,
"loss": 0.6927,
"step": 1850
},
{
"epoch": 0.41952506596306066,
"grad_norm": 0.3525508642196655,
"learning_rate": 3.226418278093069e-05,
"loss": 0.7009,
"step": 1855
},
{
"epoch": 0.42065586128910665,
"grad_norm": 0.3152811527252197,
"learning_rate": 3.2177174570207066e-05,
"loss": 0.7065,
"step": 1860
},
{
"epoch": 0.42178665661515263,
"grad_norm": 0.2631702721118927,
"learning_rate": 3.2090071512340584e-05,
"loss": 0.6723,
"step": 1865
},
{
"epoch": 0.4229174519411986,
"grad_norm": 0.35791584849357605,
"learning_rate": 3.200287475840764e-05,
"loss": 0.6927,
"step": 1870
},
{
"epoch": 0.4240482472672446,
"grad_norm": 0.30266880989074707,
"learning_rate": 3.191558546072283e-05,
"loss": 0.6395,
"step": 1875
},
{
"epoch": 0.4251790425932906,
"grad_norm": 0.27712151408195496,
"learning_rate": 3.1828204772823705e-05,
"loss": 0.6246,
"step": 1880
},
{
"epoch": 0.4263098379193366,
"grad_norm": 0.4084063172340393,
"learning_rate": 3.174073384945556e-05,
"loss": 0.6993,
"step": 1885
},
{
"epoch": 0.42744063324538256,
"grad_norm": 0.3760344088077545,
"learning_rate": 3.1653173846556186e-05,
"loss": 0.6413,
"step": 1890
},
{
"epoch": 0.42857142857142855,
"grad_norm": 0.41881611943244934,
"learning_rate": 3.156552592124054e-05,
"loss": 0.7295,
"step": 1895
},
{
"epoch": 0.42970222389747453,
"grad_norm": 0.3386279046535492,
"learning_rate": 3.147779123178548e-05,
"loss": 0.7482,
"step": 1900
},
{
"epoch": 0.4308330192235205,
"grad_norm": 0.4601892828941345,
"learning_rate": 3.138997093761449e-05,
"loss": 0.7499,
"step": 1905
},
{
"epoch": 0.4319638145495665,
"grad_norm": 0.4254579246044159,
"learning_rate": 3.1302066199282295e-05,
"loss": 0.7148,
"step": 1910
},
{
"epoch": 0.4330946098756125,
"grad_norm": 0.3381584584712982,
"learning_rate": 3.121407817845959e-05,
"loss": 0.6117,
"step": 1915
},
{
"epoch": 0.4342254052016585,
"grad_norm": 0.3117331266403198,
"learning_rate": 3.112600803791764e-05,
"loss": 0.6246,
"step": 1920
},
{
"epoch": 0.43535620052770446,
"grad_norm": 0.4453639090061188,
"learning_rate": 3.103785694151293e-05,
"loss": 0.754,
"step": 1925
},
{
"epoch": 0.43648699585375045,
"grad_norm": 0.4143831729888916,
"learning_rate": 3.094962605417179e-05,
"loss": 0.7966,
"step": 1930
},
{
"epoch": 0.43761779117979643,
"grad_norm": 0.2990778684616089,
"learning_rate": 3.086131654187501e-05,
"loss": 0.6519,
"step": 1935
},
{
"epoch": 0.4387485865058424,
"grad_norm": 0.3955526649951935,
"learning_rate": 3.077292957164238e-05,
"loss": 0.7048,
"step": 1940
},
{
"epoch": 0.4398793818318884,
"grad_norm": 0.3522753119468689,
"learning_rate": 3.068446631151736e-05,
"loss": 0.7202,
"step": 1945
},
{
"epoch": 0.4410101771579344,
"grad_norm": 0.3563268482685089,
"learning_rate": 3.0595927930551524e-05,
"loss": 0.7145,
"step": 1950
},
{
"epoch": 0.4421409724839804,
"grad_norm": 0.38255730271339417,
"learning_rate": 3.0507315598789237e-05,
"loss": 0.7158,
"step": 1955
},
{
"epoch": 0.44327176781002636,
"grad_norm": 0.3502512276172638,
"learning_rate": 3.0418630487252087e-05,
"loss": 0.644,
"step": 1960
},
{
"epoch": 0.44440256313607235,
"grad_norm": 0.36824584007263184,
"learning_rate": 3.0329873767923477e-05,
"loss": 0.7561,
"step": 1965
},
{
"epoch": 0.44553335846211833,
"grad_norm": 0.32158178091049194,
"learning_rate": 3.0241046613733114e-05,
"loss": 0.6694,
"step": 1970
},
{
"epoch": 0.4466641537881643,
"grad_norm": 0.28382861614227295,
"learning_rate": 3.01521501985415e-05,
"loss": 0.6803,
"step": 1975
},
{
"epoch": 0.4477949491142103,
"grad_norm": 0.3525499999523163,
"learning_rate": 3.0063185697124446e-05,
"loss": 0.7263,
"step": 1980
},
{
"epoch": 0.4489257444402563,
"grad_norm": 0.2863157093524933,
"learning_rate": 2.9974154285157497e-05,
"loss": 0.7232,
"step": 1985
},
{
"epoch": 0.4500565397663023,
"grad_norm": 0.3138844668865204,
"learning_rate": 2.9885057139200468e-05,
"loss": 0.6912,
"step": 1990
},
{
"epoch": 0.45118733509234826,
"grad_norm": 0.33406513929367065,
"learning_rate": 2.979589543668182e-05,
"loss": 0.684,
"step": 1995
},
{
"epoch": 0.45231813041839425,
"grad_norm": 0.3506259620189667,
"learning_rate": 2.970667035588317e-05,
"loss": 0.7522,
"step": 2000
},
{
"epoch": 0.45344892574444023,
"grad_norm": 0.37139952182769775,
"learning_rate": 2.9617383075923665e-05,
"loss": 0.6471,
"step": 2005
},
{
"epoch": 0.4545797210704862,
"grad_norm": 0.295625239610672,
"learning_rate": 2.952803477674441e-05,
"loss": 0.7209,
"step": 2010
},
{
"epoch": 0.4557105163965322,
"grad_norm": 0.3062797486782074,
"learning_rate": 2.9438626639092932e-05,
"loss": 0.7059,
"step": 2015
},
{
"epoch": 0.4568413117225782,
"grad_norm": 0.3885577917098999,
"learning_rate": 2.9349159844507455e-05,
"loss": 0.7319,
"step": 2020
},
{
"epoch": 0.4579721070486242,
"grad_norm": 0.365987628698349,
"learning_rate": 2.9259635575301436e-05,
"loss": 0.6858,
"step": 2025
},
{
"epoch": 0.45910290237467016,
"grad_norm": 0.32557693123817444,
"learning_rate": 2.9170055014547825e-05,
"loss": 0.622,
"step": 2030
},
{
"epoch": 0.46023369770071615,
"grad_norm": 0.31643807888031006,
"learning_rate": 2.908041934606347e-05,
"loss": 0.6794,
"step": 2035
},
{
"epoch": 0.46136449302676213,
"grad_norm": 0.3457587957382202,
"learning_rate": 2.89907297543935e-05,
"loss": 0.7015,
"step": 2040
},
{
"epoch": 0.4624952883528081,
"grad_norm": 0.3037043809890747,
"learning_rate": 2.8900987424795606e-05,
"loss": 0.6773,
"step": 2045
},
{
"epoch": 0.4636260836788541,
"grad_norm": 0.3223413825035095,
"learning_rate": 2.8811193543224462e-05,
"loss": 0.643,
"step": 2050
},
{
"epoch": 0.4647568790049001,
"grad_norm": 0.5646958947181702,
"learning_rate": 2.8721349296315963e-05,
"loss": 0.6942,
"step": 2055
},
{
"epoch": 0.4658876743309461,
"grad_norm": 0.3289279043674469,
"learning_rate": 2.8631455871371614e-05,
"loss": 0.6679,
"step": 2060
},
{
"epoch": 0.46701846965699206,
"grad_norm": 0.4061075448989868,
"learning_rate": 2.8541514456342815e-05,
"loss": 0.7992,
"step": 2065
},
{
"epoch": 0.46814926498303805,
"grad_norm": 0.37772536277770996,
"learning_rate": 2.8451526239815134e-05,
"loss": 0.6817,
"step": 2070
},
{
"epoch": 0.46928006030908404,
"grad_norm": 0.31532320380210876,
"learning_rate": 2.8361492410992662e-05,
"loss": 0.6771,
"step": 2075
},
{
"epoch": 0.47041085563513,
"grad_norm": 0.352198988199234,
"learning_rate": 2.8271414159682224e-05,
"loss": 0.6515,
"step": 2080
},
{
"epoch": 0.471541650961176,
"grad_norm": 0.39696019887924194,
"learning_rate": 2.8181292676277738e-05,
"loss": 0.7276,
"step": 2085
},
{
"epoch": 0.472672446287222,
"grad_norm": 0.4117799997329712,
"learning_rate": 2.809112915174439e-05,
"loss": 0.6333,
"step": 2090
},
{
"epoch": 0.473803241613268,
"grad_norm": 0.36984243988990784,
"learning_rate": 2.8000924777602965e-05,
"loss": 0.7202,
"step": 2095
},
{
"epoch": 0.47493403693931396,
"grad_norm": 0.3305279612541199,
"learning_rate": 2.79106807459141e-05,
"loss": 0.6418,
"step": 2100
},
{
"epoch": 0.47606483226535995,
"grad_norm": 0.40777119994163513,
"learning_rate": 2.7820398249262474e-05,
"loss": 0.7948,
"step": 2105
},
{
"epoch": 0.47719562759140594,
"grad_norm": 0.3309784233570099,
"learning_rate": 2.7730078480741122e-05,
"loss": 0.6319,
"step": 2110
},
{
"epoch": 0.4783264229174519,
"grad_norm": 0.3214864134788513,
"learning_rate": 2.7639722633935605e-05,
"loss": 0.7008,
"step": 2115
},
{
"epoch": 0.4794572182434979,
"grad_norm": 0.3192216157913208,
"learning_rate": 2.754933190290826e-05,
"loss": 0.6489,
"step": 2120
},
{
"epoch": 0.4805880135695439,
"grad_norm": 0.31766754388809204,
"learning_rate": 2.745890748218245e-05,
"loss": 0.6728,
"step": 2125
},
{
"epoch": 0.4817188088955899,
"grad_norm": 0.32393330335617065,
"learning_rate": 2.736845056672671e-05,
"loss": 0.6808,
"step": 2130
},
{
"epoch": 0.48284960422163586,
"grad_norm": 0.3087853193283081,
"learning_rate": 2.727796235193904e-05,
"loss": 0.7033,
"step": 2135
},
{
"epoch": 0.48398039954768185,
"grad_norm": 0.3951945900917053,
"learning_rate": 2.7187444033631044e-05,
"loss": 0.6537,
"step": 2140
},
{
"epoch": 0.48511119487372784,
"grad_norm": 0.31923210620880127,
"learning_rate": 2.709689680801213e-05,
"loss": 0.6795,
"step": 2145
},
{
"epoch": 0.4862419901997738,
"grad_norm": 0.4405725300312042,
"learning_rate": 2.7006321871673752e-05,
"loss": 0.7204,
"step": 2150
},
{
"epoch": 0.4873727855258198,
"grad_norm": 0.36784470081329346,
"learning_rate": 2.6915720421573538e-05,
"loss": 0.698,
"step": 2155
},
{
"epoch": 0.4885035808518658,
"grad_norm": 0.38032978773117065,
"learning_rate": 2.682509365501953e-05,
"loss": 0.74,
"step": 2160
},
{
"epoch": 0.4896343761779118,
"grad_norm": 0.36600053310394287,
"learning_rate": 2.6734442769654273e-05,
"loss": 0.6317,
"step": 2165
},
{
"epoch": 0.49076517150395776,
"grad_norm": 0.39383023977279663,
"learning_rate": 2.6643768963439113e-05,
"loss": 0.6807,
"step": 2170
},
{
"epoch": 0.49189596683000375,
"grad_norm": 0.37128937244415283,
"learning_rate": 2.6553073434638248e-05,
"loss": 0.7359,
"step": 2175
},
{
"epoch": 0.49302676215604974,
"grad_norm": 0.32236599922180176,
"learning_rate": 2.6462357381802966e-05,
"loss": 0.6154,
"step": 2180
},
{
"epoch": 0.4941575574820957,
"grad_norm": 0.3519161343574524,
"learning_rate": 2.6371622003755768e-05,
"loss": 0.7197,
"step": 2185
},
{
"epoch": 0.4952883528081417,
"grad_norm": 0.38883543014526367,
"learning_rate": 2.628086849957455e-05,
"loss": 0.7554,
"step": 2190
},
{
"epoch": 0.4964191481341877,
"grad_norm": 0.34098756313323975,
"learning_rate": 2.6190098068576763e-05,
"loss": 0.7133,
"step": 2195
},
{
"epoch": 0.4975499434602337,
"grad_norm": 0.36088091135025024,
"learning_rate": 2.6099311910303502e-05,
"loss": 0.6746,
"step": 2200
},
{
"epoch": 0.49868073878627966,
"grad_norm": 0.38198113441467285,
"learning_rate": 2.6008511224503728e-05,
"loss": 0.6848,
"step": 2205
},
{
"epoch": 0.49981153411232565,
"grad_norm": 0.3310260474681854,
"learning_rate": 2.59176972111184e-05,
"loss": 0.6657,
"step": 2210
},
{
"epoch": 0.5009423294383717,
"grad_norm": 0.3948574364185333,
"learning_rate": 2.582687107026458e-05,
"loss": 0.6704,
"step": 2215
},
{
"epoch": 0.5020731247644177,
"grad_norm": 0.31727057695388794,
"learning_rate": 2.5736034002219594e-05,
"loss": 0.6454,
"step": 2220
},
{
"epoch": 0.5032039200904637,
"grad_norm": 0.33022522926330566,
"learning_rate": 2.564518720740519e-05,
"loss": 0.6928,
"step": 2225
},
{
"epoch": 0.5043347154165096,
"grad_norm": 0.5317490696907043,
"learning_rate": 2.555433188637164e-05,
"loss": 0.823,
"step": 2230
},
{
"epoch": 0.5054655107425556,
"grad_norm": 0.39583778381347656,
"learning_rate": 2.54634692397819e-05,
"loss": 0.7081,
"step": 2235
},
{
"epoch": 0.5065963060686016,
"grad_norm": 0.36913448572158813,
"learning_rate": 2.5372600468395723e-05,
"loss": 0.6707,
"step": 2240
},
{
"epoch": 0.5077271013946476,
"grad_norm": 0.33116042613983154,
"learning_rate": 2.528172677305382e-05,
"loss": 0.7008,
"step": 2245
},
{
"epoch": 0.5088578967206936,
"grad_norm": 0.3586164116859436,
"learning_rate": 2.5190849354661955e-05,
"loss": 0.6895,
"step": 2250
},
{
"epoch": 0.5099886920467396,
"grad_norm": 0.44672051072120667,
"learning_rate": 2.50999694141751e-05,
"loss": 0.7304,
"step": 2255
},
{
"epoch": 0.5111194873727856,
"grad_norm": 0.4558676779270172,
"learning_rate": 2.5009088152581565e-05,
"loss": 0.7073,
"step": 2260
},
{
"epoch": 0.5122502826988315,
"grad_norm": 0.31825345754623413,
"learning_rate": 2.4918206770887102e-05,
"loss": 0.7007,
"step": 2265
},
{
"epoch": 0.5133810780248775,
"grad_norm": 0.41337841749191284,
"learning_rate": 2.482732647009907e-05,
"loss": 0.7995,
"step": 2270
},
{
"epoch": 0.5145118733509235,
"grad_norm": 0.3080434799194336,
"learning_rate": 2.473644845121051e-05,
"loss": 0.7367,
"step": 2275
},
{
"epoch": 0.5156426686769695,
"grad_norm": 0.35662513971328735,
"learning_rate": 2.4645573915184354e-05,
"loss": 0.6669,
"step": 2280
},
{
"epoch": 0.5167734640030155,
"grad_norm": 0.41301533579826355,
"learning_rate": 2.4554704062937467e-05,
"loss": 0.6953,
"step": 2285
},
{
"epoch": 0.5179042593290615,
"grad_norm": 0.42937204241752625,
"learning_rate": 2.4463840095324834e-05,
"loss": 0.6625,
"step": 2290
},
{
"epoch": 0.5190350546551075,
"grad_norm": 0.32970476150512695,
"learning_rate": 2.437298321312369e-05,
"loss": 0.6823,
"step": 2295
},
{
"epoch": 0.5201658499811534,
"grad_norm": 0.36597487330436707,
"learning_rate": 2.428213461701759e-05,
"loss": 0.6233,
"step": 2300
},
{
"epoch": 0.5212966453071994,
"grad_norm": 0.31977376341819763,
"learning_rate": 2.4191295507580648e-05,
"loss": 0.6732,
"step": 2305
},
{
"epoch": 0.5224274406332454,
"grad_norm": 0.3720978796482086,
"learning_rate": 2.410046708526155e-05,
"loss": 0.7449,
"step": 2310
},
{
"epoch": 0.5235582359592914,
"grad_norm": 0.4317164421081543,
"learning_rate": 2.4009650550367804e-05,
"loss": 0.6818,
"step": 2315
},
{
"epoch": 0.5246890312853374,
"grad_norm": 0.358803391456604,
"learning_rate": 2.3918847103049792e-05,
"loss": 0.7051,
"step": 2320
},
{
"epoch": 0.5258198266113834,
"grad_norm": 0.37477102875709534,
"learning_rate": 2.3828057943284932e-05,
"loss": 0.6474,
"step": 2325
},
{
"epoch": 0.5269506219374294,
"grad_norm": 0.3854588568210602,
"learning_rate": 2.373728427086188e-05,
"loss": 0.6464,
"step": 2330
},
{
"epoch": 0.5280814172634754,
"grad_norm": 0.29804185032844543,
"learning_rate": 2.3646527285364565e-05,
"loss": 0.6824,
"step": 2335
},
{
"epoch": 0.5292122125895213,
"grad_norm": 0.3477884829044342,
"learning_rate": 2.3555788186156442e-05,
"loss": 0.7401,
"step": 2340
},
{
"epoch": 0.5303430079155673,
"grad_norm": 0.3655013144016266,
"learning_rate": 2.346506817236457e-05,
"loss": 0.6915,
"step": 2345
},
{
"epoch": 0.5314738032416133,
"grad_norm": 0.31074225902557373,
"learning_rate": 2.3374368442863814e-05,
"loss": 0.7442,
"step": 2350
},
{
"epoch": 0.5326045985676593,
"grad_norm": 0.38817688822746277,
"learning_rate": 2.3283690196260967e-05,
"loss": 0.7317,
"step": 2355
},
{
"epoch": 0.5337353938937053,
"grad_norm": 0.2897610366344452,
"learning_rate": 2.3193034630878907e-05,
"loss": 0.6206,
"step": 2360
},
{
"epoch": 0.5348661892197513,
"grad_norm": 0.38513097167015076,
"learning_rate": 2.310240294474081e-05,
"loss": 0.7794,
"step": 2365
},
{
"epoch": 0.5359969845457973,
"grad_norm": 0.3019099533557892,
"learning_rate": 2.3011796335554258e-05,
"loss": 0.6191,
"step": 2370
},
{
"epoch": 0.5371277798718432,
"grad_norm": 0.29924795031547546,
"learning_rate": 2.2921216000695465e-05,
"loss": 0.6881,
"step": 2375
},
{
"epoch": 0.5382585751978892,
"grad_norm": 0.37753212451934814,
"learning_rate": 2.2830663137193398e-05,
"loss": 0.6226,
"step": 2380
},
{
"epoch": 0.5393893705239352,
"grad_norm": 0.3259458839893341,
"learning_rate": 2.274013894171401e-05,
"loss": 0.7258,
"step": 2385
},
{
"epoch": 0.5405201658499812,
"grad_norm": 0.3186294436454773,
"learning_rate": 2.2649644610544392e-05,
"loss": 0.7074,
"step": 2390
},
{
"epoch": 0.5416509611760272,
"grad_norm": 0.328595370054245,
"learning_rate": 2.255918133957697e-05,
"loss": 0.6656,
"step": 2395
},
{
"epoch": 0.5427817565020732,
"grad_norm": 0.34288713335990906,
"learning_rate": 2.2468750324293717e-05,
"loss": 0.6913,
"step": 2400
},
{
"epoch": 0.5439125518281192,
"grad_norm": 0.34917885065078735,
"learning_rate": 2.2378352759750333e-05,
"loss": 0.6997,
"step": 2405
},
{
"epoch": 0.5450433471541651,
"grad_norm": 0.38892245292663574,
"learning_rate": 2.2287989840560485e-05,
"loss": 0.6667,
"step": 2410
},
{
"epoch": 0.5461741424802111,
"grad_norm": 0.41548117995262146,
"learning_rate": 2.219766276087996e-05,
"loss": 0.648,
"step": 2415
},
{
"epoch": 0.5473049378062571,
"grad_norm": 0.37720760703086853,
"learning_rate": 2.2107372714390974e-05,
"loss": 0.7646,
"step": 2420
},
{
"epoch": 0.5484357331323031,
"grad_norm": 0.32246890664100647,
"learning_rate": 2.2017120894286287e-05,
"loss": 0.6772,
"step": 2425
},
{
"epoch": 0.5495665284583491,
"grad_norm": 0.35085204243659973,
"learning_rate": 2.1926908493253527e-05,
"loss": 0.658,
"step": 2430
},
{
"epoch": 0.5506973237843951,
"grad_norm": 0.32103869318962097,
"learning_rate": 2.1836736703459398e-05,
"loss": 0.6576,
"step": 2435
},
{
"epoch": 0.551828119110441,
"grad_norm": 0.30640605092048645,
"learning_rate": 2.1746606716533907e-05,
"loss": 0.7009,
"step": 2440
},
{
"epoch": 0.552958914436487,
"grad_norm": 0.4351046681404114,
"learning_rate": 2.1656519723554643e-05,
"loss": 0.7124,
"step": 2445
},
{
"epoch": 0.554089709762533,
"grad_norm": 0.3515176773071289,
"learning_rate": 2.1566476915031013e-05,
"loss": 0.7086,
"step": 2450
},
{
"epoch": 0.555220505088579,
"grad_norm": 0.35644426941871643,
"learning_rate": 2.1476479480888545e-05,
"loss": 0.7245,
"step": 2455
},
{
"epoch": 0.556351300414625,
"grad_norm": 0.49966442584991455,
"learning_rate": 2.1386528610453104e-05,
"loss": 0.7511,
"step": 2460
},
{
"epoch": 0.557482095740671,
"grad_norm": 0.3358660340309143,
"learning_rate": 2.129662549243523e-05,
"loss": 0.6579,
"step": 2465
},
{
"epoch": 0.558612891066717,
"grad_norm": 0.392120361328125,
"learning_rate": 2.120677131491442e-05,
"loss": 0.7838,
"step": 2470
},
{
"epoch": 0.559743686392763,
"grad_norm": 0.3123244047164917,
"learning_rate": 2.11169672653234e-05,
"loss": 0.6543,
"step": 2475
},
{
"epoch": 0.5608744817188089,
"grad_norm": 0.3226960301399231,
"learning_rate": 2.1027214530432465e-05,
"loss": 0.6582,
"step": 2480
},
{
"epoch": 0.5620052770448549,
"grad_norm": 0.3497219681739807,
"learning_rate": 2.0937514296333754e-05,
"loss": 0.6815,
"step": 2485
},
{
"epoch": 0.5631360723709009,
"grad_norm": 0.39245134592056274,
"learning_rate": 2.0847867748425648e-05,
"loss": 0.7226,
"step": 2490
},
{
"epoch": 0.5642668676969469,
"grad_norm": 0.3870549499988556,
"learning_rate": 2.0758276071397012e-05,
"loss": 0.7073,
"step": 2495
},
{
"epoch": 0.5653976630229929,
"grad_norm": 0.40596914291381836,
"learning_rate": 2.0668740449211605e-05,
"loss": 0.6929,
"step": 2500
},
{
"epoch": 0.5665284583490389,
"grad_norm": 0.3204245865345001,
"learning_rate": 2.0579262065092423e-05,
"loss": 0.7193,
"step": 2505
},
{
"epoch": 0.5676592536750849,
"grad_norm": 0.30433857440948486,
"learning_rate": 2.048984210150604e-05,
"loss": 0.6859,
"step": 2510
},
{
"epoch": 0.5687900490011308,
"grad_norm": 0.392553448677063,
"learning_rate": 2.0400481740147022e-05,
"loss": 0.7217,
"step": 2515
},
{
"epoch": 0.5699208443271768,
"grad_norm": 0.3402389585971832,
"learning_rate": 2.0311182161922237e-05,
"loss": 0.6868,
"step": 2520
},
{
"epoch": 0.5710516396532228,
"grad_norm": 0.42901313304901123,
"learning_rate": 2.022194454693536e-05,
"loss": 0.6861,
"step": 2525
},
{
"epoch": 0.5721824349792688,
"grad_norm": 0.34680864214897156,
"learning_rate": 2.013277007447117e-05,
"loss": 0.7805,
"step": 2530
},
{
"epoch": 0.5733132303053148,
"grad_norm": 0.30028700828552246,
"learning_rate": 2.0043659922980005e-05,
"loss": 0.6454,
"step": 2535
},
{
"epoch": 0.5744440256313608,
"grad_norm": 0.3310604691505432,
"learning_rate": 1.995461527006225e-05,
"loss": 0.6193,
"step": 2540
},
{
"epoch": 0.5755748209574068,
"grad_norm": 0.3977152407169342,
"learning_rate": 1.9865637292452636e-05,
"loss": 0.7275,
"step": 2545
},
{
"epoch": 0.5767056162834527,
"grad_norm": 0.42726007103919983,
"learning_rate": 1.977672716600486e-05,
"loss": 0.7321,
"step": 2550
},
{
"epoch": 0.5778364116094987,
"grad_norm": 0.4253356158733368,
"learning_rate": 1.968788606567589e-05,
"loss": 0.7107,
"step": 2555
},
{
"epoch": 0.5789672069355447,
"grad_norm": 0.3486230969429016,
"learning_rate": 1.9599115165510544e-05,
"loss": 0.6859,
"step": 2560
},
{
"epoch": 0.5800980022615907,
"grad_norm": 0.3471638560295105,
"learning_rate": 1.9510415638625932e-05,
"loss": 0.656,
"step": 2565
},
{
"epoch": 0.5812287975876367,
"grad_norm": 0.37314942479133606,
"learning_rate": 1.942178865719593e-05,
"loss": 0.6545,
"step": 2570
},
{
"epoch": 0.5823595929136827,
"grad_norm": 0.3019452393054962,
"learning_rate": 1.9333235392435774e-05,
"loss": 0.6422,
"step": 2575
},
{
"epoch": 0.5834903882397287,
"grad_norm": 0.30790606141090393,
"learning_rate": 1.9244757014586458e-05,
"loss": 0.6182,
"step": 2580
},
{
"epoch": 0.5846211835657746,
"grad_norm": 0.3539668917655945,
"learning_rate": 1.9156354692899405e-05,
"loss": 0.6835,
"step": 2585
},
{
"epoch": 0.5857519788918206,
"grad_norm": 0.3208529055118561,
"learning_rate": 1.9068029595620884e-05,
"loss": 0.6619,
"step": 2590
},
{
"epoch": 0.5868827742178666,
"grad_norm": 0.49773553013801575,
"learning_rate": 1.897978288997669e-05,
"loss": 0.7187,
"step": 2595
},
{
"epoch": 0.5880135695439126,
"grad_norm": 0.3386790156364441,
"learning_rate": 1.889161574215663e-05,
"loss": 0.6659,
"step": 2600
},
{
"epoch": 0.5891443648699586,
"grad_norm": 0.3373807370662689,
"learning_rate": 1.880352931729914e-05,
"loss": 0.6461,
"step": 2605
},
{
"epoch": 0.5902751601960046,
"grad_norm": 0.4661053717136383,
"learning_rate": 1.8715524779475944e-05,
"loss": 0.6994,
"step": 2610
},
{
"epoch": 0.5914059555220506,
"grad_norm": 0.38146570324897766,
"learning_rate": 1.862760329167655e-05,
"loss": 0.6413,
"step": 2615
},
{
"epoch": 0.5925367508480965,
"grad_norm": 0.3764294981956482,
"learning_rate": 1.8539766015793006e-05,
"loss": 0.6617,
"step": 2620
},
{
"epoch": 0.5936675461741425,
"grad_norm": 0.35271722078323364,
"learning_rate": 1.845201411260446e-05,
"loss": 0.7036,
"step": 2625
},
{
"epoch": 0.5947983415001885,
"grad_norm": 0.3613468110561371,
"learning_rate": 1.8364348741761867e-05,
"loss": 0.7361,
"step": 2630
},
{
"epoch": 0.5959291368262345,
"grad_norm": 0.34245991706848145,
"learning_rate": 1.8276771061772647e-05,
"loss": 0.7073,
"step": 2635
},
{
"epoch": 0.5970599321522805,
"grad_norm": 0.32761844992637634,
"learning_rate": 1.8189282229985345e-05,
"loss": 0.7661,
"step": 2640
},
{
"epoch": 0.5981907274783265,
"grad_norm": 0.3382299542427063,
"learning_rate": 1.8101883402574415e-05,
"loss": 0.6813,
"step": 2645
},
{
"epoch": 0.5993215228043725,
"grad_norm": 0.30160099267959595,
"learning_rate": 1.8014575734524865e-05,
"loss": 0.7183,
"step": 2650
},
{
"epoch": 0.6004523181304184,
"grad_norm": 0.3124518096446991,
"learning_rate": 1.7927360379617024e-05,
"loss": 0.6506,
"step": 2655
},
{
"epoch": 0.6015831134564644,
"grad_norm": 0.3907219469547272,
"learning_rate": 1.78402384904113e-05,
"loss": 0.6575,
"step": 2660
},
{
"epoch": 0.6027139087825104,
"grad_norm": 0.35735592246055603,
"learning_rate": 1.7753211218232938e-05,
"loss": 0.6877,
"step": 2665
},
{
"epoch": 0.6038447041085564,
"grad_norm": 0.40482988953590393,
"learning_rate": 1.7666279713156815e-05,
"loss": 0.6788,
"step": 2670
},
{
"epoch": 0.6049754994346024,
"grad_norm": 0.40024474263191223,
"learning_rate": 1.757944512399221e-05,
"loss": 0.7644,
"step": 2675
},
{
"epoch": 0.6061062947606484,
"grad_norm": 0.4042050242424011,
"learning_rate": 1.7492708598267683e-05,
"loss": 0.7347,
"step": 2680
},
{
"epoch": 0.6072370900866944,
"grad_norm": 0.38071730732917786,
"learning_rate": 1.7406071282215854e-05,
"loss": 0.6841,
"step": 2685
},
{
"epoch": 0.6083678854127403,
"grad_norm": 0.4327053427696228,
"learning_rate": 1.7319534320758284e-05,
"loss": 0.7712,
"step": 2690
},
{
"epoch": 0.6094986807387863,
"grad_norm": 0.41496187448501587,
"learning_rate": 1.7233098857490325e-05,
"loss": 0.7306,
"step": 2695
},
{
"epoch": 0.6106294760648323,
"grad_norm": 0.48277217149734497,
"learning_rate": 1.714676603466605e-05,
"loss": 0.6843,
"step": 2700
},
{
"epoch": 0.6117602713908783,
"grad_norm": 0.4226689338684082,
"learning_rate": 1.7060536993183084e-05,
"loss": 0.6336,
"step": 2705
},
{
"epoch": 0.6128910667169243,
"grad_norm": 0.3646908104419708,
"learning_rate": 1.6974412872567597e-05,
"loss": 0.6637,
"step": 2710
},
{
"epoch": 0.6140218620429703,
"grad_norm": 0.30986544489860535,
"learning_rate": 1.688839481095922e-05,
"loss": 0.6905,
"step": 2715
},
{
"epoch": 0.6151526573690163,
"grad_norm": 0.416892409324646,
"learning_rate": 1.680248394509599e-05,
"loss": 0.7408,
"step": 2720
},
{
"epoch": 0.6162834526950622,
"grad_norm": 0.3360169231891632,
"learning_rate": 1.6716681410299348e-05,
"loss": 0.7591,
"step": 2725
},
{
"epoch": 0.6174142480211082,
"grad_norm": 0.40622156858444214,
"learning_rate": 1.6630988340459128e-05,
"loss": 0.6792,
"step": 2730
},
{
"epoch": 0.6185450433471542,
"grad_norm": 0.37062937021255493,
"learning_rate": 1.654540586801858e-05,
"loss": 0.6656,
"step": 2735
},
{
"epoch": 0.6196758386732002,
"grad_norm": 0.29908493161201477,
"learning_rate": 1.645993512395938e-05,
"loss": 0.6576,
"step": 2740
},
{
"epoch": 0.6208066339992462,
"grad_norm": 0.3012191951274872,
"learning_rate": 1.6374577237786703e-05,
"loss": 0.6174,
"step": 2745
},
{
"epoch": 0.6219374293252922,
"grad_norm": 0.322457879781723,
"learning_rate": 1.628933333751432e-05,
"loss": 0.6562,
"step": 2750
},
{
"epoch": 0.6230682246513382,
"grad_norm": 0.3631836771965027,
"learning_rate": 1.6204204549649628e-05,
"loss": 0.6264,
"step": 2755
},
{
"epoch": 0.6241990199773841,
"grad_norm": 0.30250152945518494,
"learning_rate": 1.6119191999178847e-05,
"loss": 0.7027,
"step": 2760
},
{
"epoch": 0.6253298153034301,
"grad_norm": 0.31643325090408325,
"learning_rate": 1.6034296809552047e-05,
"loss": 0.6767,
"step": 2765
},
{
"epoch": 0.6264606106294761,
"grad_norm": 0.3336418569087982,
"learning_rate": 1.594952010266843e-05,
"loss": 0.67,
"step": 2770
},
{
"epoch": 0.6275914059555221,
"grad_norm": 0.33178821206092834,
"learning_rate": 1.5864862998861384e-05,
"loss": 0.6477,
"step": 2775
},
{
"epoch": 0.6287222012815681,
"grad_norm": 0.3760960102081299,
"learning_rate": 1.5780326616883745e-05,
"loss": 0.6692,
"step": 2780
},
{
"epoch": 0.6298529966076141,
"grad_norm": 0.38543248176574707,
"learning_rate": 1.5695912073893006e-05,
"loss": 0.6762,
"step": 2785
},
{
"epoch": 0.63098379193366,
"grad_norm": 0.39795583486557007,
"learning_rate": 1.561162048543653e-05,
"loss": 0.6861,
"step": 2790
},
{
"epoch": 0.632114587259706,
"grad_norm": 0.30678924918174744,
"learning_rate": 1.552745296543684e-05,
"loss": 0.7045,
"step": 2795
},
{
"epoch": 0.633245382585752,
"grad_norm": 0.39268332719802856,
"learning_rate": 1.544341062617685e-05,
"loss": 0.6791,
"step": 2800
},
{
"epoch": 0.634376177911798,
"grad_norm": 0.3372342586517334,
"learning_rate": 1.535949457828525e-05,
"loss": 0.6737,
"step": 2805
},
{
"epoch": 0.635506973237844,
"grad_norm": 0.3903445303440094,
"learning_rate": 1.527570593072172e-05,
"loss": 0.7094,
"step": 2810
},
{
"epoch": 0.63663776856389,
"grad_norm": 0.3412375748157501,
"learning_rate": 1.5192045790762354e-05,
"loss": 0.7126,
"step": 2815
},
{
"epoch": 0.637768563889936,
"grad_norm": 0.37893742322921753,
"learning_rate": 1.5108515263985018e-05,
"loss": 0.739,
"step": 2820
},
{
"epoch": 0.638899359215982,
"grad_norm": 0.3894254267215729,
"learning_rate": 1.502511545425469e-05,
"loss": 0.7108,
"step": 2825
},
{
"epoch": 0.6400301545420279,
"grad_norm": 0.3613717257976532,
"learning_rate": 1.4941847463708958e-05,
"loss": 0.672,
"step": 2830
},
{
"epoch": 0.6411609498680739,
"grad_norm": 0.2811620235443115,
"learning_rate": 1.4858712392743352e-05,
"loss": 0.7129,
"step": 2835
},
{
"epoch": 0.6422917451941199,
"grad_norm": 0.411286324262619,
"learning_rate": 1.4775711339996896e-05,
"loss": 0.6747,
"step": 2840
},
{
"epoch": 0.6434225405201659,
"grad_norm": 0.39716291427612305,
"learning_rate": 1.4692845402337523e-05,
"loss": 0.7217,
"step": 2845
},
{
"epoch": 0.6445533358462119,
"grad_norm": 0.3730713725090027,
"learning_rate": 1.4610115674847619e-05,
"loss": 0.6249,
"step": 2850
},
{
"epoch": 0.6456841311722579,
"grad_norm": 0.3958978056907654,
"learning_rate": 1.4527523250809545e-05,
"loss": 0.6599,
"step": 2855
},
{
"epoch": 0.6468149264983039,
"grad_norm": 0.2955171763896942,
"learning_rate": 1.4445069221691148e-05,
"loss": 0.6542,
"step": 2860
},
{
"epoch": 0.6479457218243498,
"grad_norm": 0.45475757122039795,
"learning_rate": 1.436275467713141e-05,
"loss": 0.8182,
"step": 2865
},
{
"epoch": 0.6490765171503958,
"grad_norm": 0.40360134840011597,
"learning_rate": 1.428058070492599e-05,
"loss": 0.6866,
"step": 2870
},
{
"epoch": 0.6502073124764418,
"grad_norm": 0.3490990996360779,
"learning_rate": 1.4198548391012878e-05,
"loss": 0.6879,
"step": 2875
},
{
"epoch": 0.6513381078024878,
"grad_norm": 0.3447786569595337,
"learning_rate": 1.4116658819458025e-05,
"loss": 0.6206,
"step": 2880
},
{
"epoch": 0.6524689031285338,
"grad_norm": 0.4018050730228424,
"learning_rate": 1.4034913072441015e-05,
"loss": 0.6705,
"step": 2885
},
{
"epoch": 0.6535996984545798,
"grad_norm": 0.31316670775413513,
"learning_rate": 1.3953312230240801e-05,
"loss": 0.7058,
"step": 2890
},
{
"epoch": 0.6547304937806258,
"grad_norm": 0.4424934387207031,
"learning_rate": 1.3871857371221389e-05,
"loss": 0.6871,
"step": 2895
},
{
"epoch": 0.6558612891066717,
"grad_norm": 0.36514902114868164,
"learning_rate": 1.3790549571817615e-05,
"loss": 0.6632,
"step": 2900
},
{
"epoch": 0.6569920844327177,
"grad_norm": 0.3037571609020233,
"learning_rate": 1.3709389906520875e-05,
"loss": 0.6516,
"step": 2905
},
{
"epoch": 0.6581228797587637,
"grad_norm": 0.3230195641517639,
"learning_rate": 1.3628379447864997e-05,
"loss": 0.7393,
"step": 2910
},
{
"epoch": 0.6592536750848097,
"grad_norm": 0.425601601600647,
"learning_rate": 1.3547519266411985e-05,
"loss": 0.6665,
"step": 2915
},
{
"epoch": 0.6603844704108557,
"grad_norm": 0.39113959670066833,
"learning_rate": 1.3466810430737941e-05,
"loss": 0.6772,
"step": 2920
},
{
"epoch": 0.6615152657369017,
"grad_norm": 0.3463885188102722,
"learning_rate": 1.3386254007418928e-05,
"loss": 0.7132,
"step": 2925
},
{
"epoch": 0.6626460610629477,
"grad_norm": 0.29994767904281616,
"learning_rate": 1.3305851061016821e-05,
"loss": 0.7092,
"step": 2930
},
{
"epoch": 0.6637768563889936,
"grad_norm": 0.3709157407283783,
"learning_rate": 1.3225602654065323e-05,
"loss": 0.6795,
"step": 2935
},
{
"epoch": 0.6649076517150396,
"grad_norm": 0.36443623900413513,
"learning_rate": 1.3145509847055837e-05,
"loss": 0.6979,
"step": 2940
},
{
"epoch": 0.6660384470410856,
"grad_norm": 0.3367445468902588,
"learning_rate": 1.3065573698423558e-05,
"loss": 0.7412,
"step": 2945
},
{
"epoch": 0.6671692423671316,
"grad_norm": 0.3487666845321655,
"learning_rate": 1.2985795264533372e-05,
"loss": 0.8255,
"step": 2950
},
{
"epoch": 0.6683000376931776,
"grad_norm": 0.3769291937351227,
"learning_rate": 1.2906175599665949e-05,
"loss": 0.6697,
"step": 2955
},
{
"epoch": 0.6694308330192236,
"grad_norm": 0.3350309431552887,
"learning_rate": 1.2826715756003846e-05,
"loss": 0.7478,
"step": 2960
},
{
"epoch": 0.6705616283452696,
"grad_norm": 0.30802276730537415,
"learning_rate": 1.2747416783617511e-05,
"loss": 0.6233,
"step": 2965
},
{
"epoch": 0.6716924236713155,
"grad_norm": 0.5399759411811829,
"learning_rate": 1.2668279730451535e-05,
"loss": 0.7359,
"step": 2970
},
{
"epoch": 0.6728232189973615,
"grad_norm": 0.38919568061828613,
"learning_rate": 1.2589305642310651e-05,
"loss": 0.6935,
"step": 2975
},
{
"epoch": 0.6739540143234075,
"grad_norm": 0.30821794271469116,
"learning_rate": 1.2510495562846053e-05,
"loss": 0.7083,
"step": 2980
},
{
"epoch": 0.6750848096494535,
"grad_norm": 0.28666800260543823,
"learning_rate": 1.2431850533541487e-05,
"loss": 0.6569,
"step": 2985
},
{
"epoch": 0.6762156049754995,
"grad_norm": 0.36479493975639343,
"learning_rate": 1.2353371593699592e-05,
"loss": 0.6867,
"step": 2990
},
{
"epoch": 0.6773464003015455,
"grad_norm": 0.3713551461696625,
"learning_rate": 1.22750597804281e-05,
"loss": 0.68,
"step": 2995
},
{
"epoch": 0.6784771956275915,
"grad_norm": 0.3654766380786896,
"learning_rate": 1.2196916128626126e-05,
"loss": 0.73,
"step": 3000
},
{
"epoch": 0.6796079909536374,
"grad_norm": 0.2997152507305145,
"learning_rate": 1.2118941670970551e-05,
"loss": 0.6777,
"step": 3005
},
{
"epoch": 0.6807387862796834,
"grad_norm": 0.3098806142807007,
"learning_rate": 1.2041137437902297e-05,
"loss": 0.709,
"step": 3010
},
{
"epoch": 0.6818695816057294,
"grad_norm": 0.32077932357788086,
"learning_rate": 1.1963504457612781e-05,
"loss": 0.6451,
"step": 3015
},
{
"epoch": 0.6830003769317754,
"grad_norm": 0.33692800998687744,
"learning_rate": 1.1886043756030294e-05,
"loss": 0.6855,
"step": 3020
},
{
"epoch": 0.6841311722578214,
"grad_norm": 0.4159882664680481,
"learning_rate": 1.1808756356806411e-05,
"loss": 0.6746,
"step": 3025
},
{
"epoch": 0.6852619675838674,
"grad_norm": 0.4914894104003906,
"learning_rate": 1.1731643281302548e-05,
"loss": 0.7548,
"step": 3030
},
{
"epoch": 0.6863927629099134,
"grad_norm": 0.354419082403183,
"learning_rate": 1.1654705548576364e-05,
"loss": 0.7227,
"step": 3035
},
{
"epoch": 0.6875235582359593,
"grad_norm": 0.42316100001335144,
"learning_rate": 1.157794417536838e-05,
"loss": 0.709,
"step": 3040
},
{
"epoch": 0.6886543535620053,
"grad_norm": 0.35025471448898315,
"learning_rate": 1.1501360176088494e-05,
"loss": 0.6336,
"step": 3045
},
{
"epoch": 0.6897851488880513,
"grad_norm": 0.33178970217704773,
"learning_rate": 1.1424954562802598e-05,
"loss": 0.616,
"step": 3050
},
{
"epoch": 0.6909159442140973,
"grad_norm": 0.32804471254348755,
"learning_rate": 1.1348728345219176e-05,
"loss": 0.6617,
"step": 3055
},
{
"epoch": 0.6920467395401433,
"grad_norm": 0.29233989119529724,
"learning_rate": 1.127268253067598e-05,
"loss": 0.6296,
"step": 3060
},
{
"epoch": 0.6931775348661893,
"grad_norm": 0.3966659605503082,
"learning_rate": 1.1196818124126729e-05,
"loss": 0.6721,
"step": 3065
},
{
"epoch": 0.6943083301922353,
"grad_norm": 0.34669914841651917,
"learning_rate": 1.1121136128127812e-05,
"loss": 0.6118,
"step": 3070
},
{
"epoch": 0.6954391255182812,
"grad_norm": 0.33230316638946533,
"learning_rate": 1.104563754282505e-05,
"loss": 0.6855,
"step": 3075
},
{
"epoch": 0.6965699208443272,
"grad_norm": 0.3585960865020752,
"learning_rate": 1.0970323365940444e-05,
"loss": 0.6893,
"step": 3080
},
{
"epoch": 0.6977007161703732,
"grad_norm": 0.4158352315425873,
"learning_rate": 1.0895194592759042e-05,
"loss": 0.7072,
"step": 3085
},
{
"epoch": 0.6988315114964192,
"grad_norm": 0.43162232637405396,
"learning_rate": 1.082025221611577e-05,
"loss": 0.7138,
"step": 3090
},
{
"epoch": 0.6999623068224652,
"grad_norm": 0.3278350830078125,
"learning_rate": 1.0745497226382267e-05,
"loss": 0.6111,
"step": 3095
},
{
"epoch": 0.7010931021485112,
"grad_norm": 0.44768843054771423,
"learning_rate": 1.0670930611453874e-05,
"loss": 0.6449,
"step": 3100
},
{
"epoch": 0.7022238974745572,
"grad_norm": 0.48205995559692383,
"learning_rate": 1.0596553356736507e-05,
"loss": 0.6902,
"step": 3105
},
{
"epoch": 0.7033546928006031,
"grad_norm": 0.3945559561252594,
"learning_rate": 1.0522366445133686e-05,
"loss": 0.6727,
"step": 3110
},
{
"epoch": 0.7044854881266491,
"grad_norm": 0.33756589889526367,
"learning_rate": 1.044837085703352e-05,
"loss": 0.6969,
"step": 3115
},
{
"epoch": 0.7056162834526951,
"grad_norm": 0.2790575325489044,
"learning_rate": 1.0374567570295766e-05,
"loss": 0.625,
"step": 3120
},
{
"epoch": 0.7067470787787411,
"grad_norm": 0.4053255021572113,
"learning_rate": 1.0300957560238875e-05,
"loss": 0.7338,
"step": 3125
},
{
"epoch": 0.7078778741047871,
"grad_norm": 0.3397720158100128,
"learning_rate": 1.0227541799627136e-05,
"loss": 0.6771,
"step": 3130
},
{
"epoch": 0.7090086694308331,
"grad_norm": 0.3540814518928528,
"learning_rate": 1.015432125865782e-05,
"loss": 0.6582,
"step": 3135
},
{
"epoch": 0.7101394647568791,
"grad_norm": 0.3383145034313202,
"learning_rate": 1.0081296904948342e-05,
"loss": 0.5987,
"step": 3140
},
{
"epoch": 0.711270260082925,
"grad_norm": 0.4115695357322693,
"learning_rate": 1.0008469703523493e-05,
"loss": 0.6981,
"step": 3145
},
{
"epoch": 0.712401055408971,
"grad_norm": 0.3034178912639618,
"learning_rate": 9.935840616802645e-06,
"loss": 0.6991,
"step": 3150
},
{
"epoch": 0.713531850735017,
"grad_norm": 0.32083261013031006,
"learning_rate": 9.863410604587095e-06,
"loss": 0.6806,
"step": 3155
},
{
"epoch": 0.714662646061063,
"grad_norm": 0.5665989518165588,
"learning_rate": 9.791180624047322e-06,
"loss": 0.7539,
"step": 3160
},
{
"epoch": 0.715793441387109,
"grad_norm": 0.4178657829761505,
"learning_rate": 9.719151629710386e-06,
"loss": 0.6961,
"step": 3165
},
{
"epoch": 0.716924236713155,
"grad_norm": 0.36418062448501587,
"learning_rate": 9.647324573447291e-06,
"loss": 0.7055,
"step": 3170
},
{
"epoch": 0.718055032039201,
"grad_norm": 0.32820287346839905,
"learning_rate": 9.575700404460386e-06,
"loss": 0.6329,
"step": 3175
},
{
"epoch": 0.7191858273652469,
"grad_norm": 0.5332444906234741,
"learning_rate": 9.504280069270871e-06,
"loss": 0.723,
"step": 3180
},
{
"epoch": 0.7203166226912929,
"grad_norm": 0.5376641154289246,
"learning_rate": 9.433064511706225e-06,
"loss": 0.7362,
"step": 3185
},
{
"epoch": 0.7214474180173389,
"grad_norm": 0.32007166743278503,
"learning_rate": 9.362054672887819e-06,
"loss": 0.6754,
"step": 3190
},
{
"epoch": 0.7225782133433849,
"grad_norm": 0.37915733456611633,
"learning_rate": 9.291251491218387e-06,
"loss": 0.6565,
"step": 3195
},
{
"epoch": 0.7237090086694309,
"grad_norm": 0.35389769077301025,
"learning_rate": 9.220655902369665e-06,
"loss": 0.6775,
"step": 3200
},
{
"epoch": 0.7248398039954769,
"grad_norm": 0.5367900133132935,
"learning_rate": 9.150268839270055e-06,
"loss": 0.7366,
"step": 3205
},
{
"epoch": 0.7259705993215229,
"grad_norm": 0.4069572985172272,
"learning_rate": 9.080091232092247e-06,
"loss": 0.6873,
"step": 3210
},
{
"epoch": 0.7271013946475688,
"grad_norm": 0.5752056837081909,
"learning_rate": 9.01012400824097e-06,
"loss": 0.7199,
"step": 3215
},
{
"epoch": 0.7282321899736148,
"grad_norm": 0.29914751648902893,
"learning_rate": 8.940368092340682e-06,
"loss": 0.7129,
"step": 3220
},
{
"epoch": 0.7293629852996608,
"grad_norm": 0.33143705129623413,
"learning_rate": 8.870824406223416e-06,
"loss": 0.6581,
"step": 3225
},
{
"epoch": 0.7304937806257068,
"grad_norm": 0.4050018787384033,
"learning_rate": 8.801493868916536e-06,
"loss": 0.6941,
"step": 3230
},
{
"epoch": 0.7316245759517528,
"grad_norm": 0.33594897389411926,
"learning_rate": 8.732377396630642e-06,
"loss": 0.6639,
"step": 3235
},
{
"epoch": 0.7327553712777988,
"grad_norm": 0.4058912694454193,
"learning_rate": 8.663475902747445e-06,
"loss": 0.7139,
"step": 3240
},
{
"epoch": 0.7338861666038448,
"grad_norm": 0.5566866397857666,
"learning_rate": 8.594790297807667e-06,
"loss": 0.6765,
"step": 3245
},
{
"epoch": 0.7350169619298907,
"grad_norm": 0.36217668652534485,
"learning_rate": 8.526321489499067e-06,
"loss": 0.6592,
"step": 3250
},
{
"epoch": 0.7361477572559367,
"grad_norm": 0.3255480229854584,
"learning_rate": 8.458070382644382e-06,
"loss": 0.7567,
"step": 3255
},
{
"epoch": 0.7372785525819827,
"grad_norm": 0.4506484866142273,
"learning_rate": 8.390037879189422e-06,
"loss": 0.6732,
"step": 3260
},
{
"epoch": 0.7384093479080287,
"grad_norm": 0.4981943368911743,
"learning_rate": 8.322224878191126e-06,
"loss": 0.6665,
"step": 3265
},
{
"epoch": 0.7395401432340747,
"grad_norm": 0.36179500818252563,
"learning_rate": 8.25463227580567e-06,
"loss": 0.6821,
"step": 3270
},
{
"epoch": 0.7406709385601207,
"grad_norm": 0.34908345341682434,
"learning_rate": 8.187260965276666e-06,
"loss": 0.6194,
"step": 3275
},
{
"epoch": 0.7418017338861667,
"grad_norm": 0.3363327980041504,
"learning_rate": 8.120111836923283e-06,
"loss": 0.6294,
"step": 3280
},
{
"epoch": 0.7429325292122126,
"grad_norm": 0.34136900305747986,
"learning_rate": 8.053185778128594e-06,
"loss": 0.6208,
"step": 3285
},
{
"epoch": 0.7440633245382586,
"grad_norm": 0.37522128224372864,
"learning_rate": 7.986483673327724e-06,
"loss": 0.6751,
"step": 3290
},
{
"epoch": 0.7451941198643046,
"grad_norm": 0.34423232078552246,
"learning_rate": 7.92000640399626e-06,
"loss": 0.733,
"step": 3295
},
{
"epoch": 0.7463249151903506,
"grad_norm": 0.4137992858886719,
"learning_rate": 7.853754848638542e-06,
"loss": 0.7044,
"step": 3300
},
{
"epoch": 0.7474557105163966,
"grad_norm": 0.36555016040802,
"learning_rate": 7.787729882776065e-06,
"loss": 0.6735,
"step": 3305
},
{
"epoch": 0.7485865058424426,
"grad_norm": 0.33875149488449097,
"learning_rate": 7.721932378935973e-06,
"loss": 0.732,
"step": 3310
},
{
"epoch": 0.7497173011684886,
"grad_norm": 0.32351580262184143,
"learning_rate": 7.656363206639409e-06,
"loss": 0.7191,
"step": 3315
},
{
"epoch": 0.7508480964945344,
"grad_norm": 0.48030319809913635,
"learning_rate": 7.591023232390138e-06,
"loss": 0.6972,
"step": 3320
},
{
"epoch": 0.7519788918205804,
"grad_norm": 0.381740540266037,
"learning_rate": 7.525913319663011e-06,
"loss": 0.6752,
"step": 3325
},
{
"epoch": 0.7531096871466264,
"grad_norm": 0.3707197308540344,
"learning_rate": 7.461034328892621e-06,
"loss": 0.6924,
"step": 3330
},
{
"epoch": 0.7542404824726724,
"grad_norm": 0.4179406762123108,
"learning_rate": 7.3963871174618945e-06,
"loss": 0.6774,
"step": 3335
},
{
"epoch": 0.7553712777987184,
"grad_norm": 0.3096112012863159,
"learning_rate": 7.3319725396907485e-06,
"loss": 0.6671,
"step": 3340
},
{
"epoch": 0.7565020731247644,
"grad_norm": 0.37638741731643677,
"learning_rate": 7.267791446824854e-06,
"loss": 0.739,
"step": 3345
},
{
"epoch": 0.7576328684508103,
"grad_norm": 0.33642110228538513,
"learning_rate": 7.2038446870243195e-06,
"loss": 0.6591,
"step": 3350
},
{
"epoch": 0.7587636637768563,
"grad_norm": 0.3964068591594696,
"learning_rate": 7.140133105352545e-06,
"loss": 0.6936,
"step": 3355
},
{
"epoch": 0.7598944591029023,
"grad_norm": 0.42048409581184387,
"learning_rate": 7.076657543765008e-06,
"loss": 0.729,
"step": 3360
},
{
"epoch": 0.7610252544289483,
"grad_norm": 0.3949214518070221,
"learning_rate": 7.013418841098174e-06,
"loss": 0.7064,
"step": 3365
},
{
"epoch": 0.7621560497549943,
"grad_norm": 0.38450565934181213,
"learning_rate": 6.95041783305837e-06,
"loss": 0.6666,
"step": 3370
},
{
"epoch": 0.7632868450810403,
"grad_norm": 0.33812659978866577,
"learning_rate": 6.887655352210765e-06,
"loss": 0.6572,
"step": 3375
},
{
"epoch": 0.7644176404070863,
"grad_norm": 0.373017281293869,
"learning_rate": 6.825132227968378e-06,
"loss": 0.7411,
"step": 3380
},
{
"epoch": 0.7655484357331322,
"grad_norm": 0.36028534173965454,
"learning_rate": 6.7628492865810995e-06,
"loss": 0.6234,
"step": 3385
},
{
"epoch": 0.7666792310591782,
"grad_norm": 0.3726188838481903,
"learning_rate": 6.700807351124785e-06,
"loss": 0.6261,
"step": 3390
},
{
"epoch": 0.7678100263852242,
"grad_norm": 0.32167547941207886,
"learning_rate": 6.639007241490347e-06,
"loss": 0.7218,
"step": 3395
},
{
"epoch": 0.7689408217112702,
"grad_norm": 0.3346633315086365,
"learning_rate": 6.5774497743729734e-06,
"loss": 0.6264,
"step": 3400
},
{
"epoch": 0.7700716170373162,
"grad_norm": 0.5438939929008484,
"learning_rate": 6.5161357632612745e-06,
"loss": 0.6799,
"step": 3405
},
{
"epoch": 0.7712024123633622,
"grad_norm": 0.7162203192710876,
"learning_rate": 6.4550660184265866e-06,
"loss": 0.7282,
"step": 3410
},
{
"epoch": 0.7723332076894082,
"grad_norm": 0.3571074306964874,
"learning_rate": 6.394241346912236e-06,
"loss": 0.7061,
"step": 3415
},
{
"epoch": 0.7734640030154541,
"grad_norm": 0.37110117077827454,
"learning_rate": 6.333662552522865e-06,
"loss": 0.6464,
"step": 3420
},
{
"epoch": 0.7745947983415001,
"grad_norm": 0.42400380969047546,
"learning_rate": 6.273330435813837e-06,
"loss": 0.6814,
"step": 3425
},
{
"epoch": 0.7757255936675461,
"grad_norm": 0.41441938281059265,
"learning_rate": 6.213245794080641e-06,
"loss": 0.6435,
"step": 3430
},
{
"epoch": 0.7768563889935921,
"grad_norm": 0.38226518034935,
"learning_rate": 6.153409421348358e-06,
"loss": 0.6979,
"step": 3435
},
{
"epoch": 0.7779871843196381,
"grad_norm": 0.28945887088775635,
"learning_rate": 6.093822108361163e-06,
"loss": 0.6509,
"step": 3440
},
{
"epoch": 0.7791179796456841,
"grad_norm": 0.3852415680885315,
"learning_rate": 6.034484642571866e-06,
"loss": 0.7581,
"step": 3445
},
{
"epoch": 0.7802487749717301,
"grad_norm": 0.36283373832702637,
"learning_rate": 5.975397808131549e-06,
"loss": 0.6021,
"step": 3450
},
{
"epoch": 0.781379570297776,
"grad_norm": 0.3490721583366394,
"learning_rate": 5.916562385879151e-06,
"loss": 0.6571,
"step": 3455
},
{
"epoch": 0.782510365623822,
"grad_norm": 0.32459717988967896,
"learning_rate": 5.857979153331189e-06,
"loss": 0.6211,
"step": 3460
},
{
"epoch": 0.783641160949868,
"grad_norm": 0.37339502573013306,
"learning_rate": 5.799648884671441e-06,
"loss": 0.6819,
"step": 3465
},
{
"epoch": 0.784771956275914,
"grad_norm": 0.35320839285850525,
"learning_rate": 5.741572350740768e-06,
"loss": 0.7348,
"step": 3470
},
{
"epoch": 0.78590275160196,
"grad_norm": 0.2820529043674469,
"learning_rate": 5.68375031902687e-06,
"loss": 0.6302,
"step": 3475
},
{
"epoch": 0.787033546928006,
"grad_norm": 0.35477685928344727,
"learning_rate": 5.626183553654194e-06,
"loss": 0.6241,
"step": 3480
},
{
"epoch": 0.788164342254052,
"grad_norm": 0.36558765172958374,
"learning_rate": 5.5688728153738155e-06,
"loss": 0.6594,
"step": 3485
},
{
"epoch": 0.789295137580098,
"grad_norm": 0.3570399880409241,
"learning_rate": 5.511818861553364e-06,
"loss": 0.6271,
"step": 3490
},
{
"epoch": 0.7904259329061439,
"grad_norm": 0.4297529458999634,
"learning_rate": 5.45502244616706e-06,
"loss": 0.7279,
"step": 3495
},
{
"epoch": 0.7915567282321899,
"grad_norm": 0.3277917504310608,
"learning_rate": 5.398484319785688e-06,
"loss": 0.7204,
"step": 3500
},
{
"epoch": 0.7926875235582359,
"grad_norm": 0.35679319500923157,
"learning_rate": 5.342205229566774e-06,
"loss": 0.6979,
"step": 3505
},
{
"epoch": 0.7938183188842819,
"grad_norm": 0.5490666627883911,
"learning_rate": 5.286185919244599e-06,
"loss": 0.7884,
"step": 3510
},
{
"epoch": 0.7949491142103279,
"grad_norm": 0.3300570845603943,
"learning_rate": 5.230427129120441e-06,
"loss": 0.6661,
"step": 3515
},
{
"epoch": 0.7960799095363739,
"grad_norm": 0.34464097023010254,
"learning_rate": 5.174929596052791e-06,
"loss": 0.729,
"step": 3520
},
{
"epoch": 0.7972107048624198,
"grad_norm": 0.36439618468284607,
"learning_rate": 5.119694053447566e-06,
"loss": 0.6483,
"step": 3525
},
{
"epoch": 0.7983415001884658,
"grad_norm": 0.3646329939365387,
"learning_rate": 5.064721231248498e-06,
"loss": 0.6497,
"step": 3530
},
{
"epoch": 0.7994722955145118,
"grad_norm": 0.42587414383888245,
"learning_rate": 5.010011855927393e-06,
"loss": 0.6638,
"step": 3535
},
{
"epoch": 0.8006030908405578,
"grad_norm": 0.3738311529159546,
"learning_rate": 4.955566650474616e-06,
"loss": 0.806,
"step": 3540
},
{
"epoch": 0.8017338861666038,
"grad_norm": 0.4998151659965515,
"learning_rate": 4.90138633438946e-06,
"loss": 0.6658,
"step": 3545
},
{
"epoch": 0.8028646814926498,
"grad_norm": 0.39495596289634705,
"learning_rate": 4.847471623670713e-06,
"loss": 0.7759,
"step": 3550
},
{
"epoch": 0.8039954768186958,
"grad_norm": 0.38152778148651123,
"learning_rate": 4.79382323080714e-06,
"loss": 0.6445,
"step": 3555
},
{
"epoch": 0.8051262721447418,
"grad_norm": 0.5026568174362183,
"learning_rate": 4.740441864768086e-06,
"loss": 0.7176,
"step": 3560
},
{
"epoch": 0.8062570674707877,
"grad_norm": 0.3014233112335205,
"learning_rate": 4.687328230994118e-06,
"loss": 0.6597,
"step": 3565
},
{
"epoch": 0.8073878627968337,
"grad_norm": 0.4386585056781769,
"learning_rate": 4.634483031387676e-06,
"loss": 0.7718,
"step": 3570
},
{
"epoch": 0.8085186581228797,
"grad_norm": 0.3882271647453308,
"learning_rate": 4.581906964303825e-06,
"loss": 0.6668,
"step": 3575
},
{
"epoch": 0.8096494534489257,
"grad_norm": 0.3510667681694031,
"learning_rate": 4.529600724541022e-06,
"loss": 0.7296,
"step": 3580
},
{
"epoch": 0.8107802487749717,
"grad_norm": 0.5134342908859253,
"learning_rate": 4.477565003331904e-06,
"loss": 0.7208,
"step": 3585
},
{
"epoch": 0.8119110441010177,
"grad_norm": 0.32369402050971985,
"learning_rate": 4.4258004883342e-06,
"loss": 0.6951,
"step": 3590
},
{
"epoch": 0.8130418394270637,
"grad_norm": 0.4089120030403137,
"learning_rate": 4.3743078636215935e-06,
"loss": 0.6571,
"step": 3595
},
{
"epoch": 0.8141726347531096,
"grad_norm": 0.3248507082462311,
"learning_rate": 4.323087809674733e-06,
"loss": 0.6267,
"step": 3600
},
{
"epoch": 0.8153034300791556,
"grad_norm": 0.3590109348297119,
"learning_rate": 4.2721410033722014e-06,
"loss": 0.6919,
"step": 3605
},
{
"epoch": 0.8164342254052016,
"grad_norm": 0.3924254775047302,
"learning_rate": 4.221468117981592e-06,
"loss": 0.6,
"step": 3610
},
{
"epoch": 0.8175650207312476,
"grad_norm": 0.42247772216796875,
"learning_rate": 4.1710698231505975e-06,
"loss": 0.6375,
"step": 3615
},
{
"epoch": 0.8186958160572936,
"grad_norm": 0.3658187985420227,
"learning_rate": 4.120946784898156e-06,
"loss": 0.7743,
"step": 3620
},
{
"epoch": 0.8198266113833396,
"grad_norm": 0.39758992195129395,
"learning_rate": 4.071099665605682e-06,
"loss": 0.6259,
"step": 3625
},
{
"epoch": 0.8209574067093856,
"grad_norm": 0.45203524827957153,
"learning_rate": 4.021529124008278e-06,
"loss": 0.7297,
"step": 3630
},
{
"epoch": 0.8220882020354315,
"grad_norm": 0.43119361996650696,
"learning_rate": 3.9722358151860515e-06,
"loss": 0.6612,
"step": 3635
},
{
"epoch": 0.8232189973614775,
"grad_norm": 0.41796061396598816,
"learning_rate": 3.923220390555432e-06,
"loss": 0.7526,
"step": 3640
},
{
"epoch": 0.8243497926875235,
"grad_norm": 0.33241549134254456,
"learning_rate": 3.87448349786059e-06,
"loss": 0.6832,
"step": 3645
},
{
"epoch": 0.8254805880135695,
"grad_norm": 0.3728543817996979,
"learning_rate": 3.826025781164874e-06,
"loss": 0.6604,
"step": 3650
},
{
"epoch": 0.8266113833396155,
"grad_norm": 0.297720342874527,
"learning_rate": 3.7778478808422753e-06,
"loss": 0.7111,
"step": 3655
},
{
"epoch": 0.8277421786656615,
"grad_norm": 0.3133184015750885,
"learning_rate": 3.7299504335689905e-06,
"loss": 0.6552,
"step": 3660
},
{
"epoch": 0.8288729739917075,
"grad_norm": 0.3344557583332062,
"learning_rate": 3.682334072314994e-06,
"loss": 0.6516,
"step": 3665
},
{
"epoch": 0.8300037693177534,
"grad_norm": 0.33505749702453613,
"learning_rate": 3.6349994263356806e-06,
"loss": 0.6788,
"step": 3670
},
{
"epoch": 0.8311345646437994,
"grad_norm": 0.32801946997642517,
"learning_rate": 3.587947121163551e-06,
"loss": 0.6627,
"step": 3675
},
{
"epoch": 0.8322653599698454,
"grad_norm": 0.3641601800918579,
"learning_rate": 3.541177778599944e-06,
"loss": 0.6904,
"step": 3680
},
{
"epoch": 0.8333961552958914,
"grad_norm": 0.3527655005455017,
"learning_rate": 3.494692016706799e-06,
"loss": 0.7227,
"step": 3685
},
{
"epoch": 0.8345269506219374,
"grad_norm": 0.32480356097221375,
"learning_rate": 3.4484904497985167e-06,
"loss": 0.6718,
"step": 3690
},
{
"epoch": 0.8356577459479834,
"grad_norm": 0.40660572052001953,
"learning_rate": 3.4025736884338326e-06,
"loss": 0.7252,
"step": 3695
},
{
"epoch": 0.8367885412740294,
"grad_norm": 0.3736969530582428,
"learning_rate": 3.356942339407748e-06,
"loss": 0.6344,
"step": 3700
},
{
"epoch": 0.8379193366000753,
"grad_norm": 0.3524364233016968,
"learning_rate": 3.311597005743508e-06,
"loss": 0.6561,
"step": 3705
},
{
"epoch": 0.8390501319261213,
"grad_norm": 0.346884548664093,
"learning_rate": 3.26653828668462e-06,
"loss": 0.7203,
"step": 3710
},
{
"epoch": 0.8401809272521673,
"grad_norm": 0.3504559099674225,
"learning_rate": 3.2217667776869716e-06,
"loss": 0.6846,
"step": 3715
},
{
"epoch": 0.8413117225782133,
"grad_norm": 0.4117507338523865,
"learning_rate": 3.1772830704109108e-06,
"loss": 0.7109,
"step": 3720
},
{
"epoch": 0.8424425179042593,
"grad_norm": 0.35699552297592163,
"learning_rate": 3.133087752713479e-06,
"loss": 0.7086,
"step": 3725
},
{
"epoch": 0.8435733132303053,
"grad_norm": 0.4353185296058655,
"learning_rate": 3.089181408640612e-06,
"loss": 0.6974,
"step": 3730
},
{
"epoch": 0.8447041085563513,
"grad_norm": 0.35224634408950806,
"learning_rate": 3.0455646184194137e-06,
"loss": 0.695,
"step": 3735
},
{
"epoch": 0.8458349038823972,
"grad_norm": 0.3479955792427063,
"learning_rate": 3.0022379584505212e-06,
"loss": 0.7459,
"step": 3740
},
{
"epoch": 0.8469656992084432,
"grad_norm": 0.33437180519104004,
"learning_rate": 2.9592020013004455e-06,
"loss": 0.6236,
"step": 3745
},
{
"epoch": 0.8480964945344892,
"grad_norm": 0.3484211266040802,
"learning_rate": 2.9164573156940654e-06,
"loss": 0.6564,
"step": 3750
},
{
"epoch": 0.8492272898605352,
"grad_norm": 0.42290642857551575,
"learning_rate": 2.874004466507041e-06,
"loss": 0.8202,
"step": 3755
},
{
"epoch": 0.8503580851865812,
"grad_norm": 0.3348793089389801,
"learning_rate": 2.8318440147583862e-06,
"loss": 0.6083,
"step": 3760
},
{
"epoch": 0.8514888805126272,
"grad_norm": 0.45830124616622925,
"learning_rate": 2.7899765176030627e-06,
"loss": 0.6741,
"step": 3765
},
{
"epoch": 0.8526196758386732,
"grad_norm": 0.40784764289855957,
"learning_rate": 2.7484025283246034e-06,
"loss": 0.6632,
"step": 3770
},
{
"epoch": 0.8537504711647191,
"grad_norm": 0.31340643763542175,
"learning_rate": 2.707122596327805e-06,
"loss": 0.6891,
"step": 3775
},
{
"epoch": 0.8548812664907651,
"grad_norm": 0.5138049721717834,
"learning_rate": 2.6661372671314493e-06,
"loss": 0.7407,
"step": 3780
},
{
"epoch": 0.8560120618168111,
"grad_norm": 0.3300493359565735,
"learning_rate": 2.6254470823611323e-06,
"loss": 0.7163,
"step": 3785
},
{
"epoch": 0.8571428571428571,
"grad_norm": 0.4111888110637665,
"learning_rate": 2.585052579742059e-06,
"loss": 0.7343,
"step": 3790
},
{
"epoch": 0.8582736524689031,
"grad_norm": 0.3648470938205719,
"learning_rate": 2.5449542930919864e-06,
"loss": 0.6905,
"step": 3795
},
{
"epoch": 0.8594044477949491,
"grad_norm": 0.3930950164794922,
"learning_rate": 2.5051527523141356e-06,
"loss": 0.6164,
"step": 3800
},
{
"epoch": 0.860535243120995,
"grad_norm": 0.35205841064453125,
"learning_rate": 2.465648483390193e-06,
"loss": 0.6893,
"step": 3805
},
{
"epoch": 0.861666038447041,
"grad_norm": 0.3441345989704132,
"learning_rate": 2.4264420083733807e-06,
"loss": 0.6441,
"step": 3810
},
{
"epoch": 0.862796833773087,
"grad_norm": 0.3523414134979248,
"learning_rate": 2.387533845381518e-06,
"loss": 0.7179,
"step": 3815
},
{
"epoch": 0.863927629099133,
"grad_norm": 0.4754193425178528,
"learning_rate": 2.3489245085902194e-06,
"loss": 0.7682,
"step": 3820
},
{
"epoch": 0.865058424425179,
"grad_norm": 0.3973066508769989,
"learning_rate": 2.310614508226078e-06,
"loss": 0.6431,
"step": 3825
},
{
"epoch": 0.866189219751225,
"grad_norm": 0.49921613931655884,
"learning_rate": 2.2726043505599036e-06,
"loss": 0.7379,
"step": 3830
},
{
"epoch": 0.867320015077271,
"grad_norm": 0.3483542203903198,
"learning_rate": 2.2348945379000783e-06,
"loss": 0.6746,
"step": 3835
},
{
"epoch": 0.868450810403317,
"grad_norm": 0.409015417098999,
"learning_rate": 2.1974855685858663e-06,
"loss": 0.6205,
"step": 3840
},
{
"epoch": 0.8695816057293629,
"grad_norm": 0.38850805163383484,
"learning_rate": 2.1603779369808757e-06,
"loss": 0.6971,
"step": 3845
},
{
"epoch": 0.8707124010554089,
"grad_norm": 0.39465731382369995,
"learning_rate": 2.123572133466495e-06,
"loss": 0.6327,
"step": 3850
},
{
"epoch": 0.8718431963814549,
"grad_norm": 0.3345524072647095,
"learning_rate": 2.087068644435425e-06,
"loss": 0.6426,
"step": 3855
},
{
"epoch": 0.8729739917075009,
"grad_norm": 0.28288835287094116,
"learning_rate": 2.050867952285243e-06,
"loss": 0.5873,
"step": 3860
},
{
"epoch": 0.8741047870335469,
"grad_norm": 0.3995983898639679,
"learning_rate": 2.0149705354120224e-06,
"loss": 0.6867,
"step": 3865
},
{
"epoch": 0.8752355823595929,
"grad_norm": 0.4070720076560974,
"learning_rate": 1.9793768682040524e-06,
"loss": 0.726,
"step": 3870
},
{
"epoch": 0.8763663776856389,
"grad_norm": 0.37636858224868774,
"learning_rate": 1.9440874210355065e-06,
"loss": 0.6516,
"step": 3875
},
{
"epoch": 0.8774971730116848,
"grad_norm": 0.2892749309539795,
"learning_rate": 1.909102660260273e-06,
"loss": 0.6692,
"step": 3880
},
{
"epoch": 0.8786279683377308,
"grad_norm": 0.3219640851020813,
"learning_rate": 1.8744230482057673e-06,
"loss": 0.7656,
"step": 3885
},
{
"epoch": 0.8797587636637768,
"grad_norm": 0.4004978835582733,
"learning_rate": 1.8400490431668387e-06,
"loss": 0.7057,
"step": 3890
},
{
"epoch": 0.8808895589898228,
"grad_norm": 0.3124302327632904,
"learning_rate": 1.805981099399709e-06,
"loss": 0.6377,
"step": 3895
},
{
"epoch": 0.8820203543158688,
"grad_norm": 0.3707364797592163,
"learning_rate": 1.7722196671159542e-06,
"loss": 0.6751,
"step": 3900
},
{
"epoch": 0.8831511496419148,
"grad_norm": 0.38595885038375854,
"learning_rate": 1.7387651924765796e-06,
"loss": 0.6968,
"step": 3905
},
{
"epoch": 0.8842819449679608,
"grad_norm": 0.3807552754878998,
"learning_rate": 1.7056181175861025e-06,
"loss": 0.7338,
"step": 3910
},
{
"epoch": 0.8854127402940067,
"grad_norm": 0.40677499771118164,
"learning_rate": 1.6727788804867277e-06,
"loss": 0.713,
"step": 3915
},
{
"epoch": 0.8865435356200527,
"grad_norm": 0.5259581804275513,
"learning_rate": 1.6402479151525458e-06,
"loss": 0.6833,
"step": 3920
},
{
"epoch": 0.8876743309460987,
"grad_norm": 0.3447456359863281,
"learning_rate": 1.6080256514838077e-06,
"loss": 0.6712,
"step": 3925
},
{
"epoch": 0.8888051262721447,
"grad_norm": 0.31114619970321655,
"learning_rate": 1.5761125153012312e-06,
"loss": 0.73,
"step": 3930
},
{
"epoch": 0.8899359215981907,
"grad_norm": 0.29841819405555725,
"learning_rate": 1.5445089283403768e-06,
"loss": 0.6782,
"step": 3935
},
{
"epoch": 0.8910667169242367,
"grad_norm": 0.45541536808013916,
"learning_rate": 1.5132153082460908e-06,
"loss": 0.7093,
"step": 3940
},
{
"epoch": 0.8921975122502827,
"grad_norm": 0.31067731976509094,
"learning_rate": 1.482232068566966e-06,
"loss": 0.6212,
"step": 3945
},
{
"epoch": 0.8933283075763286,
"grad_norm": 0.3661406934261322,
"learning_rate": 1.4515596187498898e-06,
"loss": 0.6728,
"step": 3950
},
{
"epoch": 0.8944591029023746,
"grad_norm": 0.31056010723114014,
"learning_rate": 1.4211983641346154e-06,
"loss": 0.64,
"step": 3955
},
{
"epoch": 0.8955898982284206,
"grad_norm": 0.3716438114643097,
"learning_rate": 1.3911487059484362e-06,
"loss": 0.7058,
"step": 3960
},
{
"epoch": 0.8967206935544666,
"grad_norm": 0.3605138659477234,
"learning_rate": 1.3614110413008474e-06,
"loss": 0.7142,
"step": 3965
},
{
"epoch": 0.8978514888805126,
"grad_norm": 0.39523765444755554,
"learning_rate": 1.3319857631783227e-06,
"loss": 0.667,
"step": 3970
},
{
"epoch": 0.8989822842065586,
"grad_norm": 0.4720902144908905,
"learning_rate": 1.302873260439122e-06,
"loss": 0.7009,
"step": 3975
},
{
"epoch": 0.9001130795326046,
"grad_norm": 0.39917027950286865,
"learning_rate": 1.2740739178081274e-06,
"loss": 0.6236,
"step": 3980
},
{
"epoch": 0.9012438748586505,
"grad_norm": 0.38968703150749207,
"learning_rate": 1.2455881158717874e-06,
"loss": 0.6108,
"step": 3985
},
{
"epoch": 0.9023746701846965,
"grad_norm": 0.3744681775569916,
"learning_rate": 1.2174162310730764e-06,
"loss": 0.674,
"step": 3990
},
{
"epoch": 0.9035054655107425,
"grad_norm": 0.41147854924201965,
"learning_rate": 1.1895586357065197e-06,
"loss": 0.6971,
"step": 3995
},
{
"epoch": 0.9046362608367885,
"grad_norm": 0.4496522843837738,
"learning_rate": 1.1620156979132685e-06,
"loss": 0.7027,
"step": 4000
},
{
"epoch": 0.9057670561628345,
"grad_norm": 0.38566187024116516,
"learning_rate": 1.134787781676236e-06,
"loss": 0.6488,
"step": 4005
},
{
"epoch": 0.9068978514888805,
"grad_norm": 0.3715657591819763,
"learning_rate": 1.1078752468153042e-06,
"loss": 0.6727,
"step": 4010
},
{
"epoch": 0.9080286468149265,
"grad_norm": 0.3041117787361145,
"learning_rate": 1.0812784489825507e-06,
"loss": 0.6763,
"step": 4015
},
{
"epoch": 0.9091594421409724,
"grad_norm": 0.40202027559280396,
"learning_rate": 1.054997739657551e-06,
"loss": 0.652,
"step": 4020
},
{
"epoch": 0.9102902374670184,
"grad_norm": 0.41109445691108704,
"learning_rate": 1.029033466142737e-06,
"loss": 0.7183,
"step": 4025
},
{
"epoch": 0.9114210327930644,
"grad_norm": 0.31253260374069214,
"learning_rate": 1.0033859715588122e-06,
"loss": 0.6929,
"step": 4030
},
{
"epoch": 0.9125518281191104,
"grad_norm": 0.4093742072582245,
"learning_rate": 9.780555948401994e-07,
"loss": 0.7043,
"step": 4035
},
{
"epoch": 0.9136826234451564,
"grad_norm": 0.3625013828277588,
"learning_rate": 9.530426707305918e-07,
"loss": 0.7268,
"step": 4040
},
{
"epoch": 0.9148134187712024,
"grad_norm": 0.34964805841445923,
"learning_rate": 9.283475297785005e-07,
"loss": 0.6746,
"step": 4045
},
{
"epoch": 0.9159442140972484,
"grad_norm": 0.3727870285511017,
"learning_rate": 9.039704983328984e-07,
"loss": 0.6868,
"step": 4050
},
{
"epoch": 0.9170750094232943,
"grad_norm": 0.5575105547904968,
"learning_rate": 8.799118985389126e-07,
"loss": 0.7606,
"step": 4055
},
{
"epoch": 0.9182058047493403,
"grad_norm": 0.38986077904701233,
"learning_rate": 8.561720483335478e-07,
"loss": 0.6885,
"step": 4060
},
{
"epoch": 0.9193366000753863,
"grad_norm": 0.35799407958984375,
"learning_rate": 8.327512614415195e-07,
"loss": 0.6676,
"step": 4065
},
{
"epoch": 0.9204673954014323,
"grad_norm": 0.3649112284183502,
"learning_rate": 8.09649847371069e-07,
"loss": 0.6487,
"step": 4070
},
{
"epoch": 0.9215981907274783,
"grad_norm": 0.4163808226585388,
"learning_rate": 7.868681114098914e-07,
"loss": 0.6342,
"step": 4075
},
{
"epoch": 0.9227289860535243,
"grad_norm": 0.34704506397247314,
"learning_rate": 7.644063546211167e-07,
"loss": 0.6623,
"step": 4080
},
{
"epoch": 0.9238597813795703,
"grad_norm": 0.3512720465660095,
"learning_rate": 7.422648738392934e-07,
"loss": 0.6688,
"step": 4085
},
{
"epoch": 0.9249905767056162,
"grad_norm": 0.33337023854255676,
"learning_rate": 7.204439616665115e-07,
"loss": 0.6587,
"step": 4090
},
{
"epoch": 0.9261213720316622,
"grad_norm": 0.35821977257728577,
"learning_rate": 6.989439064684911e-07,
"loss": 0.6823,
"step": 4095
},
{
"epoch": 0.9272521673577082,
"grad_norm": 0.46636807918548584,
"learning_rate": 6.777649923708024e-07,
"loss": 0.7261,
"step": 4100
},
{
"epoch": 0.9283829626837542,
"grad_norm": 0.4258100688457489,
"learning_rate": 6.569074992551022e-07,
"loss": 0.6615,
"step": 4105
},
{
"epoch": 0.9295137580098002,
"grad_norm": 0.39648309350013733,
"learning_rate": 6.363717027554256e-07,
"loss": 0.7147,
"step": 4110
},
{
"epoch": 0.9306445533358462,
"grad_norm": 0.41724279522895813,
"learning_rate": 6.161578742545665e-07,
"loss": 0.6852,
"step": 4115
},
{
"epoch": 0.9317753486618922,
"grad_norm": 0.3736780285835266,
"learning_rate": 5.962662808804587e-07,
"loss": 0.717,
"step": 4120
},
{
"epoch": 0.9329061439879381,
"grad_norm": 0.3304630517959595,
"learning_rate": 5.766971855026809e-07,
"loss": 0.6539,
"step": 4125
},
{
"epoch": 0.9340369393139841,
"grad_norm": 0.39884060621261597,
"learning_rate": 5.574508467289518e-07,
"loss": 0.7029,
"step": 4130
},
{
"epoch": 0.9351677346400301,
"grad_norm": 0.49207261204719543,
"learning_rate": 5.385275189017353e-07,
"loss": 0.7092,
"step": 4135
},
{
"epoch": 0.9362985299660761,
"grad_norm": 0.3424831032752991,
"learning_rate": 5.199274520948677e-07,
"loss": 0.6355,
"step": 4140
},
{
"epoch": 0.9374293252921221,
"grad_norm": 0.32665055990219116,
"learning_rate": 5.01650892110253e-07,
"loss": 0.6958,
"step": 4145
},
{
"epoch": 0.9385601206181681,
"grad_norm": 0.38883742690086365,
"learning_rate": 4.836980804746261e-07,
"loss": 0.6334,
"step": 4150
},
{
"epoch": 0.9396909159442141,
"grad_norm": 0.3798101842403412,
"learning_rate": 4.660692544363382e-07,
"loss": 0.781,
"step": 4155
},
{
"epoch": 0.94082171127026,
"grad_norm": 0.4334189295768738,
"learning_rate": 4.487646469622464e-07,
"loss": 0.7235,
"step": 4160
},
{
"epoch": 0.941952506596306,
"grad_norm": 0.34717586636543274,
"learning_rate": 4.31784486734621e-07,
"loss": 0.669,
"step": 4165
},
{
"epoch": 0.943083301922352,
"grad_norm": 0.3831476867198944,
"learning_rate": 4.1512899814813156e-07,
"loss": 0.6536,
"step": 4170
},
{
"epoch": 0.944214097248398,
"grad_norm": 0.35673925280570984,
"learning_rate": 3.9879840130686576e-07,
"loss": 0.6853,
"step": 4175
},
{
"epoch": 0.945344892574444,
"grad_norm": 0.48463523387908936,
"learning_rate": 3.82792912021443e-07,
"loss": 0.7454,
"step": 4180
},
{
"epoch": 0.94647568790049,
"grad_norm": 0.36075058579444885,
"learning_rate": 3.6711274180614153e-07,
"loss": 0.6806,
"step": 4185
},
{
"epoch": 0.947606483226536,
"grad_norm": 0.42450064420700073,
"learning_rate": 3.517580978761148e-07,
"loss": 0.7356,
"step": 4190
},
{
"epoch": 0.9487372785525819,
"grad_norm": 0.36473605036735535,
"learning_rate": 3.3672918314466007e-07,
"loss": 0.717,
"step": 4195
},
{
"epoch": 0.9498680738786279,
"grad_norm": 0.4312973916530609,
"learning_rate": 3.220261962205179e-07,
"loss": 0.6991,
"step": 4200
},
{
"epoch": 0.9509988692046739,
"grad_norm": 0.3392401933670044,
"learning_rate": 3.0764933140525475e-07,
"loss": 0.6468,
"step": 4205
},
{
"epoch": 0.9521296645307199,
"grad_norm": 0.426111102104187,
"learning_rate": 2.935987786907124e-07,
"loss": 0.7004,
"step": 4210
},
{
"epoch": 0.9532604598567659,
"grad_norm": 0.3565954864025116,
"learning_rate": 2.7987472375646804e-07,
"loss": 0.7199,
"step": 4215
},
{
"epoch": 0.9543912551828119,
"grad_norm": 0.3929762542247772,
"learning_rate": 2.664773479674032e-07,
"loss": 0.7348,
"step": 4220
},
{
"epoch": 0.9555220505088579,
"grad_norm": 0.4081243872642517,
"learning_rate": 2.5340682837129146e-07,
"loss": 0.662,
"step": 4225
},
{
"epoch": 0.9566528458349038,
"grad_norm": 0.3799093961715698,
"learning_rate": 2.406633376964784e-07,
"loss": 0.6571,
"step": 4230
},
{
"epoch": 0.9577836411609498,
"grad_norm": 0.35904359817504883,
"learning_rate": 2.2824704434957766e-07,
"loss": 0.7287,
"step": 4235
},
{
"epoch": 0.9589144364869958,
"grad_norm": 0.38494235277175903,
"learning_rate": 2.1615811241325613e-07,
"loss": 0.71,
"step": 4240
},
{
"epoch": 0.9600452318130418,
"grad_norm": 0.4001871943473816,
"learning_rate": 2.0439670164406345e-07,
"loss": 0.7414,
"step": 4245
},
{
"epoch": 0.9611760271390878,
"grad_norm": 0.33206847310066223,
"learning_rate": 1.929629674703226e-07,
"loss": 0.6673,
"step": 4250
},
{
"epoch": 0.9623068224651338,
"grad_norm": 0.3849842846393585,
"learning_rate": 1.8185706099007883e-07,
"loss": 0.7487,
"step": 4255
},
{
"epoch": 0.9634376177911798,
"grad_norm": 0.33372604846954346,
"learning_rate": 1.7107912896908995e-07,
"loss": 0.6522,
"step": 4260
},
{
"epoch": 0.9645684131172257,
"grad_norm": 0.4485720992088318,
"learning_rate": 1.6062931383890312e-07,
"loss": 0.6499,
"step": 4265
},
{
"epoch": 0.9656992084432717,
"grad_norm": 0.5238537788391113,
"learning_rate": 1.5050775369495895e-07,
"loss": 0.6708,
"step": 4270
},
{
"epoch": 0.9668300037693177,
"grad_norm": 0.31111645698547363,
"learning_rate": 1.4071458229478196e-07,
"loss": 0.6394,
"step": 4275
},
{
"epoch": 0.9679607990953637,
"grad_norm": 0.3432201147079468,
"learning_rate": 1.3124992905619028e-07,
"loss": 0.7097,
"step": 4280
},
{
"epoch": 0.9690915944214097,
"grad_norm": 0.38018399477005005,
"learning_rate": 1.2211391905561086e-07,
"loss": 0.711,
"step": 4285
},
{
"epoch": 0.9702223897474557,
"grad_norm": 0.36301785707473755,
"learning_rate": 1.1330667302641151e-07,
"loss": 0.664,
"step": 4290
},
{
"epoch": 0.9713531850735017,
"grad_norm": 0.3279567360877991,
"learning_rate": 1.0482830735730198e-07,
"loss": 0.6311,
"step": 4295
},
{
"epoch": 0.9724839803995476,
"grad_norm": 0.31279444694519043,
"learning_rate": 9.66789340908103e-08,
"loss": 0.6588,
"step": 4300
},
{
"epoch": 0.9736147757255936,
"grad_norm": 0.38352543115615845,
"learning_rate": 8.885866092178952e-08,
"loss": 0.6798,
"step": 4305
},
{
"epoch": 0.9747455710516396,
"grad_norm": 0.3573386073112488,
"learning_rate": 8.136759119600213e-08,
"loss": 0.6686,
"step": 4310
},
{
"epoch": 0.9758763663776856,
"grad_norm": 0.3751896917819977,
"learning_rate": 7.42058239087462e-08,
"loss": 0.7003,
"step": 4315
},
{
"epoch": 0.9770071617037316,
"grad_norm": 0.425658643245697,
"learning_rate": 6.737345370355919e-08,
"loss": 0.7152,
"step": 4320
},
{
"epoch": 0.9781379570297776,
"grad_norm": 0.34428244829177856,
"learning_rate": 6.087057087095504e-08,
"loss": 0.7485,
"step": 4325
},
{
"epoch": 0.9792687523558236,
"grad_norm": 0.4021622836589813,
"learning_rate": 5.469726134723907e-08,
"loss": 0.7458,
"step": 4330
},
{
"epoch": 0.9803995476818695,
"grad_norm": 0.364872545003891,
"learning_rate": 4.885360671336714e-08,
"loss": 0.7113,
"step": 4335
},
{
"epoch": 0.9815303430079155,
"grad_norm": 0.4006316363811493,
"learning_rate": 4.3339684193871576e-08,
"loss": 0.7263,
"step": 4340
},
{
"epoch": 0.9826611383339615,
"grad_norm": 0.3257283866405487,
"learning_rate": 3.8155566655839746e-08,
"loss": 0.6425,
"step": 4345
},
{
"epoch": 0.9837919336600075,
"grad_norm": 0.37064969539642334,
"learning_rate": 3.330132260794538e-08,
"loss": 0.678,
"step": 4350
},
{
"epoch": 0.9849227289860535,
"grad_norm": 0.4389401078224182,
"learning_rate": 2.8777016199554863e-08,
"loss": 0.651,
"step": 4355
},
{
"epoch": 0.9860535243120995,
"grad_norm": 0.4058115482330322,
"learning_rate": 2.4582707219866772e-08,
"loss": 0.7288,
"step": 4360
},
{
"epoch": 0.9871843196381455,
"grad_norm": 0.36480987071990967,
"learning_rate": 2.0718451097134773e-08,
"loss": 0.7054,
"step": 4365
},
{
"epoch": 0.9883151149641914,
"grad_norm": 0.33493801951408386,
"learning_rate": 1.718429889792095e-08,
"loss": 0.6635,
"step": 4370
},
{
"epoch": 0.9894459102902374,
"grad_norm": 0.4736829698085785,
"learning_rate": 1.3980297326432468e-08,
"loss": 0.6585,
"step": 4375
},
{
"epoch": 0.9905767056162834,
"grad_norm": 0.4238462746143341,
"learning_rate": 1.110648872389708e-08,
"loss": 0.6614,
"step": 4380
},
{
"epoch": 0.9917075009423294,
"grad_norm": 0.32537785172462463,
"learning_rate": 8.56291106801077e-09,
"loss": 0.6681,
"step": 4385
},
{
"epoch": 0.9928382962683754,
"grad_norm": 0.4619493782520294,
"learning_rate": 6.349597972424293e-09,
"loss": 0.6741,
"step": 4390
},
{
"epoch": 0.9939690915944214,
"grad_norm": 0.35761797428131104,
"learning_rate": 4.4665786863185014e-09,
"loss": 0.6885,
"step": 4395
},
{
"epoch": 0.9950998869204674,
"grad_norm": 0.3071589767932892,
"learning_rate": 2.913878093990796e-09,
"loss": 0.6676,
"step": 4400
},
{
"epoch": 0.9962306822465133,
"grad_norm": 0.3391641080379486,
"learning_rate": 1.6915167145525878e-09,
"loss": 0.6981,
"step": 4405
},
{
"epoch": 0.9973614775725593,
"grad_norm": 0.41276663541793823,
"learning_rate": 7.995107016406378e-10,
"loss": 0.6594,
"step": 4410
},
{
"epoch": 0.9984922728986053,
"grad_norm": 0.4570743143558502,
"learning_rate": 2.3787184321444335e-10,
"loss": 0.7059,
"step": 4415
},
{
"epoch": 0.9996230682246513,
"grad_norm": 0.3156117796897888,
"learning_rate": 6.607561386928751e-12,
"loss": 0.6465,
"step": 4420
},
{
"epoch": 0.9998492272898606,
"step": 4421,
"total_flos": 4.746119130111803e+18,
"train_loss": 0.6650473316231887,
"train_runtime": 22780.4973,
"train_samples_per_second": 9.316,
"train_steps_per_second": 0.194
}
],
"logging_steps": 5,
"max_steps": 4421,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.746119130111803e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}