vit-base-beans / trainer_state.json
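This trainer_state.json holds the Hugging Face Trainer state for a 5-epoch ViT fine-tune on the beans dataset (12,285 global steps): each log_history entry records epoch, grad_norm, learning_rate, loss, and step at 10-step intervals, with eval_steps set to 500. A minimal sketch for inspecting the log (assuming the JSON has been saved locally as trainer_state.json and matplotlib is installed; this script is not part of the original repository):

```python
import json

import matplotlib.pyplot as plt

# Load the trainer state dumped by the Hugging Face Trainer.
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only training entries, i.e. those carrying a "loss" key
# (evaluation entries, if present, typically use "eval_loss" instead).
train_logs = [entry for entry in state["log_history"] if "loss" in entry]
steps = [entry["step"] for entry in train_logs]
losses = [entry["loss"] for entry in train_logs]

# Plot the logged training loss against the global step.
plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("vit-base-beans training loss")
plt.show()
```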
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 12285,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.3860865831375122,
"learning_rate": 1.9983719983719984e-05,
"loss": 0.684,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 1.3011243343353271,
"learning_rate": 1.996743996743997e-05,
"loss": 0.6568,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 1.646723985671997,
"learning_rate": 1.9951159951159952e-05,
"loss": 0.6477,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 1.43569016456604,
"learning_rate": 1.9934879934879937e-05,
"loss": 0.61,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 1.4323921203613281,
"learning_rate": 1.991859991859992e-05,
"loss": 0.5758,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 2.1664583683013916,
"learning_rate": 1.9902319902319905e-05,
"loss": 0.5529,
"step": 60
},
{
"epoch": 0.03,
"grad_norm": 1.935644268989563,
"learning_rate": 1.9886039886039888e-05,
"loss": 0.4969,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 2.984022617340088,
"learning_rate": 1.986975986975987e-05,
"loss": 0.5017,
"step": 80
},
{
"epoch": 0.04,
"grad_norm": 1.9753074645996094,
"learning_rate": 1.9853479853479855e-05,
"loss": 0.4438,
"step": 90
},
{
"epoch": 0.04,
"grad_norm": 4.39138650894165,
"learning_rate": 1.9837199837199838e-05,
"loss": 0.4033,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 3.0486788749694824,
"learning_rate": 1.9820919820919823e-05,
"loss": 0.3642,
"step": 110
},
{
"epoch": 0.05,
"grad_norm": 1.738529920578003,
"learning_rate": 1.9804639804639806e-05,
"loss": 0.3557,
"step": 120
},
{
"epoch": 0.05,
"grad_norm": 2.9336562156677246,
"learning_rate": 1.978835978835979e-05,
"loss": 0.3655,
"step": 130
},
{
"epoch": 0.06,
"grad_norm": 2.0220277309417725,
"learning_rate": 1.9772079772079773e-05,
"loss": 0.2903,
"step": 140
},
{
"epoch": 0.06,
"grad_norm": 2.4428532123565674,
"learning_rate": 1.975579975579976e-05,
"loss": 0.2706,
"step": 150
},
{
"epoch": 0.07,
"grad_norm": 5.031763076782227,
"learning_rate": 1.973951973951974e-05,
"loss": 0.3558,
"step": 160
},
{
"epoch": 0.07,
"grad_norm": 3.3514373302459717,
"learning_rate": 1.9723239723239724e-05,
"loss": 0.232,
"step": 170
},
{
"epoch": 0.07,
"grad_norm": 1.0613574981689453,
"learning_rate": 1.970695970695971e-05,
"loss": 0.2499,
"step": 180
},
{
"epoch": 0.08,
"grad_norm": 7.994803428649902,
"learning_rate": 1.969067969067969e-05,
"loss": 0.2609,
"step": 190
},
{
"epoch": 0.08,
"grad_norm": 1.9980034828186035,
"learning_rate": 1.9674399674399677e-05,
"loss": 0.2267,
"step": 200
},
{
"epoch": 0.09,
"grad_norm": 2.297769069671631,
"learning_rate": 1.965811965811966e-05,
"loss": 0.2371,
"step": 210
},
{
"epoch": 0.09,
"grad_norm": 5.071080207824707,
"learning_rate": 1.9641839641839645e-05,
"loss": 0.248,
"step": 220
},
{
"epoch": 0.09,
"grad_norm": 0.43589872121810913,
"learning_rate": 1.9625559625559627e-05,
"loss": 0.151,
"step": 230
},
{
"epoch": 0.1,
"grad_norm": 6.790423393249512,
"learning_rate": 1.960927960927961e-05,
"loss": 0.1886,
"step": 240
},
{
"epoch": 0.1,
"grad_norm": 1.1880414485931396,
"learning_rate": 1.9592999592999595e-05,
"loss": 0.2104,
"step": 250
},
{
"epoch": 0.11,
"grad_norm": 6.434396266937256,
"learning_rate": 1.9576719576719577e-05,
"loss": 0.2847,
"step": 260
},
{
"epoch": 0.11,
"grad_norm": 2.175398826599121,
"learning_rate": 1.9560439560439563e-05,
"loss": 0.261,
"step": 270
},
{
"epoch": 0.11,
"grad_norm": 2.2410614490509033,
"learning_rate": 1.9544159544159545e-05,
"loss": 0.1376,
"step": 280
},
{
"epoch": 0.12,
"grad_norm": 0.5503996014595032,
"learning_rate": 1.952787952787953e-05,
"loss": 0.2034,
"step": 290
},
{
"epoch": 0.12,
"grad_norm": 3.550145387649536,
"learning_rate": 1.9511599511599513e-05,
"loss": 0.1802,
"step": 300
},
{
"epoch": 0.13,
"grad_norm": 0.35100820660591125,
"learning_rate": 1.94953194953195e-05,
"loss": 0.1778,
"step": 310
},
{
"epoch": 0.13,
"grad_norm": 1.0361884832382202,
"learning_rate": 1.947903947903948e-05,
"loss": 0.2186,
"step": 320
},
{
"epoch": 0.13,
"grad_norm": 7.980532169342041,
"learning_rate": 1.9462759462759463e-05,
"loss": 0.2742,
"step": 330
},
{
"epoch": 0.14,
"grad_norm": 4.8567585945129395,
"learning_rate": 1.9446479446479445e-05,
"loss": 0.1544,
"step": 340
},
{
"epoch": 0.14,
"grad_norm": 5.355805397033691,
"learning_rate": 1.943019943019943e-05,
"loss": 0.2739,
"step": 350
},
{
"epoch": 0.15,
"grad_norm": 6.359828472137451,
"learning_rate": 1.9413919413919417e-05,
"loss": 0.2076,
"step": 360
},
{
"epoch": 0.15,
"grad_norm": 0.9936553239822388,
"learning_rate": 1.93976393976394e-05,
"loss": 0.1666,
"step": 370
},
{
"epoch": 0.15,
"grad_norm": 6.090355396270752,
"learning_rate": 1.9381359381359385e-05,
"loss": 0.1941,
"step": 380
},
{
"epoch": 0.16,
"grad_norm": 2.5009548664093018,
"learning_rate": 1.9365079365079367e-05,
"loss": 0.1905,
"step": 390
},
{
"epoch": 0.16,
"grad_norm": 8.634650230407715,
"learning_rate": 1.934879934879935e-05,
"loss": 0.1431,
"step": 400
},
{
"epoch": 0.17,
"grad_norm": 2.43247389793396,
"learning_rate": 1.9332519332519335e-05,
"loss": 0.1736,
"step": 410
},
{
"epoch": 0.17,
"grad_norm": 15.868481636047363,
"learning_rate": 1.9316239316239317e-05,
"loss": 0.249,
"step": 420
},
{
"epoch": 0.18,
"grad_norm": 1.8422390222549438,
"learning_rate": 1.9299959299959303e-05,
"loss": 0.1407,
"step": 430
},
{
"epoch": 0.18,
"grad_norm": 5.148740291595459,
"learning_rate": 1.9283679283679285e-05,
"loss": 0.1503,
"step": 440
},
{
"epoch": 0.18,
"grad_norm": 2.3315675258636475,
"learning_rate": 1.926739926739927e-05,
"loss": 0.1885,
"step": 450
},
{
"epoch": 0.19,
"grad_norm": 3.6225030422210693,
"learning_rate": 1.9251119251119253e-05,
"loss": 0.1403,
"step": 460
},
{
"epoch": 0.19,
"grad_norm": 4.605388641357422,
"learning_rate": 1.9234839234839235e-05,
"loss": 0.2384,
"step": 470
},
{
"epoch": 0.2,
"grad_norm": 2.3162589073181152,
"learning_rate": 1.921855921855922e-05,
"loss": 0.129,
"step": 480
},
{
"epoch": 0.2,
"grad_norm": 0.4153892695903778,
"learning_rate": 1.9202279202279203e-05,
"loss": 0.1109,
"step": 490
},
{
"epoch": 0.2,
"grad_norm": 7.691011905670166,
"learning_rate": 1.9185999185999185e-05,
"loss": 0.1846,
"step": 500
},
{
"epoch": 0.21,
"grad_norm": 7.940028667449951,
"learning_rate": 1.916971916971917e-05,
"loss": 0.1391,
"step": 510
},
{
"epoch": 0.21,
"grad_norm": 0.5145124793052673,
"learning_rate": 1.9153439153439156e-05,
"loss": 0.1288,
"step": 520
},
{
"epoch": 0.22,
"grad_norm": 2.5785932540893555,
"learning_rate": 1.913715913715914e-05,
"loss": 0.1537,
"step": 530
},
{
"epoch": 0.22,
"grad_norm": 6.997181415557861,
"learning_rate": 1.9120879120879124e-05,
"loss": 0.1578,
"step": 540
},
{
"epoch": 0.22,
"grad_norm": 2.4879519939422607,
"learning_rate": 1.9104599104599107e-05,
"loss": 0.1954,
"step": 550
},
{
"epoch": 0.23,
"grad_norm": 5.291905879974365,
"learning_rate": 1.908831908831909e-05,
"loss": 0.1557,
"step": 560
},
{
"epoch": 0.23,
"grad_norm": 5.735557556152344,
"learning_rate": 1.9072039072039074e-05,
"loss": 0.1621,
"step": 570
},
{
"epoch": 0.24,
"grad_norm": 5.979973316192627,
"learning_rate": 1.9055759055759057e-05,
"loss": 0.1503,
"step": 580
},
{
"epoch": 0.24,
"grad_norm": 0.19126015901565552,
"learning_rate": 1.9039479039479042e-05,
"loss": 0.094,
"step": 590
},
{
"epoch": 0.24,
"grad_norm": 1.0556552410125732,
"learning_rate": 1.9023199023199025e-05,
"loss": 0.1876,
"step": 600
},
{
"epoch": 0.25,
"grad_norm": 3.954843759536743,
"learning_rate": 1.900691900691901e-05,
"loss": 0.3162,
"step": 610
},
{
"epoch": 0.25,
"grad_norm": 0.14296281337738037,
"learning_rate": 1.8990638990638992e-05,
"loss": 0.1288,
"step": 620
},
{
"epoch": 0.26,
"grad_norm": 8.772310256958008,
"learning_rate": 1.8974358974358975e-05,
"loss": 0.2622,
"step": 630
},
{
"epoch": 0.26,
"grad_norm": 2.3941524028778076,
"learning_rate": 1.895807895807896e-05,
"loss": 0.1524,
"step": 640
},
{
"epoch": 0.26,
"grad_norm": 8.104179382324219,
"learning_rate": 1.8941798941798943e-05,
"loss": 0.1109,
"step": 650
},
{
"epoch": 0.27,
"grad_norm": 1.5782121419906616,
"learning_rate": 1.8925518925518925e-05,
"loss": 0.0729,
"step": 660
},
{
"epoch": 0.27,
"grad_norm": 0.39667731523513794,
"learning_rate": 1.890923890923891e-05,
"loss": 0.1116,
"step": 670
},
{
"epoch": 0.28,
"grad_norm": 5.58447265625,
"learning_rate": 1.8892958892958896e-05,
"loss": 0.1312,
"step": 680
},
{
"epoch": 0.28,
"grad_norm": 1.3114192485809326,
"learning_rate": 1.887667887667888e-05,
"loss": 0.2522,
"step": 690
},
{
"epoch": 0.28,
"grad_norm": 0.997601330280304,
"learning_rate": 1.8860398860398864e-05,
"loss": 0.1605,
"step": 700
},
{
"epoch": 0.29,
"grad_norm": 3.605452537536621,
"learning_rate": 1.8844118844118846e-05,
"loss": 0.2443,
"step": 710
},
{
"epoch": 0.29,
"grad_norm": 18.868513107299805,
"learning_rate": 1.882783882783883e-05,
"loss": 0.1923,
"step": 720
},
{
"epoch": 0.3,
"grad_norm": 3.4352970123291016,
"learning_rate": 1.881155881155881e-05,
"loss": 0.1099,
"step": 730
},
{
"epoch": 0.3,
"grad_norm": 2.06532883644104,
"learning_rate": 1.8795278795278796e-05,
"loss": 0.1426,
"step": 740
},
{
"epoch": 0.31,
"grad_norm": 5.7250237464904785,
"learning_rate": 1.8778998778998782e-05,
"loss": 0.209,
"step": 750
},
{
"epoch": 0.31,
"grad_norm": 0.23493361473083496,
"learning_rate": 1.8762718762718764e-05,
"loss": 0.1986,
"step": 760
},
{
"epoch": 0.31,
"grad_norm": 17.769451141357422,
"learning_rate": 1.874643874643875e-05,
"loss": 0.1267,
"step": 770
},
{
"epoch": 0.32,
"grad_norm": 0.27574750781059265,
"learning_rate": 1.8730158730158732e-05,
"loss": 0.1484,
"step": 780
},
{
"epoch": 0.32,
"grad_norm": 0.30309033393859863,
"learning_rate": 1.8713878713878714e-05,
"loss": 0.1838,
"step": 790
},
{
"epoch": 0.33,
"grad_norm": 17.183013916015625,
"learning_rate": 1.86975986975987e-05,
"loss": 0.135,
"step": 800
},
{
"epoch": 0.33,
"grad_norm": 0.5572558641433716,
"learning_rate": 1.8681318681318682e-05,
"loss": 0.1456,
"step": 810
},
{
"epoch": 0.33,
"grad_norm": 0.4613451659679413,
"learning_rate": 1.8665038665038664e-05,
"loss": 0.1337,
"step": 820
},
{
"epoch": 0.34,
"grad_norm": 6.645438194274902,
"learning_rate": 1.864875864875865e-05,
"loss": 0.1446,
"step": 830
},
{
"epoch": 0.34,
"grad_norm": 5.389886856079102,
"learning_rate": 1.8632478632478636e-05,
"loss": 0.1253,
"step": 840
},
{
"epoch": 0.35,
"grad_norm": 14.86754322052002,
"learning_rate": 1.8616198616198618e-05,
"loss": 0.1346,
"step": 850
},
{
"epoch": 0.35,
"grad_norm": 13.419057846069336,
"learning_rate": 1.85999185999186e-05,
"loss": 0.0926,
"step": 860
},
{
"epoch": 0.35,
"grad_norm": 13.904304504394531,
"learning_rate": 1.8583638583638586e-05,
"loss": 0.1944,
"step": 870
},
{
"epoch": 0.36,
"grad_norm": 0.28235912322998047,
"learning_rate": 1.8567358567358568e-05,
"loss": 0.1261,
"step": 880
},
{
"epoch": 0.36,
"grad_norm": 5.711563587188721,
"learning_rate": 1.855107855107855e-05,
"loss": 0.1824,
"step": 890
},
{
"epoch": 0.37,
"grad_norm": 17.74437141418457,
"learning_rate": 1.8534798534798536e-05,
"loss": 0.1742,
"step": 900
},
{
"epoch": 0.37,
"grad_norm": 3.648202657699585,
"learning_rate": 1.851851851851852e-05,
"loss": 0.2103,
"step": 910
},
{
"epoch": 0.37,
"grad_norm": 2.0693366527557373,
"learning_rate": 1.8502238502238504e-05,
"loss": 0.1868,
"step": 920
},
{
"epoch": 0.38,
"grad_norm": 2.299172878265381,
"learning_rate": 1.848595848595849e-05,
"loss": 0.168,
"step": 930
},
{
"epoch": 0.38,
"grad_norm": 1.4059839248657227,
"learning_rate": 1.8469678469678472e-05,
"loss": 0.1455,
"step": 940
},
{
"epoch": 0.39,
"grad_norm": 0.926304817199707,
"learning_rate": 1.8453398453398454e-05,
"loss": 0.2002,
"step": 950
},
{
"epoch": 0.39,
"grad_norm": 4.728667736053467,
"learning_rate": 1.8437118437118436e-05,
"loss": 0.1245,
"step": 960
},
{
"epoch": 0.39,
"grad_norm": 0.5045005083084106,
"learning_rate": 1.8420838420838422e-05,
"loss": 0.0638,
"step": 970
},
{
"epoch": 0.4,
"grad_norm": 8.82520580291748,
"learning_rate": 1.8404558404558404e-05,
"loss": 0.1127,
"step": 980
},
{
"epoch": 0.4,
"grad_norm": 5.101595401763916,
"learning_rate": 1.838827838827839e-05,
"loss": 0.2363,
"step": 990
},
{
"epoch": 0.41,
"grad_norm": 7.01576566696167,
"learning_rate": 1.8371998371998375e-05,
"loss": 0.1026,
"step": 1000
},
{
"epoch": 0.41,
"grad_norm": 0.865003764629364,
"learning_rate": 1.8355718355718358e-05,
"loss": 0.0965,
"step": 1010
},
{
"epoch": 0.42,
"grad_norm": 9.897397994995117,
"learning_rate": 1.833943833943834e-05,
"loss": 0.1156,
"step": 1020
},
{
"epoch": 0.42,
"grad_norm": 1.5007679462432861,
"learning_rate": 1.8323158323158326e-05,
"loss": 0.1888,
"step": 1030
},
{
"epoch": 0.42,
"grad_norm": 4.676563262939453,
"learning_rate": 1.8306878306878308e-05,
"loss": 0.1552,
"step": 1040
},
{
"epoch": 0.43,
"grad_norm": 5.3361430168151855,
"learning_rate": 1.829059829059829e-05,
"loss": 0.1447,
"step": 1050
},
{
"epoch": 0.43,
"grad_norm": 0.8933970332145691,
"learning_rate": 1.8274318274318276e-05,
"loss": 0.1394,
"step": 1060
},
{
"epoch": 0.44,
"grad_norm": 7.401905059814453,
"learning_rate": 1.825803825803826e-05,
"loss": 0.2188,
"step": 1070
},
{
"epoch": 0.44,
"grad_norm": 0.4379027783870697,
"learning_rate": 1.8241758241758244e-05,
"loss": 0.1277,
"step": 1080
},
{
"epoch": 0.44,
"grad_norm": 2.8909428119659424,
"learning_rate": 1.8225478225478226e-05,
"loss": 0.1726,
"step": 1090
},
{
"epoch": 0.45,
"grad_norm": 0.11447061598300934,
"learning_rate": 1.820919820919821e-05,
"loss": 0.1523,
"step": 1100
},
{
"epoch": 0.45,
"grad_norm": 0.12276914715766907,
"learning_rate": 1.8192918192918194e-05,
"loss": 0.1823,
"step": 1110
},
{
"epoch": 0.46,
"grad_norm": 1.3844455480575562,
"learning_rate": 1.8176638176638176e-05,
"loss": 0.1006,
"step": 1120
},
{
"epoch": 0.46,
"grad_norm": 3.1034061908721924,
"learning_rate": 1.816035816035816e-05,
"loss": 0.1387,
"step": 1130
},
{
"epoch": 0.46,
"grad_norm": 8.602412223815918,
"learning_rate": 1.8144078144078144e-05,
"loss": 0.2345,
"step": 1140
},
{
"epoch": 0.47,
"grad_norm": 0.13919875025749207,
"learning_rate": 1.812779812779813e-05,
"loss": 0.1653,
"step": 1150
},
{
"epoch": 0.47,
"grad_norm": 0.34385234117507935,
"learning_rate": 1.8111518111518115e-05,
"loss": 0.2003,
"step": 1160
},
{
"epoch": 0.48,
"grad_norm": 4.868250846862793,
"learning_rate": 1.8095238095238097e-05,
"loss": 0.1504,
"step": 1170
},
{
"epoch": 0.48,
"grad_norm": 2.267928123474121,
"learning_rate": 1.807895807895808e-05,
"loss": 0.2023,
"step": 1180
},
{
"epoch": 0.48,
"grad_norm": 3.634040594100952,
"learning_rate": 1.8062678062678065e-05,
"loss": 0.1682,
"step": 1190
},
{
"epoch": 0.49,
"grad_norm": 1.8135625123977661,
"learning_rate": 1.8046398046398047e-05,
"loss": 0.2059,
"step": 1200
},
{
"epoch": 0.49,
"grad_norm": 2.294635057449341,
"learning_rate": 1.803011803011803e-05,
"loss": 0.1498,
"step": 1210
},
{
"epoch": 0.5,
"grad_norm": 3.8841567039489746,
"learning_rate": 1.8013838013838015e-05,
"loss": 0.0727,
"step": 1220
},
{
"epoch": 0.5,
"grad_norm": 0.9216524958610535,
"learning_rate": 1.7997557997558e-05,
"loss": 0.1273,
"step": 1230
},
{
"epoch": 0.5,
"grad_norm": 0.08572695404291153,
"learning_rate": 1.7981277981277983e-05,
"loss": 0.1066,
"step": 1240
},
{
"epoch": 0.51,
"grad_norm": 5.445361137390137,
"learning_rate": 1.7964997964997966e-05,
"loss": 0.1894,
"step": 1250
},
{
"epoch": 0.51,
"grad_norm": 4.239029407501221,
"learning_rate": 1.794871794871795e-05,
"loss": 0.0947,
"step": 1260
},
{
"epoch": 0.52,
"grad_norm": 0.7807052135467529,
"learning_rate": 1.7932437932437933e-05,
"loss": 0.1692,
"step": 1270
},
{
"epoch": 0.52,
"grad_norm": 0.1252571940422058,
"learning_rate": 1.7916157916157916e-05,
"loss": 0.0901,
"step": 1280
},
{
"epoch": 0.53,
"grad_norm": 5.491313457489014,
"learning_rate": 1.78998778998779e-05,
"loss": 0.0849,
"step": 1290
},
{
"epoch": 0.53,
"grad_norm": 0.3406262695789337,
"learning_rate": 1.7883597883597884e-05,
"loss": 0.1345,
"step": 1300
},
{
"epoch": 0.53,
"grad_norm": 3.4588377475738525,
"learning_rate": 1.786731786731787e-05,
"loss": 0.1501,
"step": 1310
},
{
"epoch": 0.54,
"grad_norm": 3.2964069843292236,
"learning_rate": 1.7851037851037855e-05,
"loss": 0.1679,
"step": 1320
},
{
"epoch": 0.54,
"grad_norm": 6.95346212387085,
"learning_rate": 1.7834757834757837e-05,
"loss": 0.095,
"step": 1330
},
{
"epoch": 0.55,
"grad_norm": 2.9120900630950928,
"learning_rate": 1.781847781847782e-05,
"loss": 0.1089,
"step": 1340
},
{
"epoch": 0.55,
"grad_norm": 8.793939590454102,
"learning_rate": 1.78021978021978e-05,
"loss": 0.1338,
"step": 1350
},
{
"epoch": 0.55,
"grad_norm": 0.08519359678030014,
"learning_rate": 1.7785917785917787e-05,
"loss": 0.0932,
"step": 1360
},
{
"epoch": 0.56,
"grad_norm": 16.41631317138672,
"learning_rate": 1.776963776963777e-05,
"loss": 0.1376,
"step": 1370
},
{
"epoch": 0.56,
"grad_norm": 3.0415103435516357,
"learning_rate": 1.7753357753357755e-05,
"loss": 0.1567,
"step": 1380
},
{
"epoch": 0.57,
"grad_norm": 0.8246403336524963,
"learning_rate": 1.773707773707774e-05,
"loss": 0.122,
"step": 1390
},
{
"epoch": 0.57,
"grad_norm": 2.198512077331543,
"learning_rate": 1.7720797720797723e-05,
"loss": 0.1528,
"step": 1400
},
{
"epoch": 0.57,
"grad_norm": 0.5246292352676392,
"learning_rate": 1.7704517704517705e-05,
"loss": 0.1088,
"step": 1410
},
{
"epoch": 0.58,
"grad_norm": 12.515607833862305,
"learning_rate": 1.768823768823769e-05,
"loss": 0.1627,
"step": 1420
},
{
"epoch": 0.58,
"grad_norm": 13.734766006469727,
"learning_rate": 1.7671957671957673e-05,
"loss": 0.1475,
"step": 1430
},
{
"epoch": 0.59,
"grad_norm": 2.593158483505249,
"learning_rate": 1.7655677655677655e-05,
"loss": 0.0968,
"step": 1440
},
{
"epoch": 0.59,
"grad_norm": 0.3462279736995697,
"learning_rate": 1.763939763939764e-05,
"loss": 0.0854,
"step": 1450
},
{
"epoch": 0.59,
"grad_norm": 1.6409497261047363,
"learning_rate": 1.7623117623117623e-05,
"loss": 0.2295,
"step": 1460
},
{
"epoch": 0.6,
"grad_norm": 2.9609594345092773,
"learning_rate": 1.760683760683761e-05,
"loss": 0.1883,
"step": 1470
},
{
"epoch": 0.6,
"grad_norm": 0.673570454120636,
"learning_rate": 1.759055759055759e-05,
"loss": 0.1369,
"step": 1480
},
{
"epoch": 0.61,
"grad_norm": 0.2929579019546509,
"learning_rate": 1.7574277574277577e-05,
"loss": 0.1189,
"step": 1490
},
{
"epoch": 0.61,
"grad_norm": 1.4493731260299683,
"learning_rate": 1.755799755799756e-05,
"loss": 0.1187,
"step": 1500
},
{
"epoch": 0.61,
"grad_norm": 0.07135419547557831,
"learning_rate": 1.754171754171754e-05,
"loss": 0.0603,
"step": 1510
},
{
"epoch": 0.62,
"grad_norm": 0.10734464973211288,
"learning_rate": 1.7525437525437527e-05,
"loss": 0.0217,
"step": 1520
},
{
"epoch": 0.62,
"grad_norm": 0.2217961698770523,
"learning_rate": 1.750915750915751e-05,
"loss": 0.0303,
"step": 1530
},
{
"epoch": 0.63,
"grad_norm": 0.14218159019947052,
"learning_rate": 1.7492877492877495e-05,
"loss": 0.0603,
"step": 1540
},
{
"epoch": 0.63,
"grad_norm": 0.09605604410171509,
"learning_rate": 1.747659747659748e-05,
"loss": 0.0407,
"step": 1550
},
{
"epoch": 0.63,
"grad_norm": 0.07094033062458038,
"learning_rate": 1.7460317460317463e-05,
"loss": 0.0202,
"step": 1560
},
{
"epoch": 0.64,
"grad_norm": 4.650410175323486,
"learning_rate": 1.7444037444037445e-05,
"loss": 0.0611,
"step": 1570
},
{
"epoch": 0.64,
"grad_norm": 23.229633331298828,
"learning_rate": 1.742775742775743e-05,
"loss": 0.0528,
"step": 1580
},
{
"epoch": 0.65,
"grad_norm": 1.466739535331726,
"learning_rate": 1.7411477411477413e-05,
"loss": 0.07,
"step": 1590
},
{
"epoch": 0.65,
"grad_norm": 0.05839679762721062,
"learning_rate": 1.7395197395197395e-05,
"loss": 0.0077,
"step": 1600
},
{
"epoch": 0.66,
"grad_norm": 1.6192926168441772,
"learning_rate": 1.737891737891738e-05,
"loss": 0.0239,
"step": 1610
},
{
"epoch": 0.66,
"grad_norm": 3.8529036045074463,
"learning_rate": 1.7362637362637363e-05,
"loss": 0.0976,
"step": 1620
},
{
"epoch": 0.66,
"grad_norm": 0.24398411810398102,
"learning_rate": 1.734635734635735e-05,
"loss": 0.0079,
"step": 1630
},
{
"epoch": 0.67,
"grad_norm": 0.04527588561177254,
"learning_rate": 1.733007733007733e-05,
"loss": 0.0065,
"step": 1640
},
{
"epoch": 0.67,
"grad_norm": 6.153138160705566,
"learning_rate": 1.7313797313797316e-05,
"loss": 0.0364,
"step": 1650
},
{
"epoch": 0.68,
"grad_norm": 0.03938959911465645,
"learning_rate": 1.72975172975173e-05,
"loss": 0.009,
"step": 1660
},
{
"epoch": 0.68,
"grad_norm": 0.04055130481719971,
"learning_rate": 1.728123728123728e-05,
"loss": 0.0472,
"step": 1670
},
{
"epoch": 0.68,
"grad_norm": 0.07095145434141159,
"learning_rate": 1.7264957264957267e-05,
"loss": 0.0078,
"step": 1680
},
{
"epoch": 0.69,
"grad_norm": 2.7965128421783447,
"learning_rate": 1.724867724867725e-05,
"loss": 0.0559,
"step": 1690
},
{
"epoch": 0.69,
"grad_norm": 6.2940592765808105,
"learning_rate": 1.7232397232397234e-05,
"loss": 0.0366,
"step": 1700
},
{
"epoch": 0.7,
"grad_norm": 0.11980397999286652,
"learning_rate": 1.721611721611722e-05,
"loss": 0.0125,
"step": 1710
},
{
"epoch": 0.7,
"grad_norm": 8.26235294342041,
"learning_rate": 1.7199837199837202e-05,
"loss": 0.0137,
"step": 1720
},
{
"epoch": 0.7,
"grad_norm": 0.04125256836414337,
"learning_rate": 1.7183557183557185e-05,
"loss": 0.0051,
"step": 1730
},
{
"epoch": 0.71,
"grad_norm": 0.03920783847570419,
"learning_rate": 1.7167277167277167e-05,
"loss": 0.0067,
"step": 1740
},
{
"epoch": 0.71,
"grad_norm": 0.13922813534736633,
"learning_rate": 1.7150997150997152e-05,
"loss": 0.0374,
"step": 1750
},
{
"epoch": 0.72,
"grad_norm": 0.034091122448444366,
"learning_rate": 1.7134717134717135e-05,
"loss": 0.006,
"step": 1760
},
{
"epoch": 0.72,
"grad_norm": 10.509510040283203,
"learning_rate": 1.711843711843712e-05,
"loss": 0.0589,
"step": 1770
},
{
"epoch": 0.72,
"grad_norm": 0.043251294642686844,
"learning_rate": 1.7102157102157103e-05,
"loss": 0.0226,
"step": 1780
},
{
"epoch": 0.73,
"grad_norm": 0.8053480982780457,
"learning_rate": 1.7085877085877088e-05,
"loss": 0.0582,
"step": 1790
},
{
"epoch": 0.73,
"grad_norm": 0.04081906005740166,
"learning_rate": 1.706959706959707e-05,
"loss": 0.0391,
"step": 1800
},
{
"epoch": 0.74,
"grad_norm": 0.03760745748877525,
"learning_rate": 1.7053317053317056e-05,
"loss": 0.027,
"step": 1810
},
{
"epoch": 0.74,
"grad_norm": 0.04111940413713455,
"learning_rate": 1.7037037037037038e-05,
"loss": 0.0368,
"step": 1820
},
{
"epoch": 0.74,
"grad_norm": 2.6297411918640137,
"learning_rate": 1.702075702075702e-05,
"loss": 0.0478,
"step": 1830
},
{
"epoch": 0.75,
"grad_norm": 0.1751009225845337,
"learning_rate": 1.7004477004477006e-05,
"loss": 0.0298,
"step": 1840
},
{
"epoch": 0.75,
"grad_norm": 0.042650580406188965,
"learning_rate": 1.698819698819699e-05,
"loss": 0.0203,
"step": 1850
},
{
"epoch": 0.76,
"grad_norm": 0.034141793847084045,
"learning_rate": 1.6971916971916974e-05,
"loss": 0.0044,
"step": 1860
},
{
"epoch": 0.76,
"grad_norm": 0.03497103974223137,
"learning_rate": 1.6955636955636956e-05,
"loss": 0.0304,
"step": 1870
},
{
"epoch": 0.77,
"grad_norm": 3.8585641384124756,
"learning_rate": 1.6939356939356942e-05,
"loss": 0.0365,
"step": 1880
},
{
"epoch": 0.77,
"grad_norm": 0.0322452187538147,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.0596,
"step": 1890
},
{
"epoch": 0.77,
"grad_norm": 0.034800559282302856,
"learning_rate": 1.6906796906796906e-05,
"loss": 0.0061,
"step": 1900
},
{
"epoch": 0.78,
"grad_norm": 0.0860045924782753,
"learning_rate": 1.6890516890516892e-05,
"loss": 0.0172,
"step": 1910
},
{
"epoch": 0.78,
"grad_norm": 0.031149201095104218,
"learning_rate": 1.6874236874236874e-05,
"loss": 0.0238,
"step": 1920
},
{
"epoch": 0.79,
"grad_norm": 0.03368987515568733,
"learning_rate": 1.685795685795686e-05,
"loss": 0.0043,
"step": 1930
},
{
"epoch": 0.79,
"grad_norm": 0.03161125257611275,
"learning_rate": 1.6841676841676846e-05,
"loss": 0.0146,
"step": 1940
},
{
"epoch": 0.79,
"grad_norm": 0.029046092182397842,
"learning_rate": 1.6825396825396828e-05,
"loss": 0.0236,
"step": 1950
},
{
"epoch": 0.8,
"grad_norm": 0.9345057606697083,
"learning_rate": 1.680911680911681e-05,
"loss": 0.0042,
"step": 1960
},
{
"epoch": 0.8,
"grad_norm": 0.028860267251729965,
"learning_rate": 1.6792836792836796e-05,
"loss": 0.0286,
"step": 1970
},
{
"epoch": 0.81,
"grad_norm": 0.02852853201329708,
"learning_rate": 1.6776556776556778e-05,
"loss": 0.023,
"step": 1980
},
{
"epoch": 0.81,
"grad_norm": 0.03128168359398842,
"learning_rate": 1.676027676027676e-05,
"loss": 0.0036,
"step": 1990
},
{
"epoch": 0.81,
"grad_norm": 0.037479083985090256,
"learning_rate": 1.6743996743996746e-05,
"loss": 0.0591,
"step": 2000
},
{
"epoch": 0.82,
"grad_norm": 0.04688659682869911,
"learning_rate": 1.6727716727716728e-05,
"loss": 0.0316,
"step": 2010
},
{
"epoch": 0.82,
"grad_norm": 0.03302760049700737,
"learning_rate": 1.6711436711436714e-05,
"loss": 0.0668,
"step": 2020
},
{
"epoch": 0.83,
"grad_norm": 0.06181880831718445,
"learning_rate": 1.6695156695156696e-05,
"loss": 0.0281,
"step": 2030
},
{
"epoch": 0.83,
"grad_norm": 0.0320013165473938,
"learning_rate": 1.667887667887668e-05,
"loss": 0.0232,
"step": 2040
},
{
"epoch": 0.83,
"grad_norm": 0.13600216805934906,
"learning_rate": 1.6662596662596664e-05,
"loss": 0.0442,
"step": 2050
},
{
"epoch": 0.84,
"grad_norm": 0.12886099517345428,
"learning_rate": 1.6646316646316646e-05,
"loss": 0.0305,
"step": 2060
},
{
"epoch": 0.84,
"grad_norm": 0.0625109001994133,
"learning_rate": 1.6630036630036632e-05,
"loss": 0.0233,
"step": 2070
},
{
"epoch": 0.85,
"grad_norm": 13.604376792907715,
"learning_rate": 1.6613756613756614e-05,
"loss": 0.0288,
"step": 2080
},
{
"epoch": 0.85,
"grad_norm": 0.029248738661408424,
"learning_rate": 1.65974765974766e-05,
"loss": 0.0039,
"step": 2090
},
{
"epoch": 0.85,
"grad_norm": 1.4231517314910889,
"learning_rate": 1.6581196581196585e-05,
"loss": 0.0095,
"step": 2100
},
{
"epoch": 0.86,
"grad_norm": 0.02830047346651554,
"learning_rate": 1.6564916564916568e-05,
"loss": 0.007,
"step": 2110
},
{
"epoch": 0.86,
"grad_norm": 0.027091912925243378,
"learning_rate": 1.654863654863655e-05,
"loss": 0.0041,
"step": 2120
},
{
"epoch": 0.87,
"grad_norm": 0.02793751284480095,
"learning_rate": 1.6532356532356532e-05,
"loss": 0.0087,
"step": 2130
},
{
"epoch": 0.87,
"grad_norm": 0.030688917264342308,
"learning_rate": 1.6516076516076518e-05,
"loss": 0.0033,
"step": 2140
},
{
"epoch": 0.88,
"grad_norm": 0.02540646307170391,
"learning_rate": 1.64997964997965e-05,
"loss": 0.0254,
"step": 2150
},
{
"epoch": 0.88,
"grad_norm": 0.026573829352855682,
"learning_rate": 1.6483516483516486e-05,
"loss": 0.0195,
"step": 2160
},
{
"epoch": 0.88,
"grad_norm": 0.025454262271523476,
"learning_rate": 1.6467236467236468e-05,
"loss": 0.0031,
"step": 2170
},
{
"epoch": 0.89,
"grad_norm": 0.038121115416288376,
"learning_rate": 1.6450956450956453e-05,
"loss": 0.0035,
"step": 2180
},
{
"epoch": 0.89,
"grad_norm": 0.025772370398044586,
"learning_rate": 1.6434676434676436e-05,
"loss": 0.003,
"step": 2190
},
{
"epoch": 0.9,
"grad_norm": 3.4986250400543213,
"learning_rate": 1.641839641839642e-05,
"loss": 0.0038,
"step": 2200
},
{
"epoch": 0.9,
"grad_norm": 25.038734436035156,
"learning_rate": 1.6402116402116404e-05,
"loss": 0.0119,
"step": 2210
},
{
"epoch": 0.9,
"grad_norm": 0.025794176384806633,
"learning_rate": 1.6385836385836386e-05,
"loss": 0.0353,
"step": 2220
},
{
"epoch": 0.91,
"grad_norm": 4.056914806365967,
"learning_rate": 1.636955636955637e-05,
"loss": 0.0517,
"step": 2230
},
{
"epoch": 0.91,
"grad_norm": 0.19518433511257172,
"learning_rate": 1.6353276353276354e-05,
"loss": 0.0291,
"step": 2240
},
{
"epoch": 0.92,
"grad_norm": 0.02424285002052784,
"learning_rate": 1.633699633699634e-05,
"loss": 0.0359,
"step": 2250
},
{
"epoch": 0.92,
"grad_norm": 0.03164544701576233,
"learning_rate": 1.632071632071632e-05,
"loss": 0.0382,
"step": 2260
},
{
"epoch": 0.92,
"grad_norm": 0.022855272516608238,
"learning_rate": 1.6304436304436307e-05,
"loss": 0.003,
"step": 2270
},
{
"epoch": 0.93,
"grad_norm": 0.023591142147779465,
"learning_rate": 1.628815628815629e-05,
"loss": 0.0497,
"step": 2280
},
{
"epoch": 0.93,
"grad_norm": 0.02427799627184868,
"learning_rate": 1.627187627187627e-05,
"loss": 0.0381,
"step": 2290
},
{
"epoch": 0.94,
"grad_norm": 0.022075733169913292,
"learning_rate": 1.6255596255596257e-05,
"loss": 0.0038,
"step": 2300
},
{
"epoch": 0.94,
"grad_norm": 0.25007203221321106,
"learning_rate": 1.623931623931624e-05,
"loss": 0.0364,
"step": 2310
},
{
"epoch": 0.94,
"grad_norm": 0.02502160519361496,
"learning_rate": 1.6223036223036225e-05,
"loss": 0.0029,
"step": 2320
},
{
"epoch": 0.95,
"grad_norm": 0.036409296095371246,
"learning_rate": 1.6206756206756207e-05,
"loss": 0.0387,
"step": 2330
},
{
"epoch": 0.95,
"grad_norm": 0.027146685868501663,
"learning_rate": 1.6190476190476193e-05,
"loss": 0.0045,
"step": 2340
},
{
"epoch": 0.96,
"grad_norm": 0.024981442838907242,
"learning_rate": 1.6174196174196175e-05,
"loss": 0.0264,
"step": 2350
},
{
"epoch": 0.96,
"grad_norm": 0.027865292504429817,
"learning_rate": 1.615791615791616e-05,
"loss": 0.0029,
"step": 2360
},
{
"epoch": 0.96,
"grad_norm": 0.034725822508335114,
"learning_rate": 1.6141636141636143e-05,
"loss": 0.0029,
"step": 2370
},
{
"epoch": 0.97,
"grad_norm": 0.022250523790717125,
"learning_rate": 1.6125356125356125e-05,
"loss": 0.0337,
"step": 2380
},
{
"epoch": 0.97,
"grad_norm": 0.024188194423913956,
"learning_rate": 1.610907610907611e-05,
"loss": 0.0026,
"step": 2390
},
{
"epoch": 0.98,
"grad_norm": 0.02303464338183403,
"learning_rate": 1.6092796092796093e-05,
"loss": 0.0285,
"step": 2400
},
{
"epoch": 0.98,
"grad_norm": 0.020316725596785545,
"learning_rate": 1.607651607651608e-05,
"loss": 0.0026,
"step": 2410
},
{
"epoch": 0.98,
"grad_norm": 0.023156961426138878,
"learning_rate": 1.606023606023606e-05,
"loss": 0.0031,
"step": 2420
},
{
"epoch": 0.99,
"grad_norm": 2.9847331047058105,
"learning_rate": 1.6043956043956047e-05,
"loss": 0.0034,
"step": 2430
},
{
"epoch": 0.99,
"grad_norm": 10.845735549926758,
"learning_rate": 1.602767602767603e-05,
"loss": 0.0557,
"step": 2440
},
{
"epoch": 1.0,
"grad_norm": 0.02037137933075428,
"learning_rate": 1.601139601139601e-05,
"loss": 0.0333,
"step": 2450
},
{
"epoch": 1.0,
"grad_norm": 0.019075889140367508,
"learning_rate": 1.5995115995115997e-05,
"loss": 0.0029,
"step": 2460
},
{
"epoch": 1.01,
"grad_norm": 0.02034451812505722,
"learning_rate": 1.597883597883598e-05,
"loss": 0.0035,
"step": 2470
},
{
"epoch": 1.01,
"grad_norm": 0.02513672597706318,
"learning_rate": 1.5962555962555965e-05,
"loss": 0.0149,
"step": 2480
},
{
"epoch": 1.01,
"grad_norm": 0.0232282355427742,
"learning_rate": 1.5946275946275947e-05,
"loss": 0.0066,
"step": 2490
},
{
"epoch": 1.02,
"grad_norm": 0.019541621208190918,
"learning_rate": 1.5929995929995933e-05,
"loss": 0.003,
"step": 2500
},
{
"epoch": 1.02,
"grad_norm": 0.027926787734031677,
"learning_rate": 1.5913715913715915e-05,
"loss": 0.0024,
"step": 2510
},
{
"epoch": 1.03,
"grad_norm": 0.021236905828118324,
"learning_rate": 1.5897435897435897e-05,
"loss": 0.0023,
"step": 2520
},
{
"epoch": 1.03,
"grad_norm": 0.017625728622078896,
"learning_rate": 1.5881155881155883e-05,
"loss": 0.0023,
"step": 2530
},
{
"epoch": 1.03,
"grad_norm": 3.0908312797546387,
"learning_rate": 1.5864875864875865e-05,
"loss": 0.0032,
"step": 2540
},
{
"epoch": 1.04,
"grad_norm": 0.025432445108890533,
"learning_rate": 1.584859584859585e-05,
"loss": 0.0246,
"step": 2550
},
{
"epoch": 1.04,
"grad_norm": 0.0189252570271492,
"learning_rate": 1.5832315832315833e-05,
"loss": 0.0025,
"step": 2560
},
{
"epoch": 1.05,
"grad_norm": 0.16396763920783997,
"learning_rate": 1.581603581603582e-05,
"loss": 0.0378,
"step": 2570
},
{
"epoch": 1.05,
"grad_norm": 0.019563721492886543,
"learning_rate": 1.57997557997558e-05,
"loss": 0.0281,
"step": 2580
},
{
"epoch": 1.05,
"grad_norm": 0.02156243473291397,
"learning_rate": 1.5783475783475787e-05,
"loss": 0.1073,
"step": 2590
},
{
"epoch": 1.06,
"grad_norm": 3.184936285018921,
"learning_rate": 1.576719576719577e-05,
"loss": 0.0413,
"step": 2600
},
{
"epoch": 1.06,
"grad_norm": 0.0187922902405262,
"learning_rate": 1.575091575091575e-05,
"loss": 0.0423,
"step": 2610
},
{
"epoch": 1.07,
"grad_norm": 0.020309004932641983,
"learning_rate": 1.5734635734635737e-05,
"loss": 0.0026,
"step": 2620
},
{
"epoch": 1.07,
"grad_norm": 0.028299883008003235,
"learning_rate": 1.571835571835572e-05,
"loss": 0.0026,
"step": 2630
},
{
"epoch": 1.07,
"grad_norm": 0.022750265896320343,
"learning_rate": 1.5702075702075705e-05,
"loss": 0.0026,
"step": 2640
},
{
"epoch": 1.08,
"grad_norm": 0.017459379509091377,
"learning_rate": 1.5685795685795687e-05,
"loss": 0.0026,
"step": 2650
},
{
"epoch": 1.08,
"grad_norm": 0.02400645986199379,
"learning_rate": 1.5669515669515672e-05,
"loss": 0.0022,
"step": 2660
},
{
"epoch": 1.09,
"grad_norm": 0.037710972130298615,
"learning_rate": 1.5653235653235655e-05,
"loss": 0.0024,
"step": 2670
},
{
"epoch": 1.09,
"grad_norm": 0.01844876818358898,
"learning_rate": 1.5636955636955637e-05,
"loss": 0.0022,
"step": 2680
},
{
"epoch": 1.09,
"grad_norm": 0.015886761248111725,
"learning_rate": 1.5620675620675623e-05,
"loss": 0.0021,
"step": 2690
},
{
"epoch": 1.1,
"grad_norm": 0.016119027510285378,
"learning_rate": 1.5604395604395605e-05,
"loss": 0.0024,
"step": 2700
},
{
"epoch": 1.1,
"grad_norm": 0.01977747306227684,
"learning_rate": 1.558811558811559e-05,
"loss": 0.0405,
"step": 2710
},
{
"epoch": 1.11,
"grad_norm": 0.01591884344816208,
"learning_rate": 1.5571835571835573e-05,
"loss": 0.0021,
"step": 2720
},
{
"epoch": 1.11,
"grad_norm": 0.017170535400509834,
"learning_rate": 1.555555555555556e-05,
"loss": 0.0102,
"step": 2730
},
{
"epoch": 1.12,
"grad_norm": 0.02160962112247944,
"learning_rate": 1.553927553927554e-05,
"loss": 0.0164,
"step": 2740
},
{
"epoch": 1.12,
"grad_norm": 0.04177393019199371,
"learning_rate": 1.5522995522995526e-05,
"loss": 0.002,
"step": 2750
},
{
"epoch": 1.12,
"grad_norm": 0.01732414774596691,
"learning_rate": 1.550671550671551e-05,
"loss": 0.0022,
"step": 2760
},
{
"epoch": 1.13,
"grad_norm": 0.05687391385436058,
"learning_rate": 1.549043549043549e-05,
"loss": 0.002,
"step": 2770
},
{
"epoch": 1.13,
"grad_norm": 0.015546981245279312,
"learning_rate": 1.5474155474155473e-05,
"loss": 0.0296,
"step": 2780
},
{
"epoch": 1.14,
"grad_norm": 11.891217231750488,
"learning_rate": 1.545787545787546e-05,
"loss": 0.0303,
"step": 2790
},
{
"epoch": 1.14,
"grad_norm": 3.074970245361328,
"learning_rate": 1.5441595441595444e-05,
"loss": 0.0346,
"step": 2800
},
{
"epoch": 1.14,
"grad_norm": 1.3277289867401123,
"learning_rate": 1.5425315425315426e-05,
"loss": 0.0053,
"step": 2810
},
{
"epoch": 1.15,
"grad_norm": 0.014851146377623081,
"learning_rate": 1.5409035409035412e-05,
"loss": 0.0021,
"step": 2820
},
{
"epoch": 1.15,
"grad_norm": 0.02586003951728344,
"learning_rate": 1.5392755392755394e-05,
"loss": 0.0194,
"step": 2830
},
{
"epoch": 1.16,
"grad_norm": 0.018063299357891083,
"learning_rate": 1.5376475376475377e-05,
"loss": 0.0374,
"step": 2840
},
{
"epoch": 1.16,
"grad_norm": 0.014860156923532486,
"learning_rate": 1.5360195360195362e-05,
"loss": 0.0368,
"step": 2850
},
{
"epoch": 1.16,
"grad_norm": 0.016715556383132935,
"learning_rate": 1.5343915343915344e-05,
"loss": 0.0232,
"step": 2860
},
{
"epoch": 1.17,
"grad_norm": 0.017222585156559944,
"learning_rate": 1.532763532763533e-05,
"loss": 0.0021,
"step": 2870
},
{
"epoch": 1.17,
"grad_norm": 0.015297485515475273,
"learning_rate": 1.5311355311355312e-05,
"loss": 0.002,
"step": 2880
},
{
"epoch": 1.18,
"grad_norm": 0.01927722617983818,
"learning_rate": 1.5295075295075298e-05,
"loss": 0.0344,
"step": 2890
},
{
"epoch": 1.18,
"grad_norm": 0.014726200141012669,
"learning_rate": 1.527879527879528e-05,
"loss": 0.0105,
"step": 2900
},
{
"epoch": 1.18,
"grad_norm": 0.015239718370139599,
"learning_rate": 1.5262515262515263e-05,
"loss": 0.0019,
"step": 2910
},
{
"epoch": 1.19,
"grad_norm": 0.014116072095930576,
"learning_rate": 1.5246235246235248e-05,
"loss": 0.0482,
"step": 2920
},
{
"epoch": 1.19,
"grad_norm": 0.014437291771173477,
"learning_rate": 1.522995522995523e-05,
"loss": 0.0028,
"step": 2930
},
{
"epoch": 1.2,
"grad_norm": 0.017663761973381042,
"learning_rate": 1.5213675213675214e-05,
"loss": 0.007,
"step": 2940
},
{
"epoch": 1.2,
"grad_norm": 0.024807853624224663,
"learning_rate": 1.51973951973952e-05,
"loss": 0.0044,
"step": 2950
},
{
"epoch": 1.2,
"grad_norm": 0.01389392837882042,
"learning_rate": 1.5181115181115182e-05,
"loss": 0.021,
"step": 2960
},
{
"epoch": 1.21,
"grad_norm": 0.014578912407159805,
"learning_rate": 1.5164835164835166e-05,
"loss": 0.002,
"step": 2970
},
{
"epoch": 1.21,
"grad_norm": 0.013830927200615406,
"learning_rate": 1.514855514855515e-05,
"loss": 0.0017,
"step": 2980
},
{
"epoch": 1.22,
"grad_norm": 0.012908479198813438,
"learning_rate": 1.5132275132275134e-05,
"loss": 0.0047,
"step": 2990
},
{
"epoch": 1.22,
"grad_norm": 0.013685975223779678,
"learning_rate": 1.5115995115995116e-05,
"loss": 0.0062,
"step": 3000
},
{
"epoch": 1.23,
"grad_norm": 0.015914512798190117,
"learning_rate": 1.50997150997151e-05,
"loss": 0.0415,
"step": 3010
},
{
"epoch": 1.23,
"grad_norm": 0.09328664839267731,
"learning_rate": 1.5083435083435086e-05,
"loss": 0.0017,
"step": 3020
},
{
"epoch": 1.23,
"grad_norm": 0.013503558933734894,
"learning_rate": 1.5067155067155068e-05,
"loss": 0.0292,
"step": 3030
},
{
"epoch": 1.24,
"grad_norm": 0.012664329260587692,
"learning_rate": 1.505087505087505e-05,
"loss": 0.0108,
"step": 3040
},
{
"epoch": 1.24,
"grad_norm": 0.013521691784262657,
"learning_rate": 1.5034595034595036e-05,
"loss": 0.0016,
"step": 3050
},
{
"epoch": 1.25,
"grad_norm": 0.017031285911798477,
"learning_rate": 1.501831501831502e-05,
"loss": 0.0056,
"step": 3060
},
{
"epoch": 1.25,
"grad_norm": 0.0123978890478611,
"learning_rate": 1.5002035002035002e-05,
"loss": 0.0454,
"step": 3070
},
{
"epoch": 1.25,
"grad_norm": 0.01293584518134594,
"learning_rate": 1.4985754985754988e-05,
"loss": 0.004,
"step": 3080
},
{
"epoch": 1.26,
"grad_norm": 0.013730690814554691,
"learning_rate": 1.496947496947497e-05,
"loss": 0.0355,
"step": 3090
},
{
"epoch": 1.26,
"grad_norm": 0.01241120882332325,
"learning_rate": 1.4953194953194954e-05,
"loss": 0.0017,
"step": 3100
},
{
"epoch": 1.27,
"grad_norm": 0.016001150012016296,
"learning_rate": 1.493691493691494e-05,
"loss": 0.0017,
"step": 3110
},
{
"epoch": 1.27,
"grad_norm": 0.019151071086525917,
"learning_rate": 1.4920634920634922e-05,
"loss": 0.0335,
"step": 3120
},
{
"epoch": 1.27,
"grad_norm": 0.014675545506179333,
"learning_rate": 1.4904354904354906e-05,
"loss": 0.0203,
"step": 3130
},
{
"epoch": 1.28,
"grad_norm": 0.5518173575401306,
"learning_rate": 1.4888074888074888e-05,
"loss": 0.0025,
"step": 3140
},
{
"epoch": 1.28,
"grad_norm": 0.012442667037248611,
"learning_rate": 1.4871794871794874e-05,
"loss": 0.0021,
"step": 3150
},
{
"epoch": 1.29,
"grad_norm": 0.013752995058894157,
"learning_rate": 1.4855514855514856e-05,
"loss": 0.0018,
"step": 3160
},
{
"epoch": 1.29,
"grad_norm": 0.011561810038983822,
"learning_rate": 1.483923483923484e-05,
"loss": 0.0016,
"step": 3170
},
{
"epoch": 1.29,
"grad_norm": 0.011732109822332859,
"learning_rate": 1.4822954822954826e-05,
"loss": 0.0015,
"step": 3180
},
{
"epoch": 1.3,
"grad_norm": 0.011794438585639,
"learning_rate": 1.4806674806674808e-05,
"loss": 0.0014,
"step": 3190
},
{
"epoch": 1.3,
"grad_norm": 0.011947757564485073,
"learning_rate": 1.479039479039479e-05,
"loss": 0.0026,
"step": 3200
},
{
"epoch": 1.31,
"grad_norm": 0.017924221232533455,
"learning_rate": 1.4774114774114776e-05,
"loss": 0.0015,
"step": 3210
},
{
"epoch": 1.31,
"grad_norm": 0.011501024477183819,
"learning_rate": 1.475783475783476e-05,
"loss": 0.0021,
"step": 3220
},
{
"epoch": 1.31,
"grad_norm": 0.05062294751405716,
"learning_rate": 1.4741554741554742e-05,
"loss": 0.0015,
"step": 3230
},
{
"epoch": 1.32,
"grad_norm": 0.011451934464275837,
"learning_rate": 1.4725274725274727e-05,
"loss": 0.0015,
"step": 3240
},
{
"epoch": 1.32,
"grad_norm": 0.011398130096495152,
"learning_rate": 1.470899470899471e-05,
"loss": 0.0262,
"step": 3250
},
{
"epoch": 1.33,
"grad_norm": 0.011111021041870117,
"learning_rate": 1.4692714692714694e-05,
"loss": 0.0015,
"step": 3260
},
{
"epoch": 1.33,
"grad_norm": 0.011720293201506138,
"learning_rate": 1.4676434676434676e-05,
"loss": 0.0014,
"step": 3270
},
{
"epoch": 1.33,
"grad_norm": 0.01106089074164629,
"learning_rate": 1.4660154660154662e-05,
"loss": 0.0248,
"step": 3280
},
{
"epoch": 1.34,
"grad_norm": 0.031572628766298294,
"learning_rate": 1.4643874643874645e-05,
"loss": 0.0015,
"step": 3290
},
{
"epoch": 1.34,
"grad_norm": 0.010560325346887112,
"learning_rate": 1.4627594627594628e-05,
"loss": 0.0014,
"step": 3300
},
{
"epoch": 1.35,
"grad_norm": 31.388111114501953,
"learning_rate": 1.4611314611314613e-05,
"loss": 0.0255,
"step": 3310
},
{
"epoch": 1.35,
"grad_norm": 0.016965394839644432,
"learning_rate": 1.4595034595034596e-05,
"loss": 0.0014,
"step": 3320
},
{
"epoch": 1.36,
"grad_norm": 0.022373100742697716,
"learning_rate": 1.457875457875458e-05,
"loss": 0.0013,
"step": 3330
},
{
"epoch": 1.36,
"grad_norm": 0.011025676503777504,
"learning_rate": 1.4562474562474565e-05,
"loss": 0.0374,
"step": 3340
},
{
"epoch": 1.36,
"grad_norm": 0.016683539375662804,
"learning_rate": 1.4546194546194547e-05,
"loss": 0.0389,
"step": 3350
},
{
"epoch": 1.37,
"grad_norm": 0.012086950242519379,
"learning_rate": 1.4529914529914531e-05,
"loss": 0.0304,
"step": 3360
},
{
"epoch": 1.37,
"grad_norm": 0.011172090657055378,
"learning_rate": 1.4513634513634515e-05,
"loss": 0.0178,
"step": 3370
},
{
"epoch": 1.38,
"grad_norm": 0.013024254702031612,
"learning_rate": 1.44973544973545e-05,
"loss": 0.0014,
"step": 3380
},
{
"epoch": 1.38,
"grad_norm": 0.010836287401616573,
"learning_rate": 1.4481074481074482e-05,
"loss": 0.0014,
"step": 3390
},
{
"epoch": 1.38,
"grad_norm": 0.014210844412446022,
"learning_rate": 1.4464794464794465e-05,
"loss": 0.0014,
"step": 3400
},
{
"epoch": 1.39,
"grad_norm": 0.010528087615966797,
"learning_rate": 1.444851444851445e-05,
"loss": 0.0044,
"step": 3410
},
{
"epoch": 1.39,
"grad_norm": 0.01593305543065071,
"learning_rate": 1.4432234432234433e-05,
"loss": 0.0455,
"step": 3420
},
{
"epoch": 1.4,
"grad_norm": 0.015049874782562256,
"learning_rate": 1.4415954415954416e-05,
"loss": 0.0027,
"step": 3430
},
{
"epoch": 1.4,
"grad_norm": 0.011662309989333153,
"learning_rate": 1.4399674399674401e-05,
"loss": 0.0013,
"step": 3440
},
{
"epoch": 1.4,
"grad_norm": 0.011207195930182934,
"learning_rate": 1.4383394383394385e-05,
"loss": 0.0018,
"step": 3450
},
{
"epoch": 1.41,
"grad_norm": 3.6042699813842773,
"learning_rate": 1.4367114367114367e-05,
"loss": 0.0029,
"step": 3460
},
{
"epoch": 1.41,
"grad_norm": 0.09215729683637619,
"learning_rate": 1.4350834350834353e-05,
"loss": 0.002,
"step": 3470
},
{
"epoch": 1.42,
"grad_norm": 0.010877463966608047,
"learning_rate": 1.4334554334554335e-05,
"loss": 0.0014,
"step": 3480
},
{
"epoch": 1.42,
"grad_norm": 0.009993131272494793,
"learning_rate": 1.431827431827432e-05,
"loss": 0.0016,
"step": 3490
},
{
"epoch": 1.42,
"grad_norm": 1.349046230316162,
"learning_rate": 1.4301994301994305e-05,
"loss": 0.0018,
"step": 3500
},
{
"epoch": 1.43,
"grad_norm": 0.009341539815068245,
"learning_rate": 1.4285714285714287e-05,
"loss": 0.0012,
"step": 3510
},
{
"epoch": 1.43,
"grad_norm": 0.009393510408699512,
"learning_rate": 1.4269434269434271e-05,
"loss": 0.0011,
"step": 3520
},
{
"epoch": 1.44,
"grad_norm": 0.009326926432549953,
"learning_rate": 1.4253154253154253e-05,
"loss": 0.0012,
"step": 3530
},
{
"epoch": 1.44,
"grad_norm": 0.009275635704398155,
"learning_rate": 1.4236874236874239e-05,
"loss": 0.0384,
"step": 3540
},
{
"epoch": 1.44,
"grad_norm": 22.40707778930664,
"learning_rate": 1.4220594220594221e-05,
"loss": 0.0131,
"step": 3550
},
{
"epoch": 1.45,
"grad_norm": 0.00953533872961998,
"learning_rate": 1.4204314204314205e-05,
"loss": 0.0347,
"step": 3560
},
{
"epoch": 1.45,
"grad_norm": 0.5032986998558044,
"learning_rate": 1.4188034188034189e-05,
"loss": 0.0402,
"step": 3570
},
{
"epoch": 1.46,
"grad_norm": 0.011732584796845913,
"learning_rate": 1.4171754171754173e-05,
"loss": 0.0592,
"step": 3580
},
{
"epoch": 1.46,
"grad_norm": 0.010645696893334389,
"learning_rate": 1.4155474155474155e-05,
"loss": 0.0268,
"step": 3590
},
{
"epoch": 1.47,
"grad_norm": 0.013740918599069118,
"learning_rate": 1.4139194139194141e-05,
"loss": 0.0252,
"step": 3600
},
{
"epoch": 1.47,
"grad_norm": 0.013372181914746761,
"learning_rate": 1.4122914122914125e-05,
"loss": 0.0376,
"step": 3610
},
{
"epoch": 1.47,
"grad_norm": 0.015505131334066391,
"learning_rate": 1.4106634106634107e-05,
"loss": 0.0014,
"step": 3620
},
{
"epoch": 1.48,
"grad_norm": 0.014338747598230839,
"learning_rate": 1.4090354090354093e-05,
"loss": 0.0853,
"step": 3630
},
{
"epoch": 1.48,
"grad_norm": 0.01571911759674549,
"learning_rate": 1.4074074074074075e-05,
"loss": 0.0298,
"step": 3640
},
{
"epoch": 1.49,
"grad_norm": 0.020005526021122932,
"learning_rate": 1.4057794057794059e-05,
"loss": 0.0017,
"step": 3650
},
{
"epoch": 1.49,
"grad_norm": 0.018354693427681923,
"learning_rate": 1.4041514041514041e-05,
"loss": 0.0016,
"step": 3660
},
{
"epoch": 1.49,
"grad_norm": 0.021922029554843903,
"learning_rate": 1.4025234025234027e-05,
"loss": 0.0017,
"step": 3670
},
{
"epoch": 1.5,
"grad_norm": 0.013702883385121822,
"learning_rate": 1.400895400895401e-05,
"loss": 0.0014,
"step": 3680
},
{
"epoch": 1.5,
"grad_norm": 0.010742840357124805,
"learning_rate": 1.3992673992673993e-05,
"loss": 0.0026,
"step": 3690
},
{
"epoch": 1.51,
"grad_norm": 0.15446045994758606,
"learning_rate": 1.3976393976393979e-05,
"loss": 0.0013,
"step": 3700
},
{
"epoch": 1.51,
"grad_norm": 0.01300391647964716,
"learning_rate": 1.3960113960113961e-05,
"loss": 0.0012,
"step": 3710
},
{
"epoch": 1.51,
"grad_norm": 0.017101220786571503,
"learning_rate": 1.3943833943833945e-05,
"loss": 0.0012,
"step": 3720
},
{
"epoch": 1.52,
"grad_norm": 0.009062445722520351,
"learning_rate": 1.3927553927553929e-05,
"loss": 0.0012,
"step": 3730
},
{
"epoch": 1.52,
"grad_norm": 0.008803702890872955,
"learning_rate": 1.3911273911273913e-05,
"loss": 0.0011,
"step": 3740
},
{
"epoch": 1.53,
"grad_norm": 0.008593735285103321,
"learning_rate": 1.3894993894993895e-05,
"loss": 0.0012,
"step": 3750
},
{
"epoch": 1.53,
"grad_norm": 0.009692203253507614,
"learning_rate": 1.387871387871388e-05,
"loss": 0.0011,
"step": 3760
},
{
"epoch": 1.53,
"grad_norm": 0.011008762754499912,
"learning_rate": 1.3862433862433865e-05,
"loss": 0.0011,
"step": 3770
},
{
"epoch": 1.54,
"grad_norm": 0.009994535706937313,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.022,
"step": 3780
},
{
"epoch": 1.54,
"grad_norm": 0.009117243811488152,
"learning_rate": 1.382987382987383e-05,
"loss": 0.0011,
"step": 3790
},
{
"epoch": 1.55,
"grad_norm": 0.008967447094619274,
"learning_rate": 1.3813593813593815e-05,
"loss": 0.0057,
"step": 3800
},
{
"epoch": 1.55,
"grad_norm": 0.008691845461726189,
"learning_rate": 1.3797313797313799e-05,
"loss": 0.0013,
"step": 3810
},
{
"epoch": 1.55,
"grad_norm": 0.011074850335717201,
"learning_rate": 1.378103378103378e-05,
"loss": 0.001,
"step": 3820
},
{
"epoch": 1.56,
"grad_norm": 0.00832684338092804,
"learning_rate": 1.3764753764753766e-05,
"loss": 0.0011,
"step": 3830
},
{
"epoch": 1.56,
"grad_norm": 0.008292116224765778,
"learning_rate": 1.374847374847375e-05,
"loss": 0.001,
"step": 3840
},
{
"epoch": 1.57,
"grad_norm": 0.009205167181789875,
"learning_rate": 1.3732193732193733e-05,
"loss": 0.0011,
"step": 3850
},
{
"epoch": 1.57,
"grad_norm": 0.008790573105216026,
"learning_rate": 1.3715913715913718e-05,
"loss": 0.001,
"step": 3860
},
{
"epoch": 1.58,
"grad_norm": 0.008000485599040985,
"learning_rate": 1.36996336996337e-05,
"loss": 0.008,
"step": 3870
},
{
"epoch": 1.58,
"grad_norm": 0.00819096527993679,
"learning_rate": 1.3683353683353684e-05,
"loss": 0.001,
"step": 3880
},
{
"epoch": 1.58,
"grad_norm": 0.014848892576992512,
"learning_rate": 1.3667073667073668e-05,
"loss": 0.015,
"step": 3890
},
{
"epoch": 1.59,
"grad_norm": 0.008053899742662907,
"learning_rate": 1.3650793650793652e-05,
"loss": 0.0009,
"step": 3900
},
{
"epoch": 1.59,
"grad_norm": 6.416678428649902,
"learning_rate": 1.3634513634513635e-05,
"loss": 0.0344,
"step": 3910
},
{
"epoch": 1.6,
"grad_norm": 0.10300695151090622,
"learning_rate": 1.3618233618233619e-05,
"loss": 0.001,
"step": 3920
},
{
"epoch": 1.6,
"grad_norm": 0.008424129337072372,
"learning_rate": 1.3601953601953604e-05,
"loss": 0.0267,
"step": 3930
},
{
"epoch": 1.6,
"grad_norm": 0.00800679437816143,
"learning_rate": 1.3585673585673586e-05,
"loss": 0.0326,
"step": 3940
},
{
"epoch": 1.61,
"grad_norm": 0.009919759817421436,
"learning_rate": 1.356939356939357e-05,
"loss": 0.0011,
"step": 3950
},
{
"epoch": 1.61,
"grad_norm": 0.02416282147169113,
"learning_rate": 1.3553113553113554e-05,
"loss": 0.0012,
"step": 3960
},
{
"epoch": 1.62,
"grad_norm": 5.555994033813477,
"learning_rate": 1.3536833536833538e-05,
"loss": 0.043,
"step": 3970
},
{
"epoch": 1.62,
"grad_norm": 0.10745339095592499,
"learning_rate": 1.352055352055352e-05,
"loss": 0.0011,
"step": 3980
},
{
"epoch": 1.62,
"grad_norm": 0.00835937075316906,
"learning_rate": 1.3504273504273506e-05,
"loss": 0.0009,
"step": 3990
},
{
"epoch": 1.63,
"grad_norm": 0.007618330419063568,
"learning_rate": 1.348799348799349e-05,
"loss": 0.0241,
"step": 4000
},
{
"epoch": 1.63,
"grad_norm": 0.022973209619522095,
"learning_rate": 1.3471713471713472e-05,
"loss": 0.001,
"step": 4010
},
{
"epoch": 1.64,
"grad_norm": 0.008424985222518444,
"learning_rate": 1.3455433455433458e-05,
"loss": 0.0018,
"step": 4020
},
{
"epoch": 1.64,
"grad_norm": 0.015286185778677464,
"learning_rate": 1.343915343915344e-05,
"loss": 0.0009,
"step": 4030
},
{
"epoch": 1.64,
"grad_norm": 0.007264839485287666,
"learning_rate": 1.3422873422873424e-05,
"loss": 0.0009,
"step": 4040
},
{
"epoch": 1.65,
"grad_norm": 0.0074860285967588425,
"learning_rate": 1.3406593406593406e-05,
"loss": 0.0009,
"step": 4050
},
{
"epoch": 1.65,
"grad_norm": 0.008237460628151894,
"learning_rate": 1.3390313390313392e-05,
"loss": 0.0373,
"step": 4060
},
{
"epoch": 1.66,
"grad_norm": 0.007270953617990017,
"learning_rate": 1.3374033374033374e-05,
"loss": 0.0009,
"step": 4070
},
{
"epoch": 1.66,
"grad_norm": 0.03919156640768051,
"learning_rate": 1.3357753357753358e-05,
"loss": 0.001,
"step": 4080
},
{
"epoch": 1.66,
"grad_norm": 0.11515277624130249,
"learning_rate": 1.3341473341473344e-05,
"loss": 0.001,
"step": 4090
},
{
"epoch": 1.67,
"grad_norm": 0.007153298240154982,
"learning_rate": 1.3325193325193326e-05,
"loss": 0.0014,
"step": 4100
},
{
"epoch": 1.67,
"grad_norm": 0.00894332304596901,
"learning_rate": 1.330891330891331e-05,
"loss": 0.0022,
"step": 4110
},
{
"epoch": 1.68,
"grad_norm": 0.046884216368198395,
"learning_rate": 1.3292633292633294e-05,
"loss": 0.001,
"step": 4120
},
{
"epoch": 1.68,
"grad_norm": 0.0074531338177621365,
"learning_rate": 1.3276353276353278e-05,
"loss": 0.0009,
"step": 4130
},
{
"epoch": 1.68,
"grad_norm": 0.008025778457522392,
"learning_rate": 1.326007326007326e-05,
"loss": 0.0008,
"step": 4140
},
{
"epoch": 1.69,
"grad_norm": 0.007099485024809837,
"learning_rate": 1.3243793243793246e-05,
"loss": 0.0349,
"step": 4150
},
{
"epoch": 1.69,
"grad_norm": 0.007894063368439674,
"learning_rate": 1.322751322751323e-05,
"loss": 0.0008,
"step": 4160
},
{
"epoch": 1.7,
"grad_norm": 0.008376212790608406,
"learning_rate": 1.3211233211233212e-05,
"loss": 0.0009,
"step": 4170
},
{
"epoch": 1.7,
"grad_norm": 0.007172748912125826,
"learning_rate": 1.3194953194953194e-05,
"loss": 0.0011,
"step": 4180
},
{
"epoch": 1.71,
"grad_norm": 0.007325605023652315,
"learning_rate": 1.317867317867318e-05,
"loss": 0.0008,
"step": 4190
},
{
"epoch": 1.71,
"grad_norm": 0.007277225609868765,
"learning_rate": 1.3162393162393164e-05,
"loss": 0.0009,
"step": 4200
},
{
"epoch": 1.71,
"grad_norm": 0.007008700165897608,
"learning_rate": 1.3146113146113146e-05,
"loss": 0.0009,
"step": 4210
},
{
"epoch": 1.72,
"grad_norm": 0.007119116373360157,
"learning_rate": 1.3129833129833132e-05,
"loss": 0.0088,
"step": 4220
},
{
"epoch": 1.72,
"grad_norm": 0.006735885515809059,
"learning_rate": 1.3113553113553114e-05,
"loss": 0.0011,
"step": 4230
},
{
"epoch": 1.73,
"grad_norm": 0.006696558557450771,
"learning_rate": 1.3097273097273098e-05,
"loss": 0.0057,
"step": 4240
},
{
"epoch": 1.73,
"grad_norm": 0.01188244204968214,
"learning_rate": 1.3080993080993084e-05,
"loss": 0.0011,
"step": 4250
},
{
"epoch": 1.73,
"grad_norm": 0.007251105271279812,
"learning_rate": 1.3064713064713066e-05,
"loss": 0.0357,
"step": 4260
},
{
"epoch": 1.74,
"grad_norm": 0.006903903558850288,
"learning_rate": 1.304843304843305e-05,
"loss": 0.0008,
"step": 4270
},
{
"epoch": 1.74,
"grad_norm": 0.008923369459807873,
"learning_rate": 1.3032153032153034e-05,
"loss": 0.0008,
"step": 4280
},
{
"epoch": 1.75,
"grad_norm": 0.006224838085472584,
"learning_rate": 1.3015873015873018e-05,
"loss": 0.0077,
"step": 4290
},
{
"epoch": 1.75,
"grad_norm": 0.00695427879691124,
"learning_rate": 1.2999592999593e-05,
"loss": 0.0008,
"step": 4300
},
{
"epoch": 1.75,
"grad_norm": 0.007040718570351601,
"learning_rate": 1.2983312983312984e-05,
"loss": 0.0008,
"step": 4310
},
{
"epoch": 1.76,
"grad_norm": 0.006210348103195429,
"learning_rate": 1.296703296703297e-05,
"loss": 0.0015,
"step": 4320
},
{
"epoch": 1.76,
"grad_norm": 0.0062638637609779835,
"learning_rate": 1.2950752950752952e-05,
"loss": 0.0044,
"step": 4330
},
{
"epoch": 1.77,
"grad_norm": 0.006666597910225391,
"learning_rate": 1.2934472934472934e-05,
"loss": 0.0007,
"step": 4340
},
{
"epoch": 1.77,
"grad_norm": 0.0061942501924932,
"learning_rate": 1.291819291819292e-05,
"loss": 0.0011,
"step": 4350
},
{
"epoch": 1.77,
"grad_norm": 0.00600019795820117,
"learning_rate": 1.2901912901912904e-05,
"loss": 0.0008,
"step": 4360
},
{
"epoch": 1.78,
"grad_norm": 0.006045353598892689,
"learning_rate": 1.2885632885632886e-05,
"loss": 0.0451,
"step": 4370
},
{
"epoch": 1.78,
"grad_norm": 0.006641109474003315,
"learning_rate": 1.2869352869352871e-05,
"loss": 0.0008,
"step": 4380
},
{
"epoch": 1.79,
"grad_norm": 0.4562086760997772,
"learning_rate": 1.2853072853072854e-05,
"loss": 0.0009,
"step": 4390
},
{
"epoch": 1.79,
"grad_norm": 0.0076696197502315044,
"learning_rate": 1.2836792836792838e-05,
"loss": 0.0348,
"step": 4400
},
{
"epoch": 1.79,
"grad_norm": 0.006937106605619192,
"learning_rate": 1.2820512820512823e-05,
"loss": 0.0596,
"step": 4410
},
{
"epoch": 1.8,
"grad_norm": 0.00782240740954876,
"learning_rate": 1.2804232804232805e-05,
"loss": 0.0851,
"step": 4420
},
{
"epoch": 1.8,
"grad_norm": 0.007307849358767271,
"learning_rate": 1.278795278795279e-05,
"loss": 0.0009,
"step": 4430
},
{
"epoch": 1.81,
"grad_norm": 0.008858690969645977,
"learning_rate": 1.2771672771672772e-05,
"loss": 0.0021,
"step": 4440
},
{
"epoch": 1.81,
"grad_norm": 0.006560084410011768,
"learning_rate": 1.2755392755392757e-05,
"loss": 0.0008,
"step": 4450
},
{
"epoch": 1.82,
"grad_norm": 0.06266916543245316,
"learning_rate": 1.273911273911274e-05,
"loss": 0.0011,
"step": 4460
},
{
"epoch": 1.82,
"grad_norm": 0.00679628923535347,
"learning_rate": 1.2722832722832723e-05,
"loss": 0.0009,
"step": 4470
},
{
"epoch": 1.82,
"grad_norm": 0.006765253376215696,
"learning_rate": 1.2706552706552709e-05,
"loss": 0.0013,
"step": 4480
},
{
"epoch": 1.83,
"grad_norm": 0.005858385004103184,
"learning_rate": 1.2690272690272691e-05,
"loss": 0.0007,
"step": 4490
},
{
"epoch": 1.83,
"grad_norm": 0.006266339216381311,
"learning_rate": 1.2673992673992674e-05,
"loss": 0.0008,
"step": 4500
},
{
"epoch": 1.84,
"grad_norm": 0.006281218025833368,
"learning_rate": 1.265771265771266e-05,
"loss": 0.1082,
"step": 4510
},
{
"epoch": 1.84,
"grad_norm": 0.006863302085548639,
"learning_rate": 1.2641432641432643e-05,
"loss": 0.0009,
"step": 4520
},
{
"epoch": 1.84,
"grad_norm": 0.013896014541387558,
"learning_rate": 1.2625152625152625e-05,
"loss": 0.0281,
"step": 4530
},
{
"epoch": 1.85,
"grad_norm": 0.24578307569026947,
"learning_rate": 1.2608872608872611e-05,
"loss": 0.001,
"step": 4540
},
{
"epoch": 1.85,
"grad_norm": 0.011449114419519901,
"learning_rate": 1.2592592592592593e-05,
"loss": 0.0007,
"step": 4550
},
{
"epoch": 1.86,
"grad_norm": 36.35368728637695,
"learning_rate": 1.2576312576312577e-05,
"loss": 0.0217,
"step": 4560
},
{
"epoch": 1.86,
"grad_norm": 0.011718428693711758,
"learning_rate": 1.256003256003256e-05,
"loss": 0.0008,
"step": 4570
},
{
"epoch": 1.86,
"grad_norm": 10.411919593811035,
"learning_rate": 1.2543752543752545e-05,
"loss": 0.0159,
"step": 4580
},
{
"epoch": 1.87,
"grad_norm": 0.006179590709507465,
"learning_rate": 1.2527472527472529e-05,
"loss": 0.0307,
"step": 4590
},
{
"epoch": 1.87,
"grad_norm": 0.0063836839981377125,
"learning_rate": 1.2511192511192511e-05,
"loss": 0.0034,
"step": 4600
},
{
"epoch": 1.88,
"grad_norm": 0.008047536946833134,
"learning_rate": 1.2494912494912497e-05,
"loss": 0.001,
"step": 4610
},
{
"epoch": 1.88,
"grad_norm": 0.010491227731108665,
"learning_rate": 1.247863247863248e-05,
"loss": 0.0008,
"step": 4620
},
{
"epoch": 1.88,
"grad_norm": 0.005860119592398405,
"learning_rate": 1.2462352462352463e-05,
"loss": 0.0007,
"step": 4630
},
{
"epoch": 1.89,
"grad_norm": 10.03593635559082,
"learning_rate": 1.2446072446072449e-05,
"loss": 0.0314,
"step": 4640
},
{
"epoch": 1.89,
"grad_norm": 0.006240949500352144,
"learning_rate": 1.2429792429792431e-05,
"loss": 0.0009,
"step": 4650
},
{
"epoch": 1.9,
"grad_norm": 0.00653426069766283,
"learning_rate": 1.2413512413512413e-05,
"loss": 0.0008,
"step": 4660
},
{
"epoch": 1.9,
"grad_norm": 0.0061131748370826244,
"learning_rate": 1.2397232397232399e-05,
"loss": 0.0385,
"step": 4670
},
{
"epoch": 1.9,
"grad_norm": 0.018757157027721405,
"learning_rate": 1.2380952380952383e-05,
"loss": 0.0008,
"step": 4680
},
{
"epoch": 1.91,
"grad_norm": 0.005603988189250231,
"learning_rate": 1.2364672364672365e-05,
"loss": 0.0007,
"step": 4690
},
{
"epoch": 1.91,
"grad_norm": 0.008327238261699677,
"learning_rate": 1.2348392348392349e-05,
"loss": 0.0007,
"step": 4700
},
{
"epoch": 1.92,
"grad_norm": 0.006342690903693438,
"learning_rate": 1.2332112332112333e-05,
"loss": 0.0027,
"step": 4710
},
{
"epoch": 1.92,
"grad_norm": 0.007467071060091257,
"learning_rate": 1.2315832315832317e-05,
"loss": 0.001,
"step": 4720
},
{
"epoch": 1.93,
"grad_norm": 0.005770612042397261,
"learning_rate": 1.22995522995523e-05,
"loss": 0.0422,
"step": 4730
},
{
"epoch": 1.93,
"grad_norm": 0.01268511638045311,
"learning_rate": 1.2283272283272285e-05,
"loss": 0.001,
"step": 4740
},
{
"epoch": 1.93,
"grad_norm": 0.025519585236907005,
"learning_rate": 1.2266992266992269e-05,
"loss": 0.019,
"step": 4750
},
{
"epoch": 1.94,
"grad_norm": 12.875621795654297,
"learning_rate": 1.2250712250712251e-05,
"loss": 0.0206,
"step": 4760
},
{
"epoch": 1.94,
"grad_norm": 0.018496304750442505,
"learning_rate": 1.2234432234432237e-05,
"loss": 0.0008,
"step": 4770
},
{
"epoch": 1.95,
"grad_norm": 0.005795106291770935,
"learning_rate": 1.2218152218152219e-05,
"loss": 0.0032,
"step": 4780
},
{
"epoch": 1.95,
"grad_norm": 0.005989160854369402,
"learning_rate": 1.2201872201872203e-05,
"loss": 0.0007,
"step": 4790
},
{
"epoch": 1.95,
"grad_norm": 0.005859148222953081,
"learning_rate": 1.2185592185592185e-05,
"loss": 0.0007,
"step": 4800
},
{
"epoch": 1.96,
"grad_norm": 0.008097686804831028,
"learning_rate": 1.216931216931217e-05,
"loss": 0.0007,
"step": 4810
},
{
"epoch": 1.96,
"grad_norm": 0.005901312455534935,
"learning_rate": 1.2153032153032153e-05,
"loss": 0.0007,
"step": 4820
},
{
"epoch": 1.97,
"grad_norm": 0.006804050877690315,
"learning_rate": 1.2136752136752137e-05,
"loss": 0.0009,
"step": 4830
},
{
"epoch": 1.97,
"grad_norm": 0.006251387298107147,
"learning_rate": 1.2120472120472123e-05,
"loss": 0.0423,
"step": 4840
},
{
"epoch": 1.97,
"grad_norm": 0.0055562574416399,
"learning_rate": 1.2104192104192105e-05,
"loss": 0.0008,
"step": 4850
},
{
"epoch": 1.98,
"grad_norm": 0.006534604821354151,
"learning_rate": 1.2087912087912089e-05,
"loss": 0.0038,
"step": 4860
},
{
"epoch": 1.98,
"grad_norm": 0.010235198773443699,
"learning_rate": 1.2071632071632073e-05,
"loss": 0.003,
"step": 4870
},
{
"epoch": 1.99,
"grad_norm": 0.006196849979460239,
"learning_rate": 1.2055352055352057e-05,
"loss": 0.0007,
"step": 4880
},
{
"epoch": 1.99,
"grad_norm": 0.015244298614561558,
"learning_rate": 1.2039072039072039e-05,
"loss": 0.0007,
"step": 4890
},
{
"epoch": 1.99,
"grad_norm": 0.03133594989776611,
"learning_rate": 1.2022792022792024e-05,
"loss": 0.0319,
"step": 4900
},
{
"epoch": 2.0,
"grad_norm": 0.012942776083946228,
"learning_rate": 1.2006512006512008e-05,
"loss": 0.0007,
"step": 4910
},
{
"epoch": 2.0,
"grad_norm": 0.0054002669639885426,
"learning_rate": 1.199023199023199e-05,
"loss": 0.0386,
"step": 4920
},
{
"epoch": 2.01,
"grad_norm": 0.006965090055018663,
"learning_rate": 1.1973951973951975e-05,
"loss": 0.0414,
"step": 4930
},
{
"epoch": 2.01,
"grad_norm": 0.005913823377341032,
"learning_rate": 1.1957671957671959e-05,
"loss": 0.0008,
"step": 4940
},
{
"epoch": 2.01,
"grad_norm": 0.00729360431432724,
"learning_rate": 1.1941391941391942e-05,
"loss": 0.0015,
"step": 4950
},
{
"epoch": 2.02,
"grad_norm": 0.005881543271243572,
"learning_rate": 1.1925111925111925e-05,
"loss": 0.0017,
"step": 4960
},
{
"epoch": 2.02,
"grad_norm": 0.00946744717657566,
"learning_rate": 1.190883190883191e-05,
"loss": 0.0008,
"step": 4970
},
{
"epoch": 2.03,
"grad_norm": 0.7791256904602051,
"learning_rate": 1.1892551892551893e-05,
"loss": 0.0456,
"step": 4980
},
{
"epoch": 2.03,
"grad_norm": 0.08430014550685883,
"learning_rate": 1.1876271876271877e-05,
"loss": 0.0048,
"step": 4990
},
{
"epoch": 2.04,
"grad_norm": 0.007524729706346989,
"learning_rate": 1.1859991859991862e-05,
"loss": 0.0008,
"step": 5000
},
{
"epoch": 2.04,
"grad_norm": 0.007158556021749973,
"learning_rate": 1.1843711843711844e-05,
"loss": 0.0007,
"step": 5010
},
{
"epoch": 2.04,
"grad_norm": 0.006158571690320969,
"learning_rate": 1.1827431827431828e-05,
"loss": 0.0007,
"step": 5020
},
{
"epoch": 2.05,
"grad_norm": 0.0062376465648412704,
"learning_rate": 1.1811151811151812e-05,
"loss": 0.0007,
"step": 5030
},
{
"epoch": 2.05,
"grad_norm": 0.009434174746274948,
"learning_rate": 1.1794871794871796e-05,
"loss": 0.0333,
"step": 5040
},
{
"epoch": 2.06,
"grad_norm": 0.006017903331667185,
"learning_rate": 1.1778591778591779e-05,
"loss": 0.0007,
"step": 5050
},
{
"epoch": 2.06,
"grad_norm": 0.007532346062362194,
"learning_rate": 1.1762311762311762e-05,
"loss": 0.0007,
"step": 5060
},
{
"epoch": 2.06,
"grad_norm": 0.005684974603354931,
"learning_rate": 1.1746031746031748e-05,
"loss": 0.0008,
"step": 5070
},
{
"epoch": 2.07,
"grad_norm": 0.005241623613983393,
"learning_rate": 1.172975172975173e-05,
"loss": 0.0306,
"step": 5080
},
{
"epoch": 2.07,
"grad_norm": 0.019347479566931725,
"learning_rate": 1.1713471713471714e-05,
"loss": 0.0008,
"step": 5090
},
{
"epoch": 2.08,
"grad_norm": 0.08700444549322128,
"learning_rate": 1.1697191697191698e-05,
"loss": 0.0009,
"step": 5100
},
{
"epoch": 2.08,
"grad_norm": 0.005539617035537958,
"learning_rate": 1.1680911680911682e-05,
"loss": 0.0009,
"step": 5110
},
{
"epoch": 2.08,
"grad_norm": 0.005851482041180134,
"learning_rate": 1.1664631664631664e-05,
"loss": 0.0007,
"step": 5120
},
{
"epoch": 2.09,
"grad_norm": 0.007532169576734304,
"learning_rate": 1.164835164835165e-05,
"loss": 0.0011,
"step": 5130
},
{
"epoch": 2.09,
"grad_norm": 0.00506225973367691,
"learning_rate": 1.1632071632071634e-05,
"loss": 0.0007,
"step": 5140
},
{
"epoch": 2.1,
"grad_norm": 0.005589496809989214,
"learning_rate": 1.1615791615791616e-05,
"loss": 0.0007,
"step": 5150
},
{
"epoch": 2.1,
"grad_norm": 0.004957486409693956,
"learning_rate": 1.1599511599511602e-05,
"loss": 0.0156,
"step": 5160
},
{
"epoch": 2.1,
"grad_norm": 0.00666527496650815,
"learning_rate": 1.1583231583231584e-05,
"loss": 0.0007,
"step": 5170
},
{
"epoch": 2.11,
"grad_norm": 0.006306789815425873,
"learning_rate": 1.1566951566951568e-05,
"loss": 0.0006,
"step": 5180
},
{
"epoch": 2.11,
"grad_norm": 0.005329395178705454,
"learning_rate": 1.155067155067155e-05,
"loss": 0.0006,
"step": 5190
},
{
"epoch": 2.12,
"grad_norm": 0.0049823857843875885,
"learning_rate": 1.1534391534391536e-05,
"loss": 0.0006,
"step": 5200
},
{
"epoch": 2.12,
"grad_norm": 0.0051444037817418575,
"learning_rate": 1.1518111518111518e-05,
"loss": 0.0022,
"step": 5210
},
{
"epoch": 2.12,
"grad_norm": 0.00532697094604373,
"learning_rate": 1.1501831501831502e-05,
"loss": 0.0006,
"step": 5220
},
{
"epoch": 2.13,
"grad_norm": 0.006971771828830242,
"learning_rate": 1.1485551485551488e-05,
"loss": 0.0007,
"step": 5230
},
{
"epoch": 2.13,
"grad_norm": 0.005065458826720715,
"learning_rate": 1.146927146927147e-05,
"loss": 0.0006,
"step": 5240
},
{
"epoch": 2.14,
"grad_norm": 0.00542556494474411,
"learning_rate": 1.1452991452991454e-05,
"loss": 0.0006,
"step": 5250
},
{
"epoch": 2.14,
"grad_norm": 0.005721778143197298,
"learning_rate": 1.1436711436711438e-05,
"loss": 0.0006,
"step": 5260
},
{
"epoch": 2.14,
"grad_norm": 0.0050778863951563835,
"learning_rate": 1.1420431420431422e-05,
"loss": 0.0006,
"step": 5270
},
{
"epoch": 2.15,
"grad_norm": 0.005689846817404032,
"learning_rate": 1.1404151404151404e-05,
"loss": 0.0007,
"step": 5280
},
{
"epoch": 2.15,
"grad_norm": 0.005032387096434832,
"learning_rate": 1.138787138787139e-05,
"loss": 0.0053,
"step": 5290
},
{
"epoch": 2.16,
"grad_norm": 0.004602556582540274,
"learning_rate": 1.1371591371591374e-05,
"loss": 0.0006,
"step": 5300
},
{
"epoch": 2.16,
"grad_norm": 0.005181928165256977,
"learning_rate": 1.1355311355311356e-05,
"loss": 0.0006,
"step": 5310
},
{
"epoch": 2.17,
"grad_norm": 0.004627116955816746,
"learning_rate": 1.1339031339031338e-05,
"loss": 0.0006,
"step": 5320
},
{
"epoch": 2.17,
"grad_norm": 0.004680185578763485,
"learning_rate": 1.1322751322751324e-05,
"loss": 0.0006,
"step": 5330
},
{
"epoch": 2.17,
"grad_norm": 0.00517154298722744,
"learning_rate": 1.1306471306471308e-05,
"loss": 0.0006,
"step": 5340
},
{
"epoch": 2.18,
"grad_norm": 0.2655492126941681,
"learning_rate": 1.129019129019129e-05,
"loss": 0.04,
"step": 5350
},
{
"epoch": 2.18,
"grad_norm": 0.004791987128555775,
"learning_rate": 1.1273911273911276e-05,
"loss": 0.0027,
"step": 5360
},
{
"epoch": 2.19,
"grad_norm": 0.00524140102788806,
"learning_rate": 1.1257631257631258e-05,
"loss": 0.0019,
"step": 5370
},
{
"epoch": 2.19,
"grad_norm": 0.004854326602071524,
"learning_rate": 1.1241351241351242e-05,
"loss": 0.0006,
"step": 5380
},
{
"epoch": 2.19,
"grad_norm": 0.004912737291306257,
"learning_rate": 1.1225071225071227e-05,
"loss": 0.0229,
"step": 5390
},
{
"epoch": 2.2,
"grad_norm": 0.009351348504424095,
"learning_rate": 1.120879120879121e-05,
"loss": 0.0006,
"step": 5400
},
{
"epoch": 2.2,
"grad_norm": 0.006594196427613497,
"learning_rate": 1.1192511192511194e-05,
"loss": 0.0007,
"step": 5410
},
{
"epoch": 2.21,
"grad_norm": 0.004785753786563873,
"learning_rate": 1.1176231176231178e-05,
"loss": 0.0006,
"step": 5420
},
{
"epoch": 2.21,
"grad_norm": 0.010175659321248531,
"learning_rate": 1.1159951159951162e-05,
"loss": 0.0347,
"step": 5430
},
{
"epoch": 2.21,
"grad_norm": 0.007659697439521551,
"learning_rate": 1.1143671143671144e-05,
"loss": 0.0006,
"step": 5440
},
{
"epoch": 2.22,
"grad_norm": 0.005518093705177307,
"learning_rate": 1.1127391127391128e-05,
"loss": 0.0007,
"step": 5450
},
{
"epoch": 2.22,
"grad_norm": 0.004838414024561644,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.0006,
"step": 5460
},
{
"epoch": 2.23,
"grad_norm": 0.004535248037427664,
"learning_rate": 1.1094831094831096e-05,
"loss": 0.0007,
"step": 5470
},
{
"epoch": 2.23,
"grad_norm": 0.004755628295242786,
"learning_rate": 1.1078551078551078e-05,
"loss": 0.0006,
"step": 5480
},
{
"epoch": 2.23,
"grad_norm": 0.007153332699090242,
"learning_rate": 1.1062271062271063e-05,
"loss": 0.0006,
"step": 5490
},
{
"epoch": 2.24,
"grad_norm": 0.004593558143824339,
"learning_rate": 1.1045991045991047e-05,
"loss": 0.0006,
"step": 5500
},
{
"epoch": 2.24,
"grad_norm": 0.004781143739819527,
"learning_rate": 1.102971102971103e-05,
"loss": 0.0187,
"step": 5510
},
{
"epoch": 2.25,
"grad_norm": 0.022694548591971397,
"learning_rate": 1.1013431013431015e-05,
"loss": 0.0006,
"step": 5520
},
{
"epoch": 2.25,
"grad_norm": 0.004701571073383093,
"learning_rate": 1.0997150997150998e-05,
"loss": 0.0005,
"step": 5530
},
{
"epoch": 2.25,
"grad_norm": 0.014217639341950417,
"learning_rate": 1.0980870980870981e-05,
"loss": 0.0006,
"step": 5540
},
{
"epoch": 2.26,
"grad_norm": 0.0047623575665056705,
"learning_rate": 1.0964590964590967e-05,
"loss": 0.0005,
"step": 5550
},
{
"epoch": 2.26,
"grad_norm": 0.004431570880115032,
"learning_rate": 1.094831094831095e-05,
"loss": 0.0006,
"step": 5560
},
{
"epoch": 2.27,
"grad_norm": 0.006182719487696886,
"learning_rate": 1.0932030932030933e-05,
"loss": 0.0006,
"step": 5570
},
{
"epoch": 2.27,
"grad_norm": 0.004717973992228508,
"learning_rate": 1.0915750915750916e-05,
"loss": 0.0005,
"step": 5580
},
{
"epoch": 2.28,
"grad_norm": 0.005284770391881466,
"learning_rate": 1.0899470899470901e-05,
"loss": 0.0051,
"step": 5590
},
{
"epoch": 2.28,
"grad_norm": 0.004852925427258015,
"learning_rate": 1.0883190883190883e-05,
"loss": 0.0129,
"step": 5600
},
{
"epoch": 2.28,
"grad_norm": 0.011825304478406906,
"learning_rate": 1.0866910866910867e-05,
"loss": 0.0006,
"step": 5610
},
{
"epoch": 2.29,
"grad_norm": 5.4084672927856445,
"learning_rate": 1.0850630850630853e-05,
"loss": 0.0014,
"step": 5620
},
{
"epoch": 2.29,
"grad_norm": 0.0045865620486438274,
"learning_rate": 1.0834350834350835e-05,
"loss": 0.0012,
"step": 5630
},
{
"epoch": 2.3,
"grad_norm": 0.004212076775729656,
"learning_rate": 1.0818070818070818e-05,
"loss": 0.0005,
"step": 5640
},
{
"epoch": 2.3,
"grad_norm": 0.0043626646511256695,
"learning_rate": 1.0801790801790803e-05,
"loss": 0.0005,
"step": 5650
},
{
"epoch": 2.3,
"grad_norm": 0.003995486069470644,
"learning_rate": 1.0785510785510787e-05,
"loss": 0.0005,
"step": 5660
},
{
"epoch": 2.31,
"grad_norm": 12.674348831176758,
"learning_rate": 1.076923076923077e-05,
"loss": 0.0288,
"step": 5670
},
{
"epoch": 2.31,
"grad_norm": 0.004922170657664537,
"learning_rate": 1.0752950752950755e-05,
"loss": 0.0005,
"step": 5680
},
{
"epoch": 2.32,
"grad_norm": 0.013311301358044147,
"learning_rate": 1.0736670736670737e-05,
"loss": 0.0006,
"step": 5690
},
{
"epoch": 2.32,
"grad_norm": 0.004092982970178127,
"learning_rate": 1.0720390720390721e-05,
"loss": 0.0306,
"step": 5700
},
{
"epoch": 2.32,
"grad_norm": 0.005637271795421839,
"learning_rate": 1.0704110704110703e-05,
"loss": 0.0483,
"step": 5710
},
{
"epoch": 2.33,
"grad_norm": 8.750419616699219,
"learning_rate": 1.0687830687830689e-05,
"loss": 0.0386,
"step": 5720
},
{
"epoch": 2.33,
"grad_norm": 0.01064012385904789,
"learning_rate": 1.0671550671550673e-05,
"loss": 0.0006,
"step": 5730
},
{
"epoch": 2.34,
"grad_norm": 0.004589624237269163,
"learning_rate": 1.0655270655270655e-05,
"loss": 0.0006,
"step": 5740
},
{
"epoch": 2.34,
"grad_norm": 0.004802080802619457,
"learning_rate": 1.0638990638990641e-05,
"loss": 0.0009,
"step": 5750
},
{
"epoch": 2.34,
"grad_norm": 0.004713993053883314,
"learning_rate": 1.0622710622710623e-05,
"loss": 0.0006,
"step": 5760
},
{
"epoch": 2.35,
"grad_norm": 0.004530477803200483,
"learning_rate": 1.0606430606430607e-05,
"loss": 0.001,
"step": 5770
},
{
"epoch": 2.35,
"grad_norm": 0.00422940356656909,
"learning_rate": 1.0590150590150593e-05,
"loss": 0.0007,
"step": 5780
},
{
"epoch": 2.36,
"grad_norm": 0.004178835544735193,
"learning_rate": 1.0573870573870575e-05,
"loss": 0.0006,
"step": 5790
},
{
"epoch": 2.36,
"grad_norm": 0.006506350357085466,
"learning_rate": 1.0557590557590557e-05,
"loss": 0.0005,
"step": 5800
},
{
"epoch": 2.36,
"grad_norm": 0.004273206926882267,
"learning_rate": 1.0541310541310543e-05,
"loss": 0.0005,
"step": 5810
},
{
"epoch": 2.37,
"grad_norm": 0.004112168215215206,
"learning_rate": 1.0525030525030527e-05,
"loss": 0.0005,
"step": 5820
},
{
"epoch": 2.37,
"grad_norm": 0.005212805233895779,
"learning_rate": 1.0508750508750509e-05,
"loss": 0.0005,
"step": 5830
},
{
"epoch": 2.38,
"grad_norm": 0.004351438954472542,
"learning_rate": 1.0492470492470493e-05,
"loss": 0.0005,
"step": 5840
},
{
"epoch": 2.38,
"grad_norm": 0.011514941230416298,
"learning_rate": 1.0476190476190477e-05,
"loss": 0.0005,
"step": 5850
},
{
"epoch": 2.39,
"grad_norm": 0.005969716235995293,
"learning_rate": 1.045991045991046e-05,
"loss": 0.0005,
"step": 5860
},
{
"epoch": 2.39,
"grad_norm": 0.004150481894612312,
"learning_rate": 1.0443630443630443e-05,
"loss": 0.0005,
"step": 5870
},
{
"epoch": 2.39,
"grad_norm": 0.003940541297197342,
"learning_rate": 1.0427350427350429e-05,
"loss": 0.0005,
"step": 5880
},
{
"epoch": 2.4,
"grad_norm": 0.00408910820260644,
"learning_rate": 1.0411070411070413e-05,
"loss": 0.0005,
"step": 5890
},
{
"epoch": 2.4,
"grad_norm": 0.0038459610659629107,
"learning_rate": 1.0394790394790395e-05,
"loss": 0.0006,
"step": 5900
},
{
"epoch": 2.41,
"grad_norm": 0.004051607567816973,
"learning_rate": 1.037851037851038e-05,
"loss": 0.0449,
"step": 5910
},
{
"epoch": 2.41,
"grad_norm": 0.005520727019757032,
"learning_rate": 1.0362230362230363e-05,
"loss": 0.0078,
"step": 5920
},
{
"epoch": 2.41,
"grad_norm": 0.004394978284835815,
"learning_rate": 1.0345950345950347e-05,
"loss": 0.0564,
"step": 5930
},
{
"epoch": 2.42,
"grad_norm": 0.004857844207435846,
"learning_rate": 1.0329670329670332e-05,
"loss": 0.0005,
"step": 5940
},
{
"epoch": 2.42,
"grad_norm": 0.06114115193486214,
"learning_rate": 1.0313390313390315e-05,
"loss": 0.0007,
"step": 5950
},
{
"epoch": 2.43,
"grad_norm": 0.004661387763917446,
"learning_rate": 1.0297110297110297e-05,
"loss": 0.0014,
"step": 5960
},
{
"epoch": 2.43,
"grad_norm": 0.005134343635290861,
"learning_rate": 1.028083028083028e-05,
"loss": 0.001,
"step": 5970
},
{
"epoch": 2.43,
"grad_norm": 0.004655875731259584,
"learning_rate": 1.0264550264550266e-05,
"loss": 0.0006,
"step": 5980
},
{
"epoch": 2.44,
"grad_norm": 0.0050579700618982315,
"learning_rate": 1.0248270248270249e-05,
"loss": 0.0015,
"step": 5990
},
{
"epoch": 2.44,
"grad_norm": 0.0047796061262488365,
"learning_rate": 1.0231990231990233e-05,
"loss": 0.0005,
"step": 6000
},
{
"epoch": 2.45,
"grad_norm": 0.003949730657041073,
"learning_rate": 1.0215710215710217e-05,
"loss": 0.0005,
"step": 6010
},
{
"epoch": 2.45,
"grad_norm": 0.004095940385013819,
"learning_rate": 1.01994301994302e-05,
"loss": 0.0005,
"step": 6020
},
{
"epoch": 2.45,
"grad_norm": 0.005133763421326876,
"learning_rate": 1.0183150183150183e-05,
"loss": 0.0005,
"step": 6030
},
{
"epoch": 2.46,
"grad_norm": 0.01469303946942091,
"learning_rate": 1.0166870166870168e-05,
"loss": 0.0013,
"step": 6040
},
{
"epoch": 2.46,
"grad_norm": 0.004049224779009819,
"learning_rate": 1.0150590150590152e-05,
"loss": 0.0018,
"step": 6050
},
{
"epoch": 2.47,
"grad_norm": 0.004054594319313765,
"learning_rate": 1.0134310134310135e-05,
"loss": 0.0184,
"step": 6060
},
{
"epoch": 2.47,
"grad_norm": 0.004326994996517897,
"learning_rate": 1.011803011803012e-05,
"loss": 0.0005,
"step": 6070
},
{
"epoch": 2.47,
"grad_norm": 0.004046597983688116,
"learning_rate": 1.0101750101750102e-05,
"loss": 0.0005,
"step": 6080
},
{
"epoch": 2.48,
"grad_norm": 0.00401376374065876,
"learning_rate": 1.0085470085470086e-05,
"loss": 0.0006,
"step": 6090
},
{
"epoch": 2.48,
"grad_norm": 0.005812219809740782,
"learning_rate": 1.0069190069190069e-05,
"loss": 0.0006,
"step": 6100
},
{
"epoch": 2.49,
"grad_norm": 0.003832985181361437,
"learning_rate": 1.0052910052910054e-05,
"loss": 0.0005,
"step": 6110
},
{
"epoch": 2.49,
"grad_norm": 0.07061895728111267,
"learning_rate": 1.0036630036630037e-05,
"loss": 0.0005,
"step": 6120
},
{
"epoch": 2.49,
"grad_norm": 0.0039010499604046345,
"learning_rate": 1.002035002035002e-05,
"loss": 0.0017,
"step": 6130
},
{
"epoch": 2.5,
"grad_norm": 0.004468753468245268,
"learning_rate": 1.0004070004070006e-05,
"loss": 0.0022,
"step": 6140
},
{
"epoch": 2.5,
"grad_norm": 0.004342631436884403,
"learning_rate": 9.987789987789988e-06,
"loss": 0.0005,
"step": 6150
},
{
"epoch": 2.51,
"grad_norm": 0.011564524844288826,
"learning_rate": 9.971509971509972e-06,
"loss": 0.0005,
"step": 6160
},
{
"epoch": 2.51,
"grad_norm": 0.00380577496252954,
"learning_rate": 9.955229955229956e-06,
"loss": 0.0005,
"step": 6170
},
{
"epoch": 2.52,
"grad_norm": 0.0037414473481476307,
"learning_rate": 9.93894993894994e-06,
"loss": 0.0011,
"step": 6180
},
{
"epoch": 2.52,
"grad_norm": 0.003814409486949444,
"learning_rate": 9.922669922669922e-06,
"loss": 0.0005,
"step": 6190
},
{
"epoch": 2.52,
"grad_norm": 0.0039341021329164505,
"learning_rate": 9.906389906389906e-06,
"loss": 0.0005,
"step": 6200
},
{
"epoch": 2.53,
"grad_norm": 0.003710733028128743,
"learning_rate": 9.890109890109892e-06,
"loss": 0.0005,
"step": 6210
},
{
"epoch": 2.53,
"grad_norm": 3.491090774536133,
"learning_rate": 9.873829873829874e-06,
"loss": 0.0356,
"step": 6220
},
{
"epoch": 2.54,
"grad_norm": 0.005923949647694826,
"learning_rate": 9.857549857549858e-06,
"loss": 0.0005,
"step": 6230
},
{
"epoch": 2.54,
"grad_norm": 0.0036399513483047485,
"learning_rate": 9.841269841269842e-06,
"loss": 0.0004,
"step": 6240
},
{
"epoch": 2.54,
"grad_norm": 0.003884287318214774,
"learning_rate": 9.824989824989826e-06,
"loss": 0.0005,
"step": 6250
},
{
"epoch": 2.55,
"grad_norm": 0.0036194841377437115,
"learning_rate": 9.80870980870981e-06,
"loss": 0.0004,
"step": 6260
},
{
"epoch": 2.55,
"grad_norm": 0.005207626614719629,
"learning_rate": 9.792429792429792e-06,
"loss": 0.0004,
"step": 6270
},
{
"epoch": 2.56,
"grad_norm": 0.008327057585120201,
"learning_rate": 9.776149776149776e-06,
"loss": 0.0005,
"step": 6280
},
{
"epoch": 2.56,
"grad_norm": 0.003949583508074284,
"learning_rate": 9.759869759869762e-06,
"loss": 0.0004,
"step": 6290
},
{
"epoch": 2.56,
"grad_norm": 0.004071325063705444,
"learning_rate": 9.743589743589744e-06,
"loss": 0.0005,
"step": 6300
},
{
"epoch": 2.57,
"grad_norm": 0.0036700996570289135,
"learning_rate": 9.727309727309728e-06,
"loss": 0.0004,
"step": 6310
},
{
"epoch": 2.57,
"grad_norm": 0.005211398471146822,
"learning_rate": 9.711029711029712e-06,
"loss": 0.0005,
"step": 6320
},
{
"epoch": 2.58,
"grad_norm": 0.003708272473886609,
"learning_rate": 9.694749694749696e-06,
"loss": 0.0004,
"step": 6330
},
{
"epoch": 2.58,
"grad_norm": 0.0042539420537650585,
"learning_rate": 9.67846967846968e-06,
"loss": 0.0004,
"step": 6340
},
{
"epoch": 2.58,
"grad_norm": 0.0038529515732079744,
"learning_rate": 9.662189662189662e-06,
"loss": 0.0004,
"step": 6350
},
{
"epoch": 2.59,
"grad_norm": 0.003946115728467703,
"learning_rate": 9.645909645909646e-06,
"loss": 0.0005,
"step": 6360
},
{
"epoch": 2.59,
"grad_norm": 0.004324799869209528,
"learning_rate": 9.62962962962963e-06,
"loss": 0.0004,
"step": 6370
},
{
"epoch": 2.6,
"grad_norm": 0.0038023737724870443,
"learning_rate": 9.613349613349614e-06,
"loss": 0.0004,
"step": 6380
},
{
"epoch": 2.6,
"grad_norm": 0.0037666463758796453,
"learning_rate": 9.597069597069598e-06,
"loss": 0.0004,
"step": 6390
},
{
"epoch": 2.6,
"grad_norm": 0.0034590172581374645,
"learning_rate": 9.580789580789582e-06,
"loss": 0.017,
"step": 6400
},
{
"epoch": 2.61,
"grad_norm": 0.0038201683200895786,
"learning_rate": 9.564509564509566e-06,
"loss": 0.0004,
"step": 6410
},
{
"epoch": 2.61,
"grad_norm": 0.004171228501945734,
"learning_rate": 9.54822954822955e-06,
"loss": 0.0004,
"step": 6420
},
{
"epoch": 2.62,
"grad_norm": 0.0038926773704588413,
"learning_rate": 9.531949531949532e-06,
"loss": 0.0004,
"step": 6430
},
{
"epoch": 2.62,
"grad_norm": 0.0037587357219308615,
"learning_rate": 9.515669515669516e-06,
"loss": 0.0004,
"step": 6440
},
{
"epoch": 2.63,
"grad_norm": 0.0034505994990468025,
"learning_rate": 9.4993894993895e-06,
"loss": 0.0024,
"step": 6450
},
{
"epoch": 2.63,
"grad_norm": 0.0034958263859152794,
"learning_rate": 9.483109483109484e-06,
"loss": 0.0004,
"step": 6460
},
{
"epoch": 2.63,
"grad_norm": 0.0037652612663805485,
"learning_rate": 9.466829466829468e-06,
"loss": 0.0004,
"step": 6470
},
{
"epoch": 2.64,
"grad_norm": 0.003452475182712078,
"learning_rate": 9.450549450549452e-06,
"loss": 0.0004,
"step": 6480
},
{
"epoch": 2.64,
"grad_norm": 0.005090977996587753,
"learning_rate": 9.434269434269436e-06,
"loss": 0.0004,
"step": 6490
},
{
"epoch": 2.65,
"grad_norm": 0.0036007205490022898,
"learning_rate": 9.417989417989418e-06,
"loss": 0.0004,
"step": 6500
},
{
"epoch": 2.65,
"grad_norm": 0.0033244409132748842,
"learning_rate": 9.401709401709402e-06,
"loss": 0.0004,
"step": 6510
},
{
"epoch": 2.65,
"grad_norm": 0.00387198431417346,
"learning_rate": 9.385429385429386e-06,
"loss": 0.0004,
"step": 6520
},
{
"epoch": 2.66,
"grad_norm": 0.003582969307899475,
"learning_rate": 9.36914936914937e-06,
"loss": 0.0004,
"step": 6530
},
{
"epoch": 2.66,
"grad_norm": 0.0032744621858000755,
"learning_rate": 9.352869352869354e-06,
"loss": 0.0005,
"step": 6540
},
{
"epoch": 2.67,
"grad_norm": 0.0034951018169522285,
"learning_rate": 9.336589336589338e-06,
"loss": 0.0004,
"step": 6550
},
{
"epoch": 2.67,
"grad_norm": 0.0034060273319482803,
"learning_rate": 9.320309320309321e-06,
"loss": 0.0004,
"step": 6560
},
{
"epoch": 2.67,
"grad_norm": 0.0034066797234117985,
"learning_rate": 9.304029304029305e-06,
"loss": 0.0004,
"step": 6570
},
{
"epoch": 2.68,
"grad_norm": 0.0035453049931675196,
"learning_rate": 9.287749287749288e-06,
"loss": 0.0004,
"step": 6580
},
{
"epoch": 2.68,
"grad_norm": 0.0033404843416064978,
"learning_rate": 9.271469271469272e-06,
"loss": 0.0004,
"step": 6590
},
{
"epoch": 2.69,
"grad_norm": 0.0032289137598127127,
"learning_rate": 9.255189255189256e-06,
"loss": 0.0004,
"step": 6600
},
{
"epoch": 2.69,
"grad_norm": 0.0035338301677256823,
"learning_rate": 9.23890923890924e-06,
"loss": 0.0004,
"step": 6610
},
{
"epoch": 2.69,
"grad_norm": 0.0032329142559319735,
"learning_rate": 9.222629222629223e-06,
"loss": 0.0004,
"step": 6620
},
{
"epoch": 2.7,
"grad_norm": 0.0033918411936610937,
"learning_rate": 9.206349206349207e-06,
"loss": 0.0004,
"step": 6630
},
{
"epoch": 2.7,
"grad_norm": 0.003434843849390745,
"learning_rate": 9.190069190069191e-06,
"loss": 0.0004,
"step": 6640
},
{
"epoch": 2.71,
"grad_norm": 0.0032904883846640587,
"learning_rate": 9.173789173789175e-06,
"loss": 0.0004,
"step": 6650
},
{
"epoch": 2.71,
"grad_norm": 0.003165784990414977,
"learning_rate": 9.157509157509158e-06,
"loss": 0.0004,
"step": 6660
},
{
"epoch": 2.71,
"grad_norm": 0.0034379889257252216,
"learning_rate": 9.141229141229141e-06,
"loss": 0.0004,
"step": 6670
},
{
"epoch": 2.72,
"grad_norm": 0.0032244266476482153,
"learning_rate": 9.124949124949125e-06,
"loss": 0.001,
"step": 6680
},
{
"epoch": 2.72,
"grad_norm": 0.003119837259873748,
"learning_rate": 9.10866910866911e-06,
"loss": 0.0004,
"step": 6690
},
{
"epoch": 2.73,
"grad_norm": 0.0038290254306048155,
"learning_rate": 9.092389092389093e-06,
"loss": 0.0004,
"step": 6700
},
{
"epoch": 2.73,
"grad_norm": 0.0032256192062050104,
"learning_rate": 9.076109076109077e-06,
"loss": 0.0004,
"step": 6710
},
{
"epoch": 2.74,
"grad_norm": 0.004083781037479639,
"learning_rate": 9.059829059829061e-06,
"loss": 0.0004,
"step": 6720
},
{
"epoch": 2.74,
"grad_norm": 0.003274232381954789,
"learning_rate": 9.043549043549045e-06,
"loss": 0.0004,
"step": 6730
},
{
"epoch": 2.74,
"grad_norm": 0.0032298911828547716,
"learning_rate": 9.027269027269027e-06,
"loss": 0.0004,
"step": 6740
},
{
"epoch": 2.75,
"grad_norm": 0.0031462605111300945,
"learning_rate": 9.010989010989011e-06,
"loss": 0.0425,
"step": 6750
},
{
"epoch": 2.75,
"grad_norm": 0.00312459422275424,
"learning_rate": 8.994708994708995e-06,
"loss": 0.0004,
"step": 6760
},
{
"epoch": 2.76,
"grad_norm": 0.0036323266103863716,
"learning_rate": 8.97842897842898e-06,
"loss": 0.0004,
"step": 6770
},
{
"epoch": 2.76,
"grad_norm": 0.0033034805674105883,
"learning_rate": 8.962148962148963e-06,
"loss": 0.0004,
"step": 6780
},
{
"epoch": 2.76,
"grad_norm": 0.003054459812119603,
"learning_rate": 8.945868945868947e-06,
"loss": 0.0004,
"step": 6790
},
{
"epoch": 2.77,
"grad_norm": 0.005314236972481012,
"learning_rate": 8.929588929588931e-06,
"loss": 0.0004,
"step": 6800
},
{
"epoch": 2.77,
"grad_norm": 0.010932357981801033,
"learning_rate": 8.913308913308915e-06,
"loss": 0.0004,
"step": 6810
},
{
"epoch": 2.78,
"grad_norm": 0.0031523159705102444,
"learning_rate": 8.897028897028897e-06,
"loss": 0.0004,
"step": 6820
},
{
"epoch": 2.78,
"grad_norm": 0.0034312924835830927,
"learning_rate": 8.880748880748881e-06,
"loss": 0.0101,
"step": 6830
},
{
"epoch": 2.78,
"grad_norm": 0.00318572367541492,
"learning_rate": 8.864468864468865e-06,
"loss": 0.0004,
"step": 6840
},
{
"epoch": 2.79,
"grad_norm": 0.0032511164899915457,
"learning_rate": 8.848188848188849e-06,
"loss": 0.0379,
"step": 6850
},
{
"epoch": 2.79,
"grad_norm": 0.0037167894188314676,
"learning_rate": 8.831908831908833e-06,
"loss": 0.0004,
"step": 6860
},
{
"epoch": 2.8,
"grad_norm": 0.003721152199432254,
"learning_rate": 8.815628815628817e-06,
"loss": 0.0008,
"step": 6870
},
{
"epoch": 2.8,
"grad_norm": 0.0030927169136703014,
"learning_rate": 8.7993487993488e-06,
"loss": 0.0004,
"step": 6880
},
{
"epoch": 2.8,
"grad_norm": 0.0034221247769892216,
"learning_rate": 8.783068783068783e-06,
"loss": 0.0031,
"step": 6890
},
{
"epoch": 2.81,
"grad_norm": 0.0033293466549366713,
"learning_rate": 8.766788766788767e-06,
"loss": 0.0004,
"step": 6900
},
{
"epoch": 2.81,
"grad_norm": 0.003214113647118211,
"learning_rate": 8.750508750508751e-06,
"loss": 0.0004,
"step": 6910
},
{
"epoch": 2.82,
"grad_norm": 0.0032116910442709923,
"learning_rate": 8.734228734228735e-06,
"loss": 0.034,
"step": 6920
},
{
"epoch": 2.82,
"grad_norm": 0.003405655035749078,
"learning_rate": 8.717948717948719e-06,
"loss": 0.0515,
"step": 6930
},
{
"epoch": 2.82,
"grad_norm": 0.003949843347072601,
"learning_rate": 8.701668701668703e-06,
"loss": 0.0508,
"step": 6940
},
{
"epoch": 2.83,
"grad_norm": 0.0030140685848891735,
"learning_rate": 8.685388685388687e-06,
"loss": 0.0385,
"step": 6950
},
{
"epoch": 2.83,
"grad_norm": 0.0034769896883517504,
"learning_rate": 8.66910866910867e-06,
"loss": 0.0004,
"step": 6960
},
{
"epoch": 2.84,
"grad_norm": 0.005965403746813536,
"learning_rate": 8.652828652828653e-06,
"loss": 0.0454,
"step": 6970
},
{
"epoch": 2.84,
"grad_norm": 0.004475270863622427,
"learning_rate": 8.636548636548637e-06,
"loss": 0.0005,
"step": 6980
},
{
"epoch": 2.84,
"grad_norm": 0.0039094300009310246,
"learning_rate": 8.62026862026862e-06,
"loss": 0.0005,
"step": 6990
},
{
"epoch": 2.85,
"grad_norm": 0.004547227174043655,
"learning_rate": 8.603988603988605e-06,
"loss": 0.0004,
"step": 7000
},
{
"epoch": 2.85,
"grad_norm": 0.0033658877946436405,
"learning_rate": 8.587708587708589e-06,
"loss": 0.0005,
"step": 7010
},
{
"epoch": 2.86,
"grad_norm": 0.0037282053381204605,
"learning_rate": 8.571428571428571e-06,
"loss": 0.0005,
"step": 7020
},
{
"epoch": 2.86,
"grad_norm": 0.012108271941542625,
"learning_rate": 8.555148555148557e-06,
"loss": 0.0005,
"step": 7030
},
{
"epoch": 2.87,
"grad_norm": 0.00378889380954206,
"learning_rate": 8.53886853886854e-06,
"loss": 0.0142,
"step": 7040
},
{
"epoch": 2.87,
"grad_norm": 0.0037225610576570034,
"learning_rate": 8.522588522588523e-06,
"loss": 0.0009,
"step": 7050
},
{
"epoch": 2.87,
"grad_norm": 0.005083514377474785,
"learning_rate": 8.506308506308507e-06,
"loss": 0.0004,
"step": 7060
},
{
"epoch": 2.88,
"grad_norm": 0.0035945470444858074,
"learning_rate": 8.49002849002849e-06,
"loss": 0.0005,
"step": 7070
},
{
"epoch": 2.88,
"grad_norm": 0.0031938895117491484,
"learning_rate": 8.473748473748475e-06,
"loss": 0.0007,
"step": 7080
},
{
"epoch": 2.89,
"grad_norm": 0.007891247980296612,
"learning_rate": 8.457468457468459e-06,
"loss": 0.0004,
"step": 7090
},
{
"epoch": 2.89,
"grad_norm": 0.003397688502445817,
"learning_rate": 8.44118844118844e-06,
"loss": 0.0004,
"step": 7100
},
{
"epoch": 2.89,
"grad_norm": 0.004096095450222492,
"learning_rate": 8.424908424908426e-06,
"loss": 0.0004,
"step": 7110
},
{
"epoch": 2.9,
"grad_norm": 0.004969074856489897,
"learning_rate": 8.40862840862841e-06,
"loss": 0.0004,
"step": 7120
},
{
"epoch": 2.9,
"grad_norm": 0.002869043732061982,
"learning_rate": 8.392348392348393e-06,
"loss": 0.0376,
"step": 7130
},
{
"epoch": 2.91,
"grad_norm": 0.004255395848304033,
"learning_rate": 8.376068376068377e-06,
"loss": 0.0004,
"step": 7140
},
{
"epoch": 2.91,
"grad_norm": 0.003371414029970765,
"learning_rate": 8.35978835978836e-06,
"loss": 0.0005,
"step": 7150
},
{
"epoch": 2.91,
"grad_norm": 0.0031468465458601713,
"learning_rate": 8.343508343508344e-06,
"loss": 0.0004,
"step": 7160
},
{
"epoch": 2.92,
"grad_norm": 0.004064807202666998,
"learning_rate": 8.327228327228328e-06,
"loss": 0.0004,
"step": 7170
},
{
"epoch": 2.92,
"grad_norm": 0.0038253762759268284,
"learning_rate": 8.31094831094831e-06,
"loss": 0.0275,
"step": 7180
},
{
"epoch": 2.93,
"grad_norm": 0.0029601927381008863,
"learning_rate": 8.294668294668296e-06,
"loss": 0.0162,
"step": 7190
},
{
"epoch": 2.93,
"grad_norm": 0.0035592832136899233,
"learning_rate": 8.278388278388278e-06,
"loss": 0.001,
"step": 7200
},
{
"epoch": 2.93,
"grad_norm": 0.003166656941175461,
"learning_rate": 8.262108262108262e-06,
"loss": 0.0004,
"step": 7210
},
{
"epoch": 2.94,
"grad_norm": 0.0038591506890952587,
"learning_rate": 8.245828245828246e-06,
"loss": 0.0004,
"step": 7220
},
{
"epoch": 2.94,
"grad_norm": 0.004316645674407482,
"learning_rate": 8.22954822954823e-06,
"loss": 0.0339,
"step": 7230
},
{
"epoch": 2.95,
"grad_norm": 0.003106352873146534,
"learning_rate": 8.213268213268214e-06,
"loss": 0.0004,
"step": 7240
},
{
"epoch": 2.95,
"grad_norm": 0.003383921692147851,
"learning_rate": 8.196988196988198e-06,
"loss": 0.0004,
"step": 7250
},
{
"epoch": 2.95,
"grad_norm": 0.003904301906004548,
"learning_rate": 8.18070818070818e-06,
"loss": 0.009,
"step": 7260
},
{
"epoch": 2.96,
"grad_norm": 0.002857522340491414,
"learning_rate": 8.164428164428166e-06,
"loss": 0.0004,
"step": 7270
},
{
"epoch": 2.96,
"grad_norm": 0.0028671324253082275,
"learning_rate": 8.148148148148148e-06,
"loss": 0.0004,
"step": 7280
},
{
"epoch": 2.97,
"grad_norm": 0.0028230687603354454,
"learning_rate": 8.131868131868132e-06,
"loss": 0.0009,
"step": 7290
},
{
"epoch": 2.97,
"grad_norm": 0.0028381363954395056,
"learning_rate": 8.115588115588116e-06,
"loss": 0.0003,
"step": 7300
},
{
"epoch": 2.98,
"grad_norm": 0.0028295046649873257,
"learning_rate": 8.0993080993081e-06,
"loss": 0.0099,
"step": 7310
},
{
"epoch": 2.98,
"grad_norm": 0.0051268660463392735,
"learning_rate": 8.083028083028084e-06,
"loss": 0.0004,
"step": 7320
},
{
"epoch": 2.98,
"grad_norm": 0.006851341109722853,
"learning_rate": 8.066748066748066e-06,
"loss": 0.0569,
"step": 7330
},
{
"epoch": 2.99,
"grad_norm": 0.003248844761401415,
"learning_rate": 8.05046805046805e-06,
"loss": 0.0004,
"step": 7340
},
{
"epoch": 2.99,
"grad_norm": 0.003859333461150527,
"learning_rate": 8.034188034188036e-06,
"loss": 0.0012,
"step": 7350
},
{
"epoch": 3.0,
"grad_norm": 0.002941732294857502,
"learning_rate": 8.017908017908018e-06,
"loss": 0.0145,
"step": 7360
},
{
"epoch": 3.0,
"grad_norm": 0.0032136046793311834,
"learning_rate": 8.001628001628002e-06,
"loss": 0.0004,
"step": 7370
},
{
"epoch": 3.0,
"grad_norm": 0.0037520972546190023,
"learning_rate": 7.985347985347986e-06,
"loss": 0.0003,
"step": 7380
},
{
"epoch": 3.01,
"grad_norm": 0.002765586832538247,
"learning_rate": 7.96906796906797e-06,
"loss": 0.0003,
"step": 7390
},
{
"epoch": 3.01,
"grad_norm": 0.002917984500527382,
"learning_rate": 7.952787952787954e-06,
"loss": 0.0003,
"step": 7400
},
{
"epoch": 3.02,
"grad_norm": 0.002771808998659253,
"learning_rate": 7.936507936507936e-06,
"loss": 0.0003,
"step": 7410
},
{
"epoch": 3.02,
"grad_norm": 0.0028077957686036825,
"learning_rate": 7.92022792022792e-06,
"loss": 0.0003,
"step": 7420
},
{
"epoch": 3.02,
"grad_norm": 0.014859122224152088,
"learning_rate": 7.903947903947906e-06,
"loss": 0.0187,
"step": 7430
},
{
"epoch": 3.03,
"grad_norm": 0.0029327922966331244,
"learning_rate": 7.887667887667888e-06,
"loss": 0.0003,
"step": 7440
},
{
"epoch": 3.03,
"grad_norm": 0.003155101090669632,
"learning_rate": 7.871387871387872e-06,
"loss": 0.0003,
"step": 7450
},
{
"epoch": 3.04,
"grad_norm": 0.002822224283590913,
"learning_rate": 7.855107855107856e-06,
"loss": 0.0003,
"step": 7460
},
{
"epoch": 3.04,
"grad_norm": 0.002726204926148057,
"learning_rate": 7.83882783882784e-06,
"loss": 0.0003,
"step": 7470
},
{
"epoch": 3.04,
"grad_norm": 0.0026202842127531767,
"learning_rate": 7.822547822547824e-06,
"loss": 0.0004,
"step": 7480
},
{
"epoch": 3.05,
"grad_norm": 0.0026620635762810707,
"learning_rate": 7.806267806267806e-06,
"loss": 0.0003,
"step": 7490
},
{
"epoch": 3.05,
"grad_norm": 0.0026845140382647514,
"learning_rate": 7.78998778998779e-06,
"loss": 0.0003,
"step": 7500
},
{
"epoch": 3.06,
"grad_norm": 0.002793940482661128,
"learning_rate": 7.773707773707776e-06,
"loss": 0.0004,
"step": 7510
},
{
"epoch": 3.06,
"grad_norm": 0.002819318324327469,
"learning_rate": 7.757427757427758e-06,
"loss": 0.0003,
"step": 7520
},
{
"epoch": 3.06,
"grad_norm": 0.0027769345324486494,
"learning_rate": 7.741147741147742e-06,
"loss": 0.0003,
"step": 7530
},
{
"epoch": 3.07,
"grad_norm": 0.002659664023667574,
"learning_rate": 7.724867724867726e-06,
"loss": 0.0003,
"step": 7540
},
{
"epoch": 3.07,
"grad_norm": 0.0025388060603290796,
"learning_rate": 7.70858770858771e-06,
"loss": 0.0003,
"step": 7550
},
{
"epoch": 3.08,
"grad_norm": 0.002629263559356332,
"learning_rate": 7.692307692307694e-06,
"loss": 0.0003,
"step": 7560
},
{
"epoch": 3.08,
"grad_norm": 0.0025471756234765053,
"learning_rate": 7.676027676027676e-06,
"loss": 0.0003,
"step": 7570
},
{
"epoch": 3.09,
"grad_norm": 0.006246237549930811,
"learning_rate": 7.65974765974766e-06,
"loss": 0.0003,
"step": 7580
},
{
"epoch": 3.09,
"grad_norm": 0.0031642026733607054,
"learning_rate": 7.643467643467644e-06,
"loss": 0.0003,
"step": 7590
},
{
"epoch": 3.09,
"grad_norm": 0.0028460524044930935,
"learning_rate": 7.627187627187628e-06,
"loss": 0.0003,
"step": 7600
},
{
"epoch": 3.1,
"grad_norm": 0.0027321220841258764,
"learning_rate": 7.610907610907612e-06,
"loss": 0.0004,
"step": 7610
},
{
"epoch": 3.1,
"grad_norm": 0.07277552038431168,
"learning_rate": 7.594627594627595e-06,
"loss": 0.0003,
"step": 7620
},
{
"epoch": 3.11,
"grad_norm": 0.002561114262789488,
"learning_rate": 7.578347578347579e-06,
"loss": 0.0003,
"step": 7630
},
{
"epoch": 3.11,
"grad_norm": 0.002666006563231349,
"learning_rate": 7.5620675620675634e-06,
"loss": 0.0003,
"step": 7640
},
{
"epoch": 3.11,
"grad_norm": 0.003249433124437928,
"learning_rate": 7.5457875457875465e-06,
"loss": 0.0003,
"step": 7650
},
{
"epoch": 3.12,
"grad_norm": 0.002814142033457756,
"learning_rate": 7.5295075295075305e-06,
"loss": 0.0003,
"step": 7660
},
{
"epoch": 3.12,
"grad_norm": 0.002647695131599903,
"learning_rate": 7.5132275132275136e-06,
"loss": 0.0003,
"step": 7670
},
{
"epoch": 3.13,
"grad_norm": 0.0028357000555843115,
"learning_rate": 7.4969474969474975e-06,
"loss": 0.0003,
"step": 7680
},
{
"epoch": 3.13,
"grad_norm": 0.002574663609266281,
"learning_rate": 7.4806674806674814e-06,
"loss": 0.0003,
"step": 7690
},
{
"epoch": 3.13,
"grad_norm": 0.002485772827640176,
"learning_rate": 7.4643874643874645e-06,
"loss": 0.0004,
"step": 7700
},
{
"epoch": 3.14,
"grad_norm": 0.0026384114753454924,
"learning_rate": 7.448107448107449e-06,
"loss": 0.0003,
"step": 7710
},
{
"epoch": 3.14,
"grad_norm": 0.0025012667756527662,
"learning_rate": 7.4318274318274316e-06,
"loss": 0.0003,
"step": 7720
},
{
"epoch": 3.15,
"grad_norm": 0.0023603325244039297,
"learning_rate": 7.415547415547416e-06,
"loss": 0.0008,
"step": 7730
},
{
"epoch": 3.15,
"grad_norm": 0.006851641461253166,
"learning_rate": 7.3992673992674e-06,
"loss": 0.0003,
"step": 7740
},
{
"epoch": 3.15,
"grad_norm": 0.0029785565566271544,
"learning_rate": 7.382987382987383e-06,
"loss": 0.0003,
"step": 7750
},
{
"epoch": 3.16,
"grad_norm": 0.002378121018409729,
"learning_rate": 7.366707366707367e-06,
"loss": 0.0062,
"step": 7760
},
{
"epoch": 3.16,
"grad_norm": 0.0024877325631678104,
"learning_rate": 7.350427350427351e-06,
"loss": 0.0003,
"step": 7770
},
{
"epoch": 3.17,
"grad_norm": 0.004979600198566914,
"learning_rate": 7.334147334147334e-06,
"loss": 0.0003,
"step": 7780
},
{
"epoch": 3.17,
"grad_norm": 0.002649629721418023,
"learning_rate": 7.317867317867319e-06,
"loss": 0.0003,
"step": 7790
},
{
"epoch": 3.17,
"grad_norm": 0.0030928929336369038,
"learning_rate": 7.301587301587301e-06,
"loss": 0.0003,
"step": 7800
},
{
"epoch": 3.18,
"grad_norm": 0.00250143650919199,
"learning_rate": 7.285307285307286e-06,
"loss": 0.0003,
"step": 7810
},
{
"epoch": 3.18,
"grad_norm": 0.002448960905894637,
"learning_rate": 7.26902726902727e-06,
"loss": 0.0003,
"step": 7820
},
{
"epoch": 3.19,
"grad_norm": 0.0023297348525375128,
"learning_rate": 7.252747252747253e-06,
"loss": 0.0003,
"step": 7830
},
{
"epoch": 3.19,
"grad_norm": 0.0023908980656415224,
"learning_rate": 7.236467236467237e-06,
"loss": 0.0003,
"step": 7840
},
{
"epoch": 3.19,
"grad_norm": 0.003359014866873622,
"learning_rate": 7.22018722018722e-06,
"loss": 0.0003,
"step": 7850
},
{
"epoch": 3.2,
"grad_norm": 0.002836639992892742,
"learning_rate": 7.203907203907204e-06,
"loss": 0.0003,
"step": 7860
},
{
"epoch": 3.2,
"grad_norm": 0.0024746765848249197,
"learning_rate": 7.187627187627189e-06,
"loss": 0.0003,
"step": 7870
},
{
"epoch": 3.21,
"grad_norm": 0.002388924825936556,
"learning_rate": 7.171347171347171e-06,
"loss": 0.0003,
"step": 7880
},
{
"epoch": 3.21,
"grad_norm": 0.0024251139257103205,
"learning_rate": 7.155067155067156e-06,
"loss": 0.0003,
"step": 7890
},
{
"epoch": 3.22,
"grad_norm": 0.00242208456620574,
"learning_rate": 7.13878713878714e-06,
"loss": 0.0003,
"step": 7900
},
{
"epoch": 3.22,
"grad_norm": 0.0023256507702171803,
"learning_rate": 7.122507122507123e-06,
"loss": 0.0003,
"step": 7910
},
{
"epoch": 3.22,
"grad_norm": 0.0022887035738676786,
"learning_rate": 7.106227106227107e-06,
"loss": 0.0003,
"step": 7920
},
{
"epoch": 3.23,
"grad_norm": 0.0022210038732737303,
"learning_rate": 7.08994708994709e-06,
"loss": 0.0003,
"step": 7930
},
{
"epoch": 3.23,
"grad_norm": 0.002328604692593217,
"learning_rate": 7.073667073667074e-06,
"loss": 0.0003,
"step": 7940
},
{
"epoch": 3.24,
"grad_norm": 0.002483953256160021,
"learning_rate": 7.057387057387059e-06,
"loss": 0.0003,
"step": 7950
},
{
"epoch": 3.24,
"grad_norm": 0.002675483236089349,
"learning_rate": 7.041107041107041e-06,
"loss": 0.0003,
"step": 7960
},
{
"epoch": 3.24,
"grad_norm": 0.0023732264526188374,
"learning_rate": 7.024827024827026e-06,
"loss": 0.0003,
"step": 7970
},
{
"epoch": 3.25,
"grad_norm": 0.002226916840299964,
"learning_rate": 7.008547008547009e-06,
"loss": 0.0003,
"step": 7980
},
{
"epoch": 3.25,
"grad_norm": 0.003264982718974352,
"learning_rate": 6.992266992266993e-06,
"loss": 0.0003,
"step": 7990
},
{
"epoch": 3.26,
"grad_norm": 0.0026976047083735466,
"learning_rate": 6.975986975986977e-06,
"loss": 0.0003,
"step": 8000
},
{
"epoch": 3.26,
"grad_norm": 0.002336106961593032,
"learning_rate": 6.95970695970696e-06,
"loss": 0.0003,
"step": 8010
},
{
"epoch": 3.26,
"grad_norm": 0.0023025060072541237,
"learning_rate": 6.943426943426944e-06,
"loss": 0.0003,
"step": 8020
},
{
"epoch": 3.27,
"grad_norm": 0.0024826654698699713,
"learning_rate": 6.927146927146929e-06,
"loss": 0.0003,
"step": 8030
},
{
"epoch": 3.27,
"grad_norm": 0.002214565174654126,
"learning_rate": 6.910866910866911e-06,
"loss": 0.0003,
"step": 8040
},
{
"epoch": 3.28,
"grad_norm": 0.002279749372974038,
"learning_rate": 6.894586894586896e-06,
"loss": 0.0003,
"step": 8050
},
{
"epoch": 3.28,
"grad_norm": 0.002262295223772526,
"learning_rate": 6.878306878306879e-06,
"loss": 0.0003,
"step": 8060
},
{
"epoch": 3.28,
"grad_norm": 0.0022824567276984453,
"learning_rate": 6.862026862026863e-06,
"loss": 0.0003,
"step": 8070
},
{
"epoch": 3.29,
"grad_norm": 0.0022059327457100153,
"learning_rate": 6.845746845746847e-06,
"loss": 0.0003,
"step": 8080
},
{
"epoch": 3.29,
"grad_norm": 0.0022225133143365383,
"learning_rate": 6.82946682946683e-06,
"loss": 0.0003,
"step": 8090
},
{
"epoch": 3.3,
"grad_norm": 0.0030766648706048727,
"learning_rate": 6.813186813186814e-06,
"loss": 0.0003,
"step": 8100
},
{
"epoch": 3.3,
"grad_norm": 0.0020688914228230715,
"learning_rate": 6.796906796906797e-06,
"loss": 0.0003,
"step": 8110
},
{
"epoch": 3.3,
"grad_norm": 0.0026230113580822945,
"learning_rate": 6.780626780626781e-06,
"loss": 0.0003,
"step": 8120
},
{
"epoch": 3.31,
"grad_norm": 0.0027380469255149364,
"learning_rate": 6.7643467643467655e-06,
"loss": 0.0002,
"step": 8130
},
{
"epoch": 3.31,
"grad_norm": 0.0020218545105308294,
"learning_rate": 6.748066748066749e-06,
"loss": 0.0002,
"step": 8140
},
{
"epoch": 3.32,
"grad_norm": 0.0022498080506920815,
"learning_rate": 6.7317867317867326e-06,
"loss": 0.0002,
"step": 8150
},
{
"epoch": 3.32,
"grad_norm": 0.0026646710466593504,
"learning_rate": 6.715506715506716e-06,
"loss": 0.0002,
"step": 8160
},
{
"epoch": 3.33,
"grad_norm": 0.0021166689693927765,
"learning_rate": 6.6992266992267e-06,
"loss": 0.0002,
"step": 8170
},
{
"epoch": 3.33,
"grad_norm": 0.0022176315542310476,
"learning_rate": 6.6829466829466836e-06,
"loss": 0.0003,
"step": 8180
},
{
"epoch": 3.33,
"grad_norm": 0.0020941500551998615,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0002,
"step": 8190
},
{
"epoch": 3.34,
"grad_norm": 0.002201402559876442,
"learning_rate": 6.650386650386651e-06,
"loss": 0.0003,
"step": 8200
},
{
"epoch": 3.34,
"grad_norm": 0.002235386986285448,
"learning_rate": 6.634106634106635e-06,
"loss": 0.0062,
"step": 8210
},
{
"epoch": 3.35,
"grad_norm": 0.002202383242547512,
"learning_rate": 6.6178266178266185e-06,
"loss": 0.0002,
"step": 8220
},
{
"epoch": 3.35,
"grad_norm": 0.002144381171092391,
"learning_rate": 6.601546601546602e-06,
"loss": 0.0002,
"step": 8230
},
{
"epoch": 3.35,
"grad_norm": 0.0027761892415583134,
"learning_rate": 6.5852665852665855e-06,
"loss": 0.0003,
"step": 8240
},
{
"epoch": 3.36,
"grad_norm": 0.002119843615218997,
"learning_rate": 6.5689865689865694e-06,
"loss": 0.0002,
"step": 8250
},
{
"epoch": 3.36,
"grad_norm": 0.003361073322594166,
"learning_rate": 6.552706552706553e-06,
"loss": 0.0002,
"step": 8260
},
{
"epoch": 3.37,
"grad_norm": 0.0021668022964149714,
"learning_rate": 6.5364265364265365e-06,
"loss": 0.0003,
"step": 8270
},
{
"epoch": 3.37,
"grad_norm": 0.0020495818462222815,
"learning_rate": 6.5201465201465204e-06,
"loss": 0.0002,
"step": 8280
},
{
"epoch": 3.37,
"grad_norm": 0.002108585089445114,
"learning_rate": 6.5038665038665035e-06,
"loss": 0.0004,
"step": 8290
},
{
"epoch": 3.38,
"grad_norm": 0.0022084820084273815,
"learning_rate": 6.487586487586488e-06,
"loss": 0.0094,
"step": 8300
},
{
"epoch": 3.38,
"grad_norm": 0.002132968744263053,
"learning_rate": 6.471306471306472e-06,
"loss": 0.0002,
"step": 8310
},
{
"epoch": 3.39,
"grad_norm": 0.002239073161035776,
"learning_rate": 6.455026455026455e-06,
"loss": 0.0002,
"step": 8320
},
{
"epoch": 3.39,
"grad_norm": 0.00218349602073431,
"learning_rate": 6.438746438746439e-06,
"loss": 0.0002,
"step": 8330
},
{
"epoch": 3.39,
"grad_norm": 0.00208345171995461,
"learning_rate": 6.422466422466423e-06,
"loss": 0.0002,
"step": 8340
},
{
"epoch": 3.4,
"grad_norm": 0.003050567815080285,
"learning_rate": 6.406186406186406e-06,
"loss": 0.0002,
"step": 8350
},
{
"epoch": 3.4,
"grad_norm": 0.0019847999792546034,
"learning_rate": 6.38990638990639e-06,
"loss": 0.0003,
"step": 8360
},
{
"epoch": 3.41,
"grad_norm": 0.0020100034307688475,
"learning_rate": 6.373626373626373e-06,
"loss": 0.0002,
"step": 8370
},
{
"epoch": 3.41,
"grad_norm": 0.0020706066861748695,
"learning_rate": 6.357346357346358e-06,
"loss": 0.0002,
"step": 8380
},
{
"epoch": 3.41,
"grad_norm": 0.0019506254466250539,
"learning_rate": 6.341066341066342e-06,
"loss": 0.0002,
"step": 8390
},
{
"epoch": 3.42,
"grad_norm": 0.0020071598701179028,
"learning_rate": 6.324786324786325e-06,
"loss": 0.0002,
"step": 8400
},
{
"epoch": 3.42,
"grad_norm": 0.002606179565191269,
"learning_rate": 6.308506308506309e-06,
"loss": 0.0467,
"step": 8410
},
{
"epoch": 3.43,
"grad_norm": 0.0021410868503153324,
"learning_rate": 6.292226292226292e-06,
"loss": 0.0002,
"step": 8420
},
{
"epoch": 3.43,
"grad_norm": 0.002439359435811639,
"learning_rate": 6.275946275946276e-06,
"loss": 0.0002,
"step": 8430
},
{
"epoch": 3.44,
"grad_norm": 0.0037037180736660957,
"learning_rate": 6.25966625966626e-06,
"loss": 0.0002,
"step": 8440
},
{
"epoch": 3.44,
"grad_norm": 0.0022582276724278927,
"learning_rate": 6.243386243386243e-06,
"loss": 0.0002,
"step": 8450
},
{
"epoch": 3.44,
"grad_norm": 0.006983071565628052,
"learning_rate": 6.227106227106228e-06,
"loss": 0.0002,
"step": 8460
},
{
"epoch": 3.45,
"grad_norm": 0.01085950993001461,
"learning_rate": 6.210826210826212e-06,
"loss": 0.0003,
"step": 8470
},
{
"epoch": 3.45,
"grad_norm": 0.0021798298694193363,
"learning_rate": 6.194546194546195e-06,
"loss": 0.0002,
"step": 8480
},
{
"epoch": 3.46,
"grad_norm": 0.0021102093160152435,
"learning_rate": 6.178266178266179e-06,
"loss": 0.0002,
"step": 8490
},
{
"epoch": 3.46,
"grad_norm": 0.0021143911872059107,
"learning_rate": 6.161986161986162e-06,
"loss": 0.0002,
"step": 8500
},
{
"epoch": 3.46,
"grad_norm": 0.002472953638061881,
"learning_rate": 6.145706145706146e-06,
"loss": 0.0002,
"step": 8510
},
{
"epoch": 3.47,
"grad_norm": 0.0019736222457140684,
"learning_rate": 6.12942612942613e-06,
"loss": 0.0002,
"step": 8520
},
{
"epoch": 3.47,
"grad_norm": 0.001965272007510066,
"learning_rate": 6.113146113146113e-06,
"loss": 0.0002,
"step": 8530
},
{
"epoch": 3.48,
"grad_norm": 0.001975101651623845,
"learning_rate": 6.096866096866098e-06,
"loss": 0.0002,
"step": 8540
},
{
"epoch": 3.48,
"grad_norm": 0.002040453255176544,
"learning_rate": 6.080586080586081e-06,
"loss": 0.0002,
"step": 8550
},
{
"epoch": 3.48,
"grad_norm": 7.389462471008301,
"learning_rate": 6.064306064306065e-06,
"loss": 0.034,
"step": 8560
},
{
"epoch": 3.49,
"grad_norm": 0.0022456683218479156,
"learning_rate": 6.048026048026049e-06,
"loss": 0.0002,
"step": 8570
},
{
"epoch": 3.49,
"grad_norm": 0.003975760657340288,
"learning_rate": 6.031746031746032e-06,
"loss": 0.0002,
"step": 8580
},
{
"epoch": 3.5,
"grad_norm": 0.0020120914559811354,
"learning_rate": 6.015466015466016e-06,
"loss": 0.0002,
"step": 8590
},
{
"epoch": 3.5,
"grad_norm": 0.0022050223778933287,
"learning_rate": 5.999185999186001e-06,
"loss": 0.0002,
"step": 8600
},
{
"epoch": 3.5,
"grad_norm": 6.6309919357299805,
"learning_rate": 5.982905982905983e-06,
"loss": 0.043,
"step": 8610
},
{
"epoch": 3.51,
"grad_norm": 0.0022617350332438946,
"learning_rate": 5.966625966625968e-06,
"loss": 0.0002,
"step": 8620
},
{
"epoch": 3.51,
"grad_norm": 0.0019437572918832302,
"learning_rate": 5.950345950345951e-06,
"loss": 0.0002,
"step": 8630
},
{
"epoch": 3.52,
"grad_norm": 0.001993882469832897,
"learning_rate": 5.934065934065935e-06,
"loss": 0.0002,
"step": 8640
},
{
"epoch": 3.52,
"grad_norm": 0.0022044796496629715,
"learning_rate": 5.917785917785919e-06,
"loss": 0.0002,
"step": 8650
},
{
"epoch": 3.52,
"grad_norm": 0.0020595360547304153,
"learning_rate": 5.901505901505902e-06,
"loss": 0.0004,
"step": 8660
},
{
"epoch": 3.53,
"grad_norm": 0.002459390088915825,
"learning_rate": 5.885225885225886e-06,
"loss": 0.0002,
"step": 8670
},
{
"epoch": 3.53,
"grad_norm": 0.0018390618497505784,
"learning_rate": 5.868945868945869e-06,
"loss": 0.0002,
"step": 8680
},
{
"epoch": 3.54,
"grad_norm": 0.002049500122666359,
"learning_rate": 5.852665852665853e-06,
"loss": 0.0002,
"step": 8690
},
{
"epoch": 3.54,
"grad_norm": 0.001947426819242537,
"learning_rate": 5.8363858363858375e-06,
"loss": 0.0002,
"step": 8700
},
{
"epoch": 3.54,
"grad_norm": 0.0030878265388309956,
"learning_rate": 5.820105820105821e-06,
"loss": 0.0002,
"step": 8710
},
{
"epoch": 3.55,
"grad_norm": 0.001884807599708438,
"learning_rate": 5.8038258038258045e-06,
"loss": 0.0002,
"step": 8720
},
{
"epoch": 3.55,
"grad_norm": 0.0019810153171420097,
"learning_rate": 5.7875457875457885e-06,
"loss": 0.0002,
"step": 8730
},
{
"epoch": 3.56,
"grad_norm": 0.001923812204040587,
"learning_rate": 5.7712657712657716e-06,
"loss": 0.0002,
"step": 8740
},
{
"epoch": 3.56,
"grad_norm": 0.001998158637434244,
"learning_rate": 5.7549857549857555e-06,
"loss": 0.0002,
"step": 8750
},
{
"epoch": 3.57,
"grad_norm": 0.0018681226065382361,
"learning_rate": 5.738705738705739e-06,
"loss": 0.0002,
"step": 8760
},
{
"epoch": 3.57,
"grad_norm": 0.006764058023691177,
"learning_rate": 5.7224257224257225e-06,
"loss": 0.0068,
"step": 8770
},
{
"epoch": 3.57,
"grad_norm": 0.003150691743940115,
"learning_rate": 5.706145706145707e-06,
"loss": 0.0007,
"step": 8780
},
{
"epoch": 3.58,
"grad_norm": 0.00217335089109838,
"learning_rate": 5.68986568986569e-06,
"loss": 0.0002,
"step": 8790
},
{
"epoch": 3.58,
"grad_norm": 0.00865620281547308,
"learning_rate": 5.673585673585674e-06,
"loss": 0.0002,
"step": 8800
},
{
"epoch": 3.59,
"grad_norm": 0.0020344313234090805,
"learning_rate": 5.6573056573056575e-06,
"loss": 0.0002,
"step": 8810
},
{
"epoch": 3.59,
"grad_norm": 0.0018948889337480068,
"learning_rate": 5.641025641025641e-06,
"loss": 0.0002,
"step": 8820
},
{
"epoch": 3.59,
"grad_norm": 0.001868214923888445,
"learning_rate": 5.624745624745625e-06,
"loss": 0.0002,
"step": 8830
},
{
"epoch": 3.6,
"grad_norm": 0.0019942354410886765,
"learning_rate": 5.6084656084656084e-06,
"loss": 0.0002,
"step": 8840
},
{
"epoch": 3.6,
"grad_norm": 0.0018839197000488639,
"learning_rate": 5.592185592185592e-06,
"loss": 0.0002,
"step": 8850
},
{
"epoch": 3.61,
"grad_norm": 0.0022100857459008694,
"learning_rate": 5.575905575905577e-06,
"loss": 0.0002,
"step": 8860
},
{
"epoch": 3.61,
"grad_norm": 0.0019310906063765287,
"learning_rate": 5.55962555962556e-06,
"loss": 0.0002,
"step": 8870
},
{
"epoch": 3.61,
"grad_norm": 0.002325033536180854,
"learning_rate": 5.543345543345544e-06,
"loss": 0.0002,
"step": 8880
},
{
"epoch": 3.62,
"grad_norm": 0.0017883091932162642,
"learning_rate": 5.527065527065527e-06,
"loss": 0.0002,
"step": 8890
},
{
"epoch": 3.62,
"grad_norm": 0.0018799800891429186,
"learning_rate": 5.510785510785511e-06,
"loss": 0.0002,
"step": 8900
},
{
"epoch": 3.63,
"grad_norm": 0.0017864195397123694,
"learning_rate": 5.494505494505495e-06,
"loss": 0.0002,
"step": 8910
},
{
"epoch": 3.63,
"grad_norm": 0.0018234961898997426,
"learning_rate": 5.478225478225478e-06,
"loss": 0.0002,
"step": 8920
},
{
"epoch": 3.63,
"grad_norm": 0.0017443567048758268,
"learning_rate": 5.461945461945462e-06,
"loss": 0.0002,
"step": 8930
},
{
"epoch": 3.64,
"grad_norm": 0.0017067781882360578,
"learning_rate": 5.445665445665445e-06,
"loss": 0.0002,
"step": 8940
},
{
"epoch": 3.64,
"grad_norm": 0.002276692306622863,
"learning_rate": 5.42938542938543e-06,
"loss": 0.0186,
"step": 8950
},
{
"epoch": 3.65,
"grad_norm": 0.0017357119359076023,
"learning_rate": 5.413105413105414e-06,
"loss": 0.0002,
"step": 8960
},
{
"epoch": 3.65,
"grad_norm": 0.0019965972751379013,
"learning_rate": 5.396825396825397e-06,
"loss": 0.0002,
"step": 8970
},
{
"epoch": 3.65,
"grad_norm": 0.0017553390935063362,
"learning_rate": 5.380545380545381e-06,
"loss": 0.0003,
"step": 8980
},
{
"epoch": 3.66,
"grad_norm": 0.0019675048533827066,
"learning_rate": 5.364265364265364e-06,
"loss": 0.0002,
"step": 8990
},
{
"epoch": 3.66,
"grad_norm": 0.002049475908279419,
"learning_rate": 5.347985347985348e-06,
"loss": 0.0002,
"step": 9000
},
{
"epoch": 3.67,
"grad_norm": 0.0019142305245622993,
"learning_rate": 5.331705331705332e-06,
"loss": 0.0002,
"step": 9010
},
{
"epoch": 3.67,
"grad_norm": 0.0018189084948971868,
"learning_rate": 5.315425315425315e-06,
"loss": 0.042,
"step": 9020
},
{
"epoch": 3.68,
"grad_norm": 0.0019228870514780283,
"learning_rate": 5.2991452991453e-06,
"loss": 0.0005,
"step": 9030
},
{
"epoch": 3.68,
"grad_norm": 0.002307659713551402,
"learning_rate": 5.282865282865284e-06,
"loss": 0.0002,
"step": 9040
},
{
"epoch": 3.68,
"grad_norm": 0.0021766172721982002,
"learning_rate": 5.266585266585267e-06,
"loss": 0.0002,
"step": 9050
},
{
"epoch": 3.69,
"grad_norm": 0.0017359366174787283,
"learning_rate": 5.250305250305251e-06,
"loss": 0.0341,
"step": 9060
},
{
"epoch": 3.69,
"grad_norm": 0.0017763186478987336,
"learning_rate": 5.234025234025234e-06,
"loss": 0.0002,
"step": 9070
},
{
"epoch": 3.7,
"grad_norm": 0.001665986143052578,
"learning_rate": 5.217745217745218e-06,
"loss": 0.0008,
"step": 9080
},
{
"epoch": 3.7,
"grad_norm": 0.0017538231331855059,
"learning_rate": 5.201465201465202e-06,
"loss": 0.0002,
"step": 9090
},
{
"epoch": 3.7,
"grad_norm": 0.0016558667412027717,
"learning_rate": 5.185185185185185e-06,
"loss": 0.0002,
"step": 9100
},
{
"epoch": 3.71,
"grad_norm": 0.0018909978680312634,
"learning_rate": 5.16890516890517e-06,
"loss": 0.0002,
"step": 9110
},
{
"epoch": 3.71,
"grad_norm": 0.0017842132365331054,
"learning_rate": 5.152625152625153e-06,
"loss": 0.0002,
"step": 9120
},
{
"epoch": 3.72,
"grad_norm": 0.0017819767817854881,
"learning_rate": 5.136345136345137e-06,
"loss": 0.0002,
"step": 9130
},
{
"epoch": 3.72,
"grad_norm": 0.00168974872212857,
"learning_rate": 5.120065120065121e-06,
"loss": 0.0002,
"step": 9140
},
{
"epoch": 3.72,
"grad_norm": 0.0017720448086038232,
"learning_rate": 5.103785103785104e-06,
"loss": 0.0002,
"step": 9150
},
{
"epoch": 3.73,
"grad_norm": 0.0017071804031729698,
"learning_rate": 5.087505087505088e-06,
"loss": 0.0002,
"step": 9160
},
{
"epoch": 3.73,
"grad_norm": 0.0018827036255970597,
"learning_rate": 5.071225071225072e-06,
"loss": 0.0002,
"step": 9170
},
{
"epoch": 3.74,
"grad_norm": 0.0022226206492632627,
"learning_rate": 5.054945054945055e-06,
"loss": 0.0002,
"step": 9180
},
{
"epoch": 3.74,
"grad_norm": 0.0019109738059341908,
"learning_rate": 5.03866503866504e-06,
"loss": 0.0494,
"step": 9190
},
{
"epoch": 3.74,
"grad_norm": 0.0019527949625626206,
"learning_rate": 5.022385022385023e-06,
"loss": 0.0002,
"step": 9200
},
{
"epoch": 3.75,
"grad_norm": 0.0020662578754127026,
"learning_rate": 5.006105006105007e-06,
"loss": 0.0415,
"step": 9210
},
{
"epoch": 3.75,
"grad_norm": 0.002151912311092019,
"learning_rate": 4.98982498982499e-06,
"loss": 0.0003,
"step": 9220
},
{
"epoch": 3.76,
"grad_norm": 0.001946290722116828,
"learning_rate": 4.973544973544974e-06,
"loss": 0.0002,
"step": 9230
},
{
"epoch": 3.76,
"grad_norm": 0.001949216122739017,
"learning_rate": 4.957264957264958e-06,
"loss": 0.0002,
"step": 9240
},
{
"epoch": 3.76,
"grad_norm": 0.002394681563600898,
"learning_rate": 4.9409849409849416e-06,
"loss": 0.0352,
"step": 9250
},
{
"epoch": 3.77,
"grad_norm": 0.0022585808765143156,
"learning_rate": 4.924704924704925e-06,
"loss": 0.0471,
"step": 9260
},
{
"epoch": 3.77,
"grad_norm": 0.002393248025327921,
"learning_rate": 4.908424908424909e-06,
"loss": 0.0002,
"step": 9270
},
{
"epoch": 3.78,
"grad_norm": 0.02360522374510765,
"learning_rate": 4.8921448921448925e-06,
"loss": 0.0003,
"step": 9280
},
{
"epoch": 3.78,
"grad_norm": 0.0023453827016055584,
"learning_rate": 4.8758648758648765e-06,
"loss": 0.0003,
"step": 9290
},
{
"epoch": 3.79,
"grad_norm": 0.00220270873978734,
"learning_rate": 4.8595848595848596e-06,
"loss": 0.0003,
"step": 9300
},
{
"epoch": 3.79,
"grad_norm": 0.0021812734194099903,
"learning_rate": 4.8433048433048435e-06,
"loss": 0.0003,
"step": 9310
},
{
"epoch": 3.79,
"grad_norm": 0.0021124074701219797,
"learning_rate": 4.8270248270248275e-06,
"loss": 0.0002,
"step": 9320
},
{
"epoch": 3.8,
"grad_norm": 0.002312874887138605,
"learning_rate": 4.810744810744811e-06,
"loss": 0.0003,
"step": 9330
},
{
"epoch": 3.8,
"grad_norm": 0.0025071410927921534,
"learning_rate": 4.7944647944647945e-06,
"loss": 0.0002,
"step": 9340
},
{
"epoch": 3.81,
"grad_norm": 0.0022760110441595316,
"learning_rate": 4.7781847781847784e-06,
"loss": 0.0003,
"step": 9350
},
{
"epoch": 3.81,
"grad_norm": 0.002391684567555785,
"learning_rate": 4.761904761904762e-06,
"loss": 0.0002,
"step": 9360
},
{
"epoch": 3.81,
"grad_norm": 0.0021324707195162773,
"learning_rate": 4.745624745624746e-06,
"loss": 0.0002,
"step": 9370
},
{
"epoch": 3.82,
"grad_norm": 0.0021602713968604803,
"learning_rate": 4.729344729344729e-06,
"loss": 0.0002,
"step": 9380
},
{
"epoch": 3.82,
"grad_norm": 0.003342408686876297,
"learning_rate": 4.713064713064713e-06,
"loss": 0.0002,
"step": 9390
},
{
"epoch": 3.83,
"grad_norm": 0.003199818776920438,
"learning_rate": 4.696784696784697e-06,
"loss": 0.0002,
"step": 9400
},
{
"epoch": 3.83,
"grad_norm": 0.002259862143546343,
"learning_rate": 4.680504680504681e-06,
"loss": 0.0005,
"step": 9410
},
{
"epoch": 3.83,
"grad_norm": 0.0020261441823095083,
"learning_rate": 4.664224664224664e-06,
"loss": 0.0002,
"step": 9420
},
{
"epoch": 3.84,
"grad_norm": 0.001844863872975111,
"learning_rate": 4.647944647944648e-06,
"loss": 0.0003,
"step": 9430
},
{
"epoch": 3.84,
"grad_norm": 0.0022536173928529024,
"learning_rate": 4.631664631664632e-06,
"loss": 0.0002,
"step": 9440
},
{
"epoch": 3.85,
"grad_norm": 0.001871871529147029,
"learning_rate": 4.615384615384616e-06,
"loss": 0.0003,
"step": 9450
},
{
"epoch": 3.85,
"grad_norm": 0.0019549911376088858,
"learning_rate": 4.599104599104599e-06,
"loss": 0.0414,
"step": 9460
},
{
"epoch": 3.85,
"grad_norm": 0.002595256781205535,
"learning_rate": 4.582824582824583e-06,
"loss": 0.0002,
"step": 9470
},
{
"epoch": 3.86,
"grad_norm": 0.0021485532633960247,
"learning_rate": 4.566544566544567e-06,
"loss": 0.0002,
"step": 9480
},
{
"epoch": 3.86,
"grad_norm": 0.0018896989058703184,
"learning_rate": 4.55026455026455e-06,
"loss": 0.0264,
"step": 9490
},
{
"epoch": 3.87,
"grad_norm": 0.0023643136955797672,
"learning_rate": 4.533984533984534e-06,
"loss": 0.0003,
"step": 9500
},
{
"epoch": 3.87,
"grad_norm": 0.0017869413131847978,
"learning_rate": 4.517704517704518e-06,
"loss": 0.0003,
"step": 9510
},
{
"epoch": 3.87,
"grad_norm": 0.0022810434456914663,
"learning_rate": 4.501424501424502e-06,
"loss": 0.0002,
"step": 9520
},
{
"epoch": 3.88,
"grad_norm": 0.0020936301443725824,
"learning_rate": 4.485144485144485e-06,
"loss": 0.0003,
"step": 9530
},
{
"epoch": 3.88,
"grad_norm": 0.0017164949094876647,
"learning_rate": 4.468864468864469e-06,
"loss": 0.0147,
"step": 9540
},
{
"epoch": 3.89,
"grad_norm": 0.008885451592504978,
"learning_rate": 4.452584452584453e-06,
"loss": 0.0003,
"step": 9550
},
{
"epoch": 3.89,
"grad_norm": 0.20433610677719116,
"learning_rate": 4.436304436304437e-06,
"loss": 0.0004,
"step": 9560
},
{
"epoch": 3.89,
"grad_norm": 0.0018083051545545459,
"learning_rate": 4.42002442002442e-06,
"loss": 0.0002,
"step": 9570
},
{
"epoch": 3.9,
"grad_norm": 0.00233688997104764,
"learning_rate": 4.403744403744404e-06,
"loss": 0.0002,
"step": 9580
},
{
"epoch": 3.9,
"grad_norm": 0.0020819292403757572,
"learning_rate": 4.387464387464388e-06,
"loss": 0.0012,
"step": 9590
},
{
"epoch": 3.91,
"grad_norm": 0.0069807544350624084,
"learning_rate": 4.371184371184372e-06,
"loss": 0.0003,
"step": 9600
},
{
"epoch": 3.91,
"grad_norm": 0.0027952860109508038,
"learning_rate": 4.354904354904355e-06,
"loss": 0.0002,
"step": 9610
},
{
"epoch": 3.92,
"grad_norm": 0.0018937455024570227,
"learning_rate": 4.338624338624339e-06,
"loss": 0.0272,
"step": 9620
},
{
"epoch": 3.92,
"grad_norm": 0.001811556052416563,
"learning_rate": 4.322344322344323e-06,
"loss": 0.0002,
"step": 9630
},
{
"epoch": 3.92,
"grad_norm": 0.0017631722148507833,
"learning_rate": 4.306064306064307e-06,
"loss": 0.0002,
"step": 9640
},
{
"epoch": 3.93,
"grad_norm": 0.001867889310233295,
"learning_rate": 4.28978428978429e-06,
"loss": 0.0197,
"step": 9650
},
{
"epoch": 3.93,
"grad_norm": 0.0020562438294291496,
"learning_rate": 4.273504273504274e-06,
"loss": 0.0002,
"step": 9660
},
{
"epoch": 3.94,
"grad_norm": 0.007918364368379116,
"learning_rate": 4.257224257224258e-06,
"loss": 0.0003,
"step": 9670
},
{
"epoch": 3.94,
"grad_norm": 0.0026931529864668846,
"learning_rate": 4.240944240944242e-06,
"loss": 0.0003,
"step": 9680
},
{
"epoch": 3.94,
"grad_norm": 0.002624350832775235,
"learning_rate": 4.224664224664225e-06,
"loss": 0.0003,
"step": 9690
},
{
"epoch": 3.95,
"grad_norm": 0.001771993818692863,
"learning_rate": 4.208384208384209e-06,
"loss": 0.0002,
"step": 9700
},
{
"epoch": 3.95,
"grad_norm": 0.010523835197091103,
"learning_rate": 4.192104192104192e-06,
"loss": 0.0003,
"step": 9710
},
{
"epoch": 3.96,
"grad_norm": 0.0034396941773593426,
"learning_rate": 4.175824175824177e-06,
"loss": 0.0003,
"step": 9720
},
{
"epoch": 3.96,
"grad_norm": 0.003138788277283311,
"learning_rate": 4.15954415954416e-06,
"loss": 0.0058,
"step": 9730
},
{
"epoch": 3.96,
"grad_norm": 0.002142369979992509,
"learning_rate": 4.143264143264144e-06,
"loss": 0.0002,
"step": 9740
},
{
"epoch": 3.97,
"grad_norm": 0.006518381182104349,
"learning_rate": 4.126984126984127e-06,
"loss": 0.0002,
"step": 9750
},
{
"epoch": 3.97,
"grad_norm": 0.0019359017023816705,
"learning_rate": 4.1107041107041116e-06,
"loss": 0.0002,
"step": 9760
},
{
"epoch": 3.98,
"grad_norm": 0.0018001939170062542,
"learning_rate": 4.094424094424095e-06,
"loss": 0.0002,
"step": 9770
},
{
"epoch": 3.98,
"grad_norm": 0.002167722210288048,
"learning_rate": 4.078144078144079e-06,
"loss": 0.0002,
"step": 9780
},
{
"epoch": 3.98,
"grad_norm": 0.008154891431331635,
"learning_rate": 4.061864061864062e-06,
"loss": 0.0002,
"step": 9790
},
{
"epoch": 3.99,
"grad_norm": 0.001978978980332613,
"learning_rate": 4.0455840455840465e-06,
"loss": 0.0002,
"step": 9800
},
{
"epoch": 3.99,
"grad_norm": 0.0018466059118509293,
"learning_rate": 4.0293040293040296e-06,
"loss": 0.0002,
"step": 9810
},
{
"epoch": 4.0,
"grad_norm": 0.00179979985114187,
"learning_rate": 4.0130240130240135e-06,
"loss": 0.0002,
"step": 9820
},
{
"epoch": 4.0,
"grad_norm": 0.002002492779865861,
"learning_rate": 3.996743996743997e-06,
"loss": 0.0002,
"step": 9830
},
{
"epoch": 4.0,
"grad_norm": 0.0019970801658928394,
"learning_rate": 3.9804639804639805e-06,
"loss": 0.0002,
"step": 9840
},
{
"epoch": 4.01,
"grad_norm": 0.0017706368817016482,
"learning_rate": 3.9641839641839645e-06,
"loss": 0.0002,
"step": 9850
},
{
"epoch": 4.01,
"grad_norm": 0.0017488128505647182,
"learning_rate": 3.9479039479039484e-06,
"loss": 0.0003,
"step": 9860
},
{
"epoch": 4.02,
"grad_norm": 0.0025758370757102966,
"learning_rate": 3.9316239316239315e-06,
"loss": 0.0002,
"step": 9870
},
{
"epoch": 4.02,
"grad_norm": 0.002105166669934988,
"learning_rate": 3.9153439153439155e-06,
"loss": 0.0002,
"step": 9880
},
{
"epoch": 4.03,
"grad_norm": 0.0027692352887243032,
"learning_rate": 3.899063899063899e-06,
"loss": 0.0043,
"step": 9890
},
{
"epoch": 4.03,
"grad_norm": 0.0020704329945147038,
"learning_rate": 3.882783882783883e-06,
"loss": 0.0002,
"step": 9900
},
{
"epoch": 4.03,
"grad_norm": 0.0019208292942494154,
"learning_rate": 3.8665038665038664e-06,
"loss": 0.0002,
"step": 9910
},
{
"epoch": 4.04,
"grad_norm": 0.0017399511998519301,
"learning_rate": 3.85022385022385e-06,
"loss": 0.0002,
"step": 9920
},
{
"epoch": 4.04,
"grad_norm": 0.0017688291845843196,
"learning_rate": 3.833943833943834e-06,
"loss": 0.0002,
"step": 9930
},
{
"epoch": 4.05,
"grad_norm": 4.471590995788574,
"learning_rate": 3.817663817663818e-06,
"loss": 0.0023,
"step": 9940
},
{
"epoch": 4.05,
"grad_norm": 0.0016602250980213284,
"learning_rate": 3.8013838013838018e-06,
"loss": 0.0002,
"step": 9950
},
{
"epoch": 4.05,
"grad_norm": 0.001645643264055252,
"learning_rate": 3.7851037851037853e-06,
"loss": 0.0002,
"step": 9960
},
{
"epoch": 4.06,
"grad_norm": 0.0017087948508560658,
"learning_rate": 3.768823768823769e-06,
"loss": 0.0002,
"step": 9970
},
{
"epoch": 4.06,
"grad_norm": 0.002038088161498308,
"learning_rate": 3.752543752543753e-06,
"loss": 0.0002,
"step": 9980
},
{
"epoch": 4.07,
"grad_norm": 0.0071817911230027676,
"learning_rate": 3.7362637362637367e-06,
"loss": 0.0002,
"step": 9990
},
{
"epoch": 4.07,
"grad_norm": 0.0021325184497982264,
"learning_rate": 3.7199837199837202e-06,
"loss": 0.0002,
"step": 10000
},
{
"epoch": 4.07,
"grad_norm": 0.001710103009827435,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.0002,
"step": 10010
},
{
"epoch": 4.08,
"grad_norm": 0.0015926583437249064,
"learning_rate": 3.687423687423688e-06,
"loss": 0.0003,
"step": 10020
},
{
"epoch": 4.08,
"grad_norm": 0.0016407363582402468,
"learning_rate": 3.6711436711436716e-06,
"loss": 0.0002,
"step": 10030
},
{
"epoch": 4.09,
"grad_norm": 0.005499332211911678,
"learning_rate": 3.654863654863655e-06,
"loss": 0.0002,
"step": 10040
},
{
"epoch": 4.09,
"grad_norm": 0.0018358811503276229,
"learning_rate": 3.6385836385836387e-06,
"loss": 0.0002,
"step": 10050
},
{
"epoch": 4.09,
"grad_norm": 0.0016708581242710352,
"learning_rate": 3.622303622303623e-06,
"loss": 0.0002,
"step": 10060
},
{
"epoch": 4.1,
"grad_norm": 0.0017990090418606997,
"learning_rate": 3.6060236060236065e-06,
"loss": 0.0002,
"step": 10070
},
{
"epoch": 4.1,
"grad_norm": 0.0018943555187433958,
"learning_rate": 3.58974358974359e-06,
"loss": 0.0002,
"step": 10080
},
{
"epoch": 4.11,
"grad_norm": 0.0016270867781713605,
"learning_rate": 3.5734635734635736e-06,
"loss": 0.0002,
"step": 10090
},
{
"epoch": 4.11,
"grad_norm": 0.009296965785324574,
"learning_rate": 3.557183557183557e-06,
"loss": 0.0002,
"step": 10100
},
{
"epoch": 4.11,
"grad_norm": 0.0016765177715569735,
"learning_rate": 3.5409035409035415e-06,
"loss": 0.0002,
"step": 10110
},
{
"epoch": 4.12,
"grad_norm": 0.004676634445786476,
"learning_rate": 3.524623524623525e-06,
"loss": 0.0002,
"step": 10120
},
{
"epoch": 4.12,
"grad_norm": 0.001882671844214201,
"learning_rate": 3.5083435083435085e-06,
"loss": 0.0011,
"step": 10130
},
{
"epoch": 4.13,
"grad_norm": 0.0016701704589650035,
"learning_rate": 3.492063492063492e-06,
"loss": 0.0002,
"step": 10140
},
{
"epoch": 4.13,
"grad_norm": 0.0018036847468465567,
"learning_rate": 3.4757834757834764e-06,
"loss": 0.0002,
"step": 10150
},
{
"epoch": 4.14,
"grad_norm": 0.0019449255196377635,
"learning_rate": 3.45950345950346e-06,
"loss": 0.0002,
"step": 10160
},
{
"epoch": 4.14,
"grad_norm": 0.0023109372705221176,
"learning_rate": 3.4432234432234434e-06,
"loss": 0.0002,
"step": 10170
},
{
"epoch": 4.14,
"grad_norm": 0.001794449402950704,
"learning_rate": 3.426943426943427e-06,
"loss": 0.0002,
"step": 10180
},
{
"epoch": 4.15,
"grad_norm": 0.0016366175841540098,
"learning_rate": 3.410663410663411e-06,
"loss": 0.0002,
"step": 10190
},
{
"epoch": 4.15,
"grad_norm": 0.0022932947613298893,
"learning_rate": 3.394383394383395e-06,
"loss": 0.0002,
"step": 10200
},
{
"epoch": 4.16,
"grad_norm": 0.003153660800307989,
"learning_rate": 3.3781033781033783e-06,
"loss": 0.0002,
"step": 10210
},
{
"epoch": 4.16,
"grad_norm": 0.0018573219422250986,
"learning_rate": 3.361823361823362e-06,
"loss": 0.0002,
"step": 10220
},
{
"epoch": 4.16,
"grad_norm": 0.0016019688919186592,
"learning_rate": 3.345543345543346e-06,
"loss": 0.0002,
"step": 10230
},
{
"epoch": 4.17,
"grad_norm": 0.0016897093737497926,
"learning_rate": 3.3292633292633297e-06,
"loss": 0.0002,
"step": 10240
},
{
"epoch": 4.17,
"grad_norm": 0.0018914591055363417,
"learning_rate": 3.3129833129833133e-06,
"loss": 0.0003,
"step": 10250
},
{
"epoch": 4.18,
"grad_norm": 0.0018889600178226829,
"learning_rate": 3.2967032967032968e-06,
"loss": 0.0002,
"step": 10260
},
{
"epoch": 4.18,
"grad_norm": 0.00160633132327348,
"learning_rate": 3.2804232804232807e-06,
"loss": 0.0002,
"step": 10270
},
{
"epoch": 4.18,
"grad_norm": 0.00516732269898057,
"learning_rate": 3.2641432641432647e-06,
"loss": 0.0002,
"step": 10280
},
{
"epoch": 4.19,
"grad_norm": 0.0015665347455069423,
"learning_rate": 3.247863247863248e-06,
"loss": 0.0002,
"step": 10290
},
{
"epoch": 4.19,
"grad_norm": 0.0016588406870141625,
"learning_rate": 3.2315832315832317e-06,
"loss": 0.0002,
"step": 10300
},
{
"epoch": 4.2,
"grad_norm": 0.00242376746609807,
"learning_rate": 3.2153032153032156e-06,
"loss": 0.0002,
"step": 10310
},
{
"epoch": 4.2,
"grad_norm": 0.0070383488200604916,
"learning_rate": 3.199023199023199e-06,
"loss": 0.0002,
"step": 10320
},
{
"epoch": 4.2,
"grad_norm": 0.0019135623006150126,
"learning_rate": 3.182743182743183e-06,
"loss": 0.0002,
"step": 10330
},
{
"epoch": 4.21,
"grad_norm": 0.0018966845236718655,
"learning_rate": 3.1664631664631666e-06,
"loss": 0.0002,
"step": 10340
},
{
"epoch": 4.21,
"grad_norm": 0.0014899246161803603,
"learning_rate": 3.1501831501831505e-06,
"loss": 0.0002,
"step": 10350
},
{
"epoch": 4.22,
"grad_norm": 0.001564052072353661,
"learning_rate": 3.133903133903134e-06,
"loss": 0.0002,
"step": 10360
},
{
"epoch": 4.22,
"grad_norm": 0.001840132987126708,
"learning_rate": 3.117623117623118e-06,
"loss": 0.0002,
"step": 10370
},
{
"epoch": 4.22,
"grad_norm": 0.0020550840999931097,
"learning_rate": 3.1013431013431015e-06,
"loss": 0.0002,
"step": 10380
},
{
"epoch": 4.23,
"grad_norm": 0.0018264094833284616,
"learning_rate": 3.0850630850630855e-06,
"loss": 0.0002,
"step": 10390
},
{
"epoch": 4.23,
"grad_norm": 0.001516546355560422,
"learning_rate": 3.068783068783069e-06,
"loss": 0.0002,
"step": 10400
},
{
"epoch": 4.24,
"grad_norm": 0.0016487749526277184,
"learning_rate": 3.052503052503053e-06,
"loss": 0.0002,
"step": 10410
},
{
"epoch": 4.24,
"grad_norm": 0.0016116101760417223,
"learning_rate": 3.0362230362230364e-06,
"loss": 0.0002,
"step": 10420
},
{
"epoch": 4.25,
"grad_norm": 0.001680860761553049,
"learning_rate": 3.0199430199430204e-06,
"loss": 0.0002,
"step": 10430
},
{
"epoch": 4.25,
"grad_norm": 0.002029112773016095,
"learning_rate": 3.003663003663004e-06,
"loss": 0.0002,
"step": 10440
},
{
"epoch": 4.25,
"grad_norm": 0.002056869911029935,
"learning_rate": 2.9873829873829874e-06,
"loss": 0.0002,
"step": 10450
},
{
"epoch": 4.26,
"grad_norm": 0.0016365089686587453,
"learning_rate": 2.9711029711029714e-06,
"loss": 0.0017,
"step": 10460
},
{
"epoch": 4.26,
"grad_norm": 0.001570598571561277,
"learning_rate": 2.9548229548229553e-06,
"loss": 0.0002,
"step": 10470
},
{
"epoch": 4.27,
"grad_norm": 0.0019338660640642047,
"learning_rate": 2.938542938542939e-06,
"loss": 0.0002,
"step": 10480
},
{
"epoch": 4.27,
"grad_norm": 0.001604044926352799,
"learning_rate": 2.9222629222629223e-06,
"loss": 0.0002,
"step": 10490
},
{
"epoch": 4.27,
"grad_norm": 0.0015405503800138831,
"learning_rate": 2.9059829059829063e-06,
"loss": 0.0003,
"step": 10500
},
{
"epoch": 4.28,
"grad_norm": 0.001597168273292482,
"learning_rate": 2.8897028897028902e-06,
"loss": 0.0002,
"step": 10510
},
{
"epoch": 4.28,
"grad_norm": 0.001601763884536922,
"learning_rate": 2.8734228734228737e-06,
"loss": 0.0002,
"step": 10520
},
{
"epoch": 4.29,
"grad_norm": 0.0014684420311823487,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0002,
"step": 10530
},
{
"epoch": 4.29,
"grad_norm": 0.0019548002164810896,
"learning_rate": 2.840862840862841e-06,
"loss": 0.0002,
"step": 10540
},
{
"epoch": 4.29,
"grad_norm": 0.0019341334700584412,
"learning_rate": 2.824582824582825e-06,
"loss": 0.0002,
"step": 10550
},
{
"epoch": 4.3,
"grad_norm": 0.0015359672252088785,
"learning_rate": 2.8083028083028087e-06,
"loss": 0.0002,
"step": 10560
},
{
"epoch": 4.3,
"grad_norm": 0.001660957932472229,
"learning_rate": 2.792022792022792e-06,
"loss": 0.0002,
"step": 10570
},
{
"epoch": 4.31,
"grad_norm": 0.002642634091898799,
"learning_rate": 2.7757427757427757e-06,
"loss": 0.0002,
"step": 10580
},
{
"epoch": 4.31,
"grad_norm": 0.001577245187945664,
"learning_rate": 2.75946275946276e-06,
"loss": 0.0002,
"step": 10590
},
{
"epoch": 4.31,
"grad_norm": 0.0016623500268906355,
"learning_rate": 2.7431827431827436e-06,
"loss": 0.0274,
"step": 10600
},
{
"epoch": 4.32,
"grad_norm": 0.001559157157316804,
"learning_rate": 2.726902726902727e-06,
"loss": 0.0002,
"step": 10610
},
{
"epoch": 4.32,
"grad_norm": 0.0015379212563857436,
"learning_rate": 2.7106227106227106e-06,
"loss": 0.0002,
"step": 10620
},
{
"epoch": 4.33,
"grad_norm": 0.001682962873019278,
"learning_rate": 2.694342694342695e-06,
"loss": 0.0002,
"step": 10630
},
{
"epoch": 4.33,
"grad_norm": 0.0015784628922119737,
"learning_rate": 2.6780626780626785e-06,
"loss": 0.0002,
"step": 10640
},
{
"epoch": 4.33,
"grad_norm": 0.0015349604655057192,
"learning_rate": 2.661782661782662e-06,
"loss": 0.0002,
"step": 10650
},
{
"epoch": 4.34,
"grad_norm": 0.0015412438660860062,
"learning_rate": 2.6455026455026455e-06,
"loss": 0.0002,
"step": 10660
},
{
"epoch": 4.34,
"grad_norm": 0.0016461275517940521,
"learning_rate": 2.629222629222629e-06,
"loss": 0.0002,
"step": 10670
},
{
"epoch": 4.35,
"grad_norm": 0.0016684934962540865,
"learning_rate": 2.6129426129426134e-06,
"loss": 0.0002,
"step": 10680
},
{
"epoch": 4.35,
"grad_norm": 0.0015019102720543742,
"learning_rate": 2.596662596662597e-06,
"loss": 0.0002,
"step": 10690
},
{
"epoch": 4.35,
"grad_norm": 0.0015912950038909912,
"learning_rate": 2.5803825803825804e-06,
"loss": 0.0002,
"step": 10700
},
{
"epoch": 4.36,
"grad_norm": 0.002051288727670908,
"learning_rate": 2.564102564102564e-06,
"loss": 0.0002,
"step": 10710
},
{
"epoch": 4.36,
"grad_norm": 0.0014287488302215934,
"learning_rate": 2.5478225478225483e-06,
"loss": 0.0002,
"step": 10720
},
{
"epoch": 4.37,
"grad_norm": 0.0014953837962821126,
"learning_rate": 2.531542531542532e-06,
"loss": 0.0002,
"step": 10730
},
{
"epoch": 4.37,
"grad_norm": 0.0016842116601765156,
"learning_rate": 2.5152625152625154e-06,
"loss": 0.0002,
"step": 10740
},
{
"epoch": 4.38,
"grad_norm": 0.0016165722627192736,
"learning_rate": 2.4989824989824993e-06,
"loss": 0.0004,
"step": 10750
},
{
"epoch": 4.38,
"grad_norm": 0.0016578533686697483,
"learning_rate": 2.482702482702483e-06,
"loss": 0.0002,
"step": 10760
},
{
"epoch": 4.38,
"grad_norm": 0.001627171179279685,
"learning_rate": 2.4664224664224668e-06,
"loss": 0.0002,
"step": 10770
},
{
"epoch": 4.39,
"grad_norm": 0.0029889908619225025,
"learning_rate": 2.4501424501424503e-06,
"loss": 0.0002,
"step": 10780
},
{
"epoch": 4.39,
"grad_norm": 0.0015365415019914508,
"learning_rate": 2.433862433862434e-06,
"loss": 0.0002,
"step": 10790
},
{
"epoch": 4.4,
"grad_norm": 0.0019263201393187046,
"learning_rate": 2.4175824175824177e-06,
"loss": 0.0002,
"step": 10800
},
{
"epoch": 4.4,
"grad_norm": 0.001516710501164198,
"learning_rate": 2.4013024013024013e-06,
"loss": 0.0002,
"step": 10810
},
{
"epoch": 4.4,
"grad_norm": 0.001614395878277719,
"learning_rate": 2.385022385022385e-06,
"loss": 0.0002,
"step": 10820
},
{
"epoch": 4.41,
"grad_norm": 0.0014490768080577254,
"learning_rate": 2.3687423687423687e-06,
"loss": 0.0004,
"step": 10830
},
{
"epoch": 4.41,
"grad_norm": 0.0015428679762408137,
"learning_rate": 2.3524623524623527e-06,
"loss": 0.0002,
"step": 10840
},
{
"epoch": 4.42,
"grad_norm": 0.0015440605347976089,
"learning_rate": 2.336182336182336e-06,
"loss": 0.0004,
"step": 10850
},
{
"epoch": 4.42,
"grad_norm": 0.00148781796451658,
"learning_rate": 2.31990231990232e-06,
"loss": 0.0002,
"step": 10860
},
{
"epoch": 4.42,
"grad_norm": 0.0015348844463005662,
"learning_rate": 2.3036223036223036e-06,
"loss": 0.0002,
"step": 10870
},
{
"epoch": 4.43,
"grad_norm": 0.001880201743915677,
"learning_rate": 2.2873422873422876e-06,
"loss": 0.0002,
"step": 10880
},
{
"epoch": 4.43,
"grad_norm": 0.001558057265356183,
"learning_rate": 2.271062271062271e-06,
"loss": 0.0002,
"step": 10890
},
{
"epoch": 4.44,
"grad_norm": 0.010920335538685322,
"learning_rate": 2.254782254782255e-06,
"loss": 0.0002,
"step": 10900
},
{
"epoch": 4.44,
"grad_norm": 0.0014644470065832138,
"learning_rate": 2.2385022385022386e-06,
"loss": 0.0002,
"step": 10910
},
{
"epoch": 4.44,
"grad_norm": 0.0014618238201364875,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0002,
"step": 10920
},
{
"epoch": 4.45,
"grad_norm": 0.0016169185983017087,
"learning_rate": 2.205942205942206e-06,
"loss": 0.0002,
"step": 10930
},
{
"epoch": 4.45,
"grad_norm": 0.0014386329567059875,
"learning_rate": 2.1896621896621895e-06,
"loss": 0.0002,
"step": 10940
},
{
"epoch": 4.46,
"grad_norm": 0.0015079034492373466,
"learning_rate": 2.1733821733821735e-06,
"loss": 0.0002,
"step": 10950
},
{
"epoch": 4.46,
"grad_norm": 0.00197400595061481,
"learning_rate": 2.157102157102157e-06,
"loss": 0.0002,
"step": 10960
},
{
"epoch": 4.46,
"grad_norm": 0.001524322316981852,
"learning_rate": 2.140822140822141e-06,
"loss": 0.0311,
"step": 10970
},
{
"epoch": 4.47,
"grad_norm": 0.0014644163893535733,
"learning_rate": 2.1245421245421245e-06,
"loss": 0.0002,
"step": 10980
},
{
"epoch": 4.47,
"grad_norm": 0.0014774493174627423,
"learning_rate": 2.1082621082621084e-06,
"loss": 0.0002,
"step": 10990
},
{
"epoch": 4.48,
"grad_norm": 0.0014835140900686383,
"learning_rate": 2.091982091982092e-06,
"loss": 0.0002,
"step": 11000
},
{
"epoch": 4.48,
"grad_norm": 0.001458540791645646,
"learning_rate": 2.075702075702076e-06,
"loss": 0.0002,
"step": 11010
},
{
"epoch": 4.49,
"grad_norm": 0.002432051347568631,
"learning_rate": 2.05942205942206e-06,
"loss": 0.0002,
"step": 11020
},
{
"epoch": 4.49,
"grad_norm": 0.001562487450428307,
"learning_rate": 2.0431420431420433e-06,
"loss": 0.0353,
"step": 11030
},
{
"epoch": 4.49,
"grad_norm": 0.0016052748542279005,
"learning_rate": 2.0268620268620273e-06,
"loss": 0.0002,
"step": 11040
},
{
"epoch": 4.5,
"grad_norm": 0.001545790466479957,
"learning_rate": 2.0105820105820108e-06,
"loss": 0.0002,
"step": 11050
},
{
"epoch": 4.5,
"grad_norm": 0.001812846981920302,
"learning_rate": 1.9943019943019947e-06,
"loss": 0.0002,
"step": 11060
},
{
"epoch": 4.51,
"grad_norm": 0.0017415074398741126,
"learning_rate": 1.9780219780219782e-06,
"loss": 0.0002,
"step": 11070
},
{
"epoch": 4.51,
"grad_norm": 0.0016338001005351543,
"learning_rate": 1.961741961741962e-06,
"loss": 0.0002,
"step": 11080
},
{
"epoch": 4.51,
"grad_norm": 0.0014169925125315785,
"learning_rate": 1.9454619454619457e-06,
"loss": 0.0006,
"step": 11090
},
{
"epoch": 4.52,
"grad_norm": 0.0016671591438353062,
"learning_rate": 1.9291819291819296e-06,
"loss": 0.0002,
"step": 11100
},
{
"epoch": 4.52,
"grad_norm": 0.0033444638829678297,
"learning_rate": 1.912901912901913e-06,
"loss": 0.0002,
"step": 11110
},
{
"epoch": 4.53,
"grad_norm": 0.0015689071733504534,
"learning_rate": 1.8966218966218969e-06,
"loss": 0.0002,
"step": 11120
},
{
"epoch": 4.53,
"grad_norm": 0.0018193925498053432,
"learning_rate": 1.8803418803418804e-06,
"loss": 0.0002,
"step": 11130
},
{
"epoch": 4.53,
"grad_norm": 0.0015975474379956722,
"learning_rate": 1.8640618640618643e-06,
"loss": 0.0002,
"step": 11140
},
{
"epoch": 4.54,
"grad_norm": 0.0015228153206408024,
"learning_rate": 1.8477818477818479e-06,
"loss": 0.0002,
"step": 11150
},
{
"epoch": 4.54,
"grad_norm": 0.0017481072572991252,
"learning_rate": 1.8315018315018316e-06,
"loss": 0.0002,
"step": 11160
},
{
"epoch": 4.55,
"grad_norm": 0.0014254804700613022,
"learning_rate": 1.8152218152218153e-06,
"loss": 0.0002,
"step": 11170
},
{
"epoch": 4.55,
"grad_norm": 0.0014639191795140505,
"learning_rate": 1.798941798941799e-06,
"loss": 0.0002,
"step": 11180
},
{
"epoch": 4.55,
"grad_norm": 0.0014739630278199911,
"learning_rate": 1.7826617826617828e-06,
"loss": 0.0002,
"step": 11190
},
{
"epoch": 4.56,
"grad_norm": 0.001486291061155498,
"learning_rate": 1.7663817663817665e-06,
"loss": 0.0002,
"step": 11200
},
{
"epoch": 4.56,
"grad_norm": 0.0021130377426743507,
"learning_rate": 1.7501017501017502e-06,
"loss": 0.0002,
"step": 11210
},
{
"epoch": 4.57,
"grad_norm": 0.0014680501772090793,
"learning_rate": 1.733821733821734e-06,
"loss": 0.0002,
"step": 11220
},
{
"epoch": 4.57,
"grad_norm": 0.0018635701853781939,
"learning_rate": 1.7175417175417177e-06,
"loss": 0.0002,
"step": 11230
},
{
"epoch": 4.57,
"grad_norm": 0.0015968162333592772,
"learning_rate": 1.7012617012617014e-06,
"loss": 0.0094,
"step": 11240
},
{
"epoch": 4.58,
"grad_norm": 0.0017093609785661101,
"learning_rate": 1.6849816849816852e-06,
"loss": 0.0002,
"step": 11250
},
{
"epoch": 4.58,
"grad_norm": 0.0014892058679834008,
"learning_rate": 1.6687016687016689e-06,
"loss": 0.0139,
"step": 11260
},
{
"epoch": 4.59,
"grad_norm": 0.0014219109434634447,
"learning_rate": 1.6524216524216524e-06,
"loss": 0.0002,
"step": 11270
},
{
"epoch": 4.59,
"grad_norm": 0.004563894122838974,
"learning_rate": 1.6361416361416363e-06,
"loss": 0.0002,
"step": 11280
},
{
"epoch": 4.6,
"grad_norm": 0.0014352177968248725,
"learning_rate": 1.6198616198616199e-06,
"loss": 0.0002,
"step": 11290
},
{
"epoch": 4.6,
"grad_norm": 0.001390959369018674,
"learning_rate": 1.6035816035816038e-06,
"loss": 0.0002,
"step": 11300
},
{
"epoch": 4.6,
"grad_norm": 0.0038959532976150513,
"learning_rate": 1.5873015873015873e-06,
"loss": 0.0002,
"step": 11310
},
{
"epoch": 4.61,
"grad_norm": 0.001589680789038539,
"learning_rate": 1.5710215710215713e-06,
"loss": 0.0002,
"step": 11320
},
{
"epoch": 4.61,
"grad_norm": 0.001737966202199459,
"learning_rate": 1.5547415547415548e-06,
"loss": 0.0002,
"step": 11330
},
{
"epoch": 4.62,
"grad_norm": 0.0014157581608742476,
"learning_rate": 1.5384615384615387e-06,
"loss": 0.0002,
"step": 11340
},
{
"epoch": 4.62,
"grad_norm": 0.0018974760314449668,
"learning_rate": 1.5221815221815222e-06,
"loss": 0.0002,
"step": 11350
},
{
"epoch": 4.62,
"grad_norm": 0.0015809000469744205,
"learning_rate": 1.5059015059015062e-06,
"loss": 0.0002,
"step": 11360
},
{
"epoch": 4.63,
"grad_norm": 0.01224368717521429,
"learning_rate": 1.4896214896214897e-06,
"loss": 0.0002,
"step": 11370
},
{
"epoch": 4.63,
"grad_norm": 0.0015656572068110108,
"learning_rate": 1.4733414733414736e-06,
"loss": 0.0002,
"step": 11380
},
{
"epoch": 4.64,
"grad_norm": 0.0015062758466228843,
"learning_rate": 1.4570614570614572e-06,
"loss": 0.0002,
"step": 11390
},
{
"epoch": 4.64,
"grad_norm": 0.001575302449055016,
"learning_rate": 1.4407814407814407e-06,
"loss": 0.0002,
"step": 11400
},
{
"epoch": 4.64,
"grad_norm": 0.0014867339050397277,
"learning_rate": 1.4245014245014246e-06,
"loss": 0.0002,
"step": 11410
},
{
"epoch": 4.65,
"grad_norm": 0.0014463013503700495,
"learning_rate": 1.4082214082214083e-06,
"loss": 0.0002,
"step": 11420
},
{
"epoch": 4.65,
"grad_norm": 0.0014738457975909114,
"learning_rate": 1.391941391941392e-06,
"loss": 0.0002,
"step": 11430
},
{
"epoch": 4.66,
"grad_norm": 0.0033326647244393826,
"learning_rate": 1.3756613756613758e-06,
"loss": 0.0002,
"step": 11440
},
{
"epoch": 4.66,
"grad_norm": 0.0014288641978055239,
"learning_rate": 1.3593813593813595e-06,
"loss": 0.0002,
"step": 11450
},
{
"epoch": 4.66,
"grad_norm": 0.0015226053074002266,
"learning_rate": 1.3431013431013433e-06,
"loss": 0.0002,
"step": 11460
},
{
"epoch": 4.67,
"grad_norm": 0.0014885494019836187,
"learning_rate": 1.326821326821327e-06,
"loss": 0.0002,
"step": 11470
},
{
"epoch": 4.67,
"grad_norm": 0.0014367675175890326,
"learning_rate": 1.3105413105413107e-06,
"loss": 0.0002,
"step": 11480
},
{
"epoch": 4.68,
"grad_norm": 0.0014275162247940898,
"learning_rate": 1.2942612942612944e-06,
"loss": 0.0002,
"step": 11490
},
{
"epoch": 4.68,
"grad_norm": 0.0014797335024923086,
"learning_rate": 1.2779812779812782e-06,
"loss": 0.0002,
"step": 11500
},
{
"epoch": 4.68,
"grad_norm": 0.0014295239234343171,
"learning_rate": 1.2617012617012617e-06,
"loss": 0.0002,
"step": 11510
},
{
"epoch": 4.69,
"grad_norm": 0.0014915807405486703,
"learning_rate": 1.2454212454212456e-06,
"loss": 0.0002,
"step": 11520
},
{
"epoch": 4.69,
"grad_norm": 0.0016227615997195244,
"learning_rate": 1.2291412291412294e-06,
"loss": 0.0002,
"step": 11530
},
{
"epoch": 4.7,
"grad_norm": 0.0014580420684069395,
"learning_rate": 1.212861212861213e-06,
"loss": 0.0002,
"step": 11540
},
{
"epoch": 4.7,
"grad_norm": 0.0017063523409888148,
"learning_rate": 1.1965811965811968e-06,
"loss": 0.0002,
"step": 11550
},
{
"epoch": 4.7,
"grad_norm": 0.0014365671668201685,
"learning_rate": 1.1803011803011806e-06,
"loss": 0.0002,
"step": 11560
},
{
"epoch": 4.71,
"grad_norm": 1.2260816097259521,
"learning_rate": 1.164021164021164e-06,
"loss": 0.006,
"step": 11570
},
{
"epoch": 4.71,
"grad_norm": 0.0018874687375500798,
"learning_rate": 1.1477411477411478e-06,
"loss": 0.0002,
"step": 11580
},
{
"epoch": 4.72,
"grad_norm": 0.0013750126818194985,
"learning_rate": 1.1314611314611315e-06,
"loss": 0.0002,
"step": 11590
},
{
"epoch": 4.72,
"grad_norm": 0.001419686945155263,
"learning_rate": 1.1151811151811153e-06,
"loss": 0.0002,
"step": 11600
},
{
"epoch": 4.73,
"grad_norm": 0.0014118729159235954,
"learning_rate": 1.098901098901099e-06,
"loss": 0.0002,
"step": 11610
},
{
"epoch": 4.73,
"grad_norm": 0.0014139912091195583,
"learning_rate": 1.0826210826210827e-06,
"loss": 0.0002,
"step": 11620
},
{
"epoch": 4.73,
"grad_norm": 0.0015478282002732158,
"learning_rate": 1.0663410663410665e-06,
"loss": 0.0002,
"step": 11630
},
{
"epoch": 4.74,
"grad_norm": 0.0015366330044344068,
"learning_rate": 1.0500610500610502e-06,
"loss": 0.0002,
"step": 11640
},
{
"epoch": 4.74,
"grad_norm": 0.001490729977376759,
"learning_rate": 1.033781033781034e-06,
"loss": 0.0002,
"step": 11650
},
{
"epoch": 4.75,
"grad_norm": 0.001423732377588749,
"learning_rate": 1.0175010175010176e-06,
"loss": 0.0002,
"step": 11660
},
{
"epoch": 4.75,
"grad_norm": 0.0014315071748569608,
"learning_rate": 1.0012210012210014e-06,
"loss": 0.0002,
"step": 11670
},
{
"epoch": 4.75,
"grad_norm": 0.001486779423430562,
"learning_rate": 9.84940984940985e-07,
"loss": 0.0002,
"step": 11680
},
{
"epoch": 4.76,
"grad_norm": 0.004522906616330147,
"learning_rate": 9.686609686609686e-07,
"loss": 0.0002,
"step": 11690
},
{
"epoch": 4.76,
"grad_norm": 0.0014580250717699528,
"learning_rate": 9.523809523809525e-07,
"loss": 0.0002,
"step": 11700
},
{
"epoch": 4.77,
"grad_norm": 0.0015162237687036395,
"learning_rate": 9.361009361009362e-07,
"loss": 0.0092,
"step": 11710
},
{
"epoch": 4.77,
"grad_norm": 0.0013567224377766252,
"learning_rate": 9.198209198209199e-07,
"loss": 0.0002,
"step": 11720
},
{
"epoch": 4.77,
"grad_norm": 0.0014696550788357854,
"learning_rate": 9.035409035409036e-07,
"loss": 0.0002,
"step": 11730
},
{
"epoch": 4.78,
"grad_norm": 0.0014795665629208088,
"learning_rate": 8.872608872608874e-07,
"loss": 0.0002,
"step": 11740
},
{
"epoch": 4.78,
"grad_norm": 0.0015325862914323807,
"learning_rate": 8.709808709808711e-07,
"loss": 0.0002,
"step": 11750
},
{
"epoch": 4.79,
"grad_norm": 0.0014404217945411801,
"learning_rate": 8.547008547008548e-07,
"loss": 0.0002,
"step": 11760
},
{
"epoch": 4.79,
"grad_norm": 0.0019404751947149634,
"learning_rate": 8.384208384208386e-07,
"loss": 0.0002,
"step": 11770
},
{
"epoch": 4.79,
"grad_norm": 0.0016487601678818464,
"learning_rate": 8.221408221408223e-07,
"loss": 0.0002,
"step": 11780
},
{
"epoch": 4.8,
"grad_norm": 0.0020900049712508917,
"learning_rate": 8.05860805860806e-07,
"loss": 0.0002,
"step": 11790
},
{
"epoch": 4.8,
"grad_norm": 0.044903818517923355,
"learning_rate": 7.895807895807897e-07,
"loss": 0.0002,
"step": 11800
},
{
"epoch": 4.81,
"grad_norm": 0.006237754598259926,
"learning_rate": 7.733007733007733e-07,
"loss": 0.0002,
"step": 11810
},
{
"epoch": 4.81,
"grad_norm": 0.001496842596679926,
"learning_rate": 7.57020757020757e-07,
"loss": 0.0002,
"step": 11820
},
{
"epoch": 4.81,
"grad_norm": 0.0014312907587736845,
"learning_rate": 7.407407407407407e-07,
"loss": 0.0002,
"step": 11830
},
{
"epoch": 4.82,
"grad_norm": 0.0020331472624093294,
"learning_rate": 7.244607244607245e-07,
"loss": 0.0002,
"step": 11840
},
{
"epoch": 4.82,
"grad_norm": 0.0015430136118084192,
"learning_rate": 7.081807081807082e-07,
"loss": 0.0002,
"step": 11850
},
{
"epoch": 4.83,
"grad_norm": 0.0014734879368916154,
"learning_rate": 6.919006919006919e-07,
"loss": 0.0002,
"step": 11860
},
{
"epoch": 4.83,
"grad_norm": 0.006134878844022751,
"learning_rate": 6.756206756206756e-07,
"loss": 0.0002,
"step": 11870
},
{
"epoch": 4.84,
"grad_norm": 0.0013609755551442504,
"learning_rate": 6.593406593406594e-07,
"loss": 0.0002,
"step": 11880
},
{
"epoch": 4.84,
"grad_norm": 0.002070717280730605,
"learning_rate": 6.430606430606431e-07,
"loss": 0.0002,
"step": 11890
},
{
"epoch": 4.84,
"grad_norm": 0.0014169508358463645,
"learning_rate": 6.267806267806268e-07,
"loss": 0.0002,
"step": 11900
},
{
"epoch": 4.85,
"grad_norm": 0.0014770817942917347,
"learning_rate": 6.105006105006106e-07,
"loss": 0.0002,
"step": 11910
},
{
"epoch": 4.85,
"grad_norm": 0.0014419537037611008,
"learning_rate": 5.942205942205943e-07,
"loss": 0.0002,
"step": 11920
},
{
"epoch": 4.86,
"grad_norm": 0.001446893555112183,
"learning_rate": 5.77940577940578e-07,
"loss": 0.0002,
"step": 11930
},
{
"epoch": 4.86,
"grad_norm": 0.001416919520124793,
"learning_rate": 5.616605616605618e-07,
"loss": 0.0002,
"step": 11940
},
{
"epoch": 4.86,
"grad_norm": 0.0034949700348079205,
"learning_rate": 5.453805453805455e-07,
"loss": 0.0002,
"step": 11950
},
{
"epoch": 4.87,
"grad_norm": 0.0014441277598962188,
"learning_rate": 5.291005291005291e-07,
"loss": 0.0002,
"step": 11960
},
{
"epoch": 4.87,
"grad_norm": 0.0015632550930604339,
"learning_rate": 5.128205128205128e-07,
"loss": 0.0002,
"step": 11970
},
{
"epoch": 4.88,
"grad_norm": 0.001399176544509828,
"learning_rate": 4.965404965404966e-07,
"loss": 0.0002,
"step": 11980
},
{
"epoch": 4.88,
"grad_norm": 0.0013975553447380662,
"learning_rate": 4.802604802604803e-07,
"loss": 0.0002,
"step": 11990
},
{
"epoch": 4.88,
"grad_norm": 0.0013712114887312055,
"learning_rate": 4.63980463980464e-07,
"loss": 0.0002,
"step": 12000
},
{
"epoch": 4.89,
"grad_norm": 0.001828977488912642,
"learning_rate": 4.4770044770044775e-07,
"loss": 0.0002,
"step": 12010
},
{
"epoch": 4.89,
"grad_norm": 0.0014294543070718646,
"learning_rate": 4.3142043142043143e-07,
"loss": 0.0002,
"step": 12020
},
{
"epoch": 4.9,
"grad_norm": 0.0013922780053690076,
"learning_rate": 4.1514041514041516e-07,
"loss": 0.0002,
"step": 12030
},
{
"epoch": 4.9,
"grad_norm": 0.0016130340518429875,
"learning_rate": 3.988603988603989e-07,
"loss": 0.0002,
"step": 12040
},
{
"epoch": 4.9,
"grad_norm": 0.0013872876297682524,
"learning_rate": 3.825803825803826e-07,
"loss": 0.0002,
"step": 12050
},
{
"epoch": 4.91,
"grad_norm": 0.0014586036559194326,
"learning_rate": 3.6630036630036635e-07,
"loss": 0.0002,
"step": 12060
},
{
"epoch": 4.91,
"grad_norm": 0.0014334677252918482,
"learning_rate": 3.500203500203501e-07,
"loss": 0.0002,
"step": 12070
},
{
"epoch": 4.92,
"grad_norm": 0.0014047607546672225,
"learning_rate": 3.3374033374033376e-07,
"loss": 0.0002,
"step": 12080
},
{
"epoch": 4.92,
"grad_norm": 0.0013850359246134758,
"learning_rate": 3.174603174603175e-07,
"loss": 0.0002,
"step": 12090
},
{
"epoch": 4.92,
"grad_norm": 0.0013912185095250607,
"learning_rate": 3.011803011803012e-07,
"loss": 0.0002,
"step": 12100
},
{
"epoch": 4.93,
"grad_norm": 0.001442193053662777,
"learning_rate": 2.8490028490028494e-07,
"loss": 0.0002,
"step": 12110
},
{
"epoch": 4.93,
"grad_norm": 0.0014724673237651587,
"learning_rate": 2.6862026862026867e-07,
"loss": 0.0002,
"step": 12120
},
{
"epoch": 4.94,
"grad_norm": 0.0017670753877609968,
"learning_rate": 2.5234025234025235e-07,
"loss": 0.0002,
"step": 12130
},
{
"epoch": 4.94,
"grad_norm": 0.001458752085454762,
"learning_rate": 2.3606023606023608e-07,
"loss": 0.0002,
"step": 12140
},
{
"epoch": 4.95,
"grad_norm": 0.0015336443902924657,
"learning_rate": 2.197802197802198e-07,
"loss": 0.0002,
"step": 12150
},
{
"epoch": 4.95,
"grad_norm": 0.001413301331922412,
"learning_rate": 2.035002035002035e-07,
"loss": 0.0002,
"step": 12160
},
{
"epoch": 4.95,
"grad_norm": 0.001454474637284875,
"learning_rate": 1.8722018722018724e-07,
"loss": 0.0002,
"step": 12170
},
{
"epoch": 4.96,
"grad_norm": 0.0014644395560026169,
"learning_rate": 1.7094017094017097e-07,
"loss": 0.0002,
"step": 12180
},
{
"epoch": 4.96,
"grad_norm": 0.0014874679036438465,
"learning_rate": 1.5466015466015467e-07,
"loss": 0.0002,
"step": 12190
},
{
"epoch": 4.97,
"grad_norm": 0.0014028714504092932,
"learning_rate": 1.383801383801384e-07,
"loss": 0.0002,
"step": 12200
},
{
"epoch": 4.97,
"grad_norm": 0.0014859441434964538,
"learning_rate": 1.221001221001221e-07,
"loss": 0.0002,
"step": 12210
},
{
"epoch": 4.97,
"grad_norm": 0.0014206055784597993,
"learning_rate": 1.0582010582010582e-07,
"loss": 0.0002,
"step": 12220
},
{
"epoch": 4.98,
"grad_norm": 0.0013865531655028462,
"learning_rate": 8.954008954008955e-08,
"loss": 0.0002,
"step": 12230
},
{
"epoch": 4.98,
"grad_norm": 0.0014404187677428126,
"learning_rate": 7.326007326007327e-08,
"loss": 0.0002,
"step": 12240
},
{
"epoch": 4.99,
"grad_norm": 0.0015573910204693675,
"learning_rate": 5.6980056980056986e-08,
"loss": 0.0003,
"step": 12250
},
{
"epoch": 4.99,
"grad_norm": 0.0015043216990306973,
"learning_rate": 4.07000407000407e-08,
"loss": 0.0002,
"step": 12260
},
{
"epoch": 4.99,
"grad_norm": 0.0015565322246402502,
"learning_rate": 2.4420024420024422e-08,
"loss": 0.0002,
"step": 12270
},
{
"epoch": 5.0,
"grad_norm": 0.003684895345941186,
"learning_rate": 8.14000814000814e-09,
"loss": 0.0002,
"step": 12280
},
{
"epoch": 5.0,
"step": 12285,
"total_flos": 1.523143801869613e+19,
"train_loss": 0.006059203078136595,
"train_runtime": 4479.7513,
"train_samples_per_second": 43.876,
"train_steps_per_second": 2.742
}
],
"logging_steps": 10,
"max_steps": 12285,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"total_flos": 1.523143801869613e+19,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}