{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 12285, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.3860865831375122, "learning_rate": 1.9983719983719984e-05, "loss": 0.684, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.3011243343353271, "learning_rate": 1.996743996743997e-05, "loss": 0.6568, "step": 20 }, { "epoch": 0.01, "grad_norm": 1.646723985671997, "learning_rate": 1.9951159951159952e-05, "loss": 0.6477, "step": 30 }, { "epoch": 0.02, "grad_norm": 1.43569016456604, "learning_rate": 1.9934879934879937e-05, "loss": 0.61, "step": 40 }, { "epoch": 0.02, "grad_norm": 1.4323921203613281, "learning_rate": 1.991859991859992e-05, "loss": 0.5758, "step": 50 }, { "epoch": 0.02, "grad_norm": 2.1664583683013916, "learning_rate": 1.9902319902319905e-05, "loss": 0.5529, "step": 60 }, { "epoch": 0.03, "grad_norm": 1.935644268989563, "learning_rate": 1.9886039886039888e-05, "loss": 0.4969, "step": 70 }, { "epoch": 0.03, "grad_norm": 2.984022617340088, "learning_rate": 1.986975986975987e-05, "loss": 0.5017, "step": 80 }, { "epoch": 0.04, "grad_norm": 1.9753074645996094, "learning_rate": 1.9853479853479855e-05, "loss": 0.4438, "step": 90 }, { "epoch": 0.04, "grad_norm": 4.39138650894165, "learning_rate": 1.9837199837199838e-05, "loss": 0.4033, "step": 100 }, { "epoch": 0.04, "grad_norm": 3.0486788749694824, "learning_rate": 1.9820919820919823e-05, "loss": 0.3642, "step": 110 }, { "epoch": 0.05, "grad_norm": 1.738529920578003, "learning_rate": 1.9804639804639806e-05, "loss": 0.3557, "step": 120 }, { "epoch": 0.05, "grad_norm": 2.9336562156677246, "learning_rate": 1.978835978835979e-05, "loss": 0.3655, "step": 130 }, { "epoch": 0.06, "grad_norm": 2.0220277309417725, "learning_rate": 1.9772079772079773e-05, "loss": 0.2903, "step": 140 }, { "epoch": 0.06, "grad_norm": 2.4428532123565674, "learning_rate": 1.975579975579976e-05, "loss": 0.2706, "step": 150 }, { "epoch": 0.07, "grad_norm": 5.031763076782227, "learning_rate": 1.973951973951974e-05, "loss": 0.3558, "step": 160 }, { "epoch": 0.07, "grad_norm": 3.3514373302459717, "learning_rate": 1.9723239723239724e-05, "loss": 0.232, "step": 170 }, { "epoch": 0.07, "grad_norm": 1.0613574981689453, "learning_rate": 1.970695970695971e-05, "loss": 0.2499, "step": 180 }, { "epoch": 0.08, "grad_norm": 7.994803428649902, "learning_rate": 1.969067969067969e-05, "loss": 0.2609, "step": 190 }, { "epoch": 0.08, "grad_norm": 1.9980034828186035, "learning_rate": 1.9674399674399677e-05, "loss": 0.2267, "step": 200 }, { "epoch": 0.09, "grad_norm": 2.297769069671631, "learning_rate": 1.965811965811966e-05, "loss": 0.2371, "step": 210 }, { "epoch": 0.09, "grad_norm": 5.071080207824707, "learning_rate": 1.9641839641839645e-05, "loss": 0.248, "step": 220 }, { "epoch": 0.09, "grad_norm": 0.43589872121810913, "learning_rate": 1.9625559625559627e-05, "loss": 0.151, "step": 230 }, { "epoch": 0.1, "grad_norm": 6.790423393249512, "learning_rate": 1.960927960927961e-05, "loss": 0.1886, "step": 240 }, { "epoch": 0.1, "grad_norm": 1.1880414485931396, "learning_rate": 1.9592999592999595e-05, "loss": 0.2104, "step": 250 }, { "epoch": 0.11, "grad_norm": 6.434396266937256, "learning_rate": 1.9576719576719577e-05, "loss": 0.2847, "step": 260 }, { "epoch": 0.11, "grad_norm": 2.175398826599121, "learning_rate": 1.9560439560439563e-05, "loss": 0.261, "step": 270 }, { "epoch": 0.11, "grad_norm": 2.2410614490509033, "learning_rate": 1.9544159544159545e-05, "loss": 0.1376, "step": 280 }, { "epoch": 0.12, "grad_norm": 0.5503996014595032, "learning_rate": 1.952787952787953e-05, "loss": 0.2034, "step": 290 }, { "epoch": 0.12, "grad_norm": 3.550145387649536, "learning_rate": 1.9511599511599513e-05, "loss": 0.1802, "step": 300 }, { "epoch": 0.13, "grad_norm": 0.35100820660591125, "learning_rate": 1.94953194953195e-05, "loss": 0.1778, "step": 310 }, { "epoch": 0.13, "grad_norm": 1.0361884832382202, "learning_rate": 1.947903947903948e-05, "loss": 0.2186, "step": 320 }, { "epoch": 0.13, "grad_norm": 7.980532169342041, "learning_rate": 1.9462759462759463e-05, "loss": 0.2742, "step": 330 }, { "epoch": 0.14, "grad_norm": 4.8567585945129395, "learning_rate": 1.9446479446479445e-05, "loss": 0.1544, "step": 340 }, { "epoch": 0.14, "grad_norm": 5.355805397033691, "learning_rate": 1.943019943019943e-05, "loss": 0.2739, "step": 350 }, { "epoch": 0.15, "grad_norm": 6.359828472137451, "learning_rate": 1.9413919413919417e-05, "loss": 0.2076, "step": 360 }, { "epoch": 0.15, "grad_norm": 0.9936553239822388, "learning_rate": 1.93976393976394e-05, "loss": 0.1666, "step": 370 }, { "epoch": 0.15, "grad_norm": 6.090355396270752, "learning_rate": 1.9381359381359385e-05, "loss": 0.1941, "step": 380 }, { "epoch": 0.16, "grad_norm": 2.5009548664093018, "learning_rate": 1.9365079365079367e-05, "loss": 0.1905, "step": 390 }, { "epoch": 0.16, "grad_norm": 8.634650230407715, "learning_rate": 1.934879934879935e-05, "loss": 0.1431, "step": 400 }, { "epoch": 0.17, "grad_norm": 2.43247389793396, "learning_rate": 1.9332519332519335e-05, "loss": 0.1736, "step": 410 }, { "epoch": 0.17, "grad_norm": 15.868481636047363, "learning_rate": 1.9316239316239317e-05, "loss": 0.249, "step": 420 }, { "epoch": 0.18, "grad_norm": 1.8422390222549438, "learning_rate": 1.9299959299959303e-05, "loss": 0.1407, "step": 430 }, { "epoch": 0.18, "grad_norm": 5.148740291595459, "learning_rate": 1.9283679283679285e-05, "loss": 0.1503, "step": 440 }, { "epoch": 0.18, "grad_norm": 2.3315675258636475, "learning_rate": 1.926739926739927e-05, "loss": 0.1885, "step": 450 }, { "epoch": 0.19, "grad_norm": 3.6225030422210693, "learning_rate": 1.9251119251119253e-05, "loss": 0.1403, "step": 460 }, { "epoch": 0.19, "grad_norm": 4.605388641357422, "learning_rate": 1.9234839234839235e-05, "loss": 0.2384, "step": 470 }, { "epoch": 0.2, "grad_norm": 2.3162589073181152, "learning_rate": 1.921855921855922e-05, "loss": 0.129, "step": 480 }, { "epoch": 0.2, "grad_norm": 0.4153892695903778, "learning_rate": 1.9202279202279203e-05, "loss": 0.1109, "step": 490 }, { "epoch": 0.2, "grad_norm": 7.691011905670166, "learning_rate": 1.9185999185999185e-05, "loss": 0.1846, "step": 500 }, { "epoch": 0.21, "grad_norm": 7.940028667449951, "learning_rate": 1.916971916971917e-05, "loss": 0.1391, "step": 510 }, { "epoch": 0.21, "grad_norm": 0.5145124793052673, "learning_rate": 1.9153439153439156e-05, "loss": 0.1288, "step": 520 }, { "epoch": 0.22, "grad_norm": 2.5785932540893555, "learning_rate": 1.913715913715914e-05, "loss": 0.1537, "step": 530 }, { "epoch": 0.22, "grad_norm": 6.997181415557861, "learning_rate": 1.9120879120879124e-05, "loss": 0.1578, "step": 540 }, { "epoch": 0.22, "grad_norm": 2.4879519939422607, "learning_rate": 1.9104599104599107e-05, "loss": 0.1954, "step": 550 }, { "epoch": 0.23, "grad_norm": 5.291905879974365, "learning_rate": 1.908831908831909e-05, "loss": 0.1557, "step": 560 }, { "epoch": 0.23, "grad_norm": 5.735557556152344, "learning_rate": 1.9072039072039074e-05, "loss": 0.1621, "step": 570 }, { "epoch": 0.24, "grad_norm": 5.979973316192627, "learning_rate": 1.9055759055759057e-05, "loss": 0.1503, "step": 580 }, { "epoch": 0.24, "grad_norm": 0.19126015901565552, "learning_rate": 1.9039479039479042e-05, "loss": 0.094, "step": 590 }, { "epoch": 0.24, "grad_norm": 1.0556552410125732, "learning_rate": 1.9023199023199025e-05, "loss": 0.1876, "step": 600 }, { "epoch": 0.25, "grad_norm": 3.954843759536743, "learning_rate": 1.900691900691901e-05, "loss": 0.3162, "step": 610 }, { "epoch": 0.25, "grad_norm": 0.14296281337738037, "learning_rate": 1.8990638990638992e-05, "loss": 0.1288, "step": 620 }, { "epoch": 0.26, "grad_norm": 8.772310256958008, "learning_rate": 1.8974358974358975e-05, "loss": 0.2622, "step": 630 }, { "epoch": 0.26, "grad_norm": 2.3941524028778076, "learning_rate": 1.895807895807896e-05, "loss": 0.1524, "step": 640 }, { "epoch": 0.26, "grad_norm": 8.104179382324219, "learning_rate": 1.8941798941798943e-05, "loss": 0.1109, "step": 650 }, { "epoch": 0.27, "grad_norm": 1.5782121419906616, "learning_rate": 1.8925518925518925e-05, "loss": 0.0729, "step": 660 }, { "epoch": 0.27, "grad_norm": 0.39667731523513794, "learning_rate": 1.890923890923891e-05, "loss": 0.1116, "step": 670 }, { "epoch": 0.28, "grad_norm": 5.58447265625, "learning_rate": 1.8892958892958896e-05, "loss": 0.1312, "step": 680 }, { "epoch": 0.28, "grad_norm": 1.3114192485809326, "learning_rate": 1.887667887667888e-05, "loss": 0.2522, "step": 690 }, { "epoch": 0.28, "grad_norm": 0.997601330280304, "learning_rate": 1.8860398860398864e-05, "loss": 0.1605, "step": 700 }, { "epoch": 0.29, "grad_norm": 3.605452537536621, "learning_rate": 1.8844118844118846e-05, "loss": 0.2443, "step": 710 }, { "epoch": 0.29, "grad_norm": 18.868513107299805, "learning_rate": 1.882783882783883e-05, "loss": 0.1923, "step": 720 }, { "epoch": 0.3, "grad_norm": 3.4352970123291016, "learning_rate": 1.881155881155881e-05, "loss": 0.1099, "step": 730 }, { "epoch": 0.3, "grad_norm": 2.06532883644104, "learning_rate": 1.8795278795278796e-05, "loss": 0.1426, "step": 740 }, { "epoch": 0.31, "grad_norm": 5.7250237464904785, "learning_rate": 1.8778998778998782e-05, "loss": 0.209, "step": 750 }, { "epoch": 0.31, "grad_norm": 0.23493361473083496, "learning_rate": 1.8762718762718764e-05, "loss": 0.1986, "step": 760 }, { "epoch": 0.31, "grad_norm": 17.769451141357422, "learning_rate": 1.874643874643875e-05, "loss": 0.1267, "step": 770 }, { "epoch": 0.32, "grad_norm": 0.27574750781059265, "learning_rate": 1.8730158730158732e-05, "loss": 0.1484, "step": 780 }, { "epoch": 0.32, "grad_norm": 0.30309033393859863, "learning_rate": 1.8713878713878714e-05, "loss": 0.1838, "step": 790 }, { "epoch": 0.33, "grad_norm": 17.183013916015625, "learning_rate": 1.86975986975987e-05, "loss": 0.135, "step": 800 }, { "epoch": 0.33, "grad_norm": 0.5572558641433716, "learning_rate": 1.8681318681318682e-05, "loss": 0.1456, "step": 810 }, { "epoch": 0.33, "grad_norm": 0.4613451659679413, "learning_rate": 1.8665038665038664e-05, "loss": 0.1337, "step": 820 }, { "epoch": 0.34, "grad_norm": 6.645438194274902, "learning_rate": 1.864875864875865e-05, "loss": 0.1446, "step": 830 }, { "epoch": 0.34, "grad_norm": 5.389886856079102, "learning_rate": 1.8632478632478636e-05, "loss": 0.1253, "step": 840 }, { "epoch": 0.35, "grad_norm": 14.86754322052002, "learning_rate": 1.8616198616198618e-05, "loss": 0.1346, "step": 850 }, { "epoch": 0.35, "grad_norm": 13.419057846069336, "learning_rate": 1.85999185999186e-05, "loss": 0.0926, "step": 860 }, { "epoch": 0.35, "grad_norm": 13.904304504394531, "learning_rate": 1.8583638583638586e-05, "loss": 0.1944, "step": 870 }, { "epoch": 0.36, "grad_norm": 0.28235912322998047, "learning_rate": 1.8567358567358568e-05, "loss": 0.1261, "step": 880 }, { "epoch": 0.36, "grad_norm": 5.711563587188721, "learning_rate": 1.855107855107855e-05, "loss": 0.1824, "step": 890 }, { "epoch": 0.37, "grad_norm": 17.74437141418457, "learning_rate": 1.8534798534798536e-05, "loss": 0.1742, "step": 900 }, { "epoch": 0.37, "grad_norm": 3.648202657699585, "learning_rate": 1.851851851851852e-05, "loss": 0.2103, "step": 910 }, { "epoch": 0.37, "grad_norm": 2.0693366527557373, "learning_rate": 1.8502238502238504e-05, "loss": 0.1868, "step": 920 }, { "epoch": 0.38, "grad_norm": 2.299172878265381, "learning_rate": 1.848595848595849e-05, "loss": 0.168, "step": 930 }, { "epoch": 0.38, "grad_norm": 1.4059839248657227, "learning_rate": 1.8469678469678472e-05, "loss": 0.1455, "step": 940 }, { "epoch": 0.39, "grad_norm": 0.926304817199707, "learning_rate": 1.8453398453398454e-05, "loss": 0.2002, "step": 950 }, { "epoch": 0.39, "grad_norm": 4.728667736053467, "learning_rate": 1.8437118437118436e-05, "loss": 0.1245, "step": 960 }, { "epoch": 0.39, "grad_norm": 0.5045005083084106, "learning_rate": 1.8420838420838422e-05, "loss": 0.0638, "step": 970 }, { "epoch": 0.4, "grad_norm": 8.82520580291748, "learning_rate": 1.8404558404558404e-05, "loss": 0.1127, "step": 980 }, { "epoch": 0.4, "grad_norm": 5.101595401763916, "learning_rate": 1.838827838827839e-05, "loss": 0.2363, "step": 990 }, { "epoch": 0.41, "grad_norm": 7.01576566696167, "learning_rate": 1.8371998371998375e-05, "loss": 0.1026, "step": 1000 }, { "epoch": 0.41, "grad_norm": 0.865003764629364, "learning_rate": 1.8355718355718358e-05, "loss": 0.0965, "step": 1010 }, { "epoch": 0.42, "grad_norm": 9.897397994995117, "learning_rate": 1.833943833943834e-05, "loss": 0.1156, "step": 1020 }, { "epoch": 0.42, "grad_norm": 1.5007679462432861, "learning_rate": 1.8323158323158326e-05, "loss": 0.1888, "step": 1030 }, { "epoch": 0.42, "grad_norm": 4.676563262939453, "learning_rate": 1.8306878306878308e-05, "loss": 0.1552, "step": 1040 }, { "epoch": 0.43, "grad_norm": 5.3361430168151855, "learning_rate": 1.829059829059829e-05, "loss": 0.1447, "step": 1050 }, { "epoch": 0.43, "grad_norm": 0.8933970332145691, "learning_rate": 1.8274318274318276e-05, "loss": 0.1394, "step": 1060 }, { "epoch": 0.44, "grad_norm": 7.401905059814453, "learning_rate": 1.825803825803826e-05, "loss": 0.2188, "step": 1070 }, { "epoch": 0.44, "grad_norm": 0.4379027783870697, "learning_rate": 1.8241758241758244e-05, "loss": 0.1277, "step": 1080 }, { "epoch": 0.44, "grad_norm": 2.8909428119659424, "learning_rate": 1.8225478225478226e-05, "loss": 0.1726, "step": 1090 }, { "epoch": 0.45, "grad_norm": 0.11447061598300934, "learning_rate": 1.820919820919821e-05, "loss": 0.1523, "step": 1100 }, { "epoch": 0.45, "grad_norm": 0.12276914715766907, "learning_rate": 1.8192918192918194e-05, "loss": 0.1823, "step": 1110 }, { "epoch": 0.46, "grad_norm": 1.3844455480575562, "learning_rate": 1.8176638176638176e-05, "loss": 0.1006, "step": 1120 }, { "epoch": 0.46, "grad_norm": 3.1034061908721924, "learning_rate": 1.816035816035816e-05, "loss": 0.1387, "step": 1130 }, { "epoch": 0.46, "grad_norm": 8.602412223815918, "learning_rate": 1.8144078144078144e-05, "loss": 0.2345, "step": 1140 }, { "epoch": 0.47, "grad_norm": 0.13919875025749207, "learning_rate": 1.812779812779813e-05, "loss": 0.1653, "step": 1150 }, { "epoch": 0.47, "grad_norm": 0.34385234117507935, "learning_rate": 1.8111518111518115e-05, "loss": 0.2003, "step": 1160 }, { "epoch": 0.48, "grad_norm": 4.868250846862793, "learning_rate": 1.8095238095238097e-05, "loss": 0.1504, "step": 1170 }, { "epoch": 0.48, "grad_norm": 2.267928123474121, "learning_rate": 1.807895807895808e-05, "loss": 0.2023, "step": 1180 }, { "epoch": 0.48, "grad_norm": 3.634040594100952, "learning_rate": 1.8062678062678065e-05, "loss": 0.1682, "step": 1190 }, { "epoch": 0.49, "grad_norm": 1.8135625123977661, "learning_rate": 1.8046398046398047e-05, "loss": 0.2059, "step": 1200 }, { "epoch": 0.49, "grad_norm": 2.294635057449341, "learning_rate": 1.803011803011803e-05, "loss": 0.1498, "step": 1210 }, { "epoch": 0.5, "grad_norm": 3.8841567039489746, "learning_rate": 1.8013838013838015e-05, "loss": 0.0727, "step": 1220 }, { "epoch": 0.5, "grad_norm": 0.9216524958610535, "learning_rate": 1.7997557997558e-05, "loss": 0.1273, "step": 1230 }, { "epoch": 0.5, "grad_norm": 0.08572695404291153, "learning_rate": 1.7981277981277983e-05, "loss": 0.1066, "step": 1240 }, { "epoch": 0.51, "grad_norm": 5.445361137390137, "learning_rate": 1.7964997964997966e-05, "loss": 0.1894, "step": 1250 }, { "epoch": 0.51, "grad_norm": 4.239029407501221, "learning_rate": 1.794871794871795e-05, "loss": 0.0947, "step": 1260 }, { "epoch": 0.52, "grad_norm": 0.7807052135467529, "learning_rate": 1.7932437932437933e-05, "loss": 0.1692, "step": 1270 }, { "epoch": 0.52, "grad_norm": 0.1252571940422058, "learning_rate": 1.7916157916157916e-05, "loss": 0.0901, "step": 1280 }, { "epoch": 0.53, "grad_norm": 5.491313457489014, "learning_rate": 1.78998778998779e-05, "loss": 0.0849, "step": 1290 }, { "epoch": 0.53, "grad_norm": 0.3406262695789337, "learning_rate": 1.7883597883597884e-05, "loss": 0.1345, "step": 1300 }, { "epoch": 0.53, "grad_norm": 3.4588377475738525, "learning_rate": 1.786731786731787e-05, "loss": 0.1501, "step": 1310 }, { "epoch": 0.54, "grad_norm": 3.2964069843292236, "learning_rate": 1.7851037851037855e-05, "loss": 0.1679, "step": 1320 }, { "epoch": 0.54, "grad_norm": 6.95346212387085, "learning_rate": 1.7834757834757837e-05, "loss": 0.095, "step": 1330 }, { "epoch": 0.55, "grad_norm": 2.9120900630950928, "learning_rate": 1.781847781847782e-05, "loss": 0.1089, "step": 1340 }, { "epoch": 0.55, "grad_norm": 8.793939590454102, "learning_rate": 1.78021978021978e-05, "loss": 0.1338, "step": 1350 }, { "epoch": 0.55, "grad_norm": 0.08519359678030014, "learning_rate": 1.7785917785917787e-05, "loss": 0.0932, "step": 1360 }, { "epoch": 0.56, "grad_norm": 16.41631317138672, "learning_rate": 1.776963776963777e-05, "loss": 0.1376, "step": 1370 }, { "epoch": 0.56, "grad_norm": 3.0415103435516357, "learning_rate": 1.7753357753357755e-05, "loss": 0.1567, "step": 1380 }, { "epoch": 0.57, "grad_norm": 0.8246403336524963, "learning_rate": 1.773707773707774e-05, "loss": 0.122, "step": 1390 }, { "epoch": 0.57, "grad_norm": 2.198512077331543, "learning_rate": 1.7720797720797723e-05, "loss": 0.1528, "step": 1400 }, { "epoch": 0.57, "grad_norm": 0.5246292352676392, "learning_rate": 1.7704517704517705e-05, "loss": 0.1088, "step": 1410 }, { "epoch": 0.58, "grad_norm": 12.515607833862305, "learning_rate": 1.768823768823769e-05, "loss": 0.1627, "step": 1420 }, { "epoch": 0.58, "grad_norm": 13.734766006469727, "learning_rate": 1.7671957671957673e-05, "loss": 0.1475, "step": 1430 }, { "epoch": 0.59, "grad_norm": 2.593158483505249, "learning_rate": 1.7655677655677655e-05, "loss": 0.0968, "step": 1440 }, { "epoch": 0.59, "grad_norm": 0.3462279736995697, "learning_rate": 1.763939763939764e-05, "loss": 0.0854, "step": 1450 }, { "epoch": 0.59, "grad_norm": 1.6409497261047363, "learning_rate": 1.7623117623117623e-05, "loss": 0.2295, "step": 1460 }, { "epoch": 0.6, "grad_norm": 2.9609594345092773, "learning_rate": 1.760683760683761e-05, "loss": 0.1883, "step": 1470 }, { "epoch": 0.6, "grad_norm": 0.673570454120636, "learning_rate": 1.759055759055759e-05, "loss": 0.1369, "step": 1480 }, { "epoch": 0.61, "grad_norm": 0.2929579019546509, "learning_rate": 1.7574277574277577e-05, "loss": 0.1189, "step": 1490 }, { "epoch": 0.61, "grad_norm": 1.4493731260299683, "learning_rate": 1.755799755799756e-05, "loss": 0.1187, "step": 1500 }, { "epoch": 0.61, "grad_norm": 0.07135419547557831, "learning_rate": 1.754171754171754e-05, "loss": 0.0603, "step": 1510 }, { "epoch": 0.62, "grad_norm": 0.10734464973211288, "learning_rate": 1.7525437525437527e-05, "loss": 0.0217, "step": 1520 }, { "epoch": 0.62, "grad_norm": 0.2217961698770523, "learning_rate": 1.750915750915751e-05, "loss": 0.0303, "step": 1530 }, { "epoch": 0.63, "grad_norm": 0.14218159019947052, "learning_rate": 1.7492877492877495e-05, "loss": 0.0603, "step": 1540 }, { "epoch": 0.63, "grad_norm": 0.09605604410171509, "learning_rate": 1.747659747659748e-05, "loss": 0.0407, "step": 1550 }, { "epoch": 0.63, "grad_norm": 0.07094033062458038, "learning_rate": 1.7460317460317463e-05, "loss": 0.0202, "step": 1560 }, { "epoch": 0.64, "grad_norm": 4.650410175323486, "learning_rate": 1.7444037444037445e-05, "loss": 0.0611, "step": 1570 }, { "epoch": 0.64, "grad_norm": 23.229633331298828, "learning_rate": 1.742775742775743e-05, "loss": 0.0528, "step": 1580 }, { "epoch": 0.65, "grad_norm": 1.466739535331726, "learning_rate": 1.7411477411477413e-05, "loss": 0.07, "step": 1590 }, { "epoch": 0.65, "grad_norm": 0.05839679762721062, "learning_rate": 1.7395197395197395e-05, "loss": 0.0077, "step": 1600 }, { "epoch": 0.66, "grad_norm": 1.6192926168441772, "learning_rate": 1.737891737891738e-05, "loss": 0.0239, "step": 1610 }, { "epoch": 0.66, "grad_norm": 3.8529036045074463, "learning_rate": 1.7362637362637363e-05, "loss": 0.0976, "step": 1620 }, { "epoch": 0.66, "grad_norm": 0.24398411810398102, "learning_rate": 1.734635734635735e-05, "loss": 0.0079, "step": 1630 }, { "epoch": 0.67, "grad_norm": 0.04527588561177254, "learning_rate": 1.733007733007733e-05, "loss": 0.0065, "step": 1640 }, { "epoch": 0.67, "grad_norm": 6.153138160705566, "learning_rate": 1.7313797313797316e-05, "loss": 0.0364, "step": 1650 }, { "epoch": 0.68, "grad_norm": 0.03938959911465645, "learning_rate": 1.72975172975173e-05, "loss": 0.009, "step": 1660 }, { "epoch": 0.68, "grad_norm": 0.04055130481719971, "learning_rate": 1.728123728123728e-05, "loss": 0.0472, "step": 1670 }, { "epoch": 0.68, "grad_norm": 0.07095145434141159, "learning_rate": 1.7264957264957267e-05, "loss": 0.0078, "step": 1680 }, { "epoch": 0.69, "grad_norm": 2.7965128421783447, "learning_rate": 1.724867724867725e-05, "loss": 0.0559, "step": 1690 }, { "epoch": 0.69, "grad_norm": 6.2940592765808105, "learning_rate": 1.7232397232397234e-05, "loss": 0.0366, "step": 1700 }, { "epoch": 0.7, "grad_norm": 0.11980397999286652, "learning_rate": 1.721611721611722e-05, "loss": 0.0125, "step": 1710 }, { "epoch": 0.7, "grad_norm": 8.26235294342041, "learning_rate": 1.7199837199837202e-05, "loss": 0.0137, "step": 1720 }, { "epoch": 0.7, "grad_norm": 0.04125256836414337, "learning_rate": 1.7183557183557185e-05, "loss": 0.0051, "step": 1730 }, { "epoch": 0.71, "grad_norm": 0.03920783847570419, "learning_rate": 1.7167277167277167e-05, "loss": 0.0067, "step": 1740 }, { "epoch": 0.71, "grad_norm": 0.13922813534736633, "learning_rate": 1.7150997150997152e-05, "loss": 0.0374, "step": 1750 }, { "epoch": 0.72, "grad_norm": 0.034091122448444366, "learning_rate": 1.7134717134717135e-05, "loss": 0.006, "step": 1760 }, { "epoch": 0.72, "grad_norm": 10.509510040283203, "learning_rate": 1.711843711843712e-05, "loss": 0.0589, "step": 1770 }, { "epoch": 0.72, "grad_norm": 0.043251294642686844, "learning_rate": 1.7102157102157103e-05, "loss": 0.0226, "step": 1780 }, { "epoch": 0.73, "grad_norm": 0.8053480982780457, "learning_rate": 1.7085877085877088e-05, "loss": 0.0582, "step": 1790 }, { "epoch": 0.73, "grad_norm": 0.04081906005740166, "learning_rate": 1.706959706959707e-05, "loss": 0.0391, "step": 1800 }, { "epoch": 0.74, "grad_norm": 0.03760745748877525, "learning_rate": 1.7053317053317056e-05, "loss": 0.027, "step": 1810 }, { "epoch": 0.74, "grad_norm": 0.04111940413713455, "learning_rate": 1.7037037037037038e-05, "loss": 0.0368, "step": 1820 }, { "epoch": 0.74, "grad_norm": 2.6297411918640137, "learning_rate": 1.702075702075702e-05, "loss": 0.0478, "step": 1830 }, { "epoch": 0.75, "grad_norm": 0.1751009225845337, "learning_rate": 1.7004477004477006e-05, "loss": 0.0298, "step": 1840 }, { "epoch": 0.75, "grad_norm": 0.042650580406188965, "learning_rate": 1.698819698819699e-05, "loss": 0.0203, "step": 1850 }, { "epoch": 0.76, "grad_norm": 0.034141793847084045, "learning_rate": 1.6971916971916974e-05, "loss": 0.0044, "step": 1860 }, { "epoch": 0.76, "grad_norm": 0.03497103974223137, "learning_rate": 1.6955636955636956e-05, "loss": 0.0304, "step": 1870 }, { "epoch": 0.77, "grad_norm": 3.8585641384124756, "learning_rate": 1.6939356939356942e-05, "loss": 0.0365, "step": 1880 }, { "epoch": 0.77, "grad_norm": 0.0322452187538147, "learning_rate": 1.6923076923076924e-05, "loss": 0.0596, "step": 1890 }, { "epoch": 0.77, "grad_norm": 0.034800559282302856, "learning_rate": 1.6906796906796906e-05, "loss": 0.0061, "step": 1900 }, { "epoch": 0.78, "grad_norm": 0.0860045924782753, "learning_rate": 1.6890516890516892e-05, "loss": 0.0172, "step": 1910 }, { "epoch": 0.78, "grad_norm": 0.031149201095104218, "learning_rate": 1.6874236874236874e-05, "loss": 0.0238, "step": 1920 }, { "epoch": 0.79, "grad_norm": 0.03368987515568733, "learning_rate": 1.685795685795686e-05, "loss": 0.0043, "step": 1930 }, { "epoch": 0.79, "grad_norm": 0.03161125257611275, "learning_rate": 1.6841676841676846e-05, "loss": 0.0146, "step": 1940 }, { "epoch": 0.79, "grad_norm": 0.029046092182397842, "learning_rate": 1.6825396825396828e-05, "loss": 0.0236, "step": 1950 }, { "epoch": 0.8, "grad_norm": 0.9345057606697083, "learning_rate": 1.680911680911681e-05, "loss": 0.0042, "step": 1960 }, { "epoch": 0.8, "grad_norm": 0.028860267251729965, "learning_rate": 1.6792836792836796e-05, "loss": 0.0286, "step": 1970 }, { "epoch": 0.81, "grad_norm": 0.02852853201329708, "learning_rate": 1.6776556776556778e-05, "loss": 0.023, "step": 1980 }, { "epoch": 0.81, "grad_norm": 0.03128168359398842, "learning_rate": 1.676027676027676e-05, "loss": 0.0036, "step": 1990 }, { "epoch": 0.81, "grad_norm": 0.037479083985090256, "learning_rate": 1.6743996743996746e-05, "loss": 0.0591, "step": 2000 }, { "epoch": 0.82, "grad_norm": 0.04688659682869911, "learning_rate": 1.6727716727716728e-05, "loss": 0.0316, "step": 2010 }, { "epoch": 0.82, "grad_norm": 0.03302760049700737, "learning_rate": 1.6711436711436714e-05, "loss": 0.0668, "step": 2020 }, { "epoch": 0.83, "grad_norm": 0.06181880831718445, "learning_rate": 1.6695156695156696e-05, "loss": 0.0281, "step": 2030 }, { "epoch": 0.83, "grad_norm": 0.0320013165473938, "learning_rate": 1.667887667887668e-05, "loss": 0.0232, "step": 2040 }, { "epoch": 0.83, "grad_norm": 0.13600216805934906, "learning_rate": 1.6662596662596664e-05, "loss": 0.0442, "step": 2050 }, { "epoch": 0.84, "grad_norm": 0.12886099517345428, "learning_rate": 1.6646316646316646e-05, "loss": 0.0305, "step": 2060 }, { "epoch": 0.84, "grad_norm": 0.0625109001994133, "learning_rate": 1.6630036630036632e-05, "loss": 0.0233, "step": 2070 }, { "epoch": 0.85, "grad_norm": 13.604376792907715, "learning_rate": 1.6613756613756614e-05, "loss": 0.0288, "step": 2080 }, { "epoch": 0.85, "grad_norm": 0.029248738661408424, "learning_rate": 1.65974765974766e-05, "loss": 0.0039, "step": 2090 }, { "epoch": 0.85, "grad_norm": 1.4231517314910889, "learning_rate": 1.6581196581196585e-05, "loss": 0.0095, "step": 2100 }, { "epoch": 0.86, "grad_norm": 0.02830047346651554, "learning_rate": 1.6564916564916568e-05, "loss": 0.007, "step": 2110 }, { "epoch": 0.86, "grad_norm": 0.027091912925243378, "learning_rate": 1.654863654863655e-05, "loss": 0.0041, "step": 2120 }, { "epoch": 0.87, "grad_norm": 0.02793751284480095, "learning_rate": 1.6532356532356532e-05, "loss": 0.0087, "step": 2130 }, { "epoch": 0.87, "grad_norm": 0.030688917264342308, "learning_rate": 1.6516076516076518e-05, "loss": 0.0033, "step": 2140 }, { "epoch": 0.88, "grad_norm": 0.02540646307170391, "learning_rate": 1.64997964997965e-05, "loss": 0.0254, "step": 2150 }, { "epoch": 0.88, "grad_norm": 0.026573829352855682, "learning_rate": 1.6483516483516486e-05, "loss": 0.0195, "step": 2160 }, { "epoch": 0.88, "grad_norm": 0.025454262271523476, "learning_rate": 1.6467236467236468e-05, "loss": 0.0031, "step": 2170 }, { "epoch": 0.89, "grad_norm": 0.038121115416288376, "learning_rate": 1.6450956450956453e-05, "loss": 0.0035, "step": 2180 }, { "epoch": 0.89, "grad_norm": 0.025772370398044586, "learning_rate": 1.6434676434676436e-05, "loss": 0.003, "step": 2190 }, { "epoch": 0.9, "grad_norm": 3.4986250400543213, "learning_rate": 1.641839641839642e-05, "loss": 0.0038, "step": 2200 }, { "epoch": 0.9, "grad_norm": 25.038734436035156, "learning_rate": 1.6402116402116404e-05, "loss": 0.0119, "step": 2210 }, { "epoch": 0.9, "grad_norm": 0.025794176384806633, "learning_rate": 1.6385836385836386e-05, "loss": 0.0353, "step": 2220 }, { "epoch": 0.91, "grad_norm": 4.056914806365967, "learning_rate": 1.636955636955637e-05, "loss": 0.0517, "step": 2230 }, { "epoch": 0.91, "grad_norm": 0.19518433511257172, "learning_rate": 1.6353276353276354e-05, "loss": 0.0291, "step": 2240 }, { "epoch": 0.92, "grad_norm": 0.02424285002052784, "learning_rate": 1.633699633699634e-05, "loss": 0.0359, "step": 2250 }, { "epoch": 0.92, "grad_norm": 0.03164544701576233, "learning_rate": 1.632071632071632e-05, "loss": 0.0382, "step": 2260 }, { "epoch": 0.92, "grad_norm": 0.022855272516608238, "learning_rate": 1.6304436304436307e-05, "loss": 0.003, "step": 2270 }, { "epoch": 0.93, "grad_norm": 0.023591142147779465, "learning_rate": 1.628815628815629e-05, "loss": 0.0497, "step": 2280 }, { "epoch": 0.93, "grad_norm": 0.02427799627184868, "learning_rate": 1.627187627187627e-05, "loss": 0.0381, "step": 2290 }, { "epoch": 0.94, "grad_norm": 0.022075733169913292, "learning_rate": 1.6255596255596257e-05, "loss": 0.0038, "step": 2300 }, { "epoch": 0.94, "grad_norm": 0.25007203221321106, "learning_rate": 1.623931623931624e-05, "loss": 0.0364, "step": 2310 }, { "epoch": 0.94, "grad_norm": 0.02502160519361496, "learning_rate": 1.6223036223036225e-05, "loss": 0.0029, "step": 2320 }, { "epoch": 0.95, "grad_norm": 0.036409296095371246, "learning_rate": 1.6206756206756207e-05, "loss": 0.0387, "step": 2330 }, { "epoch": 0.95, "grad_norm": 0.027146685868501663, "learning_rate": 1.6190476190476193e-05, "loss": 0.0045, "step": 2340 }, { "epoch": 0.96, "grad_norm": 0.024981442838907242, "learning_rate": 1.6174196174196175e-05, "loss": 0.0264, "step": 2350 }, { "epoch": 0.96, "grad_norm": 0.027865292504429817, "learning_rate": 1.615791615791616e-05, "loss": 0.0029, "step": 2360 }, { "epoch": 0.96, "grad_norm": 0.034725822508335114, "learning_rate": 1.6141636141636143e-05, "loss": 0.0029, "step": 2370 }, { "epoch": 0.97, "grad_norm": 0.022250523790717125, "learning_rate": 1.6125356125356125e-05, "loss": 0.0337, "step": 2380 }, { "epoch": 0.97, "grad_norm": 0.024188194423913956, "learning_rate": 1.610907610907611e-05, "loss": 0.0026, "step": 2390 }, { "epoch": 0.98, "grad_norm": 0.02303464338183403, "learning_rate": 1.6092796092796093e-05, "loss": 0.0285, "step": 2400 }, { "epoch": 0.98, "grad_norm": 0.020316725596785545, "learning_rate": 1.607651607651608e-05, "loss": 0.0026, "step": 2410 }, { "epoch": 0.98, "grad_norm": 0.023156961426138878, "learning_rate": 1.606023606023606e-05, "loss": 0.0031, "step": 2420 }, { "epoch": 0.99, "grad_norm": 2.9847331047058105, "learning_rate": 1.6043956043956047e-05, "loss": 0.0034, "step": 2430 }, { "epoch": 0.99, "grad_norm": 10.845735549926758, "learning_rate": 1.602767602767603e-05, "loss": 0.0557, "step": 2440 }, { "epoch": 1.0, "grad_norm": 0.02037137933075428, "learning_rate": 1.601139601139601e-05, "loss": 0.0333, "step": 2450 }, { "epoch": 1.0, "grad_norm": 0.019075889140367508, "learning_rate": 1.5995115995115997e-05, "loss": 0.0029, "step": 2460 }, { "epoch": 1.01, "grad_norm": 0.02034451812505722, "learning_rate": 1.597883597883598e-05, "loss": 0.0035, "step": 2470 }, { "epoch": 1.01, "grad_norm": 0.02513672597706318, "learning_rate": 1.5962555962555965e-05, "loss": 0.0149, "step": 2480 }, { "epoch": 1.01, "grad_norm": 0.0232282355427742, "learning_rate": 1.5946275946275947e-05, "loss": 0.0066, "step": 2490 }, { "epoch": 1.02, "grad_norm": 0.019541621208190918, "learning_rate": 1.5929995929995933e-05, "loss": 0.003, "step": 2500 }, { "epoch": 1.02, "grad_norm": 0.027926787734031677, "learning_rate": 1.5913715913715915e-05, "loss": 0.0024, "step": 2510 }, { "epoch": 1.03, "grad_norm": 0.021236905828118324, "learning_rate": 1.5897435897435897e-05, "loss": 0.0023, "step": 2520 }, { "epoch": 1.03, "grad_norm": 0.017625728622078896, "learning_rate": 1.5881155881155883e-05, "loss": 0.0023, "step": 2530 }, { "epoch": 1.03, "grad_norm": 3.0908312797546387, "learning_rate": 1.5864875864875865e-05, "loss": 0.0032, "step": 2540 }, { "epoch": 1.04, "grad_norm": 0.025432445108890533, "learning_rate": 1.584859584859585e-05, "loss": 0.0246, "step": 2550 }, { "epoch": 1.04, "grad_norm": 0.0189252570271492, "learning_rate": 1.5832315832315833e-05, "loss": 0.0025, "step": 2560 }, { "epoch": 1.05, "grad_norm": 0.16396763920783997, "learning_rate": 1.581603581603582e-05, "loss": 0.0378, "step": 2570 }, { "epoch": 1.05, "grad_norm": 0.019563721492886543, "learning_rate": 1.57997557997558e-05, "loss": 0.0281, "step": 2580 }, { "epoch": 1.05, "grad_norm": 0.02156243473291397, "learning_rate": 1.5783475783475787e-05, "loss": 0.1073, "step": 2590 }, { "epoch": 1.06, "grad_norm": 3.184936285018921, "learning_rate": 1.576719576719577e-05, "loss": 0.0413, "step": 2600 }, { "epoch": 1.06, "grad_norm": 0.0187922902405262, "learning_rate": 1.575091575091575e-05, "loss": 0.0423, "step": 2610 }, { "epoch": 1.07, "grad_norm": 0.020309004932641983, "learning_rate": 1.5734635734635737e-05, "loss": 0.0026, "step": 2620 }, { "epoch": 1.07, "grad_norm": 0.028299883008003235, "learning_rate": 1.571835571835572e-05, "loss": 0.0026, "step": 2630 }, { "epoch": 1.07, "grad_norm": 0.022750265896320343, "learning_rate": 1.5702075702075705e-05, "loss": 0.0026, "step": 2640 }, { "epoch": 1.08, "grad_norm": 0.017459379509091377, "learning_rate": 1.5685795685795687e-05, "loss": 0.0026, "step": 2650 }, { "epoch": 1.08, "grad_norm": 0.02400645986199379, "learning_rate": 1.5669515669515672e-05, "loss": 0.0022, "step": 2660 }, { "epoch": 1.09, "grad_norm": 0.037710972130298615, "learning_rate": 1.5653235653235655e-05, "loss": 0.0024, "step": 2670 }, { "epoch": 1.09, "grad_norm": 0.01844876818358898, "learning_rate": 1.5636955636955637e-05, "loss": 0.0022, "step": 2680 }, { "epoch": 1.09, "grad_norm": 0.015886761248111725, "learning_rate": 1.5620675620675623e-05, "loss": 0.0021, "step": 2690 }, { "epoch": 1.1, "grad_norm": 0.016119027510285378, "learning_rate": 1.5604395604395605e-05, "loss": 0.0024, "step": 2700 }, { "epoch": 1.1, "grad_norm": 0.01977747306227684, "learning_rate": 1.558811558811559e-05, "loss": 0.0405, "step": 2710 }, { "epoch": 1.11, "grad_norm": 0.01591884344816208, "learning_rate": 1.5571835571835573e-05, "loss": 0.0021, "step": 2720 }, { "epoch": 1.11, "grad_norm": 0.017170535400509834, "learning_rate": 1.555555555555556e-05, "loss": 0.0102, "step": 2730 }, { "epoch": 1.12, "grad_norm": 0.02160962112247944, "learning_rate": 1.553927553927554e-05, "loss": 0.0164, "step": 2740 }, { "epoch": 1.12, "grad_norm": 0.04177393019199371, "learning_rate": 1.5522995522995526e-05, "loss": 0.002, "step": 2750 }, { "epoch": 1.12, "grad_norm": 0.01732414774596691, "learning_rate": 1.550671550671551e-05, "loss": 0.0022, "step": 2760 }, { "epoch": 1.13, "grad_norm": 0.05687391385436058, "learning_rate": 1.549043549043549e-05, "loss": 0.002, "step": 2770 }, { "epoch": 1.13, "grad_norm": 0.015546981245279312, "learning_rate": 1.5474155474155473e-05, "loss": 0.0296, "step": 2780 }, { "epoch": 1.14, "grad_norm": 11.891217231750488, "learning_rate": 1.545787545787546e-05, "loss": 0.0303, "step": 2790 }, { "epoch": 1.14, "grad_norm": 3.074970245361328, "learning_rate": 1.5441595441595444e-05, "loss": 0.0346, "step": 2800 }, { "epoch": 1.14, "grad_norm": 1.3277289867401123, "learning_rate": 1.5425315425315426e-05, "loss": 0.0053, "step": 2810 }, { "epoch": 1.15, "grad_norm": 0.014851146377623081, "learning_rate": 1.5409035409035412e-05, "loss": 0.0021, "step": 2820 }, { "epoch": 1.15, "grad_norm": 0.02586003951728344, "learning_rate": 1.5392755392755394e-05, "loss": 0.0194, "step": 2830 }, { "epoch": 1.16, "grad_norm": 0.018063299357891083, "learning_rate": 1.5376475376475377e-05, "loss": 0.0374, "step": 2840 }, { "epoch": 1.16, "grad_norm": 0.014860156923532486, "learning_rate": 1.5360195360195362e-05, "loss": 0.0368, "step": 2850 }, { "epoch": 1.16, "grad_norm": 0.016715556383132935, "learning_rate": 1.5343915343915344e-05, "loss": 0.0232, "step": 2860 }, { "epoch": 1.17, "grad_norm": 0.017222585156559944, "learning_rate": 1.532763532763533e-05, "loss": 0.0021, "step": 2870 }, { "epoch": 1.17, "grad_norm": 0.015297485515475273, "learning_rate": 1.5311355311355312e-05, "loss": 0.002, "step": 2880 }, { "epoch": 1.18, "grad_norm": 0.01927722617983818, "learning_rate": 1.5295075295075298e-05, "loss": 0.0344, "step": 2890 }, { "epoch": 1.18, "grad_norm": 0.014726200141012669, "learning_rate": 1.527879527879528e-05, "loss": 0.0105, "step": 2900 }, { "epoch": 1.18, "grad_norm": 0.015239718370139599, "learning_rate": 1.5262515262515263e-05, "loss": 0.0019, "step": 2910 }, { "epoch": 1.19, "grad_norm": 0.014116072095930576, "learning_rate": 1.5246235246235248e-05, "loss": 0.0482, "step": 2920 }, { "epoch": 1.19, "grad_norm": 0.014437291771173477, "learning_rate": 1.522995522995523e-05, "loss": 0.0028, "step": 2930 }, { "epoch": 1.2, "grad_norm": 0.017663761973381042, "learning_rate": 1.5213675213675214e-05, "loss": 0.007, "step": 2940 }, { "epoch": 1.2, "grad_norm": 0.024807853624224663, "learning_rate": 1.51973951973952e-05, "loss": 0.0044, "step": 2950 }, { "epoch": 1.2, "grad_norm": 0.01389392837882042, "learning_rate": 1.5181115181115182e-05, "loss": 0.021, "step": 2960 }, { "epoch": 1.21, "grad_norm": 0.014578912407159805, "learning_rate": 1.5164835164835166e-05, "loss": 0.002, "step": 2970 }, { "epoch": 1.21, "grad_norm": 0.013830927200615406, "learning_rate": 1.514855514855515e-05, "loss": 0.0017, "step": 2980 }, { "epoch": 1.22, "grad_norm": 0.012908479198813438, "learning_rate": 1.5132275132275134e-05, "loss": 0.0047, "step": 2990 }, { "epoch": 1.22, "grad_norm": 0.013685975223779678, "learning_rate": 1.5115995115995116e-05, "loss": 0.0062, "step": 3000 }, { "epoch": 1.23, "grad_norm": 0.015914512798190117, "learning_rate": 1.50997150997151e-05, "loss": 0.0415, "step": 3010 }, { "epoch": 1.23, "grad_norm": 0.09328664839267731, "learning_rate": 1.5083435083435086e-05, "loss": 0.0017, "step": 3020 }, { "epoch": 1.23, "grad_norm": 0.013503558933734894, "learning_rate": 1.5067155067155068e-05, "loss": 0.0292, "step": 3030 }, { "epoch": 1.24, "grad_norm": 0.012664329260587692, "learning_rate": 1.505087505087505e-05, "loss": 0.0108, "step": 3040 }, { "epoch": 1.24, "grad_norm": 0.013521691784262657, "learning_rate": 1.5034595034595036e-05, "loss": 0.0016, "step": 3050 }, { "epoch": 1.25, "grad_norm": 0.017031285911798477, "learning_rate": 1.501831501831502e-05, "loss": 0.0056, "step": 3060 }, { "epoch": 1.25, "grad_norm": 0.0123978890478611, "learning_rate": 1.5002035002035002e-05, "loss": 0.0454, "step": 3070 }, { "epoch": 1.25, "grad_norm": 0.01293584518134594, "learning_rate": 1.4985754985754988e-05, "loss": 0.004, "step": 3080 }, { "epoch": 1.26, "grad_norm": 0.013730690814554691, "learning_rate": 1.496947496947497e-05, "loss": 0.0355, "step": 3090 }, { "epoch": 1.26, "grad_norm": 0.01241120882332325, "learning_rate": 1.4953194953194954e-05, "loss": 0.0017, "step": 3100 }, { "epoch": 1.27, "grad_norm": 0.016001150012016296, "learning_rate": 1.493691493691494e-05, "loss": 0.0017, "step": 3110 }, { "epoch": 1.27, "grad_norm": 0.019151071086525917, "learning_rate": 1.4920634920634922e-05, "loss": 0.0335, "step": 3120 }, { "epoch": 1.27, "grad_norm": 0.014675545506179333, "learning_rate": 1.4904354904354906e-05, "loss": 0.0203, "step": 3130 }, { "epoch": 1.28, "grad_norm": 0.5518173575401306, "learning_rate": 1.4888074888074888e-05, "loss": 0.0025, "step": 3140 }, { "epoch": 1.28, "grad_norm": 0.012442667037248611, "learning_rate": 1.4871794871794874e-05, "loss": 0.0021, "step": 3150 }, { "epoch": 1.29, "grad_norm": 0.013752995058894157, "learning_rate": 1.4855514855514856e-05, "loss": 0.0018, "step": 3160 }, { "epoch": 1.29, "grad_norm": 0.011561810038983822, "learning_rate": 1.483923483923484e-05, "loss": 0.0016, "step": 3170 }, { "epoch": 1.29, "grad_norm": 0.011732109822332859, "learning_rate": 1.4822954822954826e-05, "loss": 0.0015, "step": 3180 }, { "epoch": 1.3, "grad_norm": 0.011794438585639, "learning_rate": 1.4806674806674808e-05, "loss": 0.0014, "step": 3190 }, { "epoch": 1.3, "grad_norm": 0.011947757564485073, "learning_rate": 1.479039479039479e-05, "loss": 0.0026, "step": 3200 }, { "epoch": 1.31, "grad_norm": 0.017924221232533455, "learning_rate": 1.4774114774114776e-05, "loss": 0.0015, "step": 3210 }, { "epoch": 1.31, "grad_norm": 0.011501024477183819, "learning_rate": 1.475783475783476e-05, "loss": 0.0021, "step": 3220 }, { "epoch": 1.31, "grad_norm": 0.05062294751405716, "learning_rate": 1.4741554741554742e-05, "loss": 0.0015, "step": 3230 }, { "epoch": 1.32, "grad_norm": 0.011451934464275837, "learning_rate": 1.4725274725274727e-05, "loss": 0.0015, "step": 3240 }, { "epoch": 1.32, "grad_norm": 0.011398130096495152, "learning_rate": 1.470899470899471e-05, "loss": 0.0262, "step": 3250 }, { "epoch": 1.33, "grad_norm": 0.011111021041870117, "learning_rate": 1.4692714692714694e-05, "loss": 0.0015, "step": 3260 }, { "epoch": 1.33, "grad_norm": 0.011720293201506138, "learning_rate": 1.4676434676434676e-05, "loss": 0.0014, "step": 3270 }, { "epoch": 1.33, "grad_norm": 0.01106089074164629, "learning_rate": 1.4660154660154662e-05, "loss": 0.0248, "step": 3280 }, { "epoch": 1.34, "grad_norm": 0.031572628766298294, "learning_rate": 1.4643874643874645e-05, "loss": 0.0015, "step": 3290 }, { "epoch": 1.34, "grad_norm": 0.010560325346887112, "learning_rate": 1.4627594627594628e-05, "loss": 0.0014, "step": 3300 }, { "epoch": 1.35, "grad_norm": 31.388111114501953, "learning_rate": 1.4611314611314613e-05, "loss": 0.0255, "step": 3310 }, { "epoch": 1.35, "grad_norm": 0.016965394839644432, "learning_rate": 1.4595034595034596e-05, "loss": 0.0014, "step": 3320 }, { "epoch": 1.36, "grad_norm": 0.022373100742697716, "learning_rate": 1.457875457875458e-05, "loss": 0.0013, "step": 3330 }, { "epoch": 1.36, "grad_norm": 0.011025676503777504, "learning_rate": 1.4562474562474565e-05, "loss": 0.0374, "step": 3340 }, { "epoch": 1.36, "grad_norm": 0.016683539375662804, "learning_rate": 1.4546194546194547e-05, "loss": 0.0389, "step": 3350 }, { "epoch": 1.37, "grad_norm": 0.012086950242519379, "learning_rate": 1.4529914529914531e-05, "loss": 0.0304, "step": 3360 }, { "epoch": 1.37, "grad_norm": 0.011172090657055378, "learning_rate": 1.4513634513634515e-05, "loss": 0.0178, "step": 3370 }, { "epoch": 1.38, "grad_norm": 0.013024254702031612, "learning_rate": 1.44973544973545e-05, "loss": 0.0014, "step": 3380 }, { "epoch": 1.38, "grad_norm": 0.010836287401616573, "learning_rate": 1.4481074481074482e-05, "loss": 0.0014, "step": 3390 }, { "epoch": 1.38, "grad_norm": 0.014210844412446022, "learning_rate": 1.4464794464794465e-05, "loss": 0.0014, "step": 3400 }, { "epoch": 1.39, "grad_norm": 0.010528087615966797, "learning_rate": 1.444851444851445e-05, "loss": 0.0044, "step": 3410 }, { "epoch": 1.39, "grad_norm": 0.01593305543065071, "learning_rate": 1.4432234432234433e-05, "loss": 0.0455, "step": 3420 }, { "epoch": 1.4, "grad_norm": 0.015049874782562256, "learning_rate": 1.4415954415954416e-05, "loss": 0.0027, "step": 3430 }, { "epoch": 1.4, "grad_norm": 0.011662309989333153, "learning_rate": 1.4399674399674401e-05, "loss": 0.0013, "step": 3440 }, { "epoch": 1.4, "grad_norm": 0.011207195930182934, "learning_rate": 1.4383394383394385e-05, "loss": 0.0018, "step": 3450 }, { "epoch": 1.41, "grad_norm": 3.6042699813842773, "learning_rate": 1.4367114367114367e-05, "loss": 0.0029, "step": 3460 }, { "epoch": 1.41, "grad_norm": 0.09215729683637619, "learning_rate": 1.4350834350834353e-05, "loss": 0.002, "step": 3470 }, { "epoch": 1.42, "grad_norm": 0.010877463966608047, "learning_rate": 1.4334554334554335e-05, "loss": 0.0014, "step": 3480 }, { "epoch": 1.42, "grad_norm": 0.009993131272494793, "learning_rate": 1.431827431827432e-05, "loss": 0.0016, "step": 3490 }, { "epoch": 1.42, "grad_norm": 1.349046230316162, "learning_rate": 1.4301994301994305e-05, "loss": 0.0018, "step": 3500 }, { "epoch": 1.43, "grad_norm": 0.009341539815068245, "learning_rate": 1.4285714285714287e-05, "loss": 0.0012, "step": 3510 }, { "epoch": 1.43, "grad_norm": 0.009393510408699512, "learning_rate": 1.4269434269434271e-05, "loss": 0.0011, "step": 3520 }, { "epoch": 1.44, "grad_norm": 0.009326926432549953, "learning_rate": 1.4253154253154253e-05, "loss": 0.0012, "step": 3530 }, { "epoch": 1.44, "grad_norm": 0.009275635704398155, "learning_rate": 1.4236874236874239e-05, "loss": 0.0384, "step": 3540 }, { "epoch": 1.44, "grad_norm": 22.40707778930664, "learning_rate": 1.4220594220594221e-05, "loss": 0.0131, "step": 3550 }, { "epoch": 1.45, "grad_norm": 0.00953533872961998, "learning_rate": 1.4204314204314205e-05, "loss": 0.0347, "step": 3560 }, { "epoch": 1.45, "grad_norm": 0.5032986998558044, "learning_rate": 1.4188034188034189e-05, "loss": 0.0402, "step": 3570 }, { "epoch": 1.46, "grad_norm": 0.011732584796845913, "learning_rate": 1.4171754171754173e-05, "loss": 0.0592, "step": 3580 }, { "epoch": 1.46, "grad_norm": 0.010645696893334389, "learning_rate": 1.4155474155474155e-05, "loss": 0.0268, "step": 3590 }, { "epoch": 1.47, "grad_norm": 0.013740918599069118, "learning_rate": 1.4139194139194141e-05, "loss": 0.0252, "step": 3600 }, { "epoch": 1.47, "grad_norm": 0.013372181914746761, "learning_rate": 1.4122914122914125e-05, "loss": 0.0376, "step": 3610 }, { "epoch": 1.47, "grad_norm": 0.015505131334066391, "learning_rate": 1.4106634106634107e-05, "loss": 0.0014, "step": 3620 }, { "epoch": 1.48, "grad_norm": 0.014338747598230839, "learning_rate": 1.4090354090354093e-05, "loss": 0.0853, "step": 3630 }, { "epoch": 1.48, "grad_norm": 0.01571911759674549, "learning_rate": 1.4074074074074075e-05, "loss": 0.0298, "step": 3640 }, { "epoch": 1.49, "grad_norm": 0.020005526021122932, "learning_rate": 1.4057794057794059e-05, "loss": 0.0017, "step": 3650 }, { "epoch": 1.49, "grad_norm": 0.018354693427681923, "learning_rate": 1.4041514041514041e-05, "loss": 0.0016, "step": 3660 }, { "epoch": 1.49, "grad_norm": 0.021922029554843903, "learning_rate": 1.4025234025234027e-05, "loss": 0.0017, "step": 3670 }, { "epoch": 1.5, "grad_norm": 0.013702883385121822, "learning_rate": 1.400895400895401e-05, "loss": 0.0014, "step": 3680 }, { "epoch": 1.5, "grad_norm": 0.010742840357124805, "learning_rate": 1.3992673992673993e-05, "loss": 0.0026, "step": 3690 }, { "epoch": 1.51, "grad_norm": 0.15446045994758606, "learning_rate": 1.3976393976393979e-05, "loss": 0.0013, "step": 3700 }, { "epoch": 1.51, "grad_norm": 0.01300391647964716, "learning_rate": 1.3960113960113961e-05, "loss": 0.0012, "step": 3710 }, { "epoch": 1.51, "grad_norm": 0.017101220786571503, "learning_rate": 1.3943833943833945e-05, "loss": 0.0012, "step": 3720 }, { "epoch": 1.52, "grad_norm": 0.009062445722520351, "learning_rate": 1.3927553927553929e-05, "loss": 0.0012, "step": 3730 }, { "epoch": 1.52, "grad_norm": 0.008803702890872955, "learning_rate": 1.3911273911273913e-05, "loss": 0.0011, "step": 3740 }, { "epoch": 1.53, "grad_norm": 0.008593735285103321, "learning_rate": 1.3894993894993895e-05, "loss": 0.0012, "step": 3750 }, { "epoch": 1.53, "grad_norm": 0.009692203253507614, "learning_rate": 1.387871387871388e-05, "loss": 0.0011, "step": 3760 }, { "epoch": 1.53, "grad_norm": 0.011008762754499912, "learning_rate": 1.3862433862433865e-05, "loss": 0.0011, "step": 3770 }, { "epoch": 1.54, "grad_norm": 0.009994535706937313, "learning_rate": 1.3846153846153847e-05, "loss": 0.022, "step": 3780 }, { "epoch": 1.54, "grad_norm": 0.009117243811488152, "learning_rate": 1.382987382987383e-05, "loss": 0.0011, "step": 3790 }, { "epoch": 1.55, "grad_norm": 0.008967447094619274, "learning_rate": 1.3813593813593815e-05, "loss": 0.0057, "step": 3800 }, { "epoch": 1.55, "grad_norm": 0.008691845461726189, "learning_rate": 1.3797313797313799e-05, "loss": 0.0013, "step": 3810 }, { "epoch": 1.55, "grad_norm": 0.011074850335717201, "learning_rate": 1.378103378103378e-05, "loss": 0.001, "step": 3820 }, { "epoch": 1.56, "grad_norm": 0.00832684338092804, "learning_rate": 1.3764753764753766e-05, "loss": 0.0011, "step": 3830 }, { "epoch": 1.56, "grad_norm": 0.008292116224765778, "learning_rate": 1.374847374847375e-05, "loss": 0.001, "step": 3840 }, { "epoch": 1.57, "grad_norm": 0.009205167181789875, "learning_rate": 1.3732193732193733e-05, "loss": 0.0011, "step": 3850 }, { "epoch": 1.57, "grad_norm": 0.008790573105216026, "learning_rate": 1.3715913715913718e-05, "loss": 0.001, "step": 3860 }, { "epoch": 1.58, "grad_norm": 0.008000485599040985, "learning_rate": 1.36996336996337e-05, "loss": 0.008, "step": 3870 }, { "epoch": 1.58, "grad_norm": 0.00819096527993679, "learning_rate": 1.3683353683353684e-05, "loss": 0.001, "step": 3880 }, { "epoch": 1.58, "grad_norm": 0.014848892576992512, "learning_rate": 1.3667073667073668e-05, "loss": 0.015, "step": 3890 }, { "epoch": 1.59, "grad_norm": 0.008053899742662907, "learning_rate": 1.3650793650793652e-05, "loss": 0.0009, "step": 3900 }, { "epoch": 1.59, "grad_norm": 6.416678428649902, "learning_rate": 1.3634513634513635e-05, "loss": 0.0344, "step": 3910 }, { "epoch": 1.6, "grad_norm": 0.10300695151090622, "learning_rate": 1.3618233618233619e-05, "loss": 0.001, "step": 3920 }, { "epoch": 1.6, "grad_norm": 0.008424129337072372, "learning_rate": 1.3601953601953604e-05, "loss": 0.0267, "step": 3930 }, { "epoch": 1.6, "grad_norm": 0.00800679437816143, "learning_rate": 1.3585673585673586e-05, "loss": 0.0326, "step": 3940 }, { "epoch": 1.61, "grad_norm": 0.009919759817421436, "learning_rate": 1.356939356939357e-05, "loss": 0.0011, "step": 3950 }, { "epoch": 1.61, "grad_norm": 0.02416282147169113, "learning_rate": 1.3553113553113554e-05, "loss": 0.0012, "step": 3960 }, { "epoch": 1.62, "grad_norm": 5.555994033813477, "learning_rate": 1.3536833536833538e-05, "loss": 0.043, "step": 3970 }, { "epoch": 1.62, "grad_norm": 0.10745339095592499, "learning_rate": 1.352055352055352e-05, "loss": 0.0011, "step": 3980 }, { "epoch": 1.62, "grad_norm": 0.00835937075316906, "learning_rate": 1.3504273504273506e-05, "loss": 0.0009, "step": 3990 }, { "epoch": 1.63, "grad_norm": 0.007618330419063568, "learning_rate": 1.348799348799349e-05, "loss": 0.0241, "step": 4000 }, { "epoch": 1.63, "grad_norm": 0.022973209619522095, "learning_rate": 1.3471713471713472e-05, "loss": 0.001, "step": 4010 }, { "epoch": 1.64, "grad_norm": 0.008424985222518444, "learning_rate": 1.3455433455433458e-05, "loss": 0.0018, "step": 4020 }, { "epoch": 1.64, "grad_norm": 0.015286185778677464, "learning_rate": 1.343915343915344e-05, "loss": 0.0009, "step": 4030 }, { "epoch": 1.64, "grad_norm": 0.007264839485287666, "learning_rate": 1.3422873422873424e-05, "loss": 0.0009, "step": 4040 }, { "epoch": 1.65, "grad_norm": 0.0074860285967588425, "learning_rate": 1.3406593406593406e-05, "loss": 0.0009, "step": 4050 }, { "epoch": 1.65, "grad_norm": 0.008237460628151894, "learning_rate": 1.3390313390313392e-05, "loss": 0.0373, "step": 4060 }, { "epoch": 1.66, "grad_norm": 0.007270953617990017, "learning_rate": 1.3374033374033374e-05, "loss": 0.0009, "step": 4070 }, { "epoch": 1.66, "grad_norm": 0.03919156640768051, "learning_rate": 1.3357753357753358e-05, "loss": 0.001, "step": 4080 }, { "epoch": 1.66, "grad_norm": 0.11515277624130249, "learning_rate": 1.3341473341473344e-05, "loss": 0.001, "step": 4090 }, { "epoch": 1.67, "grad_norm": 0.007153298240154982, "learning_rate": 1.3325193325193326e-05, "loss": 0.0014, "step": 4100 }, { "epoch": 1.67, "grad_norm": 0.00894332304596901, "learning_rate": 1.330891330891331e-05, "loss": 0.0022, "step": 4110 }, { "epoch": 1.68, "grad_norm": 0.046884216368198395, "learning_rate": 1.3292633292633294e-05, "loss": 0.001, "step": 4120 }, { "epoch": 1.68, "grad_norm": 0.0074531338177621365, "learning_rate": 1.3276353276353278e-05, "loss": 0.0009, "step": 4130 }, { "epoch": 1.68, "grad_norm": 0.008025778457522392, "learning_rate": 1.326007326007326e-05, "loss": 0.0008, "step": 4140 }, { "epoch": 1.69, "grad_norm": 0.007099485024809837, "learning_rate": 1.3243793243793246e-05, "loss": 0.0349, "step": 4150 }, { "epoch": 1.69, "grad_norm": 0.007894063368439674, "learning_rate": 1.322751322751323e-05, "loss": 0.0008, "step": 4160 }, { "epoch": 1.7, "grad_norm": 0.008376212790608406, "learning_rate": 1.3211233211233212e-05, "loss": 0.0009, "step": 4170 }, { "epoch": 1.7, "grad_norm": 0.007172748912125826, "learning_rate": 1.3194953194953194e-05, "loss": 0.0011, "step": 4180 }, { "epoch": 1.71, "grad_norm": 0.007325605023652315, "learning_rate": 1.317867317867318e-05, "loss": 0.0008, "step": 4190 }, { "epoch": 1.71, "grad_norm": 0.007277225609868765, "learning_rate": 1.3162393162393164e-05, "loss": 0.0009, "step": 4200 }, { "epoch": 1.71, "grad_norm": 0.007008700165897608, "learning_rate": 1.3146113146113146e-05, "loss": 0.0009, "step": 4210 }, { "epoch": 1.72, "grad_norm": 0.007119116373360157, "learning_rate": 1.3129833129833132e-05, "loss": 0.0088, "step": 4220 }, { "epoch": 1.72, "grad_norm": 0.006735885515809059, "learning_rate": 1.3113553113553114e-05, "loss": 0.0011, "step": 4230 }, { "epoch": 1.73, "grad_norm": 0.006696558557450771, "learning_rate": 1.3097273097273098e-05, "loss": 0.0057, "step": 4240 }, { "epoch": 1.73, "grad_norm": 0.01188244204968214, "learning_rate": 1.3080993080993084e-05, "loss": 0.0011, "step": 4250 }, { "epoch": 1.73, "grad_norm": 0.007251105271279812, "learning_rate": 1.3064713064713066e-05, "loss": 0.0357, "step": 4260 }, { "epoch": 1.74, "grad_norm": 0.006903903558850288, "learning_rate": 1.304843304843305e-05, "loss": 0.0008, "step": 4270 }, { "epoch": 1.74, "grad_norm": 0.008923369459807873, "learning_rate": 1.3032153032153034e-05, "loss": 0.0008, "step": 4280 }, { "epoch": 1.75, "grad_norm": 0.006224838085472584, "learning_rate": 1.3015873015873018e-05, "loss": 0.0077, "step": 4290 }, { "epoch": 1.75, "grad_norm": 0.00695427879691124, "learning_rate": 1.2999592999593e-05, "loss": 0.0008, "step": 4300 }, { "epoch": 1.75, "grad_norm": 0.007040718570351601, "learning_rate": 1.2983312983312984e-05, "loss": 0.0008, "step": 4310 }, { "epoch": 1.76, "grad_norm": 0.006210348103195429, "learning_rate": 1.296703296703297e-05, "loss": 0.0015, "step": 4320 }, { "epoch": 1.76, "grad_norm": 0.0062638637609779835, "learning_rate": 1.2950752950752952e-05, "loss": 0.0044, "step": 4330 }, { "epoch": 1.77, "grad_norm": 0.006666597910225391, "learning_rate": 1.2934472934472934e-05, "loss": 0.0007, "step": 4340 }, { "epoch": 1.77, "grad_norm": 0.0061942501924932, "learning_rate": 1.291819291819292e-05, "loss": 0.0011, "step": 4350 }, { "epoch": 1.77, "grad_norm": 0.00600019795820117, "learning_rate": 1.2901912901912904e-05, "loss": 0.0008, "step": 4360 }, { "epoch": 1.78, "grad_norm": 0.006045353598892689, "learning_rate": 1.2885632885632886e-05, "loss": 0.0451, "step": 4370 }, { "epoch": 1.78, "grad_norm": 0.006641109474003315, "learning_rate": 1.2869352869352871e-05, "loss": 0.0008, "step": 4380 }, { "epoch": 1.79, "grad_norm": 0.4562086760997772, "learning_rate": 1.2853072853072854e-05, "loss": 0.0009, "step": 4390 }, { "epoch": 1.79, "grad_norm": 0.0076696197502315044, "learning_rate": 1.2836792836792838e-05, "loss": 0.0348, "step": 4400 }, { "epoch": 1.79, "grad_norm": 0.006937106605619192, "learning_rate": 1.2820512820512823e-05, "loss": 0.0596, "step": 4410 }, { "epoch": 1.8, "grad_norm": 0.00782240740954876, "learning_rate": 1.2804232804232805e-05, "loss": 0.0851, "step": 4420 }, { "epoch": 1.8, "grad_norm": 0.007307849358767271, "learning_rate": 1.278795278795279e-05, "loss": 0.0009, "step": 4430 }, { "epoch": 1.81, "grad_norm": 0.008858690969645977, "learning_rate": 1.2771672771672772e-05, "loss": 0.0021, "step": 4440 }, { "epoch": 1.81, "grad_norm": 0.006560084410011768, "learning_rate": 1.2755392755392757e-05, "loss": 0.0008, "step": 4450 }, { "epoch": 1.82, "grad_norm": 0.06266916543245316, "learning_rate": 1.273911273911274e-05, "loss": 0.0011, "step": 4460 }, { "epoch": 1.82, "grad_norm": 0.00679628923535347, "learning_rate": 1.2722832722832723e-05, "loss": 0.0009, "step": 4470 }, { "epoch": 1.82, "grad_norm": 0.006765253376215696, "learning_rate": 1.2706552706552709e-05, "loss": 0.0013, "step": 4480 }, { "epoch": 1.83, "grad_norm": 0.005858385004103184, "learning_rate": 1.2690272690272691e-05, "loss": 0.0007, "step": 4490 }, { "epoch": 1.83, "grad_norm": 0.006266339216381311, "learning_rate": 1.2673992673992674e-05, "loss": 0.0008, "step": 4500 }, { "epoch": 1.84, "grad_norm": 0.006281218025833368, "learning_rate": 1.265771265771266e-05, "loss": 0.1082, "step": 4510 }, { "epoch": 1.84, "grad_norm": 0.006863302085548639, "learning_rate": 1.2641432641432643e-05, "loss": 0.0009, "step": 4520 }, { "epoch": 1.84, "grad_norm": 0.013896014541387558, "learning_rate": 1.2625152625152625e-05, "loss": 0.0281, "step": 4530 }, { "epoch": 1.85, "grad_norm": 0.24578307569026947, "learning_rate": 1.2608872608872611e-05, "loss": 0.001, "step": 4540 }, { "epoch": 1.85, "grad_norm": 0.011449114419519901, "learning_rate": 1.2592592592592593e-05, "loss": 0.0007, "step": 4550 }, { "epoch": 1.86, "grad_norm": 36.35368728637695, "learning_rate": 1.2576312576312577e-05, "loss": 0.0217, "step": 4560 }, { "epoch": 1.86, "grad_norm": 0.011718428693711758, "learning_rate": 1.256003256003256e-05, "loss": 0.0008, "step": 4570 }, { "epoch": 1.86, "grad_norm": 10.411919593811035, "learning_rate": 1.2543752543752545e-05, "loss": 0.0159, "step": 4580 }, { "epoch": 1.87, "grad_norm": 0.006179590709507465, "learning_rate": 1.2527472527472529e-05, "loss": 0.0307, "step": 4590 }, { "epoch": 1.87, "grad_norm": 0.0063836839981377125, "learning_rate": 1.2511192511192511e-05, "loss": 0.0034, "step": 4600 }, { "epoch": 1.88, "grad_norm": 0.008047536946833134, "learning_rate": 1.2494912494912497e-05, "loss": 0.001, "step": 4610 }, { "epoch": 1.88, "grad_norm": 0.010491227731108665, "learning_rate": 1.247863247863248e-05, "loss": 0.0008, "step": 4620 }, { "epoch": 1.88, "grad_norm": 0.005860119592398405, "learning_rate": 1.2462352462352463e-05, "loss": 0.0007, "step": 4630 }, { "epoch": 1.89, "grad_norm": 10.03593635559082, "learning_rate": 1.2446072446072449e-05, "loss": 0.0314, "step": 4640 }, { "epoch": 1.89, "grad_norm": 0.006240949500352144, "learning_rate": 1.2429792429792431e-05, "loss": 0.0009, "step": 4650 }, { "epoch": 1.9, "grad_norm": 0.00653426069766283, "learning_rate": 1.2413512413512413e-05, "loss": 0.0008, "step": 4660 }, { "epoch": 1.9, "grad_norm": 0.0061131748370826244, "learning_rate": 1.2397232397232399e-05, "loss": 0.0385, "step": 4670 }, { "epoch": 1.9, "grad_norm": 0.018757157027721405, "learning_rate": 1.2380952380952383e-05, "loss": 0.0008, "step": 4680 }, { "epoch": 1.91, "grad_norm": 0.005603988189250231, "learning_rate": 1.2364672364672365e-05, "loss": 0.0007, "step": 4690 }, { "epoch": 1.91, "grad_norm": 0.008327238261699677, "learning_rate": 1.2348392348392349e-05, "loss": 0.0007, "step": 4700 }, { "epoch": 1.92, "grad_norm": 0.006342690903693438, "learning_rate": 1.2332112332112333e-05, "loss": 0.0027, "step": 4710 }, { "epoch": 1.92, "grad_norm": 0.007467071060091257, "learning_rate": 1.2315832315832317e-05, "loss": 0.001, "step": 4720 }, { "epoch": 1.93, "grad_norm": 0.005770612042397261, "learning_rate": 1.22995522995523e-05, "loss": 0.0422, "step": 4730 }, { "epoch": 1.93, "grad_norm": 0.01268511638045311, "learning_rate": 1.2283272283272285e-05, "loss": 0.001, "step": 4740 }, { "epoch": 1.93, "grad_norm": 0.025519585236907005, "learning_rate": 1.2266992266992269e-05, "loss": 0.019, "step": 4750 }, { "epoch": 1.94, "grad_norm": 12.875621795654297, "learning_rate": 1.2250712250712251e-05, "loss": 0.0206, "step": 4760 }, { "epoch": 1.94, "grad_norm": 0.018496304750442505, "learning_rate": 1.2234432234432237e-05, "loss": 0.0008, "step": 4770 }, { "epoch": 1.95, "grad_norm": 0.005795106291770935, "learning_rate": 1.2218152218152219e-05, "loss": 0.0032, "step": 4780 }, { "epoch": 1.95, "grad_norm": 0.005989160854369402, "learning_rate": 1.2201872201872203e-05, "loss": 0.0007, "step": 4790 }, { "epoch": 1.95, "grad_norm": 0.005859148222953081, "learning_rate": 1.2185592185592185e-05, "loss": 0.0007, "step": 4800 }, { "epoch": 1.96, "grad_norm": 0.008097686804831028, "learning_rate": 1.216931216931217e-05, "loss": 0.0007, "step": 4810 }, { "epoch": 1.96, "grad_norm": 0.005901312455534935, "learning_rate": 1.2153032153032153e-05, "loss": 0.0007, "step": 4820 }, { "epoch": 1.97, "grad_norm": 0.006804050877690315, "learning_rate": 1.2136752136752137e-05, "loss": 0.0009, "step": 4830 }, { "epoch": 1.97, "grad_norm": 0.006251387298107147, "learning_rate": 1.2120472120472123e-05, "loss": 0.0423, "step": 4840 }, { "epoch": 1.97, "grad_norm": 0.0055562574416399, "learning_rate": 1.2104192104192105e-05, "loss": 0.0008, "step": 4850 }, { "epoch": 1.98, "grad_norm": 0.006534604821354151, "learning_rate": 1.2087912087912089e-05, "loss": 0.0038, "step": 4860 }, { "epoch": 1.98, "grad_norm": 0.010235198773443699, "learning_rate": 1.2071632071632073e-05, "loss": 0.003, "step": 4870 }, { "epoch": 1.99, "grad_norm": 0.006196849979460239, "learning_rate": 1.2055352055352057e-05, "loss": 0.0007, "step": 4880 }, { "epoch": 1.99, "grad_norm": 0.015244298614561558, "learning_rate": 1.2039072039072039e-05, "loss": 0.0007, "step": 4890 }, { "epoch": 1.99, "grad_norm": 0.03133594989776611, "learning_rate": 1.2022792022792024e-05, "loss": 0.0319, "step": 4900 }, { "epoch": 2.0, "grad_norm": 0.012942776083946228, "learning_rate": 1.2006512006512008e-05, "loss": 0.0007, "step": 4910 }, { "epoch": 2.0, "grad_norm": 0.0054002669639885426, "learning_rate": 1.199023199023199e-05, "loss": 0.0386, "step": 4920 }, { "epoch": 2.01, "grad_norm": 0.006965090055018663, "learning_rate": 1.1973951973951975e-05, "loss": 0.0414, "step": 4930 }, { "epoch": 2.01, "grad_norm": 0.005913823377341032, "learning_rate": 1.1957671957671959e-05, "loss": 0.0008, "step": 4940 }, { "epoch": 2.01, "grad_norm": 0.00729360431432724, "learning_rate": 1.1941391941391942e-05, "loss": 0.0015, "step": 4950 }, { "epoch": 2.02, "grad_norm": 0.005881543271243572, "learning_rate": 1.1925111925111925e-05, "loss": 0.0017, "step": 4960 }, { "epoch": 2.02, "grad_norm": 0.00946744717657566, "learning_rate": 1.190883190883191e-05, "loss": 0.0008, "step": 4970 }, { "epoch": 2.03, "grad_norm": 0.7791256904602051, "learning_rate": 1.1892551892551893e-05, "loss": 0.0456, "step": 4980 }, { "epoch": 2.03, "grad_norm": 0.08430014550685883, "learning_rate": 1.1876271876271877e-05, "loss": 0.0048, "step": 4990 }, { "epoch": 2.04, "grad_norm": 0.007524729706346989, "learning_rate": 1.1859991859991862e-05, "loss": 0.0008, "step": 5000 }, { "epoch": 2.04, "grad_norm": 0.007158556021749973, "learning_rate": 1.1843711843711844e-05, "loss": 0.0007, "step": 5010 }, { "epoch": 2.04, "grad_norm": 0.006158571690320969, "learning_rate": 1.1827431827431828e-05, "loss": 0.0007, "step": 5020 }, { "epoch": 2.05, "grad_norm": 0.0062376465648412704, "learning_rate": 1.1811151811151812e-05, "loss": 0.0007, "step": 5030 }, { "epoch": 2.05, "grad_norm": 0.009434174746274948, "learning_rate": 1.1794871794871796e-05, "loss": 0.0333, "step": 5040 }, { "epoch": 2.06, "grad_norm": 0.006017903331667185, "learning_rate": 1.1778591778591779e-05, "loss": 0.0007, "step": 5050 }, { "epoch": 2.06, "grad_norm": 0.007532346062362194, "learning_rate": 1.1762311762311762e-05, "loss": 0.0007, "step": 5060 }, { "epoch": 2.06, "grad_norm": 0.005684974603354931, "learning_rate": 1.1746031746031748e-05, "loss": 0.0008, "step": 5070 }, { "epoch": 2.07, "grad_norm": 0.005241623613983393, "learning_rate": 1.172975172975173e-05, "loss": 0.0306, "step": 5080 }, { "epoch": 2.07, "grad_norm": 0.019347479566931725, "learning_rate": 1.1713471713471714e-05, "loss": 0.0008, "step": 5090 }, { "epoch": 2.08, "grad_norm": 0.08700444549322128, "learning_rate": 1.1697191697191698e-05, "loss": 0.0009, "step": 5100 }, { "epoch": 2.08, "grad_norm": 0.005539617035537958, "learning_rate": 1.1680911680911682e-05, "loss": 0.0009, "step": 5110 }, { "epoch": 2.08, "grad_norm": 0.005851482041180134, "learning_rate": 1.1664631664631664e-05, "loss": 0.0007, "step": 5120 }, { "epoch": 2.09, "grad_norm": 0.007532169576734304, "learning_rate": 1.164835164835165e-05, "loss": 0.0011, "step": 5130 }, { "epoch": 2.09, "grad_norm": 0.00506225973367691, "learning_rate": 1.1632071632071634e-05, "loss": 0.0007, "step": 5140 }, { "epoch": 2.1, "grad_norm": 0.005589496809989214, "learning_rate": 1.1615791615791616e-05, "loss": 0.0007, "step": 5150 }, { "epoch": 2.1, "grad_norm": 0.004957486409693956, "learning_rate": 1.1599511599511602e-05, "loss": 0.0156, "step": 5160 }, { "epoch": 2.1, "grad_norm": 0.00666527496650815, "learning_rate": 1.1583231583231584e-05, "loss": 0.0007, "step": 5170 }, { "epoch": 2.11, "grad_norm": 0.006306789815425873, "learning_rate": 1.1566951566951568e-05, "loss": 0.0006, "step": 5180 }, { "epoch": 2.11, "grad_norm": 0.005329395178705454, "learning_rate": 1.155067155067155e-05, "loss": 0.0006, "step": 5190 }, { "epoch": 2.12, "grad_norm": 0.0049823857843875885, "learning_rate": 1.1534391534391536e-05, "loss": 0.0006, "step": 5200 }, { "epoch": 2.12, "grad_norm": 0.0051444037817418575, "learning_rate": 1.1518111518111518e-05, "loss": 0.0022, "step": 5210 }, { "epoch": 2.12, "grad_norm": 0.00532697094604373, "learning_rate": 1.1501831501831502e-05, "loss": 0.0006, "step": 5220 }, { "epoch": 2.13, "grad_norm": 0.006971771828830242, "learning_rate": 1.1485551485551488e-05, "loss": 0.0007, "step": 5230 }, { "epoch": 2.13, "grad_norm": 0.005065458826720715, "learning_rate": 1.146927146927147e-05, "loss": 0.0006, "step": 5240 }, { "epoch": 2.14, "grad_norm": 0.00542556494474411, "learning_rate": 1.1452991452991454e-05, "loss": 0.0006, "step": 5250 }, { "epoch": 2.14, "grad_norm": 0.005721778143197298, "learning_rate": 1.1436711436711438e-05, "loss": 0.0006, "step": 5260 }, { "epoch": 2.14, "grad_norm": 0.0050778863951563835, "learning_rate": 1.1420431420431422e-05, "loss": 0.0006, "step": 5270 }, { "epoch": 2.15, "grad_norm": 0.005689846817404032, "learning_rate": 1.1404151404151404e-05, "loss": 0.0007, "step": 5280 }, { "epoch": 2.15, "grad_norm": 0.005032387096434832, "learning_rate": 1.138787138787139e-05, "loss": 0.0053, "step": 5290 }, { "epoch": 2.16, "grad_norm": 0.004602556582540274, "learning_rate": 1.1371591371591374e-05, "loss": 0.0006, "step": 5300 }, { "epoch": 2.16, "grad_norm": 0.005181928165256977, "learning_rate": 1.1355311355311356e-05, "loss": 0.0006, "step": 5310 }, { "epoch": 2.17, "grad_norm": 0.004627116955816746, "learning_rate": 1.1339031339031338e-05, "loss": 0.0006, "step": 5320 }, { "epoch": 2.17, "grad_norm": 0.004680185578763485, "learning_rate": 1.1322751322751324e-05, "loss": 0.0006, "step": 5330 }, { "epoch": 2.17, "grad_norm": 0.00517154298722744, "learning_rate": 1.1306471306471308e-05, "loss": 0.0006, "step": 5340 }, { "epoch": 2.18, "grad_norm": 0.2655492126941681, "learning_rate": 1.129019129019129e-05, "loss": 0.04, "step": 5350 }, { "epoch": 2.18, "grad_norm": 0.004791987128555775, "learning_rate": 1.1273911273911276e-05, "loss": 0.0027, "step": 5360 }, { "epoch": 2.19, "grad_norm": 0.00524140102788806, "learning_rate": 1.1257631257631258e-05, "loss": 0.0019, "step": 5370 }, { "epoch": 2.19, "grad_norm": 0.004854326602071524, "learning_rate": 1.1241351241351242e-05, "loss": 0.0006, "step": 5380 }, { "epoch": 2.19, "grad_norm": 0.004912737291306257, "learning_rate": 1.1225071225071227e-05, "loss": 0.0229, "step": 5390 }, { "epoch": 2.2, "grad_norm": 0.009351348504424095, "learning_rate": 1.120879120879121e-05, "loss": 0.0006, "step": 5400 }, { "epoch": 2.2, "grad_norm": 0.006594196427613497, "learning_rate": 1.1192511192511194e-05, "loss": 0.0007, "step": 5410 }, { "epoch": 2.21, "grad_norm": 0.004785753786563873, "learning_rate": 1.1176231176231178e-05, "loss": 0.0006, "step": 5420 }, { "epoch": 2.21, "grad_norm": 0.010175659321248531, "learning_rate": 1.1159951159951162e-05, "loss": 0.0347, "step": 5430 }, { "epoch": 2.21, "grad_norm": 0.007659697439521551, "learning_rate": 1.1143671143671144e-05, "loss": 0.0006, "step": 5440 }, { "epoch": 2.22, "grad_norm": 0.005518093705177307, "learning_rate": 1.1127391127391128e-05, "loss": 0.0007, "step": 5450 }, { "epoch": 2.22, "grad_norm": 0.004838414024561644, "learning_rate": 1.1111111111111113e-05, "loss": 0.0006, "step": 5460 }, { "epoch": 2.23, "grad_norm": 0.004535248037427664, "learning_rate": 1.1094831094831096e-05, "loss": 0.0007, "step": 5470 }, { "epoch": 2.23, "grad_norm": 0.004755628295242786, "learning_rate": 1.1078551078551078e-05, "loss": 0.0006, "step": 5480 }, { "epoch": 2.23, "grad_norm": 0.007153332699090242, "learning_rate": 1.1062271062271063e-05, "loss": 0.0006, "step": 5490 }, { "epoch": 2.24, "grad_norm": 0.004593558143824339, "learning_rate": 1.1045991045991047e-05, "loss": 0.0006, "step": 5500 }, { "epoch": 2.24, "grad_norm": 0.004781143739819527, "learning_rate": 1.102971102971103e-05, "loss": 0.0187, "step": 5510 }, { "epoch": 2.25, "grad_norm": 0.022694548591971397, "learning_rate": 1.1013431013431015e-05, "loss": 0.0006, "step": 5520 }, { "epoch": 2.25, "grad_norm": 0.004701571073383093, "learning_rate": 1.0997150997150998e-05, "loss": 0.0005, "step": 5530 }, { "epoch": 2.25, "grad_norm": 0.014217639341950417, "learning_rate": 1.0980870980870981e-05, "loss": 0.0006, "step": 5540 }, { "epoch": 2.26, "grad_norm": 0.0047623575665056705, "learning_rate": 1.0964590964590967e-05, "loss": 0.0005, "step": 5550 }, { "epoch": 2.26, "grad_norm": 0.004431570880115032, "learning_rate": 1.094831094831095e-05, "loss": 0.0006, "step": 5560 }, { "epoch": 2.27, "grad_norm": 0.006182719487696886, "learning_rate": 1.0932030932030933e-05, "loss": 0.0006, "step": 5570 }, { "epoch": 2.27, "grad_norm": 0.004717973992228508, "learning_rate": 1.0915750915750916e-05, "loss": 0.0005, "step": 5580 }, { "epoch": 2.28, "grad_norm": 0.005284770391881466, "learning_rate": 1.0899470899470901e-05, "loss": 0.0051, "step": 5590 }, { "epoch": 2.28, "grad_norm": 0.004852925427258015, "learning_rate": 1.0883190883190883e-05, "loss": 0.0129, "step": 5600 }, { "epoch": 2.28, "grad_norm": 0.011825304478406906, "learning_rate": 1.0866910866910867e-05, "loss": 0.0006, "step": 5610 }, { "epoch": 2.29, "grad_norm": 5.4084672927856445, "learning_rate": 1.0850630850630853e-05, "loss": 0.0014, "step": 5620 }, { "epoch": 2.29, "grad_norm": 0.0045865620486438274, "learning_rate": 1.0834350834350835e-05, "loss": 0.0012, "step": 5630 }, { "epoch": 2.3, "grad_norm": 0.004212076775729656, "learning_rate": 1.0818070818070818e-05, "loss": 0.0005, "step": 5640 }, { "epoch": 2.3, "grad_norm": 0.0043626646511256695, "learning_rate": 1.0801790801790803e-05, "loss": 0.0005, "step": 5650 }, { "epoch": 2.3, "grad_norm": 0.003995486069470644, "learning_rate": 1.0785510785510787e-05, "loss": 0.0005, "step": 5660 }, { "epoch": 2.31, "grad_norm": 12.674348831176758, "learning_rate": 1.076923076923077e-05, "loss": 0.0288, "step": 5670 }, { "epoch": 2.31, "grad_norm": 0.004922170657664537, "learning_rate": 1.0752950752950755e-05, "loss": 0.0005, "step": 5680 }, { "epoch": 2.32, "grad_norm": 0.013311301358044147, "learning_rate": 1.0736670736670737e-05, "loss": 0.0006, "step": 5690 }, { "epoch": 2.32, "grad_norm": 0.004092982970178127, "learning_rate": 1.0720390720390721e-05, "loss": 0.0306, "step": 5700 }, { "epoch": 2.32, "grad_norm": 0.005637271795421839, "learning_rate": 1.0704110704110703e-05, "loss": 0.0483, "step": 5710 }, { "epoch": 2.33, "grad_norm": 8.750419616699219, "learning_rate": 1.0687830687830689e-05, "loss": 0.0386, "step": 5720 }, { "epoch": 2.33, "grad_norm": 0.01064012385904789, "learning_rate": 1.0671550671550673e-05, "loss": 0.0006, "step": 5730 }, { "epoch": 2.34, "grad_norm": 0.004589624237269163, "learning_rate": 1.0655270655270655e-05, "loss": 0.0006, "step": 5740 }, { "epoch": 2.34, "grad_norm": 0.004802080802619457, "learning_rate": 1.0638990638990641e-05, "loss": 0.0009, "step": 5750 }, { "epoch": 2.34, "grad_norm": 0.004713993053883314, "learning_rate": 1.0622710622710623e-05, "loss": 0.0006, "step": 5760 }, { "epoch": 2.35, "grad_norm": 0.004530477803200483, "learning_rate": 1.0606430606430607e-05, "loss": 0.001, "step": 5770 }, { "epoch": 2.35, "grad_norm": 0.00422940356656909, "learning_rate": 1.0590150590150593e-05, "loss": 0.0007, "step": 5780 }, { "epoch": 2.36, "grad_norm": 0.004178835544735193, "learning_rate": 1.0573870573870575e-05, "loss": 0.0006, "step": 5790 }, { "epoch": 2.36, "grad_norm": 0.006506350357085466, "learning_rate": 1.0557590557590557e-05, "loss": 0.0005, "step": 5800 }, { "epoch": 2.36, "grad_norm": 0.004273206926882267, "learning_rate": 1.0541310541310543e-05, "loss": 0.0005, "step": 5810 }, { "epoch": 2.37, "grad_norm": 0.004112168215215206, "learning_rate": 1.0525030525030527e-05, "loss": 0.0005, "step": 5820 }, { "epoch": 2.37, "grad_norm": 0.005212805233895779, "learning_rate": 1.0508750508750509e-05, "loss": 0.0005, "step": 5830 }, { "epoch": 2.38, "grad_norm": 0.004351438954472542, "learning_rate": 1.0492470492470493e-05, "loss": 0.0005, "step": 5840 }, { "epoch": 2.38, "grad_norm": 0.011514941230416298, "learning_rate": 1.0476190476190477e-05, "loss": 0.0005, "step": 5850 }, { "epoch": 2.39, "grad_norm": 0.005969716235995293, "learning_rate": 1.045991045991046e-05, "loss": 0.0005, "step": 5860 }, { "epoch": 2.39, "grad_norm": 0.004150481894612312, "learning_rate": 1.0443630443630443e-05, "loss": 0.0005, "step": 5870 }, { "epoch": 2.39, "grad_norm": 0.003940541297197342, "learning_rate": 1.0427350427350429e-05, "loss": 0.0005, "step": 5880 }, { "epoch": 2.4, "grad_norm": 0.00408910820260644, "learning_rate": 1.0411070411070413e-05, "loss": 0.0005, "step": 5890 }, { "epoch": 2.4, "grad_norm": 0.0038459610659629107, "learning_rate": 1.0394790394790395e-05, "loss": 0.0006, "step": 5900 }, { "epoch": 2.41, "grad_norm": 0.004051607567816973, "learning_rate": 1.037851037851038e-05, "loss": 0.0449, "step": 5910 }, { "epoch": 2.41, "grad_norm": 0.005520727019757032, "learning_rate": 1.0362230362230363e-05, "loss": 0.0078, "step": 5920 }, { "epoch": 2.41, "grad_norm": 0.004394978284835815, "learning_rate": 1.0345950345950347e-05, "loss": 0.0564, "step": 5930 }, { "epoch": 2.42, "grad_norm": 0.004857844207435846, "learning_rate": 1.0329670329670332e-05, "loss": 0.0005, "step": 5940 }, { "epoch": 2.42, "grad_norm": 0.06114115193486214, "learning_rate": 1.0313390313390315e-05, "loss": 0.0007, "step": 5950 }, { "epoch": 2.43, "grad_norm": 0.004661387763917446, "learning_rate": 1.0297110297110297e-05, "loss": 0.0014, "step": 5960 }, { "epoch": 2.43, "grad_norm": 0.005134343635290861, "learning_rate": 1.028083028083028e-05, "loss": 0.001, "step": 5970 }, { "epoch": 2.43, "grad_norm": 0.004655875731259584, "learning_rate": 1.0264550264550266e-05, "loss": 0.0006, "step": 5980 }, { "epoch": 2.44, "grad_norm": 0.0050579700618982315, "learning_rate": 1.0248270248270249e-05, "loss": 0.0015, "step": 5990 }, { "epoch": 2.44, "grad_norm": 0.0047796061262488365, "learning_rate": 1.0231990231990233e-05, "loss": 0.0005, "step": 6000 }, { "epoch": 2.45, "grad_norm": 0.003949730657041073, "learning_rate": 1.0215710215710217e-05, "loss": 0.0005, "step": 6010 }, { "epoch": 2.45, "grad_norm": 0.004095940385013819, "learning_rate": 1.01994301994302e-05, "loss": 0.0005, "step": 6020 }, { "epoch": 2.45, "grad_norm": 0.005133763421326876, "learning_rate": 1.0183150183150183e-05, "loss": 0.0005, "step": 6030 }, { "epoch": 2.46, "grad_norm": 0.01469303946942091, "learning_rate": 1.0166870166870168e-05, "loss": 0.0013, "step": 6040 }, { "epoch": 2.46, "grad_norm": 0.004049224779009819, "learning_rate": 1.0150590150590152e-05, "loss": 0.0018, "step": 6050 }, { "epoch": 2.47, "grad_norm": 0.004054594319313765, "learning_rate": 1.0134310134310135e-05, "loss": 0.0184, "step": 6060 }, { "epoch": 2.47, "grad_norm": 0.004326994996517897, "learning_rate": 1.011803011803012e-05, "loss": 0.0005, "step": 6070 }, { "epoch": 2.47, "grad_norm": 0.004046597983688116, "learning_rate": 1.0101750101750102e-05, "loss": 0.0005, "step": 6080 }, { "epoch": 2.48, "grad_norm": 0.00401376374065876, "learning_rate": 1.0085470085470086e-05, "loss": 0.0006, "step": 6090 }, { "epoch": 2.48, "grad_norm": 0.005812219809740782, "learning_rate": 1.0069190069190069e-05, "loss": 0.0006, "step": 6100 }, { "epoch": 2.49, "grad_norm": 0.003832985181361437, "learning_rate": 1.0052910052910054e-05, "loss": 0.0005, "step": 6110 }, { "epoch": 2.49, "grad_norm": 0.07061895728111267, "learning_rate": 1.0036630036630037e-05, "loss": 0.0005, "step": 6120 }, { "epoch": 2.49, "grad_norm": 0.0039010499604046345, "learning_rate": 1.002035002035002e-05, "loss": 0.0017, "step": 6130 }, { "epoch": 2.5, "grad_norm": 0.004468753468245268, "learning_rate": 1.0004070004070006e-05, "loss": 0.0022, "step": 6140 }, { "epoch": 2.5, "grad_norm": 0.004342631436884403, "learning_rate": 9.987789987789988e-06, "loss": 0.0005, "step": 6150 }, { "epoch": 2.51, "grad_norm": 0.011564524844288826, "learning_rate": 9.971509971509972e-06, "loss": 0.0005, "step": 6160 }, { "epoch": 2.51, "grad_norm": 0.00380577496252954, "learning_rate": 9.955229955229956e-06, "loss": 0.0005, "step": 6170 }, { "epoch": 2.52, "grad_norm": 0.0037414473481476307, "learning_rate": 9.93894993894994e-06, "loss": 0.0011, "step": 6180 }, { "epoch": 2.52, "grad_norm": 0.003814409486949444, "learning_rate": 9.922669922669922e-06, "loss": 0.0005, "step": 6190 }, { "epoch": 2.52, "grad_norm": 0.0039341021329164505, "learning_rate": 9.906389906389906e-06, "loss": 0.0005, "step": 6200 }, { "epoch": 2.53, "grad_norm": 0.003710733028128743, "learning_rate": 9.890109890109892e-06, "loss": 0.0005, "step": 6210 }, { "epoch": 2.53, "grad_norm": 3.491090774536133, "learning_rate": 9.873829873829874e-06, "loss": 0.0356, "step": 6220 }, { "epoch": 2.54, "grad_norm": 0.005923949647694826, "learning_rate": 9.857549857549858e-06, "loss": 0.0005, "step": 6230 }, { "epoch": 2.54, "grad_norm": 0.0036399513483047485, "learning_rate": 9.841269841269842e-06, "loss": 0.0004, "step": 6240 }, { "epoch": 2.54, "grad_norm": 0.003884287318214774, "learning_rate": 9.824989824989826e-06, "loss": 0.0005, "step": 6250 }, { "epoch": 2.55, "grad_norm": 0.0036194841377437115, "learning_rate": 9.80870980870981e-06, "loss": 0.0004, "step": 6260 }, { "epoch": 2.55, "grad_norm": 0.005207626614719629, "learning_rate": 9.792429792429792e-06, "loss": 0.0004, "step": 6270 }, { "epoch": 2.56, "grad_norm": 0.008327057585120201, "learning_rate": 9.776149776149776e-06, "loss": 0.0005, "step": 6280 }, { "epoch": 2.56, "grad_norm": 0.003949583508074284, "learning_rate": 9.759869759869762e-06, "loss": 0.0004, "step": 6290 }, { "epoch": 2.56, "grad_norm": 0.004071325063705444, "learning_rate": 9.743589743589744e-06, "loss": 0.0005, "step": 6300 }, { "epoch": 2.57, "grad_norm": 0.0036700996570289135, "learning_rate": 9.727309727309728e-06, "loss": 0.0004, "step": 6310 }, { "epoch": 2.57, "grad_norm": 0.005211398471146822, "learning_rate": 9.711029711029712e-06, "loss": 0.0005, "step": 6320 }, { "epoch": 2.58, "grad_norm": 0.003708272473886609, "learning_rate": 9.694749694749696e-06, "loss": 0.0004, "step": 6330 }, { "epoch": 2.58, "grad_norm": 0.0042539420537650585, "learning_rate": 9.67846967846968e-06, "loss": 0.0004, "step": 6340 }, { "epoch": 2.58, "grad_norm": 0.0038529515732079744, "learning_rate": 9.662189662189662e-06, "loss": 0.0004, "step": 6350 }, { "epoch": 2.59, "grad_norm": 0.003946115728467703, "learning_rate": 9.645909645909646e-06, "loss": 0.0005, "step": 6360 }, { "epoch": 2.59, "grad_norm": 0.004324799869209528, "learning_rate": 9.62962962962963e-06, "loss": 0.0004, "step": 6370 }, { "epoch": 2.6, "grad_norm": 0.0038023737724870443, "learning_rate": 9.613349613349614e-06, "loss": 0.0004, "step": 6380 }, { "epoch": 2.6, "grad_norm": 0.0037666463758796453, "learning_rate": 9.597069597069598e-06, "loss": 0.0004, "step": 6390 }, { "epoch": 2.6, "grad_norm": 0.0034590172581374645, "learning_rate": 9.580789580789582e-06, "loss": 0.017, "step": 6400 }, { "epoch": 2.61, "grad_norm": 0.0038201683200895786, "learning_rate": 9.564509564509566e-06, "loss": 0.0004, "step": 6410 }, { "epoch": 2.61, "grad_norm": 0.004171228501945734, "learning_rate": 9.54822954822955e-06, "loss": 0.0004, "step": 6420 }, { "epoch": 2.62, "grad_norm": 0.0038926773704588413, "learning_rate": 9.531949531949532e-06, "loss": 0.0004, "step": 6430 }, { "epoch": 2.62, "grad_norm": 0.0037587357219308615, "learning_rate": 9.515669515669516e-06, "loss": 0.0004, "step": 6440 }, { "epoch": 2.63, "grad_norm": 0.0034505994990468025, "learning_rate": 9.4993894993895e-06, "loss": 0.0024, "step": 6450 }, { "epoch": 2.63, "grad_norm": 0.0034958263859152794, "learning_rate": 9.483109483109484e-06, "loss": 0.0004, "step": 6460 }, { "epoch": 2.63, "grad_norm": 0.0037652612663805485, "learning_rate": 9.466829466829468e-06, "loss": 0.0004, "step": 6470 }, { "epoch": 2.64, "grad_norm": 0.003452475182712078, "learning_rate": 9.450549450549452e-06, "loss": 0.0004, "step": 6480 }, { "epoch": 2.64, "grad_norm": 0.005090977996587753, "learning_rate": 9.434269434269436e-06, "loss": 0.0004, "step": 6490 }, { "epoch": 2.65, "grad_norm": 0.0036007205490022898, "learning_rate": 9.417989417989418e-06, "loss": 0.0004, "step": 6500 }, { "epoch": 2.65, "grad_norm": 0.0033244409132748842, "learning_rate": 9.401709401709402e-06, "loss": 0.0004, "step": 6510 }, { "epoch": 2.65, "grad_norm": 0.00387198431417346, "learning_rate": 9.385429385429386e-06, "loss": 0.0004, "step": 6520 }, { "epoch": 2.66, "grad_norm": 0.003582969307899475, "learning_rate": 9.36914936914937e-06, "loss": 0.0004, "step": 6530 }, { "epoch": 2.66, "grad_norm": 0.0032744621858000755, "learning_rate": 9.352869352869354e-06, "loss": 0.0005, "step": 6540 }, { "epoch": 2.67, "grad_norm": 0.0034951018169522285, "learning_rate": 9.336589336589338e-06, "loss": 0.0004, "step": 6550 }, { "epoch": 2.67, "grad_norm": 0.0034060273319482803, "learning_rate": 9.320309320309321e-06, "loss": 0.0004, "step": 6560 }, { "epoch": 2.67, "grad_norm": 0.0034066797234117985, "learning_rate": 9.304029304029305e-06, "loss": 0.0004, "step": 6570 }, { "epoch": 2.68, "grad_norm": 0.0035453049931675196, "learning_rate": 9.287749287749288e-06, "loss": 0.0004, "step": 6580 }, { "epoch": 2.68, "grad_norm": 0.0033404843416064978, "learning_rate": 9.271469271469272e-06, "loss": 0.0004, "step": 6590 }, { "epoch": 2.69, "grad_norm": 0.0032289137598127127, "learning_rate": 9.255189255189256e-06, "loss": 0.0004, "step": 6600 }, { "epoch": 2.69, "grad_norm": 0.0035338301677256823, "learning_rate": 9.23890923890924e-06, "loss": 0.0004, "step": 6610 }, { "epoch": 2.69, "grad_norm": 0.0032329142559319735, "learning_rate": 9.222629222629223e-06, "loss": 0.0004, "step": 6620 }, { "epoch": 2.7, "grad_norm": 0.0033918411936610937, "learning_rate": 9.206349206349207e-06, "loss": 0.0004, "step": 6630 }, { "epoch": 2.7, "grad_norm": 0.003434843849390745, "learning_rate": 9.190069190069191e-06, "loss": 0.0004, "step": 6640 }, { "epoch": 2.71, "grad_norm": 0.0032904883846640587, "learning_rate": 9.173789173789175e-06, "loss": 0.0004, "step": 6650 }, { "epoch": 2.71, "grad_norm": 0.003165784990414977, "learning_rate": 9.157509157509158e-06, "loss": 0.0004, "step": 6660 }, { "epoch": 2.71, "grad_norm": 0.0034379889257252216, "learning_rate": 9.141229141229141e-06, "loss": 0.0004, "step": 6670 }, { "epoch": 2.72, "grad_norm": 0.0032244266476482153, "learning_rate": 9.124949124949125e-06, "loss": 0.001, "step": 6680 }, { "epoch": 2.72, "grad_norm": 0.003119837259873748, "learning_rate": 9.10866910866911e-06, "loss": 0.0004, "step": 6690 }, { "epoch": 2.73, "grad_norm": 0.0038290254306048155, "learning_rate": 9.092389092389093e-06, "loss": 0.0004, "step": 6700 }, { "epoch": 2.73, "grad_norm": 0.0032256192062050104, "learning_rate": 9.076109076109077e-06, "loss": 0.0004, "step": 6710 }, { "epoch": 2.74, "grad_norm": 0.004083781037479639, "learning_rate": 9.059829059829061e-06, "loss": 0.0004, "step": 6720 }, { "epoch": 2.74, "grad_norm": 0.003274232381954789, "learning_rate": 9.043549043549045e-06, "loss": 0.0004, "step": 6730 }, { "epoch": 2.74, "grad_norm": 0.0032298911828547716, "learning_rate": 9.027269027269027e-06, "loss": 0.0004, "step": 6740 }, { "epoch": 2.75, "grad_norm": 0.0031462605111300945, "learning_rate": 9.010989010989011e-06, "loss": 0.0425, "step": 6750 }, { "epoch": 2.75, "grad_norm": 0.00312459422275424, "learning_rate": 8.994708994708995e-06, "loss": 0.0004, "step": 6760 }, { "epoch": 2.76, "grad_norm": 0.0036323266103863716, "learning_rate": 8.97842897842898e-06, "loss": 0.0004, "step": 6770 }, { "epoch": 2.76, "grad_norm": 0.0033034805674105883, "learning_rate": 8.962148962148963e-06, "loss": 0.0004, "step": 6780 }, { "epoch": 2.76, "grad_norm": 0.003054459812119603, "learning_rate": 8.945868945868947e-06, "loss": 0.0004, "step": 6790 }, { "epoch": 2.77, "grad_norm": 0.005314236972481012, "learning_rate": 8.929588929588931e-06, "loss": 0.0004, "step": 6800 }, { "epoch": 2.77, "grad_norm": 0.010932357981801033, "learning_rate": 8.913308913308915e-06, "loss": 0.0004, "step": 6810 }, { "epoch": 2.78, "grad_norm": 0.0031523159705102444, "learning_rate": 8.897028897028897e-06, "loss": 0.0004, "step": 6820 }, { "epoch": 2.78, "grad_norm": 0.0034312924835830927, "learning_rate": 8.880748880748881e-06, "loss": 0.0101, "step": 6830 }, { "epoch": 2.78, "grad_norm": 0.00318572367541492, "learning_rate": 8.864468864468865e-06, "loss": 0.0004, "step": 6840 }, { "epoch": 2.79, "grad_norm": 0.0032511164899915457, "learning_rate": 8.848188848188849e-06, "loss": 0.0379, "step": 6850 }, { "epoch": 2.79, "grad_norm": 0.0037167894188314676, "learning_rate": 8.831908831908833e-06, "loss": 0.0004, "step": 6860 }, { "epoch": 2.8, "grad_norm": 0.003721152199432254, "learning_rate": 8.815628815628817e-06, "loss": 0.0008, "step": 6870 }, { "epoch": 2.8, "grad_norm": 0.0030927169136703014, "learning_rate": 8.7993487993488e-06, "loss": 0.0004, "step": 6880 }, { "epoch": 2.8, "grad_norm": 0.0034221247769892216, "learning_rate": 8.783068783068783e-06, "loss": 0.0031, "step": 6890 }, { "epoch": 2.81, "grad_norm": 0.0033293466549366713, "learning_rate": 8.766788766788767e-06, "loss": 0.0004, "step": 6900 }, { "epoch": 2.81, "grad_norm": 0.003214113647118211, "learning_rate": 8.750508750508751e-06, "loss": 0.0004, "step": 6910 }, { "epoch": 2.82, "grad_norm": 0.0032116910442709923, "learning_rate": 8.734228734228735e-06, "loss": 0.034, "step": 6920 }, { "epoch": 2.82, "grad_norm": 0.003405655035749078, "learning_rate": 8.717948717948719e-06, "loss": 0.0515, "step": 6930 }, { "epoch": 2.82, "grad_norm": 0.003949843347072601, "learning_rate": 8.701668701668703e-06, "loss": 0.0508, "step": 6940 }, { "epoch": 2.83, "grad_norm": 0.0030140685848891735, "learning_rate": 8.685388685388687e-06, "loss": 0.0385, "step": 6950 }, { "epoch": 2.83, "grad_norm": 0.0034769896883517504, "learning_rate": 8.66910866910867e-06, "loss": 0.0004, "step": 6960 }, { "epoch": 2.84, "grad_norm": 0.005965403746813536, "learning_rate": 8.652828652828653e-06, "loss": 0.0454, "step": 6970 }, { "epoch": 2.84, "grad_norm": 0.004475270863622427, "learning_rate": 8.636548636548637e-06, "loss": 0.0005, "step": 6980 }, { "epoch": 2.84, "grad_norm": 0.0039094300009310246, "learning_rate": 8.62026862026862e-06, "loss": 0.0005, "step": 6990 }, { "epoch": 2.85, "grad_norm": 0.004547227174043655, "learning_rate": 8.603988603988605e-06, "loss": 0.0004, "step": 7000 }, { "epoch": 2.85, "grad_norm": 0.0033658877946436405, "learning_rate": 8.587708587708589e-06, "loss": 0.0005, "step": 7010 }, { "epoch": 2.86, "grad_norm": 0.0037282053381204605, "learning_rate": 8.571428571428571e-06, "loss": 0.0005, "step": 7020 }, { "epoch": 2.86, "grad_norm": 0.012108271941542625, "learning_rate": 8.555148555148557e-06, "loss": 0.0005, "step": 7030 }, { "epoch": 2.87, "grad_norm": 0.00378889380954206, "learning_rate": 8.53886853886854e-06, "loss": 0.0142, "step": 7040 }, { "epoch": 2.87, "grad_norm": 0.0037225610576570034, "learning_rate": 8.522588522588523e-06, "loss": 0.0009, "step": 7050 }, { "epoch": 2.87, "grad_norm": 0.005083514377474785, "learning_rate": 8.506308506308507e-06, "loss": 0.0004, "step": 7060 }, { "epoch": 2.88, "grad_norm": 0.0035945470444858074, "learning_rate": 8.49002849002849e-06, "loss": 0.0005, "step": 7070 }, { "epoch": 2.88, "grad_norm": 0.0031938895117491484, "learning_rate": 8.473748473748475e-06, "loss": 0.0007, "step": 7080 }, { "epoch": 2.89, "grad_norm": 0.007891247980296612, "learning_rate": 8.457468457468459e-06, "loss": 0.0004, "step": 7090 }, { "epoch": 2.89, "grad_norm": 0.003397688502445817, "learning_rate": 8.44118844118844e-06, "loss": 0.0004, "step": 7100 }, { "epoch": 2.89, "grad_norm": 0.004096095450222492, "learning_rate": 8.424908424908426e-06, "loss": 0.0004, "step": 7110 }, { "epoch": 2.9, "grad_norm": 0.004969074856489897, "learning_rate": 8.40862840862841e-06, "loss": 0.0004, "step": 7120 }, { "epoch": 2.9, "grad_norm": 0.002869043732061982, "learning_rate": 8.392348392348393e-06, "loss": 0.0376, "step": 7130 }, { "epoch": 2.91, "grad_norm": 0.004255395848304033, "learning_rate": 8.376068376068377e-06, "loss": 0.0004, "step": 7140 }, { "epoch": 2.91, "grad_norm": 0.003371414029970765, "learning_rate": 8.35978835978836e-06, "loss": 0.0005, "step": 7150 }, { "epoch": 2.91, "grad_norm": 0.0031468465458601713, "learning_rate": 8.343508343508344e-06, "loss": 0.0004, "step": 7160 }, { "epoch": 2.92, "grad_norm": 0.004064807202666998, "learning_rate": 8.327228327228328e-06, "loss": 0.0004, "step": 7170 }, { "epoch": 2.92, "grad_norm": 0.0038253762759268284, "learning_rate": 8.31094831094831e-06, "loss": 0.0275, "step": 7180 }, { "epoch": 2.93, "grad_norm": 0.0029601927381008863, "learning_rate": 8.294668294668296e-06, "loss": 0.0162, "step": 7190 }, { "epoch": 2.93, "grad_norm": 0.0035592832136899233, "learning_rate": 8.278388278388278e-06, "loss": 0.001, "step": 7200 }, { "epoch": 2.93, "grad_norm": 0.003166656941175461, "learning_rate": 8.262108262108262e-06, "loss": 0.0004, "step": 7210 }, { "epoch": 2.94, "grad_norm": 0.0038591506890952587, "learning_rate": 8.245828245828246e-06, "loss": 0.0004, "step": 7220 }, { "epoch": 2.94, "grad_norm": 0.004316645674407482, "learning_rate": 8.22954822954823e-06, "loss": 0.0339, "step": 7230 }, { "epoch": 2.95, "grad_norm": 0.003106352873146534, "learning_rate": 8.213268213268214e-06, "loss": 0.0004, "step": 7240 }, { "epoch": 2.95, "grad_norm": 0.003383921692147851, "learning_rate": 8.196988196988198e-06, "loss": 0.0004, "step": 7250 }, { "epoch": 2.95, "grad_norm": 0.003904301906004548, "learning_rate": 8.18070818070818e-06, "loss": 0.009, "step": 7260 }, { "epoch": 2.96, "grad_norm": 0.002857522340491414, "learning_rate": 8.164428164428166e-06, "loss": 0.0004, "step": 7270 }, { "epoch": 2.96, "grad_norm": 0.0028671324253082275, "learning_rate": 8.148148148148148e-06, "loss": 0.0004, "step": 7280 }, { "epoch": 2.97, "grad_norm": 0.0028230687603354454, "learning_rate": 8.131868131868132e-06, "loss": 0.0009, "step": 7290 }, { "epoch": 2.97, "grad_norm": 0.0028381363954395056, "learning_rate": 8.115588115588116e-06, "loss": 0.0003, "step": 7300 }, { "epoch": 2.98, "grad_norm": 0.0028295046649873257, "learning_rate": 8.0993080993081e-06, "loss": 0.0099, "step": 7310 }, { "epoch": 2.98, "grad_norm": 0.0051268660463392735, "learning_rate": 8.083028083028084e-06, "loss": 0.0004, "step": 7320 }, { "epoch": 2.98, "grad_norm": 0.006851341109722853, "learning_rate": 8.066748066748066e-06, "loss": 0.0569, "step": 7330 }, { "epoch": 2.99, "grad_norm": 0.003248844761401415, "learning_rate": 8.05046805046805e-06, "loss": 0.0004, "step": 7340 }, { "epoch": 2.99, "grad_norm": 0.003859333461150527, "learning_rate": 8.034188034188036e-06, "loss": 0.0012, "step": 7350 }, { "epoch": 3.0, "grad_norm": 0.002941732294857502, "learning_rate": 8.017908017908018e-06, "loss": 0.0145, "step": 7360 }, { "epoch": 3.0, "grad_norm": 0.0032136046793311834, "learning_rate": 8.001628001628002e-06, "loss": 0.0004, "step": 7370 }, { "epoch": 3.0, "grad_norm": 0.0037520972546190023, "learning_rate": 7.985347985347986e-06, "loss": 0.0003, "step": 7380 }, { "epoch": 3.01, "grad_norm": 0.002765586832538247, "learning_rate": 7.96906796906797e-06, "loss": 0.0003, "step": 7390 }, { "epoch": 3.01, "grad_norm": 0.002917984500527382, "learning_rate": 7.952787952787954e-06, "loss": 0.0003, "step": 7400 }, { "epoch": 3.02, "grad_norm": 0.002771808998659253, "learning_rate": 7.936507936507936e-06, "loss": 0.0003, "step": 7410 }, { "epoch": 3.02, "grad_norm": 0.0028077957686036825, "learning_rate": 7.92022792022792e-06, "loss": 0.0003, "step": 7420 }, { "epoch": 3.02, "grad_norm": 0.014859122224152088, "learning_rate": 7.903947903947906e-06, "loss": 0.0187, "step": 7430 }, { "epoch": 3.03, "grad_norm": 0.0029327922966331244, "learning_rate": 7.887667887667888e-06, "loss": 0.0003, "step": 7440 }, { "epoch": 3.03, "grad_norm": 0.003155101090669632, "learning_rate": 7.871387871387872e-06, "loss": 0.0003, "step": 7450 }, { "epoch": 3.04, "grad_norm": 0.002822224283590913, "learning_rate": 7.855107855107856e-06, "loss": 0.0003, "step": 7460 }, { "epoch": 3.04, "grad_norm": 0.002726204926148057, "learning_rate": 7.83882783882784e-06, "loss": 0.0003, "step": 7470 }, { "epoch": 3.04, "grad_norm": 0.0026202842127531767, "learning_rate": 7.822547822547824e-06, "loss": 0.0004, "step": 7480 }, { "epoch": 3.05, "grad_norm": 0.0026620635762810707, "learning_rate": 7.806267806267806e-06, "loss": 0.0003, "step": 7490 }, { "epoch": 3.05, "grad_norm": 0.0026845140382647514, "learning_rate": 7.78998778998779e-06, "loss": 0.0003, "step": 7500 }, { "epoch": 3.06, "grad_norm": 0.002793940482661128, "learning_rate": 7.773707773707776e-06, "loss": 0.0004, "step": 7510 }, { "epoch": 3.06, "grad_norm": 0.002819318324327469, "learning_rate": 7.757427757427758e-06, "loss": 0.0003, "step": 7520 }, { "epoch": 3.06, "grad_norm": 0.0027769345324486494, "learning_rate": 7.741147741147742e-06, "loss": 0.0003, "step": 7530 }, { "epoch": 3.07, "grad_norm": 0.002659664023667574, "learning_rate": 7.724867724867726e-06, "loss": 0.0003, "step": 7540 }, { "epoch": 3.07, "grad_norm": 0.0025388060603290796, "learning_rate": 7.70858770858771e-06, "loss": 0.0003, "step": 7550 }, { "epoch": 3.08, "grad_norm": 0.002629263559356332, "learning_rate": 7.692307692307694e-06, "loss": 0.0003, "step": 7560 }, { "epoch": 3.08, "grad_norm": 0.0025471756234765053, "learning_rate": 7.676027676027676e-06, "loss": 0.0003, "step": 7570 }, { "epoch": 3.09, "grad_norm": 0.006246237549930811, "learning_rate": 7.65974765974766e-06, "loss": 0.0003, "step": 7580 }, { "epoch": 3.09, "grad_norm": 0.0031642026733607054, "learning_rate": 7.643467643467644e-06, "loss": 0.0003, "step": 7590 }, { "epoch": 3.09, "grad_norm": 0.0028460524044930935, "learning_rate": 7.627187627187628e-06, "loss": 0.0003, "step": 7600 }, { "epoch": 3.1, "grad_norm": 0.0027321220841258764, "learning_rate": 7.610907610907612e-06, "loss": 0.0004, "step": 7610 }, { "epoch": 3.1, "grad_norm": 0.07277552038431168, "learning_rate": 7.594627594627595e-06, "loss": 0.0003, "step": 7620 }, { "epoch": 3.11, "grad_norm": 0.002561114262789488, "learning_rate": 7.578347578347579e-06, "loss": 0.0003, "step": 7630 }, { "epoch": 3.11, "grad_norm": 0.002666006563231349, "learning_rate": 7.5620675620675634e-06, "loss": 0.0003, "step": 7640 }, { "epoch": 3.11, "grad_norm": 0.003249433124437928, "learning_rate": 7.5457875457875465e-06, "loss": 0.0003, "step": 7650 }, { "epoch": 3.12, "grad_norm": 0.002814142033457756, "learning_rate": 7.5295075295075305e-06, "loss": 0.0003, "step": 7660 }, { "epoch": 3.12, "grad_norm": 0.002647695131599903, "learning_rate": 7.5132275132275136e-06, "loss": 0.0003, "step": 7670 }, { "epoch": 3.13, "grad_norm": 0.0028357000555843115, "learning_rate": 7.4969474969474975e-06, "loss": 0.0003, "step": 7680 }, { "epoch": 3.13, "grad_norm": 0.002574663609266281, "learning_rate": 7.4806674806674814e-06, "loss": 0.0003, "step": 7690 }, { "epoch": 3.13, "grad_norm": 0.002485772827640176, "learning_rate": 7.4643874643874645e-06, "loss": 0.0004, "step": 7700 }, { "epoch": 3.14, "grad_norm": 0.0026384114753454924, "learning_rate": 7.448107448107449e-06, "loss": 0.0003, "step": 7710 }, { "epoch": 3.14, "grad_norm": 0.0025012667756527662, "learning_rate": 7.4318274318274316e-06, "loss": 0.0003, "step": 7720 }, { "epoch": 3.15, "grad_norm": 0.0023603325244039297, "learning_rate": 7.415547415547416e-06, "loss": 0.0008, "step": 7730 }, { "epoch": 3.15, "grad_norm": 0.006851641461253166, "learning_rate": 7.3992673992674e-06, "loss": 0.0003, "step": 7740 }, { "epoch": 3.15, "grad_norm": 0.0029785565566271544, "learning_rate": 7.382987382987383e-06, "loss": 0.0003, "step": 7750 }, { "epoch": 3.16, "grad_norm": 0.002378121018409729, "learning_rate": 7.366707366707367e-06, "loss": 0.0062, "step": 7760 }, { "epoch": 3.16, "grad_norm": 0.0024877325631678104, "learning_rate": 7.350427350427351e-06, "loss": 0.0003, "step": 7770 }, { "epoch": 3.17, "grad_norm": 0.004979600198566914, "learning_rate": 7.334147334147334e-06, "loss": 0.0003, "step": 7780 }, { "epoch": 3.17, "grad_norm": 0.002649629721418023, "learning_rate": 7.317867317867319e-06, "loss": 0.0003, "step": 7790 }, { "epoch": 3.17, "grad_norm": 0.0030928929336369038, "learning_rate": 7.301587301587301e-06, "loss": 0.0003, "step": 7800 }, { "epoch": 3.18, "grad_norm": 0.00250143650919199, "learning_rate": 7.285307285307286e-06, "loss": 0.0003, "step": 7810 }, { "epoch": 3.18, "grad_norm": 0.002448960905894637, "learning_rate": 7.26902726902727e-06, "loss": 0.0003, "step": 7820 }, { "epoch": 3.19, "grad_norm": 0.0023297348525375128, "learning_rate": 7.252747252747253e-06, "loss": 0.0003, "step": 7830 }, { "epoch": 3.19, "grad_norm": 0.0023908980656415224, "learning_rate": 7.236467236467237e-06, "loss": 0.0003, "step": 7840 }, { "epoch": 3.19, "grad_norm": 0.003359014866873622, "learning_rate": 7.22018722018722e-06, "loss": 0.0003, "step": 7850 }, { "epoch": 3.2, "grad_norm": 0.002836639992892742, "learning_rate": 7.203907203907204e-06, "loss": 0.0003, "step": 7860 }, { "epoch": 3.2, "grad_norm": 0.0024746765848249197, "learning_rate": 7.187627187627189e-06, "loss": 0.0003, "step": 7870 }, { "epoch": 3.21, "grad_norm": 0.002388924825936556, "learning_rate": 7.171347171347171e-06, "loss": 0.0003, "step": 7880 }, { "epoch": 3.21, "grad_norm": 0.0024251139257103205, "learning_rate": 7.155067155067156e-06, "loss": 0.0003, "step": 7890 }, { "epoch": 3.22, "grad_norm": 0.00242208456620574, "learning_rate": 7.13878713878714e-06, "loss": 0.0003, "step": 7900 }, { "epoch": 3.22, "grad_norm": 0.0023256507702171803, "learning_rate": 7.122507122507123e-06, "loss": 0.0003, "step": 7910 }, { "epoch": 3.22, "grad_norm": 0.0022887035738676786, "learning_rate": 7.106227106227107e-06, "loss": 0.0003, "step": 7920 }, { "epoch": 3.23, "grad_norm": 0.0022210038732737303, "learning_rate": 7.08994708994709e-06, "loss": 0.0003, "step": 7930 }, { "epoch": 3.23, "grad_norm": 0.002328604692593217, "learning_rate": 7.073667073667074e-06, "loss": 0.0003, "step": 7940 }, { "epoch": 3.24, "grad_norm": 0.002483953256160021, "learning_rate": 7.057387057387059e-06, "loss": 0.0003, "step": 7950 }, { "epoch": 3.24, "grad_norm": 0.002675483236089349, "learning_rate": 7.041107041107041e-06, "loss": 0.0003, "step": 7960 }, { "epoch": 3.24, "grad_norm": 0.0023732264526188374, "learning_rate": 7.024827024827026e-06, "loss": 0.0003, "step": 7970 }, { "epoch": 3.25, "grad_norm": 0.002226916840299964, "learning_rate": 7.008547008547009e-06, "loss": 0.0003, "step": 7980 }, { "epoch": 3.25, "grad_norm": 0.003264982718974352, "learning_rate": 6.992266992266993e-06, "loss": 0.0003, "step": 7990 }, { "epoch": 3.26, "grad_norm": 0.0026976047083735466, "learning_rate": 6.975986975986977e-06, "loss": 0.0003, "step": 8000 }, { "epoch": 3.26, "grad_norm": 0.002336106961593032, "learning_rate": 6.95970695970696e-06, "loss": 0.0003, "step": 8010 }, { "epoch": 3.26, "grad_norm": 0.0023025060072541237, "learning_rate": 6.943426943426944e-06, "loss": 0.0003, "step": 8020 }, { "epoch": 3.27, "grad_norm": 0.0024826654698699713, "learning_rate": 6.927146927146929e-06, "loss": 0.0003, "step": 8030 }, { "epoch": 3.27, "grad_norm": 0.002214565174654126, "learning_rate": 6.910866910866911e-06, "loss": 0.0003, "step": 8040 }, { "epoch": 3.28, "grad_norm": 0.002279749372974038, "learning_rate": 6.894586894586896e-06, "loss": 0.0003, "step": 8050 }, { "epoch": 3.28, "grad_norm": 0.002262295223772526, "learning_rate": 6.878306878306879e-06, "loss": 0.0003, "step": 8060 }, { "epoch": 3.28, "grad_norm": 0.0022824567276984453, "learning_rate": 6.862026862026863e-06, "loss": 0.0003, "step": 8070 }, { "epoch": 3.29, "grad_norm": 0.0022059327457100153, "learning_rate": 6.845746845746847e-06, "loss": 0.0003, "step": 8080 }, { "epoch": 3.29, "grad_norm": 0.0022225133143365383, "learning_rate": 6.82946682946683e-06, "loss": 0.0003, "step": 8090 }, { "epoch": 3.3, "grad_norm": 0.0030766648706048727, "learning_rate": 6.813186813186814e-06, "loss": 0.0003, "step": 8100 }, { "epoch": 3.3, "grad_norm": 0.0020688914228230715, "learning_rate": 6.796906796906797e-06, "loss": 0.0003, "step": 8110 }, { "epoch": 3.3, "grad_norm": 0.0026230113580822945, "learning_rate": 6.780626780626781e-06, "loss": 0.0003, "step": 8120 }, { "epoch": 3.31, "grad_norm": 0.0027380469255149364, "learning_rate": 6.7643467643467655e-06, "loss": 0.0002, "step": 8130 }, { "epoch": 3.31, "grad_norm": 0.0020218545105308294, "learning_rate": 6.748066748066749e-06, "loss": 0.0002, "step": 8140 }, { "epoch": 3.32, "grad_norm": 0.0022498080506920815, "learning_rate": 6.7317867317867326e-06, "loss": 0.0002, "step": 8150 }, { "epoch": 3.32, "grad_norm": 0.0026646710466593504, "learning_rate": 6.715506715506716e-06, "loss": 0.0002, "step": 8160 }, { "epoch": 3.33, "grad_norm": 0.0021166689693927765, "learning_rate": 6.6992266992267e-06, "loss": 0.0002, "step": 8170 }, { "epoch": 3.33, "grad_norm": 0.0022176315542310476, "learning_rate": 6.6829466829466836e-06, "loss": 0.0003, "step": 8180 }, { "epoch": 3.33, "grad_norm": 0.0020941500551998615, "learning_rate": 6.666666666666667e-06, "loss": 0.0002, "step": 8190 }, { "epoch": 3.34, "grad_norm": 0.002201402559876442, "learning_rate": 6.650386650386651e-06, "loss": 0.0003, "step": 8200 }, { "epoch": 3.34, "grad_norm": 0.002235386986285448, "learning_rate": 6.634106634106635e-06, "loss": 0.0062, "step": 8210 }, { "epoch": 3.35, "grad_norm": 0.002202383242547512, "learning_rate": 6.6178266178266185e-06, "loss": 0.0002, "step": 8220 }, { "epoch": 3.35, "grad_norm": 0.002144381171092391, "learning_rate": 6.601546601546602e-06, "loss": 0.0002, "step": 8230 }, { "epoch": 3.35, "grad_norm": 0.0027761892415583134, "learning_rate": 6.5852665852665855e-06, "loss": 0.0003, "step": 8240 }, { "epoch": 3.36, "grad_norm": 0.002119843615218997, "learning_rate": 6.5689865689865694e-06, "loss": 0.0002, "step": 8250 }, { "epoch": 3.36, "grad_norm": 0.003361073322594166, "learning_rate": 6.552706552706553e-06, "loss": 0.0002, "step": 8260 }, { "epoch": 3.37, "grad_norm": 0.0021668022964149714, "learning_rate": 6.5364265364265365e-06, "loss": 0.0003, "step": 8270 }, { "epoch": 3.37, "grad_norm": 0.0020495818462222815, "learning_rate": 6.5201465201465204e-06, "loss": 0.0002, "step": 8280 }, { "epoch": 3.37, "grad_norm": 0.002108585089445114, "learning_rate": 6.5038665038665035e-06, "loss": 0.0004, "step": 8290 }, { "epoch": 3.38, "grad_norm": 0.0022084820084273815, "learning_rate": 6.487586487586488e-06, "loss": 0.0094, "step": 8300 }, { "epoch": 3.38, "grad_norm": 0.002132968744263053, "learning_rate": 6.471306471306472e-06, "loss": 0.0002, "step": 8310 }, { "epoch": 3.39, "grad_norm": 0.002239073161035776, "learning_rate": 6.455026455026455e-06, "loss": 0.0002, "step": 8320 }, { "epoch": 3.39, "grad_norm": 0.00218349602073431, "learning_rate": 6.438746438746439e-06, "loss": 0.0002, "step": 8330 }, { "epoch": 3.39, "grad_norm": 0.00208345171995461, "learning_rate": 6.422466422466423e-06, "loss": 0.0002, "step": 8340 }, { "epoch": 3.4, "grad_norm": 0.003050567815080285, "learning_rate": 6.406186406186406e-06, "loss": 0.0002, "step": 8350 }, { "epoch": 3.4, "grad_norm": 0.0019847999792546034, "learning_rate": 6.38990638990639e-06, "loss": 0.0003, "step": 8360 }, { "epoch": 3.41, "grad_norm": 0.0020100034307688475, "learning_rate": 6.373626373626373e-06, "loss": 0.0002, "step": 8370 }, { "epoch": 3.41, "grad_norm": 0.0020706066861748695, "learning_rate": 6.357346357346358e-06, "loss": 0.0002, "step": 8380 }, { "epoch": 3.41, "grad_norm": 0.0019506254466250539, "learning_rate": 6.341066341066342e-06, "loss": 0.0002, "step": 8390 }, { "epoch": 3.42, "grad_norm": 0.0020071598701179028, "learning_rate": 6.324786324786325e-06, "loss": 0.0002, "step": 8400 }, { "epoch": 3.42, "grad_norm": 0.002606179565191269, "learning_rate": 6.308506308506309e-06, "loss": 0.0467, "step": 8410 }, { "epoch": 3.43, "grad_norm": 0.0021410868503153324, "learning_rate": 6.292226292226292e-06, "loss": 0.0002, "step": 8420 }, { "epoch": 3.43, "grad_norm": 0.002439359435811639, "learning_rate": 6.275946275946276e-06, "loss": 0.0002, "step": 8430 }, { "epoch": 3.44, "grad_norm": 0.0037037180736660957, "learning_rate": 6.25966625966626e-06, "loss": 0.0002, "step": 8440 }, { "epoch": 3.44, "grad_norm": 0.0022582276724278927, "learning_rate": 6.243386243386243e-06, "loss": 0.0002, "step": 8450 }, { "epoch": 3.44, "grad_norm": 0.006983071565628052, "learning_rate": 6.227106227106228e-06, "loss": 0.0002, "step": 8460 }, { "epoch": 3.45, "grad_norm": 0.01085950993001461, "learning_rate": 6.210826210826212e-06, "loss": 0.0003, "step": 8470 }, { "epoch": 3.45, "grad_norm": 0.0021798298694193363, "learning_rate": 6.194546194546195e-06, "loss": 0.0002, "step": 8480 }, { "epoch": 3.46, "grad_norm": 0.0021102093160152435, "learning_rate": 6.178266178266179e-06, "loss": 0.0002, "step": 8490 }, { "epoch": 3.46, "grad_norm": 0.0021143911872059107, "learning_rate": 6.161986161986162e-06, "loss": 0.0002, "step": 8500 }, { "epoch": 3.46, "grad_norm": 0.002472953638061881, "learning_rate": 6.145706145706146e-06, "loss": 0.0002, "step": 8510 }, { "epoch": 3.47, "grad_norm": 0.0019736222457140684, "learning_rate": 6.12942612942613e-06, "loss": 0.0002, "step": 8520 }, { "epoch": 3.47, "grad_norm": 0.001965272007510066, "learning_rate": 6.113146113146113e-06, "loss": 0.0002, "step": 8530 }, { "epoch": 3.48, "grad_norm": 0.001975101651623845, "learning_rate": 6.096866096866098e-06, "loss": 0.0002, "step": 8540 }, { "epoch": 3.48, "grad_norm": 0.002040453255176544, "learning_rate": 6.080586080586081e-06, "loss": 0.0002, "step": 8550 }, { "epoch": 3.48, "grad_norm": 7.389462471008301, "learning_rate": 6.064306064306065e-06, "loss": 0.034, "step": 8560 }, { "epoch": 3.49, "grad_norm": 0.0022456683218479156, "learning_rate": 6.048026048026049e-06, "loss": 0.0002, "step": 8570 }, { "epoch": 3.49, "grad_norm": 0.003975760657340288, "learning_rate": 6.031746031746032e-06, "loss": 0.0002, "step": 8580 }, { "epoch": 3.5, "grad_norm": 0.0020120914559811354, "learning_rate": 6.015466015466016e-06, "loss": 0.0002, "step": 8590 }, { "epoch": 3.5, "grad_norm": 0.0022050223778933287, "learning_rate": 5.999185999186001e-06, "loss": 0.0002, "step": 8600 }, { "epoch": 3.5, "grad_norm": 6.6309919357299805, "learning_rate": 5.982905982905983e-06, "loss": 0.043, "step": 8610 }, { "epoch": 3.51, "grad_norm": 0.0022617350332438946, "learning_rate": 5.966625966625968e-06, "loss": 0.0002, "step": 8620 }, { "epoch": 3.51, "grad_norm": 0.0019437572918832302, "learning_rate": 5.950345950345951e-06, "loss": 0.0002, "step": 8630 }, { "epoch": 3.52, "grad_norm": 0.001993882469832897, "learning_rate": 5.934065934065935e-06, "loss": 0.0002, "step": 8640 }, { "epoch": 3.52, "grad_norm": 0.0022044796496629715, "learning_rate": 5.917785917785919e-06, "loss": 0.0002, "step": 8650 }, { "epoch": 3.52, "grad_norm": 0.0020595360547304153, "learning_rate": 5.901505901505902e-06, "loss": 0.0004, "step": 8660 }, { "epoch": 3.53, "grad_norm": 0.002459390088915825, "learning_rate": 5.885225885225886e-06, "loss": 0.0002, "step": 8670 }, { "epoch": 3.53, "grad_norm": 0.0018390618497505784, "learning_rate": 5.868945868945869e-06, "loss": 0.0002, "step": 8680 }, { "epoch": 3.54, "grad_norm": 0.002049500122666359, "learning_rate": 5.852665852665853e-06, "loss": 0.0002, "step": 8690 }, { "epoch": 3.54, "grad_norm": 0.001947426819242537, "learning_rate": 5.8363858363858375e-06, "loss": 0.0002, "step": 8700 }, { "epoch": 3.54, "grad_norm": 0.0030878265388309956, "learning_rate": 5.820105820105821e-06, "loss": 0.0002, "step": 8710 }, { "epoch": 3.55, "grad_norm": 0.001884807599708438, "learning_rate": 5.8038258038258045e-06, "loss": 0.0002, "step": 8720 }, { "epoch": 3.55, "grad_norm": 0.0019810153171420097, "learning_rate": 5.7875457875457885e-06, "loss": 0.0002, "step": 8730 }, { "epoch": 3.56, "grad_norm": 0.001923812204040587, "learning_rate": 5.7712657712657716e-06, "loss": 0.0002, "step": 8740 }, { "epoch": 3.56, "grad_norm": 0.001998158637434244, "learning_rate": 5.7549857549857555e-06, "loss": 0.0002, "step": 8750 }, { "epoch": 3.57, "grad_norm": 0.0018681226065382361, "learning_rate": 5.738705738705739e-06, "loss": 0.0002, "step": 8760 }, { "epoch": 3.57, "grad_norm": 0.006764058023691177, "learning_rate": 5.7224257224257225e-06, "loss": 0.0068, "step": 8770 }, { "epoch": 3.57, "grad_norm": 0.003150691743940115, "learning_rate": 5.706145706145707e-06, "loss": 0.0007, "step": 8780 }, { "epoch": 3.58, "grad_norm": 0.00217335089109838, "learning_rate": 5.68986568986569e-06, "loss": 0.0002, "step": 8790 }, { "epoch": 3.58, "grad_norm": 0.00865620281547308, "learning_rate": 5.673585673585674e-06, "loss": 0.0002, "step": 8800 }, { "epoch": 3.59, "grad_norm": 0.0020344313234090805, "learning_rate": 5.6573056573056575e-06, "loss": 0.0002, "step": 8810 }, { "epoch": 3.59, "grad_norm": 0.0018948889337480068, "learning_rate": 5.641025641025641e-06, "loss": 0.0002, "step": 8820 }, { "epoch": 3.59, "grad_norm": 0.001868214923888445, "learning_rate": 5.624745624745625e-06, "loss": 0.0002, "step": 8830 }, { "epoch": 3.6, "grad_norm": 0.0019942354410886765, "learning_rate": 5.6084656084656084e-06, "loss": 0.0002, "step": 8840 }, { "epoch": 3.6, "grad_norm": 0.0018839197000488639, "learning_rate": 5.592185592185592e-06, "loss": 0.0002, "step": 8850 }, { "epoch": 3.61, "grad_norm": 0.0022100857459008694, "learning_rate": 5.575905575905577e-06, "loss": 0.0002, "step": 8860 }, { "epoch": 3.61, "grad_norm": 0.0019310906063765287, "learning_rate": 5.55962555962556e-06, "loss": 0.0002, "step": 8870 }, { "epoch": 3.61, "grad_norm": 0.002325033536180854, "learning_rate": 5.543345543345544e-06, "loss": 0.0002, "step": 8880 }, { "epoch": 3.62, "grad_norm": 0.0017883091932162642, "learning_rate": 5.527065527065527e-06, "loss": 0.0002, "step": 8890 }, { "epoch": 3.62, "grad_norm": 0.0018799800891429186, "learning_rate": 5.510785510785511e-06, "loss": 0.0002, "step": 8900 }, { "epoch": 3.63, "grad_norm": 0.0017864195397123694, "learning_rate": 5.494505494505495e-06, "loss": 0.0002, "step": 8910 }, { "epoch": 3.63, "grad_norm": 0.0018234961898997426, "learning_rate": 5.478225478225478e-06, "loss": 0.0002, "step": 8920 }, { "epoch": 3.63, "grad_norm": 0.0017443567048758268, "learning_rate": 5.461945461945462e-06, "loss": 0.0002, "step": 8930 }, { "epoch": 3.64, "grad_norm": 0.0017067781882360578, "learning_rate": 5.445665445665445e-06, "loss": 0.0002, "step": 8940 }, { "epoch": 3.64, "grad_norm": 0.002276692306622863, "learning_rate": 5.42938542938543e-06, "loss": 0.0186, "step": 8950 }, { "epoch": 3.65, "grad_norm": 0.0017357119359076023, "learning_rate": 5.413105413105414e-06, "loss": 0.0002, "step": 8960 }, { "epoch": 3.65, "grad_norm": 0.0019965972751379013, "learning_rate": 5.396825396825397e-06, "loss": 0.0002, "step": 8970 }, { "epoch": 3.65, "grad_norm": 0.0017553390935063362, "learning_rate": 5.380545380545381e-06, "loss": 0.0003, "step": 8980 }, { "epoch": 3.66, "grad_norm": 0.0019675048533827066, "learning_rate": 5.364265364265364e-06, "loss": 0.0002, "step": 8990 }, { "epoch": 3.66, "grad_norm": 0.002049475908279419, "learning_rate": 5.347985347985348e-06, "loss": 0.0002, "step": 9000 }, { "epoch": 3.67, "grad_norm": 0.0019142305245622993, "learning_rate": 5.331705331705332e-06, "loss": 0.0002, "step": 9010 }, { "epoch": 3.67, "grad_norm": 0.0018189084948971868, "learning_rate": 5.315425315425315e-06, "loss": 0.042, "step": 9020 }, { "epoch": 3.68, "grad_norm": 0.0019228870514780283, "learning_rate": 5.2991452991453e-06, "loss": 0.0005, "step": 9030 }, { "epoch": 3.68, "grad_norm": 0.002307659713551402, "learning_rate": 5.282865282865284e-06, "loss": 0.0002, "step": 9040 }, { "epoch": 3.68, "grad_norm": 0.0021766172721982002, "learning_rate": 5.266585266585267e-06, "loss": 0.0002, "step": 9050 }, { "epoch": 3.69, "grad_norm": 0.0017359366174787283, "learning_rate": 5.250305250305251e-06, "loss": 0.0341, "step": 9060 }, { "epoch": 3.69, "grad_norm": 0.0017763186478987336, "learning_rate": 5.234025234025234e-06, "loss": 0.0002, "step": 9070 }, { "epoch": 3.7, "grad_norm": 0.001665986143052578, "learning_rate": 5.217745217745218e-06, "loss": 0.0008, "step": 9080 }, { "epoch": 3.7, "grad_norm": 0.0017538231331855059, "learning_rate": 5.201465201465202e-06, "loss": 0.0002, "step": 9090 }, { "epoch": 3.7, "grad_norm": 0.0016558667412027717, "learning_rate": 5.185185185185185e-06, "loss": 0.0002, "step": 9100 }, { "epoch": 3.71, "grad_norm": 0.0018909978680312634, "learning_rate": 5.16890516890517e-06, "loss": 0.0002, "step": 9110 }, { "epoch": 3.71, "grad_norm": 0.0017842132365331054, "learning_rate": 5.152625152625153e-06, "loss": 0.0002, "step": 9120 }, { "epoch": 3.72, "grad_norm": 0.0017819767817854881, "learning_rate": 5.136345136345137e-06, "loss": 0.0002, "step": 9130 }, { "epoch": 3.72, "grad_norm": 0.00168974872212857, "learning_rate": 5.120065120065121e-06, "loss": 0.0002, "step": 9140 }, { "epoch": 3.72, "grad_norm": 0.0017720448086038232, "learning_rate": 5.103785103785104e-06, "loss": 0.0002, "step": 9150 }, { "epoch": 3.73, "grad_norm": 0.0017071804031729698, "learning_rate": 5.087505087505088e-06, "loss": 0.0002, "step": 9160 }, { "epoch": 3.73, "grad_norm": 0.0018827036255970597, "learning_rate": 5.071225071225072e-06, "loss": 0.0002, "step": 9170 }, { "epoch": 3.74, "grad_norm": 0.0022226206492632627, "learning_rate": 5.054945054945055e-06, "loss": 0.0002, "step": 9180 }, { "epoch": 3.74, "grad_norm": 0.0019109738059341908, "learning_rate": 5.03866503866504e-06, "loss": 0.0494, "step": 9190 }, { "epoch": 3.74, "grad_norm": 0.0019527949625626206, "learning_rate": 5.022385022385023e-06, "loss": 0.0002, "step": 9200 }, { "epoch": 3.75, "grad_norm": 0.0020662578754127026, "learning_rate": 5.006105006105007e-06, "loss": 0.0415, "step": 9210 }, { "epoch": 3.75, "grad_norm": 0.002151912311092019, "learning_rate": 4.98982498982499e-06, "loss": 0.0003, "step": 9220 }, { "epoch": 3.76, "grad_norm": 0.001946290722116828, "learning_rate": 4.973544973544974e-06, "loss": 0.0002, "step": 9230 }, { "epoch": 3.76, "grad_norm": 0.001949216122739017, "learning_rate": 4.957264957264958e-06, "loss": 0.0002, "step": 9240 }, { "epoch": 3.76, "grad_norm": 0.002394681563600898, "learning_rate": 4.9409849409849416e-06, "loss": 0.0352, "step": 9250 }, { "epoch": 3.77, "grad_norm": 0.0022585808765143156, "learning_rate": 4.924704924704925e-06, "loss": 0.0471, "step": 9260 }, { "epoch": 3.77, "grad_norm": 0.002393248025327921, "learning_rate": 4.908424908424909e-06, "loss": 0.0002, "step": 9270 }, { "epoch": 3.78, "grad_norm": 0.02360522374510765, "learning_rate": 4.8921448921448925e-06, "loss": 0.0003, "step": 9280 }, { "epoch": 3.78, "grad_norm": 0.0023453827016055584, "learning_rate": 4.8758648758648765e-06, "loss": 0.0003, "step": 9290 }, { "epoch": 3.79, "grad_norm": 0.00220270873978734, "learning_rate": 4.8595848595848596e-06, "loss": 0.0003, "step": 9300 }, { "epoch": 3.79, "grad_norm": 0.0021812734194099903, "learning_rate": 4.8433048433048435e-06, "loss": 0.0003, "step": 9310 }, { "epoch": 3.79, "grad_norm": 0.0021124074701219797, "learning_rate": 4.8270248270248275e-06, "loss": 0.0002, "step": 9320 }, { "epoch": 3.8, "grad_norm": 0.002312874887138605, "learning_rate": 4.810744810744811e-06, "loss": 0.0003, "step": 9330 }, { "epoch": 3.8, "grad_norm": 0.0025071410927921534, "learning_rate": 4.7944647944647945e-06, "loss": 0.0002, "step": 9340 }, { "epoch": 3.81, "grad_norm": 0.0022760110441595316, "learning_rate": 4.7781847781847784e-06, "loss": 0.0003, "step": 9350 }, { "epoch": 3.81, "grad_norm": 0.002391684567555785, "learning_rate": 4.761904761904762e-06, "loss": 0.0002, "step": 9360 }, { "epoch": 3.81, "grad_norm": 0.0021324707195162773, "learning_rate": 4.745624745624746e-06, "loss": 0.0002, "step": 9370 }, { "epoch": 3.82, "grad_norm": 0.0021602713968604803, "learning_rate": 4.729344729344729e-06, "loss": 0.0002, "step": 9380 }, { "epoch": 3.82, "grad_norm": 0.003342408686876297, "learning_rate": 4.713064713064713e-06, "loss": 0.0002, "step": 9390 }, { "epoch": 3.83, "grad_norm": 0.003199818776920438, "learning_rate": 4.696784696784697e-06, "loss": 0.0002, "step": 9400 }, { "epoch": 3.83, "grad_norm": 0.002259862143546343, "learning_rate": 4.680504680504681e-06, "loss": 0.0005, "step": 9410 }, { "epoch": 3.83, "grad_norm": 0.0020261441823095083, "learning_rate": 4.664224664224664e-06, "loss": 0.0002, "step": 9420 }, { "epoch": 3.84, "grad_norm": 0.001844863872975111, "learning_rate": 4.647944647944648e-06, "loss": 0.0003, "step": 9430 }, { "epoch": 3.84, "grad_norm": 0.0022536173928529024, "learning_rate": 4.631664631664632e-06, "loss": 0.0002, "step": 9440 }, { "epoch": 3.85, "grad_norm": 0.001871871529147029, "learning_rate": 4.615384615384616e-06, "loss": 0.0003, "step": 9450 }, { "epoch": 3.85, "grad_norm": 0.0019549911376088858, "learning_rate": 4.599104599104599e-06, "loss": 0.0414, "step": 9460 }, { "epoch": 3.85, "grad_norm": 0.002595256781205535, "learning_rate": 4.582824582824583e-06, "loss": 0.0002, "step": 9470 }, { "epoch": 3.86, "grad_norm": 0.0021485532633960247, "learning_rate": 4.566544566544567e-06, "loss": 0.0002, "step": 9480 }, { "epoch": 3.86, "grad_norm": 0.0018896989058703184, "learning_rate": 4.55026455026455e-06, "loss": 0.0264, "step": 9490 }, { "epoch": 3.87, "grad_norm": 0.0023643136955797672, "learning_rate": 4.533984533984534e-06, "loss": 0.0003, "step": 9500 }, { "epoch": 3.87, "grad_norm": 0.0017869413131847978, "learning_rate": 4.517704517704518e-06, "loss": 0.0003, "step": 9510 }, { "epoch": 3.87, "grad_norm": 0.0022810434456914663, "learning_rate": 4.501424501424502e-06, "loss": 0.0002, "step": 9520 }, { "epoch": 3.88, "grad_norm": 0.0020936301443725824, "learning_rate": 4.485144485144485e-06, "loss": 0.0003, "step": 9530 }, { "epoch": 3.88, "grad_norm": 0.0017164949094876647, "learning_rate": 4.468864468864469e-06, "loss": 0.0147, "step": 9540 }, { "epoch": 3.89, "grad_norm": 0.008885451592504978, "learning_rate": 4.452584452584453e-06, "loss": 0.0003, "step": 9550 }, { "epoch": 3.89, "grad_norm": 0.20433610677719116, "learning_rate": 4.436304436304437e-06, "loss": 0.0004, "step": 9560 }, { "epoch": 3.89, "grad_norm": 0.0018083051545545459, "learning_rate": 4.42002442002442e-06, "loss": 0.0002, "step": 9570 }, { "epoch": 3.9, "grad_norm": 0.00233688997104764, "learning_rate": 4.403744403744404e-06, "loss": 0.0002, "step": 9580 }, { "epoch": 3.9, "grad_norm": 0.0020819292403757572, "learning_rate": 4.387464387464388e-06, "loss": 0.0012, "step": 9590 }, { "epoch": 3.91, "grad_norm": 0.0069807544350624084, "learning_rate": 4.371184371184372e-06, "loss": 0.0003, "step": 9600 }, { "epoch": 3.91, "grad_norm": 0.0027952860109508038, "learning_rate": 4.354904354904355e-06, "loss": 0.0002, "step": 9610 }, { "epoch": 3.92, "grad_norm": 0.0018937455024570227, "learning_rate": 4.338624338624339e-06, "loss": 0.0272, "step": 9620 }, { "epoch": 3.92, "grad_norm": 0.001811556052416563, "learning_rate": 4.322344322344323e-06, "loss": 0.0002, "step": 9630 }, { "epoch": 3.92, "grad_norm": 0.0017631722148507833, "learning_rate": 4.306064306064307e-06, "loss": 0.0002, "step": 9640 }, { "epoch": 3.93, "grad_norm": 0.001867889310233295, "learning_rate": 4.28978428978429e-06, "loss": 0.0197, "step": 9650 }, { "epoch": 3.93, "grad_norm": 0.0020562438294291496, "learning_rate": 4.273504273504274e-06, "loss": 0.0002, "step": 9660 }, { "epoch": 3.94, "grad_norm": 0.007918364368379116, "learning_rate": 4.257224257224258e-06, "loss": 0.0003, "step": 9670 }, { "epoch": 3.94, "grad_norm": 0.0026931529864668846, "learning_rate": 4.240944240944242e-06, "loss": 0.0003, "step": 9680 }, { "epoch": 3.94, "grad_norm": 0.002624350832775235, "learning_rate": 4.224664224664225e-06, "loss": 0.0003, "step": 9690 }, { "epoch": 3.95, "grad_norm": 0.001771993818692863, "learning_rate": 4.208384208384209e-06, "loss": 0.0002, "step": 9700 }, { "epoch": 3.95, "grad_norm": 0.010523835197091103, "learning_rate": 4.192104192104192e-06, "loss": 0.0003, "step": 9710 }, { "epoch": 3.96, "grad_norm": 0.0034396941773593426, "learning_rate": 4.175824175824177e-06, "loss": 0.0003, "step": 9720 }, { "epoch": 3.96, "grad_norm": 0.003138788277283311, "learning_rate": 4.15954415954416e-06, "loss": 0.0058, "step": 9730 }, { "epoch": 3.96, "grad_norm": 0.002142369979992509, "learning_rate": 4.143264143264144e-06, "loss": 0.0002, "step": 9740 }, { "epoch": 3.97, "grad_norm": 0.006518381182104349, "learning_rate": 4.126984126984127e-06, "loss": 0.0002, "step": 9750 }, { "epoch": 3.97, "grad_norm": 0.0019359017023816705, "learning_rate": 4.1107041107041116e-06, "loss": 0.0002, "step": 9760 }, { "epoch": 3.98, "grad_norm": 0.0018001939170062542, "learning_rate": 4.094424094424095e-06, "loss": 0.0002, "step": 9770 }, { "epoch": 3.98, "grad_norm": 0.002167722210288048, "learning_rate": 4.078144078144079e-06, "loss": 0.0002, "step": 9780 }, { "epoch": 3.98, "grad_norm": 0.008154891431331635, "learning_rate": 4.061864061864062e-06, "loss": 0.0002, "step": 9790 }, { "epoch": 3.99, "grad_norm": 0.001978978980332613, "learning_rate": 4.0455840455840465e-06, "loss": 0.0002, "step": 9800 }, { "epoch": 3.99, "grad_norm": 0.0018466059118509293, "learning_rate": 4.0293040293040296e-06, "loss": 0.0002, "step": 9810 }, { "epoch": 4.0, "grad_norm": 0.00179979985114187, "learning_rate": 4.0130240130240135e-06, "loss": 0.0002, "step": 9820 }, { "epoch": 4.0, "grad_norm": 0.002002492779865861, "learning_rate": 3.996743996743997e-06, "loss": 0.0002, "step": 9830 }, { "epoch": 4.0, "grad_norm": 0.0019970801658928394, "learning_rate": 3.9804639804639805e-06, "loss": 0.0002, "step": 9840 }, { "epoch": 4.01, "grad_norm": 0.0017706368817016482, "learning_rate": 3.9641839641839645e-06, "loss": 0.0002, "step": 9850 }, { "epoch": 4.01, "grad_norm": 0.0017488128505647182, "learning_rate": 3.9479039479039484e-06, "loss": 0.0003, "step": 9860 }, { "epoch": 4.02, "grad_norm": 0.0025758370757102966, "learning_rate": 3.9316239316239315e-06, "loss": 0.0002, "step": 9870 }, { "epoch": 4.02, "grad_norm": 0.002105166669934988, "learning_rate": 3.9153439153439155e-06, "loss": 0.0002, "step": 9880 }, { "epoch": 4.03, "grad_norm": 0.0027692352887243032, "learning_rate": 3.899063899063899e-06, "loss": 0.0043, "step": 9890 }, { "epoch": 4.03, "grad_norm": 0.0020704329945147038, "learning_rate": 3.882783882783883e-06, "loss": 0.0002, "step": 9900 }, { "epoch": 4.03, "grad_norm": 0.0019208292942494154, "learning_rate": 3.8665038665038664e-06, "loss": 0.0002, "step": 9910 }, { "epoch": 4.04, "grad_norm": 0.0017399511998519301, "learning_rate": 3.85022385022385e-06, "loss": 0.0002, "step": 9920 }, { "epoch": 4.04, "grad_norm": 0.0017688291845843196, "learning_rate": 3.833943833943834e-06, "loss": 0.0002, "step": 9930 }, { "epoch": 4.05, "grad_norm": 4.471590995788574, "learning_rate": 3.817663817663818e-06, "loss": 0.0023, "step": 9940 }, { "epoch": 4.05, "grad_norm": 0.0016602250980213284, "learning_rate": 3.8013838013838018e-06, "loss": 0.0002, "step": 9950 }, { "epoch": 4.05, "grad_norm": 0.001645643264055252, "learning_rate": 3.7851037851037853e-06, "loss": 0.0002, "step": 9960 }, { "epoch": 4.06, "grad_norm": 0.0017087948508560658, "learning_rate": 3.768823768823769e-06, "loss": 0.0002, "step": 9970 }, { "epoch": 4.06, "grad_norm": 0.002038088161498308, "learning_rate": 3.752543752543753e-06, "loss": 0.0002, "step": 9980 }, { "epoch": 4.07, "grad_norm": 0.0071817911230027676, "learning_rate": 3.7362637362637367e-06, "loss": 0.0002, "step": 9990 }, { "epoch": 4.07, "grad_norm": 0.0021325184497982264, "learning_rate": 3.7199837199837202e-06, "loss": 0.0002, "step": 10000 }, { "epoch": 4.07, "grad_norm": 0.001710103009827435, "learning_rate": 3.7037037037037037e-06, "loss": 0.0002, "step": 10010 }, { "epoch": 4.08, "grad_norm": 0.0015926583437249064, "learning_rate": 3.687423687423688e-06, "loss": 0.0003, "step": 10020 }, { "epoch": 4.08, "grad_norm": 0.0016407363582402468, "learning_rate": 3.6711436711436716e-06, "loss": 0.0002, "step": 10030 }, { "epoch": 4.09, "grad_norm": 0.005499332211911678, "learning_rate": 3.654863654863655e-06, "loss": 0.0002, "step": 10040 }, { "epoch": 4.09, "grad_norm": 0.0018358811503276229, "learning_rate": 3.6385836385836387e-06, "loss": 0.0002, "step": 10050 }, { "epoch": 4.09, "grad_norm": 0.0016708581242710352, "learning_rate": 3.622303622303623e-06, "loss": 0.0002, "step": 10060 }, { "epoch": 4.1, "grad_norm": 0.0017990090418606997, "learning_rate": 3.6060236060236065e-06, "loss": 0.0002, "step": 10070 }, { "epoch": 4.1, "grad_norm": 0.0018943555187433958, "learning_rate": 3.58974358974359e-06, "loss": 0.0002, "step": 10080 }, { "epoch": 4.11, "grad_norm": 0.0016270867781713605, "learning_rate": 3.5734635734635736e-06, "loss": 0.0002, "step": 10090 }, { "epoch": 4.11, "grad_norm": 0.009296965785324574, "learning_rate": 3.557183557183557e-06, "loss": 0.0002, "step": 10100 }, { "epoch": 4.11, "grad_norm": 0.0016765177715569735, "learning_rate": 3.5409035409035415e-06, "loss": 0.0002, "step": 10110 }, { "epoch": 4.12, "grad_norm": 0.004676634445786476, "learning_rate": 3.524623524623525e-06, "loss": 0.0002, "step": 10120 }, { "epoch": 4.12, "grad_norm": 0.001882671844214201, "learning_rate": 3.5083435083435085e-06, "loss": 0.0011, "step": 10130 }, { "epoch": 4.13, "grad_norm": 0.0016701704589650035, "learning_rate": 3.492063492063492e-06, "loss": 0.0002, "step": 10140 }, { "epoch": 4.13, "grad_norm": 0.0018036847468465567, "learning_rate": 3.4757834757834764e-06, "loss": 0.0002, "step": 10150 }, { "epoch": 4.14, "grad_norm": 0.0019449255196377635, "learning_rate": 3.45950345950346e-06, "loss": 0.0002, "step": 10160 }, { "epoch": 4.14, "grad_norm": 0.0023109372705221176, "learning_rate": 3.4432234432234434e-06, "loss": 0.0002, "step": 10170 }, { "epoch": 4.14, "grad_norm": 0.001794449402950704, "learning_rate": 3.426943426943427e-06, "loss": 0.0002, "step": 10180 }, { "epoch": 4.15, "grad_norm": 0.0016366175841540098, "learning_rate": 3.410663410663411e-06, "loss": 0.0002, "step": 10190 }, { "epoch": 4.15, "grad_norm": 0.0022932947613298893, "learning_rate": 3.394383394383395e-06, "loss": 0.0002, "step": 10200 }, { "epoch": 4.16, "grad_norm": 0.003153660800307989, "learning_rate": 3.3781033781033783e-06, "loss": 0.0002, "step": 10210 }, { "epoch": 4.16, "grad_norm": 0.0018573219422250986, "learning_rate": 3.361823361823362e-06, "loss": 0.0002, "step": 10220 }, { "epoch": 4.16, "grad_norm": 0.0016019688919186592, "learning_rate": 3.345543345543346e-06, "loss": 0.0002, "step": 10230 }, { "epoch": 4.17, "grad_norm": 0.0016897093737497926, "learning_rate": 3.3292633292633297e-06, "loss": 0.0002, "step": 10240 }, { "epoch": 4.17, "grad_norm": 0.0018914591055363417, "learning_rate": 3.3129833129833133e-06, "loss": 0.0003, "step": 10250 }, { "epoch": 4.18, "grad_norm": 0.0018889600178226829, "learning_rate": 3.2967032967032968e-06, "loss": 0.0002, "step": 10260 }, { "epoch": 4.18, "grad_norm": 0.00160633132327348, "learning_rate": 3.2804232804232807e-06, "loss": 0.0002, "step": 10270 }, { "epoch": 4.18, "grad_norm": 0.00516732269898057, "learning_rate": 3.2641432641432647e-06, "loss": 0.0002, "step": 10280 }, { "epoch": 4.19, "grad_norm": 0.0015665347455069423, "learning_rate": 3.247863247863248e-06, "loss": 0.0002, "step": 10290 }, { "epoch": 4.19, "grad_norm": 0.0016588406870141625, "learning_rate": 3.2315832315832317e-06, "loss": 0.0002, "step": 10300 }, { "epoch": 4.2, "grad_norm": 0.00242376746609807, "learning_rate": 3.2153032153032156e-06, "loss": 0.0002, "step": 10310 }, { "epoch": 4.2, "grad_norm": 0.0070383488200604916, "learning_rate": 3.199023199023199e-06, "loss": 0.0002, "step": 10320 }, { "epoch": 4.2, "grad_norm": 0.0019135623006150126, "learning_rate": 3.182743182743183e-06, "loss": 0.0002, "step": 10330 }, { "epoch": 4.21, "grad_norm": 0.0018966845236718655, "learning_rate": 3.1664631664631666e-06, "loss": 0.0002, "step": 10340 }, { "epoch": 4.21, "grad_norm": 0.0014899246161803603, "learning_rate": 3.1501831501831505e-06, "loss": 0.0002, "step": 10350 }, { "epoch": 4.22, "grad_norm": 0.001564052072353661, "learning_rate": 3.133903133903134e-06, "loss": 0.0002, "step": 10360 }, { "epoch": 4.22, "grad_norm": 0.001840132987126708, "learning_rate": 3.117623117623118e-06, "loss": 0.0002, "step": 10370 }, { "epoch": 4.22, "grad_norm": 0.0020550840999931097, "learning_rate": 3.1013431013431015e-06, "loss": 0.0002, "step": 10380 }, { "epoch": 4.23, "grad_norm": 0.0018264094833284616, "learning_rate": 3.0850630850630855e-06, "loss": 0.0002, "step": 10390 }, { "epoch": 4.23, "grad_norm": 0.001516546355560422, "learning_rate": 3.068783068783069e-06, "loss": 0.0002, "step": 10400 }, { "epoch": 4.24, "grad_norm": 0.0016487749526277184, "learning_rate": 3.052503052503053e-06, "loss": 0.0002, "step": 10410 }, { "epoch": 4.24, "grad_norm": 0.0016116101760417223, "learning_rate": 3.0362230362230364e-06, "loss": 0.0002, "step": 10420 }, { "epoch": 4.25, "grad_norm": 0.001680860761553049, "learning_rate": 3.0199430199430204e-06, "loss": 0.0002, "step": 10430 }, { "epoch": 4.25, "grad_norm": 0.002029112773016095, "learning_rate": 3.003663003663004e-06, "loss": 0.0002, "step": 10440 }, { "epoch": 4.25, "grad_norm": 0.002056869911029935, "learning_rate": 2.9873829873829874e-06, "loss": 0.0002, "step": 10450 }, { "epoch": 4.26, "grad_norm": 0.0016365089686587453, "learning_rate": 2.9711029711029714e-06, "loss": 0.0017, "step": 10460 }, { "epoch": 4.26, "grad_norm": 0.001570598571561277, "learning_rate": 2.9548229548229553e-06, "loss": 0.0002, "step": 10470 }, { "epoch": 4.27, "grad_norm": 0.0019338660640642047, "learning_rate": 2.938542938542939e-06, "loss": 0.0002, "step": 10480 }, { "epoch": 4.27, "grad_norm": 0.001604044926352799, "learning_rate": 2.9222629222629223e-06, "loss": 0.0002, "step": 10490 }, { "epoch": 4.27, "grad_norm": 0.0015405503800138831, "learning_rate": 2.9059829059829063e-06, "loss": 0.0003, "step": 10500 }, { "epoch": 4.28, "grad_norm": 0.001597168273292482, "learning_rate": 2.8897028897028902e-06, "loss": 0.0002, "step": 10510 }, { "epoch": 4.28, "grad_norm": 0.001601763884536922, "learning_rate": 2.8734228734228737e-06, "loss": 0.0002, "step": 10520 }, { "epoch": 4.29, "grad_norm": 0.0014684420311823487, "learning_rate": 2.8571428571428573e-06, "loss": 0.0002, "step": 10530 }, { "epoch": 4.29, "grad_norm": 0.0019548002164810896, "learning_rate": 2.840862840862841e-06, "loss": 0.0002, "step": 10540 }, { "epoch": 4.29, "grad_norm": 0.0019341334700584412, "learning_rate": 2.824582824582825e-06, "loss": 0.0002, "step": 10550 }, { "epoch": 4.3, "grad_norm": 0.0015359672252088785, "learning_rate": 2.8083028083028087e-06, "loss": 0.0002, "step": 10560 }, { "epoch": 4.3, "grad_norm": 0.001660957932472229, "learning_rate": 2.792022792022792e-06, "loss": 0.0002, "step": 10570 }, { "epoch": 4.31, "grad_norm": 0.002642634091898799, "learning_rate": 2.7757427757427757e-06, "loss": 0.0002, "step": 10580 }, { "epoch": 4.31, "grad_norm": 0.001577245187945664, "learning_rate": 2.75946275946276e-06, "loss": 0.0002, "step": 10590 }, { "epoch": 4.31, "grad_norm": 0.0016623500268906355, "learning_rate": 2.7431827431827436e-06, "loss": 0.0274, "step": 10600 }, { "epoch": 4.32, "grad_norm": 0.001559157157316804, "learning_rate": 2.726902726902727e-06, "loss": 0.0002, "step": 10610 }, { "epoch": 4.32, "grad_norm": 0.0015379212563857436, "learning_rate": 2.7106227106227106e-06, "loss": 0.0002, "step": 10620 }, { "epoch": 4.33, "grad_norm": 0.001682962873019278, "learning_rate": 2.694342694342695e-06, "loss": 0.0002, "step": 10630 }, { "epoch": 4.33, "grad_norm": 0.0015784628922119737, "learning_rate": 2.6780626780626785e-06, "loss": 0.0002, "step": 10640 }, { "epoch": 4.33, "grad_norm": 0.0015349604655057192, "learning_rate": 2.661782661782662e-06, "loss": 0.0002, "step": 10650 }, { "epoch": 4.34, "grad_norm": 0.0015412438660860062, "learning_rate": 2.6455026455026455e-06, "loss": 0.0002, "step": 10660 }, { "epoch": 4.34, "grad_norm": 0.0016461275517940521, "learning_rate": 2.629222629222629e-06, "loss": 0.0002, "step": 10670 }, { "epoch": 4.35, "grad_norm": 0.0016684934962540865, "learning_rate": 2.6129426129426134e-06, "loss": 0.0002, "step": 10680 }, { "epoch": 4.35, "grad_norm": 0.0015019102720543742, "learning_rate": 2.596662596662597e-06, "loss": 0.0002, "step": 10690 }, { "epoch": 4.35, "grad_norm": 0.0015912950038909912, "learning_rate": 2.5803825803825804e-06, "loss": 0.0002, "step": 10700 }, { "epoch": 4.36, "grad_norm": 0.002051288727670908, "learning_rate": 2.564102564102564e-06, "loss": 0.0002, "step": 10710 }, { "epoch": 4.36, "grad_norm": 0.0014287488302215934, "learning_rate": 2.5478225478225483e-06, "loss": 0.0002, "step": 10720 }, { "epoch": 4.37, "grad_norm": 0.0014953837962821126, "learning_rate": 2.531542531542532e-06, "loss": 0.0002, "step": 10730 }, { "epoch": 4.37, "grad_norm": 0.0016842116601765156, "learning_rate": 2.5152625152625154e-06, "loss": 0.0002, "step": 10740 }, { "epoch": 4.38, "grad_norm": 0.0016165722627192736, "learning_rate": 2.4989824989824993e-06, "loss": 0.0004, "step": 10750 }, { "epoch": 4.38, "grad_norm": 0.0016578533686697483, "learning_rate": 2.482702482702483e-06, "loss": 0.0002, "step": 10760 }, { "epoch": 4.38, "grad_norm": 0.001627171179279685, "learning_rate": 2.4664224664224668e-06, "loss": 0.0002, "step": 10770 }, { "epoch": 4.39, "grad_norm": 0.0029889908619225025, "learning_rate": 2.4501424501424503e-06, "loss": 0.0002, "step": 10780 }, { "epoch": 4.39, "grad_norm": 0.0015365415019914508, "learning_rate": 2.433862433862434e-06, "loss": 0.0002, "step": 10790 }, { "epoch": 4.4, "grad_norm": 0.0019263201393187046, "learning_rate": 2.4175824175824177e-06, "loss": 0.0002, "step": 10800 }, { "epoch": 4.4, "grad_norm": 0.001516710501164198, "learning_rate": 2.4013024013024013e-06, "loss": 0.0002, "step": 10810 }, { "epoch": 4.4, "grad_norm": 0.001614395878277719, "learning_rate": 2.385022385022385e-06, "loss": 0.0002, "step": 10820 }, { "epoch": 4.41, "grad_norm": 0.0014490768080577254, "learning_rate": 2.3687423687423687e-06, "loss": 0.0004, "step": 10830 }, { "epoch": 4.41, "grad_norm": 0.0015428679762408137, "learning_rate": 2.3524623524623527e-06, "loss": 0.0002, "step": 10840 }, { "epoch": 4.42, "grad_norm": 0.0015440605347976089, "learning_rate": 2.336182336182336e-06, "loss": 0.0004, "step": 10850 }, { "epoch": 4.42, "grad_norm": 0.00148781796451658, "learning_rate": 2.31990231990232e-06, "loss": 0.0002, "step": 10860 }, { "epoch": 4.42, "grad_norm": 0.0015348844463005662, "learning_rate": 2.3036223036223036e-06, "loss": 0.0002, "step": 10870 }, { "epoch": 4.43, "grad_norm": 0.001880201743915677, "learning_rate": 2.2873422873422876e-06, "loss": 0.0002, "step": 10880 }, { "epoch": 4.43, "grad_norm": 0.001558057265356183, "learning_rate": 2.271062271062271e-06, "loss": 0.0002, "step": 10890 }, { "epoch": 4.44, "grad_norm": 0.010920335538685322, "learning_rate": 2.254782254782255e-06, "loss": 0.0002, "step": 10900 }, { "epoch": 4.44, "grad_norm": 0.0014644470065832138, "learning_rate": 2.2385022385022386e-06, "loss": 0.0002, "step": 10910 }, { "epoch": 4.44, "grad_norm": 0.0014618238201364875, "learning_rate": 2.222222222222222e-06, "loss": 0.0002, "step": 10920 }, { "epoch": 4.45, "grad_norm": 0.0016169185983017087, "learning_rate": 2.205942205942206e-06, "loss": 0.0002, "step": 10930 }, { "epoch": 4.45, "grad_norm": 0.0014386329567059875, "learning_rate": 2.1896621896621895e-06, "loss": 0.0002, "step": 10940 }, { "epoch": 4.46, "grad_norm": 0.0015079034492373466, "learning_rate": 2.1733821733821735e-06, "loss": 0.0002, "step": 10950 }, { "epoch": 4.46, "grad_norm": 0.00197400595061481, "learning_rate": 2.157102157102157e-06, "loss": 0.0002, "step": 10960 }, { "epoch": 4.46, "grad_norm": 0.001524322316981852, "learning_rate": 2.140822140822141e-06, "loss": 0.0311, "step": 10970 }, { "epoch": 4.47, "grad_norm": 0.0014644163893535733, "learning_rate": 2.1245421245421245e-06, "loss": 0.0002, "step": 10980 }, { "epoch": 4.47, "grad_norm": 0.0014774493174627423, "learning_rate": 2.1082621082621084e-06, "loss": 0.0002, "step": 10990 }, { "epoch": 4.48, "grad_norm": 0.0014835140900686383, "learning_rate": 2.091982091982092e-06, "loss": 0.0002, "step": 11000 }, { "epoch": 4.48, "grad_norm": 0.001458540791645646, "learning_rate": 2.075702075702076e-06, "loss": 0.0002, "step": 11010 }, { "epoch": 4.49, "grad_norm": 0.002432051347568631, "learning_rate": 2.05942205942206e-06, "loss": 0.0002, "step": 11020 }, { "epoch": 4.49, "grad_norm": 0.001562487450428307, "learning_rate": 2.0431420431420433e-06, "loss": 0.0353, "step": 11030 }, { "epoch": 4.49, "grad_norm": 0.0016052748542279005, "learning_rate": 2.0268620268620273e-06, "loss": 0.0002, "step": 11040 }, { "epoch": 4.5, "grad_norm": 0.001545790466479957, "learning_rate": 2.0105820105820108e-06, "loss": 0.0002, "step": 11050 }, { "epoch": 4.5, "grad_norm": 0.001812846981920302, "learning_rate": 1.9943019943019947e-06, "loss": 0.0002, "step": 11060 }, { "epoch": 4.51, "grad_norm": 0.0017415074398741126, "learning_rate": 1.9780219780219782e-06, "loss": 0.0002, "step": 11070 }, { "epoch": 4.51, "grad_norm": 0.0016338001005351543, "learning_rate": 1.961741961741962e-06, "loss": 0.0002, "step": 11080 }, { "epoch": 4.51, "grad_norm": 0.0014169925125315785, "learning_rate": 1.9454619454619457e-06, "loss": 0.0006, "step": 11090 }, { "epoch": 4.52, "grad_norm": 0.0016671591438353062, "learning_rate": 1.9291819291819296e-06, "loss": 0.0002, "step": 11100 }, { "epoch": 4.52, "grad_norm": 0.0033444638829678297, "learning_rate": 1.912901912901913e-06, "loss": 0.0002, "step": 11110 }, { "epoch": 4.53, "grad_norm": 0.0015689071733504534, "learning_rate": 1.8966218966218969e-06, "loss": 0.0002, "step": 11120 }, { "epoch": 4.53, "grad_norm": 0.0018193925498053432, "learning_rate": 1.8803418803418804e-06, "loss": 0.0002, "step": 11130 }, { "epoch": 4.53, "grad_norm": 0.0015975474379956722, "learning_rate": 1.8640618640618643e-06, "loss": 0.0002, "step": 11140 }, { "epoch": 4.54, "grad_norm": 0.0015228153206408024, "learning_rate": 1.8477818477818479e-06, "loss": 0.0002, "step": 11150 }, { "epoch": 4.54, "grad_norm": 0.0017481072572991252, "learning_rate": 1.8315018315018316e-06, "loss": 0.0002, "step": 11160 }, { "epoch": 4.55, "grad_norm": 0.0014254804700613022, "learning_rate": 1.8152218152218153e-06, "loss": 0.0002, "step": 11170 }, { "epoch": 4.55, "grad_norm": 0.0014639191795140505, "learning_rate": 1.798941798941799e-06, "loss": 0.0002, "step": 11180 }, { "epoch": 4.55, "grad_norm": 0.0014739630278199911, "learning_rate": 1.7826617826617828e-06, "loss": 0.0002, "step": 11190 }, { "epoch": 4.56, "grad_norm": 0.001486291061155498, "learning_rate": 1.7663817663817665e-06, "loss": 0.0002, "step": 11200 }, { "epoch": 4.56, "grad_norm": 0.0021130377426743507, "learning_rate": 1.7501017501017502e-06, "loss": 0.0002, "step": 11210 }, { "epoch": 4.57, "grad_norm": 0.0014680501772090793, "learning_rate": 1.733821733821734e-06, "loss": 0.0002, "step": 11220 }, { "epoch": 4.57, "grad_norm": 0.0018635701853781939, "learning_rate": 1.7175417175417177e-06, "loss": 0.0002, "step": 11230 }, { "epoch": 4.57, "grad_norm": 0.0015968162333592772, "learning_rate": 1.7012617012617014e-06, "loss": 0.0094, "step": 11240 }, { "epoch": 4.58, "grad_norm": 0.0017093609785661101, "learning_rate": 1.6849816849816852e-06, "loss": 0.0002, "step": 11250 }, { "epoch": 4.58, "grad_norm": 0.0014892058679834008, "learning_rate": 1.6687016687016689e-06, "loss": 0.0139, "step": 11260 }, { "epoch": 4.59, "grad_norm": 0.0014219109434634447, "learning_rate": 1.6524216524216524e-06, "loss": 0.0002, "step": 11270 }, { "epoch": 4.59, "grad_norm": 0.004563894122838974, "learning_rate": 1.6361416361416363e-06, "loss": 0.0002, "step": 11280 }, { "epoch": 4.6, "grad_norm": 0.0014352177968248725, "learning_rate": 1.6198616198616199e-06, "loss": 0.0002, "step": 11290 }, { "epoch": 4.6, "grad_norm": 0.001390959369018674, "learning_rate": 1.6035816035816038e-06, "loss": 0.0002, "step": 11300 }, { "epoch": 4.6, "grad_norm": 0.0038959532976150513, "learning_rate": 1.5873015873015873e-06, "loss": 0.0002, "step": 11310 }, { "epoch": 4.61, "grad_norm": 0.001589680789038539, "learning_rate": 1.5710215710215713e-06, "loss": 0.0002, "step": 11320 }, { "epoch": 4.61, "grad_norm": 0.001737966202199459, "learning_rate": 1.5547415547415548e-06, "loss": 0.0002, "step": 11330 }, { "epoch": 4.62, "grad_norm": 0.0014157581608742476, "learning_rate": 1.5384615384615387e-06, "loss": 0.0002, "step": 11340 }, { "epoch": 4.62, "grad_norm": 0.0018974760314449668, "learning_rate": 1.5221815221815222e-06, "loss": 0.0002, "step": 11350 }, { "epoch": 4.62, "grad_norm": 0.0015809000469744205, "learning_rate": 1.5059015059015062e-06, "loss": 0.0002, "step": 11360 }, { "epoch": 4.63, "grad_norm": 0.01224368717521429, "learning_rate": 1.4896214896214897e-06, "loss": 0.0002, "step": 11370 }, { "epoch": 4.63, "grad_norm": 0.0015656572068110108, "learning_rate": 1.4733414733414736e-06, "loss": 0.0002, "step": 11380 }, { "epoch": 4.64, "grad_norm": 0.0015062758466228843, "learning_rate": 1.4570614570614572e-06, "loss": 0.0002, "step": 11390 }, { "epoch": 4.64, "grad_norm": 0.001575302449055016, "learning_rate": 1.4407814407814407e-06, "loss": 0.0002, "step": 11400 }, { "epoch": 4.64, "grad_norm": 0.0014867339050397277, "learning_rate": 1.4245014245014246e-06, "loss": 0.0002, "step": 11410 }, { "epoch": 4.65, "grad_norm": 0.0014463013503700495, "learning_rate": 1.4082214082214083e-06, "loss": 0.0002, "step": 11420 }, { "epoch": 4.65, "grad_norm": 0.0014738457975909114, "learning_rate": 1.391941391941392e-06, "loss": 0.0002, "step": 11430 }, { "epoch": 4.66, "grad_norm": 0.0033326647244393826, "learning_rate": 1.3756613756613758e-06, "loss": 0.0002, "step": 11440 }, { "epoch": 4.66, "grad_norm": 0.0014288641978055239, "learning_rate": 1.3593813593813595e-06, "loss": 0.0002, "step": 11450 }, { "epoch": 4.66, "grad_norm": 0.0015226053074002266, "learning_rate": 1.3431013431013433e-06, "loss": 0.0002, "step": 11460 }, { "epoch": 4.67, "grad_norm": 0.0014885494019836187, "learning_rate": 1.326821326821327e-06, "loss": 0.0002, "step": 11470 }, { "epoch": 4.67, "grad_norm": 0.0014367675175890326, "learning_rate": 1.3105413105413107e-06, "loss": 0.0002, "step": 11480 }, { "epoch": 4.68, "grad_norm": 0.0014275162247940898, "learning_rate": 1.2942612942612944e-06, "loss": 0.0002, "step": 11490 }, { "epoch": 4.68, "grad_norm": 0.0014797335024923086, "learning_rate": 1.2779812779812782e-06, "loss": 0.0002, "step": 11500 }, { "epoch": 4.68, "grad_norm": 0.0014295239234343171, "learning_rate": 1.2617012617012617e-06, "loss": 0.0002, "step": 11510 }, { "epoch": 4.69, "grad_norm": 0.0014915807405486703, "learning_rate": 1.2454212454212456e-06, "loss": 0.0002, "step": 11520 }, { "epoch": 4.69, "grad_norm": 0.0016227615997195244, "learning_rate": 1.2291412291412294e-06, "loss": 0.0002, "step": 11530 }, { "epoch": 4.7, "grad_norm": 0.0014580420684069395, "learning_rate": 1.212861212861213e-06, "loss": 0.0002, "step": 11540 }, { "epoch": 4.7, "grad_norm": 0.0017063523409888148, "learning_rate": 1.1965811965811968e-06, "loss": 0.0002, "step": 11550 }, { "epoch": 4.7, "grad_norm": 0.0014365671668201685, "learning_rate": 1.1803011803011806e-06, "loss": 0.0002, "step": 11560 }, { "epoch": 4.71, "grad_norm": 1.2260816097259521, "learning_rate": 1.164021164021164e-06, "loss": 0.006, "step": 11570 }, { "epoch": 4.71, "grad_norm": 0.0018874687375500798, "learning_rate": 1.1477411477411478e-06, "loss": 0.0002, "step": 11580 }, { "epoch": 4.72, "grad_norm": 0.0013750126818194985, "learning_rate": 1.1314611314611315e-06, "loss": 0.0002, "step": 11590 }, { "epoch": 4.72, "grad_norm": 0.001419686945155263, "learning_rate": 1.1151811151811153e-06, "loss": 0.0002, "step": 11600 }, { "epoch": 4.73, "grad_norm": 0.0014118729159235954, "learning_rate": 1.098901098901099e-06, "loss": 0.0002, "step": 11610 }, { "epoch": 4.73, "grad_norm": 0.0014139912091195583, "learning_rate": 1.0826210826210827e-06, "loss": 0.0002, "step": 11620 }, { "epoch": 4.73, "grad_norm": 0.0015478282002732158, "learning_rate": 1.0663410663410665e-06, "loss": 0.0002, "step": 11630 }, { "epoch": 4.74, "grad_norm": 0.0015366330044344068, "learning_rate": 1.0500610500610502e-06, "loss": 0.0002, "step": 11640 }, { "epoch": 4.74, "grad_norm": 0.001490729977376759, "learning_rate": 1.033781033781034e-06, "loss": 0.0002, "step": 11650 }, { "epoch": 4.75, "grad_norm": 0.001423732377588749, "learning_rate": 1.0175010175010176e-06, "loss": 0.0002, "step": 11660 }, { "epoch": 4.75, "grad_norm": 0.0014315071748569608, "learning_rate": 1.0012210012210014e-06, "loss": 0.0002, "step": 11670 }, { "epoch": 4.75, "grad_norm": 0.001486779423430562, "learning_rate": 9.84940984940985e-07, "loss": 0.0002, "step": 11680 }, { "epoch": 4.76, "grad_norm": 0.004522906616330147, "learning_rate": 9.686609686609686e-07, "loss": 0.0002, "step": 11690 }, { "epoch": 4.76, "grad_norm": 0.0014580250717699528, "learning_rate": 9.523809523809525e-07, "loss": 0.0002, "step": 11700 }, { "epoch": 4.77, "grad_norm": 0.0015162237687036395, "learning_rate": 9.361009361009362e-07, "loss": 0.0092, "step": 11710 }, { "epoch": 4.77, "grad_norm": 0.0013567224377766252, "learning_rate": 9.198209198209199e-07, "loss": 0.0002, "step": 11720 }, { "epoch": 4.77, "grad_norm": 0.0014696550788357854, "learning_rate": 9.035409035409036e-07, "loss": 0.0002, "step": 11730 }, { "epoch": 4.78, "grad_norm": 0.0014795665629208088, "learning_rate": 8.872608872608874e-07, "loss": 0.0002, "step": 11740 }, { "epoch": 4.78, "grad_norm": 0.0015325862914323807, "learning_rate": 8.709808709808711e-07, "loss": 0.0002, "step": 11750 }, { "epoch": 4.79, "grad_norm": 0.0014404217945411801, "learning_rate": 8.547008547008548e-07, "loss": 0.0002, "step": 11760 }, { "epoch": 4.79, "grad_norm": 0.0019404751947149634, "learning_rate": 8.384208384208386e-07, "loss": 0.0002, "step": 11770 }, { "epoch": 4.79, "grad_norm": 0.0016487601678818464, "learning_rate": 8.221408221408223e-07, "loss": 0.0002, "step": 11780 }, { "epoch": 4.8, "grad_norm": 0.0020900049712508917, "learning_rate": 8.05860805860806e-07, "loss": 0.0002, "step": 11790 }, { "epoch": 4.8, "grad_norm": 0.044903818517923355, "learning_rate": 7.895807895807897e-07, "loss": 0.0002, "step": 11800 }, { "epoch": 4.81, "grad_norm": 0.006237754598259926, "learning_rate": 7.733007733007733e-07, "loss": 0.0002, "step": 11810 }, { "epoch": 4.81, "grad_norm": 0.001496842596679926, "learning_rate": 7.57020757020757e-07, "loss": 0.0002, "step": 11820 }, { "epoch": 4.81, "grad_norm": 0.0014312907587736845, "learning_rate": 7.407407407407407e-07, "loss": 0.0002, "step": 11830 }, { "epoch": 4.82, "grad_norm": 0.0020331472624093294, "learning_rate": 7.244607244607245e-07, "loss": 0.0002, "step": 11840 }, { "epoch": 4.82, "grad_norm": 0.0015430136118084192, "learning_rate": 7.081807081807082e-07, "loss": 0.0002, "step": 11850 }, { "epoch": 4.83, "grad_norm": 0.0014734879368916154, "learning_rate": 6.919006919006919e-07, "loss": 0.0002, "step": 11860 }, { "epoch": 4.83, "grad_norm": 0.006134878844022751, "learning_rate": 6.756206756206756e-07, "loss": 0.0002, "step": 11870 }, { "epoch": 4.84, "grad_norm": 0.0013609755551442504, "learning_rate": 6.593406593406594e-07, "loss": 0.0002, "step": 11880 }, { "epoch": 4.84, "grad_norm": 0.002070717280730605, "learning_rate": 6.430606430606431e-07, "loss": 0.0002, "step": 11890 }, { "epoch": 4.84, "grad_norm": 0.0014169508358463645, "learning_rate": 6.267806267806268e-07, "loss": 0.0002, "step": 11900 }, { "epoch": 4.85, "grad_norm": 0.0014770817942917347, "learning_rate": 6.105006105006106e-07, "loss": 0.0002, "step": 11910 }, { "epoch": 4.85, "grad_norm": 0.0014419537037611008, "learning_rate": 5.942205942205943e-07, "loss": 0.0002, "step": 11920 }, { "epoch": 4.86, "grad_norm": 0.001446893555112183, "learning_rate": 5.77940577940578e-07, "loss": 0.0002, "step": 11930 }, { "epoch": 4.86, "grad_norm": 0.001416919520124793, "learning_rate": 5.616605616605618e-07, "loss": 0.0002, "step": 11940 }, { "epoch": 4.86, "grad_norm": 0.0034949700348079205, "learning_rate": 5.453805453805455e-07, "loss": 0.0002, "step": 11950 }, { "epoch": 4.87, "grad_norm": 0.0014441277598962188, "learning_rate": 5.291005291005291e-07, "loss": 0.0002, "step": 11960 }, { "epoch": 4.87, "grad_norm": 0.0015632550930604339, "learning_rate": 5.128205128205128e-07, "loss": 0.0002, "step": 11970 }, { "epoch": 4.88, "grad_norm": 0.001399176544509828, "learning_rate": 4.965404965404966e-07, "loss": 0.0002, "step": 11980 }, { "epoch": 4.88, "grad_norm": 0.0013975553447380662, "learning_rate": 4.802604802604803e-07, "loss": 0.0002, "step": 11990 }, { "epoch": 4.88, "grad_norm": 0.0013712114887312055, "learning_rate": 4.63980463980464e-07, "loss": 0.0002, "step": 12000 }, { "epoch": 4.89, "grad_norm": 0.001828977488912642, "learning_rate": 4.4770044770044775e-07, "loss": 0.0002, "step": 12010 }, { "epoch": 4.89, "grad_norm": 0.0014294543070718646, "learning_rate": 4.3142043142043143e-07, "loss": 0.0002, "step": 12020 }, { "epoch": 4.9, "grad_norm": 0.0013922780053690076, "learning_rate": 4.1514041514041516e-07, "loss": 0.0002, "step": 12030 }, { "epoch": 4.9, "grad_norm": 0.0016130340518429875, "learning_rate": 3.988603988603989e-07, "loss": 0.0002, "step": 12040 }, { "epoch": 4.9, "grad_norm": 0.0013872876297682524, "learning_rate": 3.825803825803826e-07, "loss": 0.0002, "step": 12050 }, { "epoch": 4.91, "grad_norm": 0.0014586036559194326, "learning_rate": 3.6630036630036635e-07, "loss": 0.0002, "step": 12060 }, { "epoch": 4.91, "grad_norm": 0.0014334677252918482, "learning_rate": 3.500203500203501e-07, "loss": 0.0002, "step": 12070 }, { "epoch": 4.92, "grad_norm": 0.0014047607546672225, "learning_rate": 3.3374033374033376e-07, "loss": 0.0002, "step": 12080 }, { "epoch": 4.92, "grad_norm": 0.0013850359246134758, "learning_rate": 3.174603174603175e-07, "loss": 0.0002, "step": 12090 }, { "epoch": 4.92, "grad_norm": 0.0013912185095250607, "learning_rate": 3.011803011803012e-07, "loss": 0.0002, "step": 12100 }, { "epoch": 4.93, "grad_norm": 0.001442193053662777, "learning_rate": 2.8490028490028494e-07, "loss": 0.0002, "step": 12110 }, { "epoch": 4.93, "grad_norm": 0.0014724673237651587, "learning_rate": 2.6862026862026867e-07, "loss": 0.0002, "step": 12120 }, { "epoch": 4.94, "grad_norm": 0.0017670753877609968, "learning_rate": 2.5234025234025235e-07, "loss": 0.0002, "step": 12130 }, { "epoch": 4.94, "grad_norm": 0.001458752085454762, "learning_rate": 2.3606023606023608e-07, "loss": 0.0002, "step": 12140 }, { "epoch": 4.95, "grad_norm": 0.0015336443902924657, "learning_rate": 2.197802197802198e-07, "loss": 0.0002, "step": 12150 }, { "epoch": 4.95, "grad_norm": 0.001413301331922412, "learning_rate": 2.035002035002035e-07, "loss": 0.0002, "step": 12160 }, { "epoch": 4.95, "grad_norm": 0.001454474637284875, "learning_rate": 1.8722018722018724e-07, "loss": 0.0002, "step": 12170 }, { "epoch": 4.96, "grad_norm": 0.0014644395560026169, "learning_rate": 1.7094017094017097e-07, "loss": 0.0002, "step": 12180 }, { "epoch": 4.96, "grad_norm": 0.0014874679036438465, "learning_rate": 1.5466015466015467e-07, "loss": 0.0002, "step": 12190 }, { "epoch": 4.97, "grad_norm": 0.0014028714504092932, "learning_rate": 1.383801383801384e-07, "loss": 0.0002, "step": 12200 }, { "epoch": 4.97, "grad_norm": 0.0014859441434964538, "learning_rate": 1.221001221001221e-07, "loss": 0.0002, "step": 12210 }, { "epoch": 4.97, "grad_norm": 0.0014206055784597993, "learning_rate": 1.0582010582010582e-07, "loss": 0.0002, "step": 12220 }, { "epoch": 4.98, "grad_norm": 0.0013865531655028462, "learning_rate": 8.954008954008955e-08, "loss": 0.0002, "step": 12230 }, { "epoch": 4.98, "grad_norm": 0.0014404187677428126, "learning_rate": 7.326007326007327e-08, "loss": 0.0002, "step": 12240 }, { "epoch": 4.99, "grad_norm": 0.0015573910204693675, "learning_rate": 5.6980056980056986e-08, "loss": 0.0003, "step": 12250 }, { "epoch": 4.99, "grad_norm": 0.0015043216990306973, "learning_rate": 4.07000407000407e-08, "loss": 0.0002, "step": 12260 }, { "epoch": 4.99, "grad_norm": 0.0015565322246402502, "learning_rate": 2.4420024420024422e-08, "loss": 0.0002, "step": 12270 }, { "epoch": 5.0, "grad_norm": 0.003684895345941186, "learning_rate": 8.14000814000814e-09, "loss": 0.0002, "step": 12280 }, { "epoch": 5.0, "step": 12285, "total_flos": 1.523143801869613e+19, "train_loss": 0.006059203078136595, "train_runtime": 4479.7513, "train_samples_per_second": 43.876, "train_steps_per_second": 2.742 } ], "logging_steps": 10, "max_steps": 12285, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "total_flos": 1.523143801869613e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }