|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 12285, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.3860865831375122, |
|
"learning_rate": 1.9983719983719984e-05, |
|
"loss": 0.684, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.3011243343353271, |
|
"learning_rate": 1.996743996743997e-05, |
|
"loss": 0.6568, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.646723985671997, |
|
"learning_rate": 1.9951159951159952e-05, |
|
"loss": 0.6477, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.43569016456604, |
|
"learning_rate": 1.9934879934879937e-05, |
|
"loss": 0.61, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.4323921203613281, |
|
"learning_rate": 1.991859991859992e-05, |
|
"loss": 0.5758, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.1664583683013916, |
|
"learning_rate": 1.9902319902319905e-05, |
|
"loss": 0.5529, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 1.935644268989563, |
|
"learning_rate": 1.9886039886039888e-05, |
|
"loss": 0.4969, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.984022617340088, |
|
"learning_rate": 1.986975986975987e-05, |
|
"loss": 0.5017, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 1.9753074645996094, |
|
"learning_rate": 1.9853479853479855e-05, |
|
"loss": 0.4438, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 4.39138650894165, |
|
"learning_rate": 1.9837199837199838e-05, |
|
"loss": 0.4033, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 3.0486788749694824, |
|
"learning_rate": 1.9820919820919823e-05, |
|
"loss": 0.3642, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.738529920578003, |
|
"learning_rate": 1.9804639804639806e-05, |
|
"loss": 0.3557, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 2.9336562156677246, |
|
"learning_rate": 1.978835978835979e-05, |
|
"loss": 0.3655, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.0220277309417725, |
|
"learning_rate": 1.9772079772079773e-05, |
|
"loss": 0.2903, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 2.4428532123565674, |
|
"learning_rate": 1.975579975579976e-05, |
|
"loss": 0.2706, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 5.031763076782227, |
|
"learning_rate": 1.973951973951974e-05, |
|
"loss": 0.3558, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 3.3514373302459717, |
|
"learning_rate": 1.9723239723239724e-05, |
|
"loss": 0.232, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.0613574981689453, |
|
"learning_rate": 1.970695970695971e-05, |
|
"loss": 0.2499, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 7.994803428649902, |
|
"learning_rate": 1.969067969067969e-05, |
|
"loss": 0.2609, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 1.9980034828186035, |
|
"learning_rate": 1.9674399674399677e-05, |
|
"loss": 0.2267, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 2.297769069671631, |
|
"learning_rate": 1.965811965811966e-05, |
|
"loss": 0.2371, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 5.071080207824707, |
|
"learning_rate": 1.9641839641839645e-05, |
|
"loss": 0.248, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.43589872121810913, |
|
"learning_rate": 1.9625559625559627e-05, |
|
"loss": 0.151, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 6.790423393249512, |
|
"learning_rate": 1.960927960927961e-05, |
|
"loss": 0.1886, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.1880414485931396, |
|
"learning_rate": 1.9592999592999595e-05, |
|
"loss": 0.2104, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 6.434396266937256, |
|
"learning_rate": 1.9576719576719577e-05, |
|
"loss": 0.2847, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.175398826599121, |
|
"learning_rate": 1.9560439560439563e-05, |
|
"loss": 0.261, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 2.2410614490509033, |
|
"learning_rate": 1.9544159544159545e-05, |
|
"loss": 0.1376, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5503996014595032, |
|
"learning_rate": 1.952787952787953e-05, |
|
"loss": 0.2034, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 3.550145387649536, |
|
"learning_rate": 1.9511599511599513e-05, |
|
"loss": 0.1802, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.35100820660591125, |
|
"learning_rate": 1.94953194953195e-05, |
|
"loss": 0.1778, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.0361884832382202, |
|
"learning_rate": 1.947903947903948e-05, |
|
"loss": 0.2186, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 7.980532169342041, |
|
"learning_rate": 1.9462759462759463e-05, |
|
"loss": 0.2742, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.8567585945129395, |
|
"learning_rate": 1.9446479446479445e-05, |
|
"loss": 0.1544, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 5.355805397033691, |
|
"learning_rate": 1.943019943019943e-05, |
|
"loss": 0.2739, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.359828472137451, |
|
"learning_rate": 1.9413919413919417e-05, |
|
"loss": 0.2076, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.9936553239822388, |
|
"learning_rate": 1.93976393976394e-05, |
|
"loss": 0.1666, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 6.090355396270752, |
|
"learning_rate": 1.9381359381359385e-05, |
|
"loss": 0.1941, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 2.5009548664093018, |
|
"learning_rate": 1.9365079365079367e-05, |
|
"loss": 0.1905, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 8.634650230407715, |
|
"learning_rate": 1.934879934879935e-05, |
|
"loss": 0.1431, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.43247389793396, |
|
"learning_rate": 1.9332519332519335e-05, |
|
"loss": 0.1736, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 15.868481636047363, |
|
"learning_rate": 1.9316239316239317e-05, |
|
"loss": 0.249, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.8422390222549438, |
|
"learning_rate": 1.9299959299959303e-05, |
|
"loss": 0.1407, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 5.148740291595459, |
|
"learning_rate": 1.9283679283679285e-05, |
|
"loss": 0.1503, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.3315675258636475, |
|
"learning_rate": 1.926739926739927e-05, |
|
"loss": 0.1885, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 3.6225030422210693, |
|
"learning_rate": 1.9251119251119253e-05, |
|
"loss": 0.1403, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 4.605388641357422, |
|
"learning_rate": 1.9234839234839235e-05, |
|
"loss": 0.2384, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 2.3162589073181152, |
|
"learning_rate": 1.921855921855922e-05, |
|
"loss": 0.129, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.4153892695903778, |
|
"learning_rate": 1.9202279202279203e-05, |
|
"loss": 0.1109, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 7.691011905670166, |
|
"learning_rate": 1.9185999185999185e-05, |
|
"loss": 0.1846, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 7.940028667449951, |
|
"learning_rate": 1.916971916971917e-05, |
|
"loss": 0.1391, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.5145124793052673, |
|
"learning_rate": 1.9153439153439156e-05, |
|
"loss": 0.1288, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.5785932540893555, |
|
"learning_rate": 1.913715913715914e-05, |
|
"loss": 0.1537, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 6.997181415557861, |
|
"learning_rate": 1.9120879120879124e-05, |
|
"loss": 0.1578, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 2.4879519939422607, |
|
"learning_rate": 1.9104599104599107e-05, |
|
"loss": 0.1954, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.291905879974365, |
|
"learning_rate": 1.908831908831909e-05, |
|
"loss": 0.1557, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 5.735557556152344, |
|
"learning_rate": 1.9072039072039074e-05, |
|
"loss": 0.1621, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 5.979973316192627, |
|
"learning_rate": 1.9055759055759057e-05, |
|
"loss": 0.1503, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.19126015901565552, |
|
"learning_rate": 1.9039479039479042e-05, |
|
"loss": 0.094, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 1.0556552410125732, |
|
"learning_rate": 1.9023199023199025e-05, |
|
"loss": 0.1876, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 3.954843759536743, |
|
"learning_rate": 1.900691900691901e-05, |
|
"loss": 0.3162, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.14296281337738037, |
|
"learning_rate": 1.8990638990638992e-05, |
|
"loss": 0.1288, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 8.772310256958008, |
|
"learning_rate": 1.8974358974358975e-05, |
|
"loss": 0.2622, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 2.3941524028778076, |
|
"learning_rate": 1.895807895807896e-05, |
|
"loss": 0.1524, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 8.104179382324219, |
|
"learning_rate": 1.8941798941798943e-05, |
|
"loss": 0.1109, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.5782121419906616, |
|
"learning_rate": 1.8925518925518925e-05, |
|
"loss": 0.0729, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.39667731523513794, |
|
"learning_rate": 1.890923890923891e-05, |
|
"loss": 0.1116, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 5.58447265625, |
|
"learning_rate": 1.8892958892958896e-05, |
|
"loss": 0.1312, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 1.3114192485809326, |
|
"learning_rate": 1.887667887667888e-05, |
|
"loss": 0.2522, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.997601330280304, |
|
"learning_rate": 1.8860398860398864e-05, |
|
"loss": 0.1605, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 3.605452537536621, |
|
"learning_rate": 1.8844118844118846e-05, |
|
"loss": 0.2443, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 18.868513107299805, |
|
"learning_rate": 1.882783882783883e-05, |
|
"loss": 0.1923, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 3.4352970123291016, |
|
"learning_rate": 1.881155881155881e-05, |
|
"loss": 0.1099, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 2.06532883644104, |
|
"learning_rate": 1.8795278795278796e-05, |
|
"loss": 0.1426, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 5.7250237464904785, |
|
"learning_rate": 1.8778998778998782e-05, |
|
"loss": 0.209, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.23493361473083496, |
|
"learning_rate": 1.8762718762718764e-05, |
|
"loss": 0.1986, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 17.769451141357422, |
|
"learning_rate": 1.874643874643875e-05, |
|
"loss": 0.1267, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.27574750781059265, |
|
"learning_rate": 1.8730158730158732e-05, |
|
"loss": 0.1484, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.30309033393859863, |
|
"learning_rate": 1.8713878713878714e-05, |
|
"loss": 0.1838, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 17.183013916015625, |
|
"learning_rate": 1.86975986975987e-05, |
|
"loss": 0.135, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.5572558641433716, |
|
"learning_rate": 1.8681318681318682e-05, |
|
"loss": 0.1456, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.4613451659679413, |
|
"learning_rate": 1.8665038665038664e-05, |
|
"loss": 0.1337, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 6.645438194274902, |
|
"learning_rate": 1.864875864875865e-05, |
|
"loss": 0.1446, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 5.389886856079102, |
|
"learning_rate": 1.8632478632478636e-05, |
|
"loss": 0.1253, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 14.86754322052002, |
|
"learning_rate": 1.8616198616198618e-05, |
|
"loss": 0.1346, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 13.419057846069336, |
|
"learning_rate": 1.85999185999186e-05, |
|
"loss": 0.0926, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 13.904304504394531, |
|
"learning_rate": 1.8583638583638586e-05, |
|
"loss": 0.1944, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.28235912322998047, |
|
"learning_rate": 1.8567358567358568e-05, |
|
"loss": 0.1261, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 5.711563587188721, |
|
"learning_rate": 1.855107855107855e-05, |
|
"loss": 0.1824, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 17.74437141418457, |
|
"learning_rate": 1.8534798534798536e-05, |
|
"loss": 0.1742, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 3.648202657699585, |
|
"learning_rate": 1.851851851851852e-05, |
|
"loss": 0.2103, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 2.0693366527557373, |
|
"learning_rate": 1.8502238502238504e-05, |
|
"loss": 0.1868, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 2.299172878265381, |
|
"learning_rate": 1.848595848595849e-05, |
|
"loss": 0.168, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 1.4059839248657227, |
|
"learning_rate": 1.8469678469678472e-05, |
|
"loss": 0.1455, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.926304817199707, |
|
"learning_rate": 1.8453398453398454e-05, |
|
"loss": 0.2002, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 4.728667736053467, |
|
"learning_rate": 1.8437118437118436e-05, |
|
"loss": 0.1245, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.5045005083084106, |
|
"learning_rate": 1.8420838420838422e-05, |
|
"loss": 0.0638, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 8.82520580291748, |
|
"learning_rate": 1.8404558404558404e-05, |
|
"loss": 0.1127, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 5.101595401763916, |
|
"learning_rate": 1.838827838827839e-05, |
|
"loss": 0.2363, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 7.01576566696167, |
|
"learning_rate": 1.8371998371998375e-05, |
|
"loss": 0.1026, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.865003764629364, |
|
"learning_rate": 1.8355718355718358e-05, |
|
"loss": 0.0965, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 9.897397994995117, |
|
"learning_rate": 1.833943833943834e-05, |
|
"loss": 0.1156, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 1.5007679462432861, |
|
"learning_rate": 1.8323158323158326e-05, |
|
"loss": 0.1888, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 4.676563262939453, |
|
"learning_rate": 1.8306878306878308e-05, |
|
"loss": 0.1552, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 5.3361430168151855, |
|
"learning_rate": 1.829059829059829e-05, |
|
"loss": 0.1447, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.8933970332145691, |
|
"learning_rate": 1.8274318274318276e-05, |
|
"loss": 0.1394, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 7.401905059814453, |
|
"learning_rate": 1.825803825803826e-05, |
|
"loss": 0.2188, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.4379027783870697, |
|
"learning_rate": 1.8241758241758244e-05, |
|
"loss": 0.1277, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 2.8909428119659424, |
|
"learning_rate": 1.8225478225478226e-05, |
|
"loss": 0.1726, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.11447061598300934, |
|
"learning_rate": 1.820919820919821e-05, |
|
"loss": 0.1523, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.12276914715766907, |
|
"learning_rate": 1.8192918192918194e-05, |
|
"loss": 0.1823, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.3844455480575562, |
|
"learning_rate": 1.8176638176638176e-05, |
|
"loss": 0.1006, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 3.1034061908721924, |
|
"learning_rate": 1.816035816035816e-05, |
|
"loss": 0.1387, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 8.602412223815918, |
|
"learning_rate": 1.8144078144078144e-05, |
|
"loss": 0.2345, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.13919875025749207, |
|
"learning_rate": 1.812779812779813e-05, |
|
"loss": 0.1653, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.34385234117507935, |
|
"learning_rate": 1.8111518111518115e-05, |
|
"loss": 0.2003, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 4.868250846862793, |
|
"learning_rate": 1.8095238095238097e-05, |
|
"loss": 0.1504, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 2.267928123474121, |
|
"learning_rate": 1.807895807895808e-05, |
|
"loss": 0.2023, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 3.634040594100952, |
|
"learning_rate": 1.8062678062678065e-05, |
|
"loss": 0.1682, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 1.8135625123977661, |
|
"learning_rate": 1.8046398046398047e-05, |
|
"loss": 0.2059, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 2.294635057449341, |
|
"learning_rate": 1.803011803011803e-05, |
|
"loss": 0.1498, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 3.8841567039489746, |
|
"learning_rate": 1.8013838013838015e-05, |
|
"loss": 0.0727, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.9216524958610535, |
|
"learning_rate": 1.7997557997558e-05, |
|
"loss": 0.1273, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.08572695404291153, |
|
"learning_rate": 1.7981277981277983e-05, |
|
"loss": 0.1066, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 5.445361137390137, |
|
"learning_rate": 1.7964997964997966e-05, |
|
"loss": 0.1894, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 4.239029407501221, |
|
"learning_rate": 1.794871794871795e-05, |
|
"loss": 0.0947, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7807052135467529, |
|
"learning_rate": 1.7932437932437933e-05, |
|
"loss": 0.1692, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.1252571940422058, |
|
"learning_rate": 1.7916157916157916e-05, |
|
"loss": 0.0901, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 5.491313457489014, |
|
"learning_rate": 1.78998778998779e-05, |
|
"loss": 0.0849, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.3406262695789337, |
|
"learning_rate": 1.7883597883597884e-05, |
|
"loss": 0.1345, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 3.4588377475738525, |
|
"learning_rate": 1.786731786731787e-05, |
|
"loss": 0.1501, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 3.2964069843292236, |
|
"learning_rate": 1.7851037851037855e-05, |
|
"loss": 0.1679, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 6.95346212387085, |
|
"learning_rate": 1.7834757834757837e-05, |
|
"loss": 0.095, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 2.9120900630950928, |
|
"learning_rate": 1.781847781847782e-05, |
|
"loss": 0.1089, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 8.793939590454102, |
|
"learning_rate": 1.78021978021978e-05, |
|
"loss": 0.1338, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.08519359678030014, |
|
"learning_rate": 1.7785917785917787e-05, |
|
"loss": 0.0932, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 16.41631317138672, |
|
"learning_rate": 1.776963776963777e-05, |
|
"loss": 0.1376, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.0415103435516357, |
|
"learning_rate": 1.7753357753357755e-05, |
|
"loss": 0.1567, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.8246403336524963, |
|
"learning_rate": 1.773707773707774e-05, |
|
"loss": 0.122, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 2.198512077331543, |
|
"learning_rate": 1.7720797720797723e-05, |
|
"loss": 0.1528, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.5246292352676392, |
|
"learning_rate": 1.7704517704517705e-05, |
|
"loss": 0.1088, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 12.515607833862305, |
|
"learning_rate": 1.768823768823769e-05, |
|
"loss": 0.1627, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 13.734766006469727, |
|
"learning_rate": 1.7671957671957673e-05, |
|
"loss": 0.1475, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 2.593158483505249, |
|
"learning_rate": 1.7655677655677655e-05, |
|
"loss": 0.0968, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.3462279736995697, |
|
"learning_rate": 1.763939763939764e-05, |
|
"loss": 0.0854, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.6409497261047363, |
|
"learning_rate": 1.7623117623117623e-05, |
|
"loss": 0.2295, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.9609594345092773, |
|
"learning_rate": 1.760683760683761e-05, |
|
"loss": 0.1883, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.673570454120636, |
|
"learning_rate": 1.759055759055759e-05, |
|
"loss": 0.1369, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.2929579019546509, |
|
"learning_rate": 1.7574277574277577e-05, |
|
"loss": 0.1189, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.4493731260299683, |
|
"learning_rate": 1.755799755799756e-05, |
|
"loss": 0.1187, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.07135419547557831, |
|
"learning_rate": 1.754171754171754e-05, |
|
"loss": 0.0603, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.10734464973211288, |
|
"learning_rate": 1.7525437525437527e-05, |
|
"loss": 0.0217, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.2217961698770523, |
|
"learning_rate": 1.750915750915751e-05, |
|
"loss": 0.0303, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.14218159019947052, |
|
"learning_rate": 1.7492877492877495e-05, |
|
"loss": 0.0603, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.09605604410171509, |
|
"learning_rate": 1.747659747659748e-05, |
|
"loss": 0.0407, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.07094033062458038, |
|
"learning_rate": 1.7460317460317463e-05, |
|
"loss": 0.0202, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4.650410175323486, |
|
"learning_rate": 1.7444037444037445e-05, |
|
"loss": 0.0611, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 23.229633331298828, |
|
"learning_rate": 1.742775742775743e-05, |
|
"loss": 0.0528, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 1.466739535331726, |
|
"learning_rate": 1.7411477411477413e-05, |
|
"loss": 0.07, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.05839679762721062, |
|
"learning_rate": 1.7395197395197395e-05, |
|
"loss": 0.0077, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.6192926168441772, |
|
"learning_rate": 1.737891737891738e-05, |
|
"loss": 0.0239, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.8529036045074463, |
|
"learning_rate": 1.7362637362637363e-05, |
|
"loss": 0.0976, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.24398411810398102, |
|
"learning_rate": 1.734635734635735e-05, |
|
"loss": 0.0079, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.04527588561177254, |
|
"learning_rate": 1.733007733007733e-05, |
|
"loss": 0.0065, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 6.153138160705566, |
|
"learning_rate": 1.7313797313797316e-05, |
|
"loss": 0.0364, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.03938959911465645, |
|
"learning_rate": 1.72975172975173e-05, |
|
"loss": 0.009, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.04055130481719971, |
|
"learning_rate": 1.728123728123728e-05, |
|
"loss": 0.0472, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.07095145434141159, |
|
"learning_rate": 1.7264957264957267e-05, |
|
"loss": 0.0078, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 2.7965128421783447, |
|
"learning_rate": 1.724867724867725e-05, |
|
"loss": 0.0559, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 6.2940592765808105, |
|
"learning_rate": 1.7232397232397234e-05, |
|
"loss": 0.0366, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.11980397999286652, |
|
"learning_rate": 1.721611721611722e-05, |
|
"loss": 0.0125, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 8.26235294342041, |
|
"learning_rate": 1.7199837199837202e-05, |
|
"loss": 0.0137, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.04125256836414337, |
|
"learning_rate": 1.7183557183557185e-05, |
|
"loss": 0.0051, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.03920783847570419, |
|
"learning_rate": 1.7167277167277167e-05, |
|
"loss": 0.0067, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.13922813534736633, |
|
"learning_rate": 1.7150997150997152e-05, |
|
"loss": 0.0374, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.034091122448444366, |
|
"learning_rate": 1.7134717134717135e-05, |
|
"loss": 0.006, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 10.509510040283203, |
|
"learning_rate": 1.711843711843712e-05, |
|
"loss": 0.0589, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.043251294642686844, |
|
"learning_rate": 1.7102157102157103e-05, |
|
"loss": 0.0226, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.8053480982780457, |
|
"learning_rate": 1.7085877085877088e-05, |
|
"loss": 0.0582, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.04081906005740166, |
|
"learning_rate": 1.706959706959707e-05, |
|
"loss": 0.0391, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.03760745748877525, |
|
"learning_rate": 1.7053317053317056e-05, |
|
"loss": 0.027, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.04111940413713455, |
|
"learning_rate": 1.7037037037037038e-05, |
|
"loss": 0.0368, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 2.6297411918640137, |
|
"learning_rate": 1.702075702075702e-05, |
|
"loss": 0.0478, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.1751009225845337, |
|
"learning_rate": 1.7004477004477006e-05, |
|
"loss": 0.0298, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.042650580406188965, |
|
"learning_rate": 1.698819698819699e-05, |
|
"loss": 0.0203, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.034141793847084045, |
|
"learning_rate": 1.6971916971916974e-05, |
|
"loss": 0.0044, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.03497103974223137, |
|
"learning_rate": 1.6955636955636956e-05, |
|
"loss": 0.0304, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.8585641384124756, |
|
"learning_rate": 1.6939356939356942e-05, |
|
"loss": 0.0365, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.0322452187538147, |
|
"learning_rate": 1.6923076923076924e-05, |
|
"loss": 0.0596, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.034800559282302856, |
|
"learning_rate": 1.6906796906796906e-05, |
|
"loss": 0.0061, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.0860045924782753, |
|
"learning_rate": 1.6890516890516892e-05, |
|
"loss": 0.0172, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.031149201095104218, |
|
"learning_rate": 1.6874236874236874e-05, |
|
"loss": 0.0238, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.03368987515568733, |
|
"learning_rate": 1.685795685795686e-05, |
|
"loss": 0.0043, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.03161125257611275, |
|
"learning_rate": 1.6841676841676846e-05, |
|
"loss": 0.0146, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.029046092182397842, |
|
"learning_rate": 1.6825396825396828e-05, |
|
"loss": 0.0236, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.9345057606697083, |
|
"learning_rate": 1.680911680911681e-05, |
|
"loss": 0.0042, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.028860267251729965, |
|
"learning_rate": 1.6792836792836796e-05, |
|
"loss": 0.0286, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.02852853201329708, |
|
"learning_rate": 1.6776556776556778e-05, |
|
"loss": 0.023, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.03128168359398842, |
|
"learning_rate": 1.676027676027676e-05, |
|
"loss": 0.0036, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.037479083985090256, |
|
"learning_rate": 1.6743996743996746e-05, |
|
"loss": 0.0591, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.04688659682869911, |
|
"learning_rate": 1.6727716727716728e-05, |
|
"loss": 0.0316, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.03302760049700737, |
|
"learning_rate": 1.6711436711436714e-05, |
|
"loss": 0.0668, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.06181880831718445, |
|
"learning_rate": 1.6695156695156696e-05, |
|
"loss": 0.0281, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.0320013165473938, |
|
"learning_rate": 1.667887667887668e-05, |
|
"loss": 0.0232, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.13600216805934906, |
|
"learning_rate": 1.6662596662596664e-05, |
|
"loss": 0.0442, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.12886099517345428, |
|
"learning_rate": 1.6646316646316646e-05, |
|
"loss": 0.0305, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.0625109001994133, |
|
"learning_rate": 1.6630036630036632e-05, |
|
"loss": 0.0233, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 13.604376792907715, |
|
"learning_rate": 1.6613756613756614e-05, |
|
"loss": 0.0288, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.029248738661408424, |
|
"learning_rate": 1.65974765974766e-05, |
|
"loss": 0.0039, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.4231517314910889, |
|
"learning_rate": 1.6581196581196585e-05, |
|
"loss": 0.0095, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.02830047346651554, |
|
"learning_rate": 1.6564916564916568e-05, |
|
"loss": 0.007, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.027091912925243378, |
|
"learning_rate": 1.654863654863655e-05, |
|
"loss": 0.0041, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.02793751284480095, |
|
"learning_rate": 1.6532356532356532e-05, |
|
"loss": 0.0087, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.030688917264342308, |
|
"learning_rate": 1.6516076516076518e-05, |
|
"loss": 0.0033, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.02540646307170391, |
|
"learning_rate": 1.64997964997965e-05, |
|
"loss": 0.0254, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.026573829352855682, |
|
"learning_rate": 1.6483516483516486e-05, |
|
"loss": 0.0195, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.025454262271523476, |
|
"learning_rate": 1.6467236467236468e-05, |
|
"loss": 0.0031, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.038121115416288376, |
|
"learning_rate": 1.6450956450956453e-05, |
|
"loss": 0.0035, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.025772370398044586, |
|
"learning_rate": 1.6434676434676436e-05, |
|
"loss": 0.003, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.4986250400543213, |
|
"learning_rate": 1.641839641839642e-05, |
|
"loss": 0.0038, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 25.038734436035156, |
|
"learning_rate": 1.6402116402116404e-05, |
|
"loss": 0.0119, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.025794176384806633, |
|
"learning_rate": 1.6385836385836386e-05, |
|
"loss": 0.0353, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.056914806365967, |
|
"learning_rate": 1.636955636955637e-05, |
|
"loss": 0.0517, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.19518433511257172, |
|
"learning_rate": 1.6353276353276354e-05, |
|
"loss": 0.0291, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.02424285002052784, |
|
"learning_rate": 1.633699633699634e-05, |
|
"loss": 0.0359, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.03164544701576233, |
|
"learning_rate": 1.632071632071632e-05, |
|
"loss": 0.0382, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.022855272516608238, |
|
"learning_rate": 1.6304436304436307e-05, |
|
"loss": 0.003, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.023591142147779465, |
|
"learning_rate": 1.628815628815629e-05, |
|
"loss": 0.0497, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.02427799627184868, |
|
"learning_rate": 1.627187627187627e-05, |
|
"loss": 0.0381, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.022075733169913292, |
|
"learning_rate": 1.6255596255596257e-05, |
|
"loss": 0.0038, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.25007203221321106, |
|
"learning_rate": 1.623931623931624e-05, |
|
"loss": 0.0364, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.02502160519361496, |
|
"learning_rate": 1.6223036223036225e-05, |
|
"loss": 0.0029, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.036409296095371246, |
|
"learning_rate": 1.6206756206756207e-05, |
|
"loss": 0.0387, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.027146685868501663, |
|
"learning_rate": 1.6190476190476193e-05, |
|
"loss": 0.0045, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.024981442838907242, |
|
"learning_rate": 1.6174196174196175e-05, |
|
"loss": 0.0264, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.027865292504429817, |
|
"learning_rate": 1.615791615791616e-05, |
|
"loss": 0.0029, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.034725822508335114, |
|
"learning_rate": 1.6141636141636143e-05, |
|
"loss": 0.0029, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.022250523790717125, |
|
"learning_rate": 1.6125356125356125e-05, |
|
"loss": 0.0337, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.024188194423913956, |
|
"learning_rate": 1.610907610907611e-05, |
|
"loss": 0.0026, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.02303464338183403, |
|
"learning_rate": 1.6092796092796093e-05, |
|
"loss": 0.0285, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.020316725596785545, |
|
"learning_rate": 1.607651607651608e-05, |
|
"loss": 0.0026, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.023156961426138878, |
|
"learning_rate": 1.606023606023606e-05, |
|
"loss": 0.0031, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 2.9847331047058105, |
|
"learning_rate": 1.6043956043956047e-05, |
|
"loss": 0.0034, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 10.845735549926758, |
|
"learning_rate": 1.602767602767603e-05, |
|
"loss": 0.0557, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.02037137933075428, |
|
"learning_rate": 1.601139601139601e-05, |
|
"loss": 0.0333, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.019075889140367508, |
|
"learning_rate": 1.5995115995115997e-05, |
|
"loss": 0.0029, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.02034451812505722, |
|
"learning_rate": 1.597883597883598e-05, |
|
"loss": 0.0035, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.02513672597706318, |
|
"learning_rate": 1.5962555962555965e-05, |
|
"loss": 0.0149, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.0232282355427742, |
|
"learning_rate": 1.5946275946275947e-05, |
|
"loss": 0.0066, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.019541621208190918, |
|
"learning_rate": 1.5929995929995933e-05, |
|
"loss": 0.003, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.027926787734031677, |
|
"learning_rate": 1.5913715913715915e-05, |
|
"loss": 0.0024, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.021236905828118324, |
|
"learning_rate": 1.5897435897435897e-05, |
|
"loss": 0.0023, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.017625728622078896, |
|
"learning_rate": 1.5881155881155883e-05, |
|
"loss": 0.0023, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 3.0908312797546387, |
|
"learning_rate": 1.5864875864875865e-05, |
|
"loss": 0.0032, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.025432445108890533, |
|
"learning_rate": 1.584859584859585e-05, |
|
"loss": 0.0246, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.0189252570271492, |
|
"learning_rate": 1.5832315832315833e-05, |
|
"loss": 0.0025, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.16396763920783997, |
|
"learning_rate": 1.581603581603582e-05, |
|
"loss": 0.0378, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.019563721492886543, |
|
"learning_rate": 1.57997557997558e-05, |
|
"loss": 0.0281, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.02156243473291397, |
|
"learning_rate": 1.5783475783475787e-05, |
|
"loss": 0.1073, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 3.184936285018921, |
|
"learning_rate": 1.576719576719577e-05, |
|
"loss": 0.0413, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.0187922902405262, |
|
"learning_rate": 1.575091575091575e-05, |
|
"loss": 0.0423, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.020309004932641983, |
|
"learning_rate": 1.5734635734635737e-05, |
|
"loss": 0.0026, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.028299883008003235, |
|
"learning_rate": 1.571835571835572e-05, |
|
"loss": 0.0026, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.022750265896320343, |
|
"learning_rate": 1.5702075702075705e-05, |
|
"loss": 0.0026, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.017459379509091377, |
|
"learning_rate": 1.5685795685795687e-05, |
|
"loss": 0.0026, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.02400645986199379, |
|
"learning_rate": 1.5669515669515672e-05, |
|
"loss": 0.0022, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.037710972130298615, |
|
"learning_rate": 1.5653235653235655e-05, |
|
"loss": 0.0024, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.01844876818358898, |
|
"learning_rate": 1.5636955636955637e-05, |
|
"loss": 0.0022, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.015886761248111725, |
|
"learning_rate": 1.5620675620675623e-05, |
|
"loss": 0.0021, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.016119027510285378, |
|
"learning_rate": 1.5604395604395605e-05, |
|
"loss": 0.0024, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.01977747306227684, |
|
"learning_rate": 1.558811558811559e-05, |
|
"loss": 0.0405, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.01591884344816208, |
|
"learning_rate": 1.5571835571835573e-05, |
|
"loss": 0.0021, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.017170535400509834, |
|
"learning_rate": 1.555555555555556e-05, |
|
"loss": 0.0102, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.02160962112247944, |
|
"learning_rate": 1.553927553927554e-05, |
|
"loss": 0.0164, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.04177393019199371, |
|
"learning_rate": 1.5522995522995526e-05, |
|
"loss": 0.002, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.01732414774596691, |
|
"learning_rate": 1.550671550671551e-05, |
|
"loss": 0.0022, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.05687391385436058, |
|
"learning_rate": 1.549043549043549e-05, |
|
"loss": 0.002, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.015546981245279312, |
|
"learning_rate": 1.5474155474155473e-05, |
|
"loss": 0.0296, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 11.891217231750488, |
|
"learning_rate": 1.545787545787546e-05, |
|
"loss": 0.0303, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 3.074970245361328, |
|
"learning_rate": 1.5441595441595444e-05, |
|
"loss": 0.0346, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.3277289867401123, |
|
"learning_rate": 1.5425315425315426e-05, |
|
"loss": 0.0053, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.014851146377623081, |
|
"learning_rate": 1.5409035409035412e-05, |
|
"loss": 0.0021, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.02586003951728344, |
|
"learning_rate": 1.5392755392755394e-05, |
|
"loss": 0.0194, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.018063299357891083, |
|
"learning_rate": 1.5376475376475377e-05, |
|
"loss": 0.0374, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.014860156923532486, |
|
"learning_rate": 1.5360195360195362e-05, |
|
"loss": 0.0368, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.016715556383132935, |
|
"learning_rate": 1.5343915343915344e-05, |
|
"loss": 0.0232, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.017222585156559944, |
|
"learning_rate": 1.532763532763533e-05, |
|
"loss": 0.0021, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.015297485515475273, |
|
"learning_rate": 1.5311355311355312e-05, |
|
"loss": 0.002, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.01927722617983818, |
|
"learning_rate": 1.5295075295075298e-05, |
|
"loss": 0.0344, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.014726200141012669, |
|
"learning_rate": 1.527879527879528e-05, |
|
"loss": 0.0105, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.015239718370139599, |
|
"learning_rate": 1.5262515262515263e-05, |
|
"loss": 0.0019, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.014116072095930576, |
|
"learning_rate": 1.5246235246235248e-05, |
|
"loss": 0.0482, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.014437291771173477, |
|
"learning_rate": 1.522995522995523e-05, |
|
"loss": 0.0028, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.017663761973381042, |
|
"learning_rate": 1.5213675213675214e-05, |
|
"loss": 0.007, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.024807853624224663, |
|
"learning_rate": 1.51973951973952e-05, |
|
"loss": 0.0044, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.01389392837882042, |
|
"learning_rate": 1.5181115181115182e-05, |
|
"loss": 0.021, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.014578912407159805, |
|
"learning_rate": 1.5164835164835166e-05, |
|
"loss": 0.002, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.013830927200615406, |
|
"learning_rate": 1.514855514855515e-05, |
|
"loss": 0.0017, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.012908479198813438, |
|
"learning_rate": 1.5132275132275134e-05, |
|
"loss": 0.0047, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.013685975223779678, |
|
"learning_rate": 1.5115995115995116e-05, |
|
"loss": 0.0062, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.015914512798190117, |
|
"learning_rate": 1.50997150997151e-05, |
|
"loss": 0.0415, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.09328664839267731, |
|
"learning_rate": 1.5083435083435086e-05, |
|
"loss": 0.0017, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.013503558933734894, |
|
"learning_rate": 1.5067155067155068e-05, |
|
"loss": 0.0292, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.012664329260587692, |
|
"learning_rate": 1.505087505087505e-05, |
|
"loss": 0.0108, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.013521691784262657, |
|
"learning_rate": 1.5034595034595036e-05, |
|
"loss": 0.0016, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.017031285911798477, |
|
"learning_rate": 1.501831501831502e-05, |
|
"loss": 0.0056, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.0123978890478611, |
|
"learning_rate": 1.5002035002035002e-05, |
|
"loss": 0.0454, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.01293584518134594, |
|
"learning_rate": 1.4985754985754988e-05, |
|
"loss": 0.004, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.013730690814554691, |
|
"learning_rate": 1.496947496947497e-05, |
|
"loss": 0.0355, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.01241120882332325, |
|
"learning_rate": 1.4953194953194954e-05, |
|
"loss": 0.0017, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.016001150012016296, |
|
"learning_rate": 1.493691493691494e-05, |
|
"loss": 0.0017, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.019151071086525917, |
|
"learning_rate": 1.4920634920634922e-05, |
|
"loss": 0.0335, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.014675545506179333, |
|
"learning_rate": 1.4904354904354906e-05, |
|
"loss": 0.0203, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.5518173575401306, |
|
"learning_rate": 1.4888074888074888e-05, |
|
"loss": 0.0025, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.012442667037248611, |
|
"learning_rate": 1.4871794871794874e-05, |
|
"loss": 0.0021, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.013752995058894157, |
|
"learning_rate": 1.4855514855514856e-05, |
|
"loss": 0.0018, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.011561810038983822, |
|
"learning_rate": 1.483923483923484e-05, |
|
"loss": 0.0016, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.011732109822332859, |
|
"learning_rate": 1.4822954822954826e-05, |
|
"loss": 0.0015, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.011794438585639, |
|
"learning_rate": 1.4806674806674808e-05, |
|
"loss": 0.0014, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.011947757564485073, |
|
"learning_rate": 1.479039479039479e-05, |
|
"loss": 0.0026, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.017924221232533455, |
|
"learning_rate": 1.4774114774114776e-05, |
|
"loss": 0.0015, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.011501024477183819, |
|
"learning_rate": 1.475783475783476e-05, |
|
"loss": 0.0021, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.05062294751405716, |
|
"learning_rate": 1.4741554741554742e-05, |
|
"loss": 0.0015, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.011451934464275837, |
|
"learning_rate": 1.4725274725274727e-05, |
|
"loss": 0.0015, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.011398130096495152, |
|
"learning_rate": 1.470899470899471e-05, |
|
"loss": 0.0262, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.011111021041870117, |
|
"learning_rate": 1.4692714692714694e-05, |
|
"loss": 0.0015, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.011720293201506138, |
|
"learning_rate": 1.4676434676434676e-05, |
|
"loss": 0.0014, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.01106089074164629, |
|
"learning_rate": 1.4660154660154662e-05, |
|
"loss": 0.0248, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.031572628766298294, |
|
"learning_rate": 1.4643874643874645e-05, |
|
"loss": 0.0015, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.010560325346887112, |
|
"learning_rate": 1.4627594627594628e-05, |
|
"loss": 0.0014, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 31.388111114501953, |
|
"learning_rate": 1.4611314611314613e-05, |
|
"loss": 0.0255, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.016965394839644432, |
|
"learning_rate": 1.4595034595034596e-05, |
|
"loss": 0.0014, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.022373100742697716, |
|
"learning_rate": 1.457875457875458e-05, |
|
"loss": 0.0013, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.011025676503777504, |
|
"learning_rate": 1.4562474562474565e-05, |
|
"loss": 0.0374, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.016683539375662804, |
|
"learning_rate": 1.4546194546194547e-05, |
|
"loss": 0.0389, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.012086950242519379, |
|
"learning_rate": 1.4529914529914531e-05, |
|
"loss": 0.0304, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.011172090657055378, |
|
"learning_rate": 1.4513634513634515e-05, |
|
"loss": 0.0178, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.013024254702031612, |
|
"learning_rate": 1.44973544973545e-05, |
|
"loss": 0.0014, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.010836287401616573, |
|
"learning_rate": 1.4481074481074482e-05, |
|
"loss": 0.0014, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.014210844412446022, |
|
"learning_rate": 1.4464794464794465e-05, |
|
"loss": 0.0014, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.010528087615966797, |
|
"learning_rate": 1.444851444851445e-05, |
|
"loss": 0.0044, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.01593305543065071, |
|
"learning_rate": 1.4432234432234433e-05, |
|
"loss": 0.0455, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.015049874782562256, |
|
"learning_rate": 1.4415954415954416e-05, |
|
"loss": 0.0027, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.011662309989333153, |
|
"learning_rate": 1.4399674399674401e-05, |
|
"loss": 0.0013, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.011207195930182934, |
|
"learning_rate": 1.4383394383394385e-05, |
|
"loss": 0.0018, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 3.6042699813842773, |
|
"learning_rate": 1.4367114367114367e-05, |
|
"loss": 0.0029, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.09215729683637619, |
|
"learning_rate": 1.4350834350834353e-05, |
|
"loss": 0.002, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.010877463966608047, |
|
"learning_rate": 1.4334554334554335e-05, |
|
"loss": 0.0014, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.009993131272494793, |
|
"learning_rate": 1.431827431827432e-05, |
|
"loss": 0.0016, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 1.349046230316162, |
|
"learning_rate": 1.4301994301994305e-05, |
|
"loss": 0.0018, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.009341539815068245, |
|
"learning_rate": 1.4285714285714287e-05, |
|
"loss": 0.0012, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.009393510408699512, |
|
"learning_rate": 1.4269434269434271e-05, |
|
"loss": 0.0011, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.009326926432549953, |
|
"learning_rate": 1.4253154253154253e-05, |
|
"loss": 0.0012, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.009275635704398155, |
|
"learning_rate": 1.4236874236874239e-05, |
|
"loss": 0.0384, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 22.40707778930664, |
|
"learning_rate": 1.4220594220594221e-05, |
|
"loss": 0.0131, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.00953533872961998, |
|
"learning_rate": 1.4204314204314205e-05, |
|
"loss": 0.0347, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.5032986998558044, |
|
"learning_rate": 1.4188034188034189e-05, |
|
"loss": 0.0402, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.011732584796845913, |
|
"learning_rate": 1.4171754171754173e-05, |
|
"loss": 0.0592, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.010645696893334389, |
|
"learning_rate": 1.4155474155474155e-05, |
|
"loss": 0.0268, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.013740918599069118, |
|
"learning_rate": 1.4139194139194141e-05, |
|
"loss": 0.0252, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.013372181914746761, |
|
"learning_rate": 1.4122914122914125e-05, |
|
"loss": 0.0376, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.015505131334066391, |
|
"learning_rate": 1.4106634106634107e-05, |
|
"loss": 0.0014, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.014338747598230839, |
|
"learning_rate": 1.4090354090354093e-05, |
|
"loss": 0.0853, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.01571911759674549, |
|
"learning_rate": 1.4074074074074075e-05, |
|
"loss": 0.0298, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.020005526021122932, |
|
"learning_rate": 1.4057794057794059e-05, |
|
"loss": 0.0017, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.018354693427681923, |
|
"learning_rate": 1.4041514041514041e-05, |
|
"loss": 0.0016, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.021922029554843903, |
|
"learning_rate": 1.4025234025234027e-05, |
|
"loss": 0.0017, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.013702883385121822, |
|
"learning_rate": 1.400895400895401e-05, |
|
"loss": 0.0014, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.010742840357124805, |
|
"learning_rate": 1.3992673992673993e-05, |
|
"loss": 0.0026, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.15446045994758606, |
|
"learning_rate": 1.3976393976393979e-05, |
|
"loss": 0.0013, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.01300391647964716, |
|
"learning_rate": 1.3960113960113961e-05, |
|
"loss": 0.0012, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.017101220786571503, |
|
"learning_rate": 1.3943833943833945e-05, |
|
"loss": 0.0012, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.009062445722520351, |
|
"learning_rate": 1.3927553927553929e-05, |
|
"loss": 0.0012, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.008803702890872955, |
|
"learning_rate": 1.3911273911273913e-05, |
|
"loss": 0.0011, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.008593735285103321, |
|
"learning_rate": 1.3894993894993895e-05, |
|
"loss": 0.0012, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.009692203253507614, |
|
"learning_rate": 1.387871387871388e-05, |
|
"loss": 0.0011, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.011008762754499912, |
|
"learning_rate": 1.3862433862433865e-05, |
|
"loss": 0.0011, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.009994535706937313, |
|
"learning_rate": 1.3846153846153847e-05, |
|
"loss": 0.022, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.009117243811488152, |
|
"learning_rate": 1.382987382987383e-05, |
|
"loss": 0.0011, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.008967447094619274, |
|
"learning_rate": 1.3813593813593815e-05, |
|
"loss": 0.0057, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.008691845461726189, |
|
"learning_rate": 1.3797313797313799e-05, |
|
"loss": 0.0013, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.011074850335717201, |
|
"learning_rate": 1.378103378103378e-05, |
|
"loss": 0.001, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.00832684338092804, |
|
"learning_rate": 1.3764753764753766e-05, |
|
"loss": 0.0011, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.008292116224765778, |
|
"learning_rate": 1.374847374847375e-05, |
|
"loss": 0.001, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.009205167181789875, |
|
"learning_rate": 1.3732193732193733e-05, |
|
"loss": 0.0011, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.008790573105216026, |
|
"learning_rate": 1.3715913715913718e-05, |
|
"loss": 0.001, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.008000485599040985, |
|
"learning_rate": 1.36996336996337e-05, |
|
"loss": 0.008, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.00819096527993679, |
|
"learning_rate": 1.3683353683353684e-05, |
|
"loss": 0.001, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.014848892576992512, |
|
"learning_rate": 1.3667073667073668e-05, |
|
"loss": 0.015, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.008053899742662907, |
|
"learning_rate": 1.3650793650793652e-05, |
|
"loss": 0.0009, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 6.416678428649902, |
|
"learning_rate": 1.3634513634513635e-05, |
|
"loss": 0.0344, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.10300695151090622, |
|
"learning_rate": 1.3618233618233619e-05, |
|
"loss": 0.001, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.008424129337072372, |
|
"learning_rate": 1.3601953601953604e-05, |
|
"loss": 0.0267, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.00800679437816143, |
|
"learning_rate": 1.3585673585673586e-05, |
|
"loss": 0.0326, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.009919759817421436, |
|
"learning_rate": 1.356939356939357e-05, |
|
"loss": 0.0011, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.02416282147169113, |
|
"learning_rate": 1.3553113553113554e-05, |
|
"loss": 0.0012, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 5.555994033813477, |
|
"learning_rate": 1.3536833536833538e-05, |
|
"loss": 0.043, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.10745339095592499, |
|
"learning_rate": 1.352055352055352e-05, |
|
"loss": 0.0011, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.00835937075316906, |
|
"learning_rate": 1.3504273504273506e-05, |
|
"loss": 0.0009, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.007618330419063568, |
|
"learning_rate": 1.348799348799349e-05, |
|
"loss": 0.0241, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.022973209619522095, |
|
"learning_rate": 1.3471713471713472e-05, |
|
"loss": 0.001, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.008424985222518444, |
|
"learning_rate": 1.3455433455433458e-05, |
|
"loss": 0.0018, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.015286185778677464, |
|
"learning_rate": 1.343915343915344e-05, |
|
"loss": 0.0009, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.007264839485287666, |
|
"learning_rate": 1.3422873422873424e-05, |
|
"loss": 0.0009, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.0074860285967588425, |
|
"learning_rate": 1.3406593406593406e-05, |
|
"loss": 0.0009, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.008237460628151894, |
|
"learning_rate": 1.3390313390313392e-05, |
|
"loss": 0.0373, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.007270953617990017, |
|
"learning_rate": 1.3374033374033374e-05, |
|
"loss": 0.0009, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.03919156640768051, |
|
"learning_rate": 1.3357753357753358e-05, |
|
"loss": 0.001, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.11515277624130249, |
|
"learning_rate": 1.3341473341473344e-05, |
|
"loss": 0.001, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.007153298240154982, |
|
"learning_rate": 1.3325193325193326e-05, |
|
"loss": 0.0014, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.00894332304596901, |
|
"learning_rate": 1.330891330891331e-05, |
|
"loss": 0.0022, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.046884216368198395, |
|
"learning_rate": 1.3292633292633294e-05, |
|
"loss": 0.001, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.0074531338177621365, |
|
"learning_rate": 1.3276353276353278e-05, |
|
"loss": 0.0009, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.008025778457522392, |
|
"learning_rate": 1.326007326007326e-05, |
|
"loss": 0.0008, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.007099485024809837, |
|
"learning_rate": 1.3243793243793246e-05, |
|
"loss": 0.0349, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.007894063368439674, |
|
"learning_rate": 1.322751322751323e-05, |
|
"loss": 0.0008, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.008376212790608406, |
|
"learning_rate": 1.3211233211233212e-05, |
|
"loss": 0.0009, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.007172748912125826, |
|
"learning_rate": 1.3194953194953194e-05, |
|
"loss": 0.0011, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.007325605023652315, |
|
"learning_rate": 1.317867317867318e-05, |
|
"loss": 0.0008, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.007277225609868765, |
|
"learning_rate": 1.3162393162393164e-05, |
|
"loss": 0.0009, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.007008700165897608, |
|
"learning_rate": 1.3146113146113146e-05, |
|
"loss": 0.0009, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.007119116373360157, |
|
"learning_rate": 1.3129833129833132e-05, |
|
"loss": 0.0088, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.006735885515809059, |
|
"learning_rate": 1.3113553113553114e-05, |
|
"loss": 0.0011, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.006696558557450771, |
|
"learning_rate": 1.3097273097273098e-05, |
|
"loss": 0.0057, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.01188244204968214, |
|
"learning_rate": 1.3080993080993084e-05, |
|
"loss": 0.0011, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.007251105271279812, |
|
"learning_rate": 1.3064713064713066e-05, |
|
"loss": 0.0357, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.006903903558850288, |
|
"learning_rate": 1.304843304843305e-05, |
|
"loss": 0.0008, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.008923369459807873, |
|
"learning_rate": 1.3032153032153034e-05, |
|
"loss": 0.0008, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.006224838085472584, |
|
"learning_rate": 1.3015873015873018e-05, |
|
"loss": 0.0077, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.00695427879691124, |
|
"learning_rate": 1.2999592999593e-05, |
|
"loss": 0.0008, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.007040718570351601, |
|
"learning_rate": 1.2983312983312984e-05, |
|
"loss": 0.0008, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.006210348103195429, |
|
"learning_rate": 1.296703296703297e-05, |
|
"loss": 0.0015, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.0062638637609779835, |
|
"learning_rate": 1.2950752950752952e-05, |
|
"loss": 0.0044, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.006666597910225391, |
|
"learning_rate": 1.2934472934472934e-05, |
|
"loss": 0.0007, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.0061942501924932, |
|
"learning_rate": 1.291819291819292e-05, |
|
"loss": 0.0011, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.00600019795820117, |
|
"learning_rate": 1.2901912901912904e-05, |
|
"loss": 0.0008, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.006045353598892689, |
|
"learning_rate": 1.2885632885632886e-05, |
|
"loss": 0.0451, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.006641109474003315, |
|
"learning_rate": 1.2869352869352871e-05, |
|
"loss": 0.0008, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.4562086760997772, |
|
"learning_rate": 1.2853072853072854e-05, |
|
"loss": 0.0009, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.0076696197502315044, |
|
"learning_rate": 1.2836792836792838e-05, |
|
"loss": 0.0348, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.006937106605619192, |
|
"learning_rate": 1.2820512820512823e-05, |
|
"loss": 0.0596, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.00782240740954876, |
|
"learning_rate": 1.2804232804232805e-05, |
|
"loss": 0.0851, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.007307849358767271, |
|
"learning_rate": 1.278795278795279e-05, |
|
"loss": 0.0009, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.008858690969645977, |
|
"learning_rate": 1.2771672771672772e-05, |
|
"loss": 0.0021, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.006560084410011768, |
|
"learning_rate": 1.2755392755392757e-05, |
|
"loss": 0.0008, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.06266916543245316, |
|
"learning_rate": 1.273911273911274e-05, |
|
"loss": 0.0011, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.00679628923535347, |
|
"learning_rate": 1.2722832722832723e-05, |
|
"loss": 0.0009, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.006765253376215696, |
|
"learning_rate": 1.2706552706552709e-05, |
|
"loss": 0.0013, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.005858385004103184, |
|
"learning_rate": 1.2690272690272691e-05, |
|
"loss": 0.0007, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.006266339216381311, |
|
"learning_rate": 1.2673992673992674e-05, |
|
"loss": 0.0008, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.006281218025833368, |
|
"learning_rate": 1.265771265771266e-05, |
|
"loss": 0.1082, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.006863302085548639, |
|
"learning_rate": 1.2641432641432643e-05, |
|
"loss": 0.0009, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.013896014541387558, |
|
"learning_rate": 1.2625152625152625e-05, |
|
"loss": 0.0281, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.24578307569026947, |
|
"learning_rate": 1.2608872608872611e-05, |
|
"loss": 0.001, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.011449114419519901, |
|
"learning_rate": 1.2592592592592593e-05, |
|
"loss": 0.0007, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 36.35368728637695, |
|
"learning_rate": 1.2576312576312577e-05, |
|
"loss": 0.0217, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.011718428693711758, |
|
"learning_rate": 1.256003256003256e-05, |
|
"loss": 0.0008, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 10.411919593811035, |
|
"learning_rate": 1.2543752543752545e-05, |
|
"loss": 0.0159, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.006179590709507465, |
|
"learning_rate": 1.2527472527472529e-05, |
|
"loss": 0.0307, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.0063836839981377125, |
|
"learning_rate": 1.2511192511192511e-05, |
|
"loss": 0.0034, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.008047536946833134, |
|
"learning_rate": 1.2494912494912497e-05, |
|
"loss": 0.001, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.010491227731108665, |
|
"learning_rate": 1.247863247863248e-05, |
|
"loss": 0.0008, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.005860119592398405, |
|
"learning_rate": 1.2462352462352463e-05, |
|
"loss": 0.0007, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 10.03593635559082, |
|
"learning_rate": 1.2446072446072449e-05, |
|
"loss": 0.0314, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.006240949500352144, |
|
"learning_rate": 1.2429792429792431e-05, |
|
"loss": 0.0009, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.00653426069766283, |
|
"learning_rate": 1.2413512413512413e-05, |
|
"loss": 0.0008, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.0061131748370826244, |
|
"learning_rate": 1.2397232397232399e-05, |
|
"loss": 0.0385, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.018757157027721405, |
|
"learning_rate": 1.2380952380952383e-05, |
|
"loss": 0.0008, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.005603988189250231, |
|
"learning_rate": 1.2364672364672365e-05, |
|
"loss": 0.0007, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.008327238261699677, |
|
"learning_rate": 1.2348392348392349e-05, |
|
"loss": 0.0007, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.006342690903693438, |
|
"learning_rate": 1.2332112332112333e-05, |
|
"loss": 0.0027, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.007467071060091257, |
|
"learning_rate": 1.2315832315832317e-05, |
|
"loss": 0.001, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.005770612042397261, |
|
"learning_rate": 1.22995522995523e-05, |
|
"loss": 0.0422, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.01268511638045311, |
|
"learning_rate": 1.2283272283272285e-05, |
|
"loss": 0.001, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.025519585236907005, |
|
"learning_rate": 1.2266992266992269e-05, |
|
"loss": 0.019, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 12.875621795654297, |
|
"learning_rate": 1.2250712250712251e-05, |
|
"loss": 0.0206, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.018496304750442505, |
|
"learning_rate": 1.2234432234432237e-05, |
|
"loss": 0.0008, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.005795106291770935, |
|
"learning_rate": 1.2218152218152219e-05, |
|
"loss": 0.0032, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.005989160854369402, |
|
"learning_rate": 1.2201872201872203e-05, |
|
"loss": 0.0007, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.005859148222953081, |
|
"learning_rate": 1.2185592185592185e-05, |
|
"loss": 0.0007, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.008097686804831028, |
|
"learning_rate": 1.216931216931217e-05, |
|
"loss": 0.0007, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.005901312455534935, |
|
"learning_rate": 1.2153032153032153e-05, |
|
"loss": 0.0007, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.006804050877690315, |
|
"learning_rate": 1.2136752136752137e-05, |
|
"loss": 0.0009, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.006251387298107147, |
|
"learning_rate": 1.2120472120472123e-05, |
|
"loss": 0.0423, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.0055562574416399, |
|
"learning_rate": 1.2104192104192105e-05, |
|
"loss": 0.0008, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.006534604821354151, |
|
"learning_rate": 1.2087912087912089e-05, |
|
"loss": 0.0038, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.010235198773443699, |
|
"learning_rate": 1.2071632071632073e-05, |
|
"loss": 0.003, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.006196849979460239, |
|
"learning_rate": 1.2055352055352057e-05, |
|
"loss": 0.0007, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.015244298614561558, |
|
"learning_rate": 1.2039072039072039e-05, |
|
"loss": 0.0007, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.03133594989776611, |
|
"learning_rate": 1.2022792022792024e-05, |
|
"loss": 0.0319, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.012942776083946228, |
|
"learning_rate": 1.2006512006512008e-05, |
|
"loss": 0.0007, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.0054002669639885426, |
|
"learning_rate": 1.199023199023199e-05, |
|
"loss": 0.0386, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.006965090055018663, |
|
"learning_rate": 1.1973951973951975e-05, |
|
"loss": 0.0414, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.005913823377341032, |
|
"learning_rate": 1.1957671957671959e-05, |
|
"loss": 0.0008, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.00729360431432724, |
|
"learning_rate": 1.1941391941391942e-05, |
|
"loss": 0.0015, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.005881543271243572, |
|
"learning_rate": 1.1925111925111925e-05, |
|
"loss": 0.0017, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.00946744717657566, |
|
"learning_rate": 1.190883190883191e-05, |
|
"loss": 0.0008, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.7791256904602051, |
|
"learning_rate": 1.1892551892551893e-05, |
|
"loss": 0.0456, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.08430014550685883, |
|
"learning_rate": 1.1876271876271877e-05, |
|
"loss": 0.0048, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.007524729706346989, |
|
"learning_rate": 1.1859991859991862e-05, |
|
"loss": 0.0008, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.007158556021749973, |
|
"learning_rate": 1.1843711843711844e-05, |
|
"loss": 0.0007, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.006158571690320969, |
|
"learning_rate": 1.1827431827431828e-05, |
|
"loss": 0.0007, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.0062376465648412704, |
|
"learning_rate": 1.1811151811151812e-05, |
|
"loss": 0.0007, |
|
"step": 5030 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.009434174746274948, |
|
"learning_rate": 1.1794871794871796e-05, |
|
"loss": 0.0333, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.006017903331667185, |
|
"learning_rate": 1.1778591778591779e-05, |
|
"loss": 0.0007, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.007532346062362194, |
|
"learning_rate": 1.1762311762311762e-05, |
|
"loss": 0.0007, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.005684974603354931, |
|
"learning_rate": 1.1746031746031748e-05, |
|
"loss": 0.0008, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.005241623613983393, |
|
"learning_rate": 1.172975172975173e-05, |
|
"loss": 0.0306, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.019347479566931725, |
|
"learning_rate": 1.1713471713471714e-05, |
|
"loss": 0.0008, |
|
"step": 5090 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.08700444549322128, |
|
"learning_rate": 1.1697191697191698e-05, |
|
"loss": 0.0009, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.005539617035537958, |
|
"learning_rate": 1.1680911680911682e-05, |
|
"loss": 0.0009, |
|
"step": 5110 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.005851482041180134, |
|
"learning_rate": 1.1664631664631664e-05, |
|
"loss": 0.0007, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 0.007532169576734304, |
|
"learning_rate": 1.164835164835165e-05, |
|
"loss": 0.0011, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 0.00506225973367691, |
|
"learning_rate": 1.1632071632071634e-05, |
|
"loss": 0.0007, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.005589496809989214, |
|
"learning_rate": 1.1615791615791616e-05, |
|
"loss": 0.0007, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.004957486409693956, |
|
"learning_rate": 1.1599511599511602e-05, |
|
"loss": 0.0156, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.00666527496650815, |
|
"learning_rate": 1.1583231583231584e-05, |
|
"loss": 0.0007, |
|
"step": 5170 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 0.006306789815425873, |
|
"learning_rate": 1.1566951566951568e-05, |
|
"loss": 0.0006, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 0.005329395178705454, |
|
"learning_rate": 1.155067155067155e-05, |
|
"loss": 0.0006, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.0049823857843875885, |
|
"learning_rate": 1.1534391534391536e-05, |
|
"loss": 0.0006, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.0051444037817418575, |
|
"learning_rate": 1.1518111518111518e-05, |
|
"loss": 0.0022, |
|
"step": 5210 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.00532697094604373, |
|
"learning_rate": 1.1501831501831502e-05, |
|
"loss": 0.0006, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.006971771828830242, |
|
"learning_rate": 1.1485551485551488e-05, |
|
"loss": 0.0007, |
|
"step": 5230 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.005065458826720715, |
|
"learning_rate": 1.146927146927147e-05, |
|
"loss": 0.0006, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.00542556494474411, |
|
"learning_rate": 1.1452991452991454e-05, |
|
"loss": 0.0006, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.005721778143197298, |
|
"learning_rate": 1.1436711436711438e-05, |
|
"loss": 0.0006, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.0050778863951563835, |
|
"learning_rate": 1.1420431420431422e-05, |
|
"loss": 0.0006, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.005689846817404032, |
|
"learning_rate": 1.1404151404151404e-05, |
|
"loss": 0.0007, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.005032387096434832, |
|
"learning_rate": 1.138787138787139e-05, |
|
"loss": 0.0053, |
|
"step": 5290 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.004602556582540274, |
|
"learning_rate": 1.1371591371591374e-05, |
|
"loss": 0.0006, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.005181928165256977, |
|
"learning_rate": 1.1355311355311356e-05, |
|
"loss": 0.0006, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.004627116955816746, |
|
"learning_rate": 1.1339031339031338e-05, |
|
"loss": 0.0006, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.004680185578763485, |
|
"learning_rate": 1.1322751322751324e-05, |
|
"loss": 0.0006, |
|
"step": 5330 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.00517154298722744, |
|
"learning_rate": 1.1306471306471308e-05, |
|
"loss": 0.0006, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.2655492126941681, |
|
"learning_rate": 1.129019129019129e-05, |
|
"loss": 0.04, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.004791987128555775, |
|
"learning_rate": 1.1273911273911276e-05, |
|
"loss": 0.0027, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.00524140102788806, |
|
"learning_rate": 1.1257631257631258e-05, |
|
"loss": 0.0019, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.004854326602071524, |
|
"learning_rate": 1.1241351241351242e-05, |
|
"loss": 0.0006, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.004912737291306257, |
|
"learning_rate": 1.1225071225071227e-05, |
|
"loss": 0.0229, |
|
"step": 5390 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.009351348504424095, |
|
"learning_rate": 1.120879120879121e-05, |
|
"loss": 0.0006, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.006594196427613497, |
|
"learning_rate": 1.1192511192511194e-05, |
|
"loss": 0.0007, |
|
"step": 5410 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.004785753786563873, |
|
"learning_rate": 1.1176231176231178e-05, |
|
"loss": 0.0006, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.010175659321248531, |
|
"learning_rate": 1.1159951159951162e-05, |
|
"loss": 0.0347, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.007659697439521551, |
|
"learning_rate": 1.1143671143671144e-05, |
|
"loss": 0.0006, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.005518093705177307, |
|
"learning_rate": 1.1127391127391128e-05, |
|
"loss": 0.0007, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.004838414024561644, |
|
"learning_rate": 1.1111111111111113e-05, |
|
"loss": 0.0006, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.004535248037427664, |
|
"learning_rate": 1.1094831094831096e-05, |
|
"loss": 0.0007, |
|
"step": 5470 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.004755628295242786, |
|
"learning_rate": 1.1078551078551078e-05, |
|
"loss": 0.0006, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.007153332699090242, |
|
"learning_rate": 1.1062271062271063e-05, |
|
"loss": 0.0006, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.004593558143824339, |
|
"learning_rate": 1.1045991045991047e-05, |
|
"loss": 0.0006, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.004781143739819527, |
|
"learning_rate": 1.102971102971103e-05, |
|
"loss": 0.0187, |
|
"step": 5510 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.022694548591971397, |
|
"learning_rate": 1.1013431013431015e-05, |
|
"loss": 0.0006, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.004701571073383093, |
|
"learning_rate": 1.0997150997150998e-05, |
|
"loss": 0.0005, |
|
"step": 5530 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.014217639341950417, |
|
"learning_rate": 1.0980870980870981e-05, |
|
"loss": 0.0006, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.0047623575665056705, |
|
"learning_rate": 1.0964590964590967e-05, |
|
"loss": 0.0005, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.004431570880115032, |
|
"learning_rate": 1.094831094831095e-05, |
|
"loss": 0.0006, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.006182719487696886, |
|
"learning_rate": 1.0932030932030933e-05, |
|
"loss": 0.0006, |
|
"step": 5570 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.004717973992228508, |
|
"learning_rate": 1.0915750915750916e-05, |
|
"loss": 0.0005, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.005284770391881466, |
|
"learning_rate": 1.0899470899470901e-05, |
|
"loss": 0.0051, |
|
"step": 5590 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.004852925427258015, |
|
"learning_rate": 1.0883190883190883e-05, |
|
"loss": 0.0129, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.011825304478406906, |
|
"learning_rate": 1.0866910866910867e-05, |
|
"loss": 0.0006, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 5.4084672927856445, |
|
"learning_rate": 1.0850630850630853e-05, |
|
"loss": 0.0014, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.0045865620486438274, |
|
"learning_rate": 1.0834350834350835e-05, |
|
"loss": 0.0012, |
|
"step": 5630 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.004212076775729656, |
|
"learning_rate": 1.0818070818070818e-05, |
|
"loss": 0.0005, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.0043626646511256695, |
|
"learning_rate": 1.0801790801790803e-05, |
|
"loss": 0.0005, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.003995486069470644, |
|
"learning_rate": 1.0785510785510787e-05, |
|
"loss": 0.0005, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 12.674348831176758, |
|
"learning_rate": 1.076923076923077e-05, |
|
"loss": 0.0288, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.004922170657664537, |
|
"learning_rate": 1.0752950752950755e-05, |
|
"loss": 0.0005, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.013311301358044147, |
|
"learning_rate": 1.0736670736670737e-05, |
|
"loss": 0.0006, |
|
"step": 5690 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.004092982970178127, |
|
"learning_rate": 1.0720390720390721e-05, |
|
"loss": 0.0306, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.005637271795421839, |
|
"learning_rate": 1.0704110704110703e-05, |
|
"loss": 0.0483, |
|
"step": 5710 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 8.750419616699219, |
|
"learning_rate": 1.0687830687830689e-05, |
|
"loss": 0.0386, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.01064012385904789, |
|
"learning_rate": 1.0671550671550673e-05, |
|
"loss": 0.0006, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.004589624237269163, |
|
"learning_rate": 1.0655270655270655e-05, |
|
"loss": 0.0006, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.004802080802619457, |
|
"learning_rate": 1.0638990638990641e-05, |
|
"loss": 0.0009, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.004713993053883314, |
|
"learning_rate": 1.0622710622710623e-05, |
|
"loss": 0.0006, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.004530477803200483, |
|
"learning_rate": 1.0606430606430607e-05, |
|
"loss": 0.001, |
|
"step": 5770 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.00422940356656909, |
|
"learning_rate": 1.0590150590150593e-05, |
|
"loss": 0.0007, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.004178835544735193, |
|
"learning_rate": 1.0573870573870575e-05, |
|
"loss": 0.0006, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.006506350357085466, |
|
"learning_rate": 1.0557590557590557e-05, |
|
"loss": 0.0005, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.004273206926882267, |
|
"learning_rate": 1.0541310541310543e-05, |
|
"loss": 0.0005, |
|
"step": 5810 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.004112168215215206, |
|
"learning_rate": 1.0525030525030527e-05, |
|
"loss": 0.0005, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.005212805233895779, |
|
"learning_rate": 1.0508750508750509e-05, |
|
"loss": 0.0005, |
|
"step": 5830 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.004351438954472542, |
|
"learning_rate": 1.0492470492470493e-05, |
|
"loss": 0.0005, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.011514941230416298, |
|
"learning_rate": 1.0476190476190477e-05, |
|
"loss": 0.0005, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.005969716235995293, |
|
"learning_rate": 1.045991045991046e-05, |
|
"loss": 0.0005, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.004150481894612312, |
|
"learning_rate": 1.0443630443630443e-05, |
|
"loss": 0.0005, |
|
"step": 5870 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.003940541297197342, |
|
"learning_rate": 1.0427350427350429e-05, |
|
"loss": 0.0005, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.00408910820260644, |
|
"learning_rate": 1.0411070411070413e-05, |
|
"loss": 0.0005, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.0038459610659629107, |
|
"learning_rate": 1.0394790394790395e-05, |
|
"loss": 0.0006, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.004051607567816973, |
|
"learning_rate": 1.037851037851038e-05, |
|
"loss": 0.0449, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.005520727019757032, |
|
"learning_rate": 1.0362230362230363e-05, |
|
"loss": 0.0078, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.004394978284835815, |
|
"learning_rate": 1.0345950345950347e-05, |
|
"loss": 0.0564, |
|
"step": 5930 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.004857844207435846, |
|
"learning_rate": 1.0329670329670332e-05, |
|
"loss": 0.0005, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.06114115193486214, |
|
"learning_rate": 1.0313390313390315e-05, |
|
"loss": 0.0007, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.004661387763917446, |
|
"learning_rate": 1.0297110297110297e-05, |
|
"loss": 0.0014, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.005134343635290861, |
|
"learning_rate": 1.028083028083028e-05, |
|
"loss": 0.001, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.004655875731259584, |
|
"learning_rate": 1.0264550264550266e-05, |
|
"loss": 0.0006, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.0050579700618982315, |
|
"learning_rate": 1.0248270248270249e-05, |
|
"loss": 0.0015, |
|
"step": 5990 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.0047796061262488365, |
|
"learning_rate": 1.0231990231990233e-05, |
|
"loss": 0.0005, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.003949730657041073, |
|
"learning_rate": 1.0215710215710217e-05, |
|
"loss": 0.0005, |
|
"step": 6010 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.004095940385013819, |
|
"learning_rate": 1.01994301994302e-05, |
|
"loss": 0.0005, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.005133763421326876, |
|
"learning_rate": 1.0183150183150183e-05, |
|
"loss": 0.0005, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.01469303946942091, |
|
"learning_rate": 1.0166870166870168e-05, |
|
"loss": 0.0013, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.004049224779009819, |
|
"learning_rate": 1.0150590150590152e-05, |
|
"loss": 0.0018, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.004054594319313765, |
|
"learning_rate": 1.0134310134310135e-05, |
|
"loss": 0.0184, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.004326994996517897, |
|
"learning_rate": 1.011803011803012e-05, |
|
"loss": 0.0005, |
|
"step": 6070 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.004046597983688116, |
|
"learning_rate": 1.0101750101750102e-05, |
|
"loss": 0.0005, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.00401376374065876, |
|
"learning_rate": 1.0085470085470086e-05, |
|
"loss": 0.0006, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.005812219809740782, |
|
"learning_rate": 1.0069190069190069e-05, |
|
"loss": 0.0006, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.003832985181361437, |
|
"learning_rate": 1.0052910052910054e-05, |
|
"loss": 0.0005, |
|
"step": 6110 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.07061895728111267, |
|
"learning_rate": 1.0036630036630037e-05, |
|
"loss": 0.0005, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.0039010499604046345, |
|
"learning_rate": 1.002035002035002e-05, |
|
"loss": 0.0017, |
|
"step": 6130 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.004468753468245268, |
|
"learning_rate": 1.0004070004070006e-05, |
|
"loss": 0.0022, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.004342631436884403, |
|
"learning_rate": 9.987789987789988e-06, |
|
"loss": 0.0005, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.011564524844288826, |
|
"learning_rate": 9.971509971509972e-06, |
|
"loss": 0.0005, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.00380577496252954, |
|
"learning_rate": 9.955229955229956e-06, |
|
"loss": 0.0005, |
|
"step": 6170 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.0037414473481476307, |
|
"learning_rate": 9.93894993894994e-06, |
|
"loss": 0.0011, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.003814409486949444, |
|
"learning_rate": 9.922669922669922e-06, |
|
"loss": 0.0005, |
|
"step": 6190 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.0039341021329164505, |
|
"learning_rate": 9.906389906389906e-06, |
|
"loss": 0.0005, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.003710733028128743, |
|
"learning_rate": 9.890109890109892e-06, |
|
"loss": 0.0005, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 3.491090774536133, |
|
"learning_rate": 9.873829873829874e-06, |
|
"loss": 0.0356, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.005923949647694826, |
|
"learning_rate": 9.857549857549858e-06, |
|
"loss": 0.0005, |
|
"step": 6230 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.0036399513483047485, |
|
"learning_rate": 9.841269841269842e-06, |
|
"loss": 0.0004, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.003884287318214774, |
|
"learning_rate": 9.824989824989826e-06, |
|
"loss": 0.0005, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.0036194841377437115, |
|
"learning_rate": 9.80870980870981e-06, |
|
"loss": 0.0004, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.005207626614719629, |
|
"learning_rate": 9.792429792429792e-06, |
|
"loss": 0.0004, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.008327057585120201, |
|
"learning_rate": 9.776149776149776e-06, |
|
"loss": 0.0005, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.003949583508074284, |
|
"learning_rate": 9.759869759869762e-06, |
|
"loss": 0.0004, |
|
"step": 6290 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.004071325063705444, |
|
"learning_rate": 9.743589743589744e-06, |
|
"loss": 0.0005, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.0036700996570289135, |
|
"learning_rate": 9.727309727309728e-06, |
|
"loss": 0.0004, |
|
"step": 6310 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.005211398471146822, |
|
"learning_rate": 9.711029711029712e-06, |
|
"loss": 0.0005, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.003708272473886609, |
|
"learning_rate": 9.694749694749696e-06, |
|
"loss": 0.0004, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.0042539420537650585, |
|
"learning_rate": 9.67846967846968e-06, |
|
"loss": 0.0004, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.0038529515732079744, |
|
"learning_rate": 9.662189662189662e-06, |
|
"loss": 0.0004, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.003946115728467703, |
|
"learning_rate": 9.645909645909646e-06, |
|
"loss": 0.0005, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.004324799869209528, |
|
"learning_rate": 9.62962962962963e-06, |
|
"loss": 0.0004, |
|
"step": 6370 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.0038023737724870443, |
|
"learning_rate": 9.613349613349614e-06, |
|
"loss": 0.0004, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.0037666463758796453, |
|
"learning_rate": 9.597069597069598e-06, |
|
"loss": 0.0004, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.0034590172581374645, |
|
"learning_rate": 9.580789580789582e-06, |
|
"loss": 0.017, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.0038201683200895786, |
|
"learning_rate": 9.564509564509566e-06, |
|
"loss": 0.0004, |
|
"step": 6410 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.004171228501945734, |
|
"learning_rate": 9.54822954822955e-06, |
|
"loss": 0.0004, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.0038926773704588413, |
|
"learning_rate": 9.531949531949532e-06, |
|
"loss": 0.0004, |
|
"step": 6430 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.0037587357219308615, |
|
"learning_rate": 9.515669515669516e-06, |
|
"loss": 0.0004, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.0034505994990468025, |
|
"learning_rate": 9.4993894993895e-06, |
|
"loss": 0.0024, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.0034958263859152794, |
|
"learning_rate": 9.483109483109484e-06, |
|
"loss": 0.0004, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.0037652612663805485, |
|
"learning_rate": 9.466829466829468e-06, |
|
"loss": 0.0004, |
|
"step": 6470 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.003452475182712078, |
|
"learning_rate": 9.450549450549452e-06, |
|
"loss": 0.0004, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.005090977996587753, |
|
"learning_rate": 9.434269434269436e-06, |
|
"loss": 0.0004, |
|
"step": 6490 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.0036007205490022898, |
|
"learning_rate": 9.417989417989418e-06, |
|
"loss": 0.0004, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.0033244409132748842, |
|
"learning_rate": 9.401709401709402e-06, |
|
"loss": 0.0004, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.00387198431417346, |
|
"learning_rate": 9.385429385429386e-06, |
|
"loss": 0.0004, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.003582969307899475, |
|
"learning_rate": 9.36914936914937e-06, |
|
"loss": 0.0004, |
|
"step": 6530 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.0032744621858000755, |
|
"learning_rate": 9.352869352869354e-06, |
|
"loss": 0.0005, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.0034951018169522285, |
|
"learning_rate": 9.336589336589338e-06, |
|
"loss": 0.0004, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.0034060273319482803, |
|
"learning_rate": 9.320309320309321e-06, |
|
"loss": 0.0004, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.0034066797234117985, |
|
"learning_rate": 9.304029304029305e-06, |
|
"loss": 0.0004, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.0035453049931675196, |
|
"learning_rate": 9.287749287749288e-06, |
|
"loss": 0.0004, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.0033404843416064978, |
|
"learning_rate": 9.271469271469272e-06, |
|
"loss": 0.0004, |
|
"step": 6590 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.0032289137598127127, |
|
"learning_rate": 9.255189255189256e-06, |
|
"loss": 0.0004, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.0035338301677256823, |
|
"learning_rate": 9.23890923890924e-06, |
|
"loss": 0.0004, |
|
"step": 6610 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.0032329142559319735, |
|
"learning_rate": 9.222629222629223e-06, |
|
"loss": 0.0004, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.0033918411936610937, |
|
"learning_rate": 9.206349206349207e-06, |
|
"loss": 0.0004, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.003434843849390745, |
|
"learning_rate": 9.190069190069191e-06, |
|
"loss": 0.0004, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.0032904883846640587, |
|
"learning_rate": 9.173789173789175e-06, |
|
"loss": 0.0004, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.003165784990414977, |
|
"learning_rate": 9.157509157509158e-06, |
|
"loss": 0.0004, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.0034379889257252216, |
|
"learning_rate": 9.141229141229141e-06, |
|
"loss": 0.0004, |
|
"step": 6670 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.0032244266476482153, |
|
"learning_rate": 9.124949124949125e-06, |
|
"loss": 0.001, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.003119837259873748, |
|
"learning_rate": 9.10866910866911e-06, |
|
"loss": 0.0004, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.0038290254306048155, |
|
"learning_rate": 9.092389092389093e-06, |
|
"loss": 0.0004, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.0032256192062050104, |
|
"learning_rate": 9.076109076109077e-06, |
|
"loss": 0.0004, |
|
"step": 6710 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.004083781037479639, |
|
"learning_rate": 9.059829059829061e-06, |
|
"loss": 0.0004, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.003274232381954789, |
|
"learning_rate": 9.043549043549045e-06, |
|
"loss": 0.0004, |
|
"step": 6730 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.0032298911828547716, |
|
"learning_rate": 9.027269027269027e-06, |
|
"loss": 0.0004, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.0031462605111300945, |
|
"learning_rate": 9.010989010989011e-06, |
|
"loss": 0.0425, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.00312459422275424, |
|
"learning_rate": 8.994708994708995e-06, |
|
"loss": 0.0004, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.0036323266103863716, |
|
"learning_rate": 8.97842897842898e-06, |
|
"loss": 0.0004, |
|
"step": 6770 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.0033034805674105883, |
|
"learning_rate": 8.962148962148963e-06, |
|
"loss": 0.0004, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.003054459812119603, |
|
"learning_rate": 8.945868945868947e-06, |
|
"loss": 0.0004, |
|
"step": 6790 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.005314236972481012, |
|
"learning_rate": 8.929588929588931e-06, |
|
"loss": 0.0004, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.010932357981801033, |
|
"learning_rate": 8.913308913308915e-06, |
|
"loss": 0.0004, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.0031523159705102444, |
|
"learning_rate": 8.897028897028897e-06, |
|
"loss": 0.0004, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.0034312924835830927, |
|
"learning_rate": 8.880748880748881e-06, |
|
"loss": 0.0101, |
|
"step": 6830 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.00318572367541492, |
|
"learning_rate": 8.864468864468865e-06, |
|
"loss": 0.0004, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.0032511164899915457, |
|
"learning_rate": 8.848188848188849e-06, |
|
"loss": 0.0379, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.0037167894188314676, |
|
"learning_rate": 8.831908831908833e-06, |
|
"loss": 0.0004, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.003721152199432254, |
|
"learning_rate": 8.815628815628817e-06, |
|
"loss": 0.0008, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.0030927169136703014, |
|
"learning_rate": 8.7993487993488e-06, |
|
"loss": 0.0004, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.0034221247769892216, |
|
"learning_rate": 8.783068783068783e-06, |
|
"loss": 0.0031, |
|
"step": 6890 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.0033293466549366713, |
|
"learning_rate": 8.766788766788767e-06, |
|
"loss": 0.0004, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.003214113647118211, |
|
"learning_rate": 8.750508750508751e-06, |
|
"loss": 0.0004, |
|
"step": 6910 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.0032116910442709923, |
|
"learning_rate": 8.734228734228735e-06, |
|
"loss": 0.034, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.003405655035749078, |
|
"learning_rate": 8.717948717948719e-06, |
|
"loss": 0.0515, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.003949843347072601, |
|
"learning_rate": 8.701668701668703e-06, |
|
"loss": 0.0508, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.0030140685848891735, |
|
"learning_rate": 8.685388685388687e-06, |
|
"loss": 0.0385, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.0034769896883517504, |
|
"learning_rate": 8.66910866910867e-06, |
|
"loss": 0.0004, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.005965403746813536, |
|
"learning_rate": 8.652828652828653e-06, |
|
"loss": 0.0454, |
|
"step": 6970 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.004475270863622427, |
|
"learning_rate": 8.636548636548637e-06, |
|
"loss": 0.0005, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.0039094300009310246, |
|
"learning_rate": 8.62026862026862e-06, |
|
"loss": 0.0005, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.004547227174043655, |
|
"learning_rate": 8.603988603988605e-06, |
|
"loss": 0.0004, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.0033658877946436405, |
|
"learning_rate": 8.587708587708589e-06, |
|
"loss": 0.0005, |
|
"step": 7010 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.0037282053381204605, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 0.0005, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.012108271941542625, |
|
"learning_rate": 8.555148555148557e-06, |
|
"loss": 0.0005, |
|
"step": 7030 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.00378889380954206, |
|
"learning_rate": 8.53886853886854e-06, |
|
"loss": 0.0142, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.0037225610576570034, |
|
"learning_rate": 8.522588522588523e-06, |
|
"loss": 0.0009, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.005083514377474785, |
|
"learning_rate": 8.506308506308507e-06, |
|
"loss": 0.0004, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.0035945470444858074, |
|
"learning_rate": 8.49002849002849e-06, |
|
"loss": 0.0005, |
|
"step": 7070 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.0031938895117491484, |
|
"learning_rate": 8.473748473748475e-06, |
|
"loss": 0.0007, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.007891247980296612, |
|
"learning_rate": 8.457468457468459e-06, |
|
"loss": 0.0004, |
|
"step": 7090 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.003397688502445817, |
|
"learning_rate": 8.44118844118844e-06, |
|
"loss": 0.0004, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.004096095450222492, |
|
"learning_rate": 8.424908424908426e-06, |
|
"loss": 0.0004, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.004969074856489897, |
|
"learning_rate": 8.40862840862841e-06, |
|
"loss": 0.0004, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.002869043732061982, |
|
"learning_rate": 8.392348392348393e-06, |
|
"loss": 0.0376, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.004255395848304033, |
|
"learning_rate": 8.376068376068377e-06, |
|
"loss": 0.0004, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.003371414029970765, |
|
"learning_rate": 8.35978835978836e-06, |
|
"loss": 0.0005, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.0031468465458601713, |
|
"learning_rate": 8.343508343508344e-06, |
|
"loss": 0.0004, |
|
"step": 7160 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.004064807202666998, |
|
"learning_rate": 8.327228327228328e-06, |
|
"loss": 0.0004, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.0038253762759268284, |
|
"learning_rate": 8.31094831094831e-06, |
|
"loss": 0.0275, |
|
"step": 7180 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.0029601927381008863, |
|
"learning_rate": 8.294668294668296e-06, |
|
"loss": 0.0162, |
|
"step": 7190 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.0035592832136899233, |
|
"learning_rate": 8.278388278388278e-06, |
|
"loss": 0.001, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.003166656941175461, |
|
"learning_rate": 8.262108262108262e-06, |
|
"loss": 0.0004, |
|
"step": 7210 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.0038591506890952587, |
|
"learning_rate": 8.245828245828246e-06, |
|
"loss": 0.0004, |
|
"step": 7220 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.004316645674407482, |
|
"learning_rate": 8.22954822954823e-06, |
|
"loss": 0.0339, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.003106352873146534, |
|
"learning_rate": 8.213268213268214e-06, |
|
"loss": 0.0004, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.003383921692147851, |
|
"learning_rate": 8.196988196988198e-06, |
|
"loss": 0.0004, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.003904301906004548, |
|
"learning_rate": 8.18070818070818e-06, |
|
"loss": 0.009, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.002857522340491414, |
|
"learning_rate": 8.164428164428166e-06, |
|
"loss": 0.0004, |
|
"step": 7270 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.0028671324253082275, |
|
"learning_rate": 8.148148148148148e-06, |
|
"loss": 0.0004, |
|
"step": 7280 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.0028230687603354454, |
|
"learning_rate": 8.131868131868132e-06, |
|
"loss": 0.0009, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.0028381363954395056, |
|
"learning_rate": 8.115588115588116e-06, |
|
"loss": 0.0003, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.0028295046649873257, |
|
"learning_rate": 8.0993080993081e-06, |
|
"loss": 0.0099, |
|
"step": 7310 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.0051268660463392735, |
|
"learning_rate": 8.083028083028084e-06, |
|
"loss": 0.0004, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.006851341109722853, |
|
"learning_rate": 8.066748066748066e-06, |
|
"loss": 0.0569, |
|
"step": 7330 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.003248844761401415, |
|
"learning_rate": 8.05046805046805e-06, |
|
"loss": 0.0004, |
|
"step": 7340 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.003859333461150527, |
|
"learning_rate": 8.034188034188036e-06, |
|
"loss": 0.0012, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.002941732294857502, |
|
"learning_rate": 8.017908017908018e-06, |
|
"loss": 0.0145, |
|
"step": 7360 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.0032136046793311834, |
|
"learning_rate": 8.001628001628002e-06, |
|
"loss": 0.0004, |
|
"step": 7370 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.0037520972546190023, |
|
"learning_rate": 7.985347985347986e-06, |
|
"loss": 0.0003, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.002765586832538247, |
|
"learning_rate": 7.96906796906797e-06, |
|
"loss": 0.0003, |
|
"step": 7390 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 0.002917984500527382, |
|
"learning_rate": 7.952787952787954e-06, |
|
"loss": 0.0003, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.002771808998659253, |
|
"learning_rate": 7.936507936507936e-06, |
|
"loss": 0.0003, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.0028077957686036825, |
|
"learning_rate": 7.92022792022792e-06, |
|
"loss": 0.0003, |
|
"step": 7420 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 0.014859122224152088, |
|
"learning_rate": 7.903947903947906e-06, |
|
"loss": 0.0187, |
|
"step": 7430 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.0029327922966331244, |
|
"learning_rate": 7.887667887667888e-06, |
|
"loss": 0.0003, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 0.003155101090669632, |
|
"learning_rate": 7.871387871387872e-06, |
|
"loss": 0.0003, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.002822224283590913, |
|
"learning_rate": 7.855107855107856e-06, |
|
"loss": 0.0003, |
|
"step": 7460 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.002726204926148057, |
|
"learning_rate": 7.83882783882784e-06, |
|
"loss": 0.0003, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 0.0026202842127531767, |
|
"learning_rate": 7.822547822547824e-06, |
|
"loss": 0.0004, |
|
"step": 7480 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.0026620635762810707, |
|
"learning_rate": 7.806267806267806e-06, |
|
"loss": 0.0003, |
|
"step": 7490 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 0.0026845140382647514, |
|
"learning_rate": 7.78998778998779e-06, |
|
"loss": 0.0003, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.002793940482661128, |
|
"learning_rate": 7.773707773707776e-06, |
|
"loss": 0.0004, |
|
"step": 7510 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.002819318324327469, |
|
"learning_rate": 7.757427757427758e-06, |
|
"loss": 0.0003, |
|
"step": 7520 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.0027769345324486494, |
|
"learning_rate": 7.741147741147742e-06, |
|
"loss": 0.0003, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.002659664023667574, |
|
"learning_rate": 7.724867724867726e-06, |
|
"loss": 0.0003, |
|
"step": 7540 |
|
}, |
|
{ |
|
"epoch": 3.07, |
|
"grad_norm": 0.0025388060603290796, |
|
"learning_rate": 7.70858770858771e-06, |
|
"loss": 0.0003, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.002629263559356332, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.0003, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 0.0025471756234765053, |
|
"learning_rate": 7.676027676027676e-06, |
|
"loss": 0.0003, |
|
"step": 7570 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.006246237549930811, |
|
"learning_rate": 7.65974765974766e-06, |
|
"loss": 0.0003, |
|
"step": 7580 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.0031642026733607054, |
|
"learning_rate": 7.643467643467644e-06, |
|
"loss": 0.0003, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 0.0028460524044930935, |
|
"learning_rate": 7.627187627187628e-06, |
|
"loss": 0.0003, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.0027321220841258764, |
|
"learning_rate": 7.610907610907612e-06, |
|
"loss": 0.0004, |
|
"step": 7610 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 0.07277552038431168, |
|
"learning_rate": 7.594627594627595e-06, |
|
"loss": 0.0003, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.002561114262789488, |
|
"learning_rate": 7.578347578347579e-06, |
|
"loss": 0.0003, |
|
"step": 7630 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.002666006563231349, |
|
"learning_rate": 7.5620675620675634e-06, |
|
"loss": 0.0003, |
|
"step": 7640 |
|
}, |
|
{ |
|
"epoch": 3.11, |
|
"grad_norm": 0.003249433124437928, |
|
"learning_rate": 7.5457875457875465e-06, |
|
"loss": 0.0003, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.002814142033457756, |
|
"learning_rate": 7.5295075295075305e-06, |
|
"loss": 0.0003, |
|
"step": 7660 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 0.002647695131599903, |
|
"learning_rate": 7.5132275132275136e-06, |
|
"loss": 0.0003, |
|
"step": 7670 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.0028357000555843115, |
|
"learning_rate": 7.4969474969474975e-06, |
|
"loss": 0.0003, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.002574663609266281, |
|
"learning_rate": 7.4806674806674814e-06, |
|
"loss": 0.0003, |
|
"step": 7690 |
|
}, |
|
{ |
|
"epoch": 3.13, |
|
"grad_norm": 0.002485772827640176, |
|
"learning_rate": 7.4643874643874645e-06, |
|
"loss": 0.0004, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.0026384114753454924, |
|
"learning_rate": 7.448107448107449e-06, |
|
"loss": 0.0003, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 3.14, |
|
"grad_norm": 0.0025012667756527662, |
|
"learning_rate": 7.4318274318274316e-06, |
|
"loss": 0.0003, |
|
"step": 7720 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.0023603325244039297, |
|
"learning_rate": 7.415547415547416e-06, |
|
"loss": 0.0008, |
|
"step": 7730 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.006851641461253166, |
|
"learning_rate": 7.3992673992674e-06, |
|
"loss": 0.0003, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 0.0029785565566271544, |
|
"learning_rate": 7.382987382987383e-06, |
|
"loss": 0.0003, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.002378121018409729, |
|
"learning_rate": 7.366707366707367e-06, |
|
"loss": 0.0062, |
|
"step": 7760 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 0.0024877325631678104, |
|
"learning_rate": 7.350427350427351e-06, |
|
"loss": 0.0003, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.004979600198566914, |
|
"learning_rate": 7.334147334147334e-06, |
|
"loss": 0.0003, |
|
"step": 7780 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.002649629721418023, |
|
"learning_rate": 7.317867317867319e-06, |
|
"loss": 0.0003, |
|
"step": 7790 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 0.0030928929336369038, |
|
"learning_rate": 7.301587301587301e-06, |
|
"loss": 0.0003, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.00250143650919199, |
|
"learning_rate": 7.285307285307286e-06, |
|
"loss": 0.0003, |
|
"step": 7810 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.002448960905894637, |
|
"learning_rate": 7.26902726902727e-06, |
|
"loss": 0.0003, |
|
"step": 7820 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.0023297348525375128, |
|
"learning_rate": 7.252747252747253e-06, |
|
"loss": 0.0003, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.0023908980656415224, |
|
"learning_rate": 7.236467236467237e-06, |
|
"loss": 0.0003, |
|
"step": 7840 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.003359014866873622, |
|
"learning_rate": 7.22018722018722e-06, |
|
"loss": 0.0003, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.002836639992892742, |
|
"learning_rate": 7.203907203907204e-06, |
|
"loss": 0.0003, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.0024746765848249197, |
|
"learning_rate": 7.187627187627189e-06, |
|
"loss": 0.0003, |
|
"step": 7870 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 0.002388924825936556, |
|
"learning_rate": 7.171347171347171e-06, |
|
"loss": 0.0003, |
|
"step": 7880 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 0.0024251139257103205, |
|
"learning_rate": 7.155067155067156e-06, |
|
"loss": 0.0003, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.00242208456620574, |
|
"learning_rate": 7.13878713878714e-06, |
|
"loss": 0.0003, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.0023256507702171803, |
|
"learning_rate": 7.122507122507123e-06, |
|
"loss": 0.0003, |
|
"step": 7910 |
|
}, |
|
{ |
|
"epoch": 3.22, |
|
"grad_norm": 0.0022887035738676786, |
|
"learning_rate": 7.106227106227107e-06, |
|
"loss": 0.0003, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.0022210038732737303, |
|
"learning_rate": 7.08994708994709e-06, |
|
"loss": 0.0003, |
|
"step": 7930 |
|
}, |
|
{ |
|
"epoch": 3.23, |
|
"grad_norm": 0.002328604692593217, |
|
"learning_rate": 7.073667073667074e-06, |
|
"loss": 0.0003, |
|
"step": 7940 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.002483953256160021, |
|
"learning_rate": 7.057387057387059e-06, |
|
"loss": 0.0003, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.002675483236089349, |
|
"learning_rate": 7.041107041107041e-06, |
|
"loss": 0.0003, |
|
"step": 7960 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 0.0023732264526188374, |
|
"learning_rate": 7.024827024827026e-06, |
|
"loss": 0.0003, |
|
"step": 7970 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.002226916840299964, |
|
"learning_rate": 7.008547008547009e-06, |
|
"loss": 0.0003, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 3.25, |
|
"grad_norm": 0.003264982718974352, |
|
"learning_rate": 6.992266992266993e-06, |
|
"loss": 0.0003, |
|
"step": 7990 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.0026976047083735466, |
|
"learning_rate": 6.975986975986977e-06, |
|
"loss": 0.0003, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.002336106961593032, |
|
"learning_rate": 6.95970695970696e-06, |
|
"loss": 0.0003, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 0.0023025060072541237, |
|
"learning_rate": 6.943426943426944e-06, |
|
"loss": 0.0003, |
|
"step": 8020 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 0.0024826654698699713, |
|
"learning_rate": 6.927146927146929e-06, |
|
"loss": 0.0003, |
|
"step": 8030 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"grad_norm": 0.002214565174654126, |
|
"learning_rate": 6.910866910866911e-06, |
|
"loss": 0.0003, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.002279749372974038, |
|
"learning_rate": 6.894586894586896e-06, |
|
"loss": 0.0003, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.002262295223772526, |
|
"learning_rate": 6.878306878306879e-06, |
|
"loss": 0.0003, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 0.0022824567276984453, |
|
"learning_rate": 6.862026862026863e-06, |
|
"loss": 0.0003, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.0022059327457100153, |
|
"learning_rate": 6.845746845746847e-06, |
|
"loss": 0.0003, |
|
"step": 8080 |
|
}, |
|
{ |
|
"epoch": 3.29, |
|
"grad_norm": 0.0022225133143365383, |
|
"learning_rate": 6.82946682946683e-06, |
|
"loss": 0.0003, |
|
"step": 8090 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.0030766648706048727, |
|
"learning_rate": 6.813186813186814e-06, |
|
"loss": 0.0003, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.0020688914228230715, |
|
"learning_rate": 6.796906796906797e-06, |
|
"loss": 0.0003, |
|
"step": 8110 |
|
}, |
|
{ |
|
"epoch": 3.3, |
|
"grad_norm": 0.0026230113580822945, |
|
"learning_rate": 6.780626780626781e-06, |
|
"loss": 0.0003, |
|
"step": 8120 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.0027380469255149364, |
|
"learning_rate": 6.7643467643467655e-06, |
|
"loss": 0.0002, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 0.0020218545105308294, |
|
"learning_rate": 6.748066748066749e-06, |
|
"loss": 0.0002, |
|
"step": 8140 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.0022498080506920815, |
|
"learning_rate": 6.7317867317867326e-06, |
|
"loss": 0.0002, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 0.0026646710466593504, |
|
"learning_rate": 6.715506715506716e-06, |
|
"loss": 0.0002, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.0021166689693927765, |
|
"learning_rate": 6.6992266992267e-06, |
|
"loss": 0.0002, |
|
"step": 8170 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.0022176315542310476, |
|
"learning_rate": 6.6829466829466836e-06, |
|
"loss": 0.0003, |
|
"step": 8180 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 0.0020941500551998615, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.0002, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.002201402559876442, |
|
"learning_rate": 6.650386650386651e-06, |
|
"loss": 0.0003, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 0.002235386986285448, |
|
"learning_rate": 6.634106634106635e-06, |
|
"loss": 0.0062, |
|
"step": 8210 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.002202383242547512, |
|
"learning_rate": 6.6178266178266185e-06, |
|
"loss": 0.0002, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.002144381171092391, |
|
"learning_rate": 6.601546601546602e-06, |
|
"loss": 0.0002, |
|
"step": 8230 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 0.0027761892415583134, |
|
"learning_rate": 6.5852665852665855e-06, |
|
"loss": 0.0003, |
|
"step": 8240 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.002119843615218997, |
|
"learning_rate": 6.5689865689865694e-06, |
|
"loss": 0.0002, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 0.003361073322594166, |
|
"learning_rate": 6.552706552706553e-06, |
|
"loss": 0.0002, |
|
"step": 8260 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.0021668022964149714, |
|
"learning_rate": 6.5364265364265365e-06, |
|
"loss": 0.0003, |
|
"step": 8270 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.0020495818462222815, |
|
"learning_rate": 6.5201465201465204e-06, |
|
"loss": 0.0002, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 0.002108585089445114, |
|
"learning_rate": 6.5038665038665035e-06, |
|
"loss": 0.0004, |
|
"step": 8290 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.0022084820084273815, |
|
"learning_rate": 6.487586487586488e-06, |
|
"loss": 0.0094, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.38, |
|
"grad_norm": 0.002132968744263053, |
|
"learning_rate": 6.471306471306472e-06, |
|
"loss": 0.0002, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.002239073161035776, |
|
"learning_rate": 6.455026455026455e-06, |
|
"loss": 0.0002, |
|
"step": 8320 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.00218349602073431, |
|
"learning_rate": 6.438746438746439e-06, |
|
"loss": 0.0002, |
|
"step": 8330 |
|
}, |
|
{ |
|
"epoch": 3.39, |
|
"grad_norm": 0.00208345171995461, |
|
"learning_rate": 6.422466422466423e-06, |
|
"loss": 0.0002, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.003050567815080285, |
|
"learning_rate": 6.406186406186406e-06, |
|
"loss": 0.0002, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.0019847999792546034, |
|
"learning_rate": 6.38990638990639e-06, |
|
"loss": 0.0003, |
|
"step": 8360 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.0020100034307688475, |
|
"learning_rate": 6.373626373626373e-06, |
|
"loss": 0.0002, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.0020706066861748695, |
|
"learning_rate": 6.357346357346358e-06, |
|
"loss": 0.0002, |
|
"step": 8380 |
|
}, |
|
{ |
|
"epoch": 3.41, |
|
"grad_norm": 0.0019506254466250539, |
|
"learning_rate": 6.341066341066342e-06, |
|
"loss": 0.0002, |
|
"step": 8390 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.0020071598701179028, |
|
"learning_rate": 6.324786324786325e-06, |
|
"loss": 0.0002, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 0.002606179565191269, |
|
"learning_rate": 6.308506308506309e-06, |
|
"loss": 0.0467, |
|
"step": 8410 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.0021410868503153324, |
|
"learning_rate": 6.292226292226292e-06, |
|
"loss": 0.0002, |
|
"step": 8420 |
|
}, |
|
{ |
|
"epoch": 3.43, |
|
"grad_norm": 0.002439359435811639, |
|
"learning_rate": 6.275946275946276e-06, |
|
"loss": 0.0002, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.0037037180736660957, |
|
"learning_rate": 6.25966625966626e-06, |
|
"loss": 0.0002, |
|
"step": 8440 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.0022582276724278927, |
|
"learning_rate": 6.243386243386243e-06, |
|
"loss": 0.0002, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 0.006983071565628052, |
|
"learning_rate": 6.227106227106228e-06, |
|
"loss": 0.0002, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.01085950993001461, |
|
"learning_rate": 6.210826210826212e-06, |
|
"loss": 0.0003, |
|
"step": 8470 |
|
}, |
|
{ |
|
"epoch": 3.45, |
|
"grad_norm": 0.0021798298694193363, |
|
"learning_rate": 6.194546194546195e-06, |
|
"loss": 0.0002, |
|
"step": 8480 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.0021102093160152435, |
|
"learning_rate": 6.178266178266179e-06, |
|
"loss": 0.0002, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.0021143911872059107, |
|
"learning_rate": 6.161986161986162e-06, |
|
"loss": 0.0002, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 0.002472953638061881, |
|
"learning_rate": 6.145706145706146e-06, |
|
"loss": 0.0002, |
|
"step": 8510 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 0.0019736222457140684, |
|
"learning_rate": 6.12942612942613e-06, |
|
"loss": 0.0002, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 3.47, |
|
"grad_norm": 0.001965272007510066, |
|
"learning_rate": 6.113146113146113e-06, |
|
"loss": 0.0002, |
|
"step": 8530 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.001975101651623845, |
|
"learning_rate": 6.096866096866098e-06, |
|
"loss": 0.0002, |
|
"step": 8540 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.002040453255176544, |
|
"learning_rate": 6.080586080586081e-06, |
|
"loss": 0.0002, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 7.389462471008301, |
|
"learning_rate": 6.064306064306065e-06, |
|
"loss": 0.034, |
|
"step": 8560 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.0022456683218479156, |
|
"learning_rate": 6.048026048026049e-06, |
|
"loss": 0.0002, |
|
"step": 8570 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.003975760657340288, |
|
"learning_rate": 6.031746031746032e-06, |
|
"loss": 0.0002, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.0020120914559811354, |
|
"learning_rate": 6.015466015466016e-06, |
|
"loss": 0.0002, |
|
"step": 8590 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 0.0022050223778933287, |
|
"learning_rate": 5.999185999186001e-06, |
|
"loss": 0.0002, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 3.5, |
|
"grad_norm": 6.6309919357299805, |
|
"learning_rate": 5.982905982905983e-06, |
|
"loss": 0.043, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.0022617350332438946, |
|
"learning_rate": 5.966625966625968e-06, |
|
"loss": 0.0002, |
|
"step": 8620 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 0.0019437572918832302, |
|
"learning_rate": 5.950345950345951e-06, |
|
"loss": 0.0002, |
|
"step": 8630 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.001993882469832897, |
|
"learning_rate": 5.934065934065935e-06, |
|
"loss": 0.0002, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.0022044796496629715, |
|
"learning_rate": 5.917785917785919e-06, |
|
"loss": 0.0002, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 0.0020595360547304153, |
|
"learning_rate": 5.901505901505902e-06, |
|
"loss": 0.0004, |
|
"step": 8660 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 0.002459390088915825, |
|
"learning_rate": 5.885225885225886e-06, |
|
"loss": 0.0002, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 0.0018390618497505784, |
|
"learning_rate": 5.868945868945869e-06, |
|
"loss": 0.0002, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.002049500122666359, |
|
"learning_rate": 5.852665852665853e-06, |
|
"loss": 0.0002, |
|
"step": 8690 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.001947426819242537, |
|
"learning_rate": 5.8363858363858375e-06, |
|
"loss": 0.0002, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 3.54, |
|
"grad_norm": 0.0030878265388309956, |
|
"learning_rate": 5.820105820105821e-06, |
|
"loss": 0.0002, |
|
"step": 8710 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.001884807599708438, |
|
"learning_rate": 5.8038258038258045e-06, |
|
"loss": 0.0002, |
|
"step": 8720 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 0.0019810153171420097, |
|
"learning_rate": 5.7875457875457885e-06, |
|
"loss": 0.0002, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.001923812204040587, |
|
"learning_rate": 5.7712657712657716e-06, |
|
"loss": 0.0002, |
|
"step": 8740 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 0.001998158637434244, |
|
"learning_rate": 5.7549857549857555e-06, |
|
"loss": 0.0002, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.0018681226065382361, |
|
"learning_rate": 5.738705738705739e-06, |
|
"loss": 0.0002, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.006764058023691177, |
|
"learning_rate": 5.7224257224257225e-06, |
|
"loss": 0.0068, |
|
"step": 8770 |
|
}, |
|
{ |
|
"epoch": 3.57, |
|
"grad_norm": 0.003150691743940115, |
|
"learning_rate": 5.706145706145707e-06, |
|
"loss": 0.0007, |
|
"step": 8780 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.00217335089109838, |
|
"learning_rate": 5.68986568986569e-06, |
|
"loss": 0.0002, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 0.00865620281547308, |
|
"learning_rate": 5.673585673585674e-06, |
|
"loss": 0.0002, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.0020344313234090805, |
|
"learning_rate": 5.6573056573056575e-06, |
|
"loss": 0.0002, |
|
"step": 8810 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.0018948889337480068, |
|
"learning_rate": 5.641025641025641e-06, |
|
"loss": 0.0002, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 3.59, |
|
"grad_norm": 0.001868214923888445, |
|
"learning_rate": 5.624745624745625e-06, |
|
"loss": 0.0002, |
|
"step": 8830 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.0019942354410886765, |
|
"learning_rate": 5.6084656084656084e-06, |
|
"loss": 0.0002, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.0018839197000488639, |
|
"learning_rate": 5.592185592185592e-06, |
|
"loss": 0.0002, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.0022100857459008694, |
|
"learning_rate": 5.575905575905577e-06, |
|
"loss": 0.0002, |
|
"step": 8860 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.0019310906063765287, |
|
"learning_rate": 5.55962555962556e-06, |
|
"loss": 0.0002, |
|
"step": 8870 |
|
}, |
|
{ |
|
"epoch": 3.61, |
|
"grad_norm": 0.002325033536180854, |
|
"learning_rate": 5.543345543345544e-06, |
|
"loss": 0.0002, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.0017883091932162642, |
|
"learning_rate": 5.527065527065527e-06, |
|
"loss": 0.0002, |
|
"step": 8890 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.0018799800891429186, |
|
"learning_rate": 5.510785510785511e-06, |
|
"loss": 0.0002, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.0017864195397123694, |
|
"learning_rate": 5.494505494505495e-06, |
|
"loss": 0.0002, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.0018234961898997426, |
|
"learning_rate": 5.478225478225478e-06, |
|
"loss": 0.0002, |
|
"step": 8920 |
|
}, |
|
{ |
|
"epoch": 3.63, |
|
"grad_norm": 0.0017443567048758268, |
|
"learning_rate": 5.461945461945462e-06, |
|
"loss": 0.0002, |
|
"step": 8930 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.0017067781882360578, |
|
"learning_rate": 5.445665445665445e-06, |
|
"loss": 0.0002, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 0.002276692306622863, |
|
"learning_rate": 5.42938542938543e-06, |
|
"loss": 0.0186, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.0017357119359076023, |
|
"learning_rate": 5.413105413105414e-06, |
|
"loss": 0.0002, |
|
"step": 8960 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.0019965972751379013, |
|
"learning_rate": 5.396825396825397e-06, |
|
"loss": 0.0002, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.0017553390935063362, |
|
"learning_rate": 5.380545380545381e-06, |
|
"loss": 0.0003, |
|
"step": 8980 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.0019675048533827066, |
|
"learning_rate": 5.364265364265364e-06, |
|
"loss": 0.0002, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 3.66, |
|
"grad_norm": 0.002049475908279419, |
|
"learning_rate": 5.347985347985348e-06, |
|
"loss": 0.0002, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.0019142305245622993, |
|
"learning_rate": 5.331705331705332e-06, |
|
"loss": 0.0002, |
|
"step": 9010 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 0.0018189084948971868, |
|
"learning_rate": 5.315425315425315e-06, |
|
"loss": 0.042, |
|
"step": 9020 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.0019228870514780283, |
|
"learning_rate": 5.2991452991453e-06, |
|
"loss": 0.0005, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.002307659713551402, |
|
"learning_rate": 5.282865282865284e-06, |
|
"loss": 0.0002, |
|
"step": 9040 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 0.0021766172721982002, |
|
"learning_rate": 5.266585266585267e-06, |
|
"loss": 0.0002, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.0017359366174787283, |
|
"learning_rate": 5.250305250305251e-06, |
|
"loss": 0.0341, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 0.0017763186478987336, |
|
"learning_rate": 5.234025234025234e-06, |
|
"loss": 0.0002, |
|
"step": 9070 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.001665986143052578, |
|
"learning_rate": 5.217745217745218e-06, |
|
"loss": 0.0008, |
|
"step": 9080 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.0017538231331855059, |
|
"learning_rate": 5.201465201465202e-06, |
|
"loss": 0.0002, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 3.7, |
|
"grad_norm": 0.0016558667412027717, |
|
"learning_rate": 5.185185185185185e-06, |
|
"loss": 0.0002, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.0018909978680312634, |
|
"learning_rate": 5.16890516890517e-06, |
|
"loss": 0.0002, |
|
"step": 9110 |
|
}, |
|
{ |
|
"epoch": 3.71, |
|
"grad_norm": 0.0017842132365331054, |
|
"learning_rate": 5.152625152625153e-06, |
|
"loss": 0.0002, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.0017819767817854881, |
|
"learning_rate": 5.136345136345137e-06, |
|
"loss": 0.0002, |
|
"step": 9130 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.00168974872212857, |
|
"learning_rate": 5.120065120065121e-06, |
|
"loss": 0.0002, |
|
"step": 9140 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 0.0017720448086038232, |
|
"learning_rate": 5.103785103785104e-06, |
|
"loss": 0.0002, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.0017071804031729698, |
|
"learning_rate": 5.087505087505088e-06, |
|
"loss": 0.0002, |
|
"step": 9160 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.0018827036255970597, |
|
"learning_rate": 5.071225071225072e-06, |
|
"loss": 0.0002, |
|
"step": 9170 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.0022226206492632627, |
|
"learning_rate": 5.054945054945055e-06, |
|
"loss": 0.0002, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.0019109738059341908, |
|
"learning_rate": 5.03866503866504e-06, |
|
"loss": 0.0494, |
|
"step": 9190 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 0.0019527949625626206, |
|
"learning_rate": 5.022385022385023e-06, |
|
"loss": 0.0002, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.0020662578754127026, |
|
"learning_rate": 5.006105006105007e-06, |
|
"loss": 0.0415, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 3.75, |
|
"grad_norm": 0.002151912311092019, |
|
"learning_rate": 4.98982498982499e-06, |
|
"loss": 0.0003, |
|
"step": 9220 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.001946290722116828, |
|
"learning_rate": 4.973544973544974e-06, |
|
"loss": 0.0002, |
|
"step": 9230 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.001949216122739017, |
|
"learning_rate": 4.957264957264958e-06, |
|
"loss": 0.0002, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.002394681563600898, |
|
"learning_rate": 4.9409849409849416e-06, |
|
"loss": 0.0352, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.0022585808765143156, |
|
"learning_rate": 4.924704924704925e-06, |
|
"loss": 0.0471, |
|
"step": 9260 |
|
}, |
|
{ |
|
"epoch": 3.77, |
|
"grad_norm": 0.002393248025327921, |
|
"learning_rate": 4.908424908424909e-06, |
|
"loss": 0.0002, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.02360522374510765, |
|
"learning_rate": 4.8921448921448925e-06, |
|
"loss": 0.0003, |
|
"step": 9280 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 0.0023453827016055584, |
|
"learning_rate": 4.8758648758648765e-06, |
|
"loss": 0.0003, |
|
"step": 9290 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.00220270873978734, |
|
"learning_rate": 4.8595848595848596e-06, |
|
"loss": 0.0003, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.0021812734194099903, |
|
"learning_rate": 4.8433048433048435e-06, |
|
"loss": 0.0003, |
|
"step": 9310 |
|
}, |
|
{ |
|
"epoch": 3.79, |
|
"grad_norm": 0.0021124074701219797, |
|
"learning_rate": 4.8270248270248275e-06, |
|
"loss": 0.0002, |
|
"step": 9320 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.002312874887138605, |
|
"learning_rate": 4.810744810744811e-06, |
|
"loss": 0.0003, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.0025071410927921534, |
|
"learning_rate": 4.7944647944647945e-06, |
|
"loss": 0.0002, |
|
"step": 9340 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.0022760110441595316, |
|
"learning_rate": 4.7781847781847784e-06, |
|
"loss": 0.0003, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.002391684567555785, |
|
"learning_rate": 4.761904761904762e-06, |
|
"loss": 0.0002, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 0.0021324707195162773, |
|
"learning_rate": 4.745624745624746e-06, |
|
"loss": 0.0002, |
|
"step": 9370 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.0021602713968604803, |
|
"learning_rate": 4.729344729344729e-06, |
|
"loss": 0.0002, |
|
"step": 9380 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.003342408686876297, |
|
"learning_rate": 4.713064713064713e-06, |
|
"loss": 0.0002, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.003199818776920438, |
|
"learning_rate": 4.696784696784697e-06, |
|
"loss": 0.0002, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.002259862143546343, |
|
"learning_rate": 4.680504680504681e-06, |
|
"loss": 0.0005, |
|
"step": 9410 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.0020261441823095083, |
|
"learning_rate": 4.664224664224664e-06, |
|
"loss": 0.0002, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 0.001844863872975111, |
|
"learning_rate": 4.647944647944648e-06, |
|
"loss": 0.0003, |
|
"step": 9430 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 0.0022536173928529024, |
|
"learning_rate": 4.631664631664632e-06, |
|
"loss": 0.0002, |
|
"step": 9440 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.001871871529147029, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 0.0003, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.0019549911376088858, |
|
"learning_rate": 4.599104599104599e-06, |
|
"loss": 0.0414, |
|
"step": 9460 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 0.002595256781205535, |
|
"learning_rate": 4.582824582824583e-06, |
|
"loss": 0.0002, |
|
"step": 9470 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.0021485532633960247, |
|
"learning_rate": 4.566544566544567e-06, |
|
"loss": 0.0002, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 3.86, |
|
"grad_norm": 0.0018896989058703184, |
|
"learning_rate": 4.55026455026455e-06, |
|
"loss": 0.0264, |
|
"step": 9490 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.0023643136955797672, |
|
"learning_rate": 4.533984533984534e-06, |
|
"loss": 0.0003, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.0017869413131847978, |
|
"learning_rate": 4.517704517704518e-06, |
|
"loss": 0.0003, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 3.87, |
|
"grad_norm": 0.0022810434456914663, |
|
"learning_rate": 4.501424501424502e-06, |
|
"loss": 0.0002, |
|
"step": 9520 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.0020936301443725824, |
|
"learning_rate": 4.485144485144485e-06, |
|
"loss": 0.0003, |
|
"step": 9530 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.0017164949094876647, |
|
"learning_rate": 4.468864468864469e-06, |
|
"loss": 0.0147, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.008885451592504978, |
|
"learning_rate": 4.452584452584453e-06, |
|
"loss": 0.0003, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.20433610677719116, |
|
"learning_rate": 4.436304436304437e-06, |
|
"loss": 0.0004, |
|
"step": 9560 |
|
}, |
|
{ |
|
"epoch": 3.89, |
|
"grad_norm": 0.0018083051545545459, |
|
"learning_rate": 4.42002442002442e-06, |
|
"loss": 0.0002, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.00233688997104764, |
|
"learning_rate": 4.403744403744404e-06, |
|
"loss": 0.0002, |
|
"step": 9580 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.0020819292403757572, |
|
"learning_rate": 4.387464387464388e-06, |
|
"loss": 0.0012, |
|
"step": 9590 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.0069807544350624084, |
|
"learning_rate": 4.371184371184372e-06, |
|
"loss": 0.0003, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 0.0027952860109508038, |
|
"learning_rate": 4.354904354904355e-06, |
|
"loss": 0.0002, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.0018937455024570227, |
|
"learning_rate": 4.338624338624339e-06, |
|
"loss": 0.0272, |
|
"step": 9620 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.001811556052416563, |
|
"learning_rate": 4.322344322344323e-06, |
|
"loss": 0.0002, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 0.0017631722148507833, |
|
"learning_rate": 4.306064306064307e-06, |
|
"loss": 0.0002, |
|
"step": 9640 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.001867889310233295, |
|
"learning_rate": 4.28978428978429e-06, |
|
"loss": 0.0197, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 3.93, |
|
"grad_norm": 0.0020562438294291496, |
|
"learning_rate": 4.273504273504274e-06, |
|
"loss": 0.0002, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.007918364368379116, |
|
"learning_rate": 4.257224257224258e-06, |
|
"loss": 0.0003, |
|
"step": 9670 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.0026931529864668846, |
|
"learning_rate": 4.240944240944242e-06, |
|
"loss": 0.0003, |
|
"step": 9680 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 0.002624350832775235, |
|
"learning_rate": 4.224664224664225e-06, |
|
"loss": 0.0003, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 0.001771993818692863, |
|
"learning_rate": 4.208384208384209e-06, |
|
"loss": 0.0002, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 3.95, |
|
"grad_norm": 0.010523835197091103, |
|
"learning_rate": 4.192104192104192e-06, |
|
"loss": 0.0003, |
|
"step": 9710 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.0034396941773593426, |
|
"learning_rate": 4.175824175824177e-06, |
|
"loss": 0.0003, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.003138788277283311, |
|
"learning_rate": 4.15954415954416e-06, |
|
"loss": 0.0058, |
|
"step": 9730 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 0.002142369979992509, |
|
"learning_rate": 4.143264143264144e-06, |
|
"loss": 0.0002, |
|
"step": 9740 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 0.006518381182104349, |
|
"learning_rate": 4.126984126984127e-06, |
|
"loss": 0.0002, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 0.0019359017023816705, |
|
"learning_rate": 4.1107041107041116e-06, |
|
"loss": 0.0002, |
|
"step": 9760 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.0018001939170062542, |
|
"learning_rate": 4.094424094424095e-06, |
|
"loss": 0.0002, |
|
"step": 9770 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.002167722210288048, |
|
"learning_rate": 4.078144078144079e-06, |
|
"loss": 0.0002, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 3.98, |
|
"grad_norm": 0.008154891431331635, |
|
"learning_rate": 4.061864061864062e-06, |
|
"loss": 0.0002, |
|
"step": 9790 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.001978978980332613, |
|
"learning_rate": 4.0455840455840465e-06, |
|
"loss": 0.0002, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 0.0018466059118509293, |
|
"learning_rate": 4.0293040293040296e-06, |
|
"loss": 0.0002, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.00179979985114187, |
|
"learning_rate": 4.0130240130240135e-06, |
|
"loss": 0.0002, |
|
"step": 9820 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.002002492779865861, |
|
"learning_rate": 3.996743996743997e-06, |
|
"loss": 0.0002, |
|
"step": 9830 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.0019970801658928394, |
|
"learning_rate": 3.9804639804639805e-06, |
|
"loss": 0.0002, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.0017706368817016482, |
|
"learning_rate": 3.9641839641839645e-06, |
|
"loss": 0.0002, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 0.0017488128505647182, |
|
"learning_rate": 3.9479039479039484e-06, |
|
"loss": 0.0003, |
|
"step": 9860 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 0.0025758370757102966, |
|
"learning_rate": 3.9316239316239315e-06, |
|
"loss": 0.0002, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 0.002105166669934988, |
|
"learning_rate": 3.9153439153439155e-06, |
|
"loss": 0.0002, |
|
"step": 9880 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.0027692352887243032, |
|
"learning_rate": 3.899063899063899e-06, |
|
"loss": 0.0043, |
|
"step": 9890 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.0020704329945147038, |
|
"learning_rate": 3.882783882783883e-06, |
|
"loss": 0.0002, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.0019208292942494154, |
|
"learning_rate": 3.8665038665038664e-06, |
|
"loss": 0.0002, |
|
"step": 9910 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.0017399511998519301, |
|
"learning_rate": 3.85022385022385e-06, |
|
"loss": 0.0002, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.0017688291845843196, |
|
"learning_rate": 3.833943833943834e-06, |
|
"loss": 0.0002, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 4.471590995788574, |
|
"learning_rate": 3.817663817663818e-06, |
|
"loss": 0.0023, |
|
"step": 9940 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 0.0016602250980213284, |
|
"learning_rate": 3.8013838013838018e-06, |
|
"loss": 0.0002, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 4.05, |
|
"grad_norm": 0.001645643264055252, |
|
"learning_rate": 3.7851037851037853e-06, |
|
"loss": 0.0002, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 0.0017087948508560658, |
|
"learning_rate": 3.768823768823769e-06, |
|
"loss": 0.0002, |
|
"step": 9970 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 0.002038088161498308, |
|
"learning_rate": 3.752543752543753e-06, |
|
"loss": 0.0002, |
|
"step": 9980 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.0071817911230027676, |
|
"learning_rate": 3.7362637362637367e-06, |
|
"loss": 0.0002, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.0021325184497982264, |
|
"learning_rate": 3.7199837199837202e-06, |
|
"loss": 0.0002, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.07, |
|
"grad_norm": 0.001710103009827435, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 0.0002, |
|
"step": 10010 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.0015926583437249064, |
|
"learning_rate": 3.687423687423688e-06, |
|
"loss": 0.0003, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.0016407363582402468, |
|
"learning_rate": 3.6711436711436716e-06, |
|
"loss": 0.0002, |
|
"step": 10030 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.005499332211911678, |
|
"learning_rate": 3.654863654863655e-06, |
|
"loss": 0.0002, |
|
"step": 10040 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.0018358811503276229, |
|
"learning_rate": 3.6385836385836387e-06, |
|
"loss": 0.0002, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 0.0016708581242710352, |
|
"learning_rate": 3.622303622303623e-06, |
|
"loss": 0.0002, |
|
"step": 10060 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.0017990090418606997, |
|
"learning_rate": 3.6060236060236065e-06, |
|
"loss": 0.0002, |
|
"step": 10070 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 0.0018943555187433958, |
|
"learning_rate": 3.58974358974359e-06, |
|
"loss": 0.0002, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 0.0016270867781713605, |
|
"learning_rate": 3.5734635734635736e-06, |
|
"loss": 0.0002, |
|
"step": 10090 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 0.009296965785324574, |
|
"learning_rate": 3.557183557183557e-06, |
|
"loss": 0.0002, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 4.11, |
|
"grad_norm": 0.0016765177715569735, |
|
"learning_rate": 3.5409035409035415e-06, |
|
"loss": 0.0002, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.004676634445786476, |
|
"learning_rate": 3.524623524623525e-06, |
|
"loss": 0.0002, |
|
"step": 10120 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.001882671844214201, |
|
"learning_rate": 3.5083435083435085e-06, |
|
"loss": 0.0011, |
|
"step": 10130 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 0.0016701704589650035, |
|
"learning_rate": 3.492063492063492e-06, |
|
"loss": 0.0002, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 0.0018036847468465567, |
|
"learning_rate": 3.4757834757834764e-06, |
|
"loss": 0.0002, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.0019449255196377635, |
|
"learning_rate": 3.45950345950346e-06, |
|
"loss": 0.0002, |
|
"step": 10160 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.0023109372705221176, |
|
"learning_rate": 3.4432234432234434e-06, |
|
"loss": 0.0002, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 4.14, |
|
"grad_norm": 0.001794449402950704, |
|
"learning_rate": 3.426943426943427e-06, |
|
"loss": 0.0002, |
|
"step": 10180 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.0016366175841540098, |
|
"learning_rate": 3.410663410663411e-06, |
|
"loss": 0.0002, |
|
"step": 10190 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 0.0022932947613298893, |
|
"learning_rate": 3.394383394383395e-06, |
|
"loss": 0.0002, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.003153660800307989, |
|
"learning_rate": 3.3781033781033783e-06, |
|
"loss": 0.0002, |
|
"step": 10210 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.0018573219422250986, |
|
"learning_rate": 3.361823361823362e-06, |
|
"loss": 0.0002, |
|
"step": 10220 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 0.0016019688919186592, |
|
"learning_rate": 3.345543345543346e-06, |
|
"loss": 0.0002, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.0016897093737497926, |
|
"learning_rate": 3.3292633292633297e-06, |
|
"loss": 0.0002, |
|
"step": 10240 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.0018914591055363417, |
|
"learning_rate": 3.3129833129833133e-06, |
|
"loss": 0.0003, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 0.0018889600178226829, |
|
"learning_rate": 3.2967032967032968e-06, |
|
"loss": 0.0002, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 0.00160633132327348, |
|
"learning_rate": 3.2804232804232807e-06, |
|
"loss": 0.0002, |
|
"step": 10270 |
|
}, |
|
{ |
|
"epoch": 4.18, |
|
"grad_norm": 0.00516732269898057, |
|
"learning_rate": 3.2641432641432647e-06, |
|
"loss": 0.0002, |
|
"step": 10280 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.0015665347455069423, |
|
"learning_rate": 3.247863247863248e-06, |
|
"loss": 0.0002, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 0.0016588406870141625, |
|
"learning_rate": 3.2315832315832317e-06, |
|
"loss": 0.0002, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.00242376746609807, |
|
"learning_rate": 3.2153032153032156e-06, |
|
"loss": 0.0002, |
|
"step": 10310 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.0070383488200604916, |
|
"learning_rate": 3.199023199023199e-06, |
|
"loss": 0.0002, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.0019135623006150126, |
|
"learning_rate": 3.182743182743183e-06, |
|
"loss": 0.0002, |
|
"step": 10330 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 0.0018966845236718655, |
|
"learning_rate": 3.1664631664631666e-06, |
|
"loss": 0.0002, |
|
"step": 10340 |
|
}, |
|
{ |
|
"epoch": 4.21, |
|
"grad_norm": 0.0014899246161803603, |
|
"learning_rate": 3.1501831501831505e-06, |
|
"loss": 0.0002, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.001564052072353661, |
|
"learning_rate": 3.133903133903134e-06, |
|
"loss": 0.0002, |
|
"step": 10360 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.001840132987126708, |
|
"learning_rate": 3.117623117623118e-06, |
|
"loss": 0.0002, |
|
"step": 10370 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 0.0020550840999931097, |
|
"learning_rate": 3.1013431013431015e-06, |
|
"loss": 0.0002, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 0.0018264094833284616, |
|
"learning_rate": 3.0850630850630855e-06, |
|
"loss": 0.0002, |
|
"step": 10390 |
|
}, |
|
{ |
|
"epoch": 4.23, |
|
"grad_norm": 0.001516546355560422, |
|
"learning_rate": 3.068783068783069e-06, |
|
"loss": 0.0002, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.0016487749526277184, |
|
"learning_rate": 3.052503052503053e-06, |
|
"loss": 0.0002, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 0.0016116101760417223, |
|
"learning_rate": 3.0362230362230364e-06, |
|
"loss": 0.0002, |
|
"step": 10420 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.001680860761553049, |
|
"learning_rate": 3.0199430199430204e-06, |
|
"loss": 0.0002, |
|
"step": 10430 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.002029112773016095, |
|
"learning_rate": 3.003663003663004e-06, |
|
"loss": 0.0002, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.002056869911029935, |
|
"learning_rate": 2.9873829873829874e-06, |
|
"loss": 0.0002, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.0016365089686587453, |
|
"learning_rate": 2.9711029711029714e-06, |
|
"loss": 0.0017, |
|
"step": 10460 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 0.001570598571561277, |
|
"learning_rate": 2.9548229548229553e-06, |
|
"loss": 0.0002, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 0.0019338660640642047, |
|
"learning_rate": 2.938542938542939e-06, |
|
"loss": 0.0002, |
|
"step": 10480 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 0.001604044926352799, |
|
"learning_rate": 2.9222629222629223e-06, |
|
"loss": 0.0002, |
|
"step": 10490 |
|
}, |
|
{ |
|
"epoch": 4.27, |
|
"grad_norm": 0.0015405503800138831, |
|
"learning_rate": 2.9059829059829063e-06, |
|
"loss": 0.0003, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.001597168273292482, |
|
"learning_rate": 2.8897028897028902e-06, |
|
"loss": 0.0002, |
|
"step": 10510 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.001601763884536922, |
|
"learning_rate": 2.8734228734228737e-06, |
|
"loss": 0.0002, |
|
"step": 10520 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 0.0014684420311823487, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 0.0002, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 0.0019548002164810896, |
|
"learning_rate": 2.840862840862841e-06, |
|
"loss": 0.0002, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 0.0019341334700584412, |
|
"learning_rate": 2.824582824582825e-06, |
|
"loss": 0.0002, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.0015359672252088785, |
|
"learning_rate": 2.8083028083028087e-06, |
|
"loss": 0.0002, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 4.3, |
|
"grad_norm": 0.001660957932472229, |
|
"learning_rate": 2.792022792022792e-06, |
|
"loss": 0.0002, |
|
"step": 10570 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.002642634091898799, |
|
"learning_rate": 2.7757427757427757e-06, |
|
"loss": 0.0002, |
|
"step": 10580 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.001577245187945664, |
|
"learning_rate": 2.75946275946276e-06, |
|
"loss": 0.0002, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.0016623500268906355, |
|
"learning_rate": 2.7431827431827436e-06, |
|
"loss": 0.0274, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 0.001559157157316804, |
|
"learning_rate": 2.726902726902727e-06, |
|
"loss": 0.0002, |
|
"step": 10610 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 0.0015379212563857436, |
|
"learning_rate": 2.7106227106227106e-06, |
|
"loss": 0.0002, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.001682962873019278, |
|
"learning_rate": 2.694342694342695e-06, |
|
"loss": 0.0002, |
|
"step": 10630 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.0015784628922119737, |
|
"learning_rate": 2.6780626780626785e-06, |
|
"loss": 0.0002, |
|
"step": 10640 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 0.0015349604655057192, |
|
"learning_rate": 2.661782661782662e-06, |
|
"loss": 0.0002, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 0.0015412438660860062, |
|
"learning_rate": 2.6455026455026455e-06, |
|
"loss": 0.0002, |
|
"step": 10660 |
|
}, |
|
{ |
|
"epoch": 4.34, |
|
"grad_norm": 0.0016461275517940521, |
|
"learning_rate": 2.629222629222629e-06, |
|
"loss": 0.0002, |
|
"step": 10670 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.0016684934962540865, |
|
"learning_rate": 2.6129426129426134e-06, |
|
"loss": 0.0002, |
|
"step": 10680 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.0015019102720543742, |
|
"learning_rate": 2.596662596662597e-06, |
|
"loss": 0.0002, |
|
"step": 10690 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 0.0015912950038909912, |
|
"learning_rate": 2.5803825803825804e-06, |
|
"loss": 0.0002, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.002051288727670908, |
|
"learning_rate": 2.564102564102564e-06, |
|
"loss": 0.0002, |
|
"step": 10710 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.0014287488302215934, |
|
"learning_rate": 2.5478225478225483e-06, |
|
"loss": 0.0002, |
|
"step": 10720 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.0014953837962821126, |
|
"learning_rate": 2.531542531542532e-06, |
|
"loss": 0.0002, |
|
"step": 10730 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.0016842116601765156, |
|
"learning_rate": 2.5152625152625154e-06, |
|
"loss": 0.0002, |
|
"step": 10740 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.0016165722627192736, |
|
"learning_rate": 2.4989824989824993e-06, |
|
"loss": 0.0004, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.0016578533686697483, |
|
"learning_rate": 2.482702482702483e-06, |
|
"loss": 0.0002, |
|
"step": 10760 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 0.001627171179279685, |
|
"learning_rate": 2.4664224664224668e-06, |
|
"loss": 0.0002, |
|
"step": 10770 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 0.0029889908619225025, |
|
"learning_rate": 2.4501424501424503e-06, |
|
"loss": 0.0002, |
|
"step": 10780 |
|
}, |
|
{ |
|
"epoch": 4.39, |
|
"grad_norm": 0.0015365415019914508, |
|
"learning_rate": 2.433862433862434e-06, |
|
"loss": 0.0002, |
|
"step": 10790 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.0019263201393187046, |
|
"learning_rate": 2.4175824175824177e-06, |
|
"loss": 0.0002, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.001516710501164198, |
|
"learning_rate": 2.4013024013024013e-06, |
|
"loss": 0.0002, |
|
"step": 10810 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.001614395878277719, |
|
"learning_rate": 2.385022385022385e-06, |
|
"loss": 0.0002, |
|
"step": 10820 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.0014490768080577254, |
|
"learning_rate": 2.3687423687423687e-06, |
|
"loss": 0.0004, |
|
"step": 10830 |
|
}, |
|
{ |
|
"epoch": 4.41, |
|
"grad_norm": 0.0015428679762408137, |
|
"learning_rate": 2.3524623524623527e-06, |
|
"loss": 0.0002, |
|
"step": 10840 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.0015440605347976089, |
|
"learning_rate": 2.336182336182336e-06, |
|
"loss": 0.0004, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.00148781796451658, |
|
"learning_rate": 2.31990231990232e-06, |
|
"loss": 0.0002, |
|
"step": 10860 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 0.0015348844463005662, |
|
"learning_rate": 2.3036223036223036e-06, |
|
"loss": 0.0002, |
|
"step": 10870 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 0.001880201743915677, |
|
"learning_rate": 2.2873422873422876e-06, |
|
"loss": 0.0002, |
|
"step": 10880 |
|
}, |
|
{ |
|
"epoch": 4.43, |
|
"grad_norm": 0.001558057265356183, |
|
"learning_rate": 2.271062271062271e-06, |
|
"loss": 0.0002, |
|
"step": 10890 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.010920335538685322, |
|
"learning_rate": 2.254782254782255e-06, |
|
"loss": 0.0002, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.0014644470065832138, |
|
"learning_rate": 2.2385022385022386e-06, |
|
"loss": 0.0002, |
|
"step": 10910 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 0.0014618238201364875, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.0002, |
|
"step": 10920 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 0.0016169185983017087, |
|
"learning_rate": 2.205942205942206e-06, |
|
"loss": 0.0002, |
|
"step": 10930 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 0.0014386329567059875, |
|
"learning_rate": 2.1896621896621895e-06, |
|
"loss": 0.0002, |
|
"step": 10940 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.0015079034492373466, |
|
"learning_rate": 2.1733821733821735e-06, |
|
"loss": 0.0002, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.00197400595061481, |
|
"learning_rate": 2.157102157102157e-06, |
|
"loss": 0.0002, |
|
"step": 10960 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 0.001524322316981852, |
|
"learning_rate": 2.140822140822141e-06, |
|
"loss": 0.0311, |
|
"step": 10970 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.0014644163893535733, |
|
"learning_rate": 2.1245421245421245e-06, |
|
"loss": 0.0002, |
|
"step": 10980 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.0014774493174627423, |
|
"learning_rate": 2.1082621082621084e-06, |
|
"loss": 0.0002, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 0.0014835140900686383, |
|
"learning_rate": 2.091982091982092e-06, |
|
"loss": 0.0002, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 0.001458540791645646, |
|
"learning_rate": 2.075702075702076e-06, |
|
"loss": 0.0002, |
|
"step": 11010 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.002432051347568631, |
|
"learning_rate": 2.05942205942206e-06, |
|
"loss": 0.0002, |
|
"step": 11020 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.001562487450428307, |
|
"learning_rate": 2.0431420431420433e-06, |
|
"loss": 0.0353, |
|
"step": 11030 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 0.0016052748542279005, |
|
"learning_rate": 2.0268620268620273e-06, |
|
"loss": 0.0002, |
|
"step": 11040 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.001545790466479957, |
|
"learning_rate": 2.0105820105820108e-06, |
|
"loss": 0.0002, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 4.5, |
|
"grad_norm": 0.001812846981920302, |
|
"learning_rate": 1.9943019943019947e-06, |
|
"loss": 0.0002, |
|
"step": 11060 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.0017415074398741126, |
|
"learning_rate": 1.9780219780219782e-06, |
|
"loss": 0.0002, |
|
"step": 11070 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.0016338001005351543, |
|
"learning_rate": 1.961741961741962e-06, |
|
"loss": 0.0002, |
|
"step": 11080 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 0.0014169925125315785, |
|
"learning_rate": 1.9454619454619457e-06, |
|
"loss": 0.0006, |
|
"step": 11090 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.0016671591438353062, |
|
"learning_rate": 1.9291819291819296e-06, |
|
"loss": 0.0002, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.0033444638829678297, |
|
"learning_rate": 1.912901912901913e-06, |
|
"loss": 0.0002, |
|
"step": 11110 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.0015689071733504534, |
|
"learning_rate": 1.8966218966218969e-06, |
|
"loss": 0.0002, |
|
"step": 11120 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.0018193925498053432, |
|
"learning_rate": 1.8803418803418804e-06, |
|
"loss": 0.0002, |
|
"step": 11130 |
|
}, |
|
{ |
|
"epoch": 4.53, |
|
"grad_norm": 0.0015975474379956722, |
|
"learning_rate": 1.8640618640618643e-06, |
|
"loss": 0.0002, |
|
"step": 11140 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.0015228153206408024, |
|
"learning_rate": 1.8477818477818479e-06, |
|
"loss": 0.0002, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 0.0017481072572991252, |
|
"learning_rate": 1.8315018315018316e-06, |
|
"loss": 0.0002, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 0.0014254804700613022, |
|
"learning_rate": 1.8152218152218153e-06, |
|
"loss": 0.0002, |
|
"step": 11170 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 0.0014639191795140505, |
|
"learning_rate": 1.798941798941799e-06, |
|
"loss": 0.0002, |
|
"step": 11180 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 0.0014739630278199911, |
|
"learning_rate": 1.7826617826617828e-06, |
|
"loss": 0.0002, |
|
"step": 11190 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 0.001486291061155498, |
|
"learning_rate": 1.7663817663817665e-06, |
|
"loss": 0.0002, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 0.0021130377426743507, |
|
"learning_rate": 1.7501017501017502e-06, |
|
"loss": 0.0002, |
|
"step": 11210 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.0014680501772090793, |
|
"learning_rate": 1.733821733821734e-06, |
|
"loss": 0.0002, |
|
"step": 11220 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.0018635701853781939, |
|
"learning_rate": 1.7175417175417177e-06, |
|
"loss": 0.0002, |
|
"step": 11230 |
|
}, |
|
{ |
|
"epoch": 4.57, |
|
"grad_norm": 0.0015968162333592772, |
|
"learning_rate": 1.7012617012617014e-06, |
|
"loss": 0.0094, |
|
"step": 11240 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.0017093609785661101, |
|
"learning_rate": 1.6849816849816852e-06, |
|
"loss": 0.0002, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 0.0014892058679834008, |
|
"learning_rate": 1.6687016687016689e-06, |
|
"loss": 0.0139, |
|
"step": 11260 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 0.0014219109434634447, |
|
"learning_rate": 1.6524216524216524e-06, |
|
"loss": 0.0002, |
|
"step": 11270 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 0.004563894122838974, |
|
"learning_rate": 1.6361416361416363e-06, |
|
"loss": 0.0002, |
|
"step": 11280 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.0014352177968248725, |
|
"learning_rate": 1.6198616198616199e-06, |
|
"loss": 0.0002, |
|
"step": 11290 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.001390959369018674, |
|
"learning_rate": 1.6035816035816038e-06, |
|
"loss": 0.0002, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.0038959532976150513, |
|
"learning_rate": 1.5873015873015873e-06, |
|
"loss": 0.0002, |
|
"step": 11310 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 0.001589680789038539, |
|
"learning_rate": 1.5710215710215713e-06, |
|
"loss": 0.0002, |
|
"step": 11320 |
|
}, |
|
{ |
|
"epoch": 4.61, |
|
"grad_norm": 0.001737966202199459, |
|
"learning_rate": 1.5547415547415548e-06, |
|
"loss": 0.0002, |
|
"step": 11330 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.0014157581608742476, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 0.0002, |
|
"step": 11340 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.0018974760314449668, |
|
"learning_rate": 1.5221815221815222e-06, |
|
"loss": 0.0002, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 4.62, |
|
"grad_norm": 0.0015809000469744205, |
|
"learning_rate": 1.5059015059015062e-06, |
|
"loss": 0.0002, |
|
"step": 11360 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.01224368717521429, |
|
"learning_rate": 1.4896214896214897e-06, |
|
"loss": 0.0002, |
|
"step": 11370 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 0.0015656572068110108, |
|
"learning_rate": 1.4733414733414736e-06, |
|
"loss": 0.0002, |
|
"step": 11380 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.0015062758466228843, |
|
"learning_rate": 1.4570614570614572e-06, |
|
"loss": 0.0002, |
|
"step": 11390 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.001575302449055016, |
|
"learning_rate": 1.4407814407814407e-06, |
|
"loss": 0.0002, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.0014867339050397277, |
|
"learning_rate": 1.4245014245014246e-06, |
|
"loss": 0.0002, |
|
"step": 11410 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.0014463013503700495, |
|
"learning_rate": 1.4082214082214083e-06, |
|
"loss": 0.0002, |
|
"step": 11420 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 0.0014738457975909114, |
|
"learning_rate": 1.391941391941392e-06, |
|
"loss": 0.0002, |
|
"step": 11430 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 0.0033326647244393826, |
|
"learning_rate": 1.3756613756613758e-06, |
|
"loss": 0.0002, |
|
"step": 11440 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 0.0014288641978055239, |
|
"learning_rate": 1.3593813593813595e-06, |
|
"loss": 0.0002, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 4.66, |
|
"grad_norm": 0.0015226053074002266, |
|
"learning_rate": 1.3431013431013433e-06, |
|
"loss": 0.0002, |
|
"step": 11460 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.0014885494019836187, |
|
"learning_rate": 1.326821326821327e-06, |
|
"loss": 0.0002, |
|
"step": 11470 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 0.0014367675175890326, |
|
"learning_rate": 1.3105413105413107e-06, |
|
"loss": 0.0002, |
|
"step": 11480 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.0014275162247940898, |
|
"learning_rate": 1.2942612942612944e-06, |
|
"loss": 0.0002, |
|
"step": 11490 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.0014797335024923086, |
|
"learning_rate": 1.2779812779812782e-06, |
|
"loss": 0.0002, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.0014295239234343171, |
|
"learning_rate": 1.2617012617012617e-06, |
|
"loss": 0.0002, |
|
"step": 11510 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.0014915807405486703, |
|
"learning_rate": 1.2454212454212456e-06, |
|
"loss": 0.0002, |
|
"step": 11520 |
|
}, |
|
{ |
|
"epoch": 4.69, |
|
"grad_norm": 0.0016227615997195244, |
|
"learning_rate": 1.2291412291412294e-06, |
|
"loss": 0.0002, |
|
"step": 11530 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.0014580420684069395, |
|
"learning_rate": 1.212861212861213e-06, |
|
"loss": 0.0002, |
|
"step": 11540 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.0017063523409888148, |
|
"learning_rate": 1.1965811965811968e-06, |
|
"loss": 0.0002, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 0.0014365671668201685, |
|
"learning_rate": 1.1803011803011806e-06, |
|
"loss": 0.0002, |
|
"step": 11560 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 1.2260816097259521, |
|
"learning_rate": 1.164021164021164e-06, |
|
"loss": 0.006, |
|
"step": 11570 |
|
}, |
|
{ |
|
"epoch": 4.71, |
|
"grad_norm": 0.0018874687375500798, |
|
"learning_rate": 1.1477411477411478e-06, |
|
"loss": 0.0002, |
|
"step": 11580 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.0013750126818194985, |
|
"learning_rate": 1.1314611314611315e-06, |
|
"loss": 0.0002, |
|
"step": 11590 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.001419686945155263, |
|
"learning_rate": 1.1151811151811153e-06, |
|
"loss": 0.0002, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.0014118729159235954, |
|
"learning_rate": 1.098901098901099e-06, |
|
"loss": 0.0002, |
|
"step": 11610 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.0014139912091195583, |
|
"learning_rate": 1.0826210826210827e-06, |
|
"loss": 0.0002, |
|
"step": 11620 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.0015478282002732158, |
|
"learning_rate": 1.0663410663410665e-06, |
|
"loss": 0.0002, |
|
"step": 11630 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.0015366330044344068, |
|
"learning_rate": 1.0500610500610502e-06, |
|
"loss": 0.0002, |
|
"step": 11640 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 0.001490729977376759, |
|
"learning_rate": 1.033781033781034e-06, |
|
"loss": 0.0002, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.001423732377588749, |
|
"learning_rate": 1.0175010175010176e-06, |
|
"loss": 0.0002, |
|
"step": 11660 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.0014315071748569608, |
|
"learning_rate": 1.0012210012210014e-06, |
|
"loss": 0.0002, |
|
"step": 11670 |
|
}, |
|
{ |
|
"epoch": 4.75, |
|
"grad_norm": 0.001486779423430562, |
|
"learning_rate": 9.84940984940985e-07, |
|
"loss": 0.0002, |
|
"step": 11680 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.004522906616330147, |
|
"learning_rate": 9.686609686609686e-07, |
|
"loss": 0.0002, |
|
"step": 11690 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 0.0014580250717699528, |
|
"learning_rate": 9.523809523809525e-07, |
|
"loss": 0.0002, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.0015162237687036395, |
|
"learning_rate": 9.361009361009362e-07, |
|
"loss": 0.0092, |
|
"step": 11710 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.0013567224377766252, |
|
"learning_rate": 9.198209198209199e-07, |
|
"loss": 0.0002, |
|
"step": 11720 |
|
}, |
|
{ |
|
"epoch": 4.77, |
|
"grad_norm": 0.0014696550788357854, |
|
"learning_rate": 9.035409035409036e-07, |
|
"loss": 0.0002, |
|
"step": 11730 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.0014795665629208088, |
|
"learning_rate": 8.872608872608874e-07, |
|
"loss": 0.0002, |
|
"step": 11740 |
|
}, |
|
{ |
|
"epoch": 4.78, |
|
"grad_norm": 0.0015325862914323807, |
|
"learning_rate": 8.709808709808711e-07, |
|
"loss": 0.0002, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.0014404217945411801, |
|
"learning_rate": 8.547008547008548e-07, |
|
"loss": 0.0002, |
|
"step": 11760 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.0019404751947149634, |
|
"learning_rate": 8.384208384208386e-07, |
|
"loss": 0.0002, |
|
"step": 11770 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 0.0016487601678818464, |
|
"learning_rate": 8.221408221408223e-07, |
|
"loss": 0.0002, |
|
"step": 11780 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.0020900049712508917, |
|
"learning_rate": 8.05860805860806e-07, |
|
"loss": 0.0002, |
|
"step": 11790 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.044903818517923355, |
|
"learning_rate": 7.895807895807897e-07, |
|
"loss": 0.0002, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.006237754598259926, |
|
"learning_rate": 7.733007733007733e-07, |
|
"loss": 0.0002, |
|
"step": 11810 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.001496842596679926, |
|
"learning_rate": 7.57020757020757e-07, |
|
"loss": 0.0002, |
|
"step": 11820 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 0.0014312907587736845, |
|
"learning_rate": 7.407407407407407e-07, |
|
"loss": 0.0002, |
|
"step": 11830 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 0.0020331472624093294, |
|
"learning_rate": 7.244607244607245e-07, |
|
"loss": 0.0002, |
|
"step": 11840 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 0.0015430136118084192, |
|
"learning_rate": 7.081807081807082e-07, |
|
"loss": 0.0002, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.0014734879368916154, |
|
"learning_rate": 6.919006919006919e-07, |
|
"loss": 0.0002, |
|
"step": 11860 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 0.006134878844022751, |
|
"learning_rate": 6.756206756206756e-07, |
|
"loss": 0.0002, |
|
"step": 11870 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.0013609755551442504, |
|
"learning_rate": 6.593406593406594e-07, |
|
"loss": 0.0002, |
|
"step": 11880 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.002070717280730605, |
|
"learning_rate": 6.430606430606431e-07, |
|
"loss": 0.0002, |
|
"step": 11890 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.0014169508358463645, |
|
"learning_rate": 6.267806267806268e-07, |
|
"loss": 0.0002, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.0014770817942917347, |
|
"learning_rate": 6.105006105006106e-07, |
|
"loss": 0.0002, |
|
"step": 11910 |
|
}, |
|
{ |
|
"epoch": 4.85, |
|
"grad_norm": 0.0014419537037611008, |
|
"learning_rate": 5.942205942205943e-07, |
|
"loss": 0.0002, |
|
"step": 11920 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 0.001446893555112183, |
|
"learning_rate": 5.77940577940578e-07, |
|
"loss": 0.0002, |
|
"step": 11930 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 0.001416919520124793, |
|
"learning_rate": 5.616605616605618e-07, |
|
"loss": 0.0002, |
|
"step": 11940 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 0.0034949700348079205, |
|
"learning_rate": 5.453805453805455e-07, |
|
"loss": 0.0002, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.0014441277598962188, |
|
"learning_rate": 5.291005291005291e-07, |
|
"loss": 0.0002, |
|
"step": 11960 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.0015632550930604339, |
|
"learning_rate": 5.128205128205128e-07, |
|
"loss": 0.0002, |
|
"step": 11970 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.001399176544509828, |
|
"learning_rate": 4.965404965404966e-07, |
|
"loss": 0.0002, |
|
"step": 11980 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.0013975553447380662, |
|
"learning_rate": 4.802604802604803e-07, |
|
"loss": 0.0002, |
|
"step": 11990 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 0.0013712114887312055, |
|
"learning_rate": 4.63980463980464e-07, |
|
"loss": 0.0002, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.001828977488912642, |
|
"learning_rate": 4.4770044770044775e-07, |
|
"loss": 0.0002, |
|
"step": 12010 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.0014294543070718646, |
|
"learning_rate": 4.3142043142043143e-07, |
|
"loss": 0.0002, |
|
"step": 12020 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.0013922780053690076, |
|
"learning_rate": 4.1514041514041516e-07, |
|
"loss": 0.0002, |
|
"step": 12030 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.0016130340518429875, |
|
"learning_rate": 3.988603988603989e-07, |
|
"loss": 0.0002, |
|
"step": 12040 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 0.0013872876297682524, |
|
"learning_rate": 3.825803825803826e-07, |
|
"loss": 0.0002, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 0.0014586036559194326, |
|
"learning_rate": 3.6630036630036635e-07, |
|
"loss": 0.0002, |
|
"step": 12060 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 0.0014334677252918482, |
|
"learning_rate": 3.500203500203501e-07, |
|
"loss": 0.0002, |
|
"step": 12070 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.0014047607546672225, |
|
"learning_rate": 3.3374033374033376e-07, |
|
"loss": 0.0002, |
|
"step": 12080 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.0013850359246134758, |
|
"learning_rate": 3.174603174603175e-07, |
|
"loss": 0.0002, |
|
"step": 12090 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 0.0013912185095250607, |
|
"learning_rate": 3.011803011803012e-07, |
|
"loss": 0.0002, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 0.001442193053662777, |
|
"learning_rate": 2.8490028490028494e-07, |
|
"loss": 0.0002, |
|
"step": 12110 |
|
}, |
|
{ |
|
"epoch": 4.93, |
|
"grad_norm": 0.0014724673237651587, |
|
"learning_rate": 2.6862026862026867e-07, |
|
"loss": 0.0002, |
|
"step": 12120 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 0.0017670753877609968, |
|
"learning_rate": 2.5234025234025235e-07, |
|
"loss": 0.0002, |
|
"step": 12130 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"grad_norm": 0.001458752085454762, |
|
"learning_rate": 2.3606023606023608e-07, |
|
"loss": 0.0002, |
|
"step": 12140 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.0015336443902924657, |
|
"learning_rate": 2.197802197802198e-07, |
|
"loss": 0.0002, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.001413301331922412, |
|
"learning_rate": 2.035002035002035e-07, |
|
"loss": 0.0002, |
|
"step": 12160 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 0.001454474637284875, |
|
"learning_rate": 1.8722018722018724e-07, |
|
"loss": 0.0002, |
|
"step": 12170 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.0014644395560026169, |
|
"learning_rate": 1.7094017094017097e-07, |
|
"loss": 0.0002, |
|
"step": 12180 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.0014874679036438465, |
|
"learning_rate": 1.5466015466015467e-07, |
|
"loss": 0.0002, |
|
"step": 12190 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.0014028714504092932, |
|
"learning_rate": 1.383801383801384e-07, |
|
"loss": 0.0002, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.0014859441434964538, |
|
"learning_rate": 1.221001221001221e-07, |
|
"loss": 0.0002, |
|
"step": 12210 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 0.0014206055784597993, |
|
"learning_rate": 1.0582010582010582e-07, |
|
"loss": 0.0002, |
|
"step": 12220 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 0.0013865531655028462, |
|
"learning_rate": 8.954008954008955e-08, |
|
"loss": 0.0002, |
|
"step": 12230 |
|
}, |
|
{ |
|
"epoch": 4.98, |
|
"grad_norm": 0.0014404187677428126, |
|
"learning_rate": 7.326007326007327e-08, |
|
"loss": 0.0002, |
|
"step": 12240 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.0015573910204693675, |
|
"learning_rate": 5.6980056980056986e-08, |
|
"loss": 0.0003, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.0015043216990306973, |
|
"learning_rate": 4.07000407000407e-08, |
|
"loss": 0.0002, |
|
"step": 12260 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 0.0015565322246402502, |
|
"learning_rate": 2.4420024420024422e-08, |
|
"loss": 0.0002, |
|
"step": 12270 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.003684895345941186, |
|
"learning_rate": 8.14000814000814e-09, |
|
"loss": 0.0002, |
|
"step": 12280 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 12285, |
|
"total_flos": 1.523143801869613e+19, |
|
"train_loss": 0.006059203078136595, |
|
"train_runtime": 4479.7513, |
|
"train_samples_per_second": 43.876, |
|
"train_steps_per_second": 2.742 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 12285, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 1.523143801869613e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|