{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9986282578875172,
"eval_steps": 500,
"global_step": 364,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0027434842249657062,
"grad_norm": 0.2946356534957886,
"learning_rate": 2.0000000000000003e-06,
"loss": 2.0381,
"step": 1
},
{
"epoch": 0.0054869684499314125,
"grad_norm": 0.29844385385513306,
"learning_rate": 4.000000000000001e-06,
"loss": 2.0639,
"step": 2
},
{
"epoch": 0.00823045267489712,
"grad_norm": 0.29449769854545593,
"learning_rate": 6e-06,
"loss": 2.0396,
"step": 3
},
{
"epoch": 0.010973936899862825,
"grad_norm": 0.2835416793823242,
"learning_rate": 8.000000000000001e-06,
"loss": 1.9542,
"step": 4
},
{
"epoch": 0.013717421124828532,
"grad_norm": 0.29806381464004517,
"learning_rate": 1e-05,
"loss": 2.0672,
"step": 5
},
{
"epoch": 0.01646090534979424,
"grad_norm": 0.3270595073699951,
"learning_rate": 1.2e-05,
"loss": 2.0958,
"step": 6
},
{
"epoch": 0.019204389574759947,
"grad_norm": 0.298843652009964,
"learning_rate": 1.4000000000000001e-05,
"loss": 2.0237,
"step": 7
},
{
"epoch": 0.02194787379972565,
"grad_norm": 0.2945656180381775,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.0231,
"step": 8
},
{
"epoch": 0.024691358024691357,
"grad_norm": 0.26927563548088074,
"learning_rate": 1.8e-05,
"loss": 1.9405,
"step": 9
},
{
"epoch": 0.027434842249657063,
"grad_norm": 0.28798505663871765,
"learning_rate": 2e-05,
"loss": 2.0259,
"step": 10
},
{
"epoch": 0.03017832647462277,
"grad_norm": 0.2883073091506958,
"learning_rate": 2.2000000000000003e-05,
"loss": 2.0067,
"step": 11
},
{
"epoch": 0.03292181069958848,
"grad_norm": 0.2727283239364624,
"learning_rate": 2.4e-05,
"loss": 1.9331,
"step": 12
},
{
"epoch": 0.03566529492455418,
"grad_norm": 0.276351660490036,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.9741,
"step": 13
},
{
"epoch": 0.038408779149519894,
"grad_norm": 0.2597358822822571,
"learning_rate": 2.8000000000000003e-05,
"loss": 1.9715,
"step": 14
},
{
"epoch": 0.0411522633744856,
"grad_norm": 0.2467939853668213,
"learning_rate": 3e-05,
"loss": 1.8935,
"step": 15
},
{
"epoch": 0.0438957475994513,
"grad_norm": 0.23715272545814514,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.972,
"step": 16
},
{
"epoch": 0.04663923182441701,
"grad_norm": 0.24344952404499054,
"learning_rate": 3.4000000000000007e-05,
"loss": 1.9286,
"step": 17
},
{
"epoch": 0.04938271604938271,
"grad_norm": 0.25223401188850403,
"learning_rate": 3.6e-05,
"loss": 1.9943,
"step": 18
},
{
"epoch": 0.05212620027434842,
"grad_norm": 0.22262343764305115,
"learning_rate": 3.8e-05,
"loss": 1.8965,
"step": 19
},
{
"epoch": 0.05486968449931413,
"grad_norm": 0.2297024428844452,
"learning_rate": 4e-05,
"loss": 1.8722,
"step": 20
},
{
"epoch": 0.05761316872427984,
"grad_norm": 0.25161924958229065,
"learning_rate": 4.2e-05,
"loss": 1.979,
"step": 21
},
{
"epoch": 0.06035665294924554,
"grad_norm": 0.23105685412883759,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.8368,
"step": 22
},
{
"epoch": 0.06310013717421124,
"grad_norm": 0.2125592976808548,
"learning_rate": 4.600000000000001e-05,
"loss": 1.8045,
"step": 23
},
{
"epoch": 0.06584362139917696,
"grad_norm": 0.23787732422351837,
"learning_rate": 4.8e-05,
"loss": 1.8132,
"step": 24
},
{
"epoch": 0.06858710562414266,
"grad_norm": 0.22976243495941162,
"learning_rate": 5e-05,
"loss": 1.8008,
"step": 25
},
{
"epoch": 0.07133058984910837,
"grad_norm": 0.22189414501190186,
"learning_rate": 5.2000000000000004e-05,
"loss": 1.7567,
"step": 26
},
{
"epoch": 0.07407407407407407,
"grad_norm": 0.22582988440990448,
"learning_rate": 5.4000000000000005e-05,
"loss": 1.7547,
"step": 27
},
{
"epoch": 0.07681755829903979,
"grad_norm": 0.24456851184368134,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.7323,
"step": 28
},
{
"epoch": 0.07956104252400549,
"grad_norm": 0.24938376247882843,
"learning_rate": 5.8e-05,
"loss": 1.7625,
"step": 29
},
{
"epoch": 0.0823045267489712,
"grad_norm": 0.2996648848056793,
"learning_rate": 6e-05,
"loss": 1.7083,
"step": 30
},
{
"epoch": 0.0850480109739369,
"grad_norm": 0.2755124270915985,
"learning_rate": 6.2e-05,
"loss": 1.6479,
"step": 31
},
{
"epoch": 0.0877914951989026,
"grad_norm": 0.27583757042884827,
"learning_rate": 6.400000000000001e-05,
"loss": 1.5984,
"step": 32
},
{
"epoch": 0.09053497942386832,
"grad_norm": 0.3020310699939728,
"learning_rate": 6.6e-05,
"loss": 1.5614,
"step": 33
},
{
"epoch": 0.09327846364883402,
"grad_norm": 0.33891311287879944,
"learning_rate": 6.800000000000001e-05,
"loss": 1.5925,
"step": 34
},
{
"epoch": 0.09602194787379972,
"grad_norm": 0.34853625297546387,
"learning_rate": 7e-05,
"loss": 1.5499,
"step": 35
},
{
"epoch": 0.09876543209876543,
"grad_norm": 0.7007214426994324,
"learning_rate": 7.2e-05,
"loss": 1.5028,
"step": 36
},
{
"epoch": 0.10150891632373114,
"grad_norm": 0.3837442994117737,
"learning_rate": 7.4e-05,
"loss": 1.4699,
"step": 37
},
{
"epoch": 0.10425240054869685,
"grad_norm": 0.3312113881111145,
"learning_rate": 7.6e-05,
"loss": 1.3699,
"step": 38
},
{
"epoch": 0.10699588477366255,
"grad_norm": 0.31788209080696106,
"learning_rate": 7.800000000000001e-05,
"loss": 1.3965,
"step": 39
},
{
"epoch": 0.10973936899862825,
"grad_norm": 0.27148839831352234,
"learning_rate": 8e-05,
"loss": 1.4292,
"step": 40
},
{
"epoch": 0.11248285322359397,
"grad_norm": 0.334274023771286,
"learning_rate": 8.2e-05,
"loss": 1.3351,
"step": 41
},
{
"epoch": 0.11522633744855967,
"grad_norm": 0.295818954706192,
"learning_rate": 8.4e-05,
"loss": 1.389,
"step": 42
},
{
"epoch": 0.11796982167352538,
"grad_norm": 0.20193521678447723,
"learning_rate": 8.6e-05,
"loss": 1.3042,
"step": 43
},
{
"epoch": 0.12071330589849108,
"grad_norm": 0.18113870918750763,
"learning_rate": 8.800000000000001e-05,
"loss": 1.341,
"step": 44
},
{
"epoch": 0.12345679012345678,
"grad_norm": 0.1790560781955719,
"learning_rate": 9e-05,
"loss": 1.2262,
"step": 45
},
{
"epoch": 0.1262002743484225,
"grad_norm": 0.1680901199579239,
"learning_rate": 9.200000000000001e-05,
"loss": 1.3332,
"step": 46
},
{
"epoch": 0.1289437585733882,
"grad_norm": 0.1703910529613495,
"learning_rate": 9.4e-05,
"loss": 1.293,
"step": 47
},
{
"epoch": 0.13168724279835392,
"grad_norm": 0.21085375547409058,
"learning_rate": 9.6e-05,
"loss": 1.2366,
"step": 48
},
{
"epoch": 0.13443072702331962,
"grad_norm": 0.19799071550369263,
"learning_rate": 9.8e-05,
"loss": 1.2745,
"step": 49
},
{
"epoch": 0.13717421124828533,
"grad_norm": 0.18740390241146088,
"learning_rate": 0.0001,
"loss": 1.2922,
"step": 50
},
{
"epoch": 0.13991769547325103,
"grad_norm": 0.18821899592876434,
"learning_rate": 0.00010200000000000001,
"loss": 1.2539,
"step": 51
},
{
"epoch": 0.14266117969821673,
"grad_norm": 0.20197157561779022,
"learning_rate": 0.00010400000000000001,
"loss": 1.2264,
"step": 52
},
{
"epoch": 0.14540466392318244,
"grad_norm": 0.2285614311695099,
"learning_rate": 0.00010600000000000002,
"loss": 1.1831,
"step": 53
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.18926049768924713,
"learning_rate": 0.00010800000000000001,
"loss": 1.2345,
"step": 54
},
{
"epoch": 0.15089163237311384,
"grad_norm": 0.2111654132604599,
"learning_rate": 0.00011000000000000002,
"loss": 1.2619,
"step": 55
},
{
"epoch": 0.15363511659807957,
"grad_norm": 0.20455430448055267,
"learning_rate": 0.00011200000000000001,
"loss": 1.2055,
"step": 56
},
{
"epoch": 0.15637860082304528,
"grad_norm": 0.2112637162208557,
"learning_rate": 0.00011399999999999999,
"loss": 1.2008,
"step": 57
},
{
"epoch": 0.15912208504801098,
"grad_norm": 0.2301080822944641,
"learning_rate": 0.000116,
"loss": 1.1433,
"step": 58
},
{
"epoch": 0.16186556927297668,
"grad_norm": 0.15775448083877563,
"learning_rate": 0.000118,
"loss": 1.1936,
"step": 59
},
{
"epoch": 0.1646090534979424,
"grad_norm": 0.19497638940811157,
"learning_rate": 0.00012,
"loss": 1.2291,
"step": 60
},
{
"epoch": 0.1673525377229081,
"grad_norm": 0.2723619341850281,
"learning_rate": 0.000122,
"loss": 1.2482,
"step": 61
},
{
"epoch": 0.1700960219478738,
"grad_norm": 0.24689513444900513,
"learning_rate": 0.000124,
"loss": 1.1834,
"step": 62
},
{
"epoch": 0.1728395061728395,
"grad_norm": 0.13218045234680176,
"learning_rate": 0.000126,
"loss": 1.1565,
"step": 63
},
{
"epoch": 0.1755829903978052,
"grad_norm": 0.18825608491897583,
"learning_rate": 0.00012800000000000002,
"loss": 1.1397,
"step": 64
},
{
"epoch": 0.17832647462277093,
"grad_norm": 0.23535209894180298,
"learning_rate": 0.00013000000000000002,
"loss": 1.1726,
"step": 65
},
{
"epoch": 0.18106995884773663,
"grad_norm": 0.15770217776298523,
"learning_rate": 0.000132,
"loss": 1.1692,
"step": 66
},
{
"epoch": 0.18381344307270234,
"grad_norm": 0.10571371763944626,
"learning_rate": 0.000134,
"loss": 1.1657,
"step": 67
},
{
"epoch": 0.18655692729766804,
"grad_norm": 0.08727282285690308,
"learning_rate": 0.00013600000000000003,
"loss": 1.1894,
"step": 68
},
{
"epoch": 0.18930041152263374,
"grad_norm": 0.1120406910777092,
"learning_rate": 0.000138,
"loss": 1.1434,
"step": 69
},
{
"epoch": 0.19204389574759945,
"grad_norm": 0.13725778460502625,
"learning_rate": 0.00014,
"loss": 1.1033,
"step": 70
},
{
"epoch": 0.19478737997256515,
"grad_norm": 0.10864784568548203,
"learning_rate": 0.000142,
"loss": 1.116,
"step": 71
},
{
"epoch": 0.19753086419753085,
"grad_norm": 0.09078360348939896,
"learning_rate": 0.000144,
"loss": 1.1561,
"step": 72
},
{
"epoch": 0.20027434842249658,
"grad_norm": 0.07816857099533081,
"learning_rate": 0.000146,
"loss": 1.1537,
"step": 73
},
{
"epoch": 0.2030178326474623,
"grad_norm": 0.08013437688350677,
"learning_rate": 0.000148,
"loss": 1.1615,
"step": 74
},
{
"epoch": 0.205761316872428,
"grad_norm": 0.07821324467658997,
"learning_rate": 0.00015000000000000001,
"loss": 1.1757,
"step": 75
},
{
"epoch": 0.2085048010973937,
"grad_norm": 0.07252127677202225,
"learning_rate": 0.000152,
"loss": 1.1462,
"step": 76
},
{
"epoch": 0.2112482853223594,
"grad_norm": 0.06541033089160919,
"learning_rate": 0.000154,
"loss": 1.1639,
"step": 77
},
{
"epoch": 0.2139917695473251,
"grad_norm": 0.06751246005296707,
"learning_rate": 0.00015600000000000002,
"loss": 1.1323,
"step": 78
},
{
"epoch": 0.2167352537722908,
"grad_norm": 0.07330214232206345,
"learning_rate": 0.00015800000000000002,
"loss": 1.181,
"step": 79
},
{
"epoch": 0.2194787379972565,
"grad_norm": 0.07126874476671219,
"learning_rate": 0.00016,
"loss": 1.1548,
"step": 80
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.07262880355119705,
"learning_rate": 0.000162,
"loss": 1.1685,
"step": 81
},
{
"epoch": 0.22496570644718794,
"grad_norm": 0.0727178081870079,
"learning_rate": 0.000164,
"loss": 1.0893,
"step": 82
},
{
"epoch": 0.22770919067215364,
"grad_norm": 0.07534700632095337,
"learning_rate": 0.000166,
"loss": 1.2199,
"step": 83
},
{
"epoch": 0.23045267489711935,
"grad_norm": 0.07132842391729355,
"learning_rate": 0.000168,
"loss": 1.0465,
"step": 84
},
{
"epoch": 0.23319615912208505,
"grad_norm": 0.08760551363229752,
"learning_rate": 0.00017,
"loss": 1.0927,
"step": 85
},
{
"epoch": 0.23593964334705075,
"grad_norm": 0.0642840787768364,
"learning_rate": 0.000172,
"loss": 1.1566,
"step": 86
},
{
"epoch": 0.23868312757201646,
"grad_norm": 0.072926364839077,
"learning_rate": 0.000174,
"loss": 1.1691,
"step": 87
},
{
"epoch": 0.24142661179698216,
"grad_norm": 0.06403500586748123,
"learning_rate": 0.00017600000000000002,
"loss": 1.2099,
"step": 88
},
{
"epoch": 0.24417009602194786,
"grad_norm": 0.0723823606967926,
"learning_rate": 0.00017800000000000002,
"loss": 1.1168,
"step": 89
},
{
"epoch": 0.24691358024691357,
"grad_norm": 0.0792553573846817,
"learning_rate": 0.00018,
"loss": 1.1433,
"step": 90
},
{
"epoch": 0.2496570644718793,
"grad_norm": 0.07233411818742752,
"learning_rate": 0.000182,
"loss": 1.1189,
"step": 91
},
{
"epoch": 0.252400548696845,
"grad_norm": 0.0691598579287529,
"learning_rate": 0.00018400000000000003,
"loss": 1.2083,
"step": 92
},
{
"epoch": 0.2551440329218107,
"grad_norm": 0.06808219105005264,
"learning_rate": 0.00018600000000000002,
"loss": 1.1325,
"step": 93
},
{
"epoch": 0.2578875171467764,
"grad_norm": 0.0633513554930687,
"learning_rate": 0.000188,
"loss": 1.1104,
"step": 94
},
{
"epoch": 0.2606310013717421,
"grad_norm": 0.06468816101551056,
"learning_rate": 0.00019,
"loss": 1.1761,
"step": 95
},
{
"epoch": 0.26337448559670784,
"grad_norm": 0.0678904801607132,
"learning_rate": 0.000192,
"loss": 1.1516,
"step": 96
},
{
"epoch": 0.2661179698216735,
"grad_norm": 0.06778690963983536,
"learning_rate": 0.000194,
"loss": 1.1589,
"step": 97
},
{
"epoch": 0.26886145404663925,
"grad_norm": 0.06877358257770538,
"learning_rate": 0.000196,
"loss": 1.069,
"step": 98
},
{
"epoch": 0.2716049382716049,
"grad_norm": 0.06891150772571564,
"learning_rate": 0.00019800000000000002,
"loss": 1.1271,
"step": 99
},
{
"epoch": 0.27434842249657065,
"grad_norm": 0.06893782317638397,
"learning_rate": 0.0002,
"loss": 1.1032,
"step": 100
},
{
"epoch": 0.27709190672153633,
"grad_norm": 0.06988958269357681,
"learning_rate": 0.00019924242424242426,
"loss": 1.1083,
"step": 101
},
{
"epoch": 0.27983539094650206,
"grad_norm": 0.070386603474617,
"learning_rate": 0.0001984848484848485,
"loss": 1.1062,
"step": 102
},
{
"epoch": 0.2825788751714678,
"grad_norm": 0.06926562637090683,
"learning_rate": 0.00019772727272727273,
"loss": 1.0914,
"step": 103
},
{
"epoch": 0.28532235939643347,
"grad_norm": 0.06630910187959671,
"learning_rate": 0.00019696969696969698,
"loss": 1.1299,
"step": 104
},
{
"epoch": 0.2880658436213992,
"grad_norm": 0.07249345630407333,
"learning_rate": 0.00019621212121212123,
"loss": 1.0555,
"step": 105
},
{
"epoch": 0.2908093278463649,
"grad_norm": 0.06877996772527695,
"learning_rate": 0.00019545454545454548,
"loss": 1.0914,
"step": 106
},
{
"epoch": 0.2935528120713306,
"grad_norm": 0.0682688057422638,
"learning_rate": 0.0001946969696969697,
"loss": 1.0866,
"step": 107
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.06741555780172348,
"learning_rate": 0.00019393939393939395,
"loss": 1.0891,
"step": 108
},
{
"epoch": 0.299039780521262,
"grad_norm": 0.06763298809528351,
"learning_rate": 0.0001931818181818182,
"loss": 1.1692,
"step": 109
},
{
"epoch": 0.3017832647462277,
"grad_norm": 0.07024376839399338,
"learning_rate": 0.00019242424242424245,
"loss": 1.0626,
"step": 110
},
{
"epoch": 0.3045267489711934,
"grad_norm": 0.06682418286800385,
"learning_rate": 0.00019166666666666667,
"loss": 1.1368,
"step": 111
},
{
"epoch": 0.30727023319615915,
"grad_norm": 0.06636565923690796,
"learning_rate": 0.00019090909090909092,
"loss": 1.1415,
"step": 112
},
{
"epoch": 0.3100137174211248,
"grad_norm": 0.06566283106803894,
"learning_rate": 0.00019015151515151517,
"loss": 1.1443,
"step": 113
},
{
"epoch": 0.31275720164609055,
"grad_norm": 0.07026142627000809,
"learning_rate": 0.00018939393939393942,
"loss": 1.1342,
"step": 114
},
{
"epoch": 0.31550068587105623,
"grad_norm": 0.0713697299361229,
"learning_rate": 0.00018863636363636364,
"loss": 1.1552,
"step": 115
},
{
"epoch": 0.31824417009602196,
"grad_norm": 0.06598669290542603,
"learning_rate": 0.0001878787878787879,
"loss": 1.0659,
"step": 116
},
{
"epoch": 0.32098765432098764,
"grad_norm": 0.06925521045923233,
"learning_rate": 0.00018712121212121212,
"loss": 1.1745,
"step": 117
},
{
"epoch": 0.32373113854595337,
"grad_norm": 0.07137319445610046,
"learning_rate": 0.00018636363636363636,
"loss": 1.2006,
"step": 118
},
{
"epoch": 0.32647462277091904,
"grad_norm": 0.06820110231637955,
"learning_rate": 0.00018560606060606061,
"loss": 1.1067,
"step": 119
},
{
"epoch": 0.3292181069958848,
"grad_norm": 0.07061274349689484,
"learning_rate": 0.00018484848484848484,
"loss": 1.1273,
"step": 120
},
{
"epoch": 0.3319615912208505,
"grad_norm": 0.07401842623949051,
"learning_rate": 0.00018409090909090909,
"loss": 1.0367,
"step": 121
},
{
"epoch": 0.3347050754458162,
"grad_norm": 0.070513054728508,
"learning_rate": 0.00018333333333333334,
"loss": 1.1302,
"step": 122
},
{
"epoch": 0.3374485596707819,
"grad_norm": 0.07722620666027069,
"learning_rate": 0.00018257575757575758,
"loss": 1.0779,
"step": 123
},
{
"epoch": 0.3401920438957476,
"grad_norm": 0.07941732555627823,
"learning_rate": 0.00018181818181818183,
"loss": 1.0864,
"step": 124
},
{
"epoch": 0.3429355281207133,
"grad_norm": 0.07316974550485611,
"learning_rate": 0.00018106060606060606,
"loss": 1.1599,
"step": 125
},
{
"epoch": 0.345679012345679,
"grad_norm": 0.06967978924512863,
"learning_rate": 0.0001803030303030303,
"loss": 1.1056,
"step": 126
},
{
"epoch": 0.3484224965706447,
"grad_norm": 0.08569256216287613,
"learning_rate": 0.00017954545454545456,
"loss": 1.1652,
"step": 127
},
{
"epoch": 0.3511659807956104,
"grad_norm": 0.07549481093883514,
"learning_rate": 0.0001787878787878788,
"loss": 1.1811,
"step": 128
},
{
"epoch": 0.35390946502057613,
"grad_norm": 0.07452043145895004,
"learning_rate": 0.00017803030303030303,
"loss": 1.1641,
"step": 129
},
{
"epoch": 0.35665294924554186,
"grad_norm": 0.07065053284168243,
"learning_rate": 0.00017727272727272728,
"loss": 1.1272,
"step": 130
},
{
"epoch": 0.35939643347050754,
"grad_norm": 0.07598507404327393,
"learning_rate": 0.00017651515151515153,
"loss": 1.0538,
"step": 131
},
{
"epoch": 0.36213991769547327,
"grad_norm": 0.07031415402889252,
"learning_rate": 0.00017575757575757578,
"loss": 1.131,
"step": 132
},
{
"epoch": 0.36488340192043894,
"grad_norm": 0.07048885524272919,
"learning_rate": 0.000175,
"loss": 1.102,
"step": 133
},
{
"epoch": 0.3676268861454047,
"grad_norm": 0.07216944545507431,
"learning_rate": 0.00017424242424242425,
"loss": 1.0883,
"step": 134
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.07806240767240524,
"learning_rate": 0.0001734848484848485,
"loss": 1.0138,
"step": 135
},
{
"epoch": 0.3731138545953361,
"grad_norm": 0.07860879600048065,
"learning_rate": 0.00017272727272727275,
"loss": 1.0954,
"step": 136
},
{
"epoch": 0.37585733882030176,
"grad_norm": 0.07348994165658951,
"learning_rate": 0.00017196969696969697,
"loss": 1.1519,
"step": 137
},
{
"epoch": 0.3786008230452675,
"grad_norm": 0.07253053784370422,
"learning_rate": 0.00017121212121212122,
"loss": 1.1412,
"step": 138
},
{
"epoch": 0.3813443072702332,
"grad_norm": 0.07173314690589905,
"learning_rate": 0.00017045454545454547,
"loss": 1.1001,
"step": 139
},
{
"epoch": 0.3840877914951989,
"grad_norm": 0.07977738231420517,
"learning_rate": 0.00016969696969696972,
"loss": 1.1339,
"step": 140
},
{
"epoch": 0.3868312757201646,
"grad_norm": 0.08174338191747665,
"learning_rate": 0.00016893939393939394,
"loss": 1.0587,
"step": 141
},
{
"epoch": 0.3895747599451303,
"grad_norm": 0.07792508602142334,
"learning_rate": 0.0001681818181818182,
"loss": 1.1188,
"step": 142
},
{
"epoch": 0.39231824417009603,
"grad_norm": 0.07827766239643097,
"learning_rate": 0.00016742424242424244,
"loss": 1.0577,
"step": 143
},
{
"epoch": 0.3950617283950617,
"grad_norm": 0.08250988274812698,
"learning_rate": 0.0001666666666666667,
"loss": 1.1019,
"step": 144
},
{
"epoch": 0.39780521262002744,
"grad_norm": 0.07399312406778336,
"learning_rate": 0.00016590909090909094,
"loss": 1.0642,
"step": 145
},
{
"epoch": 0.40054869684499317,
"grad_norm": 0.07980221509933472,
"learning_rate": 0.00016515151515151516,
"loss": 1.108,
"step": 146
},
{
"epoch": 0.40329218106995884,
"grad_norm": 0.07118561118841171,
"learning_rate": 0.0001643939393939394,
"loss": 1.077,
"step": 147
},
{
"epoch": 0.4060356652949246,
"grad_norm": 0.07912468165159225,
"learning_rate": 0.00016363636363636366,
"loss": 1.0813,
"step": 148
},
{
"epoch": 0.40877914951989025,
"grad_norm": 0.07469073683023453,
"learning_rate": 0.0001628787878787879,
"loss": 1.1335,
"step": 149
},
{
"epoch": 0.411522633744856,
"grad_norm": 0.0729939341545105,
"learning_rate": 0.00016212121212121213,
"loss": 1.0728,
"step": 150
},
{
"epoch": 0.41426611796982166,
"grad_norm": 0.07003669440746307,
"learning_rate": 0.00016136363636363635,
"loss": 1.1808,
"step": 151
},
{
"epoch": 0.4170096021947874,
"grad_norm": 0.08286017924547195,
"learning_rate": 0.0001606060606060606,
"loss": 1.0878,
"step": 152
},
{
"epoch": 0.41975308641975306,
"grad_norm": 0.08155351877212524,
"learning_rate": 0.00015984848484848485,
"loss": 1.1349,
"step": 153
},
{
"epoch": 0.4224965706447188,
"grad_norm": 0.0759543851017952,
"learning_rate": 0.0001590909090909091,
"loss": 1.0588,
"step": 154
},
{
"epoch": 0.4252400548696845,
"grad_norm": 0.078402079641819,
"learning_rate": 0.00015833333333333332,
"loss": 1.0919,
"step": 155
},
{
"epoch": 0.4279835390946502,
"grad_norm": 0.07917957007884979,
"learning_rate": 0.00015757575757575757,
"loss": 1.0976,
"step": 156
},
{
"epoch": 0.43072702331961593,
"grad_norm": 0.0801803320646286,
"learning_rate": 0.00015681818181818182,
"loss": 1.0845,
"step": 157
},
{
"epoch": 0.4334705075445816,
"grad_norm": 0.07363519072532654,
"learning_rate": 0.00015606060606060607,
"loss": 1.1814,
"step": 158
},
{
"epoch": 0.43621399176954734,
"grad_norm": 0.07720773667097092,
"learning_rate": 0.0001553030303030303,
"loss": 1.1242,
"step": 159
},
{
"epoch": 0.438957475994513,
"grad_norm": 0.0772809386253357,
"learning_rate": 0.00015454545454545454,
"loss": 1.1629,
"step": 160
},
{
"epoch": 0.44170096021947874,
"grad_norm": 0.07335783541202545,
"learning_rate": 0.0001537878787878788,
"loss": 1.1399,
"step": 161
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.07078609615564346,
"learning_rate": 0.00015303030303030304,
"loss": 1.0633,
"step": 162
},
{
"epoch": 0.44718792866941015,
"grad_norm": 0.07721443474292755,
"learning_rate": 0.00015227272727272727,
"loss": 1.0968,
"step": 163
},
{
"epoch": 0.4499314128943759,
"grad_norm": 0.07415551692247391,
"learning_rate": 0.00015151515151515152,
"loss": 1.0701,
"step": 164
},
{
"epoch": 0.45267489711934156,
"grad_norm": 0.0827174186706543,
"learning_rate": 0.00015075757575757576,
"loss": 1.1398,
"step": 165
},
{
"epoch": 0.4554183813443073,
"grad_norm": 0.0798204243183136,
"learning_rate": 0.00015000000000000001,
"loss": 1.1372,
"step": 166
},
{
"epoch": 0.45816186556927296,
"grad_norm": 0.07882829010486603,
"learning_rate": 0.00014924242424242426,
"loss": 1.0937,
"step": 167
},
{
"epoch": 0.4609053497942387,
"grad_norm": 0.07198477536439896,
"learning_rate": 0.00014848484848484849,
"loss": 1.1051,
"step": 168
},
{
"epoch": 0.46364883401920437,
"grad_norm": 0.07786595076322556,
"learning_rate": 0.00014772727272727274,
"loss": 1.1006,
"step": 169
},
{
"epoch": 0.4663923182441701,
"grad_norm": 0.07669170945882797,
"learning_rate": 0.00014696969696969698,
"loss": 1.1723,
"step": 170
},
{
"epoch": 0.4691358024691358,
"grad_norm": 0.0792948380112648,
"learning_rate": 0.00014621212121212123,
"loss": 1.1051,
"step": 171
},
{
"epoch": 0.4718792866941015,
"grad_norm": 0.07537594437599182,
"learning_rate": 0.00014545454545454546,
"loss": 1.1015,
"step": 172
},
{
"epoch": 0.47462277091906724,
"grad_norm": 0.08040138334035873,
"learning_rate": 0.0001446969696969697,
"loss": 1.0279,
"step": 173
},
{
"epoch": 0.4773662551440329,
"grad_norm": 0.07825993746519089,
"learning_rate": 0.00014393939393939396,
"loss": 1.1322,
"step": 174
},
{
"epoch": 0.48010973936899864,
"grad_norm": 0.07616106420755386,
"learning_rate": 0.0001431818181818182,
"loss": 1.1015,
"step": 175
},
{
"epoch": 0.4828532235939643,
"grad_norm": 0.07644405961036682,
"learning_rate": 0.00014242424242424243,
"loss": 1.092,
"step": 176
},
{
"epoch": 0.48559670781893005,
"grad_norm": 0.07263129204511642,
"learning_rate": 0.00014166666666666668,
"loss": 1.1056,
"step": 177
},
{
"epoch": 0.4883401920438957,
"grad_norm": 0.08228097856044769,
"learning_rate": 0.00014090909090909093,
"loss": 1.1323,
"step": 178
},
{
"epoch": 0.49108367626886146,
"grad_norm": 0.07709623128175735,
"learning_rate": 0.00014015151515151518,
"loss": 1.1366,
"step": 179
},
{
"epoch": 0.49382716049382713,
"grad_norm": 0.07731063663959503,
"learning_rate": 0.0001393939393939394,
"loss": 1.0704,
"step": 180
},
{
"epoch": 0.49657064471879286,
"grad_norm": 0.0785774365067482,
"learning_rate": 0.00013863636363636365,
"loss": 1.0515,
"step": 181
},
{
"epoch": 0.4993141289437586,
"grad_norm": 0.07938612997531891,
"learning_rate": 0.0001378787878787879,
"loss": 1.0655,
"step": 182
},
{
"epoch": 0.5020576131687243,
"grad_norm": 0.0815865769982338,
"learning_rate": 0.00013712121212121212,
"loss": 1.1327,
"step": 183
},
{
"epoch": 0.50480109739369,
"grad_norm": 0.07875478267669678,
"learning_rate": 0.00013636363636363637,
"loss": 1.1439,
"step": 184
},
{
"epoch": 0.5075445816186557,
"grad_norm": 0.0801415741443634,
"learning_rate": 0.0001356060606060606,
"loss": 1.0911,
"step": 185
},
{
"epoch": 0.5102880658436214,
"grad_norm": 0.07313405722379684,
"learning_rate": 0.00013484848484848484,
"loss": 1.1351,
"step": 186
},
{
"epoch": 0.5130315500685871,
"grad_norm": 0.08034719526767731,
"learning_rate": 0.0001340909090909091,
"loss": 1.0797,
"step": 187
},
{
"epoch": 0.5157750342935528,
"grad_norm": 0.0769338384270668,
"learning_rate": 0.00013333333333333334,
"loss": 1.1155,
"step": 188
},
{
"epoch": 0.5185185185185185,
"grad_norm": 0.07676918804645538,
"learning_rate": 0.00013257575757575756,
"loss": 1.0673,
"step": 189
},
{
"epoch": 0.5212620027434842,
"grad_norm": 0.07683762162923813,
"learning_rate": 0.0001318181818181818,
"loss": 1.1268,
"step": 190
},
{
"epoch": 0.52400548696845,
"grad_norm": 0.07416949421167374,
"learning_rate": 0.00013106060606060606,
"loss": 1.1569,
"step": 191
},
{
"epoch": 0.5267489711934157,
"grad_norm": 0.08352147787809372,
"learning_rate": 0.0001303030303030303,
"loss": 1.0751,
"step": 192
},
{
"epoch": 0.5294924554183813,
"grad_norm": 0.07809505611658096,
"learning_rate": 0.00012954545454545456,
"loss": 1.0831,
"step": 193
},
{
"epoch": 0.532235939643347,
"grad_norm": 0.077943816781044,
"learning_rate": 0.00012878787878787878,
"loss": 1.0779,
"step": 194
},
{
"epoch": 0.5349794238683128,
"grad_norm": 0.07693532109260559,
"learning_rate": 0.00012803030303030303,
"loss": 1.067,
"step": 195
},
{
"epoch": 0.5377229080932785,
"grad_norm": 0.07566115260124207,
"learning_rate": 0.00012727272727272728,
"loss": 1.1247,
"step": 196
},
{
"epoch": 0.5404663923182441,
"grad_norm": 0.07868772745132446,
"learning_rate": 0.00012651515151515153,
"loss": 1.1299,
"step": 197
},
{
"epoch": 0.5432098765432098,
"grad_norm": 0.08335736393928528,
"learning_rate": 0.00012575757575757575,
"loss": 1.0941,
"step": 198
},
{
"epoch": 0.5459533607681756,
"grad_norm": 0.08072495460510254,
"learning_rate": 0.000125,
"loss": 1.0399,
"step": 199
},
{
"epoch": 0.5486968449931413,
"grad_norm": 0.07898343354463577,
"learning_rate": 0.00012424242424242425,
"loss": 1.0546,
"step": 200
},
{
"epoch": 0.551440329218107,
"grad_norm": 0.0824529305100441,
"learning_rate": 0.0001234848484848485,
"loss": 1.0855,
"step": 201
},
{
"epoch": 0.5541838134430727,
"grad_norm": 0.08226650953292847,
"learning_rate": 0.00012272727272727272,
"loss": 1.1273,
"step": 202
},
{
"epoch": 0.5569272976680384,
"grad_norm": 0.07998031377792358,
"learning_rate": 0.00012196969696969697,
"loss": 1.0859,
"step": 203
},
{
"epoch": 0.5596707818930041,
"grad_norm": 0.08315291255712509,
"learning_rate": 0.00012121212121212122,
"loss": 1.0044,
"step": 204
},
{
"epoch": 0.5624142661179699,
"grad_norm": 0.08188676834106445,
"learning_rate": 0.00012045454545454546,
"loss": 1.1204,
"step": 205
},
{
"epoch": 0.5651577503429356,
"grad_norm": 0.07907616347074509,
"learning_rate": 0.00011969696969696971,
"loss": 1.0595,
"step": 206
},
{
"epoch": 0.5679012345679012,
"grad_norm": 0.07982588559389114,
"learning_rate": 0.00011893939393939394,
"loss": 1.0917,
"step": 207
},
{
"epoch": 0.5706447187928669,
"grad_norm": 0.0851968377828598,
"learning_rate": 0.0001181818181818182,
"loss": 1.1344,
"step": 208
},
{
"epoch": 0.5733882030178327,
"grad_norm": 0.08252400159835815,
"learning_rate": 0.00011742424242424244,
"loss": 1.0821,
"step": 209
},
{
"epoch": 0.5761316872427984,
"grad_norm": 0.07744535803794861,
"learning_rate": 0.00011666666666666668,
"loss": 1.12,
"step": 210
},
{
"epoch": 0.578875171467764,
"grad_norm": 0.08743196725845337,
"learning_rate": 0.00011590909090909093,
"loss": 1.0335,
"step": 211
},
{
"epoch": 0.5816186556927297,
"grad_norm": 0.07998170703649521,
"learning_rate": 0.00011515151515151516,
"loss": 1.1517,
"step": 212
},
{
"epoch": 0.5843621399176955,
"grad_norm": 0.0830145925283432,
"learning_rate": 0.00011439393939393941,
"loss": 1.0338,
"step": 213
},
{
"epoch": 0.5871056241426612,
"grad_norm": 0.07934883236885071,
"learning_rate": 0.00011363636363636365,
"loss": 1.0978,
"step": 214
},
{
"epoch": 0.5898491083676269,
"grad_norm": 0.08373104780912399,
"learning_rate": 0.0001128787878787879,
"loss": 1.0454,
"step": 215
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.07729553431272507,
"learning_rate": 0.00011212121212121212,
"loss": 1.0724,
"step": 216
},
{
"epoch": 0.5953360768175583,
"grad_norm": 0.08006753772497177,
"learning_rate": 0.00011136363636363636,
"loss": 1.1247,
"step": 217
},
{
"epoch": 0.598079561042524,
"grad_norm": 0.07897701859474182,
"learning_rate": 0.00011060606060606061,
"loss": 1.1037,
"step": 218
},
{
"epoch": 0.6008230452674898,
"grad_norm": 0.0868496224284172,
"learning_rate": 0.00010984848484848484,
"loss": 1.0103,
"step": 219
},
{
"epoch": 0.6035665294924554,
"grad_norm": 0.07509775459766388,
"learning_rate": 0.00010909090909090909,
"loss": 1.1151,
"step": 220
},
{
"epoch": 0.6063100137174211,
"grad_norm": 0.08052953332662582,
"learning_rate": 0.00010833333333333333,
"loss": 1.0913,
"step": 221
},
{
"epoch": 0.6090534979423868,
"grad_norm": 0.08503436297178268,
"learning_rate": 0.00010757575757575758,
"loss": 1.0426,
"step": 222
},
{
"epoch": 0.6117969821673526,
"grad_norm": 0.08087459206581116,
"learning_rate": 0.00010681818181818181,
"loss": 1.0417,
"step": 223
},
{
"epoch": 0.6145404663923183,
"grad_norm": 0.08182617276906967,
"learning_rate": 0.00010606060606060606,
"loss": 1.0696,
"step": 224
},
{
"epoch": 0.6172839506172839,
"grad_norm": 0.08103771507740021,
"learning_rate": 0.0001053030303030303,
"loss": 1.1263,
"step": 225
},
{
"epoch": 0.6200274348422496,
"grad_norm": 0.0866732969880104,
"learning_rate": 0.00010454545454545455,
"loss": 1.0592,
"step": 226
},
{
"epoch": 0.6227709190672154,
"grad_norm": 0.07961411774158478,
"learning_rate": 0.00010378787878787878,
"loss": 1.115,
"step": 227
},
{
"epoch": 0.6255144032921811,
"grad_norm": 0.08118937164545059,
"learning_rate": 0.00010303030303030303,
"loss": 1.1054,
"step": 228
},
{
"epoch": 0.6282578875171467,
"grad_norm": 0.07873468846082687,
"learning_rate": 0.00010227272727272727,
"loss": 1.1006,
"step": 229
},
{
"epoch": 0.6310013717421125,
"grad_norm": 0.08241429179906845,
"learning_rate": 0.00010151515151515152,
"loss": 1.0152,
"step": 230
},
{
"epoch": 0.6337448559670782,
"grad_norm": 0.08102323859930038,
"learning_rate": 0.00010075757575757576,
"loss": 1.0657,
"step": 231
},
{
"epoch": 0.6364883401920439,
"grad_norm": 0.08024183660745621,
"learning_rate": 0.0001,
"loss": 1.1208,
"step": 232
},
{
"epoch": 0.6392318244170097,
"grad_norm": 0.08709632605314255,
"learning_rate": 9.924242424242425e-05,
"loss": 1.0167,
"step": 233
},
{
"epoch": 0.6419753086419753,
"grad_norm": 0.08346287161111832,
"learning_rate": 9.848484848484849e-05,
"loss": 1.0715,
"step": 234
},
{
"epoch": 0.644718792866941,
"grad_norm": 0.08752211928367615,
"learning_rate": 9.772727272727274e-05,
"loss": 1.0333,
"step": 235
},
{
"epoch": 0.6474622770919067,
"grad_norm": 0.08118908852338791,
"learning_rate": 9.696969696969698e-05,
"loss": 1.0912,
"step": 236
},
{
"epoch": 0.6502057613168725,
"grad_norm": 0.08248468488454819,
"learning_rate": 9.621212121212123e-05,
"loss": 1.0387,
"step": 237
},
{
"epoch": 0.6529492455418381,
"grad_norm": 0.08687452971935272,
"learning_rate": 9.545454545454546e-05,
"loss": 1.0663,
"step": 238
},
{
"epoch": 0.6556927297668038,
"grad_norm": 0.07929343730211258,
"learning_rate": 9.469696969696971e-05,
"loss": 1.0821,
"step": 239
},
{
"epoch": 0.6584362139917695,
"grad_norm": 0.08319993317127228,
"learning_rate": 9.393939393939395e-05,
"loss": 1.1171,
"step": 240
},
{
"epoch": 0.6611796982167353,
"grad_norm": 0.07612979412078857,
"learning_rate": 9.318181818181818e-05,
"loss": 1.0876,
"step": 241
},
{
"epoch": 0.663923182441701,
"grad_norm": 0.08205942809581757,
"learning_rate": 9.242424242424242e-05,
"loss": 1.1507,
"step": 242
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.08132138848304749,
"learning_rate": 9.166666666666667e-05,
"loss": 1.0434,
"step": 243
},
{
"epoch": 0.6694101508916324,
"grad_norm": 0.08600456267595291,
"learning_rate": 9.090909090909092e-05,
"loss": 1.0111,
"step": 244
},
{
"epoch": 0.6721536351165981,
"grad_norm": 0.088123619556427,
"learning_rate": 9.015151515151515e-05,
"loss": 1.0903,
"step": 245
},
{
"epoch": 0.6748971193415638,
"grad_norm": 0.0872405469417572,
"learning_rate": 8.93939393939394e-05,
"loss": 1.0554,
"step": 246
},
{
"epoch": 0.6776406035665294,
"grad_norm": 0.08145253360271454,
"learning_rate": 8.863636363636364e-05,
"loss": 1.0823,
"step": 247
},
{
"epoch": 0.6803840877914952,
"grad_norm": 0.08167996257543564,
"learning_rate": 8.787878787878789e-05,
"loss": 1.0948,
"step": 248
},
{
"epoch": 0.6831275720164609,
"grad_norm": 0.08128040283918381,
"learning_rate": 8.712121212121212e-05,
"loss": 1.0737,
"step": 249
},
{
"epoch": 0.6858710562414266,
"grad_norm": 0.09208956360816956,
"learning_rate": 8.636363636363637e-05,
"loss": 1.0589,
"step": 250
},
{
"epoch": 0.6886145404663924,
"grad_norm": 0.085269995033741,
"learning_rate": 8.560606060606061e-05,
"loss": 1.0887,
"step": 251
},
{
"epoch": 0.691358024691358,
"grad_norm": 0.08460511267185211,
"learning_rate": 8.484848484848486e-05,
"loss": 1.0189,
"step": 252
},
{
"epoch": 0.6941015089163237,
"grad_norm": 0.08053672313690186,
"learning_rate": 8.40909090909091e-05,
"loss": 1.0129,
"step": 253
},
{
"epoch": 0.6968449931412894,
"grad_norm": 0.08292900770902634,
"learning_rate": 8.333333333333334e-05,
"loss": 1.0723,
"step": 254
},
{
"epoch": 0.6995884773662552,
"grad_norm": 0.08680149167776108,
"learning_rate": 8.257575757575758e-05,
"loss": 1.0794,
"step": 255
},
{
"epoch": 0.7023319615912208,
"grad_norm": 0.08525680750608444,
"learning_rate": 8.181818181818183e-05,
"loss": 1.0633,
"step": 256
},
{
"epoch": 0.7050754458161865,
"grad_norm": 0.08447825908660889,
"learning_rate": 8.106060606060607e-05,
"loss": 1.0485,
"step": 257
},
{
"epoch": 0.7078189300411523,
"grad_norm": 0.08495179563760757,
"learning_rate": 8.03030303030303e-05,
"loss": 1.0915,
"step": 258
},
{
"epoch": 0.710562414266118,
"grad_norm": 0.07913429290056229,
"learning_rate": 7.954545454545455e-05,
"loss": 1.0692,
"step": 259
},
{
"epoch": 0.7133058984910837,
"grad_norm": 0.08367093652486801,
"learning_rate": 7.878787878787879e-05,
"loss": 1.0583,
"step": 260
},
{
"epoch": 0.7160493827160493,
"grad_norm": 0.08013223111629486,
"learning_rate": 7.803030303030304e-05,
"loss": 1.0461,
"step": 261
},
{
"epoch": 0.7187928669410151,
"grad_norm": 0.0824413076043129,
"learning_rate": 7.727272727272727e-05,
"loss": 1.1834,
"step": 262
},
{
"epoch": 0.7215363511659808,
"grad_norm": 0.08083179593086243,
"learning_rate": 7.651515151515152e-05,
"loss": 1.1348,
"step": 263
},
{
"epoch": 0.7242798353909465,
"grad_norm": 0.0788070410490036,
"learning_rate": 7.575757575757576e-05,
"loss": 1.1085,
"step": 264
},
{
"epoch": 0.7270233196159122,
"grad_norm": 0.0821022316813469,
"learning_rate": 7.500000000000001e-05,
"loss": 1.0878,
"step": 265
},
{
"epoch": 0.7297668038408779,
"grad_norm": 0.08073017746210098,
"learning_rate": 7.424242424242424e-05,
"loss": 1.0041,
"step": 266
},
{
"epoch": 0.7325102880658436,
"grad_norm": 0.08375431597232819,
"learning_rate": 7.348484848484849e-05,
"loss": 1.0785,
"step": 267
},
{
"epoch": 0.7352537722908093,
"grad_norm": 0.08851874619722366,
"learning_rate": 7.272727272727273e-05,
"loss": 1.1159,
"step": 268
},
{
"epoch": 0.7379972565157751,
"grad_norm": 0.08411041647195816,
"learning_rate": 7.196969696969698e-05,
"loss": 1.0892,
"step": 269
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.08156825602054596,
"learning_rate": 7.121212121212121e-05,
"loss": 1.0392,
"step": 270
},
{
"epoch": 0.7434842249657064,
"grad_norm": 0.0837472677230835,
"learning_rate": 7.045454545454546e-05,
"loss": 1.0213,
"step": 271
},
{
"epoch": 0.7462277091906722,
"grad_norm": 0.07907503843307495,
"learning_rate": 6.96969696969697e-05,
"loss": 1.0665,
"step": 272
},
{
"epoch": 0.7489711934156379,
"grad_norm": 0.0840056911110878,
"learning_rate": 6.893939393939395e-05,
"loss": 1.0486,
"step": 273
},
{
"epoch": 0.7517146776406035,
"grad_norm": 0.08614211529493332,
"learning_rate": 6.818181818181818e-05,
"loss": 1.0542,
"step": 274
},
{
"epoch": 0.7544581618655692,
"grad_norm": 0.07795161753892899,
"learning_rate": 6.742424242424242e-05,
"loss": 1.1377,
"step": 275
},
{
"epoch": 0.757201646090535,
"grad_norm": 0.08403259515762329,
"learning_rate": 6.666666666666667e-05,
"loss": 1.1508,
"step": 276
},
{
"epoch": 0.7599451303155007,
"grad_norm": 0.08293148130178452,
"learning_rate": 6.59090909090909e-05,
"loss": 1.0358,
"step": 277
},
{
"epoch": 0.7626886145404664,
"grad_norm": 0.08527221530675888,
"learning_rate": 6.515151515151516e-05,
"loss": 1.0521,
"step": 278
},
{
"epoch": 0.7654320987654321,
"grad_norm": 0.08584438264369965,
"learning_rate": 6.439393939393939e-05,
"loss": 1.0842,
"step": 279
},
{
"epoch": 0.7681755829903978,
"grad_norm": 0.08304653316736221,
"learning_rate": 6.363636363636364e-05,
"loss": 1.0504,
"step": 280
},
{
"epoch": 0.7709190672153635,
"grad_norm": 0.08562014997005463,
"learning_rate": 6.287878787878788e-05,
"loss": 1.1145,
"step": 281
},
{
"epoch": 0.7736625514403292,
"grad_norm": 0.08264237642288208,
"learning_rate": 6.212121212121213e-05,
"loss": 1.1025,
"step": 282
},
{
"epoch": 0.7764060356652949,
"grad_norm": 0.08219614624977112,
"learning_rate": 6.136363636363636e-05,
"loss": 1.0686,
"step": 283
},
{
"epoch": 0.7791495198902606,
"grad_norm": 0.08253966271877289,
"learning_rate": 6.060606060606061e-05,
"loss": 1.0494,
"step": 284
},
{
"epoch": 0.7818930041152263,
"grad_norm": 0.08081547915935516,
"learning_rate": 5.9848484848484854e-05,
"loss": 1.1232,
"step": 285
},
{
"epoch": 0.7846364883401921,
"grad_norm": 0.08054647594690323,
"learning_rate": 5.90909090909091e-05,
"loss": 1.1093,
"step": 286
},
{
"epoch": 0.7873799725651578,
"grad_norm": 0.08239459246397018,
"learning_rate": 5.833333333333334e-05,
"loss": 1.0773,
"step": 287
},
{
"epoch": 0.7901234567901234,
"grad_norm": 0.08540047705173492,
"learning_rate": 5.757575757575758e-05,
"loss": 1.0504,
"step": 288
},
{
"epoch": 0.7928669410150891,
"grad_norm": 0.08504082262516022,
"learning_rate": 5.6818181818181825e-05,
"loss": 1.0244,
"step": 289
},
{
"epoch": 0.7956104252400549,
"grad_norm": 0.0847964882850647,
"learning_rate": 5.606060606060606e-05,
"loss": 1.0745,
"step": 290
},
{
"epoch": 0.7983539094650206,
"grad_norm": 0.08256299793720245,
"learning_rate": 5.5303030303030304e-05,
"loss": 1.1221,
"step": 291
},
{
"epoch": 0.8010973936899863,
"grad_norm": 0.08534077554941177,
"learning_rate": 5.4545454545454546e-05,
"loss": 1.0621,
"step": 292
},
{
"epoch": 0.803840877914952,
"grad_norm": 0.08906951546669006,
"learning_rate": 5.378787878787879e-05,
"loss": 1.0925,
"step": 293
},
{
"epoch": 0.8065843621399177,
"grad_norm": 0.08183780312538147,
"learning_rate": 5.303030303030303e-05,
"loss": 1.1554,
"step": 294
},
{
"epoch": 0.8093278463648834,
"grad_norm": 0.08581645786762238,
"learning_rate": 5.2272727272727274e-05,
"loss": 1.061,
"step": 295
},
{
"epoch": 0.8120713305898491,
"grad_norm": 0.08507423847913742,
"learning_rate": 5.151515151515152e-05,
"loss": 0.9929,
"step": 296
},
{
"epoch": 0.8148148148148148,
"grad_norm": 0.08369109034538269,
"learning_rate": 5.075757575757576e-05,
"loss": 1.1093,
"step": 297
},
{
"epoch": 0.8175582990397805,
"grad_norm": 0.08173946291208267,
"learning_rate": 5e-05,
"loss": 1.0196,
"step": 298
},
{
"epoch": 0.8203017832647462,
"grad_norm": 0.08361264318227768,
"learning_rate": 4.9242424242424245e-05,
"loss": 1.0283,
"step": 299
},
{
"epoch": 0.823045267489712,
"grad_norm": 0.08748258650302887,
"learning_rate": 4.848484848484849e-05,
"loss": 0.9691,
"step": 300
},
{
"epoch": 0.8257887517146777,
"grad_norm": 0.08660106360912323,
"learning_rate": 4.772727272727273e-05,
"loss": 0.9973,
"step": 301
},
{
"epoch": 0.8285322359396433,
"grad_norm": 0.08636549115180969,
"learning_rate": 4.696969696969697e-05,
"loss": 1.1138,
"step": 302
},
{
"epoch": 0.831275720164609,
"grad_norm": 0.08026378601789474,
"learning_rate": 4.621212121212121e-05,
"loss": 1.1085,
"step": 303
},
{
"epoch": 0.8340192043895748,
"grad_norm": 0.08145099878311157,
"learning_rate": 4.545454545454546e-05,
"loss": 1.0545,
"step": 304
},
{
"epoch": 0.8367626886145405,
"grad_norm": 0.08613786101341248,
"learning_rate": 4.46969696969697e-05,
"loss": 1.0303,
"step": 305
},
{
"epoch": 0.8395061728395061,
"grad_norm": 0.078981414437294,
"learning_rate": 4.3939393939393944e-05,
"loss": 1.083,
"step": 306
},
{
"epoch": 0.8422496570644719,
"grad_norm": 0.07854614406824112,
"learning_rate": 4.318181818181819e-05,
"loss": 1.1413,
"step": 307
},
{
"epoch": 0.8449931412894376,
"grad_norm": 0.08534348011016846,
"learning_rate": 4.242424242424243e-05,
"loss": 1.044,
"step": 308
},
{
"epoch": 0.8477366255144033,
"grad_norm": 0.08599614351987839,
"learning_rate": 4.166666666666667e-05,
"loss": 1.1798,
"step": 309
},
{
"epoch": 0.850480109739369,
"grad_norm": 0.08740722388029099,
"learning_rate": 4.0909090909090915e-05,
"loss": 1.0741,
"step": 310
},
{
"epoch": 0.8532235939643347,
"grad_norm": 0.08579033613204956,
"learning_rate": 4.015151515151515e-05,
"loss": 1.046,
"step": 311
},
{
"epoch": 0.8559670781893004,
"grad_norm": 0.0809055045247078,
"learning_rate": 3.939393939393939e-05,
"loss": 1.1218,
"step": 312
},
{
"epoch": 0.8587105624142661,
"grad_norm": 0.08440711349248886,
"learning_rate": 3.8636363636363636e-05,
"loss": 1.1075,
"step": 313
},
{
"epoch": 0.8614540466392319,
"grad_norm": 0.08196821063756943,
"learning_rate": 3.787878787878788e-05,
"loss": 1.15,
"step": 314
},
{
"epoch": 0.8641975308641975,
"grad_norm": 0.08028547465801239,
"learning_rate": 3.712121212121212e-05,
"loss": 1.1572,
"step": 315
},
{
"epoch": 0.8669410150891632,
"grad_norm": 0.08412973582744598,
"learning_rate": 3.6363636363636364e-05,
"loss": 1.0748,
"step": 316
},
{
"epoch": 0.869684499314129,
"grad_norm": 0.08089521527290344,
"learning_rate": 3.560606060606061e-05,
"loss": 1.0628,
"step": 317
},
{
"epoch": 0.8724279835390947,
"grad_norm": 0.09000709652900696,
"learning_rate": 3.484848484848485e-05,
"loss": 1.038,
"step": 318
},
{
"epoch": 0.8751714677640604,
"grad_norm": 0.08310791105031967,
"learning_rate": 3.409090909090909e-05,
"loss": 1.1304,
"step": 319
},
{
"epoch": 0.877914951989026,
"grad_norm": 0.07905035465955734,
"learning_rate": 3.3333333333333335e-05,
"loss": 1.0579,
"step": 320
},
{
"epoch": 0.8806584362139918,
"grad_norm": 0.08173554390668869,
"learning_rate": 3.257575757575758e-05,
"loss": 1.1301,
"step": 321
},
{
"epoch": 0.8834019204389575,
"grad_norm": 0.08850661665201187,
"learning_rate": 3.181818181818182e-05,
"loss": 1.0727,
"step": 322
},
{
"epoch": 0.8861454046639232,
"grad_norm": 0.08684371411800385,
"learning_rate": 3.106060606060606e-05,
"loss": 1.0485,
"step": 323
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.0840585008263588,
"learning_rate": 3.0303030303030306e-05,
"loss": 1.0585,
"step": 324
},
{
"epoch": 0.8916323731138546,
"grad_norm": 0.08585873991250992,
"learning_rate": 2.954545454545455e-05,
"loss": 1.0302,
"step": 325
},
{
"epoch": 0.8943758573388203,
"grad_norm": 0.08615091443061829,
"learning_rate": 2.878787878787879e-05,
"loss": 1.0538,
"step": 326
},
{
"epoch": 0.897119341563786,
"grad_norm": 0.08681048452854156,
"learning_rate": 2.803030303030303e-05,
"loss": 0.974,
"step": 327
},
{
"epoch": 0.8998628257887518,
"grad_norm": 0.08452990651130676,
"learning_rate": 2.7272727272727273e-05,
"loss": 1.0475,
"step": 328
},
{
"epoch": 0.9026063100137174,
"grad_norm": 0.08300242573022842,
"learning_rate": 2.6515151515151516e-05,
"loss": 1.1505,
"step": 329
},
{
"epoch": 0.9053497942386831,
"grad_norm": 0.0792202427983284,
"learning_rate": 2.575757575757576e-05,
"loss": 1.0701,
"step": 330
},
{
"epoch": 0.9080932784636488,
"grad_norm": 0.08142198622226715,
"learning_rate": 2.5e-05,
"loss": 1.047,
"step": 331
},
{
"epoch": 0.9108367626886146,
"grad_norm": 0.08813714981079102,
"learning_rate": 2.4242424242424244e-05,
"loss": 1.0553,
"step": 332
},
{
"epoch": 0.9135802469135802,
"grad_norm": 0.09316329658031464,
"learning_rate": 2.3484848484848487e-05,
"loss": 1.0672,
"step": 333
},
{
"epoch": 0.9163237311385459,
"grad_norm": 0.09021364152431488,
"learning_rate": 2.272727272727273e-05,
"loss": 1.0518,
"step": 334
},
{
"epoch": 0.9190672153635117,
"grad_norm": 0.08736269921064377,
"learning_rate": 2.1969696969696972e-05,
"loss": 1.03,
"step": 335
},
{
"epoch": 0.9218106995884774,
"grad_norm": 0.08807602524757385,
"learning_rate": 2.1212121212121215e-05,
"loss": 1.0276,
"step": 336
},
{
"epoch": 0.9245541838134431,
"grad_norm": 0.08561466634273529,
"learning_rate": 2.0454545454545457e-05,
"loss": 1.0449,
"step": 337
},
{
"epoch": 0.9272976680384087,
"grad_norm": 0.0889008566737175,
"learning_rate": 1.9696969696969697e-05,
"loss": 1.0515,
"step": 338
},
{
"epoch": 0.9300411522633745,
"grad_norm": 0.08104939758777618,
"learning_rate": 1.893939393939394e-05,
"loss": 1.0403,
"step": 339
},
{
"epoch": 0.9327846364883402,
"grad_norm": 0.08113296329975128,
"learning_rate": 1.8181818181818182e-05,
"loss": 1.0841,
"step": 340
},
{
"epoch": 0.9355281207133059,
"grad_norm": 0.08735201507806778,
"learning_rate": 1.7424242424242425e-05,
"loss": 1.0661,
"step": 341
},
{
"epoch": 0.9382716049382716,
"grad_norm": 0.08495688438415527,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.0982,
"step": 342
},
{
"epoch": 0.9410150891632373,
"grad_norm": 0.08920719474554062,
"learning_rate": 1.590909090909091e-05,
"loss": 1.0408,
"step": 343
},
{
"epoch": 0.943758573388203,
"grad_norm": 0.08456548303365707,
"learning_rate": 1.5151515151515153e-05,
"loss": 1.049,
"step": 344
},
{
"epoch": 0.9465020576131687,
"grad_norm": 0.08137591183185577,
"learning_rate": 1.4393939393939396e-05,
"loss": 1.06,
"step": 345
},
{
"epoch": 0.9492455418381345,
"grad_norm": 0.08184567838907242,
"learning_rate": 1.3636363636363637e-05,
"loss": 1.0403,
"step": 346
},
{
"epoch": 0.9519890260631001,
"grad_norm": 0.07847440242767334,
"learning_rate": 1.287878787878788e-05,
"loss": 1.0806,
"step": 347
},
{
"epoch": 0.9547325102880658,
"grad_norm": 0.0863838866353035,
"learning_rate": 1.2121212121212122e-05,
"loss": 1.0958,
"step": 348
},
{
"epoch": 0.9574759945130316,
"grad_norm": 0.08629950135946274,
"learning_rate": 1.1363636363636365e-05,
"loss": 0.9939,
"step": 349
},
{
"epoch": 0.9602194787379973,
"grad_norm": 0.08571231365203857,
"learning_rate": 1.0606060606060607e-05,
"loss": 1.0633,
"step": 350
},
{
"epoch": 0.9629629629629629,
"grad_norm": 0.08512404561042786,
"learning_rate": 9.848484848484848e-06,
"loss": 1.0605,
"step": 351
},
{
"epoch": 0.9657064471879286,
"grad_norm": 0.08234819024801254,
"learning_rate": 9.090909090909091e-06,
"loss": 1.0379,
"step": 352
},
{
"epoch": 0.9684499314128944,
"grad_norm": 0.08618064224720001,
"learning_rate": 8.333333333333334e-06,
"loss": 1.0862,
"step": 353
},
{
"epoch": 0.9711934156378601,
"grad_norm": 0.08058779686689377,
"learning_rate": 7.5757575757575764e-06,
"loss": 1.0757,
"step": 354
},
{
"epoch": 0.9739368998628258,
"grad_norm": 0.08263111859560013,
"learning_rate": 6.818181818181818e-06,
"loss": 1.0886,
"step": 355
},
{
"epoch": 0.9766803840877915,
"grad_norm": 0.08850718289613724,
"learning_rate": 6.060606060606061e-06,
"loss": 1.0127,
"step": 356
},
{
"epoch": 0.9794238683127572,
"grad_norm": 0.08599220961332321,
"learning_rate": 5.303030303030304e-06,
"loss": 1.0171,
"step": 357
},
{
"epoch": 0.9821673525377229,
"grad_norm": 0.08793280273675919,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.0184,
"step": 358
},
{
"epoch": 0.9849108367626886,
"grad_norm": 0.0828399509191513,
"learning_rate": 3.7878787878787882e-06,
"loss": 1.0628,
"step": 359
},
{
"epoch": 0.9876543209876543,
"grad_norm": 0.08672760426998138,
"learning_rate": 3.0303030303030305e-06,
"loss": 1.1291,
"step": 360
},
{
"epoch": 0.99039780521262,
"grad_norm": 0.08883443474769592,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.9949,
"step": 361
},
{
"epoch": 0.9931412894375857,
"grad_norm": 0.08098744601011276,
"learning_rate": 1.5151515151515152e-06,
"loss": 1.0165,
"step": 362
},
{
"epoch": 0.9958847736625515,
"grad_norm": 0.0881686583161354,
"learning_rate": 7.575757575757576e-07,
"loss": 1.0239,
"step": 363
},
{
"epoch": 0.9986282578875172,
"grad_norm": 0.08588221669197083,
"learning_rate": 0.0,
"loss": 1.0815,
"step": 364
}
],
"logging_steps": 1,
"max_steps": 364,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.563690834619392e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}