pm / checkpoint-1664 / trainer_state.json
Commit 0647465 ("pm first train") by yyuan244
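The log_history entries below (step, loss, learning_rate, grad_norm) can be read back with plain json and plotted to sanity-check the warmup and loss curve. A minimal sketch, assuming the checkpoint path shown on this page and that matplotlib is available as a plotting dependency:

import json
import matplotlib.pyplot as plt  # assumed installed; any plotting library would do

# Load the trainer state written by the Hugging Face Trainer for this checkpoint
# (path taken from this page; adjust to your local directory layout).
with open("checkpoint-1664/trainer_state.json") as f:
    state = json.load(f)

# Keep only the training-log entries that carry a loss value.
logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]
losses = [e["loss"] for e in logs]
lrs = [e["learning_rate"] for e in logs]

# Loss and learning-rate schedule on a shared step axis.
fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True)
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning_rate")
ax_lr.set_xlabel("step")
plt.show()

The raw trainer_state.json contents follow.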
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1664,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001201923076923077,
"grad_norm": 354.53998230392546,
"learning_rate": 2.5000000000000004e-07,
"loss": 7.4318,
"step": 2
},
{
"epoch": 0.002403846153846154,
"grad_norm": 371.91393245440116,
"learning_rate": 5.000000000000001e-07,
"loss": 7.4355,
"step": 4
},
{
"epoch": 0.003605769230769231,
"grad_norm": 346.9801049377746,
"learning_rate": 7.5e-07,
"loss": 7.2172,
"step": 6
},
{
"epoch": 0.004807692307692308,
"grad_norm": 449.5218610086392,
"learning_rate": 1.0000000000000002e-06,
"loss": 5.8268,
"step": 8
},
{
"epoch": 0.006009615384615385,
"grad_norm": 139.92367559356336,
"learning_rate": 1.25e-06,
"loss": 2.5854,
"step": 10
},
{
"epoch": 0.007211538461538462,
"grad_norm": 52.715150799729045,
"learning_rate": 1.5e-06,
"loss": 0.5361,
"step": 12
},
{
"epoch": 0.008413461538461538,
"grad_norm": 19.976853421159532,
"learning_rate": 1.75e-06,
"loss": 0.2878,
"step": 14
},
{
"epoch": 0.009615384615384616,
"grad_norm": 20.500149107050714,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.2495,
"step": 16
},
{
"epoch": 0.010817307692307692,
"grad_norm": 17.263902163149385,
"learning_rate": 2.25e-06,
"loss": 0.2177,
"step": 18
},
{
"epoch": 0.01201923076923077,
"grad_norm": 12.128764817788255,
"learning_rate": 2.5e-06,
"loss": 0.212,
"step": 20
},
{
"epoch": 0.013221153846153846,
"grad_norm": 8.417030179298662,
"learning_rate": 2.7500000000000004e-06,
"loss": 0.2035,
"step": 22
},
{
"epoch": 0.014423076923076924,
"grad_norm": 10.874846551654207,
"learning_rate": 3e-06,
"loss": 0.1999,
"step": 24
},
{
"epoch": 0.015625,
"grad_norm": 2.553082597942841,
"learning_rate": 3.2500000000000002e-06,
"loss": 0.1884,
"step": 26
},
{
"epoch": 0.016826923076923076,
"grad_norm": 8.091183712435873,
"learning_rate": 3.5e-06,
"loss": 0.1729,
"step": 28
},
{
"epoch": 0.018028846153846152,
"grad_norm": 6.473289695229128,
"learning_rate": 3.7500000000000005e-06,
"loss": 0.1858,
"step": 30
},
{
"epoch": 0.019230769230769232,
"grad_norm": 10.845224583341055,
"learning_rate": 4.000000000000001e-06,
"loss": 0.1779,
"step": 32
},
{
"epoch": 0.020432692307692308,
"grad_norm": 7.588560990570617,
"learning_rate": 4.25e-06,
"loss": 0.1807,
"step": 34
},
{
"epoch": 0.021634615384615384,
"grad_norm": 3.2833536176531437,
"learning_rate": 4.5e-06,
"loss": 0.1741,
"step": 36
},
{
"epoch": 0.02283653846153846,
"grad_norm": 9.48172518986478,
"learning_rate": 4.75e-06,
"loss": 0.17,
"step": 38
},
{
"epoch": 0.02403846153846154,
"grad_norm": 10.178982049068438,
"learning_rate": 5e-06,
"loss": 0.1603,
"step": 40
},
{
"epoch": 0.025240384615384616,
"grad_norm": 2.410777001535273,
"learning_rate": 4.999981288993133e-06,
"loss": 0.1772,
"step": 42
},
{
"epoch": 0.026442307692307692,
"grad_norm": 11.143463053654319,
"learning_rate": 4.999925156252611e-06,
"loss": 0.1799,
"step": 44
},
{
"epoch": 0.027644230769230768,
"grad_norm": 9.886918600788055,
"learning_rate": 4.9998316026186755e-06,
"loss": 0.1773,
"step": 46
},
{
"epoch": 0.028846153846153848,
"grad_norm": 7.527819471799396,
"learning_rate": 4.999700629491713e-06,
"loss": 0.176,
"step": 48
},
{
"epoch": 0.030048076923076924,
"grad_norm": 7.572713386840099,
"learning_rate": 4.999532238832233e-06,
"loss": 0.153,
"step": 50
},
{
"epoch": 0.03125,
"grad_norm": 3.477539644705173,
"learning_rate": 4.999326433160844e-06,
"loss": 0.1588,
"step": 52
},
{
"epoch": 0.03245192307692308,
"grad_norm": 2.5927408403215266,
"learning_rate": 4.999083215558211e-06,
"loss": 0.1657,
"step": 54
},
{
"epoch": 0.03365384615384615,
"grad_norm": 1.8986386268295627,
"learning_rate": 4.998802589665009e-06,
"loss": 0.1624,
"step": 56
},
{
"epoch": 0.03485576923076923,
"grad_norm": 4.511287232603737,
"learning_rate": 4.998484559681875e-06,
"loss": 0.1604,
"step": 58
},
{
"epoch": 0.036057692307692304,
"grad_norm": 3.1615021035675586,
"learning_rate": 4.998129130369338e-06,
"loss": 0.1541,
"step": 60
},
{
"epoch": 0.037259615384615384,
"grad_norm": 2.3641598718509163,
"learning_rate": 4.997736307047748e-06,
"loss": 0.1609,
"step": 62
},
{
"epoch": 0.038461538461538464,
"grad_norm": 2.1050208268263018,
"learning_rate": 4.997306095597203e-06,
"loss": 0.1628,
"step": 64
},
{
"epoch": 0.039663461538461536,
"grad_norm": 4.670481996347925,
"learning_rate": 4.996838502457453e-06,
"loss": 0.1605,
"step": 66
},
{
"epoch": 0.040865384615384616,
"grad_norm": 2.812878853953604,
"learning_rate": 4.99633353462781e-06,
"loss": 0.1394,
"step": 68
},
{
"epoch": 0.042067307692307696,
"grad_norm": 3.3453547259139658,
"learning_rate": 4.995791199667038e-06,
"loss": 0.1353,
"step": 70
},
{
"epoch": 0.04326923076923077,
"grad_norm": 9.508467033910211,
"learning_rate": 4.9952115056932445e-06,
"loss": 0.1464,
"step": 72
},
{
"epoch": 0.04447115384615385,
"grad_norm": 6.12221208629695,
"learning_rate": 4.994594461383756e-06,
"loss": 0.1534,
"step": 74
},
{
"epoch": 0.04567307692307692,
"grad_norm": 3.2936985183570644,
"learning_rate": 4.993940075974988e-06,
"loss": 0.1551,
"step": 76
},
{
"epoch": 0.046875,
"grad_norm": 4.147948524170783,
"learning_rate": 4.993248359262308e-06,
"loss": 0.1599,
"step": 78
},
{
"epoch": 0.04807692307692308,
"grad_norm": 6.328245841206747,
"learning_rate": 4.99251932159989e-06,
"loss": 0.1433,
"step": 80
},
{
"epoch": 0.04927884615384615,
"grad_norm": 3.454386437171457,
"learning_rate": 4.991752973900558e-06,
"loss": 0.1589,
"step": 82
},
{
"epoch": 0.05048076923076923,
"grad_norm": 4.628897164295182,
"learning_rate": 4.9909493276356184e-06,
"loss": 0.16,
"step": 84
},
{
"epoch": 0.051682692307692304,
"grad_norm": 4.959693562958358,
"learning_rate": 4.990108394834698e-06,
"loss": 0.1504,
"step": 86
},
{
"epoch": 0.052884615384615384,
"grad_norm": 2.037506021404142,
"learning_rate": 4.9892301880855565e-06,
"loss": 0.1469,
"step": 88
},
{
"epoch": 0.054086538461538464,
"grad_norm": 3.2873483486362898,
"learning_rate": 4.988314720533899e-06,
"loss": 0.152,
"step": 90
},
{
"epoch": 0.055288461538461536,
"grad_norm": 3.5444104019705165,
"learning_rate": 4.987362005883182e-06,
"loss": 0.1396,
"step": 92
},
{
"epoch": 0.056490384615384616,
"grad_norm": 2.43315278564924,
"learning_rate": 4.986372058394404e-06,
"loss": 0.1365,
"step": 94
},
{
"epoch": 0.057692307692307696,
"grad_norm": 3.1953964952618015,
"learning_rate": 4.985344892885899e-06,
"loss": 0.158,
"step": 96
},
{
"epoch": 0.05889423076923077,
"grad_norm": 2.0524573584491814,
"learning_rate": 4.984280524733107e-06,
"loss": 0.1571,
"step": 98
},
{
"epoch": 0.06009615384615385,
"grad_norm": 6.255981390751074,
"learning_rate": 4.983178969868346e-06,
"loss": 0.1464,
"step": 100
},
{
"epoch": 0.06129807692307692,
"grad_norm": 2.401646579094803,
"learning_rate": 4.98204024478058e-06,
"loss": 0.1417,
"step": 102
},
{
"epoch": 0.0625,
"grad_norm": 4.086961173737914,
"learning_rate": 4.980864366515159e-06,
"loss": 0.1541,
"step": 104
},
{
"epoch": 0.06370192307692307,
"grad_norm": 5.143484575050959,
"learning_rate": 4.97965135267358e-06,
"loss": 0.1499,
"step": 106
},
{
"epoch": 0.06490384615384616,
"grad_norm": 9.335258253064257,
"learning_rate": 4.978401221413209e-06,
"loss": 0.1684,
"step": 108
},
{
"epoch": 0.06610576923076923,
"grad_norm": 7.980472809228967,
"learning_rate": 4.977113991447017e-06,
"loss": 0.1663,
"step": 110
},
{
"epoch": 0.0673076923076923,
"grad_norm": 5.567091210665335,
"learning_rate": 4.9757896820433015e-06,
"loss": 0.1496,
"step": 112
},
{
"epoch": 0.06850961538461539,
"grad_norm": 5.007400500292786,
"learning_rate": 4.9744283130253905e-06,
"loss": 0.1415,
"step": 114
},
{
"epoch": 0.06971153846153846,
"grad_norm": 2.720822334329411,
"learning_rate": 4.973029904771353e-06,
"loss": 0.1541,
"step": 116
},
{
"epoch": 0.07091346153846154,
"grad_norm": 6.0293181825471605,
"learning_rate": 4.97159447821369e-06,
"loss": 0.1334,
"step": 118
},
{
"epoch": 0.07211538461538461,
"grad_norm": 2.190960264135707,
"learning_rate": 4.9701220548390215e-06,
"loss": 0.1353,
"step": 120
},
{
"epoch": 0.0733173076923077,
"grad_norm": 1.8394820554560345,
"learning_rate": 4.968612656687768e-06,
"loss": 0.1424,
"step": 122
},
{
"epoch": 0.07451923076923077,
"grad_norm": 1.890530118257683,
"learning_rate": 4.967066306353816e-06,
"loss": 0.161,
"step": 124
},
{
"epoch": 0.07572115384615384,
"grad_norm": 1.74782914530488,
"learning_rate": 4.965483026984182e-06,
"loss": 0.1391,
"step": 126
},
{
"epoch": 0.07692307692307693,
"grad_norm": 2.529223967308172,
"learning_rate": 4.963862842278669e-06,
"loss": 0.1509,
"step": 128
},
{
"epoch": 0.078125,
"grad_norm": 3.2375323783057546,
"learning_rate": 4.962205776489506e-06,
"loss": 0.1452,
"step": 130
},
{
"epoch": 0.07932692307692307,
"grad_norm": 3.289988413600293,
"learning_rate": 4.9605118544209874e-06,
"loss": 0.1369,
"step": 132
},
{
"epoch": 0.08052884615384616,
"grad_norm": 2.533449561050558,
"learning_rate": 4.958781101429104e-06,
"loss": 0.157,
"step": 134
},
{
"epoch": 0.08173076923076923,
"grad_norm": 2.3698097270600846,
"learning_rate": 4.9570135434211615e-06,
"loss": 0.1586,
"step": 136
},
{
"epoch": 0.0829326923076923,
"grad_norm": 2.9294128800046835,
"learning_rate": 4.95520920685539e-06,
"loss": 0.1438,
"step": 138
},
{
"epoch": 0.08413461538461539,
"grad_norm": 3.199532338265642,
"learning_rate": 4.953368118740555e-06,
"loss": 0.1404,
"step": 140
},
{
"epoch": 0.08533653846153846,
"grad_norm": 5.007008541932283,
"learning_rate": 4.951490306635543e-06,
"loss": 0.1595,
"step": 142
},
{
"epoch": 0.08653846153846154,
"grad_norm": 6.472389725628937,
"learning_rate": 4.949575798648962e-06,
"loss": 0.1589,
"step": 144
},
{
"epoch": 0.08774038461538461,
"grad_norm": 2.0940111362516998,
"learning_rate": 4.947624623438707e-06,
"loss": 0.1352,
"step": 146
},
{
"epoch": 0.0889423076923077,
"grad_norm": 2.6543426483078214,
"learning_rate": 4.9456368102115414e-06,
"loss": 0.1396,
"step": 148
},
{
"epoch": 0.09014423076923077,
"grad_norm": 4.492736133720869,
"learning_rate": 4.943612388722654e-06,
"loss": 0.1362,
"step": 150
},
{
"epoch": 0.09134615384615384,
"grad_norm": 3.1413400916428698,
"learning_rate": 4.941551389275217e-06,
"loss": 0.1398,
"step": 152
},
{
"epoch": 0.09254807692307693,
"grad_norm": 7.121010742342612,
"learning_rate": 4.9394538427199305e-06,
"loss": 0.1612,
"step": 154
},
{
"epoch": 0.09375,
"grad_norm": 2.0494879939162773,
"learning_rate": 4.937319780454559e-06,
"loss": 0.1372,
"step": 156
},
{
"epoch": 0.09495192307692307,
"grad_norm": 3.41511576689734,
"learning_rate": 4.935149234423468e-06,
"loss": 0.1463,
"step": 158
},
{
"epoch": 0.09615384615384616,
"grad_norm": 4.070726562354956,
"learning_rate": 4.9329422371171375e-06,
"loss": 0.1534,
"step": 160
},
{
"epoch": 0.09735576923076923,
"grad_norm": 4.8008312467816125,
"learning_rate": 4.930698821571681e-06,
"loss": 0.1603,
"step": 162
},
{
"epoch": 0.0985576923076923,
"grad_norm": 6.5840675047315225,
"learning_rate": 4.928419021368349e-06,
"loss": 0.1472,
"step": 164
},
{
"epoch": 0.09975961538461539,
"grad_norm": 4.906437852842057,
"learning_rate": 4.926102870633029e-06,
"loss": 0.1518,
"step": 166
},
{
"epoch": 0.10096153846153846,
"grad_norm": 2.2753956803841424,
"learning_rate": 4.923750404035729e-06,
"loss": 0.132,
"step": 168
},
{
"epoch": 0.10216346153846154,
"grad_norm": 10.360890258671878,
"learning_rate": 4.921361656790065e-06,
"loss": 0.1615,
"step": 170
},
{
"epoch": 0.10336538461538461,
"grad_norm": 3.781202441981427,
"learning_rate": 4.918936664652729e-06,
"loss": 0.1317,
"step": 172
},
{
"epoch": 0.1045673076923077,
"grad_norm": 4.612706315229004,
"learning_rate": 4.9164754639229575e-06,
"loss": 0.1556,
"step": 174
},
{
"epoch": 0.10576923076923077,
"grad_norm": 10.089023799727872,
"learning_rate": 4.913978091441985e-06,
"loss": 0.1366,
"step": 176
},
{
"epoch": 0.10697115384615384,
"grad_norm": 2.3844132422742215,
"learning_rate": 4.911444584592495e-06,
"loss": 0.1364,
"step": 178
},
{
"epoch": 0.10817307692307693,
"grad_norm": 8.01833294402442,
"learning_rate": 4.908874981298058e-06,
"loss": 0.1367,
"step": 180
},
{
"epoch": 0.109375,
"grad_norm": 2.6816022197266083,
"learning_rate": 4.906269320022566e-06,
"loss": 0.1357,
"step": 182
},
{
"epoch": 0.11057692307692307,
"grad_norm": 3.540312508006275,
"learning_rate": 4.903627639769656e-06,
"loss": 0.1485,
"step": 184
},
{
"epoch": 0.11177884615384616,
"grad_norm": 2.127391987641345,
"learning_rate": 4.900949980082127e-06,
"loss": 0.1491,
"step": 186
},
{
"epoch": 0.11298076923076923,
"grad_norm": 1.8381751149591552,
"learning_rate": 4.898236381041343e-06,
"loss": 0.1378,
"step": 188
},
{
"epoch": 0.1141826923076923,
"grad_norm": 2.4143842380581355,
"learning_rate": 4.895486883266644e-06,
"loss": 0.134,
"step": 190
},
{
"epoch": 0.11538461538461539,
"grad_norm": 2.33879580368458,
"learning_rate": 4.892701527914725e-06,
"loss": 0.1274,
"step": 192
},
{
"epoch": 0.11658653846153846,
"grad_norm": 2.677161136207072,
"learning_rate": 4.88988035667903e-06,
"loss": 0.1247,
"step": 194
},
{
"epoch": 0.11778846153846154,
"grad_norm": 2.224205454303052,
"learning_rate": 4.88702341178912e-06,
"loss": 0.1171,
"step": 196
},
{
"epoch": 0.11899038461538461,
"grad_norm": 2.331487573448718,
"learning_rate": 4.88413073601005e-06,
"loss": 0.1304,
"step": 198
},
{
"epoch": 0.1201923076923077,
"grad_norm": 3.904053698054214,
"learning_rate": 4.8812023726417194e-06,
"loss": 0.1441,
"step": 200
},
{
"epoch": 0.12139423076923077,
"grad_norm": 1.9333636759461283,
"learning_rate": 4.878238365518231e-06,
"loss": 0.1473,
"step": 202
},
{
"epoch": 0.12259615384615384,
"grad_norm": 4.5752867405646205,
"learning_rate": 4.87523875900723e-06,
"loss": 0.1337,
"step": 204
},
{
"epoch": 0.12379807692307693,
"grad_norm": 1.857974634859215,
"learning_rate": 4.872203598009244e-06,
"loss": 0.127,
"step": 206
},
{
"epoch": 0.125,
"grad_norm": 3.153527922810332,
"learning_rate": 4.869132927957007e-06,
"loss": 0.1484,
"step": 208
},
{
"epoch": 0.12620192307692307,
"grad_norm": 2.2228179237011534,
"learning_rate": 4.866026794814781e-06,
"loss": 0.1306,
"step": 210
},
{
"epoch": 0.12740384615384615,
"grad_norm": 1.7350718661408604,
"learning_rate": 4.862885245077669e-06,
"loss": 0.1352,
"step": 212
},
{
"epoch": 0.12860576923076922,
"grad_norm": 2.1132426954959924,
"learning_rate": 4.859708325770919e-06,
"loss": 0.1416,
"step": 214
},
{
"epoch": 0.12980769230769232,
"grad_norm": 1.8563726212012472,
"learning_rate": 4.856496084449218e-06,
"loss": 0.1461,
"step": 216
},
{
"epoch": 0.1310096153846154,
"grad_norm": 1.8179558309835169,
"learning_rate": 4.85324856919598e-06,
"loss": 0.1322,
"step": 218
},
{
"epoch": 0.13221153846153846,
"grad_norm": 4.497720485766678,
"learning_rate": 4.849965828622632e-06,
"loss": 0.1275,
"step": 220
},
{
"epoch": 0.13341346153846154,
"grad_norm": 3.0404012207264843,
"learning_rate": 4.846647911867877e-06,
"loss": 0.1436,
"step": 222
},
{
"epoch": 0.1346153846153846,
"grad_norm": 3.224075088217143,
"learning_rate": 4.8432948685969646e-06,
"loss": 0.1656,
"step": 224
},
{
"epoch": 0.13581730769230768,
"grad_norm": 3.2443798600258686,
"learning_rate": 4.83990674900095e-06,
"loss": 0.1393,
"step": 226
},
{
"epoch": 0.13701923076923078,
"grad_norm": 1.786558241709454,
"learning_rate": 4.836483603795935e-06,
"loss": 0.1263,
"step": 228
},
{
"epoch": 0.13822115384615385,
"grad_norm": 2.1509725801613513,
"learning_rate": 4.8330254842223155e-06,
"loss": 0.1409,
"step": 230
},
{
"epoch": 0.13942307692307693,
"grad_norm": 2.835049924036413,
"learning_rate": 4.829532442044008e-06,
"loss": 0.1319,
"step": 232
},
{
"epoch": 0.140625,
"grad_norm": 4.679921143965946,
"learning_rate": 4.8260045295476846e-06,
"loss": 0.1506,
"step": 234
},
{
"epoch": 0.14182692307692307,
"grad_norm": 1.9142698244457717,
"learning_rate": 4.822441799541979e-06,
"loss": 0.15,
"step": 236
},
{
"epoch": 0.14302884615384615,
"grad_norm": 8.216926584060278,
"learning_rate": 4.818844305356705e-06,
"loss": 0.1508,
"step": 238
},
{
"epoch": 0.14423076923076922,
"grad_norm": 1.5774872715864894,
"learning_rate": 4.815212100842053e-06,
"loss": 0.1365,
"step": 240
},
{
"epoch": 0.14543269230769232,
"grad_norm": 4.90886123617284,
"learning_rate": 4.811545240367785e-06,
"loss": 0.1488,
"step": 242
},
{
"epoch": 0.1466346153846154,
"grad_norm": 2.867193865140862,
"learning_rate": 4.807843778822424e-06,
"loss": 0.1403,
"step": 244
},
{
"epoch": 0.14783653846153846,
"grad_norm": 2.8742525824591123,
"learning_rate": 4.804107771612427e-06,
"loss": 0.1543,
"step": 246
},
{
"epoch": 0.14903846153846154,
"grad_norm": 2.3762533430208563,
"learning_rate": 4.800337274661358e-06,
"loss": 0.1375,
"step": 248
},
{
"epoch": 0.1502403846153846,
"grad_norm": 2.0839909923885447,
"learning_rate": 4.796532344409055e-06,
"loss": 0.1501,
"step": 250
},
{
"epoch": 0.15144230769230768,
"grad_norm": 4.198893033771618,
"learning_rate": 4.7926930378107765e-06,
"loss": 0.1323,
"step": 252
},
{
"epoch": 0.15264423076923078,
"grad_norm": 6.846739098146311,
"learning_rate": 4.788819412336358e-06,
"loss": 0.1399,
"step": 254
},
{
"epoch": 0.15384615384615385,
"grad_norm": 5.537919034600803,
"learning_rate": 4.784911525969344e-06,
"loss": 0.1233,
"step": 256
},
{
"epoch": 0.15504807692307693,
"grad_norm": 3.033292609600203,
"learning_rate": 4.780969437206128e-06,
"loss": 0.1478,
"step": 258
},
{
"epoch": 0.15625,
"grad_norm": 4.62635643989095,
"learning_rate": 4.776993205055067e-06,
"loss": 0.1465,
"step": 260
},
{
"epoch": 0.15745192307692307,
"grad_norm": 1.6930995723930686,
"learning_rate": 4.772982889035609e-06,
"loss": 0.134,
"step": 262
},
{
"epoch": 0.15865384615384615,
"grad_norm": 5.190694670180204,
"learning_rate": 4.7689385491773934e-06,
"loss": 0.1397,
"step": 264
},
{
"epoch": 0.15985576923076922,
"grad_norm": 3.0169437457346104,
"learning_rate": 4.764860246019356e-06,
"loss": 0.1462,
"step": 266
},
{
"epoch": 0.16105769230769232,
"grad_norm": 6.609894055693969,
"learning_rate": 4.760748040608826e-06,
"loss": 0.1349,
"step": 268
},
{
"epoch": 0.1622596153846154,
"grad_norm": 4.93984379875883,
"learning_rate": 4.756601994500604e-06,
"loss": 0.1336,
"step": 270
},
{
"epoch": 0.16346153846153846,
"grad_norm": 4.75518188154229,
"learning_rate": 4.752422169756048e-06,
"loss": 0.146,
"step": 272
},
{
"epoch": 0.16466346153846154,
"grad_norm": 2.086098294528403,
"learning_rate": 4.748208628942143e-06,
"loss": 0.1419,
"step": 274
},
{
"epoch": 0.1658653846153846,
"grad_norm": 5.396963944335693,
"learning_rate": 4.7439614351305614e-06,
"loss": 0.1432,
"step": 276
},
{
"epoch": 0.16706730769230768,
"grad_norm": 7.532501685521908,
"learning_rate": 4.739680651896721e-06,
"loss": 0.145,
"step": 278
},
{
"epoch": 0.16826923076923078,
"grad_norm": 2.336463554377762,
"learning_rate": 4.7353663433188325e-06,
"loss": 0.1475,
"step": 280
},
{
"epoch": 0.16947115384615385,
"grad_norm": 5.48654115007209,
"learning_rate": 4.731018573976943e-06,
"loss": 0.1544,
"step": 282
},
{
"epoch": 0.17067307692307693,
"grad_norm": 2.120931360446975,
"learning_rate": 4.726637408951966e-06,
"loss": 0.1286,
"step": 284
},
{
"epoch": 0.171875,
"grad_norm": 3.3875599595704498,
"learning_rate": 4.7222229138247076e-06,
"loss": 0.1383,
"step": 286
},
{
"epoch": 0.17307692307692307,
"grad_norm": 2.0989272460873796,
"learning_rate": 4.717775154674888e-06,
"loss": 0.1168,
"step": 288
},
{
"epoch": 0.17427884615384615,
"grad_norm": 3.817152405138102,
"learning_rate": 4.713294198080149e-06,
"loss": 0.1257,
"step": 290
},
{
"epoch": 0.17548076923076922,
"grad_norm": 2.293976238111847,
"learning_rate": 4.708780111115058e-06,
"loss": 0.1358,
"step": 292
},
{
"epoch": 0.17668269230769232,
"grad_norm": 2.0161046467731407,
"learning_rate": 4.7042329613501035e-06,
"loss": 0.1214,
"step": 294
},
{
"epoch": 0.1778846153846154,
"grad_norm": 2.3356279678505674,
"learning_rate": 4.699652816850686e-06,
"loss": 0.1296,
"step": 296
},
{
"epoch": 0.17908653846153846,
"grad_norm": 2.034038118147649,
"learning_rate": 4.6950397461761e-06,
"loss": 0.1163,
"step": 298
},
{
"epoch": 0.18028846153846154,
"grad_norm": 2.63792392669932,
"learning_rate": 4.690393818378501e-06,
"loss": 0.1269,
"step": 300
},
{
"epoch": 0.1814903846153846,
"grad_norm": 2.75722633258936,
"learning_rate": 4.685715103001879e-06,
"loss": 0.1243,
"step": 302
},
{
"epoch": 0.18269230769230768,
"grad_norm": 2.0819705788021183,
"learning_rate": 4.681003670081015e-06,
"loss": 0.1304,
"step": 304
},
{
"epoch": 0.18389423076923078,
"grad_norm": 3.4298950454490176,
"learning_rate": 4.676259590140431e-06,
"loss": 0.1377,
"step": 306
},
{
"epoch": 0.18509615384615385,
"grad_norm": 2.471860576622299,
"learning_rate": 4.671482934193337e-06,
"loss": 0.1356,
"step": 308
},
{
"epoch": 0.18629807692307693,
"grad_norm": 4.8199475175470825,
"learning_rate": 4.666673773740568e-06,
"loss": 0.125,
"step": 310
},
{
"epoch": 0.1875,
"grad_norm": 3.178092400708656,
"learning_rate": 4.66183218076951e-06,
"loss": 0.1365,
"step": 312
},
{
"epoch": 0.18870192307692307,
"grad_norm": 5.888487682413386,
"learning_rate": 4.656958227753028e-06,
"loss": 0.1415,
"step": 314
},
{
"epoch": 0.18990384615384615,
"grad_norm": 1.7792863825981573,
"learning_rate": 4.652051987648375e-06,
"loss": 0.1416,
"step": 316
},
{
"epoch": 0.19110576923076922,
"grad_norm": 2.9722465375990836,
"learning_rate": 4.647113533896106e-06,
"loss": 0.1396,
"step": 318
},
{
"epoch": 0.19230769230769232,
"grad_norm": 2.4672094667500475,
"learning_rate": 4.642142940418973e-06,
"loss": 0.1248,
"step": 320
},
{
"epoch": 0.1935096153846154,
"grad_norm": 2.7077423897634914,
"learning_rate": 4.637140281620825e-06,
"loss": 0.1383,
"step": 322
},
{
"epoch": 0.19471153846153846,
"grad_norm": 4.807221287056447,
"learning_rate": 4.632105632385488e-06,
"loss": 0.1361,
"step": 324
},
{
"epoch": 0.19591346153846154,
"grad_norm": 5.728075789744543,
"learning_rate": 4.627039068075647e-06,
"loss": 0.1444,
"step": 326
},
{
"epoch": 0.1971153846153846,
"grad_norm": 4.398279898396926,
"learning_rate": 4.621940664531718e-06,
"loss": 0.1486,
"step": 328
},
{
"epoch": 0.19831730769230768,
"grad_norm": 2.610023546419106,
"learning_rate": 4.6168104980707105e-06,
"loss": 0.1263,
"step": 330
},
{
"epoch": 0.19951923076923078,
"grad_norm": 3.5181999115242943,
"learning_rate": 4.61164864548509e-06,
"loss": 0.1308,
"step": 332
},
{
"epoch": 0.20072115384615385,
"grad_norm": 2.1510452933939903,
"learning_rate": 4.606455184041623e-06,
"loss": 0.14,
"step": 334
},
{
"epoch": 0.20192307692307693,
"grad_norm": 3.244280076674407,
"learning_rate": 4.6012301914802245e-06,
"loss": 0.1211,
"step": 336
},
{
"epoch": 0.203125,
"grad_norm": 2.4060103217800726,
"learning_rate": 4.595973746012791e-06,
"loss": 0.1331,
"step": 338
},
{
"epoch": 0.20432692307692307,
"grad_norm": 5.706691276517432,
"learning_rate": 4.590685926322032e-06,
"loss": 0.1275,
"step": 340
},
{
"epoch": 0.20552884615384615,
"grad_norm": 1.982976110922252,
"learning_rate": 4.585366811560293e-06,
"loss": 0.1236,
"step": 342
},
{
"epoch": 0.20673076923076922,
"grad_norm": 4.23602021986134,
"learning_rate": 4.580016481348367e-06,
"loss": 0.1361,
"step": 344
},
{
"epoch": 0.20793269230769232,
"grad_norm": 2.211392940952842,
"learning_rate": 4.574635015774308e-06,
"loss": 0.1255,
"step": 346
},
{
"epoch": 0.2091346153846154,
"grad_norm": 6.442272520375928,
"learning_rate": 4.569222495392227e-06,
"loss": 0.1344,
"step": 348
},
{
"epoch": 0.21033653846153846,
"grad_norm": 3.8749351925382594,
"learning_rate": 4.563779001221087e-06,
"loss": 0.1501,
"step": 350
},
{
"epoch": 0.21153846153846154,
"grad_norm": 1.7448331765525071,
"learning_rate": 4.558304614743496e-06,
"loss": 0.1381,
"step": 352
},
{
"epoch": 0.2127403846153846,
"grad_norm": 5.668086585104286,
"learning_rate": 4.5527994179044785e-06,
"loss": 0.1306,
"step": 354
},
{
"epoch": 0.21394230769230768,
"grad_norm": 2.5525220134836677,
"learning_rate": 4.547263493110257e-06,
"loss": 0.1386,
"step": 356
},
{
"epoch": 0.21514423076923078,
"grad_norm": 4.733640695947825,
"learning_rate": 4.54169692322701e-06,
"loss": 0.131,
"step": 358
},
{
"epoch": 0.21634615384615385,
"grad_norm": 2.4560081882135965,
"learning_rate": 4.536099791579643e-06,
"loss": 0.1332,
"step": 360
},
{
"epoch": 0.21754807692307693,
"grad_norm": 3.30310384335084,
"learning_rate": 4.530472181950528e-06,
"loss": 0.1452,
"step": 362
},
{
"epoch": 0.21875,
"grad_norm": 3.96117046469673,
"learning_rate": 4.524814178578261e-06,
"loss": 0.1258,
"step": 364
},
{
"epoch": 0.21995192307692307,
"grad_norm": 2.1571934099507324,
"learning_rate": 4.519125866156392e-06,
"loss": 0.1268,
"step": 366
},
{
"epoch": 0.22115384615384615,
"grad_norm": 2.731599995456764,
"learning_rate": 4.5134073298321655e-06,
"loss": 0.1275,
"step": 368
},
{
"epoch": 0.22235576923076922,
"grad_norm": 6.257464871792732,
"learning_rate": 4.5076586552052375e-06,
"loss": 0.136,
"step": 370
},
{
"epoch": 0.22355769230769232,
"grad_norm": 2.1253597649510496,
"learning_rate": 4.501879928326402e-06,
"loss": 0.1097,
"step": 372
},
{
"epoch": 0.2247596153846154,
"grad_norm": 4.866268104213111,
"learning_rate": 4.496071235696296e-06,
"loss": 0.1172,
"step": 374
},
{
"epoch": 0.22596153846153846,
"grad_norm": 2.65071594422531,
"learning_rate": 4.49023266426411e-06,
"loss": 0.1167,
"step": 376
},
{
"epoch": 0.22716346153846154,
"grad_norm": 3.5486758869705266,
"learning_rate": 4.484364301426285e-06,
"loss": 0.1276,
"step": 378
},
{
"epoch": 0.2283653846153846,
"grad_norm": 6.057336639310383,
"learning_rate": 4.478466235025203e-06,
"loss": 0.1393,
"step": 380
},
{
"epoch": 0.22956730769230768,
"grad_norm": 3.204768478540457,
"learning_rate": 4.472538553347871e-06,
"loss": 0.1208,
"step": 382
},
{
"epoch": 0.23076923076923078,
"grad_norm": 3.509914333448296,
"learning_rate": 4.466581345124605e-06,
"loss": 0.138,
"step": 384
},
{
"epoch": 0.23197115384615385,
"grad_norm": 1.9711436203178656,
"learning_rate": 4.460594699527695e-06,
"loss": 0.1263,
"step": 386
},
{
"epoch": 0.23317307692307693,
"grad_norm": 3.331060762805983,
"learning_rate": 4.454578706170075e-06,
"loss": 0.1424,
"step": 388
},
{
"epoch": 0.234375,
"grad_norm": 2.467895906628356,
"learning_rate": 4.448533455103979e-06,
"loss": 0.1324,
"step": 390
},
{
"epoch": 0.23557692307692307,
"grad_norm": 1.5497019030552028,
"learning_rate": 4.442459036819595e-06,
"loss": 0.1319,
"step": 392
},
{
"epoch": 0.23677884615384615,
"grad_norm": 2.822888845766881,
"learning_rate": 4.4363555422437095e-06,
"loss": 0.1272,
"step": 394
},
{
"epoch": 0.23798076923076922,
"grad_norm": 2.5029600610225695,
"learning_rate": 4.430223062738344e-06,
"loss": 0.128,
"step": 396
},
{
"epoch": 0.23918269230769232,
"grad_norm": 1.9911030442643596,
"learning_rate": 4.424061690099392e-06,
"loss": 0.1365,
"step": 398
},
{
"epoch": 0.2403846153846154,
"grad_norm": 2.062089943745463,
"learning_rate": 4.417871516555241e-06,
"loss": 0.1287,
"step": 400
},
{
"epoch": 0.24158653846153846,
"grad_norm": 3.3929814258858784,
"learning_rate": 4.411652634765398e-06,
"loss": 0.1354,
"step": 402
},
{
"epoch": 0.24278846153846154,
"grad_norm": 2.47220919536628,
"learning_rate": 4.4054051378190915e-06,
"loss": 0.1243,
"step": 404
},
{
"epoch": 0.2439903846153846,
"grad_norm": 4.8206335139544265,
"learning_rate": 4.39912911923389e-06,
"loss": 0.1225,
"step": 406
},
{
"epoch": 0.24519230769230768,
"grad_norm": 6.242330191426054,
"learning_rate": 4.392824672954295e-06,
"loss": 0.1495,
"step": 408
},
{
"epoch": 0.24639423076923078,
"grad_norm": 4.388543194744337,
"learning_rate": 4.386491893350334e-06,
"loss": 0.1225,
"step": 410
},
{
"epoch": 0.24759615384615385,
"grad_norm": 3.190150517704449,
"learning_rate": 4.380130875216156e-06,
"loss": 0.1255,
"step": 412
},
{
"epoch": 0.24879807692307693,
"grad_norm": 2.213731831929862,
"learning_rate": 4.373741713768605e-06,
"loss": 0.1356,
"step": 414
},
{
"epoch": 0.25,
"grad_norm": 2.7046643388864546,
"learning_rate": 4.367324504645793e-06,
"loss": 0.1374,
"step": 416
},
{
"epoch": 0.2512019230769231,
"grad_norm": 2.1009699551445977,
"learning_rate": 4.360879343905677e-06,
"loss": 0.1332,
"step": 418
},
{
"epoch": 0.25240384615384615,
"grad_norm": 2.650338768654261,
"learning_rate": 4.354406328024613e-06,
"loss": 0.1314,
"step": 420
},
{
"epoch": 0.2536057692307692,
"grad_norm": 2.810331149813075,
"learning_rate": 4.347905553895918e-06,
"loss": 0.1295,
"step": 422
},
{
"epoch": 0.2548076923076923,
"grad_norm": 3.814782755239228,
"learning_rate": 4.341377118828415e-06,
"loss": 0.1193,
"step": 424
},
{
"epoch": 0.25600961538461536,
"grad_norm": 2.9998629650762405,
"learning_rate": 4.33482112054498e-06,
"loss": 0.131,
"step": 426
},
{
"epoch": 0.25721153846153844,
"grad_norm": 2.7743819788707365,
"learning_rate": 4.3282376571810745e-06,
"loss": 0.1262,
"step": 428
},
{
"epoch": 0.25841346153846156,
"grad_norm": 2.9124275491739255,
"learning_rate": 4.32162682728328e-06,
"loss": 0.1256,
"step": 430
},
{
"epoch": 0.25961538461538464,
"grad_norm": 2.1687486550981805,
"learning_rate": 4.3149887298078275e-06,
"loss": 0.1355,
"step": 432
},
{
"epoch": 0.2608173076923077,
"grad_norm": 3.46835598153599,
"learning_rate": 4.308323464119103e-06,
"loss": 0.1294,
"step": 434
},
{
"epoch": 0.2620192307692308,
"grad_norm": 3.1610918409226603,
"learning_rate": 4.301631129988174e-06,
"loss": 0.1179,
"step": 436
},
{
"epoch": 0.26322115384615385,
"grad_norm": 3.5692561412500914,
"learning_rate": 4.294911827591288e-06,
"loss": 0.1316,
"step": 438
},
{
"epoch": 0.2644230769230769,
"grad_norm": 3.1685646314642955,
"learning_rate": 4.288165657508377e-06,
"loss": 0.1287,
"step": 440
},
{
"epoch": 0.265625,
"grad_norm": 2.6180618695713695,
"learning_rate": 4.281392720721546e-06,
"loss": 0.1225,
"step": 442
},
{
"epoch": 0.2668269230769231,
"grad_norm": 1.8843206020979073,
"learning_rate": 4.274593118613569e-06,
"loss": 0.1116,
"step": 444
},
{
"epoch": 0.26802884615384615,
"grad_norm": 2.541701491013687,
"learning_rate": 4.267766952966369e-06,
"loss": 0.131,
"step": 446
},
{
"epoch": 0.2692307692307692,
"grad_norm": 2.7752887478918185,
"learning_rate": 4.260914325959491e-06,
"loss": 0.134,
"step": 448
},
{
"epoch": 0.2704326923076923,
"grad_norm": 2.2244522364780748,
"learning_rate": 4.254035340168577e-06,
"loss": 0.1331,
"step": 450
},
{
"epoch": 0.27163461538461536,
"grad_norm": 2.1577691411009186,
"learning_rate": 4.247130098563825e-06,
"loss": 0.1356,
"step": 452
},
{
"epoch": 0.27283653846153844,
"grad_norm": 2.495807078564746,
"learning_rate": 4.2401987045084544e-06,
"loss": 0.1285,
"step": 454
},
{
"epoch": 0.27403846153846156,
"grad_norm": 2.315378410961849,
"learning_rate": 4.233241261757155e-06,
"loss": 0.1314,
"step": 456
},
{
"epoch": 0.27524038461538464,
"grad_norm": 2.3360381121240485,
"learning_rate": 4.226257874454535e-06,
"loss": 0.1335,
"step": 458
},
{
"epoch": 0.2764423076923077,
"grad_norm": 4.342066939412811,
"learning_rate": 4.219248647133559e-06,
"loss": 0.1407,
"step": 460
},
{
"epoch": 0.2776442307692308,
"grad_norm": 2.3663888374606032,
"learning_rate": 4.212213684713987e-06,
"loss": 0.1224,
"step": 462
},
{
"epoch": 0.27884615384615385,
"grad_norm": 3.0614706455153553,
"learning_rate": 4.205153092500805e-06,
"loss": 0.1229,
"step": 464
},
{
"epoch": 0.2800480769230769,
"grad_norm": 2.380259494398439,
"learning_rate": 4.198066976182644e-06,
"loss": 0.1292,
"step": 466
},
{
"epoch": 0.28125,
"grad_norm": 4.013842010005791,
"learning_rate": 4.1909554418302e-06,
"loss": 0.134,
"step": 468
},
{
"epoch": 0.2824519230769231,
"grad_norm": 1.778945958084193,
"learning_rate": 4.183818595894648e-06,
"loss": 0.1428,
"step": 470
},
{
"epoch": 0.28365384615384615,
"grad_norm": 3.750377365680276,
"learning_rate": 4.176656545206046e-06,
"loss": 0.1291,
"step": 472
},
{
"epoch": 0.2848557692307692,
"grad_norm": 1.9066583171893872,
"learning_rate": 4.169469396971739e-06,
"loss": 0.1176,
"step": 474
},
{
"epoch": 0.2860576923076923,
"grad_norm": 3.7582224188634736,
"learning_rate": 4.16225725877475e-06,
"loss": 0.1249,
"step": 476
},
{
"epoch": 0.28725961538461536,
"grad_norm": 2.7825989989563564,
"learning_rate": 4.155020238572174e-06,
"loss": 0.1109,
"step": 478
},
{
"epoch": 0.28846153846153844,
"grad_norm": 4.879245102252371,
"learning_rate": 4.147758444693557e-06,
"loss": 0.1364,
"step": 480
},
{
"epoch": 0.28966346153846156,
"grad_norm": 3.2182991915950394,
"learning_rate": 4.140471985839281e-06,
"loss": 0.1271,
"step": 482
},
{
"epoch": 0.29086538461538464,
"grad_norm": 2.166479148262207,
"learning_rate": 4.13316097107893e-06,
"loss": 0.1213,
"step": 484
},
{
"epoch": 0.2920673076923077,
"grad_norm": 2.47776248902879,
"learning_rate": 4.125825509849662e-06,
"loss": 0.1193,
"step": 486
},
{
"epoch": 0.2932692307692308,
"grad_norm": 2.540451340281278,
"learning_rate": 4.11846571195457e-06,
"loss": 0.119,
"step": 488
},
{
"epoch": 0.29447115384615385,
"grad_norm": 3.2230059766589814,
"learning_rate": 4.111081687561036e-06,
"loss": 0.1276,
"step": 490
},
{
"epoch": 0.2956730769230769,
"grad_norm": 2.835333516397744,
"learning_rate": 4.103673547199087e-06,
"loss": 0.1241,
"step": 492
},
{
"epoch": 0.296875,
"grad_norm": 2.752629007829119,
"learning_rate": 4.096241401759732e-06,
"loss": 0.1239,
"step": 494
},
{
"epoch": 0.2980769230769231,
"grad_norm": 1.8919133248892268,
"learning_rate": 4.0887853624933134e-06,
"loss": 0.1239,
"step": 496
},
{
"epoch": 0.29927884615384615,
"grad_norm": 2.8561871397763774,
"learning_rate": 4.081305541007832e-06,
"loss": 0.1289,
"step": 498
},
{
"epoch": 0.3004807692307692,
"grad_norm": 1.6600940797126917,
"learning_rate": 4.07380204926728e-06,
"loss": 0.1384,
"step": 500
},
{
"epoch": 0.3016826923076923,
"grad_norm": 2.404290817625276,
"learning_rate": 4.066274999589967e-06,
"loss": 0.1299,
"step": 502
},
{
"epoch": 0.30288461538461536,
"grad_norm": 1.9475394667243153,
"learning_rate": 4.058724504646834e-06,
"loss": 0.1259,
"step": 504
},
{
"epoch": 0.30408653846153844,
"grad_norm": 3.0051337393851143,
"learning_rate": 4.051150677459772e-06,
"loss": 0.1237,
"step": 506
},
{
"epoch": 0.30528846153846156,
"grad_norm": 2.1578955093603063,
"learning_rate": 4.043553631399928e-06,
"loss": 0.1202,
"step": 508
},
{
"epoch": 0.30649038461538464,
"grad_norm": 6.142783994800525,
"learning_rate": 4.035933480186005e-06,
"loss": 0.1347,
"step": 510
},
{
"epoch": 0.3076923076923077,
"grad_norm": 2.954830082548502,
"learning_rate": 4.028290337882565e-06,
"loss": 0.1437,
"step": 512
},
{
"epoch": 0.3088942307692308,
"grad_norm": 2.2019067150054386,
"learning_rate": 4.020624318898319e-06,
"loss": 0.1307,
"step": 514
},
{
"epoch": 0.31009615384615385,
"grad_norm": 4.54110621977567,
"learning_rate": 4.012935537984414e-06,
"loss": 0.1335,
"step": 516
},
{
"epoch": 0.3112980769230769,
"grad_norm": 2.85625320530939,
"learning_rate": 4.005224110232715e-06,
"loss": 0.1317,
"step": 518
},
{
"epoch": 0.3125,
"grad_norm": 2.3576527639858895,
"learning_rate": 3.997490151074085e-06,
"loss": 0.1284,
"step": 520
},
{
"epoch": 0.3137019230769231,
"grad_norm": 2.4537651255404214,
"learning_rate": 3.989733776276654e-06,
"loss": 0.1211,
"step": 522
},
{
"epoch": 0.31490384615384615,
"grad_norm": 3.352379617409583,
"learning_rate": 3.981955101944088e-06,
"loss": 0.1223,
"step": 524
},
{
"epoch": 0.3161057692307692,
"grad_norm": 2.233759105251149,
"learning_rate": 3.9741542445138505e-06,
"loss": 0.1279,
"step": 526
},
{
"epoch": 0.3173076923076923,
"grad_norm": 3.26893899586464,
"learning_rate": 3.966331320755457e-06,
"loss": 0.1308,
"step": 528
},
{
"epoch": 0.31850961538461536,
"grad_norm": 1.9938930635011727,
"learning_rate": 3.958486447768736e-06,
"loss": 0.1191,
"step": 530
},
{
"epoch": 0.31971153846153844,
"grad_norm": 1.6739004551575976,
"learning_rate": 3.95061974298206e-06,
"loss": 0.1114,
"step": 532
},
{
"epoch": 0.32091346153846156,
"grad_norm": 2.1002098020462574,
"learning_rate": 3.942731324150606e-06,
"loss": 0.1229,
"step": 534
},
{
"epoch": 0.32211538461538464,
"grad_norm": 3.227784545692672,
"learning_rate": 3.934821309354581e-06,
"loss": 0.1282,
"step": 536
},
{
"epoch": 0.3233173076923077,
"grad_norm": 2.6927966633468134,
"learning_rate": 3.926889816997457e-06,
"loss": 0.1274,
"step": 538
},
{
"epoch": 0.3245192307692308,
"grad_norm": 3.841849853659577,
"learning_rate": 3.9189369658042e-06,
"loss": 0.1316,
"step": 540
},
{
"epoch": 0.32572115384615385,
"grad_norm": 2.6872062042849727,
"learning_rate": 3.910962874819495e-06,
"loss": 0.1275,
"step": 542
},
{
"epoch": 0.3269230769230769,
"grad_norm": 3.6657337480434946,
"learning_rate": 3.9029676634059565e-06,
"loss": 0.1254,
"step": 544
},
{
"epoch": 0.328125,
"grad_norm": 2.8137841340293352,
"learning_rate": 3.894951451242351e-06,
"loss": 0.1316,
"step": 546
},
{
"epoch": 0.3293269230769231,
"grad_norm": 1.5503149824535458,
"learning_rate": 3.886914358321796e-06,
"loss": 0.1199,
"step": 548
},
{
"epoch": 0.33052884615384615,
"grad_norm": 1.9124225846435765,
"learning_rate": 3.8788565049499746e-06,
"loss": 0.1144,
"step": 550
},
{
"epoch": 0.3317307692307692,
"grad_norm": 2.2194257928538974,
"learning_rate": 3.8707780117433276e-06,
"loss": 0.1203,
"step": 552
},
{
"epoch": 0.3329326923076923,
"grad_norm": 2.2430374522475556,
"learning_rate": 3.8626789996272466e-06,
"loss": 0.1254,
"step": 554
},
{
"epoch": 0.33413461538461536,
"grad_norm": 1.656547967694163,
"learning_rate": 3.854559589834269e-06,
"loss": 0.1155,
"step": 556
},
{
"epoch": 0.33533653846153844,
"grad_norm": 2.71535491729536,
"learning_rate": 3.846419903902261e-06,
"loss": 0.1248,
"step": 558
},
{
"epoch": 0.33653846153846156,
"grad_norm": 4.963796667259708,
"learning_rate": 3.838260063672599e-06,
"loss": 0.1201,
"step": 560
},
{
"epoch": 0.33774038461538464,
"grad_norm": 2.02605376529183,
"learning_rate": 3.830080191288342e-06,
"loss": 0.1264,
"step": 562
},
{
"epoch": 0.3389423076923077,
"grad_norm": 2.2760213880197124,
"learning_rate": 3.82188040919241e-06,
"loss": 0.1121,
"step": 564
},
{
"epoch": 0.3401442307692308,
"grad_norm": 2.458521927082506,
"learning_rate": 3.813660840125747e-06,
"loss": 0.1322,
"step": 566
},
{
"epoch": 0.34134615384615385,
"grad_norm": 4.210654399847963,
"learning_rate": 3.805421607125482e-06,
"loss": 0.128,
"step": 568
},
{
"epoch": 0.3425480769230769,
"grad_norm": 2.161926215111614,
"learning_rate": 3.7971628335230932e-06,
"loss": 0.13,
"step": 570
},
{
"epoch": 0.34375,
"grad_norm": 3.904255891368641,
"learning_rate": 3.788884642942555e-06,
"loss": 0.1317,
"step": 572
},
{
"epoch": 0.3449519230769231,
"grad_norm": 2.71934531169795,
"learning_rate": 3.780587159298492e-06,
"loss": 0.1359,
"step": 574
},
{
"epoch": 0.34615384615384615,
"grad_norm": 2.8064237134830274,
"learning_rate": 3.7722705067943227e-06,
"loss": 0.133,
"step": 576
},
{
"epoch": 0.3473557692307692,
"grad_norm": 2.5669808093942272,
"learning_rate": 3.763934809920401e-06,
"loss": 0.1312,
"step": 578
},
{
"epoch": 0.3485576923076923,
"grad_norm": 2.6878698838883883,
"learning_rate": 3.755580193452153e-06,
"loss": 0.126,
"step": 580
},
{
"epoch": 0.34975961538461536,
"grad_norm": 1.9940547887564615,
"learning_rate": 3.747206782448207e-06,
"loss": 0.1215,
"step": 582
},
{
"epoch": 0.35096153846153844,
"grad_norm": 2.4246119443294147,
"learning_rate": 3.738814702248524e-06,
"loss": 0.1259,
"step": 584
},
{
"epoch": 0.35216346153846156,
"grad_norm": 2.448624947878468,
"learning_rate": 3.7304040784725183e-06,
"loss": 0.1265,
"step": 586
},
{
"epoch": 0.35336538461538464,
"grad_norm": 2.6611405194352544,
"learning_rate": 3.7219750370171843e-06,
"loss": 0.1258,
"step": 588
},
{
"epoch": 0.3545673076923077,
"grad_norm": 3.9151580028753092,
"learning_rate": 3.7135277040552014e-06,
"loss": 0.1269,
"step": 590
},
{
"epoch": 0.3557692307692308,
"grad_norm": 1.902396245377977,
"learning_rate": 3.7050622060330553e-06,
"loss": 0.1269,
"step": 592
},
{
"epoch": 0.35697115384615385,
"grad_norm": 2.200109114807576,
"learning_rate": 3.6965786696691386e-06,
"loss": 0.1297,
"step": 594
},
{
"epoch": 0.3581730769230769,
"grad_norm": 2.640515221983352,
"learning_rate": 3.688077221951857e-06,
"loss": 0.1217,
"step": 596
},
{
"epoch": 0.359375,
"grad_norm": 2.9478456557798194,
"learning_rate": 3.6795579901377277e-06,
"loss": 0.1206,
"step": 598
},
{
"epoch": 0.3605769230769231,
"grad_norm": 4.499371410793944,
"learning_rate": 3.671021101749476e-06,
"loss": 0.1159,
"step": 600
},
{
"epoch": 0.36177884615384615,
"grad_norm": 3.2861013876529266,
"learning_rate": 3.662466684574122e-06,
"loss": 0.1147,
"step": 602
},
{
"epoch": 0.3629807692307692,
"grad_norm": 2.936797344536718,
"learning_rate": 3.653894866661073e-06,
"loss": 0.1218,
"step": 604
},
{
"epoch": 0.3641826923076923,
"grad_norm": 2.5284722183745347,
"learning_rate": 3.645305776320205e-06,
"loss": 0.1277,
"step": 606
},
{
"epoch": 0.36538461538461536,
"grad_norm": 2.0656418154561416,
"learning_rate": 3.636699542119939e-06,
"loss": 0.1226,
"step": 608
},
{
"epoch": 0.36658653846153844,
"grad_norm": 2.761257208121012,
"learning_rate": 3.628076292885322e-06,
"loss": 0.1176,
"step": 610
},
{
"epoch": 0.36778846153846156,
"grad_norm": 4.409264331933305,
"learning_rate": 3.6194361576960944e-06,
"loss": 0.1303,
"step": 612
},
{
"epoch": 0.36899038461538464,
"grad_norm": 2.2897088881849483,
"learning_rate": 3.6107792658847597e-06,
"loss": 0.1166,
"step": 614
},
{
"epoch": 0.3701923076923077,
"grad_norm": 2.556001831241419,
"learning_rate": 3.602105747034646e-06,
"loss": 0.1238,
"step": 616
},
{
"epoch": 0.3713942307692308,
"grad_norm": 2.3832438875718442,
"learning_rate": 3.5934157309779714e-06,
"loss": 0.1189,
"step": 618
},
{
"epoch": 0.37259615384615385,
"grad_norm": 2.256691965422808,
"learning_rate": 3.5847093477938955e-06,
"loss": 0.1324,
"step": 620
},
{
"epoch": 0.3737980769230769,
"grad_norm": 4.4764970926214325,
"learning_rate": 3.5759867278065752e-06,
"loss": 0.1266,
"step": 622
},
{
"epoch": 0.375,
"grad_norm": 2.8438597379920045,
"learning_rate": 3.5672480015832117e-06,
"loss": 0.1258,
"step": 624
},
{
"epoch": 0.3762019230769231,
"grad_norm": 2.5547304438348193,
"learning_rate": 3.5584932999320986e-06,
"loss": 0.1189,
"step": 626
},
{
"epoch": 0.37740384615384615,
"grad_norm": 3.861193208078938,
"learning_rate": 3.549722753900662e-06,
"loss": 0.12,
"step": 628
},
{
"epoch": 0.3786057692307692,
"grad_norm": 2.0271164351237076,
"learning_rate": 3.5409364947734994e-06,
"loss": 0.1034,
"step": 630
},
{
"epoch": 0.3798076923076923,
"grad_norm": 2.661574124686293,
"learning_rate": 3.532134654070415e-06,
"loss": 0.1179,
"step": 632
},
{
"epoch": 0.38100961538461536,
"grad_norm": 4.444020843792755,
"learning_rate": 3.523317363544449e-06,
"loss": 0.1383,
"step": 634
},
{
"epoch": 0.38221153846153844,
"grad_norm": 2.0898293018736145,
"learning_rate": 3.5144847551799105e-06,
"loss": 0.128,
"step": 636
},
{
"epoch": 0.38341346153846156,
"grad_norm": 6.381896171861657,
"learning_rate": 3.5056369611903945e-06,
"loss": 0.135,
"step": 638
},
{
"epoch": 0.38461538461538464,
"grad_norm": 3.3029527185373913,
"learning_rate": 3.496774114016809e-06,
"loss": 0.1367,
"step": 640
},
{
"epoch": 0.3858173076923077,
"grad_norm": 2.3200365246792094,
"learning_rate": 3.487896346325389e-06,
"loss": 0.1244,
"step": 642
},
{
"epoch": 0.3870192307692308,
"grad_norm": 3.598439324678028,
"learning_rate": 3.4790037910057128e-06,
"loss": 0.131,
"step": 644
},
{
"epoch": 0.38822115384615385,
"grad_norm": 1.4871335164149173,
"learning_rate": 3.4700965811687106e-06,
"loss": 0.1194,
"step": 646
},
{
"epoch": 0.3894230769230769,
"grad_norm": 2.4184023479090024,
"learning_rate": 3.461174850144674e-06,
"loss": 0.1213,
"step": 648
},
{
"epoch": 0.390625,
"grad_norm": 3.436257185320764,
"learning_rate": 3.4522387314812606e-06,
"loss": 0.1324,
"step": 650
},
{
"epoch": 0.3918269230769231,
"grad_norm": 1.8151625861479124,
"learning_rate": 3.443288358941491e-06,
"loss": 0.1108,
"step": 652
},
{
"epoch": 0.39302884615384615,
"grad_norm": 1.5261810547328365,
"learning_rate": 3.4343238665017512e-06,
"loss": 0.1105,
"step": 654
},
{
"epoch": 0.3942307692307692,
"grad_norm": 2.8091934186049063,
"learning_rate": 3.425345388349787e-06,
"loss": 0.1348,
"step": 656
},
{
"epoch": 0.3954326923076923,
"grad_norm": 2.002504867469609,
"learning_rate": 3.4163530588826877e-06,
"loss": 0.1075,
"step": 658
},
{
"epoch": 0.39663461538461536,
"grad_norm": 1.925848303593358,
"learning_rate": 3.4073470127048867e-06,
"loss": 0.121,
"step": 660
},
{
"epoch": 0.39783653846153844,
"grad_norm": 3.4486630510150134,
"learning_rate": 3.3983273846261373e-06,
"loss": 0.13,
"step": 662
},
{
"epoch": 0.39903846153846156,
"grad_norm": 2.29190337434423,
"learning_rate": 3.3892943096594968e-06,
"loss": 0.1175,
"step": 664
},
{
"epoch": 0.40024038461538464,
"grad_norm": 2.7382806950058574,
"learning_rate": 3.3802479230193074e-06,
"loss": 0.1355,
"step": 666
},
{
"epoch": 0.4014423076923077,
"grad_norm": 3.8969395559370286,
"learning_rate": 3.371188360119173e-06,
"loss": 0.1265,
"step": 668
},
{
"epoch": 0.4026442307692308,
"grad_norm": 2.0972867493422567,
"learning_rate": 3.3621157565699265e-06,
"loss": 0.1182,
"step": 670
},
{
"epoch": 0.40384615384615385,
"grad_norm": 3.7477223788217673,
"learning_rate": 3.3530302481776062e-06,
"loss": 0.1147,
"step": 672
},
{
"epoch": 0.4050480769230769,
"grad_norm": 2.585644020351654,
"learning_rate": 3.343931970941421e-06,
"loss": 0.1184,
"step": 674
},
{
"epoch": 0.40625,
"grad_norm": 2.6033563821440664,
"learning_rate": 3.3348210610517117e-06,
"loss": 0.1221,
"step": 676
},
{
"epoch": 0.4074519230769231,
"grad_norm": 3.1763777004125067,
"learning_rate": 3.3256976548879183e-06,
"loss": 0.1149,
"step": 678
},
{
"epoch": 0.40865384615384615,
"grad_norm": 2.7352894929472535,
"learning_rate": 3.3165618890165306e-06,
"loss": 0.1205,
"step": 680
},
{
"epoch": 0.4098557692307692,
"grad_norm": 3.574807534485726,
"learning_rate": 3.307413900189054e-06,
"loss": 0.1073,
"step": 682
},
{
"epoch": 0.4110576923076923,
"grad_norm": 3.311593916021147,
"learning_rate": 3.29825382533995e-06,
"loss": 0.1152,
"step": 684
},
{
"epoch": 0.41225961538461536,
"grad_norm": 2.6214370492688692,
"learning_rate": 3.289081801584601e-06,
"loss": 0.1178,
"step": 686
},
{
"epoch": 0.41346153846153844,
"grad_norm": 2.28098423314985,
"learning_rate": 3.2798979662172446e-06,
"loss": 0.1175,
"step": 688
},
{
"epoch": 0.41466346153846156,
"grad_norm": 4.235250427718613,
"learning_rate": 3.2707024567089267e-06,
"loss": 0.1504,
"step": 690
},
{
"epoch": 0.41586538461538464,
"grad_norm": 1.9122767567805194,
"learning_rate": 3.2614954107054405e-06,
"loss": 0.1294,
"step": 692
},
{
"epoch": 0.4170673076923077,
"grad_norm": 3.054582085992648,
"learning_rate": 3.2522769660252673e-06,
"loss": 0.1223,
"step": 694
},
{
"epoch": 0.4182692307692308,
"grad_norm": 1.6351923608702348,
"learning_rate": 3.243047260657511e-06,
"loss": 0.1197,
"step": 696
},
{
"epoch": 0.41947115384615385,
"grad_norm": 2.7477487145437576,
"learning_rate": 3.233806432759837e-06,
"loss": 0.1293,
"step": 698
},
{
"epoch": 0.4206730769230769,
"grad_norm": 2.4016286502537505,
"learning_rate": 3.2245546206564015e-06,
"loss": 0.1154,
"step": 700
},
{
"epoch": 0.421875,
"grad_norm": 1.9800234381047233,
"learning_rate": 3.215291962835779e-06,
"loss": 0.123,
"step": 702
},
{
"epoch": 0.4230769230769231,
"grad_norm": 3.217074666511334,
"learning_rate": 3.206018597948893e-06,
"loss": 0.1208,
"step": 704
},
{
"epoch": 0.42427884615384615,
"grad_norm": 3.25172973443265,
"learning_rate": 3.1967346648069397e-06,
"loss": 0.1244,
"step": 706
},
{
"epoch": 0.4254807692307692,
"grad_norm": 2.2450714988867353,
"learning_rate": 3.1874403023793078e-06,
"loss": 0.1179,
"step": 708
},
{
"epoch": 0.4266826923076923,
"grad_norm": 3.2488238286410875,
"learning_rate": 3.1781356497914995e-06,
"loss": 0.1245,
"step": 710
},
{
"epoch": 0.42788461538461536,
"grad_norm": 2.218601857724757,
"learning_rate": 3.168820846323053e-06,
"loss": 0.1251,
"step": 712
},
{
"epoch": 0.42908653846153844,
"grad_norm": 2.088964444672931,
"learning_rate": 3.1594960314054455e-06,
"loss": 0.1193,
"step": 714
},
{
"epoch": 0.43028846153846156,
"grad_norm": 4.741704269019802,
"learning_rate": 3.150161344620021e-06,
"loss": 0.1322,
"step": 716
},
{
"epoch": 0.43149038461538464,
"grad_norm": 3.493342583852878,
"learning_rate": 3.1408169256958888e-06,
"loss": 0.1278,
"step": 718
},
{
"epoch": 0.4326923076923077,
"grad_norm": 2.351714268349835,
"learning_rate": 3.1314629145078377e-06,
"loss": 0.116,
"step": 720
},
{
"epoch": 0.4338942307692308,
"grad_norm": 3.8649842638324015,
"learning_rate": 3.1220994510742432e-06,
"loss": 0.1297,
"step": 722
},
{
"epoch": 0.43509615384615385,
"grad_norm": 2.841739719188719,
"learning_rate": 3.1127266755549673e-06,
"loss": 0.1238,
"step": 724
},
{
"epoch": 0.4362980769230769,
"grad_norm": 2.0373254493345843,
"learning_rate": 3.1033447282492645e-06,
"loss": 0.1339,
"step": 726
},
{
"epoch": 0.4375,
"grad_norm": 1.8332876940880098,
"learning_rate": 3.0939537495936784e-06,
"loss": 0.1255,
"step": 728
},
{
"epoch": 0.4387019230769231,
"grad_norm": 1.9574438212255216,
"learning_rate": 3.0845538801599423e-06,
"loss": 0.1197,
"step": 730
},
{
"epoch": 0.43990384615384615,
"grad_norm": 1.7871551779346857,
"learning_rate": 3.075145260652873e-06,
"loss": 0.1344,
"step": 732
},
{
"epoch": 0.4411057692307692,
"grad_norm": 3.6706640863007416,
"learning_rate": 3.0657280319082657e-06,
"loss": 0.116,
"step": 734
},
{
"epoch": 0.4423076923076923,
"grad_norm": 1.6394420662743008,
"learning_rate": 3.056302334890786e-06,
"loss": 0.123,
"step": 736
},
{
"epoch": 0.44350961538461536,
"grad_norm": 1.8174034087550737,
"learning_rate": 3.0468683106918608e-06,
"loss": 0.1203,
"step": 738
},
{
"epoch": 0.44471153846153844,
"grad_norm": 2.028279546605494,
"learning_rate": 3.0374261005275606e-06,
"loss": 0.1153,
"step": 740
},
{
"epoch": 0.44591346153846156,
"grad_norm": 3.1742172663448893,
"learning_rate": 3.0279758457364943e-06,
"loss": 0.1119,
"step": 742
},
{
"epoch": 0.44711538461538464,
"grad_norm": 2.1542693819149994,
"learning_rate": 3.018517687777688e-06,
"loss": 0.1152,
"step": 744
},
{
"epoch": 0.4483173076923077,
"grad_norm": 4.6204720149874605,
"learning_rate": 3.009051768228468e-06,
"loss": 0.1297,
"step": 746
},
{
"epoch": 0.4495192307692308,
"grad_norm": 2.0445376227310095,
"learning_rate": 2.9995782287823428e-06,
"loss": 0.115,
"step": 748
},
{
"epoch": 0.45072115384615385,
"grad_norm": 2.320840534894566,
"learning_rate": 2.9900972112468823e-06,
"loss": 0.1257,
"step": 750
},
{
"epoch": 0.4519230769230769,
"grad_norm": 4.0732649101420915,
"learning_rate": 2.9806088575415926e-06,
"loss": 0.1182,
"step": 752
},
{
"epoch": 0.453125,
"grad_norm": 3.8261178694802327,
"learning_rate": 2.971113309695796e-06,
"loss": 0.1202,
"step": 754
},
{
"epoch": 0.4543269230769231,
"grad_norm": 2.393271946060094,
"learning_rate": 2.961610709846501e-06,
"loss": 0.1171,
"step": 756
},
{
"epoch": 0.45552884615384615,
"grad_norm": 1.8371462666695046,
"learning_rate": 2.9521012002362766e-06,
"loss": 0.1142,
"step": 758
},
{
"epoch": 0.4567307692307692,
"grad_norm": 2.08485758756134,
"learning_rate": 2.942584923211121e-06,
"loss": 0.1154,
"step": 760
},
{
"epoch": 0.4579326923076923,
"grad_norm": 2.6562279999651257,
"learning_rate": 2.933062021218337e-06,
"loss": 0.1063,
"step": 762
},
{
"epoch": 0.45913461538461536,
"grad_norm": 2.533470915365061,
"learning_rate": 2.9235326368043885e-06,
"loss": 0.1135,
"step": 764
},
{
"epoch": 0.46033653846153844,
"grad_norm": 2.4011631762333905,
"learning_rate": 2.9139969126127803e-06,
"loss": 0.1134,
"step": 766
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.252103330371488,
"learning_rate": 2.9044549913819125e-06,
"loss": 0.1329,
"step": 768
},
{
"epoch": 0.46274038461538464,
"grad_norm": 2.111392303163354,
"learning_rate": 2.8949070159429473e-06,
"loss": 0.1167,
"step": 770
},
{
"epoch": 0.4639423076923077,
"grad_norm": 2.10465453166218,
"learning_rate": 2.885353129217671e-06,
"loss": 0.1294,
"step": 772
},
{
"epoch": 0.4651442307692308,
"grad_norm": 1.7606762750864913,
"learning_rate": 2.875793474216358e-06,
"loss": 0.1195,
"step": 774
},
{
"epoch": 0.46634615384615385,
"grad_norm": 3.4911755377127665,
"learning_rate": 2.8662281940356234e-06,
"loss": 0.1197,
"step": 776
},
{
"epoch": 0.4675480769230769,
"grad_norm": 2.485129458685194,
"learning_rate": 2.8566574318562855e-06,
"loss": 0.1257,
"step": 778
},
{
"epoch": 0.46875,
"grad_norm": 3.0980789105745536,
"learning_rate": 2.8470813309412222e-06,
"loss": 0.1159,
"step": 780
},
{
"epoch": 0.4699519230769231,
"grad_norm": 2.06101810490773,
"learning_rate": 2.8375000346332256e-06,
"loss": 0.1114,
"step": 782
},
{
"epoch": 0.47115384615384615,
"grad_norm": 2.5211271193230567,
"learning_rate": 2.827913686352856e-06,
"loss": 0.1278,
"step": 784
},
{
"epoch": 0.4723557692307692,
"grad_norm": 2.1529408157219825,
"learning_rate": 2.818322429596297e-06,
"loss": 0.1206,
"step": 786
},
{
"epoch": 0.4735576923076923,
"grad_norm": 2.366887732661358,
"learning_rate": 2.808726407933205e-06,
"loss": 0.1149,
"step": 788
},
{
"epoch": 0.47475961538461536,
"grad_norm": 2.16343980990941,
"learning_rate": 2.7991257650045606e-06,
"loss": 0.1208,
"step": 790
},
{
"epoch": 0.47596153846153844,
"grad_norm": 2.8342000182216345,
"learning_rate": 2.7895206445205226e-06,
"loss": 0.1217,
"step": 792
},
{
"epoch": 0.47716346153846156,
"grad_norm": 1.852391269800072,
"learning_rate": 2.7799111902582697e-06,
"loss": 0.1155,
"step": 794
},
{
"epoch": 0.47836538461538464,
"grad_norm": 2.5799284357343484,
"learning_rate": 2.7702975460598545e-06,
"loss": 0.1283,
"step": 796
},
{
"epoch": 0.4795673076923077,
"grad_norm": 1.881492308096937,
"learning_rate": 2.760679855830047e-06,
"loss": 0.1081,
"step": 798
},
{
"epoch": 0.4807692307692308,
"grad_norm": 2.5186830859263436,
"learning_rate": 2.7510582635341815e-06,
"loss": 0.1187,
"step": 800
},
{
"epoch": 0.48197115384615385,
"grad_norm": 2.6559499054158615,
"learning_rate": 2.7414329131960004e-06,
"loss": 0.1233,
"step": 802
},
{
"epoch": 0.4831730769230769,
"grad_norm": 4.62630178829242,
"learning_rate": 2.731803948895503e-06,
"loss": 0.124,
"step": 804
},
{
"epoch": 0.484375,
"grad_norm": 1.914060815314394,
"learning_rate": 2.722171514766781e-06,
"loss": 0.1123,
"step": 806
},
{
"epoch": 0.4855769230769231,
"grad_norm": 2.4270202069774145,
"learning_rate": 2.7125357549958687e-06,
"loss": 0.1287,
"step": 808
},
{
"epoch": 0.48677884615384615,
"grad_norm": 2.1602891567758746,
"learning_rate": 2.7028968138185783e-06,
"loss": 0.1143,
"step": 810
},
{
"epoch": 0.4879807692307692,
"grad_norm": 2.68436864433482,
"learning_rate": 2.6932548355183476e-06,
"loss": 0.1166,
"step": 812
},
{
"epoch": 0.4891826923076923,
"grad_norm": 2.4944384936946196,
"learning_rate": 2.6836099644240727e-06,
"loss": 0.1133,
"step": 814
},
{
"epoch": 0.49038461538461536,
"grad_norm": 1.8613202081753457,
"learning_rate": 2.673962344907953e-06,
"loss": 0.109,
"step": 816
},
{
"epoch": 0.49158653846153844,
"grad_norm": 2.219693506080579,
"learning_rate": 2.6643121213833306e-06,
"loss": 0.1145,
"step": 818
},
{
"epoch": 0.49278846153846156,
"grad_norm": 2.8619481470099117,
"learning_rate": 2.6546594383025214e-06,
"loss": 0.1115,
"step": 820
},
{
"epoch": 0.49399038461538464,
"grad_norm": 2.666948981163753,
"learning_rate": 2.6450044401546632e-06,
"loss": 0.1305,
"step": 822
},
{
"epoch": 0.4951923076923077,
"grad_norm": 2.09947237601635,
"learning_rate": 2.6353472714635443e-06,
"loss": 0.1099,
"step": 824
},
{
"epoch": 0.4963942307692308,
"grad_norm": 1.9473099754220278,
"learning_rate": 2.625688076785445e-06,
"loss": 0.1208,
"step": 826
},
{
"epoch": 0.49759615384615385,
"grad_norm": 2.0864251934157774,
"learning_rate": 2.6160270007069703e-06,
"loss": 0.1257,
"step": 828
},
{
"epoch": 0.4987980769230769,
"grad_norm": 1.9893158881100514,
"learning_rate": 2.606364187842891e-06,
"loss": 0.1264,
"step": 830
},
{
"epoch": 0.5,
"grad_norm": 2.3582369528291083,
"learning_rate": 2.5966997828339724e-06,
"loss": 0.1147,
"step": 832
},
{
"epoch": 0.5012019230769231,
"grad_norm": 2.146899490039593,
"learning_rate": 2.5870339303448127e-06,
"loss": 0.1152,
"step": 834
},
{
"epoch": 0.5024038461538461,
"grad_norm": 1.990060058415754,
"learning_rate": 2.5773667750616783e-06,
"loss": 0.1041,
"step": 836
},
{
"epoch": 0.5036057692307693,
"grad_norm": 2.178599936980344,
"learning_rate": 2.5676984616903367e-06,
"loss": 0.1286,
"step": 838
},
{
"epoch": 0.5048076923076923,
"grad_norm": 2.562250086026024,
"learning_rate": 2.5580291349538895e-06,
"loss": 0.1146,
"step": 840
},
{
"epoch": 0.5060096153846154,
"grad_norm": 2.4548795707580418,
"learning_rate": 2.5483589395906084e-06,
"loss": 0.1232,
"step": 842
},
{
"epoch": 0.5072115384615384,
"grad_norm": 2.0576956102764536,
"learning_rate": 2.5386880203517665e-06,
"loss": 0.1091,
"step": 844
},
{
"epoch": 0.5084134615384616,
"grad_norm": 1.7798937747570411,
"learning_rate": 2.5290165219994734e-06,
"loss": 0.122,
"step": 846
},
{
"epoch": 0.5096153846153846,
"grad_norm": 3.2665103557785473,
"learning_rate": 2.5193445893045054e-06,
"loss": 0.119,
"step": 848
},
{
"epoch": 0.5108173076923077,
"grad_norm": 2.3751458473034175,
"learning_rate": 2.5096723670441437e-06,
"loss": 0.1161,
"step": 850
},
{
"epoch": 0.5120192307692307,
"grad_norm": 1.7591316722682409,
"learning_rate": 2.5e-06,
"loss": 0.1151,
"step": 852
},
{
"epoch": 0.5132211538461539,
"grad_norm": 2.2115382855282464,
"learning_rate": 2.4903276329558567e-06,
"loss": 0.1313,
"step": 854
},
{
"epoch": 0.5144230769230769,
"grad_norm": 3.714925572378303,
"learning_rate": 2.480655410695495e-06,
"loss": 0.118,
"step": 856
},
{
"epoch": 0.515625,
"grad_norm": 2.292092125779591,
"learning_rate": 2.4709834780005283e-06,
"loss": 0.1105,
"step": 858
},
{
"epoch": 0.5168269230769231,
"grad_norm": 2.8062030763080066,
"learning_rate": 2.4613119796482343e-06,
"loss": 0.1279,
"step": 860
},
{
"epoch": 0.5180288461538461,
"grad_norm": 3.0016696690528684,
"learning_rate": 2.4516410604093924e-06,
"loss": 0.124,
"step": 862
},
{
"epoch": 0.5192307692307693,
"grad_norm": 2.6910032305249776,
"learning_rate": 2.441970865046111e-06,
"loss": 0.1164,
"step": 864
},
{
"epoch": 0.5204326923076923,
"grad_norm": 2.790603434708355,
"learning_rate": 2.4323015383096645e-06,
"loss": 0.1284,
"step": 866
},
{
"epoch": 0.5216346153846154,
"grad_norm": 1.88418937937736,
"learning_rate": 2.422633224938323e-06,
"loss": 0.1197,
"step": 868
},
{
"epoch": 0.5228365384615384,
"grad_norm": 2.268135867297592,
"learning_rate": 2.412966069655188e-06,
"loss": 0.1087,
"step": 870
},
{
"epoch": 0.5240384615384616,
"grad_norm": 1.7727390247554256,
"learning_rate": 2.403300217166028e-06,
"loss": 0.1047,
"step": 872
},
{
"epoch": 0.5252403846153846,
"grad_norm": 2.6452934155833,
"learning_rate": 2.39363581215711e-06,
"loss": 0.1209,
"step": 874
},
{
"epoch": 0.5264423076923077,
"grad_norm": 2.231648348284633,
"learning_rate": 2.38397299929303e-06,
"loss": 0.1225,
"step": 876
},
{
"epoch": 0.5276442307692307,
"grad_norm": 3.1589321862401323,
"learning_rate": 2.374311923214556e-06,
"loss": 0.1278,
"step": 878
},
{
"epoch": 0.5288461538461539,
"grad_norm": 2.538600702095854,
"learning_rate": 2.3646527285364565e-06,
"loss": 0.1133,
"step": 880
},
{
"epoch": 0.5300480769230769,
"grad_norm": 1.8010218639998627,
"learning_rate": 2.3549955598453384e-06,
"loss": 0.1102,
"step": 882
},
{
"epoch": 0.53125,
"grad_norm": 1.9859781619064247,
"learning_rate": 2.3453405616974794e-06,
"loss": 0.1223,
"step": 884
},
{
"epoch": 0.5324519230769231,
"grad_norm": 2.48433192428649,
"learning_rate": 2.3356878786166703e-06,
"loss": 0.1276,
"step": 886
},
{
"epoch": 0.5336538461538461,
"grad_norm": 1.9977524593562115,
"learning_rate": 2.3260376550920472e-06,
"loss": 0.1219,
"step": 888
},
{
"epoch": 0.5348557692307693,
"grad_norm": 1.9708632642889377,
"learning_rate": 2.3163900355759277e-06,
"loss": 0.117,
"step": 890
},
{
"epoch": 0.5360576923076923,
"grad_norm": 2.3393380650509146,
"learning_rate": 2.3067451644816537e-06,
"loss": 0.1328,
"step": 892
},
{
"epoch": 0.5372596153846154,
"grad_norm": 2.016237150348988,
"learning_rate": 2.2971031861814225e-06,
"loss": 0.115,
"step": 894
},
{
"epoch": 0.5384615384615384,
"grad_norm": 2.254008950804627,
"learning_rate": 2.287464245004132e-06,
"loss": 0.1184,
"step": 896
},
{
"epoch": 0.5396634615384616,
"grad_norm": 3.2043282624716403,
"learning_rate": 2.27782848523322e-06,
"loss": 0.1193,
"step": 898
},
{
"epoch": 0.5408653846153846,
"grad_norm": 1.4454861904743852,
"learning_rate": 2.268196051104499e-06,
"loss": 0.1104,
"step": 900
},
{
"epoch": 0.5420673076923077,
"grad_norm": 2.397925567908216,
"learning_rate": 2.2585670868040004e-06,
"loss": 0.1173,
"step": 902
},
{
"epoch": 0.5432692307692307,
"grad_norm": 2.6349415212538503,
"learning_rate": 2.2489417364658194e-06,
"loss": 0.1175,
"step": 904
},
{
"epoch": 0.5444711538461539,
"grad_norm": 1.912841995108057,
"learning_rate": 2.2393201441699535e-06,
"loss": 0.1124,
"step": 906
},
{
"epoch": 0.5456730769230769,
"grad_norm": 2.2214483754728396,
"learning_rate": 2.2297024539401463e-06,
"loss": 0.1169,
"step": 908
},
{
"epoch": 0.546875,
"grad_norm": 2.6645784394778995,
"learning_rate": 2.2200888097417308e-06,
"loss": 0.1124,
"step": 910
},
{
"epoch": 0.5480769230769231,
"grad_norm": 2.3991327890112757,
"learning_rate": 2.2104793554794783e-06,
"loss": 0.1082,
"step": 912
},
{
"epoch": 0.5492788461538461,
"grad_norm": 2.4642009420576487,
"learning_rate": 2.2008742349954394e-06,
"loss": 0.119,
"step": 914
},
{
"epoch": 0.5504807692307693,
"grad_norm": 2.5918285453531116,
"learning_rate": 2.1912735920667966e-06,
"loss": 0.1055,
"step": 916
},
{
"epoch": 0.5516826923076923,
"grad_norm": 2.0680446180956373,
"learning_rate": 2.181677570403704e-06,
"loss": 0.1109,
"step": 918
},
{
"epoch": 0.5528846153846154,
"grad_norm": 2.193301046368466,
"learning_rate": 2.1720863136471447e-06,
"loss": 0.1277,
"step": 920
},
{
"epoch": 0.5540865384615384,
"grad_norm": 2.5163737723965736,
"learning_rate": 2.162499965366775e-06,
"loss": 0.1219,
"step": 922
},
{
"epoch": 0.5552884615384616,
"grad_norm": 3.521848753217605,
"learning_rate": 2.1529186690587786e-06,
"loss": 0.114,
"step": 924
},
{
"epoch": 0.5564903846153846,
"grad_norm": 3.069616221629034,
"learning_rate": 2.1433425681437154e-06,
"loss": 0.1071,
"step": 926
},
{
"epoch": 0.5576923076923077,
"grad_norm": 3.53398612074779,
"learning_rate": 2.1337718059643774e-06,
"loss": 0.1236,
"step": 928
},
{
"epoch": 0.5588942307692307,
"grad_norm": 5.058223699592573,
"learning_rate": 2.124206525783643e-06,
"loss": 0.1109,
"step": 930
},
{
"epoch": 0.5600961538461539,
"grad_norm": 2.5478159897083352,
"learning_rate": 2.114646870782329e-06,
"loss": 0.1167,
"step": 932
},
{
"epoch": 0.5612980769230769,
"grad_norm": 2.354071051813213,
"learning_rate": 2.1050929840570544e-06,
"loss": 0.1011,
"step": 934
},
{
"epoch": 0.5625,
"grad_norm": 4.853864942677267,
"learning_rate": 2.0955450086180883e-06,
"loss": 0.116,
"step": 936
},
{
"epoch": 0.5637019230769231,
"grad_norm": 2.722700474105122,
"learning_rate": 2.08600308738722e-06,
"loss": 0.1108,
"step": 938
},
{
"epoch": 0.5649038461538461,
"grad_norm": 4.630914205750646,
"learning_rate": 2.0764673631956115e-06,
"loss": 0.1172,
"step": 940
},
{
"epoch": 0.5661057692307693,
"grad_norm": 2.0844097872671616,
"learning_rate": 2.0669379787816644e-06,
"loss": 0.1086,
"step": 942
},
{
"epoch": 0.5673076923076923,
"grad_norm": 2.2079993034525147,
"learning_rate": 2.0574150767888795e-06,
"loss": 0.1199,
"step": 944
},
{
"epoch": 0.5685096153846154,
"grad_norm": 3.624361624117408,
"learning_rate": 2.0478987997637246e-06,
"loss": 0.1028,
"step": 946
},
{
"epoch": 0.5697115384615384,
"grad_norm": 2.304855132990531,
"learning_rate": 2.0383892901534995e-06,
"loss": 0.1143,
"step": 948
},
{
"epoch": 0.5709134615384616,
"grad_norm": 2.303913178369359,
"learning_rate": 2.0288866903042055e-06,
"loss": 0.1149,
"step": 950
},
{
"epoch": 0.5721153846153846,
"grad_norm": 2.5242944062982944,
"learning_rate": 2.0193911424584082e-06,
"loss": 0.1271,
"step": 952
},
{
"epoch": 0.5733173076923077,
"grad_norm": 2.2840209986395643,
"learning_rate": 2.0099027887531186e-06,
"loss": 0.1025,
"step": 954
},
{
"epoch": 0.5745192307692307,
"grad_norm": 2.1475236126016757,
"learning_rate": 2.0004217712176576e-06,
"loss": 0.1052,
"step": 956
},
{
"epoch": 0.5757211538461539,
"grad_norm": 2.2062922770065625,
"learning_rate": 1.9909482317715335e-06,
"loss": 0.1261,
"step": 958
},
{
"epoch": 0.5769230769230769,
"grad_norm": 2.573858967297316,
"learning_rate": 1.9814823122223125e-06,
"loss": 0.1206,
"step": 960
},
{
"epoch": 0.578125,
"grad_norm": 3.5542312782650267,
"learning_rate": 1.972024154263506e-06,
"loss": 0.118,
"step": 962
},
{
"epoch": 0.5793269230769231,
"grad_norm": 3.043328928606157,
"learning_rate": 1.96257389947244e-06,
"loss": 0.1148,
"step": 964
},
{
"epoch": 0.5805288461538461,
"grad_norm": 2.186779926589517,
"learning_rate": 1.9531316893081396e-06,
"loss": 0.1028,
"step": 966
},
{
"epoch": 0.5817307692307693,
"grad_norm": 1.9507127168704683,
"learning_rate": 1.9436976651092143e-06,
"loss": 0.1069,
"step": 968
},
{
"epoch": 0.5829326923076923,
"grad_norm": 2.2374696361826403,
"learning_rate": 1.934271968091735e-06,
"loss": 0.1172,
"step": 970
},
{
"epoch": 0.5841346153846154,
"grad_norm": 2.55684351637379,
"learning_rate": 1.924854739347128e-06,
"loss": 0.1084,
"step": 972
},
{
"epoch": 0.5853365384615384,
"grad_norm": 2.325113870079778,
"learning_rate": 1.9154461198400585e-06,
"loss": 0.1235,
"step": 974
},
{
"epoch": 0.5865384615384616,
"grad_norm": 2.6657491779145976,
"learning_rate": 1.9060462504063229e-06,
"loss": 0.1071,
"step": 976
},
{
"epoch": 0.5877403846153846,
"grad_norm": 1.956462181600069,
"learning_rate": 1.8966552717507364e-06,
"loss": 0.119,
"step": 978
},
{
"epoch": 0.5889423076923077,
"grad_norm": 2.705164990543757,
"learning_rate": 1.8872733244450331e-06,
"loss": 0.1023,
"step": 980
},
{
"epoch": 0.5901442307692307,
"grad_norm": 1.9312443310397687,
"learning_rate": 1.8779005489257572e-06,
"loss": 0.1053,
"step": 982
},
{
"epoch": 0.5913461538461539,
"grad_norm": 2.855051765002529,
"learning_rate": 1.8685370854921631e-06,
"loss": 0.1072,
"step": 984
},
{
"epoch": 0.5925480769230769,
"grad_norm": 2.3926479059613373,
"learning_rate": 1.8591830743041123e-06,
"loss": 0.1226,
"step": 986
},
{
"epoch": 0.59375,
"grad_norm": 3.5632611766021465,
"learning_rate": 1.8498386553799802e-06,
"loss": 0.1003,
"step": 988
},
{
"epoch": 0.5949519230769231,
"grad_norm": 3.0673131415514803,
"learning_rate": 1.8405039685945547e-06,
"loss": 0.1103,
"step": 990
},
{
"epoch": 0.5961538461538461,
"grad_norm": 3.875636229689881,
"learning_rate": 1.8311791536769485e-06,
"loss": 0.1301,
"step": 992
},
{
"epoch": 0.5973557692307693,
"grad_norm": 3.2290497985605775,
"learning_rate": 1.821864350208501e-06,
"loss": 0.1149,
"step": 994
},
{
"epoch": 0.5985576923076923,
"grad_norm": 1.9347455813936323,
"learning_rate": 1.8125596976206933e-06,
"loss": 0.1087,
"step": 996
},
{
"epoch": 0.5997596153846154,
"grad_norm": 2.04110053593177,
"learning_rate": 1.8032653351930607e-06,
"loss": 0.112,
"step": 998
},
{
"epoch": 0.6009615384615384,
"grad_norm": 2.4285768876217637,
"learning_rate": 1.793981402051107e-06,
"loss": 0.1184,
"step": 1000
},
{
"epoch": 0.6021634615384616,
"grad_norm": 2.056516574333895,
"learning_rate": 1.7847080371642222e-06,
"loss": 0.1089,
"step": 1002
},
{
"epoch": 0.6033653846153846,
"grad_norm": 2.872014799730928,
"learning_rate": 1.7754453793435995e-06,
"loss": 0.1203,
"step": 1004
},
{
"epoch": 0.6045673076923077,
"grad_norm": 3.1357344499654225,
"learning_rate": 1.7661935672401635e-06,
"loss": 0.1057,
"step": 1006
},
{
"epoch": 0.6057692307692307,
"grad_norm": 2.2990506646197684,
"learning_rate": 1.7569527393424894e-06,
"loss": 0.1102,
"step": 1008
},
{
"epoch": 0.6069711538461539,
"grad_norm": 1.9080185886472223,
"learning_rate": 1.7477230339747342e-06,
"loss": 0.1128,
"step": 1010
},
{
"epoch": 0.6081730769230769,
"grad_norm": 2.4866351605585217,
"learning_rate": 1.7385045892945603e-06,
"loss": 0.1082,
"step": 1012
},
{
"epoch": 0.609375,
"grad_norm": 2.408386002877475,
"learning_rate": 1.7292975432910738e-06,
"loss": 0.1065,
"step": 1014
},
{
"epoch": 0.6105769230769231,
"grad_norm": 2.3425864575127724,
"learning_rate": 1.7201020337827556e-06,
"loss": 0.0992,
"step": 1016
},
{
"epoch": 0.6117788461538461,
"grad_norm": 3.3764842505199897,
"learning_rate": 1.7109181984154e-06,
"loss": 0.0994,
"step": 1018
},
{
"epoch": 0.6129807692307693,
"grad_norm": 3.0511908446927105,
"learning_rate": 1.7017461746600506e-06,
"loss": 0.1116,
"step": 1020
},
{
"epoch": 0.6141826923076923,
"grad_norm": 2.0835754940115048,
"learning_rate": 1.6925860998109472e-06,
"loss": 0.1027,
"step": 1022
},
{
"epoch": 0.6153846153846154,
"grad_norm": 2.1369310697938873,
"learning_rate": 1.6834381109834696e-06,
"loss": 0.1082,
"step": 1024
},
{
"epoch": 0.6165865384615384,
"grad_norm": 3.633547678025306,
"learning_rate": 1.6743023451120831e-06,
"loss": 0.1179,
"step": 1026
},
{
"epoch": 0.6177884615384616,
"grad_norm": 3.222183839199512,
"learning_rate": 1.6651789389482885e-06,
"loss": 0.1047,
"step": 1028
},
{
"epoch": 0.6189903846153846,
"grad_norm": 3.373496600017691,
"learning_rate": 1.6560680290585798e-06,
"loss": 0.1174,
"step": 1030
},
{
"epoch": 0.6201923076923077,
"grad_norm": 3.355771088882065,
"learning_rate": 1.646969751822394e-06,
"loss": 0.1225,
"step": 1032
},
{
"epoch": 0.6213942307692307,
"grad_norm": 2.132501441209862,
"learning_rate": 1.6378842434300746e-06,
"loss": 0.1085,
"step": 1034
},
{
"epoch": 0.6225961538461539,
"grad_norm": 1.8201904843022139,
"learning_rate": 1.6288116398808278e-06,
"loss": 0.1072,
"step": 1036
},
{
"epoch": 0.6237980769230769,
"grad_norm": 1.9943546756148034,
"learning_rate": 1.619752076980693e-06,
"loss": 0.1175,
"step": 1038
},
{
"epoch": 0.625,
"grad_norm": 1.9417071481978827,
"learning_rate": 1.6107056903405038e-06,
"loss": 0.1031,
"step": 1040
},
{
"epoch": 0.6262019230769231,
"grad_norm": 2.0812507755491776,
"learning_rate": 1.6016726153738638e-06,
"loss": 0.1181,
"step": 1042
},
{
"epoch": 0.6274038461538461,
"grad_norm": 1.9437266222472136,
"learning_rate": 1.5926529872951144e-06,
"loss": 0.1104,
"step": 1044
},
{
"epoch": 0.6286057692307693,
"grad_norm": 2.0078937220346265,
"learning_rate": 1.583646941117313e-06,
"loss": 0.1044,
"step": 1046
},
{
"epoch": 0.6298076923076923,
"grad_norm": 2.2331084033833366,
"learning_rate": 1.574654611650214e-06,
"loss": 0.1147,
"step": 1048
},
{
"epoch": 0.6310096153846154,
"grad_norm": 2.133371687932722,
"learning_rate": 1.5656761334982487e-06,
"loss": 0.1159,
"step": 1050
},
{
"epoch": 0.6322115384615384,
"grad_norm": 2.068123773517536,
"learning_rate": 1.5567116410585101e-06,
"loss": 0.1038,
"step": 1052
},
{
"epoch": 0.6334134615384616,
"grad_norm": 2.5576918982500683,
"learning_rate": 1.5477612685187405e-06,
"loss": 0.1169,
"step": 1054
},
{
"epoch": 0.6346153846153846,
"grad_norm": 2.694751080220668,
"learning_rate": 1.5388251498553263e-06,
"loss": 0.1081,
"step": 1056
},
{
"epoch": 0.6358173076923077,
"grad_norm": 2.135244446442495,
"learning_rate": 1.52990341883129e-06,
"loss": 0.1075,
"step": 1058
},
{
"epoch": 0.6370192307692307,
"grad_norm": 2.1823074476166764,
"learning_rate": 1.5209962089942885e-06,
"loss": 0.1085,
"step": 1060
},
{
"epoch": 0.6382211538461539,
"grad_norm": 1.9277746702424785,
"learning_rate": 1.5121036536746119e-06,
"loss": 0.1049,
"step": 1062
},
{
"epoch": 0.6394230769230769,
"grad_norm": 2.365543759553611,
"learning_rate": 1.5032258859831916e-06,
"loss": 0.1093,
"step": 1064
},
{
"epoch": 0.640625,
"grad_norm": 2.4257341316404406,
"learning_rate": 1.4943630388096055e-06,
"loss": 0.1175,
"step": 1066
},
{
"epoch": 0.6418269230769231,
"grad_norm": 2.653293916979889,
"learning_rate": 1.4855152448200901e-06,
"loss": 0.1153,
"step": 1068
},
{
"epoch": 0.6430288461538461,
"grad_norm": 2.419944610975381,
"learning_rate": 1.4766826364555514e-06,
"loss": 0.1159,
"step": 1070
},
{
"epoch": 0.6442307692307693,
"grad_norm": 2.0103810925549626,
"learning_rate": 1.467865345929586e-06,
"loss": 0.1143,
"step": 1072
},
{
"epoch": 0.6454326923076923,
"grad_norm": 2.01089727654853,
"learning_rate": 1.4590635052265008e-06,
"loss": 0.1106,
"step": 1074
},
{
"epoch": 0.6466346153846154,
"grad_norm": 1.748446439918439,
"learning_rate": 1.4502772460993387e-06,
"loss": 0.1018,
"step": 1076
},
{
"epoch": 0.6478365384615384,
"grad_norm": 2.484572897708403,
"learning_rate": 1.4415067000679029e-06,
"loss": 0.1104,
"step": 1078
},
{
"epoch": 0.6490384615384616,
"grad_norm": 2.4037649077365657,
"learning_rate": 1.4327519984167887e-06,
"loss": 0.1189,
"step": 1080
},
{
"epoch": 0.6502403846153846,
"grad_norm": 1.8720994441559204,
"learning_rate": 1.4240132721934256e-06,
"loss": 0.118,
"step": 1082
},
{
"epoch": 0.6514423076923077,
"grad_norm": 1.9961620517391614,
"learning_rate": 1.415290652206105e-06,
"loss": 0.1062,
"step": 1084
},
{
"epoch": 0.6526442307692307,
"grad_norm": 3.3559687716616,
"learning_rate": 1.4065842690220294e-06,
"loss": 0.1192,
"step": 1086
},
{
"epoch": 0.6538461538461539,
"grad_norm": 2.1228084765105373,
"learning_rate": 1.3978942529653549e-06,
"loss": 0.0997,
"step": 1088
},
{
"epoch": 0.6550480769230769,
"grad_norm": 2.609409554692004,
"learning_rate": 1.3892207341152416e-06,
"loss": 0.1146,
"step": 1090
},
{
"epoch": 0.65625,
"grad_norm": 3.084566569938987,
"learning_rate": 1.3805638423039056e-06,
"loss": 0.1238,
"step": 1092
},
{
"epoch": 0.6574519230769231,
"grad_norm": 2.755372215903661,
"learning_rate": 1.371923707114679e-06,
"loss": 0.1091,
"step": 1094
},
{
"epoch": 0.6586538461538461,
"grad_norm": 2.119982444557482,
"learning_rate": 1.3633004578800613e-06,
"loss": 0.099,
"step": 1096
},
{
"epoch": 0.6598557692307693,
"grad_norm": 2.701943705630255,
"learning_rate": 1.354694223679796e-06,
"loss": 0.1235,
"step": 1098
},
{
"epoch": 0.6610576923076923,
"grad_norm": 2.383471150976908,
"learning_rate": 1.3461051333389275e-06,
"loss": 0.1031,
"step": 1100
},
{
"epoch": 0.6622596153846154,
"grad_norm": 3.00768304842994,
"learning_rate": 1.3375333154258788e-06,
"loss": 0.1087,
"step": 1102
},
{
"epoch": 0.6634615384615384,
"grad_norm": 2.088505043527597,
"learning_rate": 1.328978898250525e-06,
"loss": 0.1166,
"step": 1104
},
{
"epoch": 0.6646634615384616,
"grad_norm": 2.434276624558114,
"learning_rate": 1.3204420098622727e-06,
"loss": 0.11,
"step": 1106
},
{
"epoch": 0.6658653846153846,
"grad_norm": 1.8412804984656046,
"learning_rate": 1.3119227780481442e-06,
"loss": 0.113,
"step": 1108
},
{
"epoch": 0.6670673076923077,
"grad_norm": 2.0956844206733405,
"learning_rate": 1.3034213303308627e-06,
"loss": 0.1144,
"step": 1110
},
{
"epoch": 0.6682692307692307,
"grad_norm": 2.1476124760530566,
"learning_rate": 1.294937793966946e-06,
"loss": 0.1095,
"step": 1112
},
{
"epoch": 0.6694711538461539,
"grad_norm": 2.292664553276864,
"learning_rate": 1.286472295944799e-06,
"loss": 0.1146,
"step": 1114
},
{
"epoch": 0.6706730769230769,
"grad_norm": 2.1662467131117404,
"learning_rate": 1.2780249629828161e-06,
"loss": 0.1097,
"step": 1116
},
{
"epoch": 0.671875,
"grad_norm": 2.906015346971846,
"learning_rate": 1.2695959215274817e-06,
"loss": 0.1148,
"step": 1118
},
{
"epoch": 0.6730769230769231,
"grad_norm": 2.1982439434562737,
"learning_rate": 1.261185297751477e-06,
"loss": 0.1053,
"step": 1120
},
{
"epoch": 0.6742788461538461,
"grad_norm": 2.018201703916458,
"learning_rate": 1.2527932175517934e-06,
"loss": 0.115,
"step": 1122
},
{
"epoch": 0.6754807692307693,
"grad_norm": 2.6111890149300976,
"learning_rate": 1.2444198065478475e-06,
"loss": 0.1224,
"step": 1124
},
{
"epoch": 0.6766826923076923,
"grad_norm": 2.5284325319117267,
"learning_rate": 1.2360651900795995e-06,
"loss": 0.1207,
"step": 1126
},
{
"epoch": 0.6778846153846154,
"grad_norm": 2.2545340347392955,
"learning_rate": 1.2277294932056783e-06,
"loss": 0.112,
"step": 1128
},
{
"epoch": 0.6790865384615384,
"grad_norm": 3.362156324890133,
"learning_rate": 1.2194128407015094e-06,
"loss": 0.1164,
"step": 1130
},
{
"epoch": 0.6802884615384616,
"grad_norm": 1.632189263569225,
"learning_rate": 1.2111153570574454e-06,
"loss": 0.1012,
"step": 1132
},
{
"epoch": 0.6814903846153846,
"grad_norm": 2.092206166748186,
"learning_rate": 1.202837166476907e-06,
"loss": 0.1085,
"step": 1134
},
{
"epoch": 0.6826923076923077,
"grad_norm": 2.7031603463833704,
"learning_rate": 1.1945783928745187e-06,
"loss": 0.1109,
"step": 1136
},
{
"epoch": 0.6838942307692307,
"grad_norm": 3.4147286461299355,
"learning_rate": 1.1863391598742535e-06,
"loss": 0.1133,
"step": 1138
},
{
"epoch": 0.6850961538461539,
"grad_norm": 2.1129785563716994,
"learning_rate": 1.1781195908075903e-06,
"loss": 0.1097,
"step": 1140
},
{
"epoch": 0.6862980769230769,
"grad_norm": 3.2026130054118593,
"learning_rate": 1.169919808711659e-06,
"loss": 0.1184,
"step": 1142
},
{
"epoch": 0.6875,
"grad_norm": 2.249630204645609,
"learning_rate": 1.1617399363274024e-06,
"loss": 0.1106,
"step": 1144
},
{
"epoch": 0.6887019230769231,
"grad_norm": 3.2963891692649514,
"learning_rate": 1.1535800960977398e-06,
"loss": 0.1196,
"step": 1146
},
{
"epoch": 0.6899038461538461,
"grad_norm": 3.186358499780556,
"learning_rate": 1.1454404101657319e-06,
"loss": 0.1121,
"step": 1148
},
{
"epoch": 0.6911057692307693,
"grad_norm": 2.47209843153002,
"learning_rate": 1.1373210003727536e-06,
"loss": 0.1167,
"step": 1150
},
{
"epoch": 0.6923076923076923,
"grad_norm": 2.4518332512722876,
"learning_rate": 1.1292219882566726e-06,
"loss": 0.1148,
"step": 1152
},
{
"epoch": 0.6935096153846154,
"grad_norm": 3.262824991958051,
"learning_rate": 1.121143495050026e-06,
"loss": 0.106,
"step": 1154
},
{
"epoch": 0.6947115384615384,
"grad_norm": 2.338537712422274,
"learning_rate": 1.1130856416782046e-06,
"loss": 0.106,
"step": 1156
},
{
"epoch": 0.6959134615384616,
"grad_norm": 2.2204770447922297,
"learning_rate": 1.1050485487576506e-06,
"loss": 0.1101,
"step": 1158
},
{
"epoch": 0.6971153846153846,
"grad_norm": 2.0597064409649892,
"learning_rate": 1.0970323365940443e-06,
"loss": 0.0959,
"step": 1160
},
{
"epoch": 0.6983173076923077,
"grad_norm": 2.222014493052337,
"learning_rate": 1.089037125180506e-06,
"loss": 0.1034,
"step": 1162
},
{
"epoch": 0.6995192307692307,
"grad_norm": 3.1568264036888265,
"learning_rate": 1.0810630341958004e-06,
"loss": 0.1224,
"step": 1164
},
{
"epoch": 0.7007211538461539,
"grad_norm": 2.1388673626652817,
"learning_rate": 1.0731101830025442e-06,
"loss": 0.1024,
"step": 1166
},
{
"epoch": 0.7019230769230769,
"grad_norm": 2.844127901014447,
"learning_rate": 1.0651786906454192e-06,
"loss": 0.1236,
"step": 1168
},
{
"epoch": 0.703125,
"grad_norm": 1.9936288576811623,
"learning_rate": 1.057268675849395e-06,
"loss": 0.1006,
"step": 1170
},
{
"epoch": 0.7043269230769231,
"grad_norm": 1.8480926642214928,
"learning_rate": 1.0493802570179411e-06,
"loss": 0.1001,
"step": 1172
},
{
"epoch": 0.7055288461538461,
"grad_norm": 2.261808036207062,
"learning_rate": 1.041513552231265e-06,
"loss": 0.1038,
"step": 1174
},
{
"epoch": 0.7067307692307693,
"grad_norm": 2.2099301841197545,
"learning_rate": 1.0336686792445424e-06,
"loss": 0.1101,
"step": 1176
},
{
"epoch": 0.7079326923076923,
"grad_norm": 2.180203910907892,
"learning_rate": 1.0258457554861502e-06,
"loss": 0.1057,
"step": 1178
},
{
"epoch": 0.7091346153846154,
"grad_norm": 2.797064097348832,
"learning_rate": 1.0180448980559125e-06,
"loss": 0.0926,
"step": 1180
},
{
"epoch": 0.7103365384615384,
"grad_norm": 3.1147260554752147,
"learning_rate": 1.0102662237233465e-06,
"loss": 0.1191,
"step": 1182
},
{
"epoch": 0.7115384615384616,
"grad_norm": 2.488698925587082,
"learning_rate": 1.0025098489259161e-06,
"loss": 0.1014,
"step": 1184
},
{
"epoch": 0.7127403846153846,
"grad_norm": 2.2420640526045927,
"learning_rate": 9.947758897672855e-07,
"loss": 0.1125,
"step": 1186
},
{
"epoch": 0.7139423076923077,
"grad_norm": 2.634277342582424,
"learning_rate": 9.870644620155878e-07,
"loss": 0.1104,
"step": 1188
},
{
"epoch": 0.7151442307692307,
"grad_norm": 2.278093380222903,
"learning_rate": 9.793756811016824e-07,
"loss": 0.1045,
"step": 1190
},
{
"epoch": 0.7163461538461539,
"grad_norm": 2.1408407961096088,
"learning_rate": 9.717096621174355e-07,
"loss": 0.1154,
"step": 1192
},
{
"epoch": 0.7175480769230769,
"grad_norm": 2.2023983168340413,
"learning_rate": 9.640665198139957e-07,
"loss": 0.1147,
"step": 1194
},
{
"epoch": 0.71875,
"grad_norm": 1.9253814839904362,
"learning_rate": 9.564463686000728e-07,
"loss": 0.1157,
"step": 1196
},
{
"epoch": 0.7199519230769231,
"grad_norm": 2.036041516452903,
"learning_rate": 9.488493225402282e-07,
"loss": 0.0948,
"step": 1198
},
{
"epoch": 0.7211538461538461,
"grad_norm": 3.306618789071036,
"learning_rate": 9.412754953531664e-07,
"loss": 0.101,
"step": 1200
},
{
"epoch": 0.7223557692307693,
"grad_norm": 2.818632480308661,
"learning_rate": 9.337250004100337e-07,
"loss": 0.1232,
"step": 1202
},
{
"epoch": 0.7235576923076923,
"grad_norm": 2.3587067360069334,
"learning_rate": 9.261979507327204e-07,
"loss": 0.1062,
"step": 1204
},
{
"epoch": 0.7247596153846154,
"grad_norm": 2.3843210548687908,
"learning_rate": 9.186944589921687e-07,
"loss": 0.1161,
"step": 1206
},
{
"epoch": 0.7259615384615384,
"grad_norm": 2.069552811499533,
"learning_rate": 9.112146375066872e-07,
"loss": 0.1037,
"step": 1208
},
{
"epoch": 0.7271634615384616,
"grad_norm": 2.5490211308951487,
"learning_rate": 9.037585982402678e-07,
"loss": 0.1182,
"step": 1210
},
{
"epoch": 0.7283653846153846,
"grad_norm": 2.2537446863547177,
"learning_rate": 8.96326452800915e-07,
"loss": 0.1024,
"step": 1212
},
{
"epoch": 0.7295673076923077,
"grad_norm": 2.1542852130856085,
"learning_rate": 8.889183124389645e-07,
"loss": 0.1102,
"step": 1214
},
{
"epoch": 0.7307692307692307,
"grad_norm": 1.8957554942236439,
"learning_rate": 8.815342880454312e-07,
"loss": 0.107,
"step": 1216
},
{
"epoch": 0.7319711538461539,
"grad_norm": 2.4674322862732314,
"learning_rate": 8.741744901503387e-07,
"loss": 0.114,
"step": 1218
},
{
"epoch": 0.7331730769230769,
"grad_norm": 2.039475353958351,
"learning_rate": 8.66839028921071e-07,
"loss": 0.1106,
"step": 1220
},
{
"epoch": 0.734375,
"grad_norm": 2.3489571512912364,
"learning_rate": 8.595280141607198e-07,
"loss": 0.1073,
"step": 1222
},
{
"epoch": 0.7355769230769231,
"grad_norm": 3.073761818193723,
"learning_rate": 8.522415553064433e-07,
"loss": 0.1069,
"step": 1224
},
{
"epoch": 0.7367788461538461,
"grad_norm": 2.4347506007521433,
"learning_rate": 8.44979761427826e-07,
"loss": 0.1064,
"step": 1226
},
{
"epoch": 0.7379807692307693,
"grad_norm": 2.3883683060647134,
"learning_rate": 8.377427412252495e-07,
"loss": 0.1063,
"step": 1228
},
{
"epoch": 0.7391826923076923,
"grad_norm": 2.7706472616211077,
"learning_rate": 8.305306030282618e-07,
"loss": 0.1126,
"step": 1230
},
{
"epoch": 0.7403846153846154,
"grad_norm": 2.5634994337657413,
"learning_rate": 8.233434547939539e-07,
"loss": 0.112,
"step": 1232
},
{
"epoch": 0.7415865384615384,
"grad_norm": 2.397107165704345,
"learning_rate": 8.161814041053526e-07,
"loss": 0.1106,
"step": 1234
},
{
"epoch": 0.7427884615384616,
"grad_norm": 2.2450002047020807,
"learning_rate": 8.090445581698006e-07,
"loss": 0.108,
"step": 1236
},
{
"epoch": 0.7439903846153846,
"grad_norm": 2.014500102466641,
"learning_rate": 8.019330238173568e-07,
"loss": 0.1077,
"step": 1238
},
{
"epoch": 0.7451923076923077,
"grad_norm": 2.168024712104591,
"learning_rate": 7.948469074991955e-07,
"loss": 0.1045,
"step": 1240
},
{
"epoch": 0.7463942307692307,
"grad_norm": 3.0079126945368904,
"learning_rate": 7.877863152860133e-07,
"loss": 0.1092,
"step": 1242
},
{
"epoch": 0.7475961538461539,
"grad_norm": 2.4795980921136294,
"learning_rate": 7.807513528664415e-07,
"loss": 0.1107,
"step": 1244
},
{
"epoch": 0.7487980769230769,
"grad_norm": 2.247412162226902,
"learning_rate": 7.737421255454661e-07,
"loss": 0.1198,
"step": 1246
},
{
"epoch": 0.75,
"grad_norm": 2.3971310721116983,
"learning_rate": 7.667587382428455e-07,
"loss": 0.1161,
"step": 1248
},
{
"epoch": 0.7512019230769231,
"grad_norm": 1.8099165767446914,
"learning_rate": 7.598012954915457e-07,
"loss": 0.0973,
"step": 1250
},
{
"epoch": 0.7524038461538461,
"grad_norm": 2.3170574343599286,
"learning_rate": 7.528699014361757e-07,
"loss": 0.1093,
"step": 1252
},
{
"epoch": 0.7536057692307693,
"grad_norm": 1.7417689564815537,
"learning_rate": 7.459646598314246e-07,
"loss": 0.1021,
"step": 1254
},
{
"epoch": 0.7548076923076923,
"grad_norm": 2.012989717897973,
"learning_rate": 7.390856740405092e-07,
"loss": 0.1022,
"step": 1256
},
{
"epoch": 0.7560096153846154,
"grad_norm": 2.7755030823894082,
"learning_rate": 7.322330470336314e-07,
"loss": 0.108,
"step": 1258
},
{
"epoch": 0.7572115384615384,
"grad_norm": 2.7553825309268305,
"learning_rate": 7.254068813864315e-07,
"loss": 0.1164,
"step": 1260
},
{
"epoch": 0.7584134615384616,
"grad_norm": 3.2871528914249164,
"learning_rate": 7.186072792784549e-07,
"loss": 0.1018,
"step": 1262
},
{
"epoch": 0.7596153846153846,
"grad_norm": 2.3825605880656826,
"learning_rate": 7.118343424916249e-07,
"loss": 0.1006,
"step": 1264
},
{
"epoch": 0.7608173076923077,
"grad_norm": 2.8679237655683627,
"learning_rate": 7.050881724087125e-07,
"loss": 0.1043,
"step": 1266
},
{
"epoch": 0.7620192307692307,
"grad_norm": 2.6274099068260557,
"learning_rate": 6.983688700118257e-07,
"loss": 0.1084,
"step": 1268
},
{
"epoch": 0.7632211538461539,
"grad_norm": 2.432380483126836,
"learning_rate": 6.916765358808969e-07,
"loss": 0.1098,
"step": 1270
},
{
"epoch": 0.7644230769230769,
"grad_norm": 2.2356194365705218,
"learning_rate": 6.850112701921735e-07,
"loss": 0.0974,
"step": 1272
},
{
"epoch": 0.765625,
"grad_norm": 2.322442564380917,
"learning_rate": 6.783731727167195e-07,
"loss": 0.1149,
"step": 1274
},
{
"epoch": 0.7668269230769231,
"grad_norm": 2.7155413629798777,
"learning_rate": 6.717623428189262e-07,
"loss": 0.1107,
"step": 1276
},
{
"epoch": 0.7680288461538461,
"grad_norm": 2.5926151388895184,
"learning_rate": 6.65178879455021e-07,
"loss": 0.0961,
"step": 1278
},
{
"epoch": 0.7692307692307693,
"grad_norm": 2.053872678475484,
"learning_rate": 6.586228811715853e-07,
"loss": 0.104,
"step": 1280
},
{
"epoch": 0.7704326923076923,
"grad_norm": 2.414054056484151,
"learning_rate": 6.520944461040829e-07,
"loss": 0.0987,
"step": 1282
},
{
"epoch": 0.7716346153846154,
"grad_norm": 2.2800029934734014,
"learning_rate": 6.455936719753883e-07,
"loss": 0.1109,
"step": 1284
},
{
"epoch": 0.7728365384615384,
"grad_norm": 2.254565376531854,
"learning_rate": 6.391206560943241e-07,
"loss": 0.0972,
"step": 1286
},
{
"epoch": 0.7740384615384616,
"grad_norm": 2.4180234435201866,
"learning_rate": 6.326754953542086e-07,
"loss": 0.1055,
"step": 1288
},
{
"epoch": 0.7752403846153846,
"grad_norm": 2.864294623339486,
"learning_rate": 6.262582862313968e-07,
"loss": 0.1073,
"step": 1290
},
{
"epoch": 0.7764423076923077,
"grad_norm": 2.5508498340470465,
"learning_rate": 6.198691247838437e-07,
"loss": 0.1072,
"step": 1292
},
{
"epoch": 0.7776442307692307,
"grad_norm": 2.7551427113131233,
"learning_rate": 6.135081066496662e-07,
"loss": 0.0988,
"step": 1294
},
{
"epoch": 0.7788461538461539,
"grad_norm": 2.891789692416902,
"learning_rate": 6.071753270457065e-07,
"loss": 0.1214,
"step": 1296
},
{
"epoch": 0.7800480769230769,
"grad_norm": 2.1917615480612165,
"learning_rate": 6.00870880766111e-07,
"loss": 0.1027,
"step": 1298
},
{
"epoch": 0.78125,
"grad_norm": 2.323581668550887,
"learning_rate": 5.945948621809092e-07,
"loss": 0.0992,
"step": 1300
},
{
"epoch": 0.7824519230769231,
"grad_norm": 2.166198462254229,
"learning_rate": 5.883473652346031e-07,
"loss": 0.1107,
"step": 1302
},
{
"epoch": 0.7836538461538461,
"grad_norm": 2.1325952846722043,
"learning_rate": 5.821284834447586e-07,
"loss": 0.1137,
"step": 1304
},
{
"epoch": 0.7848557692307693,
"grad_norm": 2.3389500061042856,
"learning_rate": 5.759383099006094e-07,
"loss": 0.114,
"step": 1306
},
{
"epoch": 0.7860576923076923,
"grad_norm": 2.392023151903911,
"learning_rate": 5.697769372616565e-07,
"loss": 0.1154,
"step": 1308
},
{
"epoch": 0.7872596153846154,
"grad_norm": 1.9514556840096247,
"learning_rate": 5.636444577562911e-07,
"loss": 0.1071,
"step": 1310
},
{
"epoch": 0.7884615384615384,
"grad_norm": 1.7463857786932386,
"learning_rate": 5.575409631804049e-07,
"loss": 0.0932,
"step": 1312
},
{
"epoch": 0.7896634615384616,
"grad_norm": 2.5321085368327023,
"learning_rate": 5.51466544896021e-07,
"loss": 0.1249,
"step": 1314
},
{
"epoch": 0.7908653846153846,
"grad_norm": 2.7367754184042794,
"learning_rate": 5.454212938299256e-07,
"loss": 0.1083,
"step": 1316
},
{
"epoch": 0.7920673076923077,
"grad_norm": 2.1480042151176795,
"learning_rate": 5.39405300472306e-07,
"loss": 0.1135,
"step": 1318
},
{
"epoch": 0.7932692307692307,
"grad_norm": 2.1126995444295895,
"learning_rate": 5.334186548753961e-07,
"loss": 0.0993,
"step": 1320
},
{
"epoch": 0.7944711538461539,
"grad_norm": 1.9387325114766338,
"learning_rate": 5.2746144665213e-07,
"loss": 0.0975,
"step": 1322
},
{
"epoch": 0.7956730769230769,
"grad_norm": 2.5557991339707193,
"learning_rate": 5.215337649747986e-07,
"loss": 0.1062,
"step": 1324
},
{
"epoch": 0.796875,
"grad_norm": 1.9233646398585384,
"learning_rate": 5.156356985737154e-07,
"loss": 0.0983,
"step": 1326
},
{
"epoch": 0.7980769230769231,
"grad_norm": 2.2467131024558182,
"learning_rate": 5.097673357358906e-07,
"loss": 0.0968,
"step": 1328
},
{
"epoch": 0.7992788461538461,
"grad_norm": 2.4109454813538442,
"learning_rate": 5.039287643037058e-07,
"loss": 0.0979,
"step": 1330
},
{
"epoch": 0.8004807692307693,
"grad_norm": 3.124394231436496,
"learning_rate": 4.981200716735993e-07,
"loss": 0.1265,
"step": 1332
},
{
"epoch": 0.8016826923076923,
"grad_norm": 2.6675264999412,
"learning_rate": 4.92341344794763e-07,
"loss": 0.1049,
"step": 1334
},
{
"epoch": 0.8028846153846154,
"grad_norm": 2.848770862565795,
"learning_rate": 4.865926701678353e-07,
"loss": 0.1025,
"step": 1336
},
{
"epoch": 0.8040865384615384,
"grad_norm": 2.6854316431958867,
"learning_rate": 4.808741338436082e-07,
"loss": 0.1073,
"step": 1338
},
{
"epoch": 0.8052884615384616,
"grad_norm": 3.1092668803437515,
"learning_rate": 4.7518582142174e-07,
"loss": 0.0928,
"step": 1340
},
{
"epoch": 0.8064903846153846,
"grad_norm": 2.1214191642164266,
"learning_rate": 4.695278180494725e-07,
"loss": 0.1012,
"step": 1342
},
{
"epoch": 0.8076923076923077,
"grad_norm": 2.5730076842528553,
"learning_rate": 4.6390020842035755e-07,
"loss": 0.11,
"step": 1344
},
{
"epoch": 0.8088942307692307,
"grad_norm": 2.68220531087873,
"learning_rate": 4.5830307677298984e-07,
"loss": 0.1188,
"step": 1346
},
{
"epoch": 0.8100961538461539,
"grad_norm": 2.363649492332498,
"learning_rate": 4.5273650688974437e-07,
"loss": 0.1021,
"step": 1348
},
{
"epoch": 0.8112980769230769,
"grad_norm": 2.541964709244174,
"learning_rate": 4.4720058209552163e-07,
"loss": 0.0925,
"step": 1350
},
{
"epoch": 0.8125,
"grad_norm": 3.265981110682609,
"learning_rate": 4.4169538525650453e-07,
"loss": 0.1037,
"step": 1352
},
{
"epoch": 0.8137019230769231,
"grad_norm": 2.4485095937525854,
"learning_rate": 4.362209987789129e-07,
"loss": 0.1086,
"step": 1354
},
{
"epoch": 0.8149038461538461,
"grad_norm": 2.2907426923363805,
"learning_rate": 4.307775046077739e-07,
"loss": 0.0986,
"step": 1356
},
{
"epoch": 0.8161057692307693,
"grad_norm": 2.0945358815806387,
"learning_rate": 4.2536498422569237e-07,
"loss": 0.0955,
"step": 1358
},
{
"epoch": 0.8173076923076923,
"grad_norm": 2.211078181765995,
"learning_rate": 4.1998351865163323e-07,
"loss": 0.1005,
"step": 1360
},
{
"epoch": 0.8185096153846154,
"grad_norm": 2.3888674275205473,
"learning_rate": 4.1463318843970727e-07,
"loss": 0.0946,
"step": 1362
},
{
"epoch": 0.8197115384615384,
"grad_norm": 2.8100928189396783,
"learning_rate": 4.093140736779691e-07,
"loss": 0.1072,
"step": 1364
},
{
"epoch": 0.8209134615384616,
"grad_norm": 2.5814035620911775,
"learning_rate": 4.0402625398721056e-07,
"loss": 0.1085,
"step": 1366
},
{
"epoch": 0.8221153846153846,
"grad_norm": 2.2204309134850604,
"learning_rate": 3.987698085197761e-07,
"loss": 0.1057,
"step": 1368
},
{
"epoch": 0.8233173076923077,
"grad_norm": 2.284890393659316,
"learning_rate": 3.935448159583774e-07,
"loss": 0.1095,
"step": 1370
},
{
"epoch": 0.8245192307692307,
"grad_norm": 2.9277446873455233,
"learning_rate": 3.8835135451491037e-07,
"loss": 0.0972,
"step": 1372
},
{
"epoch": 0.8257211538461539,
"grad_norm": 2.624827973263955,
"learning_rate": 3.831895019292897e-07,
"loss": 0.1103,
"step": 1374
},
{
"epoch": 0.8269230769230769,
"grad_norm": 2.680261506966643,
"learning_rate": 3.7805933546828265e-07,
"loss": 0.1172,
"step": 1376
},
{
"epoch": 0.828125,
"grad_norm": 2.194552961136517,
"learning_rate": 3.7296093192435325e-07,
"loss": 0.1003,
"step": 1378
},
{
"epoch": 0.8293269230769231,
"grad_norm": 2.559847310791807,
"learning_rate": 3.6789436761451135e-07,
"loss": 0.1039,
"step": 1380
},
{
"epoch": 0.8305288461538461,
"grad_norm": 2.1332823235020575,
"learning_rate": 3.6285971837917514e-07,
"loss": 0.1004,
"step": 1382
},
{
"epoch": 0.8317307692307693,
"grad_norm": 2.260620258768886,
"learning_rate": 3.578570595810274e-07,
"loss": 0.1043,
"step": 1384
},
{
"epoch": 0.8329326923076923,
"grad_norm": 2.1460191050714768,
"learning_rate": 3.5288646610389497e-07,
"loss": 0.0973,
"step": 1386
},
{
"epoch": 0.8341346153846154,
"grad_norm": 2.4453293937330804,
"learning_rate": 3.4794801235162575e-07,
"loss": 0.0982,
"step": 1388
},
{
"epoch": 0.8353365384615384,
"grad_norm": 2.5470784403076823,
"learning_rate": 3.4304177224697284e-07,
"loss": 0.1071,
"step": 1390
},
{
"epoch": 0.8365384615384616,
"grad_norm": 2.1819545974434194,
"learning_rate": 3.3816781923049047e-07,
"loss": 0.0977,
"step": 1392
},
{
"epoch": 0.8377403846153846,
"grad_norm": 2.66559740829053,
"learning_rate": 3.333262262594328e-07,
"loss": 0.1013,
"step": 1394
},
{
"epoch": 0.8389423076923077,
"grad_norm": 2.4531040234693493,
"learning_rate": 3.285170658066636e-07,
"loss": 0.1136,
"step": 1396
},
{
"epoch": 0.8401442307692307,
"grad_norm": 2.222643442815888,
"learning_rate": 3.2374040985957005e-07,
"loss": 0.1069,
"step": 1398
},
{
"epoch": 0.8413461538461539,
"grad_norm": 2.513474365488169,
"learning_rate": 3.1899632991898634e-07,
"loss": 0.1115,
"step": 1400
},
{
"epoch": 0.8425480769230769,
"grad_norm": 2.4676487353640537,
"learning_rate": 3.1428489699812187e-07,
"loss": 0.1134,
"step": 1402
},
{
"epoch": 0.84375,
"grad_norm": 2.5478148761024344,
"learning_rate": 3.096061816214993e-07,
"loss": 0.1125,
"step": 1404
},
{
"epoch": 0.8449519230769231,
"grad_norm": 2.693250913616855,
"learning_rate": 3.0496025382390023e-07,
"loss": 0.1101,
"step": 1406
},
{
"epoch": 0.8461538461538461,
"grad_norm": 2.1582519269517846,
"learning_rate": 3.0034718314931376e-07,
"loss": 0.0987,
"step": 1408
},
{
"epoch": 0.8473557692307693,
"grad_norm": 2.356390319809914,
"learning_rate": 2.9576703864989705e-07,
"loss": 0.1103,
"step": 1410
},
{
"epoch": 0.8485576923076923,
"grad_norm": 2.4939149281676944,
"learning_rate": 2.9121988888494297e-07,
"loss": 0.1075,
"step": 1412
},
{
"epoch": 0.8497596153846154,
"grad_norm": 2.560971562380158,
"learning_rate": 2.8670580191985096e-07,
"loss": 0.1047,
"step": 1414
},
{
"epoch": 0.8509615384615384,
"grad_norm": 2.328897343184531,
"learning_rate": 2.822248453251117e-07,
"loss": 0.0952,
"step": 1416
},
{
"epoch": 0.8521634615384616,
"grad_norm": 2.741941178369846,
"learning_rate": 2.7777708617529263e-07,
"loss": 0.114,
"step": 1418
},
{
"epoch": 0.8533653846153846,
"grad_norm": 2.8152555502780747,
"learning_rate": 2.73362591048035e-07,
"loss": 0.1118,
"step": 1420
},
{
"epoch": 0.8545673076923077,
"grad_norm": 2.308625479951822,
"learning_rate": 2.689814260230575e-07,
"loss": 0.0916,
"step": 1422
},
{
"epoch": 0.8557692307692307,
"grad_norm": 2.7282177204631655,
"learning_rate": 2.646336566811686e-07,
"loss": 0.0998,
"step": 1424
},
{
"epoch": 0.8569711538461539,
"grad_norm": 2.3678387797587415,
"learning_rate": 2.6031934810328006e-07,
"loss": 0.097,
"step": 1426
},
{
"epoch": 0.8581730769230769,
"grad_norm": 2.2659866299053864,
"learning_rate": 2.560385648694394e-07,
"loss": 0.1035,
"step": 1428
},
{
"epoch": 0.859375,
"grad_norm": 2.1167601912463665,
"learning_rate": 2.5179137105785733e-07,
"loss": 0.1133,
"step": 1430
},
{
"epoch": 0.8605769230769231,
"grad_norm": 3.156290005065614,
"learning_rate": 2.4757783024395244e-07,
"loss": 0.1083,
"step": 1432
},
{
"epoch": 0.8617788461538461,
"grad_norm": 2.3830390104253514,
"learning_rate": 2.43398005499397e-07,
"loss": 0.1142,
"step": 1434
},
{
"epoch": 0.8629807692307693,
"grad_norm": 2.1651688894763805,
"learning_rate": 2.3925195939117516e-07,
"loss": 0.1008,
"step": 1436
},
{
"epoch": 0.8641826923076923,
"grad_norm": 2.906596215817537,
"learning_rate": 2.3513975398064382e-07,
"loss": 0.109,
"step": 1438
},
{
"epoch": 0.8653846153846154,
"grad_norm": 2.822390125625684,
"learning_rate": 2.3106145082260777e-07,
"loss": 0.11,
"step": 1440
},
{
"epoch": 0.8665865384615384,
"grad_norm": 4.20785338557248,
"learning_rate": 2.2701711096439177e-07,
"loss": 0.0926,
"step": 1442
},
{
"epoch": 0.8677884615384616,
"grad_norm": 2.401735024395661,
"learning_rate": 2.23006794944933e-07,
"loss": 0.1096,
"step": 1444
},
{
"epoch": 0.8689903846153846,
"grad_norm": 2.324535843969192,
"learning_rate": 2.1903056279387242e-07,
"loss": 0.0979,
"step": 1446
},
{
"epoch": 0.8701923076923077,
"grad_norm": 2.3309020366100395,
"learning_rate": 2.1508847403065582e-07,
"loss": 0.1003,
"step": 1448
},
{
"epoch": 0.8713942307692307,
"grad_norm": 1.8021632811568191,
"learning_rate": 2.1118058766364245e-07,
"loss": 0.0973,
"step": 1450
},
{
"epoch": 0.8725961538461539,
"grad_norm": 3.2593847440771753,
"learning_rate": 2.0730696218922376e-07,
"loss": 0.1181,
"step": 1452
},
{
"epoch": 0.8737980769230769,
"grad_norm": 2.822198349147213,
"learning_rate": 2.0346765559094566e-07,
"loss": 0.1011,
"step": 1454
},
{
"epoch": 0.875,
"grad_norm": 2.281439077352929,
"learning_rate": 1.9966272533864183e-07,
"loss": 0.1078,
"step": 1456
},
{
"epoch": 0.8762019230769231,
"grad_norm": 2.113587059455818,
"learning_rate": 1.9589222838757416e-07,
"loss": 0.101,
"step": 1458
},
{
"epoch": 0.8774038461538461,
"grad_norm": 2.8223349033116034,
"learning_rate": 1.9215622117757683e-07,
"loss": 0.1061,
"step": 1460
},
{
"epoch": 0.8786057692307693,
"grad_norm": 3.472491327729482,
"learning_rate": 1.8845475963221504e-07,
"loss": 0.1025,
"step": 1462
},
{
"epoch": 0.8798076923076923,
"grad_norm": 1.8581268435798293,
"learning_rate": 1.847878991579477e-07,
"loss": 0.095,
"step": 1464
},
{
"epoch": 0.8810096153846154,
"grad_norm": 2.730059314985803,
"learning_rate": 1.8115569464329602e-07,
"loss": 0.1186,
"step": 1466
},
{
"epoch": 0.8822115384615384,
"grad_norm": 2.1077055410907906,
"learning_rate": 1.7755820045802146e-07,
"loss": 0.1038,
"step": 1468
},
{
"epoch": 0.8834134615384616,
"grad_norm": 2.425923169061633,
"learning_rate": 1.7399547045231612e-07,
"loss": 0.1052,
"step": 1470
},
{
"epoch": 0.8846153846153846,
"grad_norm": 3.112007013321009,
"learning_rate": 1.7046755795599224e-07,
"loss": 0.1081,
"step": 1472
},
{
"epoch": 0.8858173076923077,
"grad_norm": 2.569797668666943,
"learning_rate": 1.6697451577768558e-07,
"loss": 0.1066,
"step": 1474
},
{
"epoch": 0.8870192307692307,
"grad_norm": 2.2793145814741815,
"learning_rate": 1.6351639620406506e-07,
"loss": 0.093,
"step": 1476
},
{
"epoch": 0.8882211538461539,
"grad_norm": 2.5222639348107148,
"learning_rate": 1.600932509990502e-07,
"loss": 0.1044,
"step": 1478
},
{
"epoch": 0.8894230769230769,
"grad_norm": 2.588170889871738,
"learning_rate": 1.567051314030349e-07,
"loss": 0.1095,
"step": 1480
},
{
"epoch": 0.890625,
"grad_norm": 2.291713127389421,
"learning_rate": 1.5335208813212376e-07,
"loss": 0.108,
"step": 1482
},
{
"epoch": 0.8918269230769231,
"grad_norm": 2.466650814807856,
"learning_rate": 1.500341713773687e-07,
"loss": 0.0961,
"step": 1484
},
{
"epoch": 0.8930288461538461,
"grad_norm": 2.7084007905892733,
"learning_rate": 1.4675143080401965e-07,
"loss": 0.1085,
"step": 1486
},
{
"epoch": 0.8942307692307693,
"grad_norm": 2.3729169086566286,
"learning_rate": 1.4350391555078253e-07,
"loss": 0.0961,
"step": 1488
},
{
"epoch": 0.8954326923076923,
"grad_norm": 2.6818163643038995,
"learning_rate": 1.4029167422908107e-07,
"loss": 0.1043,
"step": 1490
},
{
"epoch": 0.8966346153846154,
"grad_norm": 2.8498937973846066,
"learning_rate": 1.3711475492233116e-07,
"loss": 0.1005,
"step": 1492
},
{
"epoch": 0.8978365384615384,
"grad_norm": 2.3727561897542184,
"learning_rate": 1.3397320518521993e-07,
"loss": 0.1083,
"step": 1494
},
{
"epoch": 0.8990384615384616,
"grad_norm": 2.864804942811416,
"learning_rate": 1.3086707204299415e-07,
"loss": 0.1042,
"step": 1496
},
{
"epoch": 0.9002403846153846,
"grad_norm": 2.5133132508801537,
"learning_rate": 1.2779640199075627e-07,
"loss": 0.1155,
"step": 1498
},
{
"epoch": 0.9014423076923077,
"grad_norm": 2.534644984379523,
"learning_rate": 1.2476124099277038e-07,
"loss": 0.1136,
"step": 1500
},
{
"epoch": 0.9026442307692307,
"grad_norm": 2.3008784304419714,
"learning_rate": 1.217616344817693e-07,
"loss": 0.0916,
"step": 1502
},
{
"epoch": 0.9038461538461539,
"grad_norm": 3.1775796763443047,
"learning_rate": 1.1879762735828081e-07,
"loss": 0.1042,
"step": 1504
},
{
"epoch": 0.9050480769230769,
"grad_norm": 2.497072810226958,
"learning_rate": 1.1586926398995057e-07,
"loss": 0.1107,
"step": 1506
},
{
"epoch": 0.90625,
"grad_norm": 2.5866908472155923,
"learning_rate": 1.129765882108802e-07,
"loss": 0.1043,
"step": 1508
},
{
"epoch": 0.9074519230769231,
"grad_norm": 2.2123227768120626,
"learning_rate": 1.1011964332097114e-07,
"loss": 0.1056,
"step": 1510
},
{
"epoch": 0.9086538461538461,
"grad_norm": 2.2624610691482134,
"learning_rate": 1.0729847208527516e-07,
"loss": 0.1097,
"step": 1512
},
{
"epoch": 0.9098557692307693,
"grad_norm": 2.4925186199498115,
"learning_rate": 1.045131167333563e-07,
"loss": 0.1055,
"step": 1514
},
{
"epoch": 0.9110576923076923,
"grad_norm": 2.0402886616020846,
"learning_rate": 1.0176361895865683e-07,
"loss": 0.1012,
"step": 1516
},
{
"epoch": 0.9122596153846154,
"grad_norm": 3.0557112542579583,
"learning_rate": 9.9050019917874e-08,
"loss": 0.0904,
"step": 1518
},
{
"epoch": 0.9134615384615384,
"grad_norm": 2.8003064126365653,
"learning_rate": 9.637236023034403e-08,
"loss": 0.096,
"step": 1520
},
{
"epoch": 0.9146634615384616,
"grad_norm": 2.3950834236132548,
"learning_rate": 9.373067997743429e-08,
"loss": 0.1103,
"step": 1522
},
{
"epoch": 0.9158653846153846,
"grad_norm": 2.3223933444523275,
"learning_rate": 9.112501870194273e-08,
"loss": 0.1051,
"step": 1524
},
{
"epoch": 0.9170673076923077,
"grad_norm": 2.6778326848037084,
"learning_rate": 8.855541540750579e-08,
"loss": 0.1079,
"step": 1526
},
{
"epoch": 0.9182692307692307,
"grad_norm": 2.527199338042573,
"learning_rate": 8.602190855801523e-08,
"loss": 0.1109,
"step": 1528
},
{
"epoch": 0.9194711538461539,
"grad_norm": 2.2105422763119598,
"learning_rate": 8.352453607704286e-08,
"loss": 0.0994,
"step": 1530
},
{
"epoch": 0.9206730769230769,
"grad_norm": 2.4639244734521357,
"learning_rate": 8.106333534727145e-08,
"loss": 0.1108,
"step": 1532
},
{
"epoch": 0.921875,
"grad_norm": 2.2497655156731162,
"learning_rate": 7.86383432099358e-08,
"loss": 0.0991,
"step": 1534
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.3748694066193115,
"learning_rate": 7.624959596427145e-08,
"loss": 0.0998,
"step": 1536
},
{
"epoch": 0.9242788461538461,
"grad_norm": 3.0743080890043983,
"learning_rate": 7.38971293669713e-08,
"loss": 0.1068,
"step": 1538
},
{
"epoch": 0.9254807692307693,
"grad_norm": 2.1876563987125675,
"learning_rate": 7.15809786316507e-08,
"loss": 0.1021,
"step": 1540
},
{
"epoch": 0.9266826923076923,
"grad_norm": 2.0975147997242143,
"learning_rate": 6.930117842831958e-08,
"loss": 0.1046,
"step": 1542
},
{
"epoch": 0.9278846153846154,
"grad_norm": 2.075830478745358,
"learning_rate": 6.705776288286281e-08,
"loss": 0.0954,
"step": 1544
},
{
"epoch": 0.9290865384615384,
"grad_norm": 2.6184435003671362,
"learning_rate": 6.485076557653236e-08,
"loss": 0.1175,
"step": 1546
},
{
"epoch": 0.9302884615384616,
"grad_norm": 1.9245903381939464,
"learning_rate": 6.268021954544095e-08,
"loss": 0.1013,
"step": 1548
},
{
"epoch": 0.9314903846153846,
"grad_norm": 2.2746719047460853,
"learning_rate": 6.05461572800703e-08,
"loss": 0.1126,
"step": 1550
},
{
"epoch": 0.9326923076923077,
"grad_norm": 2.340635789861226,
"learning_rate": 5.844861072478336e-08,
"loss": 0.1123,
"step": 1552
},
{
"epoch": 0.9338942307692307,
"grad_norm": 2.1875192748623418,
"learning_rate": 5.6387611277346486e-08,
"loss": 0.1207,
"step": 1554
},
{
"epoch": 0.9350961538461539,
"grad_norm": 2.7002532298324997,
"learning_rate": 5.436318978845917e-08,
"loss": 0.1021,
"step": 1556
},
{
"epoch": 0.9362980769230769,
"grad_norm": 2.50568764777332,
"learning_rate": 5.237537656129332e-08,
"loss": 0.0963,
"step": 1558
},
{
"epoch": 0.9375,
"grad_norm": 2.4255108084712806,
"learning_rate": 5.042420135103865e-08,
"loss": 0.1056,
"step": 1560
},
{
"epoch": 0.9387019230769231,
"grad_norm": 2.3846332604623215,
"learning_rate": 4.850969336445688e-08,
"loss": 0.1018,
"step": 1562
},
{
"epoch": 0.9399038461538461,
"grad_norm": 1.9410217717252691,
"learning_rate": 4.663188125944601e-08,
"loss": 0.1034,
"step": 1564
},
{
"epoch": 0.9411057692307693,
"grad_norm": 1.9719340906948433,
"learning_rate": 4.47907931446101e-08,
"loss": 0.1002,
"step": 1566
},
{
"epoch": 0.9423076923076923,
"grad_norm": 2.7209374640824073,
"learning_rate": 4.298645657883904e-08,
"loss": 0.1025,
"step": 1568
},
{
"epoch": 0.9435096153846154,
"grad_norm": 2.5221543954885,
"learning_rate": 4.121889857089584e-08,
"loss": 0.1129,
"step": 1570
},
{
"epoch": 0.9447115384615384,
"grad_norm": 2.362069229118289,
"learning_rate": 3.948814557901276e-08,
"loss": 0.1076,
"step": 1572
},
{
"epoch": 0.9459134615384616,
"grad_norm": 2.383853603153857,
"learning_rate": 3.779422351049417e-08,
"loss": 0.116,
"step": 1574
},
{
"epoch": 0.9471153846153846,
"grad_norm": 1.9719712080104705,
"learning_rate": 3.613715772133097e-08,
"loss": 0.0939,
"step": 1576
},
{
"epoch": 0.9483173076923077,
"grad_norm": 2.302141720175791,
"learning_rate": 3.451697301581791e-08,
"loss": 0.1108,
"step": 1578
},
{
"epoch": 0.9495192307692307,
"grad_norm": 2.259505291636599,
"learning_rate": 3.293369364618465e-08,
"loss": 0.0928,
"step": 1580
},
{
"epoch": 0.9507211538461539,
"grad_norm": 3.306570471168316,
"learning_rate": 3.138734331223248e-08,
"loss": 0.1092,
"step": 1582
},
{
"epoch": 0.9519230769230769,
"grad_norm": 2.132657723925501,
"learning_rate": 2.987794516097875e-08,
"loss": 0.1076,
"step": 1584
},
{
"epoch": 0.953125,
"grad_norm": 2.4599044268457995,
"learning_rate": 2.8405521786310508e-08,
"loss": 0.1032,
"step": 1586
},
{
"epoch": 0.9543269230769231,
"grad_norm": 2.743173912553656,
"learning_rate": 2.6970095228647243e-08,
"loss": 0.1006,
"step": 1588
},
{
"epoch": 0.9555288461538461,
"grad_norm": 2.7902694032785678,
"learning_rate": 2.5571686974609766e-08,
"loss": 0.1082,
"step": 1590
},
{
"epoch": 0.9567307692307693,
"grad_norm": 2.1596954453730617,
"learning_rate": 2.4210317956698814e-08,
"loss": 0.0968,
"step": 1592
},
{
"epoch": 0.9579326923076923,
"grad_norm": 2.1748717942295452,
"learning_rate": 2.2886008552983064e-08,
"loss": 0.1159,
"step": 1594
},
{
"epoch": 0.9591346153846154,
"grad_norm": 2.772837831508022,
"learning_rate": 2.1598778586792158e-08,
"loss": 0.1188,
"step": 1596
},
{
"epoch": 0.9603365384615384,
"grad_norm": 2.3289207936080105,
"learning_rate": 2.0348647326420835e-08,
"loss": 0.1177,
"step": 1598
},
{
"epoch": 0.9615384615384616,
"grad_norm": 2.2952508596930685,
"learning_rate": 1.91356334848411e-08,
"loss": 0.1076,
"step": 1600
},
{
"epoch": 0.9627403846153846,
"grad_norm": 2.273702791234848,
"learning_rate": 1.795975521942106e-08,
"loss": 0.1046,
"step": 1602
},
{
"epoch": 0.9639423076923077,
"grad_norm": 2.592999996375681,
"learning_rate": 1.682103013165376e-08,
"loss": 0.114,
"step": 1604
},
{
"epoch": 0.9651442307692307,
"grad_norm": 2.227735041940276,
"learning_rate": 1.571947526689349e-08,
"loss": 0.1054,
"step": 1606
},
{
"epoch": 0.9663461538461539,
"grad_norm": 2.26729275395297,
"learning_rate": 1.4655107114101008e-08,
"loss": 0.0916,
"step": 1608
},
{
"epoch": 0.9675480769230769,
"grad_norm": 2.546233331956148,
"learning_rate": 1.362794160559594e-08,
"loss": 0.1151,
"step": 1610
},
{
"epoch": 0.96875,
"grad_norm": 2.496560671829686,
"learning_rate": 1.263799411681893e-08,
"loss": 0.1161,
"step": 1612
},
{
"epoch": 0.9699519230769231,
"grad_norm": 2.2682911816628715,
"learning_rate": 1.1685279466101817e-08,
"loss": 0.1008,
"step": 1614
},
{
"epoch": 0.9711538461538461,
"grad_norm": 2.7304953540259405,
"learning_rate": 1.0769811914444206e-08,
"loss": 0.1041,
"step": 1616
},
{
"epoch": 0.9723557692307693,
"grad_norm": 2.6466979182161885,
"learning_rate": 9.89160516530252e-09,
"loss": 0.1044,
"step": 1618
},
{
"epoch": 0.9735576923076923,
"grad_norm": 2.394702205679758,
"learning_rate": 9.050672364382118e-09,
"loss": 0.0955,
"step": 1620
},
{
"epoch": 0.9747596153846154,
"grad_norm": 3.3455759598567836,
"learning_rate": 8.247026099443279e-09,
"loss": 0.1109,
"step": 1622
},
{
"epoch": 0.9759615384615384,
"grad_norm": 2.275839668759994,
"learning_rate": 7.480678400109965e-09,
"loss": 0.1061,
"step": 1624
},
{
"epoch": 0.9771634615384616,
"grad_norm": 2.133232464295508,
"learning_rate": 6.751640737691911e-09,
"loss": 0.1042,
"step": 1626
},
{
"epoch": 0.9783653846153846,
"grad_norm": 2.304963137531874,
"learning_rate": 6.059924025012542e-09,
"loss": 0.1038,
"step": 1628
},
{
"epoch": 0.9795673076923077,
"grad_norm": 1.9133668945458344,
"learning_rate": 5.405538616244377e-09,
"loss": 0.0946,
"step": 1630
},
{
"epoch": 0.9807692307692307,
"grad_norm": 2.7193573070283596,
"learning_rate": 4.788494306755542e-09,
"loss": 0.1021,
"step": 1632
},
{
"epoch": 0.9819711538461539,
"grad_norm": 2.8786636757179878,
"learning_rate": 4.208800332961838e-09,
"loss": 0.1102,
"step": 1634
},
{
"epoch": 0.9831730769230769,
"grad_norm": 2.4551588884935,
"learning_rate": 3.666465372190453e-09,
"loss": 0.0962,
"step": 1636
},
{
"epoch": 0.984375,
"grad_norm": 2.1960677965536957,
"learning_rate": 3.1614975425470207e-09,
"loss": 0.1151,
"step": 1638
},
{
"epoch": 0.9855769230769231,
"grad_norm": 3.4510437899419992,
"learning_rate": 2.693904402797376e-09,
"loss": 0.0972,
"step": 1640
},
{
"epoch": 0.9867788461538461,
"grad_norm": 2.4216851513924205,
"learning_rate": 2.2636929522520945e-09,
"loss": 0.1199,
"step": 1642
},
{
"epoch": 0.9879807692307693,
"grad_norm": 2.5149641594525214,
"learning_rate": 1.8708696306624087e-09,
"loss": 0.0947,
"step": 1644
},
{
"epoch": 0.9891826923076923,
"grad_norm": 2.8133693313833303,
"learning_rate": 1.5154403181247279e-09,
"loss": 0.102,
"step": 1646
},
{
"epoch": 0.9903846153846154,
"grad_norm": 1.9008875611487197,
"learning_rate": 1.1974103349909894e-09,
"loss": 0.0975,
"step": 1648
},
{
"epoch": 0.9915865384615384,
"grad_norm": 2.3226078974966184,
"learning_rate": 9.167844417901084e-10,
"loss": 0.115,
"step": 1650
},
{
"epoch": 0.9927884615384616,
"grad_norm": 3.1578748196963695,
"learning_rate": 6.735668391566475e-10,
"loss": 0.1127,
"step": 1652
},
{
"epoch": 0.9939903846153846,
"grad_norm": 2.273472912036705,
"learning_rate": 4.677611677675331e-10,
"loss": 0.096,
"step": 1654
},
{
"epoch": 0.9951923076923077,
"grad_norm": 2.0942516236835993,
"learning_rate": 2.993705082879328e-10,
"loss": 0.1131,
"step": 1656
},
{
"epoch": 0.9963942307692307,
"grad_norm": 2.5707284998861346,
"learning_rate": 1.683973813249029e-10,
"loss": 0.0986,
"step": 1658
},
{
"epoch": 0.9975961538461539,
"grad_norm": 2.5538888383981013,
"learning_rate": 7.484374738936373e-11,
"loss": 0.1101,
"step": 1660
},
{
"epoch": 0.9987980769230769,
"grad_norm": 2.0051743259281305,
"learning_rate": 1.8711006867788707e-11,
"loss": 0.1041,
"step": 1662
},
{
"epoch": 1.0,
"grad_norm": 2.79621961431732,
"learning_rate": 0.0,
"loss": 0.1053,
"step": 1664
}
],
"logging_steps": 2,
"max_steps": 1664,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 522572361891840.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}