{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7499861462585662,
"eval_steps": 500,
"global_step": 20301,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007388662098010602,
"grad_norm": 12.062633265062304,
"learning_rate": 4.999999984845559e-07,
"loss": 1.9234,
"step": 20
},
{
"epoch": 0.0014777324196021205,
"grad_norm": 7.366383713647709,
"learning_rate": 4.999993938226169e-07,
"loss": 1.8338,
"step": 40
},
{
"epoch": 0.002216598629403181,
"grad_norm": 1.7977538453913595,
"learning_rate": 4.999975752937336e-07,
"loss": 1.7526,
"step": 60
},
{
"epoch": 0.002955464839204241,
"grad_norm": 1.6391792506527756,
"learning_rate": 4.999945444231491e-07,
"loss": 1.7305,
"step": 80
},
{
"epoch": 0.0036943310490053015,
"grad_norm": 2.046829887141379,
"learning_rate": 4.999903012271942e-07,
"loss": 1.741,
"step": 100
},
{
"epoch": 0.004433197258806362,
"grad_norm": 1.8506603870837903,
"learning_rate": 4.999848457287324e-07,
"loss": 1.7129,
"step": 120
},
{
"epoch": 0.005172063468607422,
"grad_norm": 1.532512758048568,
"learning_rate": 4.999781779571592e-07,
"loss": 1.6774,
"step": 140
},
{
"epoch": 0.005910929678408482,
"grad_norm": 1.5744798799071549,
"learning_rate": 4.999702979484023e-07,
"loss": 1.7007,
"step": 160
},
{
"epoch": 0.006649795888209542,
"grad_norm": 3.3220978847027203,
"learning_rate": 4.999612057449209e-07,
"loss": 1.713,
"step": 180
},
{
"epoch": 0.007388662098010603,
"grad_norm": 1.6192232490789547,
"learning_rate": 4.999509013957061e-07,
"loss": 1.7085,
"step": 200
},
{
"epoch": 0.008127528307811663,
"grad_norm": 1.6102605979486695,
"learning_rate": 4.999393849562803e-07,
"loss": 1.6909,
"step": 220
},
{
"epoch": 0.008866394517612723,
"grad_norm": 1.7856392499507323,
"learning_rate": 4.999266564886968e-07,
"loss": 1.7105,
"step": 240
},
{
"epoch": 0.009605260727413783,
"grad_norm": 1.462292806282488,
"learning_rate": 4.999127160615396e-07,
"loss": 1.7254,
"step": 260
},
{
"epoch": 0.010344126937214844,
"grad_norm": 1.6210572507919945,
"learning_rate": 4.998975637499234e-07,
"loss": 1.7228,
"step": 280
},
{
"epoch": 0.011082993147015904,
"grad_norm": 1.5952635257667038,
"learning_rate": 4.998811996354924e-07,
"loss": 1.747,
"step": 300
},
{
"epoch": 0.011821859356816964,
"grad_norm": 1.7849552493640788,
"learning_rate": 4.998636238064202e-07,
"loss": 1.6851,
"step": 320
},
{
"epoch": 0.012560725566618025,
"grad_norm": 1.7748576256781579,
"learning_rate": 4.9984483635741e-07,
"loss": 1.7215,
"step": 340
},
{
"epoch": 0.013299591776419085,
"grad_norm": 1.5217631531152356,
"learning_rate": 4.998248373896929e-07,
"loss": 1.7062,
"step": 360
},
{
"epoch": 0.014038457986220144,
"grad_norm": 1.8070743437219547,
"learning_rate": 4.998036270110284e-07,
"loss": 1.7108,
"step": 380
},
{
"epoch": 0.014777324196021206,
"grad_norm": 1.931037650128842,
"learning_rate": 4.997812053357031e-07,
"loss": 1.6739,
"step": 400
},
{
"epoch": 0.015516190405822266,
"grad_norm": 2.3425787137976073,
"learning_rate": 4.997575724845303e-07,
"loss": 1.6501,
"step": 420
},
{
"epoch": 0.016255056615623327,
"grad_norm": 1.5819249160473718,
"learning_rate": 4.997327285848497e-07,
"loss": 1.7295,
"step": 440
},
{
"epoch": 0.016993922825424387,
"grad_norm": 1.780767344095751,
"learning_rate": 4.997066737705263e-07,
"loss": 1.7035,
"step": 460
},
{
"epoch": 0.017732789035225446,
"grad_norm": 1.502517772930168,
"learning_rate": 4.996794081819497e-07,
"loss": 1.72,
"step": 480
},
{
"epoch": 0.018471655245026506,
"grad_norm": 1.6230104328728192,
"learning_rate": 4.996509319660336e-07,
"loss": 1.7052,
"step": 500
},
{
"epoch": 0.019210521454827566,
"grad_norm": 2.5063386321134287,
"learning_rate": 4.996212452762147e-07,
"loss": 1.7111,
"step": 520
},
{
"epoch": 0.01994938766462863,
"grad_norm": 1.508194569170525,
"learning_rate": 4.995903482724523e-07,
"loss": 1.7116,
"step": 540
},
{
"epoch": 0.02068825387442969,
"grad_norm": 1.5662589803980058,
"learning_rate": 4.995582411212267e-07,
"loss": 1.6586,
"step": 560
},
{
"epoch": 0.021427120084230748,
"grad_norm": 1.5479171922961865,
"learning_rate": 4.995249239955392e-07,
"loss": 1.6605,
"step": 580
},
{
"epoch": 0.022165986294031808,
"grad_norm": 1.4441787150001577,
"learning_rate": 4.994903970749107e-07,
"loss": 1.6952,
"step": 600
},
{
"epoch": 0.022904852503832868,
"grad_norm": 1.8034952536565763,
"learning_rate": 4.994546605453804e-07,
"loss": 1.6928,
"step": 620
},
{
"epoch": 0.023643718713633927,
"grad_norm": 2.0505792045813123,
"learning_rate": 4.994177145995056e-07,
"loss": 1.6979,
"step": 640
},
{
"epoch": 0.02438258492343499,
"grad_norm": 1.7345451000474756,
"learning_rate": 4.993795594363599e-07,
"loss": 1.6827,
"step": 660
},
{
"epoch": 0.02512145113323605,
"grad_norm": 1.7265664949693813,
"learning_rate": 4.993401952615327e-07,
"loss": 1.6949,
"step": 680
},
{
"epoch": 0.02586031734303711,
"grad_norm": 1.5684431888117931,
"learning_rate": 4.992996222871278e-07,
"loss": 1.6725,
"step": 700
},
{
"epoch": 0.02659918355283817,
"grad_norm": 1.8458741005435486,
"learning_rate": 4.992578407317622e-07,
"loss": 1.6876,
"step": 720
},
{
"epoch": 0.02733804976263923,
"grad_norm": 1.603183220486937,
"learning_rate": 4.992148508205652e-07,
"loss": 1.7001,
"step": 740
},
{
"epoch": 0.02807691597244029,
"grad_norm": 1.4656870216667528,
"learning_rate": 4.991706527851766e-07,
"loss": 1.6743,
"step": 760
},
{
"epoch": 0.028815782182241352,
"grad_norm": 1.79341933069724,
"learning_rate": 4.991252468637465e-07,
"loss": 1.6894,
"step": 780
},
{
"epoch": 0.029554648392042412,
"grad_norm": 1.4496770314789245,
"learning_rate": 4.990786333009329e-07,
"loss": 1.7038,
"step": 800
},
{
"epoch": 0.03029351460184347,
"grad_norm": 1.757004570982493,
"learning_rate": 4.990308123479012e-07,
"loss": 1.7134,
"step": 820
},
{
"epoch": 0.03103238081164453,
"grad_norm": 1.5200364379437228,
"learning_rate": 4.98981784262322e-07,
"loss": 1.6698,
"step": 840
},
{
"epoch": 0.03177124702144559,
"grad_norm": 1.486100216095798,
"learning_rate": 4.989315493083708e-07,
"loss": 1.6896,
"step": 860
},
{
"epoch": 0.032510113231246654,
"grad_norm": 1.6006604995588511,
"learning_rate": 4.988801077567258e-07,
"loss": 1.6842,
"step": 880
},
{
"epoch": 0.03324897944104771,
"grad_norm": 1.6369118826080298,
"learning_rate": 4.988274598845665e-07,
"loss": 1.7129,
"step": 900
},
{
"epoch": 0.03398784565084877,
"grad_norm": 1.594714153238538,
"learning_rate": 4.987736059755724e-07,
"loss": 1.6812,
"step": 920
},
{
"epoch": 0.03472671186064983,
"grad_norm": 1.691349253313934,
"learning_rate": 4.987185463199215e-07,
"loss": 1.7131,
"step": 940
},
{
"epoch": 0.03546557807045089,
"grad_norm": 2.1466962990805385,
"learning_rate": 4.986622812142888e-07,
"loss": 1.7217,
"step": 960
},
{
"epoch": 0.036204444280251956,
"grad_norm": 1.6551865204895997,
"learning_rate": 4.986048109618442e-07,
"loss": 1.7179,
"step": 980
},
{
"epoch": 0.03694331049005301,
"grad_norm": 1.5681769699914139,
"learning_rate": 4.985461358722514e-07,
"loss": 1.6897,
"step": 1000
},
{
"epoch": 0.037682176699854075,
"grad_norm": 1.5623589901869384,
"learning_rate": 4.984862562616661e-07,
"loss": 1.7307,
"step": 1020
},
{
"epoch": 0.03842104290965513,
"grad_norm": 1.710638101923504,
"learning_rate": 4.984251724527342e-07,
"loss": 1.6815,
"step": 1040
},
{
"epoch": 0.039159909119456195,
"grad_norm": 2.613860202511964,
"learning_rate": 4.983628847745904e-07,
"loss": 1.6798,
"step": 1060
},
{
"epoch": 0.03989877532925726,
"grad_norm": 1.9956576024499864,
"learning_rate": 4.982993935628554e-07,
"loss": 1.6715,
"step": 1080
},
{
"epoch": 0.040637641539058314,
"grad_norm": 1.833965747583207,
"learning_rate": 4.982346991596356e-07,
"loss": 1.7044,
"step": 1100
},
{
"epoch": 0.04137650774885938,
"grad_norm": 1.709065067682895,
"learning_rate": 4.981688019135202e-07,
"loss": 1.6612,
"step": 1120
},
{
"epoch": 0.04211537395866043,
"grad_norm": 2.2904499102757767,
"learning_rate": 4.981017021795794e-07,
"loss": 1.6984,
"step": 1140
},
{
"epoch": 0.042854240168461497,
"grad_norm": 1.5809892282131641,
"learning_rate": 4.980334003193632e-07,
"loss": 1.672,
"step": 1160
},
{
"epoch": 0.04359310637826255,
"grad_norm": 1.4895408854624943,
"learning_rate": 4.979638967008983e-07,
"loss": 1.6637,
"step": 1180
},
{
"epoch": 0.044331972588063616,
"grad_norm": 1.6294048072820626,
"learning_rate": 4.978931916986874e-07,
"loss": 1.6604,
"step": 1200
},
{
"epoch": 0.04507083879786468,
"grad_norm": 2.156396711607377,
"learning_rate": 4.978212856937062e-07,
"loss": 1.678,
"step": 1220
},
{
"epoch": 0.045809705007665735,
"grad_norm": 1.6010232675443634,
"learning_rate": 4.977481790734016e-07,
"loss": 1.6922,
"step": 1240
},
{
"epoch": 0.0465485712174668,
"grad_norm": 1.4024504403885678,
"learning_rate": 4.9767387223169e-07,
"loss": 1.6538,
"step": 1260
},
{
"epoch": 0.047287437427267855,
"grad_norm": 1.711902948101267,
"learning_rate": 4.975983655689547e-07,
"loss": 1.6844,
"step": 1280
},
{
"epoch": 0.04802630363706892,
"grad_norm": 1.572788133497536,
"learning_rate": 4.975216594920441e-07,
"loss": 1.6773,
"step": 1300
},
{
"epoch": 0.04876516984686998,
"grad_norm": 1.5865129712420638,
"learning_rate": 4.974437544142691e-07,
"loss": 1.6794,
"step": 1320
},
{
"epoch": 0.04950403605667104,
"grad_norm": 1.5690014017001472,
"learning_rate": 4.973646507554012e-07,
"loss": 1.7073,
"step": 1340
},
{
"epoch": 0.0502429022664721,
"grad_norm": 1.5937164094810738,
"learning_rate": 4.972843489416702e-07,
"loss": 1.6958,
"step": 1360
},
{
"epoch": 0.05098176847627316,
"grad_norm": 1.5264901796499448,
"learning_rate": 4.972028494057619e-07,
"loss": 1.6452,
"step": 1380
},
{
"epoch": 0.05172063468607422,
"grad_norm": 1.7268277514753942,
"learning_rate": 4.971201525868155e-07,
"loss": 1.6944,
"step": 1400
},
{
"epoch": 0.052459500895875276,
"grad_norm": 1.6746090286211905,
"learning_rate": 4.970362589304216e-07,
"loss": 1.6621,
"step": 1420
},
{
"epoch": 0.05319836710567634,
"grad_norm": 1.5009694467436718,
"learning_rate": 4.969511688886198e-07,
"loss": 1.6797,
"step": 1440
},
{
"epoch": 0.0539372333154774,
"grad_norm": 1.4662106712988012,
"learning_rate": 4.968648829198958e-07,
"loss": 1.6664,
"step": 1460
},
{
"epoch": 0.05467609952527846,
"grad_norm": 1.5749221565087543,
"learning_rate": 4.967774014891796e-07,
"loss": 1.7086,
"step": 1480
},
{
"epoch": 0.05541496573507952,
"grad_norm": 1.482093097261866,
"learning_rate": 4.966887250678421e-07,
"loss": 1.7089,
"step": 1500
},
{
"epoch": 0.05615383194488058,
"grad_norm": 1.520435320135513,
"learning_rate": 4.965988541336936e-07,
"loss": 1.6734,
"step": 1520
},
{
"epoch": 0.05689269815468164,
"grad_norm": 1.5553430296104012,
"learning_rate": 4.965077891709807e-07,
"loss": 1.697,
"step": 1540
},
{
"epoch": 0.057631564364482704,
"grad_norm": 1.3543792401342896,
"learning_rate": 4.964155306703835e-07,
"loss": 1.6997,
"step": 1560
},
{
"epoch": 0.05837043057428376,
"grad_norm": 1.575031153656866,
"learning_rate": 4.963220791290132e-07,
"loss": 1.6916,
"step": 1580
},
{
"epoch": 0.059109296784084824,
"grad_norm": 1.669401673230416,
"learning_rate": 4.962274350504096e-07,
"loss": 1.7042,
"step": 1600
},
{
"epoch": 0.05984816299388588,
"grad_norm": 1.5212881661869584,
"learning_rate": 4.961315989445378e-07,
"loss": 1.667,
"step": 1620
},
{
"epoch": 0.06058702920368694,
"grad_norm": 1.5762244565376538,
"learning_rate": 4.960345713277863e-07,
"loss": 1.6342,
"step": 1640
},
{
"epoch": 0.061325895413488006,
"grad_norm": 1.5691787785228513,
"learning_rate": 4.959363527229634e-07,
"loss": 1.6525,
"step": 1660
},
{
"epoch": 0.06206476162328906,
"grad_norm": 1.6437209138688083,
"learning_rate": 4.958369436592948e-07,
"loss": 1.6769,
"step": 1680
},
{
"epoch": 0.06280362783309013,
"grad_norm": 1.553888728962609,
"learning_rate": 4.957363446724208e-07,
"loss": 1.6924,
"step": 1700
},
{
"epoch": 0.06354249404289118,
"grad_norm": 1.5365170274491486,
"learning_rate": 4.956345563043933e-07,
"loss": 1.6894,
"step": 1720
},
{
"epoch": 0.06428136025269224,
"grad_norm": 1.5197453947387185,
"learning_rate": 4.955315791036727e-07,
"loss": 1.6758,
"step": 1740
},
{
"epoch": 0.06502022646249331,
"grad_norm": 2.464391338240643,
"learning_rate": 4.954274136251251e-07,
"loss": 1.6332,
"step": 1760
},
{
"epoch": 0.06575909267229436,
"grad_norm": 1.6394501426047832,
"learning_rate": 4.953220604300198e-07,
"loss": 1.6879,
"step": 1780
},
{
"epoch": 0.06649795888209542,
"grad_norm": 1.6810218422818062,
"learning_rate": 4.952155200860251e-07,
"loss": 1.6724,
"step": 1800
},
{
"epoch": 0.06723682509189649,
"grad_norm": 1.6917107156739108,
"learning_rate": 4.951077931672067e-07,
"loss": 1.6826,
"step": 1820
},
{
"epoch": 0.06797569130169755,
"grad_norm": 4.580444011220939,
"learning_rate": 4.949988802540229e-07,
"loss": 1.6581,
"step": 1840
},
{
"epoch": 0.0687145575114986,
"grad_norm": 1.529682024928737,
"learning_rate": 4.948887819333236e-07,
"loss": 1.6649,
"step": 1860
},
{
"epoch": 0.06945342372129966,
"grad_norm": 1.4414816212505979,
"learning_rate": 4.947774987983449e-07,
"loss": 1.6691,
"step": 1880
},
{
"epoch": 0.07019228993110073,
"grad_norm": 1.9799877388187868,
"learning_rate": 4.946650314487077e-07,
"loss": 1.6937,
"step": 1900
},
{
"epoch": 0.07093115614090179,
"grad_norm": 1.4952225950122013,
"learning_rate": 4.945513804904131e-07,
"loss": 1.6798,
"step": 1920
},
{
"epoch": 0.07167002235070284,
"grad_norm": 1.5642244850204086,
"learning_rate": 4.944365465358407e-07,
"loss": 1.6783,
"step": 1940
},
{
"epoch": 0.07240888856050391,
"grad_norm": 1.5048141774546024,
"learning_rate": 4.943205302037432e-07,
"loss": 1.6486,
"step": 1960
},
{
"epoch": 0.07314775477030497,
"grad_norm": 1.4222420311354336,
"learning_rate": 4.942033321192452e-07,
"loss": 1.6868,
"step": 1980
},
{
"epoch": 0.07388662098010602,
"grad_norm": 1.6893784971157513,
"learning_rate": 4.940849529138383e-07,
"loss": 1.6934,
"step": 2000
},
{
"epoch": 0.0746254871899071,
"grad_norm": 1.9605139373755291,
"learning_rate": 4.939653932253786e-07,
"loss": 1.6537,
"step": 2020
},
{
"epoch": 0.07536435339970815,
"grad_norm": 1.6497175696745814,
"learning_rate": 4.938446536980829e-07,
"loss": 1.7022,
"step": 2040
},
{
"epoch": 0.0761032196095092,
"grad_norm": 1.6258237906038047,
"learning_rate": 4.93722734982525e-07,
"loss": 1.6925,
"step": 2060
},
{
"epoch": 0.07684208581931026,
"grad_norm": 1.5236446480879742,
"learning_rate": 4.935996377356326e-07,
"loss": 1.6418,
"step": 2080
},
{
"epoch": 0.07758095202911133,
"grad_norm": 1.5958168322212294,
"learning_rate": 4.934753626206837e-07,
"loss": 1.7259,
"step": 2100
},
{
"epoch": 0.07831981823891239,
"grad_norm": 2.45551144657574,
"learning_rate": 4.933499103073029e-07,
"loss": 1.7141,
"step": 2120
},
{
"epoch": 0.07905868444871345,
"grad_norm": 1.519298595383626,
"learning_rate": 4.932232814714576e-07,
"loss": 1.6712,
"step": 2140
},
{
"epoch": 0.07979755065851452,
"grad_norm": 1.6278461520508305,
"learning_rate": 4.930954767954551e-07,
"loss": 1.6784,
"step": 2160
},
{
"epoch": 0.08053641686831557,
"grad_norm": 1.5199514109499472,
"learning_rate": 4.92966496967938e-07,
"loss": 1.6842,
"step": 2180
},
{
"epoch": 0.08127528307811663,
"grad_norm": 1.8268617588637115,
"learning_rate": 4.928363426838808e-07,
"loss": 1.714,
"step": 2200
},
{
"epoch": 0.08201414928791768,
"grad_norm": 1.671569089879459,
"learning_rate": 4.927050146445867e-07,
"loss": 1.6693,
"step": 2220
},
{
"epoch": 0.08275301549771875,
"grad_norm": 1.4546842764348067,
"learning_rate": 4.92572513557683e-07,
"loss": 1.6724,
"step": 2240
},
{
"epoch": 0.08349188170751981,
"grad_norm": 1.5602541654624753,
"learning_rate": 4.924388401371179e-07,
"loss": 1.6715,
"step": 2260
},
{
"epoch": 0.08423074791732087,
"grad_norm": 1.6408350929881408,
"learning_rate": 4.923039951031562e-07,
"loss": 1.6538,
"step": 2280
},
{
"epoch": 0.08496961412712194,
"grad_norm": 1.3547990923859226,
"learning_rate": 4.921679791823761e-07,
"loss": 1.6639,
"step": 2300
},
{
"epoch": 0.08570848033692299,
"grad_norm": 1.536988279196407,
"learning_rate": 4.92030793107664e-07,
"loss": 1.6709,
"step": 2320
},
{
"epoch": 0.08644734654672405,
"grad_norm": 1.4484585256339257,
"learning_rate": 4.918924376182121e-07,
"loss": 1.6517,
"step": 2340
},
{
"epoch": 0.0871862127565251,
"grad_norm": 1.4965077209050879,
"learning_rate": 4.917529134595135e-07,
"loss": 1.6956,
"step": 2360
},
{
"epoch": 0.08792507896632618,
"grad_norm": 1.858590784934109,
"learning_rate": 4.916122213833584e-07,
"loss": 1.6667,
"step": 2380
},
{
"epoch": 0.08866394517612723,
"grad_norm": 1.6845854426029852,
"learning_rate": 4.914703621478297e-07,
"loss": 1.6392,
"step": 2400
},
{
"epoch": 0.08940281138592829,
"grad_norm": 1.479499322660105,
"learning_rate": 4.913273365172998e-07,
"loss": 1.6323,
"step": 2420
},
{
"epoch": 0.09014167759572936,
"grad_norm": 1.4475363138688357,
"learning_rate": 4.911831452624253e-07,
"loss": 1.655,
"step": 2440
},
{
"epoch": 0.09088054380553041,
"grad_norm": 1.4410524177419237,
"learning_rate": 4.910377891601439e-07,
"loss": 1.6977,
"step": 2460
},
{
"epoch": 0.09161941001533147,
"grad_norm": 1.512362377905178,
"learning_rate": 4.908912689936697e-07,
"loss": 1.6716,
"step": 2480
},
{
"epoch": 0.09235827622513254,
"grad_norm": 1.8697344841744916,
"learning_rate": 4.90743585552489e-07,
"loss": 1.6694,
"step": 2500
},
{
"epoch": 0.0930971424349336,
"grad_norm": 1.5406884179833267,
"learning_rate": 4.905947396323561e-07,
"loss": 1.7013,
"step": 2520
},
{
"epoch": 0.09383600864473465,
"grad_norm": 1.883868312926782,
"learning_rate": 4.904447320352891e-07,
"loss": 1.6438,
"step": 2540
},
{
"epoch": 0.09457487485453571,
"grad_norm": 1.5063919396389938,
"learning_rate": 4.902935635695655e-07,
"loss": 1.6341,
"step": 2560
},
{
"epoch": 0.09531374106433678,
"grad_norm": 2.5666944465723223,
"learning_rate": 4.901412350497177e-07,
"loss": 1.673,
"step": 2580
},
{
"epoch": 0.09605260727413784,
"grad_norm": 1.6864017944187357,
"learning_rate": 4.899877472965289e-07,
"loss": 1.6532,
"step": 2600
},
{
"epoch": 0.09679147348393889,
"grad_norm": 1.489838376943142,
"learning_rate": 4.898331011370282e-07,
"loss": 1.7006,
"step": 2620
},
{
"epoch": 0.09753033969373996,
"grad_norm": 1.516224235445671,
"learning_rate": 4.896772974044871e-07,
"loss": 1.664,
"step": 2640
},
{
"epoch": 0.09826920590354102,
"grad_norm": 1.4154240383552321,
"learning_rate": 4.895203369384138e-07,
"loss": 1.6749,
"step": 2660
},
{
"epoch": 0.09900807211334207,
"grad_norm": 1.8653781111338754,
"learning_rate": 4.893622205845498e-07,
"loss": 1.6255,
"step": 2680
},
{
"epoch": 0.09974693832314313,
"grad_norm": 1.6154619117281779,
"learning_rate": 4.892029491948642e-07,
"loss": 1.7121,
"step": 2700
},
{
"epoch": 0.1004858045329442,
"grad_norm": 1.6240732568528131,
"learning_rate": 4.890425236275502e-07,
"loss": 1.687,
"step": 2720
},
{
"epoch": 0.10122467074274526,
"grad_norm": 1.459326292962488,
"learning_rate": 4.888809447470195e-07,
"loss": 1.5967,
"step": 2740
},
{
"epoch": 0.10196353695254631,
"grad_norm": 1.7582112558661527,
"learning_rate": 4.887182134238989e-07,
"loss": 1.7297,
"step": 2760
},
{
"epoch": 0.10270240316234738,
"grad_norm": 1.7154567295687058,
"learning_rate": 4.885543305350241e-07,
"loss": 1.6881,
"step": 2780
},
{
"epoch": 0.10344126937214844,
"grad_norm": 1.68486225816754,
"learning_rate": 4.88389296963436e-07,
"loss": 1.6351,
"step": 2800
},
{
"epoch": 0.1041801355819495,
"grad_norm": 1.4658940210533413,
"learning_rate": 4.882231135983757e-07,
"loss": 1.6584,
"step": 2820
},
{
"epoch": 0.10491900179175055,
"grad_norm": 1.3967168353938462,
"learning_rate": 4.880557813352796e-07,
"loss": 1.6811,
"step": 2840
},
{
"epoch": 0.10565786800155162,
"grad_norm": 1.6648778148188543,
"learning_rate": 4.878873010757747e-07,
"loss": 1.6447,
"step": 2860
},
{
"epoch": 0.10639673421135268,
"grad_norm": 1.6827360384506134,
"learning_rate": 4.877176737276736e-07,
"loss": 1.6671,
"step": 2880
},
{
"epoch": 0.10713560042115373,
"grad_norm": 1.6125148782802161,
"learning_rate": 4.875469002049697e-07,
"loss": 1.6611,
"step": 2900
},
{
"epoch": 0.1078744666309548,
"grad_norm": 3.1640996826552925,
"learning_rate": 4.873749814278325e-07,
"loss": 1.6914,
"step": 2920
},
{
"epoch": 0.10861333284075586,
"grad_norm": 1.5756821875718683,
"learning_rate": 4.87201918322602e-07,
"loss": 1.6891,
"step": 2940
},
{
"epoch": 0.10935219905055692,
"grad_norm": 1.508384464413988,
"learning_rate": 4.870277118217844e-07,
"loss": 1.6765,
"step": 2960
},
{
"epoch": 0.11009106526035799,
"grad_norm": 1.8943879400046142,
"learning_rate": 4.868523628640468e-07,
"loss": 1.6718,
"step": 2980
},
{
"epoch": 0.11082993147015904,
"grad_norm": 1.5476264075937183,
"learning_rate": 4.86675872394212e-07,
"loss": 1.6384,
"step": 3000
},
{
"epoch": 0.1115687976799601,
"grad_norm": 1.7120101891654744,
"learning_rate": 4.864982413632537e-07,
"loss": 1.66,
"step": 3020
},
{
"epoch": 0.11230766388976116,
"grad_norm": 1.8834789513548644,
"learning_rate": 4.863284363107887e-07,
"loss": 1.6453,
"step": 3040
},
{
"epoch": 0.11304653009956223,
"grad_norm": 1.6393861847878763,
"learning_rate": 4.861485839441465e-07,
"loss": 1.6914,
"step": 3060
},
{
"epoch": 0.11378539630936328,
"grad_norm": 1.548505894649462,
"learning_rate": 4.859675938575391e-07,
"loss": 1.6513,
"step": 3080
},
{
"epoch": 0.11452426251916434,
"grad_norm": 1.7314160899998987,
"learning_rate": 4.857854670261854e-07,
"loss": 1.6652,
"step": 3100
},
{
"epoch": 0.11526312872896541,
"grad_norm": 1.6255645061866926,
"learning_rate": 4.856022044314289e-07,
"loss": 1.6825,
"step": 3120
},
{
"epoch": 0.11600199493876646,
"grad_norm": 1.7047082936180922,
"learning_rate": 4.854178070607332e-07,
"loss": 1.6571,
"step": 3140
},
{
"epoch": 0.11674086114856752,
"grad_norm": 1.5937691951508997,
"learning_rate": 4.852322759076762e-07,
"loss": 1.6796,
"step": 3160
},
{
"epoch": 0.11747972735836858,
"grad_norm": 1.5581038553350461,
"learning_rate": 4.850456119719448e-07,
"loss": 1.6237,
"step": 3180
},
{
"epoch": 0.11821859356816965,
"grad_norm": 1.5319442885899253,
"learning_rate": 4.848578162593298e-07,
"loss": 1.6507,
"step": 3200
},
{
"epoch": 0.1189574597779707,
"grad_norm": 1.7452294652307094,
"learning_rate": 4.846783629455789e-07,
"loss": 1.6334,
"step": 3220
},
{
"epoch": 0.11969632598777176,
"grad_norm": 1.6423452527210813,
"learning_rate": 4.844883631840362e-07,
"loss": 1.6591,
"step": 3240
},
{
"epoch": 0.12043519219757283,
"grad_norm": 2.3138017105742277,
"learning_rate": 4.842972346482019e-07,
"loss": 1.6693,
"step": 3260
},
{
"epoch": 0.12117405840737389,
"grad_norm": 1.5077648756938484,
"learning_rate": 4.841049783679233e-07,
"loss": 1.6486,
"step": 3280
},
{
"epoch": 0.12191292461717494,
"grad_norm": 1.4711190983794034,
"learning_rate": 4.839115953791238e-07,
"loss": 1.6881,
"step": 3300
},
{
"epoch": 0.12265179082697601,
"grad_norm": 4.058044242916531,
"learning_rate": 4.837170867237982e-07,
"loss": 1.6469,
"step": 3320
},
{
"epoch": 0.12339065703677707,
"grad_norm": 1.8109757223352017,
"learning_rate": 4.835214534500064e-07,
"loss": 1.6912,
"step": 3340
},
{
"epoch": 0.12412952324657812,
"grad_norm": 1.5112894099167034,
"learning_rate": 4.83324696611868e-07,
"loss": 1.6452,
"step": 3360
},
{
"epoch": 0.12486838945637918,
"grad_norm": 1.7532693818843224,
"learning_rate": 4.83126817269557e-07,
"loss": 1.6158,
"step": 3380
},
{
"epoch": 0.12560725566618025,
"grad_norm": 1.7433921276878421,
"learning_rate": 4.829278164892951e-07,
"loss": 1.6684,
"step": 3400
},
{
"epoch": 0.1263461218759813,
"grad_norm": 1.499971805431214,
"learning_rate": 4.827276953433474e-07,
"loss": 1.6596,
"step": 3420
},
{
"epoch": 0.12708498808578236,
"grad_norm": 1.5392331224579805,
"learning_rate": 4.825264549100149e-07,
"loss": 1.6411,
"step": 3440
},
{
"epoch": 0.12782385429558343,
"grad_norm": 1.5289257318137572,
"learning_rate": 4.823240962736303e-07,
"loss": 1.6759,
"step": 3460
},
{
"epoch": 0.12856272050538448,
"grad_norm": 1.5034439532563377,
"learning_rate": 4.82120620524551e-07,
"loss": 1.6405,
"step": 3480
},
{
"epoch": 0.12930158671518555,
"grad_norm": 1.4978715454221503,
"learning_rate": 4.81916028759154e-07,
"loss": 1.6732,
"step": 3500
},
{
"epoch": 0.13004045292498662,
"grad_norm": 1.45790640802375,
"learning_rate": 4.817103220798296e-07,
"loss": 1.6649,
"step": 3520
},
{
"epoch": 0.13077931913478766,
"grad_norm": 1.5322708095688835,
"learning_rate": 4.815035015949754e-07,
"loss": 1.6588,
"step": 3540
},
{
"epoch": 0.13151818534458873,
"grad_norm": 1.540513558070265,
"learning_rate": 4.812955684189904e-07,
"loss": 1.6718,
"step": 3560
},
{
"epoch": 0.1322570515543898,
"grad_norm": 1.4880225438470713,
"learning_rate": 4.810865236722692e-07,
"loss": 1.6313,
"step": 3580
},
{
"epoch": 0.13299591776419084,
"grad_norm": 1.4919528959671158,
"learning_rate": 4.808763684811959e-07,
"loss": 1.62,
"step": 3600
},
{
"epoch": 0.1337347839739919,
"grad_norm": 1.6101194590431924,
"learning_rate": 4.806651039781377e-07,
"loss": 1.6933,
"step": 3620
},
{
"epoch": 0.13447365018379298,
"grad_norm": 1.5722737602103793,
"learning_rate": 4.804527313014392e-07,
"loss": 1.6555,
"step": 3640
},
{
"epoch": 0.13521251639359402,
"grad_norm": 1.647937670204523,
"learning_rate": 4.802392515954161e-07,
"loss": 1.6561,
"step": 3660
},
{
"epoch": 0.1359513826033951,
"grad_norm": 1.6527027343392149,
"learning_rate": 4.80024666010349e-07,
"loss": 1.6747,
"step": 3680
},
{
"epoch": 0.13669024881319616,
"grad_norm": 1.596151179002379,
"learning_rate": 4.798089757024773e-07,
"loss": 1.6602,
"step": 3700
},
{
"epoch": 0.1374291150229972,
"grad_norm": 1.6359785367644735,
"learning_rate": 4.795921818339928e-07,
"loss": 1.7041,
"step": 3720
},
{
"epoch": 0.13816798123279828,
"grad_norm": 1.5303851327334592,
"learning_rate": 4.793742855730337e-07,
"loss": 1.6921,
"step": 3740
},
{
"epoch": 0.13890684744259932,
"grad_norm": 1.552833624004378,
"learning_rate": 4.79155288093678e-07,
"loss": 1.6646,
"step": 3760
},
{
"epoch": 0.1396457136524004,
"grad_norm": 1.5328749650552398,
"learning_rate": 4.789351905759377e-07,
"loss": 1.671,
"step": 3780
},
{
"epoch": 0.14038457986220146,
"grad_norm": 1.4637618775535644,
"learning_rate": 4.787139942057513e-07,
"loss": 1.6826,
"step": 3800
},
{
"epoch": 0.1411234460720025,
"grad_norm": 1.456698106912096,
"learning_rate": 4.784917001749791e-07,
"loss": 1.7079,
"step": 3820
},
{
"epoch": 0.14186231228180357,
"grad_norm": 1.4778158837226694,
"learning_rate": 4.782683096813954e-07,
"loss": 1.6673,
"step": 3840
},
{
"epoch": 0.14260117849160464,
"grad_norm": 1.426517743754919,
"learning_rate": 4.780438239286824e-07,
"loss": 1.6327,
"step": 3860
},
{
"epoch": 0.14334004470140568,
"grad_norm": 1.7717097070454197,
"learning_rate": 4.77818244126424e-07,
"loss": 1.6577,
"step": 3880
},
{
"epoch": 0.14407891091120675,
"grad_norm": 1.6008901431845195,
"learning_rate": 4.775915714900992e-07,
"loss": 1.6493,
"step": 3900
},
{
"epoch": 0.14481777712100782,
"grad_norm": 1.5377457534191892,
"learning_rate": 4.773638072410752e-07,
"loss": 1.6668,
"step": 3920
},
{
"epoch": 0.14555664333080887,
"grad_norm": 1.9114280227385299,
"learning_rate": 4.771349526066014e-07,
"loss": 1.6925,
"step": 3940
},
{
"epoch": 0.14629550954060994,
"grad_norm": 1.803899924444919,
"learning_rate": 4.769050088198021e-07,
"loss": 1.6775,
"step": 3960
},
{
"epoch": 0.147034375750411,
"grad_norm": 1.5100721777601815,
"learning_rate": 4.7667397711967037e-07,
"loss": 1.6181,
"step": 3980
},
{
"epoch": 0.14777324196021205,
"grad_norm": 1.4720945445766893,
"learning_rate": 4.764418587510615e-07,
"loss": 1.6607,
"step": 4000
},
{
"epoch": 0.14851210817001312,
"grad_norm": 1.569266687535282,
"learning_rate": 4.7620865496468544e-07,
"loss": 1.6829,
"step": 4020
},
{
"epoch": 0.1492509743798142,
"grad_norm": 1.5799540185979453,
"learning_rate": 4.7597436701710107e-07,
"loss": 1.6483,
"step": 4040
},
{
"epoch": 0.14998984058961523,
"grad_norm": 1.5804308168544465,
"learning_rate": 4.75738996170709e-07,
"loss": 1.6924,
"step": 4060
},
{
"epoch": 0.1507287067994163,
"grad_norm": 1.523398154876467,
"learning_rate": 4.7550254369374455e-07,
"loss": 1.6519,
"step": 4080
},
{
"epoch": 0.15146757300921734,
"grad_norm": 1.4233865381689017,
"learning_rate": 4.752650108602712e-07,
"loss": 1.664,
"step": 4100
},
{
"epoch": 0.1522064392190184,
"grad_norm": 1.512734811893487,
"learning_rate": 4.7502639895017366e-07,
"loss": 1.7103,
"step": 4120
},
{
"epoch": 0.15294530542881948,
"grad_norm": 1.5630800949377466,
"learning_rate": 4.747867092491511e-07,
"loss": 1.6531,
"step": 4140
},
{
"epoch": 0.15368417163862053,
"grad_norm": 1.470144612554125,
"learning_rate": 4.7454594304870977e-07,
"loss": 1.6725,
"step": 4160
},
{
"epoch": 0.1544230378484216,
"grad_norm": 1.6569477682445206,
"learning_rate": 4.743041016461567e-07,
"loss": 1.6998,
"step": 4180
},
{
"epoch": 0.15516190405822267,
"grad_norm": 1.7296103801240361,
"learning_rate": 4.7406118634459223e-07,
"loss": 1.6613,
"step": 4200
},
{
"epoch": 0.1559007702680237,
"grad_norm": 1.6463696442561442,
"learning_rate": 4.738171984529031e-07,
"loss": 1.6575,
"step": 4220
},
{
"epoch": 0.15663963647782478,
"grad_norm": 1.545869558479261,
"learning_rate": 4.7357213928575546e-07,
"loss": 1.6741,
"step": 4240
},
{
"epoch": 0.15737850268762585,
"grad_norm": 1.7796493147352748,
"learning_rate": 4.7332601016358773e-07,
"loss": 1.7046,
"step": 4260
},
{
"epoch": 0.1581173688974269,
"grad_norm": 1.5172414763731175,
"learning_rate": 4.7307881241260365e-07,
"loss": 1.6365,
"step": 4280
},
{
"epoch": 0.15885623510722796,
"grad_norm": 1.5470321540163943,
"learning_rate": 4.7283054736476474e-07,
"loss": 1.6844,
"step": 4300
},
{
"epoch": 0.15959510131702903,
"grad_norm": 1.5074962263335083,
"learning_rate": 4.725812163577835e-07,
"loss": 1.6683,
"step": 4320
},
{
"epoch": 0.16033396752683007,
"grad_norm": 1.5931587963454854,
"learning_rate": 4.723308207351162e-07,
"loss": 1.6972,
"step": 4340
},
{
"epoch": 0.16107283373663114,
"grad_norm": 1.4335946997211053,
"learning_rate": 4.720793618459553e-07,
"loss": 1.6182,
"step": 4360
},
{
"epoch": 0.1618116999464322,
"grad_norm": 1.9207877719443267,
"learning_rate": 4.718268410452226e-07,
"loss": 1.6777,
"step": 4380
},
{
"epoch": 0.16255056615623326,
"grad_norm": 1.4490578223410473,
"learning_rate": 4.7157325969356143e-07,
"loss": 1.6911,
"step": 4400
},
{
"epoch": 0.16328943236603433,
"grad_norm": 1.593847776562296,
"learning_rate": 4.713186191573301e-07,
"loss": 1.6927,
"step": 4420
},
{
"epoch": 0.16402829857583537,
"grad_norm": 1.4739123126868083,
"learning_rate": 4.7106292080859363e-07,
"loss": 1.6492,
"step": 4440
},
{
"epoch": 0.16476716478563644,
"grad_norm": 1.424511297941709,
"learning_rate": 4.7080616602511705e-07,
"loss": 1.6847,
"step": 4460
},
{
"epoch": 0.1655060309954375,
"grad_norm": 1.6007681786366288,
"learning_rate": 4.705483561903576e-07,
"loss": 1.662,
"step": 4480
},
{
"epoch": 0.16624489720523855,
"grad_norm": 1.55690540989863,
"learning_rate": 4.702894926934573e-07,
"loss": 1.6851,
"step": 4500
},
{
"epoch": 0.16698376341503962,
"grad_norm": 2.0423474735881926,
"learning_rate": 4.700295769292359e-07,
"loss": 1.6604,
"step": 4520
},
{
"epoch": 0.1677226296248407,
"grad_norm": 1.453355289637868,
"learning_rate": 4.6976861029818264e-07,
"loss": 1.6842,
"step": 4540
},
{
"epoch": 0.16846149583464173,
"grad_norm": 1.5505160972568328,
"learning_rate": 4.695065942064494e-07,
"loss": 1.6804,
"step": 4560
},
{
"epoch": 0.1692003620444428,
"grad_norm": 1.7608287873846744,
"learning_rate": 4.6924353006584244e-07,
"loss": 1.6595,
"step": 4580
},
{
"epoch": 0.16993922825424387,
"grad_norm": 1.4685283699391545,
"learning_rate": 4.689794192938156e-07,
"loss": 1.6264,
"step": 4600
},
{
"epoch": 0.17067809446404492,
"grad_norm": 1.7781661683868824,
"learning_rate": 4.687142633134619e-07,
"loss": 1.6875,
"step": 4620
},
{
"epoch": 0.17141696067384599,
"grad_norm": 1.6196809334292608,
"learning_rate": 4.6844806355350623e-07,
"loss": 1.6753,
"step": 4640
},
{
"epoch": 0.17215582688364706,
"grad_norm": 1.6293152376567321,
"learning_rate": 4.6818082144829787e-07,
"loss": 1.6665,
"step": 4660
},
{
"epoch": 0.1728946930934481,
"grad_norm": 1.510069163173277,
"learning_rate": 4.6791253843780217e-07,
"loss": 1.6697,
"step": 4680
},
{
"epoch": 0.17363355930324917,
"grad_norm": 1.4471973015401869,
"learning_rate": 4.676432159675933e-07,
"loss": 1.6806,
"step": 4700
},
{
"epoch": 0.1743724255130502,
"grad_norm": 1.7753201121195747,
"learning_rate": 4.6737285548884655e-07,
"loss": 1.6935,
"step": 4720
},
{
"epoch": 0.17511129172285128,
"grad_norm": 1.5222859899502188,
"learning_rate": 4.671014584583296e-07,
"loss": 1.6664,
"step": 4740
},
{
"epoch": 0.17585015793265235,
"grad_norm": 1.4892529478692567,
"learning_rate": 4.668290263383959e-07,
"loss": 1.6669,
"step": 4760
},
{
"epoch": 0.1765890241424534,
"grad_norm": 1.5841443455470228,
"learning_rate": 4.66555560596976e-07,
"loss": 1.6419,
"step": 4780
},
{
"epoch": 0.17732789035225446,
"grad_norm": 1.5264328160932443,
"learning_rate": 4.6628106270757e-07,
"loss": 1.6642,
"step": 4800
},
{
"epoch": 0.17806675656205553,
"grad_norm": 1.6887371299004348,
"learning_rate": 4.6600553414923913e-07,
"loss": 1.6387,
"step": 4820
},
{
"epoch": 0.17880562277185658,
"grad_norm": 1.4594422560615166,
"learning_rate": 4.657289764065985e-07,
"loss": 1.6493,
"step": 4840
},
{
"epoch": 0.17954448898165765,
"grad_norm": 1.6615232385858325,
"learning_rate": 4.6545139096980846e-07,
"loss": 1.6312,
"step": 4860
},
{
"epoch": 0.18028335519145872,
"grad_norm": 1.4161658999634517,
"learning_rate": 4.651727793345669e-07,
"loss": 1.687,
"step": 4880
},
{
"epoch": 0.18102222140125976,
"grad_norm": 1.4750799503852594,
"learning_rate": 4.6489314300210117e-07,
"loss": 1.6579,
"step": 4900
},
{
"epoch": 0.18176108761106083,
"grad_norm": 1.5823630581751142,
"learning_rate": 4.646124834791598e-07,
"loss": 1.6974,
"step": 4920
},
{
"epoch": 0.1824999538208619,
"grad_norm": 1.5953496527857909,
"learning_rate": 4.6433080227800476e-07,
"loss": 1.6349,
"step": 4940
},
{
"epoch": 0.18323882003066294,
"grad_norm": 1.8088958779925088,
"learning_rate": 4.640481009164028e-07,
"loss": 1.7021,
"step": 4960
},
{
"epoch": 0.183977686240464,
"grad_norm": 1.6985722383661672,
"learning_rate": 4.6376438091761776e-07,
"loss": 1.6835,
"step": 4980
},
{
"epoch": 0.18471655245026508,
"grad_norm": 1.5740586459999972,
"learning_rate": 4.63479643810402e-07,
"loss": 1.6778,
"step": 5000
},
{
"epoch": 0.18545541866006612,
"grad_norm": 1.5576615822168314,
"learning_rate": 4.631938911289884e-07,
"loss": 1.6432,
"step": 5020
},
{
"epoch": 0.1861942848698672,
"grad_norm": 1.4882435243374539,
"learning_rate": 4.629071244130818e-07,
"loss": 1.697,
"step": 5040
},
{
"epoch": 0.18693315107966824,
"grad_norm": 1.7414218611909407,
"learning_rate": 4.6261934520785135e-07,
"loss": 1.6472,
"step": 5060
},
{
"epoch": 0.1876720172894693,
"grad_norm": 1.5111215790202166,
"learning_rate": 4.623305550639212e-07,
"loss": 1.6814,
"step": 5080
},
{
"epoch": 0.18841088349927038,
"grad_norm": 1.4998930010938694,
"learning_rate": 4.6204075553736317e-07,
"loss": 1.6965,
"step": 5100
},
{
"epoch": 0.18914974970907142,
"grad_norm": 1.6166379161449234,
"learning_rate": 4.617499481896874e-07,
"loss": 1.6367,
"step": 5120
},
{
"epoch": 0.1898886159188725,
"grad_norm": 1.564060473042759,
"learning_rate": 4.6145813458783484e-07,
"loss": 1.6404,
"step": 5140
},
{
"epoch": 0.19062748212867356,
"grad_norm": 1.5498475055243737,
"learning_rate": 4.611653163041681e-07,
"loss": 1.64,
"step": 5160
},
{
"epoch": 0.1913663483384746,
"grad_norm": 1.6108037998097682,
"learning_rate": 4.6087149491646343e-07,
"loss": 1.699,
"step": 5180
},
{
"epoch": 0.19210521454827567,
"grad_norm": 1.6995541712978521,
"learning_rate": 4.6057667200790203e-07,
"loss": 1.6546,
"step": 5200
},
{
"epoch": 0.19284408075807674,
"grad_norm": 1.499037507366822,
"learning_rate": 4.6028084916706147e-07,
"loss": 1.6083,
"step": 5220
},
{
"epoch": 0.19358294696787778,
"grad_norm": 1.5172594570626625,
"learning_rate": 4.5998402798790704e-07,
"loss": 1.6699,
"step": 5240
},
{
"epoch": 0.19432181317767885,
"grad_norm": 1.4963740648019974,
"learning_rate": 4.5968621006978373e-07,
"loss": 1.6898,
"step": 5260
},
{
"epoch": 0.19506067938747992,
"grad_norm": 2.566805183937073,
"learning_rate": 4.5938739701740686e-07,
"loss": 1.6694,
"step": 5280
},
{
"epoch": 0.19579954559728097,
"grad_norm": 1.4540566793967926,
"learning_rate": 4.590875904408539e-07,
"loss": 1.6692,
"step": 5300
},
{
"epoch": 0.19653841180708204,
"grad_norm": 3.9730656922103447,
"learning_rate": 4.587867919555557e-07,
"loss": 1.6625,
"step": 5320
},
{
"epoch": 0.1972772780168831,
"grad_norm": 1.5142078546698041,
"learning_rate": 4.5848500318228774e-07,
"loss": 1.6654,
"step": 5340
},
{
"epoch": 0.19801614422668415,
"grad_norm": 1.7032492720795371,
"learning_rate": 4.5818222574716127e-07,
"loss": 1.7022,
"step": 5360
},
{
"epoch": 0.19875501043648522,
"grad_norm": 1.554191757548726,
"learning_rate": 4.578784612816149e-07,
"loss": 1.6811,
"step": 5380
},
{
"epoch": 0.19949387664628626,
"grad_norm": 1.4929225978552914,
"learning_rate": 4.5758897229313755e-07,
"loss": 1.6509,
"step": 5400
},
{
"epoch": 0.20023274285608733,
"grad_norm": 1.4628893559215694,
"learning_rate": 4.5728328783083036e-07,
"loss": 1.7302,
"step": 5420
},
{
"epoch": 0.2009716090658884,
"grad_norm": 1.493249123165425,
"learning_rate": 4.5699197781569844e-07,
"loss": 1.6383,
"step": 5440
},
{
"epoch": 0.20171047527568944,
"grad_norm": 1.600690331893774,
"learning_rate": 4.5668437961972905e-07,
"loss": 1.6189,
"step": 5460
},
{
"epoch": 0.2024493414854905,
"grad_norm": 1.464802503893095,
"learning_rate": 4.5637580246409934e-07,
"loss": 1.65,
"step": 5480
},
{
"epoch": 0.20318820769529158,
"grad_norm": 1.5375722464094912,
"learning_rate": 4.5606624801149797e-07,
"loss": 1.6546,
"step": 5500
},
{
"epoch": 0.20392707390509263,
"grad_norm": 1.5967568446324583,
"learning_rate": 4.5575571792987984e-07,
"loss": 1.6286,
"step": 5520
},
{
"epoch": 0.2046659401148937,
"grad_norm": 1.5568969231756908,
"learning_rate": 4.5544421389245646e-07,
"loss": 1.6278,
"step": 5540
},
{
"epoch": 0.20540480632469477,
"grad_norm": 1.5499607650206735,
"learning_rate": 4.5513173757768746e-07,
"loss": 1.6755,
"step": 5560
},
{
"epoch": 0.2061436725344958,
"grad_norm": 1.4823222337131237,
"learning_rate": 4.548182906692714e-07,
"loss": 1.6661,
"step": 5580
},
{
"epoch": 0.20688253874429688,
"grad_norm": 1.507552555113675,
"learning_rate": 4.5450387485613635e-07,
"loss": 1.6659,
"step": 5600
},
{
"epoch": 0.20762140495409795,
"grad_norm": 1.4811185047336115,
"learning_rate": 4.541884918324313e-07,
"loss": 1.656,
"step": 5620
},
{
"epoch": 0.208360271163899,
"grad_norm": 1.576191450168426,
"learning_rate": 4.538721432975168e-07,
"loss": 1.6875,
"step": 5640
},
{
"epoch": 0.20909913737370006,
"grad_norm": 1.7938635395127402,
"learning_rate": 4.535707194370682e-07,
"loss": 1.6646,
"step": 5660
},
{
"epoch": 0.2098380035835011,
"grad_norm": 1.6552255449585238,
"learning_rate": 4.532524930627744e-07,
"loss": 1.6524,
"step": 5680
},
{
"epoch": 0.21057686979330217,
"grad_norm": 1.7516118506092397,
"learning_rate": 4.5293330622066034e-07,
"loss": 1.6157,
"step": 5700
},
{
"epoch": 0.21131573600310324,
"grad_norm": 1.4545866638005132,
"learning_rate": 4.526131606305823e-07,
"loss": 1.6476,
"step": 5720
},
{
"epoch": 0.2120546022129043,
"grad_norm": 1.6248585310317667,
"learning_rate": 4.5229205801756273e-07,
"loss": 1.6573,
"step": 5740
},
{
"epoch": 0.21279346842270536,
"grad_norm": 1.41925791489552,
"learning_rate": 4.519700001117807e-07,
"loss": 1.6685,
"step": 5760
},
{
"epoch": 0.21353233463250643,
"grad_norm": 1.7509635950883726,
"learning_rate": 4.5164698864856257e-07,
"loss": 1.6812,
"step": 5780
},
{
"epoch": 0.21427120084230747,
"grad_norm": 1.4694228842841779,
"learning_rate": 4.5132302536837273e-07,
"loss": 1.6556,
"step": 5800
},
{
"epoch": 0.21501006705210854,
"grad_norm": 1.553864895417105,
"learning_rate": 4.5099811201680416e-07,
"loss": 1.6883,
"step": 5820
},
{
"epoch": 0.2157489332619096,
"grad_norm": 1.491366651426128,
"learning_rate": 4.506722503445691e-07,
"loss": 1.6613,
"step": 5840
},
{
"epoch": 0.21648779947171065,
"grad_norm": 1.6466798284982602,
"learning_rate": 4.5034544210748953e-07,
"loss": 1.6497,
"step": 5860
},
{
"epoch": 0.21722666568151172,
"grad_norm": 1.4331846976152014,
"learning_rate": 4.5001768906648783e-07,
"loss": 1.6583,
"step": 5880
},
{
"epoch": 0.2179655318913128,
"grad_norm": 2.4779046528418793,
"learning_rate": 4.496889929875771e-07,
"loss": 1.6456,
"step": 5900
},
{
"epoch": 0.21870439810111383,
"grad_norm": 1.6613792185698004,
"learning_rate": 4.493593556418519e-07,
"loss": 1.6876,
"step": 5920
},
{
"epoch": 0.2194432643109149,
"grad_norm": 1.5936970250540041,
"learning_rate": 4.490287788054785e-07,
"loss": 1.6856,
"step": 5940
},
{
"epoch": 0.22018213052071597,
"grad_norm": 1.7774522510719284,
"learning_rate": 4.486972642596852e-07,
"loss": 1.6574,
"step": 5960
},
{
"epoch": 0.22092099673051702,
"grad_norm": 1.5404871158832736,
"learning_rate": 4.483648137907532e-07,
"loss": 1.6637,
"step": 5980
},
{
"epoch": 0.2216598629403181,
"grad_norm": 1.5238762502370415,
"learning_rate": 4.4803142919000645e-07,
"loss": 1.6526,
"step": 6000
},
{
"epoch": 0.22239872915011913,
"grad_norm": 1.4681103098352588,
"learning_rate": 4.4769711225380254e-07,
"loss": 1.6538,
"step": 6020
},
{
"epoch": 0.2231375953599202,
"grad_norm": 1.406496721553823,
"learning_rate": 4.4736186478352225e-07,
"loss": 1.6593,
"step": 6040
},
{
"epoch": 0.22387646156972127,
"grad_norm": 1.6502790317877305,
"learning_rate": 4.4702568858556063e-07,
"loss": 1.6946,
"step": 6060
},
{
"epoch": 0.2246153277795223,
"grad_norm": 1.5544958034860874,
"learning_rate": 4.466885854713169e-07,
"loss": 1.6922,
"step": 6080
},
{
"epoch": 0.22535419398932338,
"grad_norm": 1.35257283259656,
"learning_rate": 4.463505572571847e-07,
"loss": 1.6646,
"step": 6100
},
{
"epoch": 0.22609306019912445,
"grad_norm": 1.624788597950665,
"learning_rate": 4.460116057645422e-07,
"loss": 1.6464,
"step": 6120
},
{
"epoch": 0.2268319264089255,
"grad_norm": 1.5573729356283417,
"learning_rate": 4.4567173281974274e-07,
"loss": 1.6311,
"step": 6140
},
{
"epoch": 0.22757079261872656,
"grad_norm": 1.9342192243430807,
"learning_rate": 4.453309402541044e-07,
"loss": 1.6517,
"step": 6160
},
{
"epoch": 0.22830965882852763,
"grad_norm": 1.6525422759457808,
"learning_rate": 4.4498922990390044e-07,
"loss": 1.6584,
"step": 6180
},
{
"epoch": 0.22904852503832868,
"grad_norm": 1.3709737663427297,
"learning_rate": 4.446466036103493e-07,
"loss": 1.6552,
"step": 6200
},
{
"epoch": 0.22978739124812975,
"grad_norm": 1.7619047090616546,
"learning_rate": 4.44303063219605e-07,
"loss": 1.6515,
"step": 6220
},
{
"epoch": 0.23052625745793082,
"grad_norm": 1.425527104774275,
"learning_rate": 4.439586105827468e-07,
"loss": 1.7082,
"step": 6240
},
{
"epoch": 0.23126512366773186,
"grad_norm": 2.183066565667764,
"learning_rate": 4.436132475557693e-07,
"loss": 1.6457,
"step": 6260
},
{
"epoch": 0.23200398987753293,
"grad_norm": 2.5631189419788103,
"learning_rate": 4.432669759995725e-07,
"loss": 1.6441,
"step": 6280
},
{
"epoch": 0.232742856087334,
"grad_norm": 1.531958854525398,
"learning_rate": 4.4291979777995186e-07,
"loss": 1.6597,
"step": 6300
},
{
"epoch": 0.23348172229713504,
"grad_norm": 1.7334807358971334,
"learning_rate": 4.4257171476758813e-07,
"loss": 1.6189,
"step": 6320
},
{
"epoch": 0.2342205885069361,
"grad_norm": 1.606688663391079,
"learning_rate": 4.422227288380374e-07,
"loss": 1.6635,
"step": 6340
},
{
"epoch": 0.23495945471673715,
"grad_norm": 1.5504111994528522,
"learning_rate": 4.418728418717207e-07,
"loss": 1.6619,
"step": 6360
},
{
"epoch": 0.23569832092653822,
"grad_norm": 1.7059923161913078,
"learning_rate": 4.415220557539142e-07,
"loss": 1.6518,
"step": 6380
},
{
"epoch": 0.2364371871363393,
"grad_norm": 1.5282124634083587,
"learning_rate": 4.411703723747389e-07,
"loss": 1.6281,
"step": 6400
},
{
"epoch": 0.23717605334614034,
"grad_norm": 1.817029524914551,
"learning_rate": 4.4081779362915033e-07,
"loss": 1.6196,
"step": 6420
},
{
"epoch": 0.2379149195559414,
"grad_norm": 1.4287258918617316,
"learning_rate": 4.404643214169288e-07,
"loss": 1.6552,
"step": 6440
},
{
"epoch": 0.23865378576574248,
"grad_norm": 1.4874633967888828,
"learning_rate": 4.4010995764266845e-07,
"loss": 1.6398,
"step": 6460
},
{
"epoch": 0.23939265197554352,
"grad_norm": 1.721122957795877,
"learning_rate": 4.3975470421576764e-07,
"loss": 1.6512,
"step": 6480
},
{
"epoch": 0.2401315181853446,
"grad_norm": 1.523301573082442,
"learning_rate": 4.393985630504183e-07,
"loss": 1.6782,
"step": 6500
},
{
"epoch": 0.24087038439514566,
"grad_norm": 1.4599906341858953,
"learning_rate": 4.390415360655957e-07,
"loss": 1.6396,
"step": 6520
},
{
"epoch": 0.2416092506049467,
"grad_norm": 1.5009190844531946,
"learning_rate": 4.386836251850481e-07,
"loss": 1.648,
"step": 6540
},
{
"epoch": 0.24234811681474777,
"grad_norm": 1.3512220497620588,
"learning_rate": 4.3832483233728654e-07,
"loss": 1.6712,
"step": 6560
},
{
"epoch": 0.24308698302454884,
"grad_norm": 1.6590943419842232,
"learning_rate": 4.379651594555741e-07,
"loss": 1.6174,
"step": 6580
},
{
"epoch": 0.24382584923434988,
"grad_norm": 1.3956181675020793,
"learning_rate": 4.376046084779159e-07,
"loss": 1.6173,
"step": 6600
},
{
"epoch": 0.24456471544415095,
"grad_norm": 1.5798276517321244,
"learning_rate": 4.3724318134704826e-07,
"loss": 1.6419,
"step": 6620
},
{
"epoch": 0.24530358165395202,
"grad_norm": 1.4769865046542814,
"learning_rate": 4.3688088001042866e-07,
"loss": 1.6631,
"step": 6640
},
{
"epoch": 0.24604244786375307,
"grad_norm": 1.7571296905735259,
"learning_rate": 4.3651770642022483e-07,
"loss": 1.6615,
"step": 6660
},
{
"epoch": 0.24678131407355414,
"grad_norm": 10.261084539724488,
"learning_rate": 4.361536625333045e-07,
"loss": 1.6515,
"step": 6680
},
{
"epoch": 0.24752018028335518,
"grad_norm": 2.7070070654149956,
"learning_rate": 4.3578875031122466e-07,
"loss": 1.6584,
"step": 6700
},
{
"epoch": 0.24825904649315625,
"grad_norm": 1.54607876926978,
"learning_rate": 4.3542297172022126e-07,
"loss": 1.6517,
"step": 6720
},
{
"epoch": 0.24899791270295732,
"grad_norm": 1.3861037085930092,
"learning_rate": 4.3505632873119844e-07,
"loss": 1.6686,
"step": 6740
},
{
"epoch": 0.24973677891275836,
"grad_norm": 1.4161848471548175,
"learning_rate": 4.346888233197178e-07,
"loss": 1.6449,
"step": 6760
},
{
"epoch": 0.25047564512255943,
"grad_norm": 1.9634719417599906,
"learning_rate": 4.343204574659878e-07,
"loss": 1.6586,
"step": 6780
},
{
"epoch": 0.2512145113323605,
"grad_norm": 2.2362709149394835,
"learning_rate": 4.339512331548535e-07,
"loss": 1.6481,
"step": 6800
},
{
"epoch": 0.25195337754216157,
"grad_norm": 2.435262162446439,
"learning_rate": 4.335811523757855e-07,
"loss": 1.6751,
"step": 6820
},
{
"epoch": 0.2526922437519626,
"grad_norm": 1.4440630152259213,
"learning_rate": 4.3321021712286874e-07,
"loss": 1.6865,
"step": 6840
},
{
"epoch": 0.25343110996176366,
"grad_norm": 1.6572017188801809,
"learning_rate": 4.3283842939479297e-07,
"loss": 1.6874,
"step": 6860
},
{
"epoch": 0.2541699761715647,
"grad_norm": 1.6358091879473202,
"learning_rate": 4.3246579119484086e-07,
"loss": 1.6442,
"step": 6880
},
{
"epoch": 0.2549088423813658,
"grad_norm": 1.861949731594006,
"learning_rate": 4.3209230453087763e-07,
"loss": 1.6596,
"step": 6900
},
{
"epoch": 0.25564770859116687,
"grad_norm": 1.576364259347636,
"learning_rate": 4.317179714153405e-07,
"loss": 1.6409,
"step": 6920
},
{
"epoch": 0.25638657480096794,
"grad_norm": 1.6344350623705748,
"learning_rate": 4.3134279386522734e-07,
"loss": 1.6634,
"step": 6940
},
{
"epoch": 0.25712544101076895,
"grad_norm": 2.4484913186668056,
"learning_rate": 4.3096677390208606e-07,
"loss": 1.6635,
"step": 6960
},
{
"epoch": 0.25786430722057,
"grad_norm": 1.459583448230627,
"learning_rate": 4.3058991355200385e-07,
"loss": 1.6437,
"step": 6980
},
{
"epoch": 0.2586031734303711,
"grad_norm": 2.0774440428993426,
"learning_rate": 4.302122148455959e-07,
"loss": 1.6807,
"step": 7000
},
{
"epoch": 0.25934203964017216,
"grad_norm": 1.4906050741171306,
"learning_rate": 4.2983367981799484e-07,
"loss": 1.6477,
"step": 7020
},
{
"epoch": 0.26008090584997323,
"grad_norm": 1.6727105507446454,
"learning_rate": 4.294543105088395e-07,
"loss": 1.617,
"step": 7040
},
{
"epoch": 0.2608197720597743,
"grad_norm": 1.4754199269220696,
"learning_rate": 4.2907410896226415e-07,
"loss": 1.6391,
"step": 7060
},
{
"epoch": 0.2615586382695753,
"grad_norm": 1.5380802874413815,
"learning_rate": 4.2869307722688715e-07,
"loss": 1.687,
"step": 7080
},
{
"epoch": 0.2622975044793764,
"grad_norm": 1.6040883755814137,
"learning_rate": 4.283112173558003e-07,
"loss": 1.7171,
"step": 7100
},
{
"epoch": 0.26303637068917746,
"grad_norm": 2.822094109735399,
"learning_rate": 4.279285314065575e-07,
"loss": 1.6671,
"step": 7120
},
{
"epoch": 0.2637752368989785,
"grad_norm": 1.4328096068889253,
"learning_rate": 4.275450214411638e-07,
"loss": 1.6475,
"step": 7140
},
{
"epoch": 0.2645141031087796,
"grad_norm": 1.624272809516238,
"learning_rate": 4.2716068952606424e-07,
"loss": 1.693,
"step": 7160
},
{
"epoch": 0.2652529693185806,
"grad_norm": 1.502383886350249,
"learning_rate": 4.267755377321327e-07,
"loss": 1.6592,
"step": 7180
},
{
"epoch": 0.2659918355283817,
"grad_norm": 1.4780327874669796,
"learning_rate": 4.2638956813466094e-07,
"loss": 1.6273,
"step": 7200
},
{
"epoch": 0.26673070173818275,
"grad_norm": 1.647788340317037,
"learning_rate": 4.2600278281334683e-07,
"loss": 1.7177,
"step": 7220
},
{
"epoch": 0.2674695679479838,
"grad_norm": 1.4249175729696602,
"learning_rate": 4.256151838522842e-07,
"loss": 1.6134,
"step": 7240
},
{
"epoch": 0.2682084341577849,
"grad_norm": 1.525640467280493,
"learning_rate": 4.252267733399502e-07,
"loss": 1.6279,
"step": 7260
},
{
"epoch": 0.26894730036758596,
"grad_norm": 1.5643231773087998,
"learning_rate": 4.2483755336919546e-07,
"loss": 1.6319,
"step": 7280
},
{
"epoch": 0.269686166577387,
"grad_norm": 1.5088025290660787,
"learning_rate": 4.2444752603723185e-07,
"loss": 1.6465,
"step": 7300
},
{
"epoch": 0.27042503278718805,
"grad_norm": 1.690559249481047,
"learning_rate": 4.2405669344562157e-07,
"loss": 1.6597,
"step": 7320
},
{
"epoch": 0.2711638989969891,
"grad_norm": 1.4158777914075165,
"learning_rate": 4.236650577002658e-07,
"loss": 1.6498,
"step": 7340
},
{
"epoch": 0.2719027652067902,
"grad_norm": 1.4954788634515361,
"learning_rate": 4.232726209113931e-07,
"loss": 1.7073,
"step": 7360
},
{
"epoch": 0.27264163141659126,
"grad_norm": 1.96245857269846,
"learning_rate": 4.228793851935486e-07,
"loss": 1.6559,
"step": 7380
},
{
"epoch": 0.2733804976263923,
"grad_norm": 1.5534874631194424,
"learning_rate": 4.22485352665582e-07,
"loss": 1.6795,
"step": 7400
},
{
"epoch": 0.27411936383619334,
"grad_norm": 1.513478614204036,
"learning_rate": 4.2209052545063645e-07,
"loss": 1.6598,
"step": 7420
},
{
"epoch": 0.2748582300459944,
"grad_norm": 1.4981685008613979,
"learning_rate": 4.216949056761371e-07,
"loss": 1.6796,
"step": 7440
},
{
"epoch": 0.2755970962557955,
"grad_norm": 1.453166525310124,
"learning_rate": 4.212984954737796e-07,
"loss": 1.6547,
"step": 7460
},
{
"epoch": 0.27633596246559655,
"grad_norm": 1.4590359213340498,
"learning_rate": 4.2090129697951865e-07,
"loss": 1.668,
"step": 7480
},
{
"epoch": 0.2770748286753976,
"grad_norm": 1.5012030999873756,
"learning_rate": 4.205033123335563e-07,
"loss": 1.6253,
"step": 7500
},
{
"epoch": 0.27781369488519864,
"grad_norm": 1.605863135582104,
"learning_rate": 4.2010454368033075e-07,
"loss": 1.6684,
"step": 7520
},
{
"epoch": 0.2785525610949997,
"grad_norm": 1.9991749625802369,
"learning_rate": 4.197049931685046e-07,
"loss": 1.6403,
"step": 7540
},
{
"epoch": 0.2792914273048008,
"grad_norm": 1.5084206750440898,
"learning_rate": 4.193046629509533e-07,
"loss": 1.6673,
"step": 7560
},
{
"epoch": 0.28003029351460185,
"grad_norm": 1.6013334792913052,
"learning_rate": 4.1890355518475335e-07,
"loss": 1.6483,
"step": 7580
},
{
"epoch": 0.2807691597244029,
"grad_norm": 1.798812837038986,
"learning_rate": 4.185016720311712e-07,
"loss": 1.6795,
"step": 7600
},
{
"epoch": 0.281508025934204,
"grad_norm": 1.4900500600235345,
"learning_rate": 4.18099015655651e-07,
"loss": 1.6807,
"step": 7620
},
{
"epoch": 0.282246892144005,
"grad_norm": 1.6028189719479609,
"learning_rate": 4.176955882278033e-07,
"loss": 1.6596,
"step": 7640
},
{
"epoch": 0.28298575835380607,
"grad_norm": 1.9939881516366833,
"learning_rate": 4.1729139192139335e-07,
"loss": 1.6695,
"step": 7660
},
{
"epoch": 0.28372462456360714,
"grad_norm": 1.5127346940191255,
"learning_rate": 4.168864289143291e-07,
"loss": 1.7078,
"step": 7680
},
{
"epoch": 0.2844634907734082,
"grad_norm": 1.5284950240291668,
"learning_rate": 4.1648070138864993e-07,
"loss": 1.7175,
"step": 7700
},
{
"epoch": 0.2852023569832093,
"grad_norm": 1.5249438102092971,
"learning_rate": 4.1607421153051454e-07,
"loss": 1.6753,
"step": 7720
},
{
"epoch": 0.28594122319301035,
"grad_norm": 1.6281345917446086,
"learning_rate": 4.156669615301891e-07,
"loss": 1.6455,
"step": 7740
},
{
"epoch": 0.28668008940281137,
"grad_norm": 1.7327391694790744,
"learning_rate": 4.152589535820358e-07,
"loss": 1.6115,
"step": 7760
},
{
"epoch": 0.28741895561261244,
"grad_norm": 1.8046545180697087,
"learning_rate": 4.148501898845008e-07,
"loss": 1.6752,
"step": 7780
},
{
"epoch": 0.2881578218224135,
"grad_norm": 1.4479684507284691,
"learning_rate": 4.144406726401024e-07,
"loss": 1.7095,
"step": 7800
},
{
"epoch": 0.2888966880322146,
"grad_norm": 1.5133767331728856,
"learning_rate": 4.140304040554192e-07,
"loss": 1.6637,
"step": 7820
},
{
"epoch": 0.28963555424201565,
"grad_norm": 1.69526484807945,
"learning_rate": 4.1361938634107795e-07,
"loss": 1.6604,
"step": 7840
},
{
"epoch": 0.29037442045181666,
"grad_norm": 1.5901137640996412,
"learning_rate": 4.132076217117425e-07,
"loss": 1.7023,
"step": 7860
},
{
"epoch": 0.29111328666161773,
"grad_norm": 1.423118541107655,
"learning_rate": 4.1279511238610075e-07,
"loss": 1.6251,
"step": 7880
},
{
"epoch": 0.2918521528714188,
"grad_norm": 1.3770610046698395,
"learning_rate": 4.123818605868533e-07,
"loss": 1.6859,
"step": 7900
},
{
"epoch": 0.29259101908121987,
"grad_norm": 1.5512042035926865,
"learning_rate": 4.1196786854070147e-07,
"loss": 1.6682,
"step": 7920
},
{
"epoch": 0.29332988529102094,
"grad_norm": 1.5657764052019774,
"learning_rate": 4.115531384783352e-07,
"loss": 1.6373,
"step": 7940
},
{
"epoch": 0.294068751500822,
"grad_norm": 1.3977001410170469,
"learning_rate": 4.11137672634421e-07,
"loss": 1.623,
"step": 7960
},
{
"epoch": 0.294807617710623,
"grad_norm": 1.5471885506840533,
"learning_rate": 4.1072147324759007e-07,
"loss": 1.6359,
"step": 7980
},
{
"epoch": 0.2955464839204241,
"grad_norm": 1.9646501043093372,
"learning_rate": 4.103045425604257e-07,
"loss": 1.6575,
"step": 8000
},
{
"epoch": 0.29628535013022517,
"grad_norm": 2.4554925260754192,
"learning_rate": 4.098868828194523e-07,
"loss": 1.6505,
"step": 8020
},
{
"epoch": 0.29702421634002624,
"grad_norm": 1.5764440647794176,
"learning_rate": 4.0946849627512194e-07,
"loss": 1.6537,
"step": 8040
},
{
"epoch": 0.2977630825498273,
"grad_norm": 1.5679031999275903,
"learning_rate": 4.090493851818032e-07,
"loss": 1.6678,
"step": 8060
},
{
"epoch": 0.2985019487596284,
"grad_norm": 1.5427978270277976,
"learning_rate": 4.086295517977688e-07,
"loss": 1.646,
"step": 8080
},
{
"epoch": 0.2992408149694294,
"grad_norm": 1.6159758168642673,
"learning_rate": 4.082089983851831e-07,
"loss": 1.6543,
"step": 8100
},
{
"epoch": 0.29997968117923046,
"grad_norm": 1.4061897285537437,
"learning_rate": 4.0778772721009036e-07,
"loss": 1.6285,
"step": 8120
},
{
"epoch": 0.30071854738903153,
"grad_norm": 1.3965741494953192,
"learning_rate": 4.073657405424019e-07,
"loss": 1.6656,
"step": 8140
},
{
"epoch": 0.3014574135988326,
"grad_norm": 1.5484468689064121,
"learning_rate": 4.06943040655885e-07,
"loss": 1.661,
"step": 8160
},
{
"epoch": 0.30219627980863367,
"grad_norm": 1.5843927161871971,
"learning_rate": 4.065196298281493e-07,
"loss": 1.6622,
"step": 8180
},
{
"epoch": 0.3029351460184347,
"grad_norm": 1.6553065392619284,
"learning_rate": 4.0609551034063555e-07,
"loss": 1.6989,
"step": 8200
},
{
"epoch": 0.30367401222823576,
"grad_norm": 1.6004229625484228,
"learning_rate": 4.056706844786025e-07,
"loss": 1.6673,
"step": 8220
},
{
"epoch": 0.3044128784380368,
"grad_norm": 1.7218496726083523,
"learning_rate": 4.052451545311157e-07,
"loss": 1.7071,
"step": 8240
},
{
"epoch": 0.3051517446478379,
"grad_norm": 1.4453612541643919,
"learning_rate": 4.0481892279103375e-07,
"loss": 1.6418,
"step": 8260
},
{
"epoch": 0.30589061085763897,
"grad_norm": 2.0343056912272415,
"learning_rate": 4.043919915549972e-07,
"loss": 1.6406,
"step": 8280
},
{
"epoch": 0.30662947706744004,
"grad_norm": 1.4141851056827188,
"learning_rate": 4.0396436312341537e-07,
"loss": 1.6697,
"step": 8300
},
{
"epoch": 0.30736834327724105,
"grad_norm": 1.7030187367387806,
"learning_rate": 4.0353603980045434e-07,
"loss": 1.648,
"step": 8320
},
{
"epoch": 0.3081072094870421,
"grad_norm": 1.4580931131013146,
"learning_rate": 4.0310702389402455e-07,
"loss": 1.6738,
"step": 8340
},
{
"epoch": 0.3088460756968432,
"grad_norm": 1.6315260212867364,
"learning_rate": 4.0267731771576795e-07,
"loss": 1.6568,
"step": 8360
},
{
"epoch": 0.30958494190664426,
"grad_norm": 1.760277165218215,
"learning_rate": 4.022469235810462e-07,
"loss": 1.7044,
"step": 8380
},
{
"epoch": 0.31032380811644533,
"grad_norm": 1.5247483148379708,
"learning_rate": 4.0181584380892747e-07,
"loss": 1.625,
"step": 8400
},
{
"epoch": 0.3110626743262464,
"grad_norm": 1.6055425468824278,
"learning_rate": 4.0138408072217467e-07,
"loss": 1.6332,
"step": 8420
},
{
"epoch": 0.3118015405360474,
"grad_norm": 2.522263277058951,
"learning_rate": 4.009516366472323e-07,
"loss": 1.6795,
"step": 8440
},
{
"epoch": 0.3125404067458485,
"grad_norm": 1.4776229994815417,
"learning_rate": 4.005185139142143e-07,
"loss": 1.6675,
"step": 8460
},
{
"epoch": 0.31327927295564956,
"grad_norm": 1.458660936186841,
"learning_rate": 4.000847148568915e-07,
"loss": 1.661,
"step": 8480
},
{
"epoch": 0.3140181391654506,
"grad_norm": 1.5895551714359692,
"learning_rate": 3.9965024181267865e-07,
"loss": 1.6474,
"step": 8500
},
{
"epoch": 0.3147570053752517,
"grad_norm": 1.6027764846949324,
"learning_rate": 3.9921509712262237e-07,
"loss": 1.7055,
"step": 8520
},
{
"epoch": 0.3154958715850527,
"grad_norm": 1.4709407841933115,
"learning_rate": 3.9877928313138807e-07,
"loss": 1.6721,
"step": 8540
},
{
"epoch": 0.3162347377948538,
"grad_norm": 1.4461242455876133,
"learning_rate": 3.983428021872477e-07,
"loss": 1.6496,
"step": 8560
},
{
"epoch": 0.31697360400465485,
"grad_norm": 1.4524171700785795,
"learning_rate": 3.979056566420668e-07,
"loss": 1.6553,
"step": 8580
},
{
"epoch": 0.3177124702144559,
"grad_norm": 1.5057325136067627,
"learning_rate": 3.974678488512921e-07,
"loss": 1.6723,
"step": 8600
},
{
"epoch": 0.318451336424257,
"grad_norm": 1.4293777770249827,
"learning_rate": 3.9702938117393825e-07,
"loss": 1.6586,
"step": 8620
},
{
"epoch": 0.31919020263405806,
"grad_norm": 1.4212368243075615,
"learning_rate": 3.965902559725761e-07,
"loss": 1.6458,
"step": 8640
},
{
"epoch": 0.3199290688438591,
"grad_norm": 1.4727420961415922,
"learning_rate": 3.961504756133189e-07,
"loss": 1.6481,
"step": 8660
},
{
"epoch": 0.32066793505366015,
"grad_norm": 2.5900548552419895,
"learning_rate": 3.9573207959028544e-07,
"loss": 1.621,
"step": 8680
},
{
"epoch": 0.3214068012634612,
"grad_norm": 1.5430259080799726,
"learning_rate": 3.952910284920244e-07,
"loss": 1.6812,
"step": 8700
},
{
"epoch": 0.3221456674732623,
"grad_norm": 1.4794345694793534,
"learning_rate": 3.948493292364224e-07,
"loss": 1.6585,
"step": 8720
},
{
"epoch": 0.32288453368306336,
"grad_norm": 1.4614630552620829,
"learning_rate": 3.9440698420346246e-07,
"loss": 1.6466,
"step": 8740
},
{
"epoch": 0.3236233998928644,
"grad_norm": 1.4393288175430394,
"learning_rate": 3.939639957766073e-07,
"loss": 1.6215,
"step": 8760
},
{
"epoch": 0.32436226610266544,
"grad_norm": 2.1230018342791532,
"learning_rate": 3.9352036634278634e-07,
"loss": 1.6803,
"step": 8780
},
{
"epoch": 0.3251011323124665,
"grad_norm": 1.6164570568462948,
"learning_rate": 3.9307609829238297e-07,
"loss": 1.6766,
"step": 8800
},
{
"epoch": 0.3258399985222676,
"grad_norm": 1.4370335980422504,
"learning_rate": 3.9263119401922175e-07,
"loss": 1.6822,
"step": 8820
},
{
"epoch": 0.32657886473206865,
"grad_norm": 1.644081010299245,
"learning_rate": 3.9218565592055486e-07,
"loss": 1.6633,
"step": 8840
},
{
"epoch": 0.3273177309418697,
"grad_norm": 2.1011988058241173,
"learning_rate": 3.9173948639705027e-07,
"loss": 1.6765,
"step": 8860
},
{
"epoch": 0.32805659715167074,
"grad_norm": 2.151384135030328,
"learning_rate": 3.9129268785277796e-07,
"loss": 1.6465,
"step": 8880
},
{
"epoch": 0.3287954633614718,
"grad_norm": 1.4309025880636768,
"learning_rate": 3.908452626951972e-07,
"loss": 1.6543,
"step": 8900
},
{
"epoch": 0.3295343295712729,
"grad_norm": 1.8849999578121595,
"learning_rate": 3.903972133351436e-07,
"loss": 1.6514,
"step": 8920
},
{
"epoch": 0.33027319578107395,
"grad_norm": 1.7164685196230511,
"learning_rate": 3.8994854218681627e-07,
"loss": 1.7006,
"step": 8940
},
{
"epoch": 0.331012061990875,
"grad_norm": 1.4964402365248954,
"learning_rate": 3.8949925166776454e-07,
"loss": 1.6995,
"step": 8960
},
{
"epoch": 0.3317509282006761,
"grad_norm": 1.9725561956682367,
"learning_rate": 3.8904934419887493e-07,
"loss": 1.634,
"step": 8980
},
{
"epoch": 0.3324897944104771,
"grad_norm": 1.604770043849599,
"learning_rate": 3.885988222043586e-07,
"loss": 1.6307,
"step": 9000
},
{
"epoch": 0.33322866062027817,
"grad_norm": 1.4014528232679808,
"learning_rate": 3.881476881117376e-07,
"loss": 1.6384,
"step": 9020
},
{
"epoch": 0.33396752683007924,
"grad_norm": 1.5592294550988919,
"learning_rate": 3.876959443518323e-07,
"loss": 1.6893,
"step": 9040
},
{
"epoch": 0.3347063930398803,
"grad_norm": 1.512028885113723,
"learning_rate": 3.872662252925764e-07,
"loss": 1.6126,
"step": 9060
},
{
"epoch": 0.3354452592496814,
"grad_norm": 1.5167336039874841,
"learning_rate": 3.868132996855423e-07,
"loss": 1.6438,
"step": 9080
},
{
"epoch": 0.3361841254594824,
"grad_norm": 1.5732905269770532,
"learning_rate": 3.8635977160123356e-07,
"loss": 1.6129,
"step": 9100
},
{
"epoch": 0.33692299166928347,
"grad_norm": 1.6825164459147328,
"learning_rate": 3.859056434833698e-07,
"loss": 1.611,
"step": 9120
},
{
"epoch": 0.33766185787908454,
"grad_norm": 2.3767246380889095,
"learning_rate": 3.854509177789039e-07,
"loss": 1.6473,
"step": 9140
},
{
"epoch": 0.3384007240888856,
"grad_norm": 1.51475900965411,
"learning_rate": 3.8499559693800866e-07,
"loss": 1.6696,
"step": 9160
},
{
"epoch": 0.3391395902986867,
"grad_norm": 2.1798994146623496,
"learning_rate": 3.845396834140635e-07,
"loss": 1.6272,
"step": 9180
},
{
"epoch": 0.33987845650848775,
"grad_norm": 5.503662773520221,
"learning_rate": 3.8408317966364155e-07,
"loss": 1.6598,
"step": 9200
},
{
"epoch": 0.34061732271828876,
"grad_norm": 1.4387011677124582,
"learning_rate": 3.836260881464961e-07,
"loss": 1.6327,
"step": 9220
},
{
"epoch": 0.34135618892808983,
"grad_norm": 1.8647315334479582,
"learning_rate": 3.831684113255475e-07,
"loss": 1.6511,
"step": 9240
},
{
"epoch": 0.3420950551378909,
"grad_norm": 1.4777808537198769,
"learning_rate": 3.8271015166686987e-07,
"loss": 1.6361,
"step": 9260
},
{
"epoch": 0.34283392134769197,
"grad_norm": 2.045197276055339,
"learning_rate": 3.822513116396778e-07,
"loss": 1.6659,
"step": 9280
},
{
"epoch": 0.34357278755749304,
"grad_norm": 1.7790240681877276,
"learning_rate": 3.8179189371631307e-07,
"loss": 1.617,
"step": 9300
},
{
"epoch": 0.3443116537672941,
"grad_norm": 1.6594283041904447,
"learning_rate": 3.813319003722312e-07,
"loss": 1.6798,
"step": 9320
},
{
"epoch": 0.3450505199770951,
"grad_norm": 1.5722518111489987,
"learning_rate": 3.8087133408598837e-07,
"loss": 1.6448,
"step": 9340
},
{
"epoch": 0.3457893861868962,
"grad_norm": 1.3834190123625751,
"learning_rate": 3.804101973392278e-07,
"loss": 1.6937,
"step": 9360
},
{
"epoch": 0.34652825239669727,
"grad_norm": 2.860970712860898,
"learning_rate": 3.799484926166665e-07,
"loss": 1.6803,
"step": 9380
},
{
"epoch": 0.34726711860649834,
"grad_norm": 1.7303789413551895,
"learning_rate": 3.794862224060819e-07,
"loss": 1.6652,
"step": 9400
},
{
"epoch": 0.3480059848162994,
"grad_norm": 1.5722357665247504,
"learning_rate": 3.7902338919829854e-07,
"loss": 1.6824,
"step": 9420
},
{
"epoch": 0.3487448510261004,
"grad_norm": 1.4942909416069685,
"learning_rate": 3.785599954871741e-07,
"loss": 1.6334,
"step": 9440
},
{
"epoch": 0.3494837172359015,
"grad_norm": 1.5407701751336818,
"learning_rate": 3.7809604376958705e-07,
"loss": 1.6147,
"step": 9460
},
{
"epoch": 0.35022258344570256,
"grad_norm": 1.5151800327591411,
"learning_rate": 3.7763153654542187e-07,
"loss": 1.6591,
"step": 9480
},
{
"epoch": 0.35096144965550363,
"grad_norm": 1.5820720313790753,
"learning_rate": 3.7716647631755684e-07,
"loss": 1.6267,
"step": 9500
},
{
"epoch": 0.3517003158653047,
"grad_norm": 1.7136185539713005,
"learning_rate": 3.7670086559184944e-07,
"loss": 1.6443,
"step": 9520
},
{
"epoch": 0.3524391820751058,
"grad_norm": 1.6610072999142345,
"learning_rate": 3.7623470687712363e-07,
"loss": 1.6391,
"step": 9540
},
{
"epoch": 0.3531780482849068,
"grad_norm": 1.7561532016780041,
"learning_rate": 3.7576800268515615e-07,
"loss": 1.6403,
"step": 9560
},
{
"epoch": 0.35391691449470786,
"grad_norm": 1.6534365111706855,
"learning_rate": 3.7530075553066256e-07,
"loss": 1.6604,
"step": 9580
},
{
"epoch": 0.3546557807045089,
"grad_norm": 1.5197922636545014,
"learning_rate": 3.748329679312845e-07,
"loss": 1.6005,
"step": 9600
},
{
"epoch": 0.35539464691431,
"grad_norm": 2.1221364447575635,
"learning_rate": 3.743646424075753e-07,
"loss": 1.6302,
"step": 9620
},
{
"epoch": 0.35613351312411107,
"grad_norm": 1.520654127135304,
"learning_rate": 3.738957814829868e-07,
"loss": 1.7174,
"step": 9640
},
{
"epoch": 0.35687237933391214,
"grad_norm": 1.5099869797232601,
"learning_rate": 3.7342638768385597e-07,
"loss": 1.6592,
"step": 9660
},
{
"epoch": 0.35761124554371315,
"grad_norm": 1.8304484700278734,
"learning_rate": 3.729564635393907e-07,
"loss": 1.6745,
"step": 9680
},
{
"epoch": 0.3583501117535142,
"grad_norm": 1.778696114508267,
"learning_rate": 3.7248601158165674e-07,
"loss": 1.6592,
"step": 9700
},
{
"epoch": 0.3590889779633153,
"grad_norm": 1.4183327236752137,
"learning_rate": 3.720150343455638e-07,
"loss": 1.6637,
"step": 9720
},
{
"epoch": 0.35982784417311636,
"grad_norm": 1.559240346976758,
"learning_rate": 3.715435343688517e-07,
"loss": 1.6862,
"step": 9740
},
{
"epoch": 0.36056671038291743,
"grad_norm": 1.5461740842164586,
"learning_rate": 3.710715141920772e-07,
"loss": 1.6276,
"step": 9760
},
{
"epoch": 0.36130557659271845,
"grad_norm": 1.541024781373399,
"learning_rate": 3.705989763585998e-07,
"loss": 1.6519,
"step": 9780
},
{
"epoch": 0.3620444428025195,
"grad_norm": 1.568073509021964,
"learning_rate": 3.7012592341456855e-07,
"loss": 1.644,
"step": 9800
},
{
"epoch": 0.3627833090123206,
"grad_norm": 7.164278419276029,
"learning_rate": 3.6965235790890776e-07,
"loss": 1.6649,
"step": 9820
},
{
"epoch": 0.36352217522212166,
"grad_norm": 1.6290047071156604,
"learning_rate": 3.6917828239330364e-07,
"loss": 1.6321,
"step": 9840
},
{
"epoch": 0.3642610414319227,
"grad_norm": 2.2138525137520078,
"learning_rate": 3.6870369942219043e-07,
"loss": 1.6623,
"step": 9860
},
{
"epoch": 0.3649999076417238,
"grad_norm": 1.4780745550505248,
"learning_rate": 3.6822861155273664e-07,
"loss": 1.6303,
"step": 9880
},
{
"epoch": 0.3657387738515248,
"grad_norm": 1.6513433655082623,
"learning_rate": 3.677530213448315e-07,
"loss": 1.6678,
"step": 9900
},
{
"epoch": 0.3664776400613259,
"grad_norm": 1.4330452468765504,
"learning_rate": 3.6727693136107074e-07,
"loss": 1.6411,
"step": 9920
},
{
"epoch": 0.36721650627112695,
"grad_norm": 2.1041910204234773,
"learning_rate": 3.668241852955783e-07,
"loss": 1.6638,
"step": 9940
},
{
"epoch": 0.367955372480928,
"grad_norm": 1.579705325259841,
"learning_rate": 3.66347128129751e-07,
"loss": 1.6245,
"step": 9960
},
{
"epoch": 0.3686942386907291,
"grad_norm": 2.2840341365356185,
"learning_rate": 3.65869578763363e-07,
"loss": 1.6621,
"step": 9980
},
{
"epoch": 0.36943310490053016,
"grad_norm": 1.4886178225841975,
"learning_rate": 3.6539153976956643e-07,
"loss": 1.6815,
"step": 10000
},
{
"epoch": 0.3701719711103312,
"grad_norm": 2.0581153395070952,
"learning_rate": 3.6491301372415173e-07,
"loss": 1.6911,
"step": 10020
},
{
"epoch": 0.37091083732013225,
"grad_norm": 1.5433010278052928,
"learning_rate": 3.6443400320553387e-07,
"loss": 1.6726,
"step": 10040
},
{
"epoch": 0.3716497035299333,
"grad_norm": 1.3650733078052242,
"learning_rate": 3.6395451079473785e-07,
"loss": 1.6808,
"step": 10060
},
{
"epoch": 0.3723885697397344,
"grad_norm": 1.4829849508478619,
"learning_rate": 3.634745390753857e-07,
"loss": 1.638,
"step": 10080
},
{
"epoch": 0.37312743594953546,
"grad_norm": 1.4843368467181628,
"learning_rate": 3.6299409063368177e-07,
"loss": 1.6608,
"step": 10100
},
{
"epoch": 0.37386630215933647,
"grad_norm": 1.7135290138411319,
"learning_rate": 3.6251316805839925e-07,
"loss": 1.6201,
"step": 10120
},
{
"epoch": 0.37460516836913754,
"grad_norm": 1.4665338261705847,
"learning_rate": 3.6203177394086603e-07,
"loss": 1.6576,
"step": 10140
},
{
"epoch": 0.3753440345789386,
"grad_norm": 1.523807524784342,
"learning_rate": 3.615499108749508e-07,
"loss": 1.6531,
"step": 10160
},
{
"epoch": 0.3760829007887397,
"grad_norm": 1.4605532197043567,
"learning_rate": 3.6106758145704903e-07,
"loss": 1.6351,
"step": 10180
},
{
"epoch": 0.37682176699854075,
"grad_norm": 1.4767414919395185,
"learning_rate": 3.6058478828606904e-07,
"loss": 1.6816,
"step": 10200
},
{
"epoch": 0.3775606332083418,
"grad_norm": 3.319352148345807,
"learning_rate": 3.601015339634179e-07,
"loss": 1.646,
"step": 10220
},
{
"epoch": 0.37829949941814284,
"grad_norm": 1.6462705304843952,
"learning_rate": 3.5961782109298767e-07,
"loss": 1.6572,
"step": 10240
},
{
"epoch": 0.3790383656279439,
"grad_norm": 1.987828688877245,
"learning_rate": 3.5913365228114085e-07,
"loss": 1.6272,
"step": 10260
},
{
"epoch": 0.379777231837745,
"grad_norm": 1.5685525483250444,
"learning_rate": 3.5864903013669696e-07,
"loss": 1.629,
"step": 10280
},
{
"epoch": 0.38051609804754605,
"grad_norm": 1.454531386924792,
"learning_rate": 3.58163957270918e-07,
"loss": 1.6391,
"step": 10300
},
{
"epoch": 0.3812549642573471,
"grad_norm": 1.5741474691311197,
"learning_rate": 3.5767843629749465e-07,
"loss": 1.6497,
"step": 10320
},
{
"epoch": 0.3819938304671482,
"grad_norm": 1.494255550534897,
"learning_rate": 3.5719246983253227e-07,
"loss": 1.6584,
"step": 10340
},
{
"epoch": 0.3827326966769492,
"grad_norm": 1.5743114630725665,
"learning_rate": 3.5670606049453624e-07,
"loss": 1.6333,
"step": 10360
},
{
"epoch": 0.3834715628867503,
"grad_norm": 1.5229234435536247,
"learning_rate": 3.5621921090439856e-07,
"loss": 1.651,
"step": 10380
},
{
"epoch": 0.38421042909655134,
"grad_norm": 1.5784429804907898,
"learning_rate": 3.557319236853833e-07,
"loss": 1.6922,
"step": 10400
},
{
"epoch": 0.3849492953063524,
"grad_norm": 1.581472732564025,
"learning_rate": 3.552442014631125e-07,
"loss": 1.6725,
"step": 10420
},
{
"epoch": 0.3856881615161535,
"grad_norm": 1.5126802451542531,
"learning_rate": 3.5475604686555246e-07,
"loss": 1.6944,
"step": 10440
},
{
"epoch": 0.3864270277259545,
"grad_norm": 1.5957042160618131,
"learning_rate": 3.5426746252299876e-07,
"loss": 1.6474,
"step": 10460
},
{
"epoch": 0.38716589393575557,
"grad_norm": 1.5167798574452542,
"learning_rate": 3.537784510680629e-07,
"loss": 1.6269,
"step": 10480
},
{
"epoch": 0.38790476014555664,
"grad_norm": 1.4073803006779033,
"learning_rate": 3.5328901513565755e-07,
"loss": 1.667,
"step": 10500
},
{
"epoch": 0.3886436263553577,
"grad_norm": 1.5025049762633182,
"learning_rate": 3.527991573629826e-07,
"loss": 1.6685,
"step": 10520
},
{
"epoch": 0.3893824925651588,
"grad_norm": 1.498817940042482,
"learning_rate": 3.523088803895111e-07,
"loss": 1.6693,
"step": 10540
},
{
"epoch": 0.39012135877495985,
"grad_norm": 1.5375475807699233,
"learning_rate": 3.5181818685697454e-07,
"loss": 1.6257,
"step": 10560
},
{
"epoch": 0.39086022498476086,
"grad_norm": 1.4788669954107543,
"learning_rate": 3.513270794093493e-07,
"loss": 1.6396,
"step": 10580
},
{
"epoch": 0.39159909119456193,
"grad_norm": 1.8280175785471986,
"learning_rate": 3.508355606928417e-07,
"loss": 1.6708,
"step": 10600
},
{
"epoch": 0.392337957404363,
"grad_norm": 1.657327382022486,
"learning_rate": 3.503436333558744e-07,
"loss": 1.6344,
"step": 10620
},
{
"epoch": 0.3930768236141641,
"grad_norm": 3.2933368891799772,
"learning_rate": 3.498513000490713e-07,
"loss": 1.6233,
"step": 10640
},
{
"epoch": 0.39381568982396514,
"grad_norm": 1.5787521448516106,
"learning_rate": 3.4935856342524445e-07,
"loss": 1.6504,
"step": 10660
},
{
"epoch": 0.3945545560337662,
"grad_norm": 1.7273082957996757,
"learning_rate": 3.488654261393786e-07,
"loss": 1.6501,
"step": 10680
},
{
"epoch": 0.3952934222435672,
"grad_norm": 1.5427159019633168,
"learning_rate": 3.483718908486173e-07,
"loss": 1.6213,
"step": 10700
},
{
"epoch": 0.3960322884533683,
"grad_norm": 2.4791279004019944,
"learning_rate": 3.478779602122491e-07,
"loss": 1.6341,
"step": 10720
},
{
"epoch": 0.39677115466316937,
"grad_norm": 1.5057908958686839,
"learning_rate": 3.4738363689169227e-07,
"loss": 1.6344,
"step": 10740
},
{
"epoch": 0.39751002087297044,
"grad_norm": 1.6211537727930727,
"learning_rate": 3.4688892355048133e-07,
"loss": 1.6684,
"step": 10760
},
{
"epoch": 0.3982488870827715,
"grad_norm": 1.7112433425010558,
"learning_rate": 3.4639382285425217e-07,
"loss": 1.6742,
"step": 10780
},
{
"epoch": 0.3989877532925725,
"grad_norm": 1.7626819549867558,
"learning_rate": 3.4589833747072765e-07,
"loss": 1.6497,
"step": 10800
},
{
"epoch": 0.3997266195023736,
"grad_norm": 1.536514259186305,
"learning_rate": 3.4540247006970395e-07,
"loss": 1.6533,
"step": 10820
},
{
"epoch": 0.40046548571217466,
"grad_norm": 1.4352156142464503,
"learning_rate": 3.449062233230351e-07,
"loss": 1.6423,
"step": 10840
},
{
"epoch": 0.40120435192197573,
"grad_norm": 1.517870844401341,
"learning_rate": 3.4440959990461936e-07,
"loss": 1.6888,
"step": 10860
},
{
"epoch": 0.4019432181317768,
"grad_norm": 1.6903764999597104,
"learning_rate": 3.4391260249038467e-07,
"loss": 1.6242,
"step": 10880
},
{
"epoch": 0.4026820843415779,
"grad_norm": 1.9353070894961153,
"learning_rate": 3.4341523375827407e-07,
"loss": 1.6219,
"step": 10900
},
{
"epoch": 0.4034209505513789,
"grad_norm": 1.70733565978221,
"learning_rate": 3.4291749638823144e-07,
"loss": 1.6524,
"step": 10920
},
{
"epoch": 0.40415981676117996,
"grad_norm": 1.3794756923120337,
"learning_rate": 3.4241939306218655e-07,
"loss": 1.647,
"step": 10940
},
{
"epoch": 0.404898682970981,
"grad_norm": 1.4536895089620647,
"learning_rate": 3.4192092646404166e-07,
"loss": 1.6697,
"step": 10960
},
{
"epoch": 0.4056375491807821,
"grad_norm": 1.4185925084451405,
"learning_rate": 3.41422099279656e-07,
"loss": 1.6916,
"step": 10980
},
{
"epoch": 0.40637641539058317,
"grad_norm": 1.5516883391882288,
"learning_rate": 3.40922914196832e-07,
"loss": 1.6702,
"step": 11000
},
{
"epoch": 0.40711528160038424,
"grad_norm": 1.500896700694977,
"learning_rate": 3.4042337390530027e-07,
"loss": 1.6379,
"step": 11020
},
{
"epoch": 0.40785414781018525,
"grad_norm": 1.4488842610705819,
"learning_rate": 3.399234810967055e-07,
"loss": 1.6322,
"step": 11040
},
{
"epoch": 0.4085930140199863,
"grad_norm": 1.5363179452812292,
"learning_rate": 3.394232384645918e-07,
"loss": 1.7085,
"step": 11060
},
{
"epoch": 0.4093318802297874,
"grad_norm": 1.6587795154693055,
"learning_rate": 3.389226487043883e-07,
"loss": 1.6212,
"step": 11080
},
{
"epoch": 0.41007074643958846,
"grad_norm": 2.185811847037595,
"learning_rate": 3.3842171451339446e-07,
"loss": 1.653,
"step": 11100
},
{
"epoch": 0.41080961264938953,
"grad_norm": 1.4930598472252423,
"learning_rate": 3.3792043859076556e-07,
"loss": 1.6401,
"step": 11120
},
{
"epoch": 0.41154847885919055,
"grad_norm": 1.585267885050689,
"learning_rate": 3.3741882363749836e-07,
"loss": 1.6081,
"step": 11140
},
{
"epoch": 0.4122873450689916,
"grad_norm": 1.5745770350836434,
"learning_rate": 3.3691687235641633e-07,
"loss": 1.6657,
"step": 11160
},
{
"epoch": 0.4130262112787927,
"grad_norm": 1.638169374979827,
"learning_rate": 3.364145874521552e-07,
"loss": 1.6439,
"step": 11180
},
{
"epoch": 0.41376507748859376,
"grad_norm": 1.5771694576157802,
"learning_rate": 3.3591197163114807e-07,
"loss": 1.6344,
"step": 11200
},
{
"epoch": 0.41450394369839483,
"grad_norm": 1.507624879108444,
"learning_rate": 3.3540902760161153e-07,
"loss": 1.6414,
"step": 11220
},
{
"epoch": 0.4152428099081959,
"grad_norm": 1.5517359392564993,
"learning_rate": 3.349057580735304e-07,
"loss": 1.6103,
"step": 11240
},
{
"epoch": 0.4159816761179969,
"grad_norm": 1.6910189529581492,
"learning_rate": 3.3440216575864336e-07,
"loss": 1.6097,
"step": 11260
},
{
"epoch": 0.416720542327798,
"grad_norm": 1.4817048826322234,
"learning_rate": 3.338982533704284e-07,
"loss": 1.6322,
"step": 11280
},
{
"epoch": 0.41745940853759905,
"grad_norm": 2.4572073331823843,
"learning_rate": 3.3339402362408803e-07,
"loss": 1.6818,
"step": 11300
},
{
"epoch": 0.4181982747474001,
"grad_norm": 1.4690103698141457,
"learning_rate": 3.32889479236535e-07,
"loss": 1.6734,
"step": 11320
},
{
"epoch": 0.4189371409572012,
"grad_norm": 1.4525562290767953,
"learning_rate": 3.323846229263772e-07,
"loss": 1.6777,
"step": 11340
},
{
"epoch": 0.4196760071670022,
"grad_norm": 1.6088576080590102,
"learning_rate": 3.318794574139033e-07,
"loss": 1.6815,
"step": 11360
},
{
"epoch": 0.4204148733768033,
"grad_norm": 1.658735344378412,
"learning_rate": 3.3137398542106816e-07,
"loss": 1.7156,
"step": 11380
},
{
"epoch": 0.42115373958660435,
"grad_norm": 1.856711421074202,
"learning_rate": 3.308682096714777e-07,
"loss": 1.6056,
"step": 11400
},
{
"epoch": 0.4218926057964054,
"grad_norm": 1.524820866790581,
"learning_rate": 3.3036213289037494e-07,
"loss": 1.653,
"step": 11420
},
{
"epoch": 0.4226314720062065,
"grad_norm": 2.091225075765613,
"learning_rate": 3.298557578046248e-07,
"loss": 1.6344,
"step": 11440
},
{
"epoch": 0.42337033821600756,
"grad_norm": 1.5873899994137428,
"learning_rate": 3.2934908714269926e-07,
"loss": 1.7056,
"step": 11460
},
{
"epoch": 0.4241092044258086,
"grad_norm": 1.530785170405434,
"learning_rate": 3.2884212363466336e-07,
"loss": 1.6592,
"step": 11480
},
{
"epoch": 0.42484807063560964,
"grad_norm": 1.4187769683759475,
"learning_rate": 3.283348700121599e-07,
"loss": 1.6155,
"step": 11500
},
{
"epoch": 0.4255869368454107,
"grad_norm": 1.7098484503844666,
"learning_rate": 3.278273290083948e-07,
"loss": 1.6145,
"step": 11520
},
{
"epoch": 0.4263258030552118,
"grad_norm": 1.6337855300981592,
"learning_rate": 3.2731950335812245e-07,
"loss": 1.6718,
"step": 11540
},
{
"epoch": 0.42706466926501285,
"grad_norm": 1.562376692174843,
"learning_rate": 3.2681139579763116e-07,
"loss": 1.6299,
"step": 11560
},
{
"epoch": 0.4278035354748139,
"grad_norm": 1.7830680336877842,
"learning_rate": 3.263030090647282e-07,
"loss": 1.6427,
"step": 11580
},
{
"epoch": 0.42854240168461494,
"grad_norm": 1.67004917671626,
"learning_rate": 3.2579434589872487e-07,
"loss": 1.6645,
"step": 11600
},
{
"epoch": 0.429281267894416,
"grad_norm": 1.6704228734275928,
"learning_rate": 3.2528540904042226e-07,
"loss": 1.6427,
"step": 11620
},
{
"epoch": 0.4300201341042171,
"grad_norm": 1.4195450351330696,
"learning_rate": 3.24776201232096e-07,
"loss": 1.62,
"step": 11640
},
{
"epoch": 0.43075900031401815,
"grad_norm": 1.5285023969215334,
"learning_rate": 3.242667252174816e-07,
"loss": 1.6654,
"step": 11660
},
{
"epoch": 0.4314978665238192,
"grad_norm": 1.4602369388272751,
"learning_rate": 3.2375698374176e-07,
"loss": 1.6073,
"step": 11680
},
{
"epoch": 0.43223673273362023,
"grad_norm": 1.4791375841387864,
"learning_rate": 3.232469795515423e-07,
"loss": 1.6277,
"step": 11700
},
{
"epoch": 0.4329755989434213,
"grad_norm": 1.4365509577307647,
"learning_rate": 3.227367153948551e-07,
"loss": 1.6678,
"step": 11720
},
{
"epoch": 0.4337144651532224,
"grad_norm": 1.4925933032216425,
"learning_rate": 3.22226194021126e-07,
"loss": 1.6138,
"step": 11740
},
{
"epoch": 0.43445333136302344,
"grad_norm": 1.5965165214882902,
"learning_rate": 3.2171541818116844e-07,
"loss": 1.682,
"step": 11760
},
{
"epoch": 0.4351921975728245,
"grad_norm": 1.622561586319955,
"learning_rate": 3.2120439062716673e-07,
"loss": 1.6685,
"step": 11780
},
{
"epoch": 0.4359310637826256,
"grad_norm": 1.5068996818021825,
"learning_rate": 3.206931141126622e-07,
"loss": 1.6353,
"step": 11800
},
{
"epoch": 0.4366699299924266,
"grad_norm": 1.5980487695346257,
"learning_rate": 3.2018159139253667e-07,
"loss": 1.6442,
"step": 11820
},
{
"epoch": 0.43740879620222767,
"grad_norm": 1.9446682447819341,
"learning_rate": 3.1966982522299927e-07,
"loss": 1.6215,
"step": 11840
},
{
"epoch": 0.43814766241202874,
"grad_norm": 1.3911283325778476,
"learning_rate": 3.1915781836157076e-07,
"loss": 1.6237,
"step": 11860
},
{
"epoch": 0.4388865286218298,
"grad_norm": 1.7379788181506113,
"learning_rate": 3.1864557356706854e-07,
"loss": 1.6311,
"step": 11880
},
{
"epoch": 0.4396253948316309,
"grad_norm": 1.5960691894661032,
"learning_rate": 3.181330935995925e-07,
"loss": 1.6967,
"step": 11900
},
{
"epoch": 0.44036426104143195,
"grad_norm": 1.334622875404918,
"learning_rate": 3.176203812205092e-07,
"loss": 1.7151,
"step": 11920
},
{
"epoch": 0.44110312725123296,
"grad_norm": 2.3408851593313287,
"learning_rate": 3.171074391924379e-07,
"loss": 1.6204,
"step": 11940
},
{
"epoch": 0.44184199346103403,
"grad_norm": 1.517416691835459,
"learning_rate": 3.16594270279235e-07,
"loss": 1.647,
"step": 11960
},
{
"epoch": 0.4425808596708351,
"grad_norm": 1.732092967222855,
"learning_rate": 3.160808772459796e-07,
"loss": 1.6246,
"step": 11980
},
{
"epoch": 0.4433197258806362,
"grad_norm": 1.4748895033828555,
"learning_rate": 3.155672628589582e-07,
"loss": 1.6559,
"step": 12000
},
{
"epoch": 0.44405859209043724,
"grad_norm": 1.466688995230755,
"learning_rate": 3.1505342988565024e-07,
"loss": 1.6631,
"step": 12020
},
{
"epoch": 0.44479745830023826,
"grad_norm": 1.5762348950247518,
"learning_rate": 3.145393810947129e-07,
"loss": 1.6507,
"step": 12040
},
{
"epoch": 0.4455363245100393,
"grad_norm": 1.5705066014221254,
"learning_rate": 3.1402511925596604e-07,
"loss": 1.6218,
"step": 12060
},
{
"epoch": 0.4462751907198404,
"grad_norm": 1.5033544192166477,
"learning_rate": 3.135106471403778e-07,
"loss": 1.6645,
"step": 12080
},
{
"epoch": 0.44701405692964147,
"grad_norm": 1.8660368037827004,
"learning_rate": 3.1299596752004884e-07,
"loss": 1.6617,
"step": 12100
},
{
"epoch": 0.44775292313944254,
"grad_norm": 1.6278625709035912,
"learning_rate": 3.124810831681987e-07,
"loss": 1.6383,
"step": 12120
},
{
"epoch": 0.4484917893492436,
"grad_norm": 1.6698134882051106,
"learning_rate": 3.1196599685914916e-07,
"loss": 1.6691,
"step": 12140
},
{
"epoch": 0.4492306555590446,
"grad_norm": 1.5877476217951574,
"learning_rate": 3.114507113683109e-07,
"loss": 1.6091,
"step": 12160
},
{
"epoch": 0.4499695217688457,
"grad_norm": 1.533714449161249,
"learning_rate": 3.109352294721674e-07,
"loss": 1.6721,
"step": 12180
},
{
"epoch": 0.45070838797864676,
"grad_norm": 1.415779061176635,
"learning_rate": 3.104195539482607e-07,
"loss": 1.606,
"step": 12200
},
{
"epoch": 0.45144725418844783,
"grad_norm": 1.4338589085273825,
"learning_rate": 3.0990368757517605e-07,
"loss": 1.6661,
"step": 12220
},
{
"epoch": 0.4521861203982489,
"grad_norm": 1.8998339669584823,
"learning_rate": 3.093876331325269e-07,
"loss": 1.609,
"step": 12240
},
{
"epoch": 0.45292498660805,
"grad_norm": 1.384458068102408,
"learning_rate": 3.0889720974519455e-07,
"loss": 1.6454,
"step": 12260
},
{
"epoch": 0.453663852817851,
"grad_norm": 1.4452081009096462,
"learning_rate": 3.083807965655827e-07,
"loss": 1.6452,
"step": 12280
},
{
"epoch": 0.45440271902765206,
"grad_norm": 1.5698647385968285,
"learning_rate": 3.0786420352211376e-07,
"loss": 1.6741,
"step": 12300
},
{
"epoch": 0.45514158523745313,
"grad_norm": 1.9552580205602894,
"learning_rate": 3.0734743339831694e-07,
"loss": 1.6845,
"step": 12320
},
{
"epoch": 0.4558804514472542,
"grad_norm": 1.3583889408096808,
"learning_rate": 3.068304889786754e-07,
"loss": 1.6744,
"step": 12340
},
{
"epoch": 0.45661931765705527,
"grad_norm": 1.6780668319449847,
"learning_rate": 3.063133730486116e-07,
"loss": 1.6258,
"step": 12360
},
{
"epoch": 0.4573581838668563,
"grad_norm": 1.627173946323959,
"learning_rate": 3.057960883944719e-07,
"loss": 1.6198,
"step": 12380
},
{
"epoch": 0.45809705007665735,
"grad_norm": 1.3800453841054778,
"learning_rate": 3.0527863780351194e-07,
"loss": 1.6268,
"step": 12400
},
{
"epoch": 0.4588359162864584,
"grad_norm": 1.5516028071383072,
"learning_rate": 3.047610240638816e-07,
"loss": 1.679,
"step": 12420
},
{
"epoch": 0.4595747824962595,
"grad_norm": 1.546230302013408,
"learning_rate": 3.0424324996460955e-07,
"loss": 1.6234,
"step": 12440
},
{
"epoch": 0.46031364870606056,
"grad_norm": 1.5739393391599368,
"learning_rate": 3.037253182955887e-07,
"loss": 1.703,
"step": 12460
},
{
"epoch": 0.46105251491586163,
"grad_norm": 1.5792552039289542,
"learning_rate": 3.0320723184756095e-07,
"loss": 1.6453,
"step": 12480
},
{
"epoch": 0.46179138112566265,
"grad_norm": 1.5239329095833032,
"learning_rate": 3.026889934121023e-07,
"loss": 1.6553,
"step": 12500
},
{
"epoch": 0.4625302473354637,
"grad_norm": 1.4558048272931619,
"learning_rate": 3.021706057816074e-07,
"loss": 1.6563,
"step": 12520
},
{
"epoch": 0.4632691135452648,
"grad_norm": 1.5801820167249694,
"learning_rate": 3.0165207174927513e-07,
"loss": 1.6645,
"step": 12540
},
{
"epoch": 0.46400797975506586,
"grad_norm": 1.5560547577828236,
"learning_rate": 3.01133394109093e-07,
"loss": 1.6596,
"step": 12560
},
{
"epoch": 0.46474684596486693,
"grad_norm": 1.6818881647492323,
"learning_rate": 3.006145756558223e-07,
"loss": 1.6335,
"step": 12580
},
{
"epoch": 0.465485712174668,
"grad_norm": 1.6120666995517767,
"learning_rate": 3.0009561918498335e-07,
"loss": 1.6685,
"step": 12600
},
{
"epoch": 0.466224578384469,
"grad_norm": 1.4949729602626867,
"learning_rate": 2.995765274928398e-07,
"loss": 1.6753,
"step": 12620
},
{
"epoch": 0.4669634445942701,
"grad_norm": 1.5289962949889762,
"learning_rate": 2.9905730337638395e-07,
"loss": 1.6548,
"step": 12640
},
{
"epoch": 0.46770231080407115,
"grad_norm": 1.8299373423521412,
"learning_rate": 2.98537949633322e-07,
"loss": 1.5999,
"step": 12660
},
{
"epoch": 0.4684411770138722,
"grad_norm": 1.5948007806430553,
"learning_rate": 2.9801846906205794e-07,
"loss": 1.6638,
"step": 12680
},
{
"epoch": 0.4691800432236733,
"grad_norm": 1.418583561219425,
"learning_rate": 2.974988644616799e-07,
"loss": 1.6782,
"step": 12700
},
{
"epoch": 0.4699189094334743,
"grad_norm": 1.461318006445296,
"learning_rate": 2.9700512775939907e-07,
"loss": 1.6528,
"step": 12720
},
{
"epoch": 0.4706577756432754,
"grad_norm": 1.5468327583259127,
"learning_rate": 2.964852893556419e-07,
"loss": 1.6685,
"step": 12740
},
{
"epoch": 0.47139664185307645,
"grad_norm": 1.6470459204833447,
"learning_rate": 2.9596533518391615e-07,
"loss": 1.6733,
"step": 12760
},
{
"epoch": 0.4721355080628775,
"grad_norm": 1.624503313092944,
"learning_rate": 2.954452680458612e-07,
"loss": 1.6737,
"step": 12780
},
{
"epoch": 0.4728743742726786,
"grad_norm": 1.5728828027087576,
"learning_rate": 2.949250907437256e-07,
"loss": 1.6671,
"step": 12800
},
{
"epoch": 0.47361324048247966,
"grad_norm": 1.679151732155206,
"learning_rate": 2.944048060803512e-07,
"loss": 1.656,
"step": 12820
},
{
"epoch": 0.4743521066922807,
"grad_norm": 1.4259988112675113,
"learning_rate": 2.938844168591584e-07,
"loss": 1.6088,
"step": 12840
},
{
"epoch": 0.47509097290208174,
"grad_norm": 2.10422922646524,
"learning_rate": 2.933639258841309e-07,
"loss": 1.6411,
"step": 12860
},
{
"epoch": 0.4758298391118828,
"grad_norm": 1.809412517307293,
"learning_rate": 2.92843335959801e-07,
"loss": 1.654,
"step": 12880
},
{
"epoch": 0.4765687053216839,
"grad_norm": 1.6010915209622532,
"learning_rate": 2.923226498912336e-07,
"loss": 1.6653,
"step": 12900
},
{
"epoch": 0.47730757153148495,
"grad_norm": 1.7399335136485357,
"learning_rate": 2.918018704840123e-07,
"loss": 1.6839,
"step": 12920
},
{
"epoch": 0.478046437741286,
"grad_norm": 1.9845153410774579,
"learning_rate": 2.912810005442231e-07,
"loss": 1.6308,
"step": 12940
},
{
"epoch": 0.47878530395108704,
"grad_norm": 1.4672730941447367,
"learning_rate": 2.9076004287844007e-07,
"loss": 1.7158,
"step": 12960
},
{
"epoch": 0.4795241701608881,
"grad_norm": 1.3537458462825016,
"learning_rate": 2.9023900029371e-07,
"loss": 1.5888,
"step": 12980
},
{
"epoch": 0.4802630363706892,
"grad_norm": 1.585460577335508,
"learning_rate": 2.8971787559753695e-07,
"loss": 1.6476,
"step": 13000
},
{
"epoch": 0.48100190258049025,
"grad_norm": 1.561928549919643,
"learning_rate": 2.891966715978679e-07,
"loss": 1.6339,
"step": 13020
},
{
"epoch": 0.4817407687902913,
"grad_norm": 1.439464952580829,
"learning_rate": 2.886753911030767e-07,
"loss": 1.6619,
"step": 13040
},
{
"epoch": 0.48247963500009233,
"grad_norm": 1.5693967956885457,
"learning_rate": 2.8815403692194954e-07,
"loss": 1.6443,
"step": 13060
},
{
"epoch": 0.4832185012098934,
"grad_norm": 1.8445144793183739,
"learning_rate": 2.8763261186366977e-07,
"loss": 1.6395,
"step": 13080
},
{
"epoch": 0.4839573674196945,
"grad_norm": 1.4215590880054088,
"learning_rate": 2.8711111873780224e-07,
"loss": 1.6583,
"step": 13100
},
{
"epoch": 0.48469623362949554,
"grad_norm": 1.6129407222161285,
"learning_rate": 2.8658956035427917e-07,
"loss": 1.6579,
"step": 13120
},
{
"epoch": 0.4854350998392966,
"grad_norm": 1.7787904262576621,
"learning_rate": 2.8606793952338394e-07,
"loss": 1.6387,
"step": 13140
},
{
"epoch": 0.4861739660490977,
"grad_norm": 2.9317837538381384,
"learning_rate": 2.8554625905573646e-07,
"loss": 1.6258,
"step": 13160
},
{
"epoch": 0.4869128322588987,
"grad_norm": 1.6449106895888608,
"learning_rate": 2.850245217622784e-07,
"loss": 1.6492,
"step": 13180
},
{
"epoch": 0.48765169846869977,
"grad_norm": 1.5321621721627146,
"learning_rate": 2.8450273045425677e-07,
"loss": 1.6456,
"step": 13200
},
{
"epoch": 0.48839056467850084,
"grad_norm": 1.5327848701302575,
"learning_rate": 2.8398088794321054e-07,
"loss": 1.6299,
"step": 13220
},
{
"epoch": 0.4891294308883019,
"grad_norm": 1.5262317315528862,
"learning_rate": 2.8345899704095424e-07,
"loss": 1.6815,
"step": 13240
},
{
"epoch": 0.489868297098103,
"grad_norm": 8.056093277940944,
"learning_rate": 2.8293706055956266e-07,
"loss": 1.6196,
"step": 13260
},
{
"epoch": 0.49060716330790405,
"grad_norm": 1.7903474479157373,
"learning_rate": 2.8241508131135704e-07,
"loss": 1.6748,
"step": 13280
},
{
"epoch": 0.49134602951770506,
"grad_norm": 2.3280755640085857,
"learning_rate": 2.818930621088883e-07,
"loss": 1.674,
"step": 13300
},
{
"epoch": 0.49208489572750613,
"grad_norm": 1.7132266058410768,
"learning_rate": 2.8137100576492324e-07,
"loss": 1.6407,
"step": 13320
},
{
"epoch": 0.4928237619373072,
"grad_norm": 1.652779406776925,
"learning_rate": 2.808489150924283e-07,
"loss": 1.6672,
"step": 13340
},
{
"epoch": 0.4935626281471083,
"grad_norm": 1.597072673714322,
"learning_rate": 2.8032679290455525e-07,
"loss": 1.6326,
"step": 13360
},
{
"epoch": 0.49430149435690934,
"grad_norm": 1.483890002284729,
"learning_rate": 2.798046420146254e-07,
"loss": 1.6953,
"step": 13380
},
{
"epoch": 0.49504036056671036,
"grad_norm": 1.5673926854706393,
"learning_rate": 2.792824652361149e-07,
"loss": 1.6348,
"step": 13400
},
{
"epoch": 0.49577922677651143,
"grad_norm": 1.3752789014048936,
"learning_rate": 2.7876026538263935e-07,
"loss": 1.6333,
"step": 13420
},
{
"epoch": 0.4965180929863125,
"grad_norm": 1.439519752453901,
"learning_rate": 2.7823804526793863e-07,
"loss": 1.6322,
"step": 13440
},
{
"epoch": 0.49725695919611357,
"grad_norm": 1.6858659909371638,
"learning_rate": 2.777158077058619e-07,
"loss": 1.6087,
"step": 13460
},
{
"epoch": 0.49799582540591464,
"grad_norm": 1.475020677300443,
"learning_rate": 2.771935555103521e-07,
"loss": 1.6085,
"step": 13480
},
{
"epoch": 0.4987346916157157,
"grad_norm": 1.5498271971579036,
"learning_rate": 2.766712914954314e-07,
"loss": 1.6546,
"step": 13500
},
{
"epoch": 0.4994735578255167,
"grad_norm": 2.096090843883931,
"learning_rate": 2.7614901847518525e-07,
"loss": 1.6812,
"step": 13520
},
{
"epoch": 0.5002124240353178,
"grad_norm": 1.4457832913454574,
"learning_rate": 2.756267392637479e-07,
"loss": 1.6581,
"step": 13540
},
{
"epoch": 0.5009512902451189,
"grad_norm": 2.01817520318154,
"learning_rate": 2.751044566752869e-07,
"loss": 1.6615,
"step": 13560
},
{
"epoch": 0.5016901564549199,
"grad_norm": 1.4227402127659055,
"learning_rate": 2.745821735239878e-07,
"loss": 1.6324,
"step": 13580
},
{
"epoch": 0.502429022664721,
"grad_norm": 1.8405513240063371,
"learning_rate": 2.7405989262403955e-07,
"loss": 1.6698,
"step": 13600
},
{
"epoch": 0.503167888874522,
"grad_norm": 1.4788179775173926,
"learning_rate": 2.7353761678961865e-07,
"loss": 1.6359,
"step": 13620
},
{
"epoch": 0.5039067550843231,
"grad_norm": 1.7223731354636942,
"learning_rate": 2.730153488348744e-07,
"loss": 1.6306,
"step": 13640
},
{
"epoch": 0.5046456212941242,
"grad_norm": 2.5321925077821406,
"learning_rate": 2.724930915739137e-07,
"loss": 1.6752,
"step": 13660
},
{
"epoch": 0.5053844875039252,
"grad_norm": 1.5208216957527443,
"learning_rate": 2.7197084782078585e-07,
"loss": 1.6439,
"step": 13680
},
{
"epoch": 0.5061233537137263,
"grad_norm": 1.4323741561095633,
"learning_rate": 2.7144862038946716e-07,
"loss": 1.644,
"step": 13700
},
{
"epoch": 0.5068622199235273,
"grad_norm": 1.426194444263622,
"learning_rate": 2.709264120938464e-07,
"loss": 1.6383,
"step": 13720
},
{
"epoch": 0.5076010861333284,
"grad_norm": 1.9190094996790648,
"learning_rate": 2.7040422574770866e-07,
"loss": 1.6015,
"step": 13740
},
{
"epoch": 0.5083399523431295,
"grad_norm": 1.5070566631142777,
"learning_rate": 2.698820641647212e-07,
"loss": 1.6841,
"step": 13760
},
{
"epoch": 0.5090788185529306,
"grad_norm": 1.9970969408548236,
"learning_rate": 2.693599301584179e-07,
"loss": 1.6346,
"step": 13780
},
{
"epoch": 0.5098176847627316,
"grad_norm": 1.683784538174349,
"learning_rate": 2.688378265421837e-07,
"loss": 1.6829,
"step": 13800
},
{
"epoch": 0.5105565509725326,
"grad_norm": 1.7421711729558282,
"learning_rate": 2.683157561292399e-07,
"loss": 1.626,
"step": 13820
},
{
"epoch": 0.5112954171823337,
"grad_norm": 1.6638975974760875,
"learning_rate": 2.6779372173262917e-07,
"loss": 1.6847,
"step": 13840
},
{
"epoch": 0.5120342833921347,
"grad_norm": 1.7300243765637946,
"learning_rate": 2.672717261651998e-07,
"loss": 1.6635,
"step": 13860
},
{
"epoch": 0.5127731496019359,
"grad_norm": 1.7350443481000342,
"learning_rate": 2.667497722395909e-07,
"loss": 1.6648,
"step": 13880
},
{
"epoch": 0.5135120158117369,
"grad_norm": 1.8257677624748465,
"learning_rate": 2.662278627682172e-07,
"loss": 1.642,
"step": 13900
},
{
"epoch": 0.5142508820215379,
"grad_norm": 1.7828372493231617,
"learning_rate": 2.657060005632543e-07,
"loss": 1.6354,
"step": 13920
},
{
"epoch": 0.514989748231339,
"grad_norm": 1.4463498826235905,
"learning_rate": 2.6518418843662256e-07,
"loss": 1.6342,
"step": 13940
},
{
"epoch": 0.51572861444114,
"grad_norm": 1.5876083742799603,
"learning_rate": 2.6466242919997263e-07,
"loss": 1.6541,
"step": 13960
},
{
"epoch": 0.5164674806509412,
"grad_norm": 1.4658443332943762,
"learning_rate": 2.641407256646705e-07,
"loss": 1.6865,
"step": 13980
},
{
"epoch": 0.5172063468607422,
"grad_norm": 1.3991873689568013,
"learning_rate": 2.636190806417817e-07,
"loss": 1.6322,
"step": 14000
},
{
"epoch": 0.5179452130705432,
"grad_norm": 2.1443694620412823,
"learning_rate": 2.6309749694205643e-07,
"loss": 1.6337,
"step": 14020
},
{
"epoch": 0.5186840792803443,
"grad_norm": 1.8812922050974208,
"learning_rate": 2.6257597737591484e-07,
"loss": 1.6003,
"step": 14040
},
{
"epoch": 0.5194229454901453,
"grad_norm": 1.4849904179267404,
"learning_rate": 2.6205452475343135e-07,
"loss": 1.6554,
"step": 14060
},
{
"epoch": 0.5201618116999465,
"grad_norm": 1.5710794059095268,
"learning_rate": 2.6153314188431934e-07,
"loss": 1.6585,
"step": 14080
},
{
"epoch": 0.5209006779097475,
"grad_norm": 1.4300979250373247,
"learning_rate": 2.6101183157791687e-07,
"loss": 1.6266,
"step": 14100
},
{
"epoch": 0.5216395441195486,
"grad_norm": 1.4201641845366786,
"learning_rate": 2.604905966431707e-07,
"loss": 1.6278,
"step": 14120
},
{
"epoch": 0.5223784103293496,
"grad_norm": 1.4634294685934828,
"learning_rate": 2.5996943988862136e-07,
"loss": 1.6575,
"step": 14140
},
{
"epoch": 0.5231172765391506,
"grad_norm": 1.5428372121996694,
"learning_rate": 2.594483641223885e-07,
"loss": 1.6751,
"step": 14160
},
{
"epoch": 0.5238561427489518,
"grad_norm": 1.738164845435304,
"learning_rate": 2.5892737215215507e-07,
"loss": 1.6492,
"step": 14180
},
{
"epoch": 0.5245950089587528,
"grad_norm": 1.5256411770058975,
"learning_rate": 2.584064667851527e-07,
"loss": 1.6491,
"step": 14200
},
{
"epoch": 0.5253338751685539,
"grad_norm": 2.0408240630415513,
"learning_rate": 2.578856508281461e-07,
"loss": 1.6424,
"step": 14220
},
{
"epoch": 0.5260727413783549,
"grad_norm": 1.5107852579348091,
"learning_rate": 2.573649270874187e-07,
"loss": 1.6575,
"step": 14240
},
{
"epoch": 0.5268116075881559,
"grad_norm": 1.606923866961281,
"learning_rate": 2.568442983687567e-07,
"loss": 1.6678,
"step": 14260
},
{
"epoch": 0.527550473797957,
"grad_norm": 1.86036331527246,
"learning_rate": 2.5632376747743416e-07,
"loss": 1.6611,
"step": 14280
},
{
"epoch": 0.5282893400077581,
"grad_norm": 1.6282520348397496,
"learning_rate": 2.5580333721819837e-07,
"loss": 1.6887,
"step": 14300
},
{
"epoch": 0.5290282062175592,
"grad_norm": 1.4902965967534727,
"learning_rate": 2.5528301039525427e-07,
"loss": 1.673,
"step": 14320
},
{
"epoch": 0.5297670724273602,
"grad_norm": 2.9289521410401607,
"learning_rate": 2.547627898122493e-07,
"loss": 1.618,
"step": 14340
},
{
"epoch": 0.5305059386371612,
"grad_norm": 1.5801255890460382,
"learning_rate": 2.5424267827225884e-07,
"loss": 1.6478,
"step": 14360
},
{
"epoch": 0.5312448048469623,
"grad_norm": 1.904222753922445,
"learning_rate": 2.5372267857777017e-07,
"loss": 1.6543,
"step": 14380
},
{
"epoch": 0.5319836710567634,
"grad_norm": 1.5136725876022765,
"learning_rate": 2.532027935306684e-07,
"loss": 1.658,
"step": 14400
},
{
"epoch": 0.5327225372665645,
"grad_norm": 1.8648484080963088,
"learning_rate": 2.5268302593222056e-07,
"loss": 1.6279,
"step": 14420
},
{
"epoch": 0.5334614034763655,
"grad_norm": 1.4732933175166334,
"learning_rate": 2.521633785830612e-07,
"loss": 1.6535,
"step": 14440
},
{
"epoch": 0.5342002696861666,
"grad_norm": 1.7964137810644547,
"learning_rate": 2.5164385428317656e-07,
"loss": 1.6291,
"step": 14460
},
{
"epoch": 0.5349391358959676,
"grad_norm": 1.7384258178088878,
"learning_rate": 2.5112445583189e-07,
"loss": 1.6484,
"step": 14480
},
{
"epoch": 0.5356780021057687,
"grad_norm": 1.6118844731600752,
"learning_rate": 2.506051860278469e-07,
"loss": 1.6461,
"step": 14500
},
{
"epoch": 0.5364168683155698,
"grad_norm": 1.612441861147252,
"learning_rate": 2.500860476689993e-07,
"loss": 1.6368,
"step": 14520
},
{
"epoch": 0.5371557345253708,
"grad_norm": 1.4719276982885592,
"learning_rate": 2.4956704355259106e-07,
"loss": 1.616,
"step": 14540
},
{
"epoch": 0.5378946007351719,
"grad_norm": 1.4849285106056183,
"learning_rate": 2.4904817647514273e-07,
"loss": 1.6467,
"step": 14560
},
{
"epoch": 0.5386334669449729,
"grad_norm": 2.0929018106610533,
"learning_rate": 2.485294492324364e-07,
"loss": 1.6517,
"step": 14580
},
{
"epoch": 0.539372333154774,
"grad_norm": 1.3910097740422103,
"learning_rate": 2.480108646195006e-07,
"loss": 1.6319,
"step": 14600
},
{
"epoch": 0.5401111993645751,
"grad_norm": 1.8158803135234147,
"learning_rate": 2.474924254305956e-07,
"loss": 1.6902,
"step": 14620
},
{
"epoch": 0.5408500655743761,
"grad_norm": 1.6514040636762424,
"learning_rate": 2.4697413445919785e-07,
"loss": 1.6479,
"step": 14640
},
{
"epoch": 0.5415889317841772,
"grad_norm": 1.5739603939688216,
"learning_rate": 2.4645599449798536e-07,
"loss": 1.639,
"step": 14660
},
{
"epoch": 0.5423277979939782,
"grad_norm": 1.5178753830207266,
"learning_rate": 2.459380083388221e-07,
"loss": 1.6235,
"step": 14680
},
{
"epoch": 0.5430666642037792,
"grad_norm": 1.52558838171546,
"learning_rate": 2.4542017877274397e-07,
"loss": 1.6835,
"step": 14700
},
{
"epoch": 0.5438055304135804,
"grad_norm": 2.2408509501139533,
"learning_rate": 2.4490250858994243e-07,
"loss": 1.5869,
"step": 14720
},
{
"epoch": 0.5445443966233814,
"grad_norm": 1.6053244248684069,
"learning_rate": 2.4438500057975043e-07,
"loss": 1.6698,
"step": 14740
},
{
"epoch": 0.5452832628331825,
"grad_norm": 1.4975811830975623,
"learning_rate": 2.4386765753062733e-07,
"loss": 1.6337,
"step": 14760
},
{
"epoch": 0.5460221290429835,
"grad_norm": 1.4849817547603397,
"learning_rate": 2.4335048223014316e-07,
"loss": 1.6095,
"step": 14780
},
{
"epoch": 0.5467609952527847,
"grad_norm": 1.8454272427613772,
"learning_rate": 2.4283347746496436e-07,
"loss": 1.6191,
"step": 14800
},
{
"epoch": 0.5474998614625857,
"grad_norm": 1.484721990845683,
"learning_rate": 2.4231664602083857e-07,
"loss": 1.6156,
"step": 14820
},
{
"epoch": 0.5482387276723867,
"grad_norm": 1.4970531164331227,
"learning_rate": 2.4179999068257935e-07,
"loss": 1.6903,
"step": 14840
},
{
"epoch": 0.5489775938821878,
"grad_norm": 1.60919652354879,
"learning_rate": 2.412835142340513e-07,
"loss": 1.6813,
"step": 14860
},
{
"epoch": 0.5497164600919888,
"grad_norm": 1.3606018353206684,
"learning_rate": 2.4076721945815544e-07,
"loss": 1.6769,
"step": 14880
},
{
"epoch": 0.55045532630179,
"grad_norm": 1.458693168765768,
"learning_rate": 2.4025110913681355e-07,
"loss": 1.6373,
"step": 14900
},
{
"epoch": 0.551194192511591,
"grad_norm": 1.547291419668359,
"learning_rate": 2.397351860509537e-07,
"loss": 1.6525,
"step": 14920
},
{
"epoch": 0.551933058721392,
"grad_norm": 1.7224542921095407,
"learning_rate": 2.392194529804951e-07,
"loss": 1.6761,
"step": 14940
},
{
"epoch": 0.5526719249311931,
"grad_norm": 1.6677249547234672,
"learning_rate": 2.38703912704333e-07,
"loss": 1.625,
"step": 14960
},
{
"epoch": 0.5534107911409941,
"grad_norm": 1.4519952098563818,
"learning_rate": 2.3818856800032395e-07,
"loss": 1.6244,
"step": 14980
},
{
"epoch": 0.5541496573507952,
"grad_norm": 1.7967122495859562,
"learning_rate": 2.3767342164527055e-07,
"loss": 1.6719,
"step": 15000
},
{
"epoch": 0.5548885235605963,
"grad_norm": 1.3751693238795433,
"learning_rate": 2.3715847641490688e-07,
"loss": 1.6397,
"step": 15020
},
{
"epoch": 0.5556273897703973,
"grad_norm": 1.5461207825297583,
"learning_rate": 2.3664373508388318e-07,
"loss": 1.6871,
"step": 15040
},
{
"epoch": 0.5563662559801984,
"grad_norm": 1.3729095610665938,
"learning_rate": 2.3612920042575091e-07,
"loss": 1.6568,
"step": 15060
},
{
"epoch": 0.5571051221899994,
"grad_norm": 1.5955595428086877,
"learning_rate": 2.3561487521294814e-07,
"loss": 1.6439,
"step": 15080
},
{
"epoch": 0.5578439883998005,
"grad_norm": 1.505255489966295,
"learning_rate": 2.351007622167843e-07,
"loss": 1.6114,
"step": 15100
},
{
"epoch": 0.5585828546096016,
"grad_norm": 1.4629681148522744,
"learning_rate": 2.3458686420742528e-07,
"loss": 1.6114,
"step": 15120
},
{
"epoch": 0.5593217208194027,
"grad_norm": 1.7359961722060924,
"learning_rate": 2.3407318395387875e-07,
"loss": 1.6416,
"step": 15140
},
{
"epoch": 0.5600605870292037,
"grad_norm": 1.6390324621472498,
"learning_rate": 2.3355972422397895e-07,
"loss": 1.6625,
"step": 15160
},
{
"epoch": 0.5607994532390047,
"grad_norm": 1.7925619507510513,
"learning_rate": 2.3304648778437175e-07,
"loss": 1.6822,
"step": 15180
},
{
"epoch": 0.5615383194488058,
"grad_norm": 1.6256712121515025,
"learning_rate": 2.3253347740050012e-07,
"loss": 1.6793,
"step": 15200
},
{
"epoch": 0.5622771856586068,
"grad_norm": 1.6887168187109596,
"learning_rate": 2.3202069583658883e-07,
"loss": 1.6403,
"step": 15220
},
{
"epoch": 0.563016051868408,
"grad_norm": 1.4622893380793243,
"learning_rate": 2.3150814585562984e-07,
"loss": 1.6256,
"step": 15240
},
{
"epoch": 0.563754918078209,
"grad_norm": 1.720681049824639,
"learning_rate": 2.3099583021936703e-07,
"loss": 1.6331,
"step": 15260
},
{
"epoch": 0.56449378428801,
"grad_norm": 1.6844323896773028,
"learning_rate": 2.3048375168828194e-07,
"loss": 1.6249,
"step": 15280
},
{
"epoch": 0.5652326504978111,
"grad_norm": 1.4304416297000766,
"learning_rate": 2.2997191302157831e-07,
"loss": 1.6476,
"step": 15300
},
{
"epoch": 0.5659715167076121,
"grad_norm": 2.6747036703519966,
"learning_rate": 2.2946031697716728e-07,
"loss": 1.6704,
"step": 15320
},
{
"epoch": 0.5667103829174133,
"grad_norm": 1.8934913018327109,
"learning_rate": 2.2894896631165312e-07,
"loss": 1.6557,
"step": 15340
},
{
"epoch": 0.5674492491272143,
"grad_norm": 1.5864443521535418,
"learning_rate": 2.2843786378031749e-07,
"loss": 1.6111,
"step": 15360
},
{
"epoch": 0.5681881153370153,
"grad_norm": 1.6147764207744268,
"learning_rate": 2.279270121371053e-07,
"loss": 1.6617,
"step": 15380
},
{
"epoch": 0.5689269815468164,
"grad_norm": 1.5889401903281988,
"learning_rate": 2.274164141346096e-07,
"loss": 1.6472,
"step": 15400
},
{
"epoch": 0.5696658477566174,
"grad_norm": 1.8322046948313095,
"learning_rate": 2.2690607252405664e-07,
"loss": 1.681,
"step": 15420
},
{
"epoch": 0.5704047139664186,
"grad_norm": 1.319095874026253,
"learning_rate": 2.2639599005529124e-07,
"loss": 1.6339,
"step": 15440
},
{
"epoch": 0.5711435801762196,
"grad_norm": 1.568413450074265,
"learning_rate": 2.258861694767619e-07,
"loss": 1.6385,
"step": 15460
},
{
"epoch": 0.5718824463860207,
"grad_norm": 1.659163649600049,
"learning_rate": 2.2537661353550603e-07,
"loss": 1.6292,
"step": 15480
},
{
"epoch": 0.5726213125958217,
"grad_norm": 1.484851792665619,
"learning_rate": 2.2486732497713507e-07,
"loss": 1.6887,
"step": 15500
},
{
"epoch": 0.5733601788056227,
"grad_norm": 1.609907878598695,
"learning_rate": 2.2435830654581962e-07,
"loss": 1.6266,
"step": 15520
},
{
"epoch": 0.5740990450154239,
"grad_norm": 1.4453575034227937,
"learning_rate": 2.2387499173937125e-07,
"loss": 1.6537,
"step": 15540
},
{
"epoch": 0.5748379112252249,
"grad_norm": 1.7710876217433056,
"learning_rate": 2.2336650794320994e-07,
"loss": 1.6588,
"step": 15560
},
{
"epoch": 0.575576777435026,
"grad_norm": 1.4085011499137292,
"learning_rate": 2.2285830236087167e-07,
"loss": 1.6293,
"step": 15580
},
{
"epoch": 0.576315643644827,
"grad_norm": 1.4053148152524308,
"learning_rate": 2.2235037773069188e-07,
"loss": 1.629,
"step": 15600
},
{
"epoch": 0.577054509854628,
"grad_norm": 1.456136317052379,
"learning_rate": 2.2184273678949212e-07,
"loss": 1.6448,
"step": 15620
},
{
"epoch": 0.5777933760644292,
"grad_norm": 1.5709035364905237,
"learning_rate": 2.213353822725652e-07,
"loss": 1.6556,
"step": 15640
},
{
"epoch": 0.5785322422742302,
"grad_norm": 2.381482655936729,
"learning_rate": 2.2082831691366104e-07,
"loss": 1.6298,
"step": 15660
},
{
"epoch": 0.5792711084840313,
"grad_norm": 1.510088899026219,
"learning_rate": 2.2032154344497096e-07,
"loss": 1.69,
"step": 15680
},
{
"epoch": 0.5800099746938323,
"grad_norm": 1.4208293328335637,
"learning_rate": 2.198150645971138e-07,
"loss": 1.6533,
"step": 15700
},
{
"epoch": 0.5807488409036333,
"grad_norm": 1.5394108559637645,
"learning_rate": 2.1930888309912098e-07,
"loss": 1.6145,
"step": 15720
},
{
"epoch": 0.5814877071134344,
"grad_norm": 1.8494498268185677,
"learning_rate": 2.188030016784216e-07,
"loss": 1.6262,
"step": 15740
},
{
"epoch": 0.5822265733232355,
"grad_norm": 2.390942191221342,
"learning_rate": 2.1829742306082778e-07,
"loss": 1.612,
"step": 15760
},
{
"epoch": 0.5829654395330366,
"grad_norm": 2.4364332149226446,
"learning_rate": 2.1779214997052025e-07,
"loss": 1.6548,
"step": 15780
},
{
"epoch": 0.5837043057428376,
"grad_norm": 1.7161768355514782,
"learning_rate": 2.1728718513003342e-07,
"loss": 1.6822,
"step": 15800
},
{
"epoch": 0.5844431719526387,
"grad_norm": 1.6209379371159418,
"learning_rate": 2.1678253126024072e-07,
"loss": 1.6068,
"step": 15820
},
{
"epoch": 0.5851820381624397,
"grad_norm": 2.1623351366291725,
"learning_rate": 2.1627819108034002e-07,
"loss": 1.6138,
"step": 15840
},
{
"epoch": 0.5859209043722408,
"grad_norm": 1.3848518910214123,
"learning_rate": 2.1577416730783904e-07,
"loss": 1.6315,
"step": 15860
},
{
"epoch": 0.5866597705820419,
"grad_norm": 1.377598599479366,
"learning_rate": 2.1527046265854049e-07,
"loss": 1.6263,
"step": 15880
},
{
"epoch": 0.5873986367918429,
"grad_norm": 1.5951258889353628,
"learning_rate": 2.1476707984652764e-07,
"loss": 1.6442,
"step": 15900
},
{
"epoch": 0.588137503001644,
"grad_norm": 1.4119428291190372,
"learning_rate": 2.1426402158414964e-07,
"loss": 1.6776,
"step": 15920
},
{
"epoch": 0.588876369211445,
"grad_norm": 1.5401792838637114,
"learning_rate": 2.1376129058200687e-07,
"loss": 1.6489,
"step": 15940
},
{
"epoch": 0.589615235421246,
"grad_norm": 1.603780373356476,
"learning_rate": 2.1325888954893618e-07,
"loss": 1.6525,
"step": 15960
},
{
"epoch": 0.5903541016310472,
"grad_norm": 1.5200619012123444,
"learning_rate": 2.1275682119199674e-07,
"loss": 1.6103,
"step": 15980
},
{
"epoch": 0.5910929678408482,
"grad_norm": 2.1303907208230637,
"learning_rate": 2.122550882164552e-07,
"loss": 1.6515,
"step": 16000
},
{
"epoch": 0.5918318340506493,
"grad_norm": 1.4309458414094776,
"learning_rate": 2.1175369332577075e-07,
"loss": 1.6476,
"step": 16020
},
{
"epoch": 0.5925707002604503,
"grad_norm": 1.3885096209200305,
"learning_rate": 2.112526392215811e-07,
"loss": 1.6161,
"step": 16040
},
{
"epoch": 0.5933095664702513,
"grad_norm": 1.4639170589501997,
"learning_rate": 2.107519286036879e-07,
"loss": 1.6626,
"step": 16060
},
{
"epoch": 0.5940484326800525,
"grad_norm": 1.5413296048888148,
"learning_rate": 2.102515641700417e-07,
"loss": 1.7111,
"step": 16080
},
{
"epoch": 0.5947872988898535,
"grad_norm": 1.477261253181655,
"learning_rate": 2.0975154861672782e-07,
"loss": 1.6606,
"step": 16100
},
{
"epoch": 0.5955261650996546,
"grad_norm": 1.484117052461405,
"learning_rate": 2.0925188463795195e-07,
"loss": 1.6587,
"step": 16120
},
{
"epoch": 0.5962650313094556,
"grad_norm": 1.492261770923395,
"learning_rate": 2.0875257492602505e-07,
"loss": 1.629,
"step": 16140
},
{
"epoch": 0.5970038975192568,
"grad_norm": 1.4469424063226348,
"learning_rate": 2.082536221713494e-07,
"loss": 1.6496,
"step": 16160
},
{
"epoch": 0.5977427637290578,
"grad_norm": 1.6092362505845061,
"learning_rate": 2.07755029062404e-07,
"loss": 1.6664,
"step": 16180
},
{
"epoch": 0.5984816299388588,
"grad_norm": 1.779958420465131,
"learning_rate": 2.0725679828572983e-07,
"loss": 1.6212,
"step": 16200
},
{
"epoch": 0.5992204961486599,
"grad_norm": 2.256981377181274,
"learning_rate": 2.0675893252591558e-07,
"loss": 1.6603,
"step": 16220
},
{
"epoch": 0.5999593623584609,
"grad_norm": 1.4438145967369689,
"learning_rate": 2.0626143446558313e-07,
"loss": 1.7086,
"step": 16240
},
{
"epoch": 0.600698228568262,
"grad_norm": 1.4523681015745287,
"learning_rate": 2.0576430678537314e-07,
"loss": 1.6363,
"step": 16260
},
{
"epoch": 0.6014370947780631,
"grad_norm": 2.081965836536827,
"learning_rate": 2.052675521639306e-07,
"loss": 1.6525,
"step": 16280
},
{
"epoch": 0.6021759609878641,
"grad_norm": 1.641105539346371,
"learning_rate": 2.0477117327789017e-07,
"loss": 1.7219,
"step": 16300
},
{
"epoch": 0.6029148271976652,
"grad_norm": 2.1960028742429887,
"learning_rate": 2.0427517280186225e-07,
"loss": 1.7079,
"step": 16320
},
{
"epoch": 0.6036536934074662,
"grad_norm": 1.421358868551972,
"learning_rate": 2.0377955340841817e-07,
"loss": 1.6494,
"step": 16340
},
{
"epoch": 0.6043925596172673,
"grad_norm": 1.4519180712299584,
"learning_rate": 2.032843177680757e-07,
"loss": 1.6497,
"step": 16360
},
{
"epoch": 0.6051314258270684,
"grad_norm": 1.4554186364319244,
"learning_rate": 2.0278946854928512e-07,
"loss": 1.6623,
"step": 16380
},
{
"epoch": 0.6058702920368694,
"grad_norm": 1.453630709571824,
"learning_rate": 2.022950084184145e-07,
"loss": 1.6481,
"step": 16400
},
{
"epoch": 0.6066091582466705,
"grad_norm": 1.504491667770329,
"learning_rate": 2.018009400397353e-07,
"loss": 1.677,
"step": 16420
},
{
"epoch": 0.6073480244564715,
"grad_norm": 1.388924417705384,
"learning_rate": 2.0130726607540828e-07,
"loss": 1.6496,
"step": 16440
},
{
"epoch": 0.6080868906662726,
"grad_norm": 1.464940095501643,
"learning_rate": 2.0081398918546882e-07,
"loss": 1.6999,
"step": 16460
},
{
"epoch": 0.6088257568760737,
"grad_norm": 1.7055463049168984,
"learning_rate": 2.0032111202781282e-07,
"loss": 1.6249,
"step": 16480
},
{
"epoch": 0.6095646230858748,
"grad_norm": 1.6279220224411552,
"learning_rate": 1.9982863725818267e-07,
"loss": 1.6285,
"step": 16500
},
{
"epoch": 0.6103034892956758,
"grad_norm": 2.0351245502127404,
"learning_rate": 1.9933656753015204e-07,
"loss": 1.6595,
"step": 16520
},
{
"epoch": 0.6110423555054768,
"grad_norm": 2.018723900559302,
"learning_rate": 1.9884490549511252e-07,
"loss": 1.7325,
"step": 16540
},
{
"epoch": 0.6117812217152779,
"grad_norm": 1.4930972850593807,
"learning_rate": 1.983782066004026e-07,
"loss": 1.6739,
"step": 16560
},
{
"epoch": 0.612520087925079,
"grad_norm": 1.6719536221986355,
"learning_rate": 1.9788734718442834e-07,
"loss": 1.6453,
"step": 16580
},
{
"epoch": 0.6132589541348801,
"grad_norm": 1.5901664783269642,
"learning_rate": 1.9739690327019692e-07,
"loss": 1.6688,
"step": 16600
},
{
"epoch": 0.6139978203446811,
"grad_norm": 1.5005389488409309,
"learning_rate": 1.9693136881713379e-07,
"loss": 1.6697,
"step": 16620
},
{
"epoch": 0.6147366865544821,
"grad_norm": 1.5857034959363703,
"learning_rate": 1.9644174273011738e-07,
"loss": 1.6639,
"step": 16640
},
{
"epoch": 0.6154755527642832,
"grad_norm": 1.8800052700521002,
"learning_rate": 1.959525399341126e-07,
"loss": 1.6406,
"step": 16660
},
{
"epoch": 0.6162144189740842,
"grad_norm": 1.5463318718925796,
"learning_rate": 1.954637630650633e-07,
"loss": 1.6456,
"step": 16680
},
{
"epoch": 0.6169532851838854,
"grad_norm": 1.7265411721417883,
"learning_rate": 1.9497541475661822e-07,
"loss": 1.6396,
"step": 16700
},
{
"epoch": 0.6176921513936864,
"grad_norm": 1.6019332231293413,
"learning_rate": 1.9448749764011674e-07,
"loss": 1.6319,
"step": 16720
},
{
"epoch": 0.6184310176034874,
"grad_norm": 1.6078339500202126,
"learning_rate": 1.940000143445753e-07,
"loss": 1.6287,
"step": 16740
},
{
"epoch": 0.6191698838132885,
"grad_norm": 1.5200063311449286,
"learning_rate": 1.9351296749667239e-07,
"loss": 1.6556,
"step": 16760
},
{
"epoch": 0.6199087500230895,
"grad_norm": 1.5605900758303721,
"learning_rate": 1.9302635972073504e-07,
"loss": 1.6709,
"step": 16780
},
{
"epoch": 0.6206476162328907,
"grad_norm": 1.5245501861602075,
"learning_rate": 1.9254019363872432e-07,
"loss": 1.6744,
"step": 16800
},
{
"epoch": 0.6213864824426917,
"grad_norm": 1.4527294863239084,
"learning_rate": 1.9205447187022145e-07,
"loss": 1.6564,
"step": 16820
},
{
"epoch": 0.6221253486524928,
"grad_norm": 2.0368137299260276,
"learning_rate": 1.915691970324137e-07,
"loss": 1.6289,
"step": 16840
},
{
"epoch": 0.6228642148622938,
"grad_norm": 2.2640348268112147,
"learning_rate": 1.9108437174007967e-07,
"loss": 1.667,
"step": 16860
},
{
"epoch": 0.6236030810720948,
"grad_norm": 1.4879411305430876,
"learning_rate": 1.9059999860557635e-07,
"loss": 1.6516,
"step": 16880
},
{
"epoch": 0.624341947281896,
"grad_norm": 1.99321589038771,
"learning_rate": 1.9011608023882396e-07,
"loss": 1.6617,
"step": 16900
},
{
"epoch": 0.625080813491697,
"grad_norm": 1.4486992732108148,
"learning_rate": 1.8963261924729247e-07,
"loss": 1.6477,
"step": 16920
},
{
"epoch": 0.6258196797014981,
"grad_norm": 1.4436779823541692,
"learning_rate": 1.8914961823598742e-07,
"loss": 1.6276,
"step": 16940
},
{
"epoch": 0.6265585459112991,
"grad_norm": 1.7823515681610929,
"learning_rate": 1.886670798074358e-07,
"loss": 1.6722,
"step": 16960
},
{
"epoch": 0.6272974121211001,
"grad_norm": 1.4559994514082784,
"learning_rate": 1.8818500656167198e-07,
"loss": 1.6721,
"step": 16980
},
{
"epoch": 0.6280362783309013,
"grad_norm": 1.5502170823927217,
"learning_rate": 1.8770340109622418e-07,
"loss": 1.6468,
"step": 17000
},
{
"epoch": 0.6287751445407023,
"grad_norm": 1.3693032988758314,
"learning_rate": 1.8722226600609974e-07,
"loss": 1.6503,
"step": 17020
},
{
"epoch": 0.6295140107505034,
"grad_norm": 1.8228163395950472,
"learning_rate": 1.8674160388377174e-07,
"loss": 1.6691,
"step": 17040
},
{
"epoch": 0.6302528769603044,
"grad_norm": 1.607512275964286,
"learning_rate": 1.8626141731916446e-07,
"loss": 1.6381,
"step": 17060
},
{
"epoch": 0.6309917431701054,
"grad_norm": 1.6555733853411483,
"learning_rate": 1.8578170889964022e-07,
"loss": 1.624,
"step": 17080
},
{
"epoch": 0.6317306093799065,
"grad_norm": 1.4667357369050853,
"learning_rate": 1.853024812099847e-07,
"loss": 1.6233,
"step": 17100
},
{
"epoch": 0.6324694755897076,
"grad_norm": 1.555065221242107,
"learning_rate": 1.8482373683239316e-07,
"loss": 1.6372,
"step": 17120
},
{
"epoch": 0.6332083417995087,
"grad_norm": 1.5169327799558363,
"learning_rate": 1.8434547834645714e-07,
"loss": 1.6738,
"step": 17140
},
{
"epoch": 0.6339472080093097,
"grad_norm": 1.419410682586359,
"learning_rate": 1.8386770832914955e-07,
"loss": 1.6677,
"step": 17160
},
{
"epoch": 0.6346860742191108,
"grad_norm": 1.6719841699284368,
"learning_rate": 1.833904293548116e-07,
"loss": 1.6821,
"step": 17180
},
{
"epoch": 0.6354249404289118,
"grad_norm": 1.5798183541162123,
"learning_rate": 1.8291364399513864e-07,
"loss": 1.7092,
"step": 17200
},
{
"epoch": 0.6361638066387129,
"grad_norm": 1.4604030691233605,
"learning_rate": 1.8243735481916611e-07,
"loss": 1.662,
"step": 17220
},
{
"epoch": 0.636902672848514,
"grad_norm": 1.7774575653306484,
"learning_rate": 1.8196156439325604e-07,
"loss": 1.655,
"step": 17240
},
{
"epoch": 0.637641539058315,
"grad_norm": 2.062948052538768,
"learning_rate": 1.8148627528108323e-07,
"loss": 1.65,
"step": 17260
},
{
"epoch": 0.6383804052681161,
"grad_norm": 1.7560243016328074,
"learning_rate": 1.8101149004362088e-07,
"loss": 1.6068,
"step": 17280
},
{
"epoch": 0.6391192714779171,
"grad_norm": 1.589922555292764,
"learning_rate": 1.8053721123912764e-07,
"loss": 1.6432,
"step": 17300
},
{
"epoch": 0.6398581376877182,
"grad_norm": 1.7855781248038047,
"learning_rate": 1.8006344142313285e-07,
"loss": 1.6444,
"step": 17320
},
{
"epoch": 0.6405970038975193,
"grad_norm": 1.462859488532895,
"learning_rate": 1.7959018314842395e-07,
"loss": 1.6225,
"step": 17340
},
{
"epoch": 0.6413358701073203,
"grad_norm": 1.5201929263554286,
"learning_rate": 1.7911743896503144e-07,
"loss": 1.6216,
"step": 17360
},
{
"epoch": 0.6420747363171214,
"grad_norm": 1.5039545520824391,
"learning_rate": 1.7864521142021616e-07,
"loss": 1.597,
"step": 17380
},
{
"epoch": 0.6428136025269224,
"grad_norm": 2.1198882531068106,
"learning_rate": 1.7817350305845503e-07,
"loss": 1.6762,
"step": 17400
},
{
"epoch": 0.6435524687367234,
"grad_norm": 1.5052045132821683,
"learning_rate": 1.7770231642142758e-07,
"loss": 1.6459,
"step": 17420
},
{
"epoch": 0.6442913349465246,
"grad_norm": 1.5702310750127326,
"learning_rate": 1.77231654048002e-07,
"loss": 1.5676,
"step": 17440
},
{
"epoch": 0.6450302011563256,
"grad_norm": 1.49975631121171,
"learning_rate": 1.7676151847422188e-07,
"loss": 1.6558,
"step": 17460
},
{
"epoch": 0.6457690673661267,
"grad_norm": 1.8852376014336283,
"learning_rate": 1.7629191223329188e-07,
"loss": 1.6598,
"step": 17480
},
{
"epoch": 0.6465079335759277,
"grad_norm": 1.5809036111526213,
"learning_rate": 1.7582283785556494e-07,
"loss": 1.6148,
"step": 17500
},
{
"epoch": 0.6472467997857289,
"grad_norm": 1.4247569077843545,
"learning_rate": 1.75354297868528e-07,
"loss": 1.6318,
"step": 17520
},
{
"epoch": 0.6479856659955299,
"grad_norm": 1.6577683592238937,
"learning_rate": 1.748862947967885e-07,
"loss": 1.6551,
"step": 17540
},
{
"epoch": 0.6487245322053309,
"grad_norm": 7.300032033927882,
"learning_rate": 1.744188311620608e-07,
"loss": 1.6892,
"step": 17560
},
{
"epoch": 0.649463398415132,
"grad_norm": 1.4132601163703873,
"learning_rate": 1.7395190948315282e-07,
"loss": 1.6817,
"step": 17580
},
{
"epoch": 0.650202264624933,
"grad_norm": 1.5063433467194194,
"learning_rate": 1.7348553227595218e-07,
"loss": 1.6158,
"step": 17600
},
{
"epoch": 0.6509411308347341,
"grad_norm": 1.5169596981657725,
"learning_rate": 1.7301970205341292e-07,
"loss": 1.6779,
"step": 17620
},
{
"epoch": 0.6516799970445352,
"grad_norm": 1.6068564294026548,
"learning_rate": 1.725544213255415e-07,
"loss": 1.6179,
"step": 17640
},
{
"epoch": 0.6524188632543362,
"grad_norm": 1.401533779590892,
"learning_rate": 1.7208969259938396e-07,
"loss": 1.6992,
"step": 17660
},
{
"epoch": 0.6531577294641373,
"grad_norm": 1.7940271180903984,
"learning_rate": 1.7162551837901149e-07,
"loss": 1.6343,
"step": 17680
},
{
"epoch": 0.6538965956739383,
"grad_norm": 1.4503762459176361,
"learning_rate": 1.7116190116550798e-07,
"loss": 1.6241,
"step": 17700
},
{
"epoch": 0.6546354618837394,
"grad_norm": 1.9129744363614924,
"learning_rate": 1.7069884345695585e-07,
"loss": 1.6242,
"step": 17720
},
{
"epoch": 0.6553743280935405,
"grad_norm": 1.4592502547252286,
"learning_rate": 1.7023634774842265e-07,
"loss": 1.6433,
"step": 17740
},
{
"epoch": 0.6561131943033415,
"grad_norm": 2.3740218695344026,
"learning_rate": 1.6977441653194778e-07,
"loss": 1.6407,
"step": 17760
},
{
"epoch": 0.6568520605131426,
"grad_norm": 1.652867656549423,
"learning_rate": 1.6931305229652911e-07,
"loss": 1.6571,
"step": 17780
},
{
"epoch": 0.6575909267229436,
"grad_norm": 1.8510532804043571,
"learning_rate": 1.688522575281096e-07,
"loss": 1.6393,
"step": 17800
},
{
"epoch": 0.6583297929327447,
"grad_norm": 1.5330852891820108,
"learning_rate": 1.6839203470956348e-07,
"loss": 1.6181,
"step": 17820
},
{
"epoch": 0.6590686591425458,
"grad_norm": 2.179872107638406,
"learning_rate": 1.6793238632068323e-07,
"loss": 1.6467,
"step": 17840
},
{
"epoch": 0.6598075253523468,
"grad_norm": 1.5709625450812563,
"learning_rate": 1.6747331483816645e-07,
"loss": 1.6931,
"step": 17860
},
{
"epoch": 0.6605463915621479,
"grad_norm": 1.7454282483475967,
"learning_rate": 1.6701482273560185e-07,
"loss": 1.6292,
"step": 17880
},
{
"epoch": 0.6612852577719489,
"grad_norm": 1.7594994883208979,
"learning_rate": 1.6655691248345655e-07,
"loss": 1.6171,
"step": 17900
},
{
"epoch": 0.66202412398175,
"grad_norm": 1.5140697252908892,
"learning_rate": 1.6609958654906255e-07,
"loss": 1.6319,
"step": 17920
},
{
"epoch": 0.662762990191551,
"grad_norm": 2.248352984954327,
"learning_rate": 1.6564284739660316e-07,
"loss": 1.6363,
"step": 17940
},
{
"epoch": 0.6635018564013522,
"grad_norm": 2.0596192177611368,
"learning_rate": 1.6518669748710013e-07,
"loss": 1.6264,
"step": 17960
},
{
"epoch": 0.6642407226111532,
"grad_norm": 1.4805518708471208,
"learning_rate": 1.647311392784002e-07,
"loss": 1.6559,
"step": 17980
},
{
"epoch": 0.6649795888209542,
"grad_norm": 1.5620227618208977,
"learning_rate": 1.6427617522516196e-07,
"loss": 1.6528,
"step": 18000
},
{
"epoch": 0.6657184550307553,
"grad_norm": 1.5698059903501222,
"learning_rate": 1.6382180777884236e-07,
"loss": 1.68,
"step": 18020
},
{
"epoch": 0.6664573212405563,
"grad_norm": 1.525456023190327,
"learning_rate": 1.6336803938768396e-07,
"loss": 1.6129,
"step": 18040
},
{
"epoch": 0.6671961874503575,
"grad_norm": 1.9244616810959143,
"learning_rate": 1.6291487249670116e-07,
"loss": 1.6074,
"step": 18060
},
{
"epoch": 0.6679350536601585,
"grad_norm": 1.5470316335951617,
"learning_rate": 1.6246230954766744e-07,
"loss": 1.6174,
"step": 18080
},
{
"epoch": 0.6686739198699595,
"grad_norm": 1.460047028189958,
"learning_rate": 1.6201035297910215e-07,
"loss": 1.6387,
"step": 18100
},
{
"epoch": 0.6694127860797606,
"grad_norm": 1.849597715575099,
"learning_rate": 1.6155900522625744e-07,
"loss": 1.6357,
"step": 18120
},
{
"epoch": 0.6701516522895616,
"grad_norm": 1.595432962229376,
"learning_rate": 1.6110826872110478e-07,
"loss": 1.6175,
"step": 18140
},
{
"epoch": 0.6708905184993628,
"grad_norm": 1.5318757576478021,
"learning_rate": 1.6065814589232206e-07,
"loss": 1.6235,
"step": 18160
},
{
"epoch": 0.6716293847091638,
"grad_norm": 1.4152502346247018,
"learning_rate": 1.602086391652807e-07,
"loss": 1.6287,
"step": 18180
},
{
"epoch": 0.6723682509189648,
"grad_norm": 1.730605954821045,
"learning_rate": 1.5975975096203248e-07,
"loss": 1.6297,
"step": 18200
},
{
"epoch": 0.6731071171287659,
"grad_norm": 1.641811600664541,
"learning_rate": 1.5931148370129613e-07,
"loss": 1.6575,
"step": 18220
},
{
"epoch": 0.6738459833385669,
"grad_norm": 1.4446876896322507,
"learning_rate": 1.5886383979844492e-07,
"loss": 1.6488,
"step": 18240
},
{
"epoch": 0.6745848495483681,
"grad_norm": 1.6489416268912538,
"learning_rate": 1.5841682166549308e-07,
"loss": 1.6466,
"step": 18260
},
{
"epoch": 0.6753237157581691,
"grad_norm": 1.6240331247999147,
"learning_rate": 1.5797043171108297e-07,
"loss": 1.6693,
"step": 18280
},
{
"epoch": 0.6760625819679702,
"grad_norm": 2.2147991050957,
"learning_rate": 1.5752467234047263e-07,
"loss": 1.6051,
"step": 18300
},
{
"epoch": 0.6768014481777712,
"grad_norm": 1.5203059720344088,
"learning_rate": 1.5707954595552187e-07,
"loss": 1.653,
"step": 18320
},
{
"epoch": 0.6775403143875722,
"grad_norm": 1.5328417599383586,
"learning_rate": 1.5663505495468e-07,
"loss": 1.6381,
"step": 18340
},
{
"epoch": 0.6782791805973734,
"grad_norm": 1.5445956099646183,
"learning_rate": 1.5619120173297267e-07,
"loss": 1.6037,
"step": 18360
},
{
"epoch": 0.6790180468071744,
"grad_norm": 1.479872310550016,
"learning_rate": 1.5574798868198912e-07,
"loss": 1.6353,
"step": 18380
},
{
"epoch": 0.6797569130169755,
"grad_norm": 1.7841436633262773,
"learning_rate": 1.5530541818986927e-07,
"loss": 1.7364,
"step": 18400
},
{
"epoch": 0.6804957792267765,
"grad_norm": 1.529508435392583,
"learning_rate": 1.5486349264129046e-07,
"loss": 1.6181,
"step": 18420
},
{
"epoch": 0.6812346454365775,
"grad_norm": 1.6539396952625665,
"learning_rate": 1.5442221441745533e-07,
"loss": 1.6985,
"step": 18440
},
{
"epoch": 0.6819735116463786,
"grad_norm": 1.5860780535239207,
"learning_rate": 1.5398158589607813e-07,
"loss": 1.6636,
"step": 18460
},
{
"epoch": 0.6827123778561797,
"grad_norm": 1.9353694955508953,
"learning_rate": 1.5354160945137268e-07,
"loss": 1.6277,
"step": 18480
},
{
"epoch": 0.6834512440659808,
"grad_norm": 1.4060414431962835,
"learning_rate": 1.5310228745403925e-07,
"loss": 1.6348,
"step": 18500
},
{
"epoch": 0.6841901102757818,
"grad_norm": 1.9510007446700244,
"learning_rate": 1.5266362227125164e-07,
"loss": 1.666,
"step": 18520
},
{
"epoch": 0.6849289764855828,
"grad_norm": 2.5976331102164694,
"learning_rate": 1.5222561626664448e-07,
"loss": 1.6437,
"step": 18540
},
{
"epoch": 0.6856678426953839,
"grad_norm": 1.635565277090673,
"learning_rate": 1.51788271800301e-07,
"loss": 1.6367,
"step": 18560
},
{
"epoch": 0.686406708905185,
"grad_norm": 1.6414633412876904,
"learning_rate": 1.5135159122873936e-07,
"loss": 1.6239,
"step": 18580
},
{
"epoch": 0.6871455751149861,
"grad_norm": 1.972663651970077,
"learning_rate": 1.5091557690490104e-07,
"loss": 1.6551,
"step": 18600
},
{
"epoch": 0.6878844413247871,
"grad_norm": 1.376913066395765,
"learning_rate": 1.504802311781371e-07,
"loss": 1.6494,
"step": 18620
},
{
"epoch": 0.6886233075345882,
"grad_norm": 1.441207784040776,
"learning_rate": 1.5004555639419648e-07,
"loss": 1.6697,
"step": 18640
},
{
"epoch": 0.6893621737443892,
"grad_norm": 2.5475644652288514,
"learning_rate": 1.4961155489521253e-07,
"loss": 1.6449,
"step": 18660
},
{
"epoch": 0.6901010399541903,
"grad_norm": 1.4330764200962958,
"learning_rate": 1.4917822901969108e-07,
"loss": 1.5962,
"step": 18680
},
{
"epoch": 0.6908399061639914,
"grad_norm": 1.5535375552432238,
"learning_rate": 1.487455811024975e-07,
"loss": 1.6682,
"step": 18700
},
{
"epoch": 0.6915787723737924,
"grad_norm": 1.5430558472764233,
"learning_rate": 1.4831361347484396e-07,
"loss": 1.6646,
"step": 18720
},
{
"epoch": 0.6923176385835935,
"grad_norm": 1.5354124537032656,
"learning_rate": 1.4788232846427718e-07,
"loss": 1.6569,
"step": 18740
},
{
"epoch": 0.6930565047933945,
"grad_norm": 1.723896126450271,
"learning_rate": 1.474517283946658e-07,
"loss": 1.6694,
"step": 18760
},
{
"epoch": 0.6937953710031955,
"grad_norm": 1.4743738549149994,
"learning_rate": 1.4702181558618777e-07,
"loss": 1.6161,
"step": 18780
},
{
"epoch": 0.6945342372129967,
"grad_norm": 1.675747008439809,
"learning_rate": 1.4659259235531796e-07,
"loss": 1.6558,
"step": 18800
},
{
"epoch": 0.6952731034227977,
"grad_norm": 1.760786257067446,
"learning_rate": 1.4616406101481574e-07,
"loss": 1.5887,
"step": 18820
},
{
"epoch": 0.6960119696325988,
"grad_norm": 2.8049367365120608,
"learning_rate": 1.4573622387371217e-07,
"loss": 1.6649,
"step": 18840
},
{
"epoch": 0.6967508358423998,
"grad_norm": 1.496529351669967,
"learning_rate": 1.4530908323729782e-07,
"loss": 1.6433,
"step": 18860
},
{
"epoch": 0.6974897020522008,
"grad_norm": 1.4994802420062043,
"learning_rate": 1.448826414071105e-07,
"loss": 1.6841,
"step": 18880
},
{
"epoch": 0.698228568262002,
"grad_norm": 1.420851366464802,
"learning_rate": 1.4445690068092265e-07,
"loss": 1.6504,
"step": 18900
},
{
"epoch": 0.698967434471803,
"grad_norm": 1.7411191806669424,
"learning_rate": 1.4403186335272888e-07,
"loss": 1.6298,
"step": 18920
},
{
"epoch": 0.6997063006816041,
"grad_norm": 1.628227507992112,
"learning_rate": 1.4360753171273364e-07,
"loss": 1.673,
"step": 18940
},
{
"epoch": 0.7004451668914051,
"grad_norm": 1.7368645634603777,
"learning_rate": 1.4318390804733927e-07,
"loss": 1.6198,
"step": 18960
},
{
"epoch": 0.7011840331012062,
"grad_norm": 1.4616447916754742,
"learning_rate": 1.4276099463913315e-07,
"loss": 1.6096,
"step": 18980
},
{
"epoch": 0.7019228993110073,
"grad_norm": 1.517480098110094,
"learning_rate": 1.4233879376687563e-07,
"loss": 1.6345,
"step": 19000
},
{
"epoch": 0.7026617655208083,
"grad_norm": 1.636195025828432,
"learning_rate": 1.419173077054878e-07,
"loss": 1.6119,
"step": 19020
},
{
"epoch": 0.7034006317306094,
"grad_norm": 1.5039586339840252,
"learning_rate": 1.4149653872603917e-07,
"loss": 1.7208,
"step": 19040
},
{
"epoch": 0.7041394979404104,
"grad_norm": 1.4728764699529369,
"learning_rate": 1.410764890957353e-07,
"loss": 1.6572,
"step": 19060
},
{
"epoch": 0.7048783641502115,
"grad_norm": 1.9218697223400836,
"learning_rate": 1.406571610779059e-07,
"loss": 1.6514,
"step": 19080
},
{
"epoch": 0.7056172303600126,
"grad_norm": 1.5761294021476189,
"learning_rate": 1.4023855693199254e-07,
"loss": 1.6381,
"step": 19100
},
{
"epoch": 0.7063560965698136,
"grad_norm": 1.435908518604352,
"learning_rate": 1.398206789135361e-07,
"loss": 1.6126,
"step": 19120
},
{
"epoch": 0.7070949627796147,
"grad_norm": 4.717577212666518,
"learning_rate": 1.3940352927416504e-07,
"loss": 1.6647,
"step": 19140
},
{
"epoch": 0.7078338289894157,
"grad_norm": 2.1188904047967245,
"learning_rate": 1.3898711026158323e-07,
"loss": 1.6794,
"step": 19160
},
{
"epoch": 0.7085726951992168,
"grad_norm": 1.5687418673344722,
"learning_rate": 1.3857142411955767e-07,
"loss": 1.6474,
"step": 19180
},
{
"epoch": 0.7093115614090179,
"grad_norm": 1.6271449022527302,
"learning_rate": 1.381564730879064e-07,
"loss": 1.6347,
"step": 19200
},
{
"epoch": 0.7100504276188189,
"grad_norm": 1.4693942372788273,
"learning_rate": 1.377422594024867e-07,
"loss": 1.6474,
"step": 19220
},
{
"epoch": 0.71078929382862,
"grad_norm": 1.488154969512232,
"learning_rate": 1.373287852951826e-07,
"loss": 1.6128,
"step": 19240
},
{
"epoch": 0.711528160038421,
"grad_norm": 1.5779135188256272,
"learning_rate": 1.3691605299389328e-07,
"loss": 1.7183,
"step": 19260
},
{
"epoch": 0.7122670262482221,
"grad_norm": 1.6650630460525442,
"learning_rate": 1.3650406472252083e-07,
"loss": 1.6683,
"step": 19280
},
{
"epoch": 0.7130058924580231,
"grad_norm": 1.4154650117196357,
"learning_rate": 1.360928227009584e-07,
"loss": 1.6717,
"step": 19300
},
{
"epoch": 0.7137447586678243,
"grad_norm": 1.6468623503038222,
"learning_rate": 1.3568232914507802e-07,
"loss": 1.6348,
"step": 19320
},
{
"epoch": 0.7144836248776253,
"grad_norm": 1.5015397491680238,
"learning_rate": 1.3527258626671898e-07,
"loss": 1.6112,
"step": 19340
},
{
"epoch": 0.7152224910874263,
"grad_norm": 3.3400095996186865,
"learning_rate": 1.348635962736755e-07,
"loss": 1.6523,
"step": 19360
},
{
"epoch": 0.7159613572972274,
"grad_norm": 1.5139103946143873,
"learning_rate": 1.344553613696854e-07,
"loss": 1.6941,
"step": 19380
},
{
"epoch": 0.7167002235070284,
"grad_norm": 1.4051928644539238,
"learning_rate": 1.340478837544175e-07,
"loss": 1.6237,
"step": 19400
},
{
"epoch": 0.7174390897168296,
"grad_norm": 1.5234389161550645,
"learning_rate": 1.3364116562346055e-07,
"loss": 1.6559,
"step": 19420
},
{
"epoch": 0.7181779559266306,
"grad_norm": 1.4205504198026582,
"learning_rate": 1.3323520916831077e-07,
"loss": 1.6478,
"step": 19440
},
{
"epoch": 0.7189168221364316,
"grad_norm": 1.5989862880917087,
"learning_rate": 1.328300165763602e-07,
"loss": 1.6123,
"step": 19460
},
{
"epoch": 0.7196556883462327,
"grad_norm": 1.6443108557654487,
"learning_rate": 1.3242559003088546e-07,
"loss": 1.6832,
"step": 19480
},
{
"epoch": 0.7203945545560337,
"grad_norm": 1.3202697054517272,
"learning_rate": 1.3202193171103506e-07,
"loss": 1.6339,
"step": 19500
},
{
"epoch": 0.7211334207658349,
"grad_norm": 1.5006943077767945,
"learning_rate": 1.316190437918182e-07,
"loss": 1.6469,
"step": 19520
},
{
"epoch": 0.7218722869756359,
"grad_norm": 1.7379534891877164,
"learning_rate": 1.3121692844409321e-07,
"loss": 1.6797,
"step": 19540
},
{
"epoch": 0.7226111531854369,
"grad_norm": 1.526373724090785,
"learning_rate": 1.308155878345553e-07,
"loss": 1.6636,
"step": 19560
},
{
"epoch": 0.723350019395238,
"grad_norm": 2.0046685771285793,
"learning_rate": 1.3041502412572542e-07,
"loss": 1.6748,
"step": 19580
},
{
"epoch": 0.724088885605039,
"grad_norm": 1.4955882650728989,
"learning_rate": 1.3001523947593845e-07,
"loss": 1.6293,
"step": 19600
},
{
"epoch": 0.7248277518148402,
"grad_norm": 2.4302511767713324,
"learning_rate": 1.2961623603933134e-07,
"loss": 1.6004,
"step": 19620
},
{
"epoch": 0.7255666180246412,
"grad_norm": 1.6494154347871601,
"learning_rate": 1.2921801596583153e-07,
"loss": 1.6136,
"step": 19640
},
{
"epoch": 0.7263054842344423,
"grad_norm": 1.4459727786023948,
"learning_rate": 1.2882058140114594e-07,
"loss": 1.6435,
"step": 19660
},
{
"epoch": 0.7270443504442433,
"grad_norm": 1.4490955525578755,
"learning_rate": 1.2842393448674869e-07,
"loss": 1.6508,
"step": 19680
},
{
"epoch": 0.7277832166540443,
"grad_norm": 1.7167939191812815,
"learning_rate": 1.280280773598699e-07,
"loss": 1.6299,
"step": 19700
},
{
"epoch": 0.7285220828638455,
"grad_norm": 1.8412228497101617,
"learning_rate": 1.2763301215348402e-07,
"loss": 1.6758,
"step": 19720
},
{
"epoch": 0.7292609490736465,
"grad_norm": 1.6407591339864582,
"learning_rate": 1.2723874099629866e-07,
"loss": 1.6443,
"step": 19740
},
{
"epoch": 0.7299998152834476,
"grad_norm": 2.010243920808459,
"learning_rate": 1.268452660127427e-07,
"loss": 1.6317,
"step": 19760
},
{
"epoch": 0.7307386814932486,
"grad_norm": 1.521357800662826,
"learning_rate": 1.2645258932295518e-07,
"loss": 1.6162,
"step": 19780
},
{
"epoch": 0.7314775477030496,
"grad_norm": 1.5657714545631887,
"learning_rate": 1.260607130427737e-07,
"loss": 1.6134,
"step": 19800
},
{
"epoch": 0.7322164139128507,
"grad_norm": 1.7902489767561236,
"learning_rate": 1.2566963928372308e-07,
"loss": 1.6633,
"step": 19820
},
{
"epoch": 0.7329552801226518,
"grad_norm": 2.0435731294538466,
"learning_rate": 1.2527937015300378e-07,
"loss": 1.6505,
"step": 19840
},
{
"epoch": 0.7336941463324529,
"grad_norm": 5.207754525218824,
"learning_rate": 1.2488990775348092e-07,
"loss": 1.6453,
"step": 19860
},
{
"epoch": 0.7344330125422539,
"grad_norm": 1.6112840529464336,
"learning_rate": 1.245012541836728e-07,
"loss": 1.6082,
"step": 19880
},
{
"epoch": 0.7351718787520549,
"grad_norm": 1.4827262765821532,
"learning_rate": 1.241134115377394e-07,
"loss": 1.6161,
"step": 19900
},
{
"epoch": 0.735910744961856,
"grad_norm": 1.7219801506968755,
"learning_rate": 1.2372638190547122e-07,
"loss": 1.6305,
"step": 19920
},
{
"epoch": 0.7366496111716571,
"grad_norm": 1.3720219936046893,
"learning_rate": 1.233401673722782e-07,
"loss": 1.6099,
"step": 19940
},
{
"epoch": 0.7373884773814582,
"grad_norm": 1.7432612385035637,
"learning_rate": 1.229547700191783e-07,
"loss": 1.6372,
"step": 19960
},
{
"epoch": 0.7381273435912592,
"grad_norm": 3.2034872788326925,
"learning_rate": 1.2257019192278617e-07,
"loss": 1.6147,
"step": 19980
},
{
"epoch": 0.7388662098010603,
"grad_norm": 1.6442745462596664,
"learning_rate": 1.2218643515530227e-07,
"loss": 1.6344,
"step": 20000
},
{
"epoch": 0.7396050760108613,
"grad_norm": 1.959102806007239,
"learning_rate": 1.218035017845015e-07,
"loss": 1.6451,
"step": 20020
},
{
"epoch": 0.7403439422206624,
"grad_norm": 1.6408059937998853,
"learning_rate": 1.214213938737219e-07,
"loss": 1.6757,
"step": 20040
},
{
"epoch": 0.7410828084304635,
"grad_norm": 1.5657243128524525,
"learning_rate": 1.210591578161399e-07,
"loss": 1.6359,
"step": 20060
},
{
"epoch": 0.7418216746402645,
"grad_norm": 1.4736673628427441,
"learning_rate": 1.2067866547022443e-07,
"loss": 1.6603,
"step": 20080
},
{
"epoch": 0.7425605408500656,
"grad_norm": 1.4833315219916223,
"learning_rate": 1.2029900464522203e-07,
"loss": 1.6342,
"step": 20100
},
{
"epoch": 0.7432994070598666,
"grad_norm": 1.9340686772443259,
"learning_rate": 1.1992017738683768e-07,
"loss": 1.6416,
"step": 20120
},
{
"epoch": 0.7440382732696676,
"grad_norm": 1.6355654798248513,
"learning_rate": 1.1954218573628499e-07,
"loss": 1.6678,
"step": 20140
},
{
"epoch": 0.7447771394794688,
"grad_norm": 1.5624481100138734,
"learning_rate": 1.1916503173027475e-07,
"loss": 1.614,
"step": 20160
},
{
"epoch": 0.7455160056892698,
"grad_norm": 1.5029974061648055,
"learning_rate": 1.1878871740100476e-07,
"loss": 1.639,
"step": 20180
},
{
"epoch": 0.7462548718990709,
"grad_norm": 1.4683397727523646,
"learning_rate": 1.1841324477614812e-07,
"loss": 1.6516,
"step": 20200
},
{
"epoch": 0.7469937381088719,
"grad_norm": 1.478703041295488,
"learning_rate": 1.1803861587884268e-07,
"loss": 1.7247,
"step": 20220
},
{
"epoch": 0.7477326043186729,
"grad_norm": 1.4765169074470068,
"learning_rate": 1.1766483272768017e-07,
"loss": 1.6786,
"step": 20240
},
{
"epoch": 0.7484714705284741,
"grad_norm": 1.3861683142566674,
"learning_rate": 1.1729189733669528e-07,
"loss": 1.6242,
"step": 20260
},
{
"epoch": 0.7492103367382751,
"grad_norm": 1.5107470749741048,
"learning_rate": 1.1691981171535459e-07,
"loss": 1.6476,
"step": 20280
},
{
"epoch": 0.7499492029480762,
"grad_norm": 1.5696138247640767,
"learning_rate": 1.1654857786854591e-07,
"loss": 1.6691,
"step": 20300
}
],
"logging_steps": 20,
"max_steps": 27068,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 6767,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3859009492746240.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}