QuranGPT / checkpoint-12603 /trainer_state.json
tempdas's picture
Upload folder using huggingface_hub
3981ba9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 12603,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00238038562247084,
"grad_norm": 29.197416305541992,
"learning_rate": 4.996032690629215e-05,
"loss": 5.8295,
"step": 10
},
{
"epoch": 0.00476077124494168,
"grad_norm": 2.8866491317749023,
"learning_rate": 4.99206538125843e-05,
"loss": 0.9476,
"step": 20
},
{
"epoch": 0.007141156867412521,
"grad_norm": 2.2606563568115234,
"learning_rate": 4.988098071887646e-05,
"loss": 0.1466,
"step": 30
},
{
"epoch": 0.00952154248988336,
"grad_norm": 2.5246834754943848,
"learning_rate": 4.984130762516862e-05,
"loss": 0.0596,
"step": 40
},
{
"epoch": 0.011901928112354201,
"grad_norm": 1.10219144821167,
"learning_rate": 4.980163453146077e-05,
"loss": 0.0351,
"step": 50
},
{
"epoch": 0.014282313734825042,
"grad_norm": 1.7988760471343994,
"learning_rate": 4.976196143775292e-05,
"loss": 0.0293,
"step": 60
},
{
"epoch": 0.016662699357295883,
"grad_norm": 0.2419203370809555,
"learning_rate": 4.972228834404507e-05,
"loss": 0.024,
"step": 70
},
{
"epoch": 0.01904308497976672,
"grad_norm": 0.992480993270874,
"learning_rate": 4.9682615250337225e-05,
"loss": 0.0191,
"step": 80
},
{
"epoch": 0.021423470602237562,
"grad_norm": 1.2107903957366943,
"learning_rate": 4.9642942156629376e-05,
"loss": 0.0147,
"step": 90
},
{
"epoch": 0.023803856224708403,
"grad_norm": 1.5667377710342407,
"learning_rate": 4.960326906292153e-05,
"loss": 0.0144,
"step": 100
},
{
"epoch": 0.026184241847179244,
"grad_norm": 1.7987982034683228,
"learning_rate": 4.956359596921368e-05,
"loss": 0.0121,
"step": 110
},
{
"epoch": 0.028564627469650085,
"grad_norm": 0.7142848968505859,
"learning_rate": 4.952392287550583e-05,
"loss": 0.0109,
"step": 120
},
{
"epoch": 0.030945013092120922,
"grad_norm": 0.9309341311454773,
"learning_rate": 4.9484249781797984e-05,
"loss": 0.0087,
"step": 130
},
{
"epoch": 0.03332539871459177,
"grad_norm": 0.2679256498813629,
"learning_rate": 4.944457668809014e-05,
"loss": 0.0065,
"step": 140
},
{
"epoch": 0.035705784337062604,
"grad_norm": 0.36588725447654724,
"learning_rate": 4.940490359438229e-05,
"loss": 0.0075,
"step": 150
},
{
"epoch": 0.03808616995953344,
"grad_norm": 0.6737563610076904,
"learning_rate": 4.936523050067445e-05,
"loss": 0.0092,
"step": 160
},
{
"epoch": 0.040466555582004286,
"grad_norm": 0.3371886610984802,
"learning_rate": 4.93255574069666e-05,
"loss": 0.0067,
"step": 170
},
{
"epoch": 0.042846941204475124,
"grad_norm": 1.0238951444625854,
"learning_rate": 4.928588431325875e-05,
"loss": 0.0084,
"step": 180
},
{
"epoch": 0.04522732682694597,
"grad_norm": 1.0350103378295898,
"learning_rate": 4.9246211219550906e-05,
"loss": 0.0073,
"step": 190
},
{
"epoch": 0.047607712449416806,
"grad_norm": 0.33256474137306213,
"learning_rate": 4.9206538125843056e-05,
"loss": 0.0082,
"step": 200
},
{
"epoch": 0.04998809807188764,
"grad_norm": 0.0693468451499939,
"learning_rate": 4.9166865032135206e-05,
"loss": 0.0044,
"step": 210
},
{
"epoch": 0.05236848369435849,
"grad_norm": 0.8809625506401062,
"learning_rate": 4.912719193842736e-05,
"loss": 0.0064,
"step": 220
},
{
"epoch": 0.054748869316829325,
"grad_norm": 0.36927270889282227,
"learning_rate": 4.9087518844719514e-05,
"loss": 0.0066,
"step": 230
},
{
"epoch": 0.05712925493930017,
"grad_norm": 0.8885632753372192,
"learning_rate": 4.9047845751011664e-05,
"loss": 0.0063,
"step": 240
},
{
"epoch": 0.05950964056177101,
"grad_norm": 0.5330325365066528,
"learning_rate": 4.900817265730382e-05,
"loss": 0.0059,
"step": 250
},
{
"epoch": 0.061890026184241845,
"grad_norm": 0.5747584700584412,
"learning_rate": 4.896849956359597e-05,
"loss": 0.0056,
"step": 260
},
{
"epoch": 0.06427041180671268,
"grad_norm": 0.10936570912599564,
"learning_rate": 4.892882646988812e-05,
"loss": 0.0038,
"step": 270
},
{
"epoch": 0.06665079742918353,
"grad_norm": 0.136638343334198,
"learning_rate": 4.888915337618027e-05,
"loss": 0.006,
"step": 280
},
{
"epoch": 0.06903118305165437,
"grad_norm": 0.25448599457740784,
"learning_rate": 4.884948028247243e-05,
"loss": 0.0052,
"step": 290
},
{
"epoch": 0.07141156867412521,
"grad_norm": 0.19224955141544342,
"learning_rate": 4.8809807188764586e-05,
"loss": 0.0041,
"step": 300
},
{
"epoch": 0.07379195429659605,
"grad_norm": 0.9061737060546875,
"learning_rate": 4.8770134095056736e-05,
"loss": 0.0051,
"step": 310
},
{
"epoch": 0.07617233991906688,
"grad_norm": 0.31071603298187256,
"learning_rate": 4.873046100134889e-05,
"loss": 0.0043,
"step": 320
},
{
"epoch": 0.07855272554153774,
"grad_norm": 0.054100409150123596,
"learning_rate": 4.869078790764104e-05,
"loss": 0.004,
"step": 330
},
{
"epoch": 0.08093311116400857,
"grad_norm": 0.11965326964855194,
"learning_rate": 4.865111481393319e-05,
"loss": 0.0039,
"step": 340
},
{
"epoch": 0.08331349678647941,
"grad_norm": 0.16056092083454132,
"learning_rate": 4.861144172022535e-05,
"loss": 0.0036,
"step": 350
},
{
"epoch": 0.08569388240895025,
"grad_norm": 0.08699148148298264,
"learning_rate": 4.85717686265175e-05,
"loss": 0.0032,
"step": 360
},
{
"epoch": 0.08807426803142109,
"grad_norm": 0.16824030876159668,
"learning_rate": 4.853209553280965e-05,
"loss": 0.0033,
"step": 370
},
{
"epoch": 0.09045465365389194,
"grad_norm": 0.07728957384824753,
"learning_rate": 4.84924224391018e-05,
"loss": 0.0023,
"step": 380
},
{
"epoch": 0.09283503927636277,
"grad_norm": 0.2950897514820099,
"learning_rate": 4.845274934539395e-05,
"loss": 0.0039,
"step": 390
},
{
"epoch": 0.09521542489883361,
"grad_norm": 0.6249143481254578,
"learning_rate": 4.841307625168611e-05,
"loss": 0.012,
"step": 400
},
{
"epoch": 0.09759581052130445,
"grad_norm": 0.06545058637857437,
"learning_rate": 4.837340315797826e-05,
"loss": 0.0022,
"step": 410
},
{
"epoch": 0.09997619614377529,
"grad_norm": 0.40417027473449707,
"learning_rate": 4.833373006427042e-05,
"loss": 0.003,
"step": 420
},
{
"epoch": 0.10235658176624614,
"grad_norm": 0.38520482182502747,
"learning_rate": 4.829405697056257e-05,
"loss": 0.0037,
"step": 430
},
{
"epoch": 0.10473696738871698,
"grad_norm": 0.9367744326591492,
"learning_rate": 4.825438387685472e-05,
"loss": 0.0029,
"step": 440
},
{
"epoch": 0.10711735301118781,
"grad_norm": 0.09369224309921265,
"learning_rate": 4.8214710783146875e-05,
"loss": 0.0021,
"step": 450
},
{
"epoch": 0.10949773863365865,
"grad_norm": 1.1114966869354248,
"learning_rate": 4.8175037689439025e-05,
"loss": 0.0024,
"step": 460
},
{
"epoch": 0.11187812425612949,
"grad_norm": 0.15539304912090302,
"learning_rate": 4.8135364595731175e-05,
"loss": 0.0026,
"step": 470
},
{
"epoch": 0.11425850987860034,
"grad_norm": 0.05451425537467003,
"learning_rate": 4.809569150202333e-05,
"loss": 0.0024,
"step": 480
},
{
"epoch": 0.11663889550107118,
"grad_norm": 0.08954957127571106,
"learning_rate": 4.805601840831548e-05,
"loss": 0.0032,
"step": 490
},
{
"epoch": 0.11901928112354201,
"grad_norm": 0.24188756942749023,
"learning_rate": 4.801634531460763e-05,
"loss": 0.0023,
"step": 500
},
{
"epoch": 0.12139966674601285,
"grad_norm": 0.062233567237854004,
"learning_rate": 4.797667222089979e-05,
"loss": 0.002,
"step": 510
},
{
"epoch": 0.12378005236848369,
"grad_norm": 0.605993926525116,
"learning_rate": 4.793699912719194e-05,
"loss": 0.0021,
"step": 520
},
{
"epoch": 0.12616043799095453,
"grad_norm": 1.5091257095336914,
"learning_rate": 4.789732603348409e-05,
"loss": 0.0026,
"step": 530
},
{
"epoch": 0.12854082361342536,
"grad_norm": 0.07300706952810287,
"learning_rate": 4.785765293977625e-05,
"loss": 0.0018,
"step": 540
},
{
"epoch": 0.1309212092358962,
"grad_norm": 0.07547351717948914,
"learning_rate": 4.78179798460684e-05,
"loss": 0.0022,
"step": 550
},
{
"epoch": 0.13330159485836707,
"grad_norm": 0.017345329746603966,
"learning_rate": 4.7778306752360555e-05,
"loss": 0.002,
"step": 560
},
{
"epoch": 0.1356819804808379,
"grad_norm": 0.048248808830976486,
"learning_rate": 4.7738633658652705e-05,
"loss": 0.0018,
"step": 570
},
{
"epoch": 0.13806236610330874,
"grad_norm": 0.04654766246676445,
"learning_rate": 4.7698960564944856e-05,
"loss": 0.0026,
"step": 580
},
{
"epoch": 0.14044275172577958,
"grad_norm": 0.7228689193725586,
"learning_rate": 4.7659287471237006e-05,
"loss": 0.0033,
"step": 590
},
{
"epoch": 0.14282313734825042,
"grad_norm": 0.01947982981801033,
"learning_rate": 4.761961437752916e-05,
"loss": 0.0024,
"step": 600
},
{
"epoch": 0.14520352297072125,
"grad_norm": 0.03398985415697098,
"learning_rate": 4.7579941283821314e-05,
"loss": 0.0019,
"step": 610
},
{
"epoch": 0.1475839085931921,
"grad_norm": 0.11993751674890518,
"learning_rate": 4.754026819011347e-05,
"loss": 0.0024,
"step": 620
},
{
"epoch": 0.14996429421566293,
"grad_norm": 0.02739240974187851,
"learning_rate": 4.750059509640562e-05,
"loss": 0.0019,
"step": 630
},
{
"epoch": 0.15234467983813377,
"grad_norm": 0.08998490869998932,
"learning_rate": 4.746092200269777e-05,
"loss": 0.0026,
"step": 640
},
{
"epoch": 0.1547250654606046,
"grad_norm": 0.06008267030119896,
"learning_rate": 4.742124890898992e-05,
"loss": 0.0019,
"step": 650
},
{
"epoch": 0.15710545108307547,
"grad_norm": 0.2969667911529541,
"learning_rate": 4.738157581528208e-05,
"loss": 0.0016,
"step": 660
},
{
"epoch": 0.1594858367055463,
"grad_norm": 0.056759823113679886,
"learning_rate": 4.7341902721574236e-05,
"loss": 0.0026,
"step": 670
},
{
"epoch": 0.16186622232801715,
"grad_norm": 0.36679673194885254,
"learning_rate": 4.7302229627866386e-05,
"loss": 0.0023,
"step": 680
},
{
"epoch": 0.16424660795048798,
"grad_norm": 0.29111284017562866,
"learning_rate": 4.7262556534158536e-05,
"loss": 0.0028,
"step": 690
},
{
"epoch": 0.16662699357295882,
"grad_norm": 0.48570939898490906,
"learning_rate": 4.722288344045069e-05,
"loss": 0.0015,
"step": 700
},
{
"epoch": 0.16900737919542966,
"grad_norm": 0.06863627582788467,
"learning_rate": 4.718321034674284e-05,
"loss": 0.0016,
"step": 710
},
{
"epoch": 0.1713877648179005,
"grad_norm": 0.18400460481643677,
"learning_rate": 4.7143537253034994e-05,
"loss": 0.0025,
"step": 720
},
{
"epoch": 0.17376815044037133,
"grad_norm": 0.02043345756828785,
"learning_rate": 4.710386415932715e-05,
"loss": 0.0014,
"step": 730
},
{
"epoch": 0.17614853606284217,
"grad_norm": 0.22026614844799042,
"learning_rate": 4.70641910656193e-05,
"loss": 0.0016,
"step": 740
},
{
"epoch": 0.178528921685313,
"grad_norm": 0.033756159245967865,
"learning_rate": 4.702451797191145e-05,
"loss": 0.0015,
"step": 750
},
{
"epoch": 0.18090930730778387,
"grad_norm": 0.03022690862417221,
"learning_rate": 4.69848448782036e-05,
"loss": 0.0016,
"step": 760
},
{
"epoch": 0.1832896929302547,
"grad_norm": 0.32997235655784607,
"learning_rate": 4.694517178449576e-05,
"loss": 0.0017,
"step": 770
},
{
"epoch": 0.18567007855272555,
"grad_norm": 0.6392120718955994,
"learning_rate": 4.690549869078791e-05,
"loss": 0.0015,
"step": 780
},
{
"epoch": 0.18805046417519639,
"grad_norm": 0.12279071658849716,
"learning_rate": 4.6865825597080066e-05,
"loss": 0.0016,
"step": 790
},
{
"epoch": 0.19043084979766722,
"grad_norm": 0.1228996068239212,
"learning_rate": 4.682615250337222e-05,
"loss": 0.0012,
"step": 800
},
{
"epoch": 0.19281123542013806,
"grad_norm": 0.23846402764320374,
"learning_rate": 4.678647940966437e-05,
"loss": 0.0011,
"step": 810
},
{
"epoch": 0.1951916210426089,
"grad_norm": 0.06786726415157318,
"learning_rate": 4.674680631595652e-05,
"loss": 0.0008,
"step": 820
},
{
"epoch": 0.19757200666507974,
"grad_norm": 0.062252361327409744,
"learning_rate": 4.6707133222248675e-05,
"loss": 0.0009,
"step": 830
},
{
"epoch": 0.19995239228755057,
"grad_norm": 0.10420612245798111,
"learning_rate": 4.6667460128540825e-05,
"loss": 0.0007,
"step": 840
},
{
"epoch": 0.2023327779100214,
"grad_norm": 0.024685313925147057,
"learning_rate": 4.6627787034832975e-05,
"loss": 0.0012,
"step": 850
},
{
"epoch": 0.20471316353249228,
"grad_norm": 0.07784374058246613,
"learning_rate": 4.658811394112513e-05,
"loss": 0.0011,
"step": 860
},
{
"epoch": 0.2070935491549631,
"grad_norm": 0.1463196724653244,
"learning_rate": 4.654844084741728e-05,
"loss": 0.0017,
"step": 870
},
{
"epoch": 0.20947393477743395,
"grad_norm": 0.04599474370479584,
"learning_rate": 4.650876775370944e-05,
"loss": 0.0013,
"step": 880
},
{
"epoch": 0.2118543203999048,
"grad_norm": 0.44877147674560547,
"learning_rate": 4.646909466000159e-05,
"loss": 0.0012,
"step": 890
},
{
"epoch": 0.21423470602237563,
"grad_norm": 1.3056105375289917,
"learning_rate": 4.642942156629374e-05,
"loss": 0.0018,
"step": 900
},
{
"epoch": 0.21661509164484646,
"grad_norm": 0.5220457911491394,
"learning_rate": 4.638974847258589e-05,
"loss": 0.0011,
"step": 910
},
{
"epoch": 0.2189954772673173,
"grad_norm": 0.5913621783256531,
"learning_rate": 4.635007537887805e-05,
"loss": 0.0013,
"step": 920
},
{
"epoch": 0.22137586288978814,
"grad_norm": 0.150216206908226,
"learning_rate": 4.63104022851702e-05,
"loss": 0.001,
"step": 930
},
{
"epoch": 0.22375624851225898,
"grad_norm": 0.022638270631432533,
"learning_rate": 4.6270729191462355e-05,
"loss": 0.0012,
"step": 940
},
{
"epoch": 0.2261366341347298,
"grad_norm": 0.017948875203728676,
"learning_rate": 4.6231056097754505e-05,
"loss": 0.0008,
"step": 950
},
{
"epoch": 0.22851701975720068,
"grad_norm": 0.25053608417510986,
"learning_rate": 4.6191383004046656e-05,
"loss": 0.0019,
"step": 960
},
{
"epoch": 0.23089740537967152,
"grad_norm": 0.12757046520709991,
"learning_rate": 4.6151709910338806e-05,
"loss": 0.0019,
"step": 970
},
{
"epoch": 0.23327779100214235,
"grad_norm": 0.185049369931221,
"learning_rate": 4.611203681663096e-05,
"loss": 0.0014,
"step": 980
},
{
"epoch": 0.2356581766246132,
"grad_norm": 0.37812331318855286,
"learning_rate": 4.607236372292312e-05,
"loss": 0.0014,
"step": 990
},
{
"epoch": 0.23803856224708403,
"grad_norm": 0.7450318336486816,
"learning_rate": 4.603269062921527e-05,
"loss": 0.0016,
"step": 1000
},
{
"epoch": 0.24041894786955487,
"grad_norm": 0.03629771247506142,
"learning_rate": 4.599301753550742e-05,
"loss": 0.0012,
"step": 1010
},
{
"epoch": 0.2427993334920257,
"grad_norm": 0.23223434388637543,
"learning_rate": 4.595334444179957e-05,
"loss": 0.0011,
"step": 1020
},
{
"epoch": 0.24517971911449654,
"grad_norm": 0.08511273562908173,
"learning_rate": 4.591367134809172e-05,
"loss": 0.0006,
"step": 1030
},
{
"epoch": 0.24756010473696738,
"grad_norm": 0.17114369571208954,
"learning_rate": 4.587399825438388e-05,
"loss": 0.001,
"step": 1040
},
{
"epoch": 0.24994049035943822,
"grad_norm": 0.04517650604248047,
"learning_rate": 4.5834325160676035e-05,
"loss": 0.0008,
"step": 1050
},
{
"epoch": 0.25232087598190905,
"grad_norm": 0.20234528183937073,
"learning_rate": 4.5794652066968186e-05,
"loss": 0.0014,
"step": 1060
},
{
"epoch": 0.2547012616043799,
"grad_norm": 0.007534442003816366,
"learning_rate": 4.5754978973260336e-05,
"loss": 0.0008,
"step": 1070
},
{
"epoch": 0.25708164722685073,
"grad_norm": 0.02520332857966423,
"learning_rate": 4.5715305879552486e-05,
"loss": 0.0008,
"step": 1080
},
{
"epoch": 0.25946203284932157,
"grad_norm": 0.02674415148794651,
"learning_rate": 4.5675632785844644e-05,
"loss": 0.005,
"step": 1090
},
{
"epoch": 0.2618424184717924,
"grad_norm": 0.0756726786494255,
"learning_rate": 4.5635959692136794e-05,
"loss": 0.0008,
"step": 1100
},
{
"epoch": 0.2642228040942633,
"grad_norm": 0.18692266941070557,
"learning_rate": 4.559628659842895e-05,
"loss": 0.0021,
"step": 1110
},
{
"epoch": 0.26660318971673413,
"grad_norm": 0.021881476044654846,
"learning_rate": 4.55566135047211e-05,
"loss": 0.0016,
"step": 1120
},
{
"epoch": 0.26898357533920497,
"grad_norm": 0.16764195263385773,
"learning_rate": 4.551694041101325e-05,
"loss": 0.001,
"step": 1130
},
{
"epoch": 0.2713639609616758,
"grad_norm": 0.6519142389297485,
"learning_rate": 4.547726731730541e-05,
"loss": 0.0015,
"step": 1140
},
{
"epoch": 0.27374434658414665,
"grad_norm": 0.07793217897415161,
"learning_rate": 4.543759422359756e-05,
"loss": 0.0005,
"step": 1150
},
{
"epoch": 0.2761247322066175,
"grad_norm": 0.04451458901166916,
"learning_rate": 4.539792112988971e-05,
"loss": 0.0009,
"step": 1160
},
{
"epoch": 0.2785051178290883,
"grad_norm": 0.02606957219541073,
"learning_rate": 4.5358248036181866e-05,
"loss": 0.0013,
"step": 1170
},
{
"epoch": 0.28088550345155916,
"grad_norm": 0.03642681613564491,
"learning_rate": 4.531857494247402e-05,
"loss": 0.0007,
"step": 1180
},
{
"epoch": 0.28326588907403,
"grad_norm": 0.27240046858787537,
"learning_rate": 4.527890184876617e-05,
"loss": 0.0007,
"step": 1190
},
{
"epoch": 0.28564627469650083,
"grad_norm": 0.01732662320137024,
"learning_rate": 4.5239228755058324e-05,
"loss": 0.0011,
"step": 1200
},
{
"epoch": 0.2880266603189717,
"grad_norm": 0.10321195423603058,
"learning_rate": 4.5199555661350474e-05,
"loss": 0.0007,
"step": 1210
},
{
"epoch": 0.2904070459414425,
"grad_norm": 0.060121580958366394,
"learning_rate": 4.5159882567642625e-05,
"loss": 0.0014,
"step": 1220
},
{
"epoch": 0.29278743156391335,
"grad_norm": 0.028955884277820587,
"learning_rate": 4.5120209473934775e-05,
"loss": 0.0007,
"step": 1230
},
{
"epoch": 0.2951678171863842,
"grad_norm": 0.0714436024427414,
"learning_rate": 4.508053638022693e-05,
"loss": 0.0007,
"step": 1240
},
{
"epoch": 0.297548202808855,
"grad_norm": 0.052230022847652435,
"learning_rate": 4.504086328651909e-05,
"loss": 0.0008,
"step": 1250
},
{
"epoch": 0.29992858843132586,
"grad_norm": 0.33476394414901733,
"learning_rate": 4.500119019281124e-05,
"loss": 0.0008,
"step": 1260
},
{
"epoch": 0.3023089740537967,
"grad_norm": 0.07732009142637253,
"learning_rate": 4.496151709910339e-05,
"loss": 0.0012,
"step": 1270
},
{
"epoch": 0.30468935967626753,
"grad_norm": 0.6843579411506653,
"learning_rate": 4.492184400539554e-05,
"loss": 0.0007,
"step": 1280
},
{
"epoch": 0.3070697452987384,
"grad_norm": 0.08292358368635178,
"learning_rate": 4.488217091168769e-05,
"loss": 0.0005,
"step": 1290
},
{
"epoch": 0.3094501309212092,
"grad_norm": 0.02598383277654648,
"learning_rate": 4.484249781797985e-05,
"loss": 0.001,
"step": 1300
},
{
"epoch": 0.3118305165436801,
"grad_norm": 0.7855332493782043,
"learning_rate": 4.4802824724272005e-05,
"loss": 0.0007,
"step": 1310
},
{
"epoch": 0.31421090216615094,
"grad_norm": 0.07066315412521362,
"learning_rate": 4.4763151630564155e-05,
"loss": 0.0005,
"step": 1320
},
{
"epoch": 0.3165912877886218,
"grad_norm": 0.012595695443451405,
"learning_rate": 4.4723478536856305e-05,
"loss": 0.0005,
"step": 1330
},
{
"epoch": 0.3189716734110926,
"grad_norm": 0.015364304184913635,
"learning_rate": 4.4683805443148455e-05,
"loss": 0.0005,
"step": 1340
},
{
"epoch": 0.32135205903356345,
"grad_norm": 0.0556706003844738,
"learning_rate": 4.464413234944061e-05,
"loss": 0.0011,
"step": 1350
},
{
"epoch": 0.3237324446560343,
"grad_norm": 0.22568030655384064,
"learning_rate": 4.460445925573277e-05,
"loss": 0.0023,
"step": 1360
},
{
"epoch": 0.32611283027850513,
"grad_norm": 0.048404548317193985,
"learning_rate": 4.456478616202492e-05,
"loss": 0.0016,
"step": 1370
},
{
"epoch": 0.32849321590097597,
"grad_norm": 0.0693359524011612,
"learning_rate": 4.452511306831707e-05,
"loss": 0.0038,
"step": 1380
},
{
"epoch": 0.3308736015234468,
"grad_norm": 0.16493481397628784,
"learning_rate": 4.448543997460922e-05,
"loss": 0.0006,
"step": 1390
},
{
"epoch": 0.33325398714591764,
"grad_norm": 1.200024962425232,
"learning_rate": 4.444576688090137e-05,
"loss": 0.0011,
"step": 1400
},
{
"epoch": 0.3356343727683885,
"grad_norm": 0.23021258413791656,
"learning_rate": 4.440609378719353e-05,
"loss": 0.0009,
"step": 1410
},
{
"epoch": 0.3380147583908593,
"grad_norm": 0.0196574367582798,
"learning_rate": 4.436642069348568e-05,
"loss": 0.0006,
"step": 1420
},
{
"epoch": 0.34039514401333015,
"grad_norm": 0.3254101574420929,
"learning_rate": 4.4326747599777835e-05,
"loss": 0.0015,
"step": 1430
},
{
"epoch": 0.342775529635801,
"grad_norm": 0.026332201436161995,
"learning_rate": 4.4287074506069986e-05,
"loss": 0.0017,
"step": 1440
},
{
"epoch": 0.34515591525827183,
"grad_norm": 0.2679558992385864,
"learning_rate": 4.4247401412362136e-05,
"loss": 0.0012,
"step": 1450
},
{
"epoch": 0.34753630088074267,
"grad_norm": 0.06991584599018097,
"learning_rate": 4.420772831865429e-05,
"loss": 0.0007,
"step": 1460
},
{
"epoch": 0.3499166865032135,
"grad_norm": 0.036999981850385666,
"learning_rate": 4.416805522494644e-05,
"loss": 0.001,
"step": 1470
},
{
"epoch": 0.35229707212568434,
"grad_norm": 0.042684607207775116,
"learning_rate": 4.4128382131238594e-05,
"loss": 0.0009,
"step": 1480
},
{
"epoch": 0.3546774577481552,
"grad_norm": 0.013829515315592289,
"learning_rate": 4.408870903753075e-05,
"loss": 0.0008,
"step": 1490
},
{
"epoch": 0.357057843370626,
"grad_norm": 0.0129277054220438,
"learning_rate": 4.40490359438229e-05,
"loss": 0.0007,
"step": 1500
},
{
"epoch": 0.3594382289930969,
"grad_norm": 0.03553192317485809,
"learning_rate": 4.400936285011505e-05,
"loss": 0.0008,
"step": 1510
},
{
"epoch": 0.36181861461556775,
"grad_norm": 0.01258548628538847,
"learning_rate": 4.396968975640721e-05,
"loss": 0.001,
"step": 1520
},
{
"epoch": 0.3641990002380386,
"grad_norm": 0.021352197974920273,
"learning_rate": 4.393001666269936e-05,
"loss": 0.001,
"step": 1530
},
{
"epoch": 0.3665793858605094,
"grad_norm": 0.035958483815193176,
"learning_rate": 4.389034356899151e-05,
"loss": 0.0007,
"step": 1540
},
{
"epoch": 0.36895977148298026,
"grad_norm": 0.013187541626393795,
"learning_rate": 4.3850670475283666e-05,
"loss": 0.0009,
"step": 1550
},
{
"epoch": 0.3713401571054511,
"grad_norm": 0.02294233813881874,
"learning_rate": 4.3810997381575816e-05,
"loss": 0.0008,
"step": 1560
},
{
"epoch": 0.37372054272792193,
"grad_norm": 0.14476238191127777,
"learning_rate": 4.3771324287867974e-05,
"loss": 0.0005,
"step": 1570
},
{
"epoch": 0.37610092835039277,
"grad_norm": 0.2275228053331375,
"learning_rate": 4.3731651194160124e-05,
"loss": 0.0006,
"step": 1580
},
{
"epoch": 0.3784813139728636,
"grad_norm": 0.020434999838471413,
"learning_rate": 4.3691978100452274e-05,
"loss": 0.0004,
"step": 1590
},
{
"epoch": 0.38086169959533445,
"grad_norm": 0.01040293462574482,
"learning_rate": 4.3652305006744424e-05,
"loss": 0.0003,
"step": 1600
},
{
"epoch": 0.3832420852178053,
"grad_norm": 0.0240499097853899,
"learning_rate": 4.3612631913036575e-05,
"loss": 0.0008,
"step": 1610
},
{
"epoch": 0.3856224708402761,
"grad_norm": 0.014826517552137375,
"learning_rate": 4.357295881932874e-05,
"loss": 0.0004,
"step": 1620
},
{
"epoch": 0.38800285646274696,
"grad_norm": 0.011841246858239174,
"learning_rate": 4.353328572562089e-05,
"loss": 0.0007,
"step": 1630
},
{
"epoch": 0.3903832420852178,
"grad_norm": 0.0156678706407547,
"learning_rate": 4.349361263191304e-05,
"loss": 0.0006,
"step": 1640
},
{
"epoch": 0.39276362770768863,
"grad_norm": 0.06124578043818474,
"learning_rate": 4.345393953820519e-05,
"loss": 0.0005,
"step": 1650
},
{
"epoch": 0.39514401333015947,
"grad_norm": 0.06753918528556824,
"learning_rate": 4.341426644449734e-05,
"loss": 0.0006,
"step": 1660
},
{
"epoch": 0.3975243989526303,
"grad_norm": 0.08766347169876099,
"learning_rate": 4.33745933507895e-05,
"loss": 0.0003,
"step": 1670
},
{
"epoch": 0.39990478457510115,
"grad_norm": 0.021080242469906807,
"learning_rate": 4.3334920257081654e-05,
"loss": 0.0008,
"step": 1680
},
{
"epoch": 0.402285170197572,
"grad_norm": 0.11970046162605286,
"learning_rate": 4.3295247163373804e-05,
"loss": 0.0005,
"step": 1690
},
{
"epoch": 0.4046655558200428,
"grad_norm": 0.027210582047700882,
"learning_rate": 4.3255574069665955e-05,
"loss": 0.0003,
"step": 1700
},
{
"epoch": 0.4070459414425137,
"grad_norm": 0.021168386563658714,
"learning_rate": 4.3215900975958105e-05,
"loss": 0.0005,
"step": 1710
},
{
"epoch": 0.40942632706498455,
"grad_norm": 0.012768070213496685,
"learning_rate": 4.3176227882250255e-05,
"loss": 0.0005,
"step": 1720
},
{
"epoch": 0.4118067126874554,
"grad_norm": 0.1276211142539978,
"learning_rate": 4.313655478854241e-05,
"loss": 0.0005,
"step": 1730
},
{
"epoch": 0.4141870983099262,
"grad_norm": 0.08978109806776047,
"learning_rate": 4.309688169483457e-05,
"loss": 0.0009,
"step": 1740
},
{
"epoch": 0.41656748393239706,
"grad_norm": 0.3068161606788635,
"learning_rate": 4.305720860112672e-05,
"loss": 0.0007,
"step": 1750
},
{
"epoch": 0.4189478695548679,
"grad_norm": 0.01211560145020485,
"learning_rate": 4.301753550741887e-05,
"loss": 0.0006,
"step": 1760
},
{
"epoch": 0.42132825517733874,
"grad_norm": 0.02517927996814251,
"learning_rate": 4.297786241371102e-05,
"loss": 0.0006,
"step": 1770
},
{
"epoch": 0.4237086407998096,
"grad_norm": 0.017450081184506416,
"learning_rate": 4.293818932000318e-05,
"loss": 0.0003,
"step": 1780
},
{
"epoch": 0.4260890264222804,
"grad_norm": 0.014250938780605793,
"learning_rate": 4.289851622629533e-05,
"loss": 0.0005,
"step": 1790
},
{
"epoch": 0.42846941204475125,
"grad_norm": 0.027526648715138435,
"learning_rate": 4.2858843132587485e-05,
"loss": 0.0005,
"step": 1800
},
{
"epoch": 0.4308497976672221,
"grad_norm": 0.0071271262131631374,
"learning_rate": 4.2819170038879635e-05,
"loss": 0.0007,
"step": 1810
},
{
"epoch": 0.4332301832896929,
"grad_norm": 0.11835234612226486,
"learning_rate": 4.2779496945171785e-05,
"loss": 0.0005,
"step": 1820
},
{
"epoch": 0.43561056891216376,
"grad_norm": 0.016718665137887,
"learning_rate": 4.273982385146394e-05,
"loss": 0.0007,
"step": 1830
},
{
"epoch": 0.4379909545346346,
"grad_norm": 0.04138866439461708,
"learning_rate": 4.270015075775609e-05,
"loss": 0.0005,
"step": 1840
},
{
"epoch": 0.44037134015710544,
"grad_norm": 0.5920994281768799,
"learning_rate": 4.266047766404824e-05,
"loss": 0.0009,
"step": 1850
},
{
"epoch": 0.4427517257795763,
"grad_norm": 0.010394711047410965,
"learning_rate": 4.2620804570340393e-05,
"loss": 0.0004,
"step": 1860
},
{
"epoch": 0.4451321114020471,
"grad_norm": 0.031543031334877014,
"learning_rate": 4.258113147663255e-05,
"loss": 0.0005,
"step": 1870
},
{
"epoch": 0.44751249702451795,
"grad_norm": 0.016665128991007805,
"learning_rate": 4.25414583829247e-05,
"loss": 0.0083,
"step": 1880
},
{
"epoch": 0.4498928826469888,
"grad_norm": 0.03811788931488991,
"learning_rate": 4.250178528921686e-05,
"loss": 0.0014,
"step": 1890
},
{
"epoch": 0.4522732682694596,
"grad_norm": 0.0656796246767044,
"learning_rate": 4.246211219550901e-05,
"loss": 0.0008,
"step": 1900
},
{
"epoch": 0.4546536538919305,
"grad_norm": 0.011904909275472164,
"learning_rate": 4.242243910180116e-05,
"loss": 0.0006,
"step": 1910
},
{
"epoch": 0.45703403951440136,
"grad_norm": 0.01850457303225994,
"learning_rate": 4.238276600809331e-05,
"loss": 0.0004,
"step": 1920
},
{
"epoch": 0.4594144251368722,
"grad_norm": 0.10309766978025436,
"learning_rate": 4.2343092914385466e-05,
"loss": 0.0005,
"step": 1930
},
{
"epoch": 0.46179481075934303,
"grad_norm": 0.13206863403320312,
"learning_rate": 4.230341982067762e-05,
"loss": 0.0004,
"step": 1940
},
{
"epoch": 0.46417519638181387,
"grad_norm": 0.010924161411821842,
"learning_rate": 4.226374672696977e-05,
"loss": 0.0003,
"step": 1950
},
{
"epoch": 0.4665555820042847,
"grad_norm": 0.013266120105981827,
"learning_rate": 4.2224073633261924e-05,
"loss": 0.0004,
"step": 1960
},
{
"epoch": 0.46893596762675555,
"grad_norm": 0.008552256040275097,
"learning_rate": 4.2184400539554074e-05,
"loss": 0.0003,
"step": 1970
},
{
"epoch": 0.4713163532492264,
"grad_norm": 0.0052538709715008736,
"learning_rate": 4.2144727445846224e-05,
"loss": 0.0005,
"step": 1980
},
{
"epoch": 0.4736967388716972,
"grad_norm": 0.0074672214686870575,
"learning_rate": 4.210505435213838e-05,
"loss": 0.0003,
"step": 1990
},
{
"epoch": 0.47607712449416806,
"grad_norm": 0.5743750929832458,
"learning_rate": 4.206538125843054e-05,
"loss": 0.0005,
"step": 2000
},
{
"epoch": 0.4784575101166389,
"grad_norm": 0.0076432847417891026,
"learning_rate": 4.202570816472269e-05,
"loss": 0.0005,
"step": 2010
},
{
"epoch": 0.48083789573910973,
"grad_norm": 0.09265641123056412,
"learning_rate": 4.198603507101484e-05,
"loss": 0.0003,
"step": 2020
},
{
"epoch": 0.48321828136158057,
"grad_norm": 0.01519245095551014,
"learning_rate": 4.194636197730699e-05,
"loss": 0.0002,
"step": 2030
},
{
"epoch": 0.4855986669840514,
"grad_norm": 0.04831220954656601,
"learning_rate": 4.1906688883599146e-05,
"loss": 0.0003,
"step": 2040
},
{
"epoch": 0.48797905260652225,
"grad_norm": 0.024797851219773293,
"learning_rate": 4.18670157898913e-05,
"loss": 0.0004,
"step": 2050
},
{
"epoch": 0.4903594382289931,
"grad_norm": 0.008994129486382008,
"learning_rate": 4.1827342696183454e-05,
"loss": 0.0002,
"step": 2060
},
{
"epoch": 0.4927398238514639,
"grad_norm": 0.00806290004402399,
"learning_rate": 4.1787669602475604e-05,
"loss": 0.0004,
"step": 2070
},
{
"epoch": 0.49512020947393476,
"grad_norm": 0.003900889540091157,
"learning_rate": 4.1747996508767754e-05,
"loss": 0.0002,
"step": 2080
},
{
"epoch": 0.4975005950964056,
"grad_norm": 0.00262014614418149,
"learning_rate": 4.1708323415059905e-05,
"loss": 0.0002,
"step": 2090
},
{
"epoch": 0.49988098071887643,
"grad_norm": 0.30837100744247437,
"learning_rate": 4.166865032135206e-05,
"loss": 0.0004,
"step": 2100
},
{
"epoch": 0.5022613663413473,
"grad_norm": 0.5304675102233887,
"learning_rate": 4.162897722764421e-05,
"loss": 0.0003,
"step": 2110
},
{
"epoch": 0.5046417519638181,
"grad_norm": 0.3627573847770691,
"learning_rate": 4.158930413393637e-05,
"loss": 0.0043,
"step": 2120
},
{
"epoch": 0.507022137586289,
"grad_norm": 0.011327610351145267,
"learning_rate": 4.154963104022852e-05,
"loss": 0.0005,
"step": 2130
},
{
"epoch": 0.5094025232087598,
"grad_norm": 0.055182114243507385,
"learning_rate": 4.150995794652067e-05,
"loss": 0.0005,
"step": 2140
},
{
"epoch": 0.5117829088312307,
"grad_norm": 0.009911212138831615,
"learning_rate": 4.147028485281283e-05,
"loss": 0.0004,
"step": 2150
},
{
"epoch": 0.5141632944537015,
"grad_norm": 0.028569847345352173,
"learning_rate": 4.143061175910498e-05,
"loss": 0.0003,
"step": 2160
},
{
"epoch": 0.5165436800761724,
"grad_norm": 0.0070992144756019115,
"learning_rate": 4.139093866539713e-05,
"loss": 0.0006,
"step": 2170
},
{
"epoch": 0.5189240656986431,
"grad_norm": 0.008213848806917667,
"learning_rate": 4.1351265571689285e-05,
"loss": 0.0002,
"step": 2180
},
{
"epoch": 0.521304451321114,
"grad_norm": 0.018964022397994995,
"learning_rate": 4.1311592477981435e-05,
"loss": 0.0003,
"step": 2190
},
{
"epoch": 0.5236848369435848,
"grad_norm": 0.004533541388809681,
"learning_rate": 4.1271919384273585e-05,
"loss": 0.0003,
"step": 2200
},
{
"epoch": 0.5260652225660557,
"grad_norm": 0.12422726303339005,
"learning_rate": 4.123224629056574e-05,
"loss": 0.0003,
"step": 2210
},
{
"epoch": 0.5284456081885266,
"grad_norm": 0.019521724432706833,
"learning_rate": 4.119257319685789e-05,
"loss": 0.0003,
"step": 2220
},
{
"epoch": 0.5308259938109974,
"grad_norm": 0.03547817841172218,
"learning_rate": 4.115290010315004e-05,
"loss": 0.0004,
"step": 2230
},
{
"epoch": 0.5332063794334683,
"grad_norm": 0.9750944375991821,
"learning_rate": 4.111322700944219e-05,
"loss": 0.0005,
"step": 2240
},
{
"epoch": 0.535586765055939,
"grad_norm": 0.09758254885673523,
"learning_rate": 4.107355391573435e-05,
"loss": 0.0004,
"step": 2250
},
{
"epoch": 0.5379671506784099,
"grad_norm": 0.20201332867145538,
"learning_rate": 4.103388082202651e-05,
"loss": 0.0008,
"step": 2260
},
{
"epoch": 0.5403475363008807,
"grad_norm": 0.2006085067987442,
"learning_rate": 4.099420772831866e-05,
"loss": 0.0008,
"step": 2270
},
{
"epoch": 0.5427279219233516,
"grad_norm": 0.0802696943283081,
"learning_rate": 4.095453463461081e-05,
"loss": 0.0007,
"step": 2280
},
{
"epoch": 0.5451083075458224,
"grad_norm": 0.4039531350135803,
"learning_rate": 4.091486154090296e-05,
"loss": 0.0024,
"step": 2290
},
{
"epoch": 0.5474886931682933,
"grad_norm": 0.006702470127493143,
"learning_rate": 4.087518844719511e-05,
"loss": 0.0007,
"step": 2300
},
{
"epoch": 0.5498690787907641,
"grad_norm": 0.1001976877450943,
"learning_rate": 4.083551535348727e-05,
"loss": 0.0003,
"step": 2310
},
{
"epoch": 0.552249464413235,
"grad_norm": 0.005626179743558168,
"learning_rate": 4.079584225977942e-05,
"loss": 0.0009,
"step": 2320
},
{
"epoch": 0.5546298500357058,
"grad_norm": 0.009593102149665356,
"learning_rate": 4.075616916607157e-05,
"loss": 0.0003,
"step": 2330
},
{
"epoch": 0.5570102356581766,
"grad_norm": 0.014003382995724678,
"learning_rate": 4.0716496072363723e-05,
"loss": 0.0003,
"step": 2340
},
{
"epoch": 0.5593906212806474,
"grad_norm": 0.012953966856002808,
"learning_rate": 4.0676822978655874e-05,
"loss": 0.0004,
"step": 2350
},
{
"epoch": 0.5617710069031183,
"grad_norm": 0.007770949974656105,
"learning_rate": 4.063714988494803e-05,
"loss": 0.0006,
"step": 2360
},
{
"epoch": 0.5641513925255891,
"grad_norm": 0.01227940246462822,
"learning_rate": 4.059747679124019e-05,
"loss": 0.0003,
"step": 2370
},
{
"epoch": 0.56653177814806,
"grad_norm": 0.2204684615135193,
"learning_rate": 4.055780369753234e-05,
"loss": 0.0003,
"step": 2380
},
{
"epoch": 0.5689121637705308,
"grad_norm": 0.03364790603518486,
"learning_rate": 4.051813060382449e-05,
"loss": 0.0003,
"step": 2390
},
{
"epoch": 0.5712925493930017,
"grad_norm": 0.049715492874383926,
"learning_rate": 4.047845751011664e-05,
"loss": 0.0003,
"step": 2400
},
{
"epoch": 0.5736729350154725,
"grad_norm": 0.028070533648133278,
"learning_rate": 4.0438784416408796e-05,
"loss": 0.0007,
"step": 2410
},
{
"epoch": 0.5760533206379433,
"grad_norm": 0.020421486347913742,
"learning_rate": 4.0399111322700946e-05,
"loss": 0.0009,
"step": 2420
},
{
"epoch": 0.5784337062604142,
"grad_norm": 0.010064806789159775,
"learning_rate": 4.0359438228993097e-05,
"loss": 0.0008,
"step": 2430
},
{
"epoch": 0.580814091882885,
"grad_norm": 0.3017018735408783,
"learning_rate": 4.0319765135285254e-05,
"loss": 0.001,
"step": 2440
},
{
"epoch": 0.5831944775053559,
"grad_norm": 0.20759595930576324,
"learning_rate": 4.0280092041577404e-05,
"loss": 0.0003,
"step": 2450
},
{
"epoch": 0.5855748631278267,
"grad_norm": 0.016160350292921066,
"learning_rate": 4.0240418947869554e-05,
"loss": 0.0006,
"step": 2460
},
{
"epoch": 0.5879552487502976,
"grad_norm": 0.5293152332305908,
"learning_rate": 4.020074585416171e-05,
"loss": 0.0011,
"step": 2470
},
{
"epoch": 0.5903356343727684,
"grad_norm": 0.007493559271097183,
"learning_rate": 4.016107276045386e-05,
"loss": 0.0004,
"step": 2480
},
{
"epoch": 0.5927160199952393,
"grad_norm": 0.018649157136678696,
"learning_rate": 4.012139966674601e-05,
"loss": 0.0005,
"step": 2490
},
{
"epoch": 0.59509640561771,
"grad_norm": 0.01135182660073042,
"learning_rate": 4.008172657303817e-05,
"loss": 0.0004,
"step": 2500
},
{
"epoch": 0.5974767912401809,
"grad_norm": 0.0733335018157959,
"learning_rate": 4.004205347933032e-05,
"loss": 0.0005,
"step": 2510
},
{
"epoch": 0.5998571768626517,
"grad_norm": 0.02785026654601097,
"learning_rate": 4.0002380385622476e-05,
"loss": 0.0003,
"step": 2520
},
{
"epoch": 0.6022375624851226,
"grad_norm": 0.005258665420114994,
"learning_rate": 3.996270729191463e-05,
"loss": 0.0002,
"step": 2530
},
{
"epoch": 0.6046179481075934,
"grad_norm": 0.006735061760991812,
"learning_rate": 3.992303419820678e-05,
"loss": 0.0003,
"step": 2540
},
{
"epoch": 0.6069983337300643,
"grad_norm": 0.008341578766703606,
"learning_rate": 3.988336110449893e-05,
"loss": 0.0003,
"step": 2550
},
{
"epoch": 0.6093787193525351,
"grad_norm": 0.0027205003425478935,
"learning_rate": 3.9843688010791084e-05,
"loss": 0.0003,
"step": 2560
},
{
"epoch": 0.611759104975006,
"grad_norm": 0.01718416064977646,
"learning_rate": 3.9804014917083235e-05,
"loss": 0.0005,
"step": 2570
},
{
"epoch": 0.6141394905974767,
"grad_norm": 0.06104213371872902,
"learning_rate": 3.976434182337539e-05,
"loss": 0.0002,
"step": 2580
},
{
"epoch": 0.6165198762199476,
"grad_norm": 0.008454731665551662,
"learning_rate": 3.972466872966754e-05,
"loss": 0.0001,
"step": 2590
},
{
"epoch": 0.6189002618424184,
"grad_norm": 0.006591182202100754,
"learning_rate": 3.968499563595969e-05,
"loss": 0.0002,
"step": 2600
},
{
"epoch": 0.6212806474648893,
"grad_norm": 0.009718428365886211,
"learning_rate": 3.964532254225184e-05,
"loss": 0.0019,
"step": 2610
},
{
"epoch": 0.6236610330873602,
"grad_norm": 0.0156183410435915,
"learning_rate": 3.9605649448544e-05,
"loss": 0.0002,
"step": 2620
},
{
"epoch": 0.626041418709831,
"grad_norm": 0.012816215865314007,
"learning_rate": 3.956597635483616e-05,
"loss": 0.0008,
"step": 2630
},
{
"epoch": 0.6284218043323019,
"grad_norm": 0.0211672130972147,
"learning_rate": 3.952630326112831e-05,
"loss": 0.0002,
"step": 2640
},
{
"epoch": 0.6308021899547727,
"grad_norm": 0.012701870873570442,
"learning_rate": 3.948663016742046e-05,
"loss": 0.0003,
"step": 2650
},
{
"epoch": 0.6331825755772436,
"grad_norm": 0.008668744005262852,
"learning_rate": 3.944695707371261e-05,
"loss": 0.0002,
"step": 2660
},
{
"epoch": 0.6355629611997143,
"grad_norm": 0.020911380648612976,
"learning_rate": 3.940728398000476e-05,
"loss": 0.0004,
"step": 2670
},
{
"epoch": 0.6379433468221852,
"grad_norm": 0.0015960232121869922,
"learning_rate": 3.9367610886296915e-05,
"loss": 0.0002,
"step": 2680
},
{
"epoch": 0.640323732444656,
"grad_norm": 0.01783674582839012,
"learning_rate": 3.932793779258907e-05,
"loss": 0.0001,
"step": 2690
},
{
"epoch": 0.6427041180671269,
"grad_norm": 0.006887937895953655,
"learning_rate": 3.928826469888122e-05,
"loss": 0.0002,
"step": 2700
},
{
"epoch": 0.6450845036895977,
"grad_norm": 0.004555295687168837,
"learning_rate": 3.924859160517337e-05,
"loss": 0.0002,
"step": 2710
},
{
"epoch": 0.6474648893120686,
"grad_norm": 0.00994735024869442,
"learning_rate": 3.920891851146552e-05,
"loss": 0.0003,
"step": 2720
},
{
"epoch": 0.6498452749345394,
"grad_norm": 0.03482622653245926,
"learning_rate": 3.916924541775768e-05,
"loss": 0.0002,
"step": 2730
},
{
"epoch": 0.6522256605570103,
"grad_norm": 0.06792888045310974,
"learning_rate": 3.912957232404983e-05,
"loss": 0.0002,
"step": 2740
},
{
"epoch": 0.654606046179481,
"grad_norm": 0.02015574462711811,
"learning_rate": 3.908989923034199e-05,
"loss": 0.0008,
"step": 2750
},
{
"epoch": 0.6569864318019519,
"grad_norm": 0.07359887659549713,
"learning_rate": 3.905022613663414e-05,
"loss": 0.0003,
"step": 2760
},
{
"epoch": 0.6593668174244227,
"grad_norm": 0.006248469930142164,
"learning_rate": 3.901055304292629e-05,
"loss": 0.0002,
"step": 2770
},
{
"epoch": 0.6617472030468936,
"grad_norm": 0.01739078015089035,
"learning_rate": 3.897087994921844e-05,
"loss": 0.0002,
"step": 2780
},
{
"epoch": 0.6641275886693644,
"grad_norm": 0.008228071965277195,
"learning_rate": 3.8931206855510596e-05,
"loss": 0.0005,
"step": 2790
},
{
"epoch": 0.6665079742918353,
"grad_norm": 0.012569721788167953,
"learning_rate": 3.8891533761802746e-05,
"loss": 0.0002,
"step": 2800
},
{
"epoch": 0.6688883599143061,
"grad_norm": 0.003245885483920574,
"learning_rate": 3.88518606680949e-05,
"loss": 0.0001,
"step": 2810
},
{
"epoch": 0.671268745536777,
"grad_norm": 0.010106906294822693,
"learning_rate": 3.8812187574387053e-05,
"loss": 0.0002,
"step": 2820
},
{
"epoch": 0.6736491311592478,
"grad_norm": 0.0013821216998621821,
"learning_rate": 3.8772514480679204e-05,
"loss": 0.0002,
"step": 2830
},
{
"epoch": 0.6760295167817186,
"grad_norm": 0.008525123819708824,
"learning_rate": 3.873284138697136e-05,
"loss": 0.0001,
"step": 2840
},
{
"epoch": 0.6784099024041895,
"grad_norm": 0.0045269266702234745,
"learning_rate": 3.869316829326351e-05,
"loss": 0.0001,
"step": 2850
},
{
"epoch": 0.6807902880266603,
"grad_norm": 0.005178367253392935,
"learning_rate": 3.865349519955566e-05,
"loss": 0.0002,
"step": 2860
},
{
"epoch": 0.6831706736491312,
"grad_norm": 0.015604405663907528,
"learning_rate": 3.861382210584781e-05,
"loss": 0.0001,
"step": 2870
},
{
"epoch": 0.685551059271602,
"grad_norm": 0.7911249399185181,
"learning_rate": 3.857414901213997e-05,
"loss": 0.0002,
"step": 2880
},
{
"epoch": 0.6879314448940729,
"grad_norm": 0.005056778434664011,
"learning_rate": 3.853447591843212e-05,
"loss": 0.0003,
"step": 2890
},
{
"epoch": 0.6903118305165437,
"grad_norm": 0.007354553788900375,
"learning_rate": 3.8494802824724276e-05,
"loss": 0.0002,
"step": 2900
},
{
"epoch": 0.6926922161390145,
"grad_norm": 0.10069092363119125,
"learning_rate": 3.8455129731016427e-05,
"loss": 0.0003,
"step": 2910
},
{
"epoch": 0.6950726017614853,
"grad_norm": 0.007913509383797646,
"learning_rate": 3.841545663730858e-05,
"loss": 0.0005,
"step": 2920
},
{
"epoch": 0.6974529873839562,
"grad_norm": 0.04653599485754967,
"learning_rate": 3.837578354360073e-05,
"loss": 0.0005,
"step": 2930
},
{
"epoch": 0.699833373006427,
"grad_norm": 0.007795447017997503,
"learning_rate": 3.8336110449892884e-05,
"loss": 0.0002,
"step": 2940
},
{
"epoch": 0.7022137586288979,
"grad_norm": 0.0843840092420578,
"learning_rate": 3.829643735618504e-05,
"loss": 0.0008,
"step": 2950
},
{
"epoch": 0.7045941442513687,
"grad_norm": 0.019790470600128174,
"learning_rate": 3.825676426247719e-05,
"loss": 0.0004,
"step": 2960
},
{
"epoch": 0.7069745298738396,
"grad_norm": 0.04970049858093262,
"learning_rate": 3.821709116876934e-05,
"loss": 0.0008,
"step": 2970
},
{
"epoch": 0.7093549154963104,
"grad_norm": 0.011334414593875408,
"learning_rate": 3.817741807506149e-05,
"loss": 0.0003,
"step": 2980
},
{
"epoch": 0.7117353011187812,
"grad_norm": 0.12627428770065308,
"learning_rate": 3.813774498135364e-05,
"loss": 0.0006,
"step": 2990
},
{
"epoch": 0.714115686741252,
"grad_norm": 0.03299270570278168,
"learning_rate": 3.8098071887645806e-05,
"loss": 0.0006,
"step": 3000
},
{
"epoch": 0.7164960723637229,
"grad_norm": 0.014470428228378296,
"learning_rate": 3.805839879393796e-05,
"loss": 0.0002,
"step": 3010
},
{
"epoch": 0.7188764579861938,
"grad_norm": 0.010081595741212368,
"learning_rate": 3.801872570023011e-05,
"loss": 0.0002,
"step": 3020
},
{
"epoch": 0.7212568436086646,
"grad_norm": 0.006527799181640148,
"learning_rate": 3.797905260652226e-05,
"loss": 0.0005,
"step": 3030
},
{
"epoch": 0.7236372292311355,
"grad_norm": 0.025967439636588097,
"learning_rate": 3.793937951281441e-05,
"loss": 0.0003,
"step": 3040
},
{
"epoch": 0.7260176148536063,
"grad_norm": 0.012788872234523296,
"learning_rate": 3.7899706419106565e-05,
"loss": 0.0006,
"step": 3050
},
{
"epoch": 0.7283980004760772,
"grad_norm": 0.05159073323011398,
"learning_rate": 3.7860033325398715e-05,
"loss": 0.0002,
"step": 3060
},
{
"epoch": 0.730778386098548,
"grad_norm": 0.09669562429189682,
"learning_rate": 3.782036023169087e-05,
"loss": 0.0003,
"step": 3070
},
{
"epoch": 0.7331587717210188,
"grad_norm": 0.0008232035324908793,
"learning_rate": 3.778068713798302e-05,
"loss": 0.0002,
"step": 3080
},
{
"epoch": 0.7355391573434896,
"grad_norm": 0.0026904919650405645,
"learning_rate": 3.774101404427517e-05,
"loss": 0.0008,
"step": 3090
},
{
"epoch": 0.7379195429659605,
"grad_norm": 0.22064454853534698,
"learning_rate": 3.770134095056733e-05,
"loss": 0.0001,
"step": 3100
},
{
"epoch": 0.7402999285884313,
"grad_norm": 0.0037417325656861067,
"learning_rate": 3.766166785685948e-05,
"loss": 0.0002,
"step": 3110
},
{
"epoch": 0.7426803142109022,
"grad_norm": 0.008903945796191692,
"learning_rate": 3.762199476315163e-05,
"loss": 0.0004,
"step": 3120
},
{
"epoch": 0.745060699833373,
"grad_norm": 0.01190115325152874,
"learning_rate": 3.758232166944379e-05,
"loss": 0.0003,
"step": 3130
},
{
"epoch": 0.7474410854558439,
"grad_norm": 0.005016674287617207,
"learning_rate": 3.754264857573594e-05,
"loss": 0.0002,
"step": 3140
},
{
"epoch": 0.7498214710783146,
"grad_norm": 0.009286819957196712,
"learning_rate": 3.750297548202809e-05,
"loss": 0.0002,
"step": 3150
},
{
"epoch": 0.7522018567007855,
"grad_norm": 0.06282204389572144,
"learning_rate": 3.7463302388320245e-05,
"loss": 0.0003,
"step": 3160
},
{
"epoch": 0.7545822423232563,
"grad_norm": 0.008628441952168941,
"learning_rate": 3.7423629294612396e-05,
"loss": 0.0003,
"step": 3170
},
{
"epoch": 0.7569626279457272,
"grad_norm": 0.03511732071638107,
"learning_rate": 3.7383956200904546e-05,
"loss": 0.0001,
"step": 3180
},
{
"epoch": 0.759343013568198,
"grad_norm": 0.003294560592621565,
"learning_rate": 3.73442831071967e-05,
"loss": 0.0003,
"step": 3190
},
{
"epoch": 0.7617233991906689,
"grad_norm": 0.032009340822696686,
"learning_rate": 3.730461001348885e-05,
"loss": 0.0002,
"step": 3200
},
{
"epoch": 0.7641037848131397,
"grad_norm": 0.022615088149905205,
"learning_rate": 3.726493691978101e-05,
"loss": 0.0002,
"step": 3210
},
{
"epoch": 0.7664841704356106,
"grad_norm": 0.0026582872960716486,
"learning_rate": 3.722526382607316e-05,
"loss": 0.0001,
"step": 3220
},
{
"epoch": 0.7688645560580815,
"grad_norm": 0.3148833215236664,
"learning_rate": 3.718559073236531e-05,
"loss": 0.0002,
"step": 3230
},
{
"epoch": 0.7712449416805522,
"grad_norm": 0.03451314941048622,
"learning_rate": 3.714591763865746e-05,
"loss": 0.0002,
"step": 3240
},
{
"epoch": 0.7736253273030231,
"grad_norm": 0.008008177392184734,
"learning_rate": 3.710624454494961e-05,
"loss": 0.0001,
"step": 3250
},
{
"epoch": 0.7760057129254939,
"grad_norm": 0.07701031118631363,
"learning_rate": 3.706657145124177e-05,
"loss": 0.0005,
"step": 3260
},
{
"epoch": 0.7783860985479648,
"grad_norm": 0.010465078055858612,
"learning_rate": 3.7026898357533926e-05,
"loss": 0.0002,
"step": 3270
},
{
"epoch": 0.7807664841704356,
"grad_norm": 0.00499736238270998,
"learning_rate": 3.6987225263826076e-05,
"loss": 0.0007,
"step": 3280
},
{
"epoch": 0.7831468697929065,
"grad_norm": 0.6453936696052551,
"learning_rate": 3.6947552170118226e-05,
"loss": 0.0003,
"step": 3290
},
{
"epoch": 0.7855272554153773,
"grad_norm": 0.016864465549588203,
"learning_rate": 3.690787907641038e-05,
"loss": 0.0003,
"step": 3300
},
{
"epoch": 0.7879076410378482,
"grad_norm": 0.05074018985033035,
"learning_rate": 3.6868205982702534e-05,
"loss": 0.0002,
"step": 3310
},
{
"epoch": 0.7902880266603189,
"grad_norm": 0.006529835984110832,
"learning_rate": 3.682853288899469e-05,
"loss": 0.0005,
"step": 3320
},
{
"epoch": 0.7926684122827898,
"grad_norm": 0.041339557617902756,
"learning_rate": 3.678885979528684e-05,
"loss": 0.0004,
"step": 3330
},
{
"epoch": 0.7950487979052606,
"grad_norm": 0.006891661789268255,
"learning_rate": 3.674918670157899e-05,
"loss": 0.0004,
"step": 3340
},
{
"epoch": 0.7974291835277315,
"grad_norm": 0.01043302658945322,
"learning_rate": 3.670951360787114e-05,
"loss": 0.0003,
"step": 3350
},
{
"epoch": 0.7998095691502023,
"grad_norm": 0.01914358325302601,
"learning_rate": 3.666984051416329e-05,
"loss": 0.0008,
"step": 3360
},
{
"epoch": 0.8021899547726732,
"grad_norm": 0.016266925260424614,
"learning_rate": 3.663016742045545e-05,
"loss": 0.0004,
"step": 3370
},
{
"epoch": 0.804570340395144,
"grad_norm": 0.005765034817159176,
"learning_rate": 3.6590494326747606e-05,
"loss": 0.0002,
"step": 3380
},
{
"epoch": 0.8069507260176149,
"grad_norm": 0.007664472330361605,
"learning_rate": 3.6550821233039757e-05,
"loss": 0.0002,
"step": 3390
},
{
"epoch": 0.8093311116400856,
"grad_norm": 0.00499699218198657,
"learning_rate": 3.651114813933191e-05,
"loss": 0.0001,
"step": 3400
},
{
"epoch": 0.8117114972625565,
"grad_norm": 0.012575655244290829,
"learning_rate": 3.647147504562406e-05,
"loss": 0.0002,
"step": 3410
},
{
"epoch": 0.8140918828850274,
"grad_norm": 0.010001065209507942,
"learning_rate": 3.6431801951916214e-05,
"loss": 0.0005,
"step": 3420
},
{
"epoch": 0.8164722685074982,
"grad_norm": 0.06131220981478691,
"learning_rate": 3.6392128858208365e-05,
"loss": 0.0002,
"step": 3430
},
{
"epoch": 0.8188526541299691,
"grad_norm": 0.037141721695661545,
"learning_rate": 3.6352455764500515e-05,
"loss": 0.0001,
"step": 3440
},
{
"epoch": 0.8212330397524399,
"grad_norm": 0.05955801159143448,
"learning_rate": 3.631278267079267e-05,
"loss": 0.0005,
"step": 3450
},
{
"epoch": 0.8236134253749108,
"grad_norm": 0.012499036267399788,
"learning_rate": 3.627310957708482e-05,
"loss": 0.0002,
"step": 3460
},
{
"epoch": 0.8259938109973816,
"grad_norm": 0.007782169617712498,
"learning_rate": 3.623343648337697e-05,
"loss": 0.0004,
"step": 3470
},
{
"epoch": 0.8283741966198525,
"grad_norm": 0.016740377992391586,
"learning_rate": 3.619376338966913e-05,
"loss": 0.0004,
"step": 3480
},
{
"epoch": 0.8307545822423232,
"grad_norm": 0.05157579854130745,
"learning_rate": 3.615409029596128e-05,
"loss": 0.0003,
"step": 3490
},
{
"epoch": 0.8331349678647941,
"grad_norm": 0.00816064327955246,
"learning_rate": 3.611441720225343e-05,
"loss": 0.0003,
"step": 3500
},
{
"epoch": 0.8355153534872649,
"grad_norm": 0.02470710128545761,
"learning_rate": 3.607474410854559e-05,
"loss": 0.0002,
"step": 3510
},
{
"epoch": 0.8378957391097358,
"grad_norm": 0.004836896900087595,
"learning_rate": 3.603507101483774e-05,
"loss": 0.0001,
"step": 3520
},
{
"epoch": 0.8402761247322066,
"grad_norm": 0.003796802368015051,
"learning_rate": 3.5995397921129895e-05,
"loss": 0.0002,
"step": 3530
},
{
"epoch": 0.8426565103546775,
"grad_norm": 0.006737705785781145,
"learning_rate": 3.5955724827422045e-05,
"loss": 0.0003,
"step": 3540
},
{
"epoch": 0.8450368959771483,
"grad_norm": 0.0021388079039752483,
"learning_rate": 3.5916051733714195e-05,
"loss": 0.0001,
"step": 3550
},
{
"epoch": 0.8474172815996192,
"grad_norm": 0.047663912177085876,
"learning_rate": 3.5876378640006346e-05,
"loss": 0.0001,
"step": 3560
},
{
"epoch": 0.8497976672220899,
"grad_norm": 0.015320863574743271,
"learning_rate": 3.58367055462985e-05,
"loss": 0.0002,
"step": 3570
},
{
"epoch": 0.8521780528445608,
"grad_norm": 0.008627827279269695,
"learning_rate": 3.579703245259065e-05,
"loss": 0.0003,
"step": 3580
},
{
"epoch": 0.8545584384670316,
"grad_norm": 0.0034904240164905787,
"learning_rate": 3.575735935888281e-05,
"loss": 0.0001,
"step": 3590
},
{
"epoch": 0.8569388240895025,
"grad_norm": 0.01078026182949543,
"learning_rate": 3.571768626517496e-05,
"loss": 0.0002,
"step": 3600
},
{
"epoch": 0.8593192097119733,
"grad_norm": 0.011285877786576748,
"learning_rate": 3.567801317146711e-05,
"loss": 0.0007,
"step": 3610
},
{
"epoch": 0.8616995953344442,
"grad_norm": 0.005885743070393801,
"learning_rate": 3.563834007775926e-05,
"loss": 0.0003,
"step": 3620
},
{
"epoch": 0.8640799809569151,
"grad_norm": 0.1011798158288002,
"learning_rate": 3.559866698405142e-05,
"loss": 0.0002,
"step": 3630
},
{
"epoch": 0.8664603665793859,
"grad_norm": 0.012861615046858788,
"learning_rate": 3.5558993890343575e-05,
"loss": 0.0002,
"step": 3640
},
{
"epoch": 0.8688407522018567,
"grad_norm": 0.009324765764176846,
"learning_rate": 3.5519320796635726e-05,
"loss": 0.0007,
"step": 3650
},
{
"epoch": 0.8712211378243275,
"grad_norm": 0.0035065708216279745,
"learning_rate": 3.5479647702927876e-05,
"loss": 0.0004,
"step": 3660
},
{
"epoch": 0.8736015234467984,
"grad_norm": 0.010472165420651436,
"learning_rate": 3.5439974609220026e-05,
"loss": 0.0001,
"step": 3670
},
{
"epoch": 0.8759819090692692,
"grad_norm": 0.009073158726096153,
"learning_rate": 3.5400301515512176e-05,
"loss": 0.0001,
"step": 3680
},
{
"epoch": 0.8783622946917401,
"grad_norm": 0.0028665116988122463,
"learning_rate": 3.5360628421804334e-05,
"loss": 0.0001,
"step": 3690
},
{
"epoch": 0.8807426803142109,
"grad_norm": 0.009178753942251205,
"learning_rate": 3.532095532809649e-05,
"loss": 0.0003,
"step": 3700
},
{
"epoch": 0.8831230659366818,
"grad_norm": 0.007954353466629982,
"learning_rate": 3.528128223438864e-05,
"loss": 0.0006,
"step": 3710
},
{
"epoch": 0.8855034515591526,
"grad_norm": 0.009399271570146084,
"learning_rate": 3.524160914068079e-05,
"loss": 0.0002,
"step": 3720
},
{
"epoch": 0.8878838371816234,
"grad_norm": 0.0035749957896769047,
"learning_rate": 3.520193604697294e-05,
"loss": 0.0001,
"step": 3730
},
{
"epoch": 0.8902642228040942,
"grad_norm": 0.007753758691251278,
"learning_rate": 3.51622629532651e-05,
"loss": 0.0001,
"step": 3740
},
{
"epoch": 0.8926446084265651,
"grad_norm": 0.007471214048564434,
"learning_rate": 3.512258985955725e-05,
"loss": 0.0003,
"step": 3750
},
{
"epoch": 0.8950249940490359,
"grad_norm": 0.016612932085990906,
"learning_rate": 3.5082916765849406e-05,
"loss": 0.0001,
"step": 3760
},
{
"epoch": 0.8974053796715068,
"grad_norm": 0.008320000022649765,
"learning_rate": 3.5043243672141556e-05,
"loss": 0.0001,
"step": 3770
},
{
"epoch": 0.8997857652939776,
"grad_norm": 0.010242090560495853,
"learning_rate": 3.500357057843371e-05,
"loss": 0.0001,
"step": 3780
},
{
"epoch": 0.9021661509164485,
"grad_norm": 0.0036350861191749573,
"learning_rate": 3.4963897484725864e-05,
"loss": 0.0001,
"step": 3790
},
{
"epoch": 0.9045465365389193,
"grad_norm": 0.002153201960027218,
"learning_rate": 3.4924224391018014e-05,
"loss": 0.0002,
"step": 3800
},
{
"epoch": 0.9069269221613901,
"grad_norm": 0.003587006125599146,
"learning_rate": 3.4884551297310164e-05,
"loss": 0.0002,
"step": 3810
},
{
"epoch": 0.909307307783861,
"grad_norm": 0.006511629093438387,
"learning_rate": 3.4844878203602315e-05,
"loss": 0.0002,
"step": 3820
},
{
"epoch": 0.9116876934063318,
"grad_norm": 0.008945467881858349,
"learning_rate": 3.480520510989447e-05,
"loss": 0.0001,
"step": 3830
},
{
"epoch": 0.9140680790288027,
"grad_norm": 0.006604051683098078,
"learning_rate": 3.476553201618662e-05,
"loss": 0.0001,
"step": 3840
},
{
"epoch": 0.9164484646512735,
"grad_norm": 0.0031156474724411964,
"learning_rate": 3.472585892247878e-05,
"loss": 0.0003,
"step": 3850
},
{
"epoch": 0.9188288502737444,
"grad_norm": 0.005195919424295425,
"learning_rate": 3.468618582877093e-05,
"loss": 0.0001,
"step": 3860
},
{
"epoch": 0.9212092358962152,
"grad_norm": 0.008878687396645546,
"learning_rate": 3.464651273506308e-05,
"loss": 0.0001,
"step": 3870
},
{
"epoch": 0.9235896215186861,
"grad_norm": 0.0020940713584423065,
"learning_rate": 3.460683964135523e-05,
"loss": 0.0001,
"step": 3880
},
{
"epoch": 0.9259700071411568,
"grad_norm": 0.0066345930099487305,
"learning_rate": 3.456716654764739e-05,
"loss": 0.0001,
"step": 3890
},
{
"epoch": 0.9283503927636277,
"grad_norm": 0.0018133444245904684,
"learning_rate": 3.4527493453939544e-05,
"loss": 0.0001,
"step": 3900
},
{
"epoch": 0.9307307783860985,
"grad_norm": 0.000830967677757144,
"learning_rate": 3.4487820360231695e-05,
"loss": 0.0001,
"step": 3910
},
{
"epoch": 0.9331111640085694,
"grad_norm": 0.0037288174498826265,
"learning_rate": 3.4448147266523845e-05,
"loss": 0.0001,
"step": 3920
},
{
"epoch": 0.9354915496310402,
"grad_norm": 0.005838675890117884,
"learning_rate": 3.4408474172815995e-05,
"loss": 0.0003,
"step": 3930
},
{
"epoch": 0.9378719352535111,
"grad_norm": 0.008044001646339893,
"learning_rate": 3.4368801079108145e-05,
"loss": 0.0002,
"step": 3940
},
{
"epoch": 0.9402523208759819,
"grad_norm": 0.07016938179731369,
"learning_rate": 3.43291279854003e-05,
"loss": 0.0005,
"step": 3950
},
{
"epoch": 0.9426327064984528,
"grad_norm": 0.11337173730134964,
"learning_rate": 3.428945489169246e-05,
"loss": 0.0002,
"step": 3960
},
{
"epoch": 0.9450130921209235,
"grad_norm": 0.0017598132835701108,
"learning_rate": 3.424978179798461e-05,
"loss": 0.0003,
"step": 3970
},
{
"epoch": 0.9473934777433944,
"grad_norm": 0.030149806290864944,
"learning_rate": 3.421010870427676e-05,
"loss": 0.0003,
"step": 3980
},
{
"epoch": 0.9497738633658652,
"grad_norm": 0.11280670762062073,
"learning_rate": 3.417043561056891e-05,
"loss": 0.0003,
"step": 3990
},
{
"epoch": 0.9521542489883361,
"grad_norm": 0.02797405980527401,
"learning_rate": 3.413076251686107e-05,
"loss": 0.0003,
"step": 4000
},
{
"epoch": 0.9545346346108069,
"grad_norm": 0.009325963445007801,
"learning_rate": 3.4091089423153225e-05,
"loss": 0.0002,
"step": 4010
},
{
"epoch": 0.9569150202332778,
"grad_norm": 0.015098505653440952,
"learning_rate": 3.4051416329445375e-05,
"loss": 0.0002,
"step": 4020
},
{
"epoch": 0.9592954058557487,
"grad_norm": 0.0010631170589476824,
"learning_rate": 3.4011743235737525e-05,
"loss": 0.0002,
"step": 4030
},
{
"epoch": 0.9616757914782195,
"grad_norm": 0.11537562310695648,
"learning_rate": 3.3972070142029676e-05,
"loss": 0.0004,
"step": 4040
},
{
"epoch": 0.9640561771006904,
"grad_norm": 0.055657465010881424,
"learning_rate": 3.3932397048321826e-05,
"loss": 0.0002,
"step": 4050
},
{
"epoch": 0.9664365627231611,
"grad_norm": 0.004681292921304703,
"learning_rate": 3.389272395461398e-05,
"loss": 0.0001,
"step": 4060
},
{
"epoch": 0.968816948345632,
"grad_norm": 0.0036875929217785597,
"learning_rate": 3.385305086090613e-05,
"loss": 0.0003,
"step": 4070
},
{
"epoch": 0.9711973339681028,
"grad_norm": 0.3181780278682709,
"learning_rate": 3.381337776719829e-05,
"loss": 0.0002,
"step": 4080
},
{
"epoch": 0.9735777195905737,
"grad_norm": 0.008175074122846127,
"learning_rate": 3.377370467349044e-05,
"loss": 0.0001,
"step": 4090
},
{
"epoch": 0.9759581052130445,
"grad_norm": 0.008897043764591217,
"learning_rate": 3.373403157978259e-05,
"loss": 0.0002,
"step": 4100
},
{
"epoch": 0.9783384908355154,
"grad_norm": 0.005149902775883675,
"learning_rate": 3.369435848607475e-05,
"loss": 0.0001,
"step": 4110
},
{
"epoch": 0.9807188764579862,
"grad_norm": 0.005102005321532488,
"learning_rate": 3.36546853923669e-05,
"loss": 0.0001,
"step": 4120
},
{
"epoch": 0.9830992620804571,
"grad_norm": 0.003907215781509876,
"learning_rate": 3.361501229865905e-05,
"loss": 0.0001,
"step": 4130
},
{
"epoch": 0.9854796477029278,
"grad_norm": 0.006176768336445093,
"learning_rate": 3.3575339204951206e-05,
"loss": 0.0001,
"step": 4140
},
{
"epoch": 0.9878600333253987,
"grad_norm": 0.007574237417429686,
"learning_rate": 3.3535666111243356e-05,
"loss": 0.0001,
"step": 4150
},
{
"epoch": 0.9902404189478695,
"grad_norm": 0.0036479670088738203,
"learning_rate": 3.3495993017535506e-05,
"loss": 0.0001,
"step": 4160
},
{
"epoch": 0.9926208045703404,
"grad_norm": 0.0031234126072376966,
"learning_rate": 3.3456319923827664e-05,
"loss": 0.0001,
"step": 4170
},
{
"epoch": 0.9950011901928112,
"grad_norm": 0.015276722609996796,
"learning_rate": 3.3416646830119814e-05,
"loss": 0.0004,
"step": 4180
},
{
"epoch": 0.9973815758152821,
"grad_norm": 0.015308289788663387,
"learning_rate": 3.3376973736411964e-05,
"loss": 0.0002,
"step": 4190
},
{
"epoch": 0.9997619614377529,
"grad_norm": 1.1039026975631714,
"learning_rate": 3.333730064270412e-05,
"loss": 0.0034,
"step": 4200
},
{
"epoch": 1.0,
"eval_loss": 6.8751428443647455e-06,
"eval_runtime": 52.0551,
"eval_samples_per_second": 35.885,
"eval_steps_per_second": 8.971,
"step": 4201
},
{
"epoch": 1.0021423470602238,
"grad_norm": 0.4035731852054596,
"learning_rate": 3.329762754899627e-05,
"loss": 0.001,
"step": 4210
},
{
"epoch": 1.0045227326826947,
"grad_norm": 0.058116745203733444,
"learning_rate": 3.325795445528843e-05,
"loss": 0.0006,
"step": 4220
},
{
"epoch": 1.0069031183051655,
"grad_norm": 0.030079133808612823,
"learning_rate": 3.321828136158058e-05,
"loss": 0.0005,
"step": 4230
},
{
"epoch": 1.0092835039276362,
"grad_norm": 0.03710814565420151,
"learning_rate": 3.317860826787273e-05,
"loss": 0.0008,
"step": 4240
},
{
"epoch": 1.011663889550107,
"grad_norm": 0.25699111819267273,
"learning_rate": 3.313893517416488e-05,
"loss": 0.0003,
"step": 4250
},
{
"epoch": 1.014044275172578,
"grad_norm": 0.01729218102991581,
"learning_rate": 3.309926208045703e-05,
"loss": 0.0004,
"step": 4260
},
{
"epoch": 1.016424660795049,
"grad_norm": 0.004348506219685078,
"learning_rate": 3.3059588986749194e-05,
"loss": 0.0003,
"step": 4270
},
{
"epoch": 1.0188050464175196,
"grad_norm": 0.008898822590708733,
"learning_rate": 3.3019915893041344e-05,
"loss": 0.0002,
"step": 4280
},
{
"epoch": 1.0211854320399905,
"grad_norm": 0.021421125158667564,
"learning_rate": 3.2980242799333494e-05,
"loss": 0.0002,
"step": 4290
},
{
"epoch": 1.0235658176624614,
"grad_norm": 0.09812607616186142,
"learning_rate": 3.2940569705625645e-05,
"loss": 0.0003,
"step": 4300
},
{
"epoch": 1.0259462032849322,
"grad_norm": 0.00921029131859541,
"learning_rate": 3.2900896611917795e-05,
"loss": 0.0003,
"step": 4310
},
{
"epoch": 1.028326588907403,
"grad_norm": 0.18005193769931793,
"learning_rate": 3.286122351820995e-05,
"loss": 0.0009,
"step": 4320
},
{
"epoch": 1.0307069745298738,
"grad_norm": 0.022728268057107925,
"learning_rate": 3.282155042450211e-05,
"loss": 0.001,
"step": 4330
},
{
"epoch": 1.0330873601523447,
"grad_norm": 0.002307797549292445,
"learning_rate": 3.278187733079426e-05,
"loss": 0.0001,
"step": 4340
},
{
"epoch": 1.0354677457748156,
"grad_norm": 0.09381233900785446,
"learning_rate": 3.274220423708641e-05,
"loss": 0.0001,
"step": 4350
},
{
"epoch": 1.0378481313972863,
"grad_norm": 0.30725282430648804,
"learning_rate": 3.270253114337856e-05,
"loss": 0.0006,
"step": 4360
},
{
"epoch": 1.0402285170197572,
"grad_norm": 0.0028942192438989878,
"learning_rate": 3.266285804967071e-05,
"loss": 0.0002,
"step": 4370
},
{
"epoch": 1.042608902642228,
"grad_norm": 0.07420436292886734,
"learning_rate": 3.262318495596287e-05,
"loss": 0.0001,
"step": 4380
},
{
"epoch": 1.044989288264699,
"grad_norm": 0.0038873206358402967,
"learning_rate": 3.2583511862255025e-05,
"loss": 0.0004,
"step": 4390
},
{
"epoch": 1.0473696738871696,
"grad_norm": 0.00487096281722188,
"learning_rate": 3.2543838768547175e-05,
"loss": 0.0001,
"step": 4400
},
{
"epoch": 1.0497500595096405,
"grad_norm": 0.00458755437284708,
"learning_rate": 3.2504165674839325e-05,
"loss": 0.0001,
"step": 4410
},
{
"epoch": 1.0521304451321114,
"grad_norm": 0.003127770032733679,
"learning_rate": 3.2464492581131475e-05,
"loss": 0.0001,
"step": 4420
},
{
"epoch": 1.0545108307545823,
"grad_norm": 0.0036109236534684896,
"learning_rate": 3.242481948742363e-05,
"loss": 0.0003,
"step": 4430
},
{
"epoch": 1.0568912163770532,
"grad_norm": 0.01696913130581379,
"learning_rate": 3.238514639371578e-05,
"loss": 0.0001,
"step": 4440
},
{
"epoch": 1.0592716019995239,
"grad_norm": 0.0007638796814717352,
"learning_rate": 3.234547330000793e-05,
"loss": 0.0003,
"step": 4450
},
{
"epoch": 1.0616519876219948,
"grad_norm": 0.005359685514122248,
"learning_rate": 3.230580020630009e-05,
"loss": 0.0001,
"step": 4460
},
{
"epoch": 1.0640323732444656,
"grad_norm": 0.008990432135760784,
"learning_rate": 3.226612711259224e-05,
"loss": 0.0002,
"step": 4470
},
{
"epoch": 1.0664127588669365,
"grad_norm": 0.004698805510997772,
"learning_rate": 3.22264540188844e-05,
"loss": 0.0001,
"step": 4480
},
{
"epoch": 1.0687931444894072,
"grad_norm": 0.07380379736423492,
"learning_rate": 3.218678092517655e-05,
"loss": 0.0005,
"step": 4490
},
{
"epoch": 1.071173530111878,
"grad_norm": 0.0072670914232730865,
"learning_rate": 3.21471078314687e-05,
"loss": 0.0001,
"step": 4500
},
{
"epoch": 1.073553915734349,
"grad_norm": 0.003431397257372737,
"learning_rate": 3.210743473776085e-05,
"loss": 0.0001,
"step": 4510
},
{
"epoch": 1.0759343013568199,
"grad_norm": 0.012710604816675186,
"learning_rate": 3.2067761644053006e-05,
"loss": 0.0001,
"step": 4520
},
{
"epoch": 1.0783146869792906,
"grad_norm": 0.0029263871256262064,
"learning_rate": 3.2028088550345156e-05,
"loss": 0.0001,
"step": 4530
},
{
"epoch": 1.0806950726017615,
"grad_norm": 0.0013361535966396332,
"learning_rate": 3.198841545663731e-05,
"loss": 0.0001,
"step": 4540
},
{
"epoch": 1.0830754582242323,
"grad_norm": 0.0027455012314021587,
"learning_rate": 3.194874236292946e-05,
"loss": 0.0001,
"step": 4550
},
{
"epoch": 1.0854558438467032,
"grad_norm": 0.0015189964324235916,
"learning_rate": 3.1909069269221614e-05,
"loss": 0.0002,
"step": 4560
},
{
"epoch": 1.087836229469174,
"grad_norm": 0.019486431032419205,
"learning_rate": 3.1869396175513764e-05,
"loss": 0.0001,
"step": 4570
},
{
"epoch": 1.0902166150916448,
"grad_norm": 0.009100046940147877,
"learning_rate": 3.182972308180592e-05,
"loss": 0.0002,
"step": 4580
},
{
"epoch": 1.0925970007141157,
"grad_norm": 0.6804227828979492,
"learning_rate": 3.179004998809808e-05,
"loss": 0.0008,
"step": 4590
},
{
"epoch": 1.0949773863365866,
"grad_norm": 0.004166084341704845,
"learning_rate": 3.175037689439023e-05,
"loss": 0.0004,
"step": 4600
},
{
"epoch": 1.0973577719590573,
"grad_norm": 0.0014277161099016666,
"learning_rate": 3.171070380068238e-05,
"loss": 0.0005,
"step": 4610
},
{
"epoch": 1.0997381575815282,
"grad_norm": 0.02292274497449398,
"learning_rate": 3.167103070697453e-05,
"loss": 0.0001,
"step": 4620
},
{
"epoch": 1.102118543203999,
"grad_norm": 0.006580695044249296,
"learning_rate": 3.163135761326668e-05,
"loss": 0.0002,
"step": 4630
},
{
"epoch": 1.10449892882647,
"grad_norm": 0.005075294058769941,
"learning_rate": 3.1591684519558836e-05,
"loss": 0.0002,
"step": 4640
},
{
"epoch": 1.1068793144489408,
"grad_norm": 0.0034661772660911083,
"learning_rate": 3.1552011425850994e-05,
"loss": 0.0003,
"step": 4650
},
{
"epoch": 1.1092597000714115,
"grad_norm": 0.0035978129599243402,
"learning_rate": 3.1512338332143144e-05,
"loss": 0.0002,
"step": 4660
},
{
"epoch": 1.1116400856938824,
"grad_norm": 0.01178679708391428,
"learning_rate": 3.1472665238435294e-05,
"loss": 0.0001,
"step": 4670
},
{
"epoch": 1.1140204713163533,
"grad_norm": 0.0021494280081242323,
"learning_rate": 3.1432992144727444e-05,
"loss": 0.0003,
"step": 4680
},
{
"epoch": 1.1164008569388242,
"grad_norm": 0.00325006153434515,
"learning_rate": 3.13933190510196e-05,
"loss": 0.0001,
"step": 4690
},
{
"epoch": 1.1187812425612949,
"grad_norm": 0.006323399022221565,
"learning_rate": 3.135364595731175e-05,
"loss": 0.0002,
"step": 4700
},
{
"epoch": 1.1211616281837657,
"grad_norm": 0.006911338306963444,
"learning_rate": 3.131397286360391e-05,
"loss": 0.0001,
"step": 4710
},
{
"epoch": 1.1235420138062366,
"grad_norm": 0.0032435038592666388,
"learning_rate": 3.127429976989606e-05,
"loss": 0.0001,
"step": 4720
},
{
"epoch": 1.1259223994287075,
"grad_norm": 0.18325313925743103,
"learning_rate": 3.123462667618821e-05,
"loss": 0.0002,
"step": 4730
},
{
"epoch": 1.1283027850511782,
"grad_norm": 0.12742838263511658,
"learning_rate": 3.119495358248036e-05,
"loss": 0.0004,
"step": 4740
},
{
"epoch": 1.130683170673649,
"grad_norm": 0.001981141045689583,
"learning_rate": 3.115528048877252e-05,
"loss": 0.0002,
"step": 4750
},
{
"epoch": 1.13306355629612,
"grad_norm": 0.0030578586738556623,
"learning_rate": 3.111560739506467e-05,
"loss": 0.0001,
"step": 4760
},
{
"epoch": 1.1354439419185909,
"grad_norm": 0.00284597952850163,
"learning_rate": 3.1075934301356824e-05,
"loss": 0.0001,
"step": 4770
},
{
"epoch": 1.1378243275410616,
"grad_norm": 0.023655202239751816,
"learning_rate": 3.1036261207648975e-05,
"loss": 0.0002,
"step": 4780
},
{
"epoch": 1.1402047131635324,
"grad_norm": 0.008493321016430855,
"learning_rate": 3.0996588113941125e-05,
"loss": 0.0002,
"step": 4790
},
{
"epoch": 1.1425850987860033,
"grad_norm": 0.0038551143370568752,
"learning_rate": 3.095691502023328e-05,
"loss": 0.0001,
"step": 4800
},
{
"epoch": 1.1449654844084742,
"grad_norm": 0.0014539804542437196,
"learning_rate": 3.091724192652543e-05,
"loss": 0.0001,
"step": 4810
},
{
"epoch": 1.1473458700309451,
"grad_norm": 0.0026364317163825035,
"learning_rate": 3.087756883281758e-05,
"loss": 0.0006,
"step": 4820
},
{
"epoch": 1.1497262556534158,
"grad_norm": 0.0010660483967512846,
"learning_rate": 3.083789573910973e-05,
"loss": 0.0001,
"step": 4830
},
{
"epoch": 1.1521066412758867,
"grad_norm": 0.005250291433185339,
"learning_rate": 3.079822264540189e-05,
"loss": 0.0013,
"step": 4840
},
{
"epoch": 1.1544870268983576,
"grad_norm": 0.0824214443564415,
"learning_rate": 3.075854955169404e-05,
"loss": 0.0002,
"step": 4850
},
{
"epoch": 1.1568674125208283,
"grad_norm": 0.003175609977915883,
"learning_rate": 3.07188764579862e-05,
"loss": 0.0006,
"step": 4860
},
{
"epoch": 1.1592477981432991,
"grad_norm": 0.0015882077859714627,
"learning_rate": 3.067920336427835e-05,
"loss": 0.0001,
"step": 4870
},
{
"epoch": 1.16162818376577,
"grad_norm": 0.003802343737334013,
"learning_rate": 3.06395302705705e-05,
"loss": 0.0001,
"step": 4880
},
{
"epoch": 1.164008569388241,
"grad_norm": 0.002745629521086812,
"learning_rate": 3.059985717686265e-05,
"loss": 0.0004,
"step": 4890
},
{
"epoch": 1.1663889550107118,
"grad_norm": 0.006173206493258476,
"learning_rate": 3.0560184083154805e-05,
"loss": 0.0001,
"step": 4900
},
{
"epoch": 1.1687693406331825,
"grad_norm": 0.006407946813851595,
"learning_rate": 3.052051098944696e-05,
"loss": 0.0001,
"step": 4910
},
{
"epoch": 1.1711497262556534,
"grad_norm": 0.017478201538324356,
"learning_rate": 3.0480837895739113e-05,
"loss": 0.0003,
"step": 4920
},
{
"epoch": 1.1735301118781243,
"grad_norm": 0.0035310271196067333,
"learning_rate": 3.0441164802031263e-05,
"loss": 0.0001,
"step": 4930
},
{
"epoch": 1.1759104975005952,
"grad_norm": 0.0057274349965155125,
"learning_rate": 3.0401491708323417e-05,
"loss": 0.0001,
"step": 4940
},
{
"epoch": 1.1782908831230658,
"grad_norm": 0.013580716215074062,
"learning_rate": 3.0361818614615567e-05,
"loss": 0.0001,
"step": 4950
},
{
"epoch": 1.1806712687455367,
"grad_norm": 0.005545695312321186,
"learning_rate": 3.0322145520907724e-05,
"loss": 0.0001,
"step": 4960
},
{
"epoch": 1.1830516543680076,
"grad_norm": 0.001243342412635684,
"learning_rate": 3.0282472427199875e-05,
"loss": 0.0001,
"step": 4970
},
{
"epoch": 1.1854320399904785,
"grad_norm": 0.004315751604735851,
"learning_rate": 3.0242799333492028e-05,
"loss": 0.0001,
"step": 4980
},
{
"epoch": 1.1878124256129494,
"grad_norm": 0.0020015877671539783,
"learning_rate": 3.020312623978418e-05,
"loss": 0.0001,
"step": 4990
},
{
"epoch": 1.19019281123542,
"grad_norm": 0.0013068486005067825,
"learning_rate": 3.0163453146076332e-05,
"loss": 0.0001,
"step": 5000
},
{
"epoch": 1.192573196857891,
"grad_norm": 0.0020259215962141752,
"learning_rate": 3.0123780052368483e-05,
"loss": 0.0001,
"step": 5010
},
{
"epoch": 1.1949535824803619,
"grad_norm": 0.00229440163820982,
"learning_rate": 3.008410695866064e-05,
"loss": 0.0001,
"step": 5020
},
{
"epoch": 1.1973339681028325,
"grad_norm": 0.006487131118774414,
"learning_rate": 3.0044433864952793e-05,
"loss": 0.0,
"step": 5030
},
{
"epoch": 1.1997143537253034,
"grad_norm": 0.0029580420814454556,
"learning_rate": 3.0004760771244944e-05,
"loss": 0.0001,
"step": 5040
},
{
"epoch": 1.2020947393477743,
"grad_norm": 0.004215626046061516,
"learning_rate": 2.9965087677537097e-05,
"loss": 0.0002,
"step": 5050
},
{
"epoch": 1.2044751249702452,
"grad_norm": 0.0045689307153224945,
"learning_rate": 2.9925414583829248e-05,
"loss": 0.0001,
"step": 5060
},
{
"epoch": 1.2068555105927161,
"grad_norm": 0.0018343930132687092,
"learning_rate": 2.9885741490121398e-05,
"loss": 0.0001,
"step": 5070
},
{
"epoch": 1.2092358962151868,
"grad_norm": 0.21103504300117493,
"learning_rate": 2.984606839641355e-05,
"loss": 0.0003,
"step": 5080
},
{
"epoch": 1.2116162818376577,
"grad_norm": 0.04271009564399719,
"learning_rate": 2.980639530270571e-05,
"loss": 0.0003,
"step": 5090
},
{
"epoch": 1.2139966674601286,
"grad_norm": 0.008761608973145485,
"learning_rate": 2.976672220899786e-05,
"loss": 0.0002,
"step": 5100
},
{
"epoch": 1.2163770530825995,
"grad_norm": 0.002944928128272295,
"learning_rate": 2.9727049115290013e-05,
"loss": 0.0004,
"step": 5110
},
{
"epoch": 1.2187574387050701,
"grad_norm": 0.0039098006673157215,
"learning_rate": 2.9687376021582163e-05,
"loss": 0.0001,
"step": 5120
},
{
"epoch": 1.221137824327541,
"grad_norm": 0.007188912481069565,
"learning_rate": 2.9647702927874317e-05,
"loss": 0.0003,
"step": 5130
},
{
"epoch": 1.223518209950012,
"grad_norm": 0.0020366155076771975,
"learning_rate": 2.9608029834166467e-05,
"loss": 0.0001,
"step": 5140
},
{
"epoch": 1.2258985955724828,
"grad_norm": 0.0052825105376541615,
"learning_rate": 2.9568356740458624e-05,
"loss": 0.0001,
"step": 5150
},
{
"epoch": 1.2282789811949535,
"grad_norm": 0.0322733074426651,
"learning_rate": 2.9528683646750778e-05,
"loss": 0.0002,
"step": 5160
},
{
"epoch": 1.2306593668174244,
"grad_norm": 0.0030191782861948013,
"learning_rate": 2.9489010553042928e-05,
"loss": 0.0001,
"step": 5170
},
{
"epoch": 1.2330397524398953,
"grad_norm": 0.0158090703189373,
"learning_rate": 2.944933745933508e-05,
"loss": 0.0001,
"step": 5180
},
{
"epoch": 1.2354201380623662,
"grad_norm": 0.0023131452035158873,
"learning_rate": 2.9409664365627232e-05,
"loss": 0.0002,
"step": 5190
},
{
"epoch": 1.2378005236848368,
"grad_norm": 0.0010957660852000117,
"learning_rate": 2.9369991271919382e-05,
"loss": 0.0001,
"step": 5200
},
{
"epoch": 1.2401809093073077,
"grad_norm": 0.006251092534512281,
"learning_rate": 2.933031817821154e-05,
"loss": 0.0001,
"step": 5210
},
{
"epoch": 1.2425612949297786,
"grad_norm": 0.002981637604534626,
"learning_rate": 2.9290645084503693e-05,
"loss": 0.0001,
"step": 5220
},
{
"epoch": 1.2449416805522495,
"grad_norm": 0.0044878036715090275,
"learning_rate": 2.9250971990795844e-05,
"loss": 0.0009,
"step": 5230
},
{
"epoch": 1.2473220661747204,
"grad_norm": 0.0026534402277320623,
"learning_rate": 2.9211298897087997e-05,
"loss": 0.0001,
"step": 5240
},
{
"epoch": 1.249702451797191,
"grad_norm": 0.0017549542244523764,
"learning_rate": 2.9171625803380148e-05,
"loss": 0.0001,
"step": 5250
},
{
"epoch": 1.252082837419662,
"grad_norm": 0.0030411062762141228,
"learning_rate": 2.91319527096723e-05,
"loss": 0.0001,
"step": 5260
},
{
"epoch": 1.2544632230421329,
"grad_norm": 0.006810466758906841,
"learning_rate": 2.909227961596445e-05,
"loss": 0.0001,
"step": 5270
},
{
"epoch": 1.2568436086646035,
"grad_norm": 0.008998183533549309,
"learning_rate": 2.905260652225661e-05,
"loss": 0.0001,
"step": 5280
},
{
"epoch": 1.2592239942870744,
"grad_norm": 0.0006000595167279243,
"learning_rate": 2.9012933428548762e-05,
"loss": 0.0001,
"step": 5290
},
{
"epoch": 1.2616043799095453,
"grad_norm": 0.0037659297231584787,
"learning_rate": 2.8973260334840913e-05,
"loss": 0.0001,
"step": 5300
},
{
"epoch": 1.2639847655320162,
"grad_norm": 0.003123963950201869,
"learning_rate": 2.8933587241133063e-05,
"loss": 0.0001,
"step": 5310
},
{
"epoch": 1.2663651511544871,
"grad_norm": 0.0024721056688576937,
"learning_rate": 2.8893914147425217e-05,
"loss": 0.0001,
"step": 5320
},
{
"epoch": 1.268745536776958,
"grad_norm": 0.04851701855659485,
"learning_rate": 2.8854241053717367e-05,
"loss": 0.0002,
"step": 5330
},
{
"epoch": 1.2711259223994287,
"grad_norm": 0.0003437872801441699,
"learning_rate": 2.8814567960009524e-05,
"loss": 0.0,
"step": 5340
},
{
"epoch": 1.2735063080218996,
"grad_norm": 0.36953097581863403,
"learning_rate": 2.8774894866301678e-05,
"loss": 0.0002,
"step": 5350
},
{
"epoch": 1.2758866936443705,
"grad_norm": 0.004762616939842701,
"learning_rate": 2.8735221772593828e-05,
"loss": 0.0,
"step": 5360
},
{
"epoch": 1.2782670792668411,
"grad_norm": 0.0032022674567997456,
"learning_rate": 2.8695548678885982e-05,
"loss": 0.0001,
"step": 5370
},
{
"epoch": 1.280647464889312,
"grad_norm": 0.112340047955513,
"learning_rate": 2.8655875585178132e-05,
"loss": 0.0001,
"step": 5380
},
{
"epoch": 1.283027850511783,
"grad_norm": 0.0022161102388054132,
"learning_rate": 2.8616202491470286e-05,
"loss": 0.0001,
"step": 5390
},
{
"epoch": 1.2854082361342538,
"grad_norm": 0.0012134364806115627,
"learning_rate": 2.8576529397762443e-05,
"loss": 0.0001,
"step": 5400
},
{
"epoch": 1.2877886217567247,
"grad_norm": 0.003832167712971568,
"learning_rate": 2.8536856304054593e-05,
"loss": 0.0001,
"step": 5410
},
{
"epoch": 1.2901690073791954,
"grad_norm": 0.001739076804369688,
"learning_rate": 2.8497183210346743e-05,
"loss": 0.0,
"step": 5420
},
{
"epoch": 1.2925493930016663,
"grad_norm": 0.000749527825973928,
"learning_rate": 2.8457510116638897e-05,
"loss": 0.0001,
"step": 5430
},
{
"epoch": 1.2949297786241372,
"grad_norm": 0.006486440543085337,
"learning_rate": 2.8417837022931047e-05,
"loss": 0.0001,
"step": 5440
},
{
"epoch": 1.2973101642466078,
"grad_norm": 0.002875624457374215,
"learning_rate": 2.83781639292232e-05,
"loss": 0.0003,
"step": 5450
},
{
"epoch": 1.2996905498690787,
"grad_norm": 0.011916677467525005,
"learning_rate": 2.833849083551535e-05,
"loss": 0.0002,
"step": 5460
},
{
"epoch": 1.3020709354915496,
"grad_norm": 0.014456122182309628,
"learning_rate": 2.829881774180751e-05,
"loss": 0.0001,
"step": 5470
},
{
"epoch": 1.3044513211140205,
"grad_norm": 0.00652431882917881,
"learning_rate": 2.8259144648099662e-05,
"loss": 0.0003,
"step": 5480
},
{
"epoch": 1.3068317067364914,
"grad_norm": 0.004612395539879799,
"learning_rate": 2.8219471554391813e-05,
"loss": 0.0001,
"step": 5490
},
{
"epoch": 1.309212092358962,
"grad_norm": 0.0016554853646084666,
"learning_rate": 2.8179798460683966e-05,
"loss": 0.0001,
"step": 5500
},
{
"epoch": 1.311592477981433,
"grad_norm": 0.00955954473465681,
"learning_rate": 2.8140125366976117e-05,
"loss": 0.0003,
"step": 5510
},
{
"epoch": 1.3139728636039039,
"grad_norm": 0.0014887260040268302,
"learning_rate": 2.8100452273268267e-05,
"loss": 0.0,
"step": 5520
},
{
"epoch": 1.3163532492263748,
"grad_norm": 0.004022569395601749,
"learning_rate": 2.8060779179560427e-05,
"loss": 0.0001,
"step": 5530
},
{
"epoch": 1.3187336348488454,
"grad_norm": 0.01300437469035387,
"learning_rate": 2.8021106085852578e-05,
"loss": 0.0001,
"step": 5540
},
{
"epoch": 1.3211140204713163,
"grad_norm": 0.0033303312957286835,
"learning_rate": 2.7981432992144728e-05,
"loss": 0.0001,
"step": 5550
},
{
"epoch": 1.3234944060937872,
"grad_norm": 0.00033377157524228096,
"learning_rate": 2.794175989843688e-05,
"loss": 0.0001,
"step": 5560
},
{
"epoch": 1.325874791716258,
"grad_norm": 0.001646155840717256,
"learning_rate": 2.7902086804729032e-05,
"loss": 0.0,
"step": 5570
},
{
"epoch": 1.328255177338729,
"grad_norm": 0.009458147920668125,
"learning_rate": 2.7862413711021186e-05,
"loss": 0.0,
"step": 5580
},
{
"epoch": 1.3306355629611997,
"grad_norm": 0.044097207486629486,
"learning_rate": 2.7822740617313343e-05,
"loss": 0.0001,
"step": 5590
},
{
"epoch": 1.3330159485836706,
"grad_norm": 0.3018762469291687,
"learning_rate": 2.7783067523605493e-05,
"loss": 0.0003,
"step": 5600
},
{
"epoch": 1.3353963342061415,
"grad_norm": 0.00142444740049541,
"learning_rate": 2.7743394429897647e-05,
"loss": 0.0,
"step": 5610
},
{
"epoch": 1.3377767198286121,
"grad_norm": 0.026065746322274208,
"learning_rate": 2.7703721336189797e-05,
"loss": 0.0001,
"step": 5620
},
{
"epoch": 1.340157105451083,
"grad_norm": 0.002285444876179099,
"learning_rate": 2.766404824248195e-05,
"loss": 0.0004,
"step": 5630
},
{
"epoch": 1.342537491073554,
"grad_norm": 0.0023544467985630035,
"learning_rate": 2.76243751487741e-05,
"loss": 0.0001,
"step": 5640
},
{
"epoch": 1.3449178766960248,
"grad_norm": 0.005093382205814123,
"learning_rate": 2.758470205506625e-05,
"loss": 0.0008,
"step": 5650
},
{
"epoch": 1.3472982623184957,
"grad_norm": 0.01395428366959095,
"learning_rate": 2.754502896135841e-05,
"loss": 0.0001,
"step": 5660
},
{
"epoch": 1.3496786479409664,
"grad_norm": 0.0021814145147800446,
"learning_rate": 2.7505355867650562e-05,
"loss": 0.0001,
"step": 5670
},
{
"epoch": 1.3520590335634373,
"grad_norm": 0.0020568270701915026,
"learning_rate": 2.7465682773942712e-05,
"loss": 0.0001,
"step": 5680
},
{
"epoch": 1.3544394191859082,
"grad_norm": 0.001564579550176859,
"learning_rate": 2.7426009680234866e-05,
"loss": 0.0002,
"step": 5690
},
{
"epoch": 1.3568198048083788,
"grad_norm": 0.0009057559072971344,
"learning_rate": 2.7386336586527016e-05,
"loss": 0.0001,
"step": 5700
},
{
"epoch": 1.3592001904308497,
"grad_norm": 0.005018309690058231,
"learning_rate": 2.734666349281917e-05,
"loss": 0.0001,
"step": 5710
},
{
"epoch": 1.3615805760533206,
"grad_norm": 0.0018629188416525722,
"learning_rate": 2.7306990399111327e-05,
"loss": 0.0003,
"step": 5720
},
{
"epoch": 1.3639609616757915,
"grad_norm": 0.001482214662246406,
"learning_rate": 2.7267317305403478e-05,
"loss": 0.0001,
"step": 5730
},
{
"epoch": 1.3663413472982624,
"grad_norm": 0.012405039742588997,
"learning_rate": 2.722764421169563e-05,
"loss": 0.0005,
"step": 5740
},
{
"epoch": 1.3687217329207333,
"grad_norm": 0.0018485913751646876,
"learning_rate": 2.718797111798778e-05,
"loss": 0.0,
"step": 5750
},
{
"epoch": 1.371102118543204,
"grad_norm": 0.0015681314980611205,
"learning_rate": 2.7148298024279932e-05,
"loss": 0.0001,
"step": 5760
},
{
"epoch": 1.3734825041656749,
"grad_norm": 0.017725007608532906,
"learning_rate": 2.7108624930572086e-05,
"loss": 0.0001,
"step": 5770
},
{
"epoch": 1.3758628897881457,
"grad_norm": 0.011187481693923473,
"learning_rate": 2.7068951836864243e-05,
"loss": 0.0001,
"step": 5780
},
{
"epoch": 1.3782432754106164,
"grad_norm": 0.003125675953924656,
"learning_rate": 2.7029278743156393e-05,
"loss": 0.0001,
"step": 5790
},
{
"epoch": 1.3806236610330873,
"grad_norm": 0.004620529245585203,
"learning_rate": 2.6989605649448547e-05,
"loss": 0.0001,
"step": 5800
},
{
"epoch": 1.3830040466555582,
"grad_norm": 0.004881042055785656,
"learning_rate": 2.6949932555740697e-05,
"loss": 0.0001,
"step": 5810
},
{
"epoch": 1.385384432278029,
"grad_norm": 0.015351341105997562,
"learning_rate": 2.691025946203285e-05,
"loss": 0.0001,
"step": 5820
},
{
"epoch": 1.3877648179005,
"grad_norm": 0.06165415793657303,
"learning_rate": 2.6870586368325e-05,
"loss": 0.0001,
"step": 5830
},
{
"epoch": 1.3901452035229707,
"grad_norm": 0.000691259338054806,
"learning_rate": 2.6830913274617155e-05,
"loss": 0.0001,
"step": 5840
},
{
"epoch": 1.3925255891454416,
"grad_norm": 0.006264138966798782,
"learning_rate": 2.6791240180909312e-05,
"loss": 0.0,
"step": 5850
},
{
"epoch": 1.3949059747679124,
"grad_norm": 0.0016265185549855232,
"learning_rate": 2.6751567087201462e-05,
"loss": 0.0001,
"step": 5860
},
{
"epoch": 1.3972863603903831,
"grad_norm": 0.0036318551283329725,
"learning_rate": 2.6711893993493616e-05,
"loss": 0.0,
"step": 5870
},
{
"epoch": 1.399666746012854,
"grad_norm": 0.0011168549535796046,
"learning_rate": 2.6672220899785766e-05,
"loss": 0.0001,
"step": 5880
},
{
"epoch": 1.402047131635325,
"grad_norm": 0.011570369824767113,
"learning_rate": 2.6632547806077916e-05,
"loss": 0.0001,
"step": 5890
},
{
"epoch": 1.4044275172577958,
"grad_norm": 0.004564432427287102,
"learning_rate": 2.659287471237007e-05,
"loss": 0.0001,
"step": 5900
},
{
"epoch": 1.4068079028802667,
"grad_norm": 0.003310930449515581,
"learning_rate": 2.6553201618662227e-05,
"loss": 0.0001,
"step": 5910
},
{
"epoch": 1.4091882885027374,
"grad_norm": 0.005474664270877838,
"learning_rate": 2.6513528524954377e-05,
"loss": 0.0,
"step": 5920
},
{
"epoch": 1.4115686741252083,
"grad_norm": 0.003840883495286107,
"learning_rate": 2.647385543124653e-05,
"loss": 0.0,
"step": 5930
},
{
"epoch": 1.4139490597476791,
"grad_norm": 0.0011354766320437193,
"learning_rate": 2.643418233753868e-05,
"loss": 0.0001,
"step": 5940
},
{
"epoch": 1.41632944537015,
"grad_norm": 0.0011250395327806473,
"learning_rate": 2.6394509243830835e-05,
"loss": 0.0001,
"step": 5950
},
{
"epoch": 1.4187098309926207,
"grad_norm": 0.0025986500550061464,
"learning_rate": 2.6354836150122985e-05,
"loss": 0.0,
"step": 5960
},
{
"epoch": 1.4210902166150916,
"grad_norm": 0.0018986169015988708,
"learning_rate": 2.6315163056415143e-05,
"loss": 0.0001,
"step": 5970
},
{
"epoch": 1.4234706022375625,
"grad_norm": 0.006072606425732374,
"learning_rate": 2.6275489962707296e-05,
"loss": 0.0001,
"step": 5980
},
{
"epoch": 1.4258509878600334,
"grad_norm": 0.005382834933698177,
"learning_rate": 2.6235816868999447e-05,
"loss": 0.0001,
"step": 5990
},
{
"epoch": 1.4282313734825043,
"grad_norm": 0.0069602313451468945,
"learning_rate": 2.6196143775291597e-05,
"loss": 0.0001,
"step": 6000
},
{
"epoch": 1.430611759104975,
"grad_norm": 0.00503483647480607,
"learning_rate": 2.615647068158375e-05,
"loss": 0.0001,
"step": 6010
},
{
"epoch": 1.4329921447274458,
"grad_norm": 0.009482208639383316,
"learning_rate": 2.61167975878759e-05,
"loss": 0.0001,
"step": 6020
},
{
"epoch": 1.4353725303499167,
"grad_norm": 0.003071409650146961,
"learning_rate": 2.6077124494168058e-05,
"loss": 0.0003,
"step": 6030
},
{
"epoch": 1.4377529159723874,
"grad_norm": 0.025201931595802307,
"learning_rate": 2.603745140046021e-05,
"loss": 0.0002,
"step": 6040
},
{
"epoch": 1.4401333015948583,
"grad_norm": 0.029845217242836952,
"learning_rate": 2.5997778306752362e-05,
"loss": 0.0001,
"step": 6050
},
{
"epoch": 1.4425136872173292,
"grad_norm": 0.002946893684566021,
"learning_rate": 2.5958105213044516e-05,
"loss": 0.0001,
"step": 6060
},
{
"epoch": 1.4448940728398,
"grad_norm": 0.002334748860448599,
"learning_rate": 2.5918432119336666e-05,
"loss": 0.0001,
"step": 6070
},
{
"epoch": 1.447274458462271,
"grad_norm": 0.0038676797412335873,
"learning_rate": 2.587875902562882e-05,
"loss": 0.0001,
"step": 6080
},
{
"epoch": 1.4496548440847417,
"grad_norm": 0.39916858077049255,
"learning_rate": 2.583908593192097e-05,
"loss": 0.0005,
"step": 6090
},
{
"epoch": 1.4520352297072125,
"grad_norm": 0.005464503075927496,
"learning_rate": 2.5799412838213127e-05,
"loss": 0.0,
"step": 6100
},
{
"epoch": 1.4544156153296834,
"grad_norm": 0.002350292168557644,
"learning_rate": 2.5759739744505277e-05,
"loss": 0.0001,
"step": 6110
},
{
"epoch": 1.4567960009521541,
"grad_norm": 0.02950800396502018,
"learning_rate": 2.572006665079743e-05,
"loss": 0.0001,
"step": 6120
},
{
"epoch": 1.459176386574625,
"grad_norm": 0.0020270231179893017,
"learning_rate": 2.568039355708958e-05,
"loss": 0.0001,
"step": 6130
},
{
"epoch": 1.461556772197096,
"grad_norm": 0.29163315892219543,
"learning_rate": 2.5640720463381735e-05,
"loss": 0.0004,
"step": 6140
},
{
"epoch": 1.4639371578195668,
"grad_norm": 0.0028463418129831553,
"learning_rate": 2.5601047369673885e-05,
"loss": 0.0001,
"step": 6150
},
{
"epoch": 1.4663175434420377,
"grad_norm": 0.007839919067919254,
"learning_rate": 2.5561374275966042e-05,
"loss": 0.0,
"step": 6160
},
{
"epoch": 1.4686979290645086,
"grad_norm": 0.0009790142066776752,
"learning_rate": 2.5521701182258196e-05,
"loss": 0.0001,
"step": 6170
},
{
"epoch": 1.4710783146869792,
"grad_norm": 0.019366919994354248,
"learning_rate": 2.5482028088550346e-05,
"loss": 0.0001,
"step": 6180
},
{
"epoch": 1.4734587003094501,
"grad_norm": 0.002335514174774289,
"learning_rate": 2.54423549948425e-05,
"loss": 0.0001,
"step": 6190
},
{
"epoch": 1.475839085931921,
"grad_norm": 0.004448035266250372,
"learning_rate": 2.540268190113465e-05,
"loss": 0.0,
"step": 6200
},
{
"epoch": 1.4782194715543917,
"grad_norm": 0.0020590273197740316,
"learning_rate": 2.53630088074268e-05,
"loss": 0.0,
"step": 6210
},
{
"epoch": 1.4805998571768626,
"grad_norm": 0.0015115641290321946,
"learning_rate": 2.532333571371896e-05,
"loss": 0.0001,
"step": 6220
},
{
"epoch": 1.4829802427993335,
"grad_norm": 0.0024076756089925766,
"learning_rate": 2.528366262001111e-05,
"loss": 0.0003,
"step": 6230
},
{
"epoch": 1.4853606284218044,
"grad_norm": 0.0048133935779333115,
"learning_rate": 2.5243989526303262e-05,
"loss": 0.0001,
"step": 6240
},
{
"epoch": 1.4877410140442753,
"grad_norm": 0.015479459427297115,
"learning_rate": 2.5204316432595416e-05,
"loss": 0.0001,
"step": 6250
},
{
"epoch": 1.490121399666746,
"grad_norm": 0.1010046973824501,
"learning_rate": 2.5164643338887566e-05,
"loss": 0.0001,
"step": 6260
},
{
"epoch": 1.4925017852892168,
"grad_norm": 0.0011843384709209204,
"learning_rate": 2.512497024517972e-05,
"loss": 0.0002,
"step": 6270
},
{
"epoch": 1.4948821709116877,
"grad_norm": 0.002041852567344904,
"learning_rate": 2.508529715147187e-05,
"loss": 0.0001,
"step": 6280
},
{
"epoch": 1.4972625565341584,
"grad_norm": 0.002975156530737877,
"learning_rate": 2.5045624057764027e-05,
"loss": 0.0001,
"step": 6290
},
{
"epoch": 1.4996429421566293,
"grad_norm": 0.005752989556640387,
"learning_rate": 2.500595096405618e-05,
"loss": 0.0001,
"step": 6300
},
{
"epoch": 1.5020233277791002,
"grad_norm": 0.002325852634385228,
"learning_rate": 2.496627787034833e-05,
"loss": 0.0,
"step": 6310
},
{
"epoch": 1.504403713401571,
"grad_norm": 0.006379146594554186,
"learning_rate": 2.4926604776640485e-05,
"loss": 0.0001,
"step": 6320
},
{
"epoch": 1.506784099024042,
"grad_norm": 0.0011644313344731927,
"learning_rate": 2.488693168293264e-05,
"loss": 0.0,
"step": 6330
},
{
"epoch": 1.5091644846465129,
"grad_norm": 0.06679144501686096,
"learning_rate": 2.484725858922479e-05,
"loss": 0.0001,
"step": 6340
},
{
"epoch": 1.5115448702689835,
"grad_norm": 0.010065040551126003,
"learning_rate": 2.4807585495516942e-05,
"loss": 0.0003,
"step": 6350
},
{
"epoch": 1.5139252558914544,
"grad_norm": 0.00404448714107275,
"learning_rate": 2.4767912401809093e-05,
"loss": 0.0001,
"step": 6360
},
{
"epoch": 1.516305641513925,
"grad_norm": 0.005027102772146463,
"learning_rate": 2.4728239308101246e-05,
"loss": 0.0001,
"step": 6370
},
{
"epoch": 1.518686027136396,
"grad_norm": 0.0007329948712140322,
"learning_rate": 2.46885662143934e-05,
"loss": 0.0001,
"step": 6380
},
{
"epoch": 1.521066412758867,
"grad_norm": 0.008010495454072952,
"learning_rate": 2.464889312068555e-05,
"loss": 0.0001,
"step": 6390
},
{
"epoch": 1.5234467983813378,
"grad_norm": 0.0004263845912646502,
"learning_rate": 2.4609220026977704e-05,
"loss": 0.0,
"step": 6400
},
{
"epoch": 1.5258271840038087,
"grad_norm": 0.0008505060104653239,
"learning_rate": 2.4569546933269858e-05,
"loss": 0.0001,
"step": 6410
},
{
"epoch": 1.5282075696262796,
"grad_norm": 0.005009577609598637,
"learning_rate": 2.4529873839562008e-05,
"loss": 0.0001,
"step": 6420
},
{
"epoch": 1.5305879552487502,
"grad_norm": 0.0055831428617239,
"learning_rate": 2.4490200745854165e-05,
"loss": 0.0,
"step": 6430
},
{
"epoch": 1.5329683408712211,
"grad_norm": 0.0025661292020231485,
"learning_rate": 2.4450527652146315e-05,
"loss": 0.0002,
"step": 6440
},
{
"epoch": 1.535348726493692,
"grad_norm": 0.002652715193107724,
"learning_rate": 2.4410854558438466e-05,
"loss": 0.0,
"step": 6450
},
{
"epoch": 1.5377291121161627,
"grad_norm": 0.0017773109721019864,
"learning_rate": 2.4371181464730623e-05,
"loss": 0.0001,
"step": 6460
},
{
"epoch": 1.5401094977386336,
"grad_norm": 0.023734472692012787,
"learning_rate": 2.4331508371022773e-05,
"loss": 0.0001,
"step": 6470
},
{
"epoch": 1.5424898833611045,
"grad_norm": 0.0018312609754502773,
"learning_rate": 2.4291835277314927e-05,
"loss": 0.0001,
"step": 6480
},
{
"epoch": 1.5448702689835754,
"grad_norm": 0.004327055066823959,
"learning_rate": 2.425216218360708e-05,
"loss": 0.0001,
"step": 6490
},
{
"epoch": 1.5472506546060463,
"grad_norm": 0.0021172019187361,
"learning_rate": 2.421248908989923e-05,
"loss": 0.0001,
"step": 6500
},
{
"epoch": 1.5496310402285172,
"grad_norm": 0.001905101933516562,
"learning_rate": 2.4172815996191385e-05,
"loss": 0.0,
"step": 6510
},
{
"epoch": 1.5520114258509878,
"grad_norm": 0.0016990803414955735,
"learning_rate": 2.4133142902483538e-05,
"loss": 0.0001,
"step": 6520
},
{
"epoch": 1.5543918114734587,
"grad_norm": 0.0022508346009999514,
"learning_rate": 2.409346980877569e-05,
"loss": 0.0001,
"step": 6530
},
{
"epoch": 1.5567721970959294,
"grad_norm": 0.0018837592797353864,
"learning_rate": 2.4053796715067842e-05,
"loss": 0.0001,
"step": 6540
},
{
"epoch": 1.5591525827184003,
"grad_norm": 0.001968635246157646,
"learning_rate": 2.4014123621359993e-05,
"loss": 0.0002,
"step": 6550
},
{
"epoch": 1.5615329683408712,
"grad_norm": 0.0019730927888303995,
"learning_rate": 2.397445052765215e-05,
"loss": 0.0001,
"step": 6560
},
{
"epoch": 1.563913353963342,
"grad_norm": 0.0006384404841810465,
"learning_rate": 2.39347774339443e-05,
"loss": 0.0,
"step": 6570
},
{
"epoch": 1.566293739585813,
"grad_norm": 0.05303851515054703,
"learning_rate": 2.389510434023645e-05,
"loss": 0.0002,
"step": 6580
},
{
"epoch": 1.5686741252082839,
"grad_norm": 0.009338784962892532,
"learning_rate": 2.3855431246528607e-05,
"loss": 0.0,
"step": 6590
},
{
"epoch": 1.5710545108307545,
"grad_norm": 0.001042340649291873,
"learning_rate": 2.3815758152820758e-05,
"loss": 0.0,
"step": 6600
},
{
"epoch": 1.5734348964532254,
"grad_norm": 0.008856063708662987,
"learning_rate": 2.377608505911291e-05,
"loss": 0.0001,
"step": 6610
},
{
"epoch": 1.575815282075696,
"grad_norm": 0.0010636444203555584,
"learning_rate": 2.3736411965405065e-05,
"loss": 0.0001,
"step": 6620
},
{
"epoch": 1.578195667698167,
"grad_norm": 0.044303007423877716,
"learning_rate": 2.3696738871697215e-05,
"loss": 0.0001,
"step": 6630
},
{
"epoch": 1.5805760533206379,
"grad_norm": 0.003368295030668378,
"learning_rate": 2.365706577798937e-05,
"loss": 0.0001,
"step": 6640
},
{
"epoch": 1.5829564389431088,
"grad_norm": 0.0010406200308352709,
"learning_rate": 2.3617392684281523e-05,
"loss": 0.0001,
"step": 6650
},
{
"epoch": 1.5853368245655797,
"grad_norm": 0.009850569069385529,
"learning_rate": 2.3577719590573673e-05,
"loss": 0.0,
"step": 6660
},
{
"epoch": 1.5877172101880506,
"grad_norm": 0.00514467665925622,
"learning_rate": 2.3538046496865827e-05,
"loss": 0.0001,
"step": 6670
},
{
"epoch": 1.5900975958105215,
"grad_norm": 0.00200643390417099,
"learning_rate": 2.349837340315798e-05,
"loss": 0.0002,
"step": 6680
},
{
"epoch": 1.5924779814329921,
"grad_norm": 0.01371715497225523,
"learning_rate": 2.345870030945013e-05,
"loss": 0.0002,
"step": 6690
},
{
"epoch": 1.594858367055463,
"grad_norm": 0.0005170275107957423,
"learning_rate": 2.3419027215742284e-05,
"loss": 0.0001,
"step": 6700
},
{
"epoch": 1.5972387526779337,
"grad_norm": 0.0018967930227518082,
"learning_rate": 2.3379354122034438e-05,
"loss": 0.0,
"step": 6710
},
{
"epoch": 1.5996191383004046,
"grad_norm": 0.002288557356223464,
"learning_rate": 2.3339681028326592e-05,
"loss": 0.0003,
"step": 6720
},
{
"epoch": 1.6019995239228755,
"grad_norm": 0.0017687254585325718,
"learning_rate": 2.3300007934618742e-05,
"loss": 0.0001,
"step": 6730
},
{
"epoch": 1.6043799095453464,
"grad_norm": 0.023880669847130775,
"learning_rate": 2.3260334840910893e-05,
"loss": 0.0001,
"step": 6740
},
{
"epoch": 1.6067602951678173,
"grad_norm": 0.004767647013068199,
"learning_rate": 2.322066174720305e-05,
"loss": 0.0001,
"step": 6750
},
{
"epoch": 1.6091406807902882,
"grad_norm": 0.0016061540227383375,
"learning_rate": 2.31809886534952e-05,
"loss": 0.0001,
"step": 6760
},
{
"epoch": 1.6115210664127588,
"grad_norm": 0.009586431086063385,
"learning_rate": 2.3141315559787354e-05,
"loss": 0.0001,
"step": 6770
},
{
"epoch": 1.6139014520352297,
"grad_norm": 0.003596968250349164,
"learning_rate": 2.3101642466079507e-05,
"loss": 0.0001,
"step": 6780
},
{
"epoch": 1.6162818376577004,
"grad_norm": 0.003184641245752573,
"learning_rate": 2.3061969372371658e-05,
"loss": 0.0001,
"step": 6790
},
{
"epoch": 1.6186622232801713,
"grad_norm": 0.02113034948706627,
"learning_rate": 2.302229627866381e-05,
"loss": 0.0,
"step": 6800
},
{
"epoch": 1.6210426089026422,
"grad_norm": 0.0022694601211696863,
"learning_rate": 2.2982623184955965e-05,
"loss": 0.0001,
"step": 6810
},
{
"epoch": 1.623422994525113,
"grad_norm": 0.0007104437099769711,
"learning_rate": 2.2942950091248115e-05,
"loss": 0.0,
"step": 6820
},
{
"epoch": 1.625803380147584,
"grad_norm": 0.004562158603221178,
"learning_rate": 2.290327699754027e-05,
"loss": 0.0001,
"step": 6830
},
{
"epoch": 1.6281837657700549,
"grad_norm": 0.0015846043825149536,
"learning_rate": 2.2863603903832423e-05,
"loss": 0.0001,
"step": 6840
},
{
"epoch": 1.6305641513925258,
"grad_norm": 0.012255080044269562,
"learning_rate": 2.2823930810124573e-05,
"loss": 0.0001,
"step": 6850
},
{
"epoch": 1.6329445370149964,
"grad_norm": 0.0012517154682427645,
"learning_rate": 2.2784257716416727e-05,
"loss": 0.0001,
"step": 6860
},
{
"epoch": 1.6353249226374673,
"grad_norm": 0.0006557099404744804,
"learning_rate": 2.274458462270888e-05,
"loss": 0.0001,
"step": 6870
},
{
"epoch": 1.637705308259938,
"grad_norm": 0.0007641498814336956,
"learning_rate": 2.2704911529001034e-05,
"loss": 0.0,
"step": 6880
},
{
"epoch": 1.6400856938824089,
"grad_norm": 0.005642781965434551,
"learning_rate": 2.2665238435293184e-05,
"loss": 0.0,
"step": 6890
},
{
"epoch": 1.6424660795048798,
"grad_norm": 0.0022149153519421816,
"learning_rate": 2.2625565341585338e-05,
"loss": 0.0004,
"step": 6900
},
{
"epoch": 1.6448464651273507,
"grad_norm": 0.8982350826263428,
"learning_rate": 2.2585892247877492e-05,
"loss": 0.0003,
"step": 6910
},
{
"epoch": 1.6472268507498216,
"grad_norm": 0.002032769611105323,
"learning_rate": 2.2546219154169642e-05,
"loss": 0.0001,
"step": 6920
},
{
"epoch": 1.6496072363722925,
"grad_norm": 0.0021233465522527695,
"learning_rate": 2.2506546060461796e-05,
"loss": 0.0001,
"step": 6930
},
{
"epoch": 1.6519876219947631,
"grad_norm": 0.019824443385004997,
"learning_rate": 2.246687296675395e-05,
"loss": 0.0001,
"step": 6940
},
{
"epoch": 1.654368007617234,
"grad_norm": 0.002160045551136136,
"learning_rate": 2.24271998730461e-05,
"loss": 0.0001,
"step": 6950
},
{
"epoch": 1.6567483932397047,
"grad_norm": 0.002742405980825424,
"learning_rate": 2.2387526779338254e-05,
"loss": 0.0003,
"step": 6960
},
{
"epoch": 1.6591287788621756,
"grad_norm": 0.04358428716659546,
"learning_rate": 2.2347853685630407e-05,
"loss": 0.0003,
"step": 6970
},
{
"epoch": 1.6615091644846465,
"grad_norm": 0.0023650035727769136,
"learning_rate": 2.2308180591922558e-05,
"loss": 0.0,
"step": 6980
},
{
"epoch": 1.6638895501071174,
"grad_norm": 0.0027010326739400625,
"learning_rate": 2.226850749821471e-05,
"loss": 0.0001,
"step": 6990
},
{
"epoch": 1.6662699357295883,
"grad_norm": 0.01885942928493023,
"learning_rate": 2.2228834404506865e-05,
"loss": 0.0001,
"step": 7000
},
{
"epoch": 1.6686503213520592,
"grad_norm": 0.013014287687838078,
"learning_rate": 2.218916131079902e-05,
"loss": 0.0004,
"step": 7010
},
{
"epoch": 1.6710307069745298,
"grad_norm": 0.0015542235923931003,
"learning_rate": 2.214948821709117e-05,
"loss": 0.0001,
"step": 7020
},
{
"epoch": 1.6734110925970007,
"grad_norm": 0.011335782706737518,
"learning_rate": 2.2109815123383323e-05,
"loss": 0.0,
"step": 7030
},
{
"epoch": 1.6757914782194716,
"grad_norm": 0.1068568155169487,
"learning_rate": 2.2070142029675476e-05,
"loss": 0.0001,
"step": 7040
},
{
"epoch": 1.6781718638419423,
"grad_norm": 0.004407468251883984,
"learning_rate": 2.2030468935967627e-05,
"loss": 0.0001,
"step": 7050
},
{
"epoch": 1.6805522494644132,
"grad_norm": 0.0026373250875622034,
"learning_rate": 2.199079584225978e-05,
"loss": 0.0,
"step": 7060
},
{
"epoch": 1.682932635086884,
"grad_norm": 0.020453903824090958,
"learning_rate": 2.1951122748551934e-05,
"loss": 0.0001,
"step": 7070
},
{
"epoch": 1.685313020709355,
"grad_norm": 0.009605340659618378,
"learning_rate": 2.1911449654844084e-05,
"loss": 0.0003,
"step": 7080
},
{
"epoch": 1.6876934063318259,
"grad_norm": 0.0008563417941331863,
"learning_rate": 2.1871776561136238e-05,
"loss": 0.0,
"step": 7090
},
{
"epoch": 1.6900737919542967,
"grad_norm": 0.0017095934599637985,
"learning_rate": 2.1832103467428392e-05,
"loss": 0.0001,
"step": 7100
},
{
"epoch": 1.6924541775767674,
"grad_norm": 0.0017231311649084091,
"learning_rate": 2.1792430373720542e-05,
"loss": 0.0001,
"step": 7110
},
{
"epoch": 1.6948345631992383,
"grad_norm": 0.0004322198801673949,
"learning_rate": 2.17527572800127e-05,
"loss": 0.0001,
"step": 7120
},
{
"epoch": 1.697214948821709,
"grad_norm": 0.06828305870294571,
"learning_rate": 2.171308418630485e-05,
"loss": 0.0003,
"step": 7130
},
{
"epoch": 1.6995953344441799,
"grad_norm": 0.012662236578762531,
"learning_rate": 2.1673411092597e-05,
"loss": 0.0,
"step": 7140
},
{
"epoch": 1.7019757200666508,
"grad_norm": 0.0004414702707435936,
"learning_rate": 2.1633737998889153e-05,
"loss": 0.0,
"step": 7150
},
{
"epoch": 1.7043561056891217,
"grad_norm": 0.0018225832609459758,
"learning_rate": 2.1594064905181307e-05,
"loss": 0.0001,
"step": 7160
},
{
"epoch": 1.7067364913115926,
"grad_norm": 0.10008008033037186,
"learning_rate": 2.155439181147346e-05,
"loss": 0.0001,
"step": 7170
},
{
"epoch": 1.7091168769340634,
"grad_norm": 0.0027361391112208366,
"learning_rate": 2.151471871776561e-05,
"loss": 0.0,
"step": 7180
},
{
"epoch": 1.7114972625565341,
"grad_norm": 0.0021505611948668957,
"learning_rate": 2.1475045624057765e-05,
"loss": 0.0,
"step": 7190
},
{
"epoch": 1.713877648179005,
"grad_norm": 0.00697895884513855,
"learning_rate": 2.143537253034992e-05,
"loss": 0.0,
"step": 7200
},
{
"epoch": 1.7162580338014757,
"grad_norm": 0.002057724166661501,
"learning_rate": 2.139569943664207e-05,
"loss": 0.0,
"step": 7210
},
{
"epoch": 1.7186384194239466,
"grad_norm": 0.002399923512712121,
"learning_rate": 2.1356026342934223e-05,
"loss": 0.0001,
"step": 7220
},
{
"epoch": 1.7210188050464175,
"grad_norm": 0.3061892092227936,
"learning_rate": 2.1316353249226376e-05,
"loss": 0.0002,
"step": 7230
},
{
"epoch": 1.7233991906688884,
"grad_norm": 0.004888875875622034,
"learning_rate": 2.1276680155518527e-05,
"loss": 0.0,
"step": 7240
},
{
"epoch": 1.7257795762913593,
"grad_norm": 0.04453931376338005,
"learning_rate": 2.1237007061810684e-05,
"loss": 0.0001,
"step": 7250
},
{
"epoch": 1.7281599619138301,
"grad_norm": 0.02463744953274727,
"learning_rate": 2.1197333968102834e-05,
"loss": 0.0001,
"step": 7260
},
{
"epoch": 1.730540347536301,
"grad_norm": 0.002113641705363989,
"learning_rate": 2.1157660874394984e-05,
"loss": 0.0001,
"step": 7270
},
{
"epoch": 1.7329207331587717,
"grad_norm": 0.0024889137130230665,
"learning_rate": 2.111798778068714e-05,
"loss": 0.0001,
"step": 7280
},
{
"epoch": 1.7353011187812426,
"grad_norm": 0.10477261245250702,
"learning_rate": 2.107831468697929e-05,
"loss": 0.0001,
"step": 7290
},
{
"epoch": 1.7376815044037133,
"grad_norm": 0.0008585329633206129,
"learning_rate": 2.1038641593271445e-05,
"loss": 0.0001,
"step": 7300
},
{
"epoch": 1.7400618900261842,
"grad_norm": 0.17968738079071045,
"learning_rate": 2.09989684995636e-05,
"loss": 0.0001,
"step": 7310
},
{
"epoch": 1.742442275648655,
"grad_norm": 0.0023223140742629766,
"learning_rate": 2.095929540585575e-05,
"loss": 0.0001,
"step": 7320
},
{
"epoch": 1.744822661271126,
"grad_norm": 0.0016741958679631352,
"learning_rate": 2.0919622312147903e-05,
"loss": 0.0002,
"step": 7330
},
{
"epoch": 1.7472030468935968,
"grad_norm": 0.009992700070142746,
"learning_rate": 2.0879949218440053e-05,
"loss": 0.0,
"step": 7340
},
{
"epoch": 1.7495834325160677,
"grad_norm": 0.002163327531889081,
"learning_rate": 2.0840276124732207e-05,
"loss": 0.0,
"step": 7350
},
{
"epoch": 1.7519638181385384,
"grad_norm": 0.15539680421352386,
"learning_rate": 2.080060303102436e-05,
"loss": 0.0003,
"step": 7360
},
{
"epoch": 1.7543442037610093,
"grad_norm": 0.002331450814381242,
"learning_rate": 2.076092993731651e-05,
"loss": 0.0,
"step": 7370
},
{
"epoch": 1.75672458938348,
"grad_norm": 0.0014541965210810304,
"learning_rate": 2.0721256843608665e-05,
"loss": 0.0,
"step": 7380
},
{
"epoch": 1.7591049750059509,
"grad_norm": 0.002874292666092515,
"learning_rate": 2.068158374990082e-05,
"loss": 0.0,
"step": 7390
},
{
"epoch": 1.7614853606284218,
"grad_norm": 0.046790674328804016,
"learning_rate": 2.064191065619297e-05,
"loss": 0.0001,
"step": 7400
},
{
"epoch": 1.7638657462508927,
"grad_norm": 0.012541896663606167,
"learning_rate": 2.0602237562485126e-05,
"loss": 0.0001,
"step": 7410
},
{
"epoch": 1.7662461318733635,
"grad_norm": 0.0005884987185709178,
"learning_rate": 2.0562564468777276e-05,
"loss": 0.0001,
"step": 7420
},
{
"epoch": 1.7686265174958344,
"grad_norm": 0.0090475520119071,
"learning_rate": 2.0522891375069426e-05,
"loss": 0.0001,
"step": 7430
},
{
"epoch": 1.7710069031183053,
"grad_norm": 0.04852410405874252,
"learning_rate": 2.0483218281361584e-05,
"loss": 0.0001,
"step": 7440
},
{
"epoch": 1.773387288740776,
"grad_norm": 0.003311296459287405,
"learning_rate": 2.0443545187653734e-05,
"loss": 0.0001,
"step": 7450
},
{
"epoch": 1.775767674363247,
"grad_norm": 0.03242022171616554,
"learning_rate": 2.0403872093945888e-05,
"loss": 0.0001,
"step": 7460
},
{
"epoch": 1.7781480599857176,
"grad_norm": 0.010833712294697762,
"learning_rate": 2.036419900023804e-05,
"loss": 0.0002,
"step": 7470
},
{
"epoch": 1.7805284456081885,
"grad_norm": 0.0031983698718249798,
"learning_rate": 2.032452590653019e-05,
"loss": 0.0001,
"step": 7480
},
{
"epoch": 1.7829088312306594,
"grad_norm": 0.021590987220406532,
"learning_rate": 2.0284852812822345e-05,
"loss": 0.0001,
"step": 7490
},
{
"epoch": 1.7852892168531302,
"grad_norm": 0.005147872492671013,
"learning_rate": 2.02451797191145e-05,
"loss": 0.0,
"step": 7500
},
{
"epoch": 1.7876696024756011,
"grad_norm": 0.0012411813950166106,
"learning_rate": 2.020550662540665e-05,
"loss": 0.0001,
"step": 7510
},
{
"epoch": 1.790049988098072,
"grad_norm": 0.0009874672396108508,
"learning_rate": 2.0165833531698803e-05,
"loss": 0.0001,
"step": 7520
},
{
"epoch": 1.7924303737205427,
"grad_norm": 0.002135714516043663,
"learning_rate": 2.0126160437990957e-05,
"loss": 0.0003,
"step": 7530
},
{
"epoch": 1.7948107593430136,
"grad_norm": 0.002928838599473238,
"learning_rate": 2.008648734428311e-05,
"loss": 0.0002,
"step": 7540
},
{
"epoch": 1.7971911449654843,
"grad_norm": 0.002418682212010026,
"learning_rate": 2.004681425057526e-05,
"loss": 0.0001,
"step": 7550
},
{
"epoch": 1.7995715305879552,
"grad_norm": 0.022359730675816536,
"learning_rate": 2.000714115686741e-05,
"loss": 0.0,
"step": 7560
},
{
"epoch": 1.801951916210426,
"grad_norm": 0.0013171250466257334,
"learning_rate": 1.9967468063159568e-05,
"loss": 0.0001,
"step": 7570
},
{
"epoch": 1.804332301832897,
"grad_norm": 0.005206149537116289,
"learning_rate": 1.992779496945172e-05,
"loss": 0.0001,
"step": 7580
},
{
"epoch": 1.8067126874553678,
"grad_norm": 0.5035125613212585,
"learning_rate": 1.988812187574387e-05,
"loss": 0.0004,
"step": 7590
},
{
"epoch": 1.8090930730778387,
"grad_norm": 0.0018090710509568453,
"learning_rate": 1.9848448782036026e-05,
"loss": 0.0,
"step": 7600
},
{
"epoch": 1.8114734587003094,
"grad_norm": 0.0020274862181395292,
"learning_rate": 1.9808775688328176e-05,
"loss": 0.0,
"step": 7610
},
{
"epoch": 1.8138538443227803,
"grad_norm": 0.008559592068195343,
"learning_rate": 1.976910259462033e-05,
"loss": 0.0001,
"step": 7620
},
{
"epoch": 1.816234229945251,
"grad_norm": 0.002766631543636322,
"learning_rate": 1.9729429500912483e-05,
"loss": 0.0,
"step": 7630
},
{
"epoch": 1.8186146155677219,
"grad_norm": 0.003933802247047424,
"learning_rate": 1.9689756407204634e-05,
"loss": 0.0001,
"step": 7640
},
{
"epoch": 1.8209950011901928,
"grad_norm": 0.0502641461789608,
"learning_rate": 1.9650083313496787e-05,
"loss": 0.0001,
"step": 7650
},
{
"epoch": 1.8233753868126636,
"grad_norm": 0.002705627353861928,
"learning_rate": 1.961041021978894e-05,
"loss": 0.0,
"step": 7660
},
{
"epoch": 1.8257557724351345,
"grad_norm": 0.015057703480124474,
"learning_rate": 1.957073712608109e-05,
"loss": 0.0001,
"step": 7670
},
{
"epoch": 1.8281361580576054,
"grad_norm": 0.0005775150493718684,
"learning_rate": 1.9531064032373245e-05,
"loss": 0.0001,
"step": 7680
},
{
"epoch": 1.8305165436800763,
"grad_norm": 0.006392305716872215,
"learning_rate": 1.94913909386654e-05,
"loss": 0.0001,
"step": 7690
},
{
"epoch": 1.832896929302547,
"grad_norm": 0.0014930195175111294,
"learning_rate": 1.9451717844957553e-05,
"loss": 0.0002,
"step": 7700
},
{
"epoch": 1.8352773149250179,
"grad_norm": 0.0161952693015337,
"learning_rate": 1.9412044751249703e-05,
"loss": 0.0001,
"step": 7710
},
{
"epoch": 1.8376577005474886,
"grad_norm": 0.0019109123386442661,
"learning_rate": 1.9372371657541857e-05,
"loss": 0.0001,
"step": 7720
},
{
"epoch": 1.8400380861699595,
"grad_norm": 0.0026801279745996,
"learning_rate": 1.933269856383401e-05,
"loss": 0.0,
"step": 7730
},
{
"epoch": 1.8424184717924303,
"grad_norm": 0.006187149789184332,
"learning_rate": 1.929302547012616e-05,
"loss": 0.0001,
"step": 7740
},
{
"epoch": 1.8447988574149012,
"grad_norm": 0.002990028355270624,
"learning_rate": 1.9253352376418314e-05,
"loss": 0.0001,
"step": 7750
},
{
"epoch": 1.8471792430373721,
"grad_norm": 0.0044268155470490456,
"learning_rate": 1.9213679282710468e-05,
"loss": 0.0,
"step": 7760
},
{
"epoch": 1.849559628659843,
"grad_norm": 0.005206019151955843,
"learning_rate": 1.9174006189002618e-05,
"loss": 0.0001,
"step": 7770
},
{
"epoch": 1.8519400142823137,
"grad_norm": 0.005415783729404211,
"learning_rate": 1.9134333095294772e-05,
"loss": 0.0001,
"step": 7780
},
{
"epoch": 1.8543203999047846,
"grad_norm": 0.0016888550017029047,
"learning_rate": 1.9094660001586926e-05,
"loss": 0.0,
"step": 7790
},
{
"epoch": 1.8567007855272553,
"grad_norm": 0.003122705966234207,
"learning_rate": 1.9054986907879076e-05,
"loss": 0.0,
"step": 7800
},
{
"epoch": 1.8590811711497262,
"grad_norm": 0.021525248885154724,
"learning_rate": 1.901531381417123e-05,
"loss": 0.0,
"step": 7810
},
{
"epoch": 1.861461556772197,
"grad_norm": 0.004836782813072205,
"learning_rate": 1.8975640720463383e-05,
"loss": 0.0,
"step": 7820
},
{
"epoch": 1.863841942394668,
"grad_norm": 0.003003711812198162,
"learning_rate": 1.8935967626755534e-05,
"loss": 0.0001,
"step": 7830
},
{
"epoch": 1.8662223280171388,
"grad_norm": 0.0034373151138424873,
"learning_rate": 1.8896294533047687e-05,
"loss": 0.0001,
"step": 7840
},
{
"epoch": 1.8686027136396097,
"grad_norm": 0.061307862401008606,
"learning_rate": 1.885662143933984e-05,
"loss": 0.0001,
"step": 7850
},
{
"epoch": 1.8709830992620806,
"grad_norm": 0.001207771128974855,
"learning_rate": 1.8816948345631995e-05,
"loss": 0.0,
"step": 7860
},
{
"epoch": 1.8733634848845513,
"grad_norm": 0.007686016149818897,
"learning_rate": 1.8777275251924145e-05,
"loss": 0.0001,
"step": 7870
},
{
"epoch": 1.8757438705070222,
"grad_norm": 0.0019049645634368062,
"learning_rate": 1.87376021582163e-05,
"loss": 0.0001,
"step": 7880
},
{
"epoch": 1.8781242561294929,
"grad_norm": 0.00202633673325181,
"learning_rate": 1.8697929064508452e-05,
"loss": 0.0,
"step": 7890
},
{
"epoch": 1.8805046417519637,
"grad_norm": 0.0011157892877236009,
"learning_rate": 1.8658255970800603e-05,
"loss": 0.0002,
"step": 7900
},
{
"epoch": 1.8828850273744346,
"grad_norm": 0.001622357638552785,
"learning_rate": 1.8618582877092756e-05,
"loss": 0.0006,
"step": 7910
},
{
"epoch": 1.8852654129969055,
"grad_norm": 0.04895901307463646,
"learning_rate": 1.857890978338491e-05,
"loss": 0.0002,
"step": 7920
},
{
"epoch": 1.8876457986193764,
"grad_norm": 0.0012425240129232407,
"learning_rate": 1.853923668967706e-05,
"loss": 0.0001,
"step": 7930
},
{
"epoch": 1.8900261842418473,
"grad_norm": 0.004690519999712706,
"learning_rate": 1.8499563595969214e-05,
"loss": 0.0001,
"step": 7940
},
{
"epoch": 1.892406569864318,
"grad_norm": 0.0015794184291735291,
"learning_rate": 1.8459890502261368e-05,
"loss": 0.0,
"step": 7950
},
{
"epoch": 1.8947869554867889,
"grad_norm": 0.01080586388707161,
"learning_rate": 1.8420217408553518e-05,
"loss": 0.0002,
"step": 7960
},
{
"epoch": 1.8971673411092596,
"grad_norm": 0.0018335338681936264,
"learning_rate": 1.8380544314845672e-05,
"loss": 0.0,
"step": 7970
},
{
"epoch": 1.8995477267317304,
"grad_norm": 0.003800921142101288,
"learning_rate": 1.8340871221137826e-05,
"loss": 0.0,
"step": 7980
},
{
"epoch": 1.9019281123542013,
"grad_norm": 0.0035681715235114098,
"learning_rate": 1.830119812742998e-05,
"loss": 0.0001,
"step": 7990
},
{
"epoch": 1.9043084979766722,
"grad_norm": 0.001115818158723414,
"learning_rate": 1.826152503372213e-05,
"loss": 0.0003,
"step": 8000
},
{
"epoch": 1.9066888835991431,
"grad_norm": 0.004726150073111057,
"learning_rate": 1.8221851940014283e-05,
"loss": 0.0001,
"step": 8010
},
{
"epoch": 1.909069269221614,
"grad_norm": 0.025985538959503174,
"learning_rate": 1.8182178846306437e-05,
"loss": 0.0,
"step": 8020
},
{
"epoch": 1.9114496548440847,
"grad_norm": 0.002658289624378085,
"learning_rate": 1.8142505752598587e-05,
"loss": 0.0001,
"step": 8030
},
{
"epoch": 1.9138300404665556,
"grad_norm": 0.010776730254292488,
"learning_rate": 1.810283265889074e-05,
"loss": 0.0001,
"step": 8040
},
{
"epoch": 1.9162104260890265,
"grad_norm": 0.004742765333503485,
"learning_rate": 1.8063159565182895e-05,
"loss": 0.0001,
"step": 8050
},
{
"epoch": 1.9185908117114971,
"grad_norm": 0.0017833469901233912,
"learning_rate": 1.8023486471475045e-05,
"loss": 0.0003,
"step": 8060
},
{
"epoch": 1.920971197333968,
"grad_norm": 0.0015226156683638692,
"learning_rate": 1.79838133777672e-05,
"loss": 0.0,
"step": 8070
},
{
"epoch": 1.923351582956439,
"grad_norm": 0.0021416472736746073,
"learning_rate": 1.7944140284059352e-05,
"loss": 0.0001,
"step": 8080
},
{
"epoch": 1.9257319685789098,
"grad_norm": 0.0021594560239464045,
"learning_rate": 1.7904467190351503e-05,
"loss": 0.0,
"step": 8090
},
{
"epoch": 1.9281123542013807,
"grad_norm": 0.0018359982641413808,
"learning_rate": 1.786479409664366e-05,
"loss": 0.0,
"step": 8100
},
{
"epoch": 1.9304927398238516,
"grad_norm": 0.0036185849457979202,
"learning_rate": 1.782512100293581e-05,
"loss": 0.0001,
"step": 8110
},
{
"epoch": 1.9328731254463223,
"grad_norm": 0.019637318328022957,
"learning_rate": 1.778544790922796e-05,
"loss": 0.0,
"step": 8120
},
{
"epoch": 1.9352535110687932,
"grad_norm": 0.002496182220056653,
"learning_rate": 1.7745774815520117e-05,
"loss": 0.0001,
"step": 8130
},
{
"epoch": 1.9376338966912638,
"grad_norm": 0.004374451469630003,
"learning_rate": 1.7706101721812268e-05,
"loss": 0.0005,
"step": 8140
},
{
"epoch": 1.9400142823137347,
"grad_norm": 0.0006196928443387151,
"learning_rate": 1.766642862810442e-05,
"loss": 0.0,
"step": 8150
},
{
"epoch": 1.9423946679362056,
"grad_norm": 0.0037022046744823456,
"learning_rate": 1.7626755534396572e-05,
"loss": 0.0,
"step": 8160
},
{
"epoch": 1.9447750535586765,
"grad_norm": 0.004300027620047331,
"learning_rate": 1.7587082440688725e-05,
"loss": 0.0,
"step": 8170
},
{
"epoch": 1.9471554391811474,
"grad_norm": 0.0019766122568398714,
"learning_rate": 1.754740934698088e-05,
"loss": 0.0001,
"step": 8180
},
{
"epoch": 1.9495358248036183,
"grad_norm": 0.0018594982102513313,
"learning_rate": 1.750773625327303e-05,
"loss": 0.0001,
"step": 8190
},
{
"epoch": 1.951916210426089,
"grad_norm": 0.0012102769687771797,
"learning_rate": 1.7468063159565183e-05,
"loss": 0.0001,
"step": 8200
},
{
"epoch": 1.9542965960485599,
"grad_norm": 0.0012130772229284048,
"learning_rate": 1.7428390065857337e-05,
"loss": 0.0,
"step": 8210
},
{
"epoch": 1.9566769816710305,
"grad_norm": 0.0006833472289144993,
"learning_rate": 1.7388716972149487e-05,
"loss": 0.0004,
"step": 8220
},
{
"epoch": 1.9590573672935014,
"grad_norm": 0.0017617164412513375,
"learning_rate": 1.7349043878441644e-05,
"loss": 0.0001,
"step": 8230
},
{
"epoch": 1.9614377529159723,
"grad_norm": 0.0013312195660546422,
"learning_rate": 1.7309370784733795e-05,
"loss": 0.0,
"step": 8240
},
{
"epoch": 1.9638181385384432,
"grad_norm": 0.0018878667615354061,
"learning_rate": 1.7269697691025945e-05,
"loss": 0.0,
"step": 8250
},
{
"epoch": 1.9661985241609141,
"grad_norm": 0.0019427284132689238,
"learning_rate": 1.7230024597318102e-05,
"loss": 0.0,
"step": 8260
},
{
"epoch": 1.968578909783385,
"grad_norm": 0.004271362908184528,
"learning_rate": 1.7190351503610252e-05,
"loss": 0.0001,
"step": 8270
},
{
"epoch": 1.970959295405856,
"grad_norm": 0.0027857243549078703,
"learning_rate": 1.7150678409902406e-05,
"loss": 0.0,
"step": 8280
},
{
"epoch": 1.9733396810283266,
"grad_norm": 0.0018286170670762658,
"learning_rate": 1.711100531619456e-05,
"loss": 0.0001,
"step": 8290
},
{
"epoch": 1.9757200666507975,
"grad_norm": 0.001666391035541892,
"learning_rate": 1.707133222248671e-05,
"loss": 0.0,
"step": 8300
},
{
"epoch": 1.9781004522732681,
"grad_norm": 0.021936526522040367,
"learning_rate": 1.7031659128778864e-05,
"loss": 0.0001,
"step": 8310
},
{
"epoch": 1.980480837895739,
"grad_norm": 0.00029301681206561625,
"learning_rate": 1.6991986035071017e-05,
"loss": 0.0,
"step": 8320
},
{
"epoch": 1.98286122351821,
"grad_norm": 0.0009200606727972627,
"learning_rate": 1.6952312941363168e-05,
"loss": 0.0,
"step": 8330
},
{
"epoch": 1.9852416091406808,
"grad_norm": 0.00579107366502285,
"learning_rate": 1.691263984765532e-05,
"loss": 0.0,
"step": 8340
},
{
"epoch": 1.9876219947631517,
"grad_norm": 0.000620057515334338,
"learning_rate": 1.687296675394747e-05,
"loss": 0.0001,
"step": 8350
},
{
"epoch": 1.9900023803856226,
"grad_norm": 0.0015694822650402784,
"learning_rate": 1.6833293660239625e-05,
"loss": 0.0,
"step": 8360
},
{
"epoch": 1.9923827660080933,
"grad_norm": 0.0013426202349364758,
"learning_rate": 1.679362056653178e-05,
"loss": 0.0,
"step": 8370
},
{
"epoch": 1.9947631516305642,
"grad_norm": 0.06455473601818085,
"learning_rate": 1.675394747282393e-05,
"loss": 0.0001,
"step": 8380
},
{
"epoch": 1.9971435372530348,
"grad_norm": 0.0007938113994896412,
"learning_rate": 1.6714274379116086e-05,
"loss": 0.0001,
"step": 8390
},
{
"epoch": 1.9995239228755057,
"grad_norm": 0.0030489168129861355,
"learning_rate": 1.6674601285408237e-05,
"loss": 0.0001,
"step": 8400
},
{
"epoch": 2.0,
"eval_loss": 7.416475114041532e-07,
"eval_runtime": 52.1219,
"eval_samples_per_second": 35.839,
"eval_steps_per_second": 8.96,
"step": 8402
},
{
"epoch": 2.0019043084979766,
"grad_norm": 0.00039361350354738533,
"learning_rate": 1.6634928191700387e-05,
"loss": 0.0001,
"step": 8410
},
{
"epoch": 2.0042846941204475,
"grad_norm": 0.007912525907158852,
"learning_rate": 1.6595255097992544e-05,
"loss": 0.0001,
"step": 8420
},
{
"epoch": 2.0066650797429184,
"grad_norm": 0.003857001895084977,
"learning_rate": 1.6555582004284694e-05,
"loss": 0.0001,
"step": 8430
},
{
"epoch": 2.0090454653653893,
"grad_norm": 0.002192788990214467,
"learning_rate": 1.6515908910576848e-05,
"loss": 0.0,
"step": 8440
},
{
"epoch": 2.01142585098786,
"grad_norm": 0.00107199524063617,
"learning_rate": 1.6476235816869002e-05,
"loss": 0.0002,
"step": 8450
},
{
"epoch": 2.013806236610331,
"grad_norm": 0.024036822840571404,
"learning_rate": 1.6436562723161152e-05,
"loss": 0.0001,
"step": 8460
},
{
"epoch": 2.0161866222328015,
"grad_norm": 0.000551603501662612,
"learning_rate": 1.6396889629453306e-05,
"loss": 0.0,
"step": 8470
},
{
"epoch": 2.0185670078552724,
"grad_norm": 0.001782495528459549,
"learning_rate": 1.635721653574546e-05,
"loss": 0.0001,
"step": 8480
},
{
"epoch": 2.0209473934777433,
"grad_norm": 0.030838970094919205,
"learning_rate": 1.631754344203761e-05,
"loss": 0.0001,
"step": 8490
},
{
"epoch": 2.023327779100214,
"grad_norm": 0.0005242625484243035,
"learning_rate": 1.6277870348329764e-05,
"loss": 0.0003,
"step": 8500
},
{
"epoch": 2.025708164722685,
"grad_norm": 0.001871236483566463,
"learning_rate": 1.6238197254621917e-05,
"loss": 0.0,
"step": 8510
},
{
"epoch": 2.028088550345156,
"grad_norm": 0.0005813137395307422,
"learning_rate": 1.6198524160914068e-05,
"loss": 0.0,
"step": 8520
},
{
"epoch": 2.030468935967627,
"grad_norm": 0.0007783659384585917,
"learning_rate": 1.615885106720622e-05,
"loss": 0.0,
"step": 8530
},
{
"epoch": 2.032849321590098,
"grad_norm": 0.002862844616174698,
"learning_rate": 1.6119177973498375e-05,
"loss": 0.0001,
"step": 8540
},
{
"epoch": 2.0352297072125682,
"grad_norm": 0.0016766699263826013,
"learning_rate": 1.607950487979053e-05,
"loss": 0.0,
"step": 8550
},
{
"epoch": 2.037610092835039,
"grad_norm": 0.06566356122493744,
"learning_rate": 1.603983178608268e-05,
"loss": 0.0002,
"step": 8560
},
{
"epoch": 2.03999047845751,
"grad_norm": 0.0013121259398758411,
"learning_rate": 1.600015869237483e-05,
"loss": 0.0,
"step": 8570
},
{
"epoch": 2.042370864079981,
"grad_norm": 0.0012001970317214727,
"learning_rate": 1.5960485598666986e-05,
"loss": 0.0001,
"step": 8580
},
{
"epoch": 2.044751249702452,
"grad_norm": 0.008261552080512047,
"learning_rate": 1.5920812504959137e-05,
"loss": 0.0,
"step": 8590
},
{
"epoch": 2.0471316353249227,
"grad_norm": 0.0006174147129058838,
"learning_rate": 1.588113941125129e-05,
"loss": 0.0,
"step": 8600
},
{
"epoch": 2.0495120209473936,
"grad_norm": 0.005130809266120195,
"learning_rate": 1.5841466317543444e-05,
"loss": 0.0001,
"step": 8610
},
{
"epoch": 2.0518924065698645,
"grad_norm": 0.0034670240711420774,
"learning_rate": 1.5801793223835594e-05,
"loss": 0.0004,
"step": 8620
},
{
"epoch": 2.054272792192335,
"grad_norm": 0.0055514005944132805,
"learning_rate": 1.5762120130127748e-05,
"loss": 0.0001,
"step": 8630
},
{
"epoch": 2.056653177814806,
"grad_norm": 0.0003135903971269727,
"learning_rate": 1.5722447036419902e-05,
"loss": 0.0,
"step": 8640
},
{
"epoch": 2.0590335634372767,
"grad_norm": 0.002474389737471938,
"learning_rate": 1.5682773942712052e-05,
"loss": 0.0001,
"step": 8650
},
{
"epoch": 2.0614139490597476,
"grad_norm": 0.004792024847120047,
"learning_rate": 1.5643100849004206e-05,
"loss": 0.0,
"step": 8660
},
{
"epoch": 2.0637943346822185,
"grad_norm": 0.0030985362827777863,
"learning_rate": 1.560342775529636e-05,
"loss": 0.0001,
"step": 8670
},
{
"epoch": 2.0661747203046894,
"grad_norm": 0.004058391321450472,
"learning_rate": 1.5563754661588513e-05,
"loss": 0.0,
"step": 8680
},
{
"epoch": 2.0685551059271603,
"grad_norm": 0.00150771695189178,
"learning_rate": 1.5524081567880663e-05,
"loss": 0.0,
"step": 8690
},
{
"epoch": 2.070935491549631,
"grad_norm": 0.001020533381961286,
"learning_rate": 1.5484408474172817e-05,
"loss": 0.0,
"step": 8700
},
{
"epoch": 2.073315877172102,
"grad_norm": 0.00616106390953064,
"learning_rate": 1.544473538046497e-05,
"loss": 0.0,
"step": 8710
},
{
"epoch": 2.0756962627945725,
"grad_norm": 0.0025589261204004288,
"learning_rate": 1.540506228675712e-05,
"loss": 0.0001,
"step": 8720
},
{
"epoch": 2.0780766484170434,
"grad_norm": 0.0006466865306720138,
"learning_rate": 1.5365389193049275e-05,
"loss": 0.0,
"step": 8730
},
{
"epoch": 2.0804570340395143,
"grad_norm": 0.002343350788578391,
"learning_rate": 1.532571609934143e-05,
"loss": 0.0001,
"step": 8740
},
{
"epoch": 2.082837419661985,
"grad_norm": 0.0006717872456647456,
"learning_rate": 1.528604300563358e-05,
"loss": 0.0001,
"step": 8750
},
{
"epoch": 2.085217805284456,
"grad_norm": 0.0009957224829122424,
"learning_rate": 1.524636991192573e-05,
"loss": 0.0002,
"step": 8760
},
{
"epoch": 2.087598190906927,
"grad_norm": 0.0014106009621173143,
"learning_rate": 1.5206696818217886e-05,
"loss": 0.0,
"step": 8770
},
{
"epoch": 2.089978576529398,
"grad_norm": 0.0011065505677834153,
"learning_rate": 1.5167023724510038e-05,
"loss": 0.0001,
"step": 8780
},
{
"epoch": 2.092358962151869,
"grad_norm": 0.0027844165451824665,
"learning_rate": 1.512735063080219e-05,
"loss": 0.0001,
"step": 8790
},
{
"epoch": 2.0947393477743392,
"grad_norm": 0.0006960778846405447,
"learning_rate": 1.5087677537094344e-05,
"loss": 0.0,
"step": 8800
},
{
"epoch": 2.09711973339681,
"grad_norm": 0.0003423156449571252,
"learning_rate": 1.5048004443386496e-05,
"loss": 0.0,
"step": 8810
},
{
"epoch": 2.099500119019281,
"grad_norm": 0.0011733579449355602,
"learning_rate": 1.5008331349678648e-05,
"loss": 0.0,
"step": 8820
},
{
"epoch": 2.101880504641752,
"grad_norm": 0.004115458112210035,
"learning_rate": 1.4968658255970802e-05,
"loss": 0.0,
"step": 8830
},
{
"epoch": 2.104260890264223,
"grad_norm": 0.072359099984169,
"learning_rate": 1.4928985162262954e-05,
"loss": 0.0,
"step": 8840
},
{
"epoch": 2.1066412758866937,
"grad_norm": 0.003922273404896259,
"learning_rate": 1.4889312068555106e-05,
"loss": 0.0,
"step": 8850
},
{
"epoch": 2.1090216615091646,
"grad_norm": 0.012736503966152668,
"learning_rate": 1.4849638974847261e-05,
"loss": 0.0,
"step": 8860
},
{
"epoch": 2.1114020471316355,
"grad_norm": 0.0019338323036208749,
"learning_rate": 1.4809965881139411e-05,
"loss": 0.0001,
"step": 8870
},
{
"epoch": 2.1137824327541064,
"grad_norm": 0.0015457593835890293,
"learning_rate": 1.4770292787431563e-05,
"loss": 0.0,
"step": 8880
},
{
"epoch": 2.116162818376577,
"grad_norm": 0.0016716497484594584,
"learning_rate": 1.4730619693723719e-05,
"loss": 0.0,
"step": 8890
},
{
"epoch": 2.1185432039990477,
"grad_norm": 0.001560089411213994,
"learning_rate": 1.469094660001587e-05,
"loss": 0.0,
"step": 8900
},
{
"epoch": 2.1209235896215186,
"grad_norm": 0.0031743065919727087,
"learning_rate": 1.4651273506308023e-05,
"loss": 0.0,
"step": 8910
},
{
"epoch": 2.1233039752439895,
"grad_norm": 0.0015614436706528068,
"learning_rate": 1.4611600412600176e-05,
"loss": 0.0,
"step": 8920
},
{
"epoch": 2.1256843608664604,
"grad_norm": 0.0005399516085162759,
"learning_rate": 1.4571927318892328e-05,
"loss": 0.0,
"step": 8930
},
{
"epoch": 2.1280647464889313,
"grad_norm": 0.0014794693561270833,
"learning_rate": 1.453225422518448e-05,
"loss": 0.0,
"step": 8940
},
{
"epoch": 2.130445132111402,
"grad_norm": 0.0024672893341630697,
"learning_rate": 1.4492581131476632e-05,
"loss": 0.0,
"step": 8950
},
{
"epoch": 2.132825517733873,
"grad_norm": 0.0013646584702655673,
"learning_rate": 1.4452908037768786e-05,
"loss": 0.0,
"step": 8960
},
{
"epoch": 2.1352059033563435,
"grad_norm": 0.07290241867303848,
"learning_rate": 1.4413234944060938e-05,
"loss": 0.0001,
"step": 8970
},
{
"epoch": 2.1375862889788144,
"grad_norm": 0.001859787036664784,
"learning_rate": 1.437356185035309e-05,
"loss": 0.0,
"step": 8980
},
{
"epoch": 2.1399666746012853,
"grad_norm": 0.001754750614054501,
"learning_rate": 1.4333888756645244e-05,
"loss": 0.0,
"step": 8990
},
{
"epoch": 2.142347060223756,
"grad_norm": 0.028476126492023468,
"learning_rate": 1.4294215662937396e-05,
"loss": 0.0,
"step": 9000
},
{
"epoch": 2.144727445846227,
"grad_norm": 0.0005994876846671104,
"learning_rate": 1.4254542569229548e-05,
"loss": 0.0,
"step": 9010
},
{
"epoch": 2.147107831468698,
"grad_norm": 0.0007879494805820286,
"learning_rate": 1.4214869475521703e-05,
"loss": 0.0,
"step": 9020
},
{
"epoch": 2.149488217091169,
"grad_norm": 0.0012654970632866025,
"learning_rate": 1.4175196381813855e-05,
"loss": 0.0,
"step": 9030
},
{
"epoch": 2.1518686027136398,
"grad_norm": 0.0018679037457332015,
"learning_rate": 1.4135523288106006e-05,
"loss": 0.0,
"step": 9040
},
{
"epoch": 2.1542489883361107,
"grad_norm": 0.0017861429369077086,
"learning_rate": 1.4095850194398161e-05,
"loss": 0.0,
"step": 9050
},
{
"epoch": 2.156629373958581,
"grad_norm": 0.006415149662643671,
"learning_rate": 1.4056177100690313e-05,
"loss": 0.0,
"step": 9060
},
{
"epoch": 2.159009759581052,
"grad_norm": 0.002842891961336136,
"learning_rate": 1.4016504006982465e-05,
"loss": 0.0,
"step": 9070
},
{
"epoch": 2.161390145203523,
"grad_norm": 0.0013869826216250658,
"learning_rate": 1.3976830913274619e-05,
"loss": 0.0,
"step": 9080
},
{
"epoch": 2.163770530825994,
"grad_norm": 0.018388478085398674,
"learning_rate": 1.393715781956677e-05,
"loss": 0.0001,
"step": 9090
},
{
"epoch": 2.1661509164484647,
"grad_norm": 0.0008245584322139621,
"learning_rate": 1.3897484725858923e-05,
"loss": 0.0,
"step": 9100
},
{
"epoch": 2.1685313020709356,
"grad_norm": 0.36837905645370483,
"learning_rate": 1.3857811632151076e-05,
"loss": 0.0001,
"step": 9110
},
{
"epoch": 2.1709116876934065,
"grad_norm": 0.002466343343257904,
"learning_rate": 1.3818138538443228e-05,
"loss": 0.0001,
"step": 9120
},
{
"epoch": 2.1732920733158774,
"grad_norm": 0.0035982499830424786,
"learning_rate": 1.377846544473538e-05,
"loss": 0.0,
"step": 9130
},
{
"epoch": 2.175672458938348,
"grad_norm": 0.13738982379436493,
"learning_rate": 1.3738792351027536e-05,
"loss": 0.0001,
"step": 9140
},
{
"epoch": 2.1780528445608187,
"grad_norm": 0.00042806967394426465,
"learning_rate": 1.3699119257319688e-05,
"loss": 0.0,
"step": 9150
},
{
"epoch": 2.1804332301832896,
"grad_norm": 0.002727969316765666,
"learning_rate": 1.3659446163611838e-05,
"loss": 0.0,
"step": 9160
},
{
"epoch": 2.1828136158057605,
"grad_norm": 0.0010691905627027154,
"learning_rate": 1.361977306990399e-05,
"loss": 0.0,
"step": 9170
},
{
"epoch": 2.1851940014282314,
"grad_norm": 0.020881984382867813,
"learning_rate": 1.3580099976196145e-05,
"loss": 0.0,
"step": 9180
},
{
"epoch": 2.1875743870507023,
"grad_norm": 0.0019363940227776766,
"learning_rate": 1.3540426882488297e-05,
"loss": 0.0,
"step": 9190
},
{
"epoch": 2.189954772673173,
"grad_norm": 0.001359110465273261,
"learning_rate": 1.350075378878045e-05,
"loss": 0.0,
"step": 9200
},
{
"epoch": 2.192335158295644,
"grad_norm": 0.0024417322129011154,
"learning_rate": 1.3461080695072603e-05,
"loss": 0.0,
"step": 9210
},
{
"epoch": 2.1947155439181145,
"grad_norm": 0.0006399775156751275,
"learning_rate": 1.3421407601364755e-05,
"loss": 0.0,
"step": 9220
},
{
"epoch": 2.1970959295405854,
"grad_norm": 0.001347382552921772,
"learning_rate": 1.3381734507656907e-05,
"loss": 0.0001,
"step": 9230
},
{
"epoch": 2.1994763151630563,
"grad_norm": 0.002276881132274866,
"learning_rate": 1.334206141394906e-05,
"loss": 0.0,
"step": 9240
},
{
"epoch": 2.201856700785527,
"grad_norm": 0.0005205354536883533,
"learning_rate": 1.3302388320241213e-05,
"loss": 0.0,
"step": 9250
},
{
"epoch": 2.204237086407998,
"grad_norm": 0.001351204700767994,
"learning_rate": 1.3262715226533365e-05,
"loss": 0.0,
"step": 9260
},
{
"epoch": 2.206617472030469,
"grad_norm": 0.00529600540176034,
"learning_rate": 1.322304213282552e-05,
"loss": 0.0002,
"step": 9270
},
{
"epoch": 2.20899785765294,
"grad_norm": 0.002000352367758751,
"learning_rate": 1.318336903911767e-05,
"loss": 0.0,
"step": 9280
},
{
"epoch": 2.2113782432754108,
"grad_norm": 0.0011036837240681052,
"learning_rate": 1.3143695945409823e-05,
"loss": 0.0,
"step": 9290
},
{
"epoch": 2.2137586288978817,
"grad_norm": 0.0023322845809161663,
"learning_rate": 1.3104022851701978e-05,
"loss": 0.0,
"step": 9300
},
{
"epoch": 2.216139014520352,
"grad_norm": 0.0029122158885002136,
"learning_rate": 1.306434975799413e-05,
"loss": 0.0,
"step": 9310
},
{
"epoch": 2.218519400142823,
"grad_norm": 0.00949085596948862,
"learning_rate": 1.302467666428628e-05,
"loss": 0.0,
"step": 9320
},
{
"epoch": 2.220899785765294,
"grad_norm": 0.0013391702668741345,
"learning_rate": 1.2985003570578436e-05,
"loss": 0.0,
"step": 9330
},
{
"epoch": 2.223280171387765,
"grad_norm": 0.00047678747796453536,
"learning_rate": 1.2945330476870588e-05,
"loss": 0.0,
"step": 9340
},
{
"epoch": 2.2256605570102357,
"grad_norm": 0.0031029602978378534,
"learning_rate": 1.290565738316274e-05,
"loss": 0.0,
"step": 9350
},
{
"epoch": 2.2280409426327066,
"grad_norm": 0.00046392931835725904,
"learning_rate": 1.2865984289454892e-05,
"loss": 0.0,
"step": 9360
},
{
"epoch": 2.2304213282551775,
"grad_norm": 0.0008917547529563308,
"learning_rate": 1.2826311195747045e-05,
"loss": 0.0,
"step": 9370
},
{
"epoch": 2.2328017138776484,
"grad_norm": 0.0039760940708220005,
"learning_rate": 1.2786638102039197e-05,
"loss": 0.0,
"step": 9380
},
{
"epoch": 2.235182099500119,
"grad_norm": 0.0009416754473932087,
"learning_rate": 1.274696500833135e-05,
"loss": 0.0,
"step": 9390
},
{
"epoch": 2.2375624851225897,
"grad_norm": 0.0008697324083186686,
"learning_rate": 1.2707291914623503e-05,
"loss": 0.0,
"step": 9400
},
{
"epoch": 2.2399428707450606,
"grad_norm": 0.00044792311382479966,
"learning_rate": 1.2667618820915655e-05,
"loss": 0.0,
"step": 9410
},
{
"epoch": 2.2423232563675315,
"grad_norm": 0.0014049585442990065,
"learning_rate": 1.2627945727207807e-05,
"loss": 0.0,
"step": 9420
},
{
"epoch": 2.2447036419900024,
"grad_norm": 0.00259969150647521,
"learning_rate": 1.2588272633499962e-05,
"loss": 0.0,
"step": 9430
},
{
"epoch": 2.2470840276124733,
"grad_norm": 0.0012579966569319367,
"learning_rate": 1.2548599539792113e-05,
"loss": 0.0,
"step": 9440
},
{
"epoch": 2.249464413234944,
"grad_norm": 0.008475791662931442,
"learning_rate": 1.2508926446084265e-05,
"loss": 0.0,
"step": 9450
},
{
"epoch": 2.251844798857415,
"grad_norm": 0.007055677939206362,
"learning_rate": 1.2469253352376418e-05,
"loss": 0.0,
"step": 9460
},
{
"epoch": 2.2542251844798855,
"grad_norm": 0.00043771168566308916,
"learning_rate": 1.2429580258668572e-05,
"loss": 0.0,
"step": 9470
},
{
"epoch": 2.2566055701023564,
"grad_norm": 0.0004315728147048503,
"learning_rate": 1.2389907164960724e-05,
"loss": 0.0,
"step": 9480
},
{
"epoch": 2.2589859557248273,
"grad_norm": 0.0006574731087312102,
"learning_rate": 1.2350234071252876e-05,
"loss": 0.0,
"step": 9490
},
{
"epoch": 2.261366341347298,
"grad_norm": 0.000502898299600929,
"learning_rate": 1.231056097754503e-05,
"loss": 0.0,
"step": 9500
},
{
"epoch": 2.263746726969769,
"grad_norm": 0.0014464023988693953,
"learning_rate": 1.2270887883837182e-05,
"loss": 0.0,
"step": 9510
},
{
"epoch": 2.26612711259224,
"grad_norm": 0.0007312349043786526,
"learning_rate": 1.2231214790129336e-05,
"loss": 0.0,
"step": 9520
},
{
"epoch": 2.268507498214711,
"grad_norm": 0.0012411205098032951,
"learning_rate": 1.2191541696421488e-05,
"loss": 0.0,
"step": 9530
},
{
"epoch": 2.2708878838371818,
"grad_norm": 0.003359739203006029,
"learning_rate": 1.215186860271364e-05,
"loss": 0.0,
"step": 9540
},
{
"epoch": 2.2732682694596527,
"grad_norm": 0.0025401897728443146,
"learning_rate": 1.2112195509005793e-05,
"loss": 0.0,
"step": 9550
},
{
"epoch": 2.275648655082123,
"grad_norm": 0.0009357984527014196,
"learning_rate": 1.2072522415297945e-05,
"loss": 0.0,
"step": 9560
},
{
"epoch": 2.278029040704594,
"grad_norm": 0.015569353476166725,
"learning_rate": 1.2032849321590097e-05,
"loss": 0.0,
"step": 9570
},
{
"epoch": 2.280409426327065,
"grad_norm": 0.0005228265072219074,
"learning_rate": 1.1993176227882251e-05,
"loss": 0.0,
"step": 9580
},
{
"epoch": 2.282789811949536,
"grad_norm": 0.0006133327260613441,
"learning_rate": 1.1953503134174405e-05,
"loss": 0.0,
"step": 9590
},
{
"epoch": 2.2851701975720067,
"grad_norm": 0.0006283469265326858,
"learning_rate": 1.1913830040466557e-05,
"loss": 0.0,
"step": 9600
},
{
"epoch": 2.2875505831944776,
"grad_norm": 0.0017937012016773224,
"learning_rate": 1.1874156946758709e-05,
"loss": 0.0,
"step": 9610
},
{
"epoch": 2.2899309688169485,
"grad_norm": 0.00227372907102108,
"learning_rate": 1.183448385305086e-05,
"loss": 0.0,
"step": 9620
},
{
"epoch": 2.2923113544394194,
"grad_norm": 0.0007874960429035127,
"learning_rate": 1.1794810759343014e-05,
"loss": 0.0,
"step": 9630
},
{
"epoch": 2.2946917400618903,
"grad_norm": 0.0012992926640436053,
"learning_rate": 1.1755137665635168e-05,
"loss": 0.0,
"step": 9640
},
{
"epoch": 2.2970721256843607,
"grad_norm": 0.0026856097392737865,
"learning_rate": 1.1715464571927318e-05,
"loss": 0.0001,
"step": 9650
},
{
"epoch": 2.2994525113068316,
"grad_norm": 0.027589144185185432,
"learning_rate": 1.1675791478219472e-05,
"loss": 0.0005,
"step": 9660
},
{
"epoch": 2.3018328969293025,
"grad_norm": 0.00021341729734558612,
"learning_rate": 1.1636118384511626e-05,
"loss": 0.0,
"step": 9670
},
{
"epoch": 2.3042132825517734,
"grad_norm": 0.0005525678861886263,
"learning_rate": 1.1596445290803778e-05,
"loss": 0.0,
"step": 9680
},
{
"epoch": 2.3065936681742443,
"grad_norm": 0.0006510653183795512,
"learning_rate": 1.155677219709593e-05,
"loss": 0.0,
"step": 9690
},
{
"epoch": 2.308974053796715,
"grad_norm": 0.0011141913710162044,
"learning_rate": 1.1517099103388082e-05,
"loss": 0.0,
"step": 9700
},
{
"epoch": 2.311354439419186,
"grad_norm": 0.001998309977352619,
"learning_rate": 1.1477426009680235e-05,
"loss": 0.0,
"step": 9710
},
{
"epoch": 2.3137348250416565,
"grad_norm": 0.008638182654976845,
"learning_rate": 1.1437752915972389e-05,
"loss": 0.0,
"step": 9720
},
{
"epoch": 2.3161152106641274,
"grad_norm": 0.0004837829037569463,
"learning_rate": 1.139807982226454e-05,
"loss": 0.0,
"step": 9730
},
{
"epoch": 2.3184955962865983,
"grad_norm": 0.008834806270897388,
"learning_rate": 1.1358406728556693e-05,
"loss": 0.0,
"step": 9740
},
{
"epoch": 2.320875981909069,
"grad_norm": 0.017421774566173553,
"learning_rate": 1.1318733634848847e-05,
"loss": 0.0,
"step": 9750
},
{
"epoch": 2.32325636753154,
"grad_norm": 0.0008695673895999789,
"learning_rate": 1.1279060541140999e-05,
"loss": 0.0,
"step": 9760
},
{
"epoch": 2.325636753154011,
"grad_norm": 0.007985567674040794,
"learning_rate": 1.1239387447433151e-05,
"loss": 0.0001,
"step": 9770
},
{
"epoch": 2.328017138776482,
"grad_norm": 0.0002991770743392408,
"learning_rate": 1.1199714353725305e-05,
"loss": 0.0,
"step": 9780
},
{
"epoch": 2.3303975243989528,
"grad_norm": 0.0018964770715683699,
"learning_rate": 1.1160041260017457e-05,
"loss": 0.0,
"step": 9790
},
{
"epoch": 2.3327779100214237,
"grad_norm": 0.0003782061976380646,
"learning_rate": 1.112036816630961e-05,
"loss": 0.0,
"step": 9800
},
{
"epoch": 2.335158295643894,
"grad_norm": 0.0005394426407292485,
"learning_rate": 1.108069507260176e-05,
"loss": 0.0,
"step": 9810
},
{
"epoch": 2.337538681266365,
"grad_norm": 0.0008728650282137096,
"learning_rate": 1.1041021978893914e-05,
"loss": 0.0,
"step": 9820
},
{
"epoch": 2.339919066888836,
"grad_norm": 1.026079773902893,
"learning_rate": 1.1001348885186068e-05,
"loss": 0.0001,
"step": 9830
},
{
"epoch": 2.342299452511307,
"grad_norm": 0.000987286795862019,
"learning_rate": 1.096167579147822e-05,
"loss": 0.0,
"step": 9840
},
{
"epoch": 2.3446798381337777,
"grad_norm": 0.0015003952430561185,
"learning_rate": 1.0922002697770372e-05,
"loss": 0.0,
"step": 9850
},
{
"epoch": 2.3470602237562486,
"grad_norm": 0.001296977628953755,
"learning_rate": 1.0882329604062526e-05,
"loss": 0.0,
"step": 9860
},
{
"epoch": 2.3494406093787195,
"grad_norm": 0.000640163547359407,
"learning_rate": 1.0842656510354678e-05,
"loss": 0.0,
"step": 9870
},
{
"epoch": 2.3518209950011904,
"grad_norm": 0.0009391361963935196,
"learning_rate": 1.0802983416646831e-05,
"loss": 0.0,
"step": 9880
},
{
"epoch": 2.3542013806236612,
"grad_norm": 0.0006612977595068514,
"learning_rate": 1.0763310322938983e-05,
"loss": 0.0001,
"step": 9890
},
{
"epoch": 2.3565817662461317,
"grad_norm": 0.0014715328579768538,
"learning_rate": 1.0723637229231135e-05,
"loss": 0.0,
"step": 9900
},
{
"epoch": 2.3589621518686026,
"grad_norm": 0.0004139976226724684,
"learning_rate": 1.0683964135523289e-05,
"loss": 0.0,
"step": 9910
},
{
"epoch": 2.3613425374910735,
"grad_norm": 0.001368595752865076,
"learning_rate": 1.0644291041815441e-05,
"loss": 0.0001,
"step": 9920
},
{
"epoch": 2.3637229231135444,
"grad_norm": 0.0010275020031258464,
"learning_rate": 1.0604617948107593e-05,
"loss": 0.0,
"step": 9930
},
{
"epoch": 2.3661033087360153,
"grad_norm": 0.0008476102957502007,
"learning_rate": 1.0564944854399747e-05,
"loss": 0.0,
"step": 9940
},
{
"epoch": 2.368483694358486,
"grad_norm": 0.019286731258034706,
"learning_rate": 1.0525271760691899e-05,
"loss": 0.0,
"step": 9950
},
{
"epoch": 2.370864079980957,
"grad_norm": 0.0007589785964228213,
"learning_rate": 1.0485598666984052e-05,
"loss": 0.0001,
"step": 9960
},
{
"epoch": 2.373244465603428,
"grad_norm": 0.0007659016991965473,
"learning_rate": 1.0445925573276204e-05,
"loss": 0.0,
"step": 9970
},
{
"epoch": 2.375624851225899,
"grad_norm": 0.0035345428623259068,
"learning_rate": 1.0406252479568356e-05,
"loss": 0.0,
"step": 9980
},
{
"epoch": 2.3780052368483693,
"grad_norm": 0.0021891535725444555,
"learning_rate": 1.036657938586051e-05,
"loss": 0.0,
"step": 9990
},
{
"epoch": 2.38038562247084,
"grad_norm": 0.2655426263809204,
"learning_rate": 1.0326906292152662e-05,
"loss": 0.001,
"step": 10000
},
{
"epoch": 2.382766008093311,
"grad_norm": 0.0008121923892758787,
"learning_rate": 1.0287233198444816e-05,
"loss": 0.0001,
"step": 10010
},
{
"epoch": 2.385146393715782,
"grad_norm": 0.006638567429035902,
"learning_rate": 1.0247560104736968e-05,
"loss": 0.0,
"step": 10020
},
{
"epoch": 2.387526779338253,
"grad_norm": 0.0033031317871063948,
"learning_rate": 1.020788701102912e-05,
"loss": 0.0,
"step": 10030
},
{
"epoch": 2.3899071649607238,
"grad_norm": 0.0004701575671788305,
"learning_rate": 1.0168213917321274e-05,
"loss": 0.0002,
"step": 10040
},
{
"epoch": 2.3922875505831946,
"grad_norm": 0.007627520710229874,
"learning_rate": 1.0128540823613426e-05,
"loss": 0.0,
"step": 10050
},
{
"epoch": 2.394667936205665,
"grad_norm": 0.0011233366094529629,
"learning_rate": 1.0088867729905578e-05,
"loss": 0.0,
"step": 10060
},
{
"epoch": 2.397048321828136,
"grad_norm": 0.0003728682058863342,
"learning_rate": 1.0049194636197731e-05,
"loss": 0.0001,
"step": 10070
},
{
"epoch": 2.399428707450607,
"grad_norm": 0.0018078387947753072,
"learning_rate": 1.0009521542489885e-05,
"loss": 0.0,
"step": 10080
},
{
"epoch": 2.4018090930730778,
"grad_norm": 0.004032574128359556,
"learning_rate": 9.969848448782037e-06,
"loss": 0.0,
"step": 10090
},
{
"epoch": 2.4041894786955487,
"grad_norm": 0.0010251044295728207,
"learning_rate": 9.930175355074189e-06,
"loss": 0.0,
"step": 10100
},
{
"epoch": 2.4065698643180196,
"grad_norm": 0.0012369100004434586,
"learning_rate": 9.890502261366341e-06,
"loss": 0.0001,
"step": 10110
},
{
"epoch": 2.4089502499404905,
"grad_norm": 0.0008841692470014095,
"learning_rate": 9.850829167658495e-06,
"loss": 0.0,
"step": 10120
},
{
"epoch": 2.4113306355629613,
"grad_norm": 0.05967468023300171,
"learning_rate": 9.811156073950648e-06,
"loss": 0.0001,
"step": 10130
},
{
"epoch": 2.4137110211854322,
"grad_norm": 0.002878790721297264,
"learning_rate": 9.771482980242799e-06,
"loss": 0.0,
"step": 10140
},
{
"epoch": 2.4160914068079027,
"grad_norm": 0.0005018101655878127,
"learning_rate": 9.731809886534952e-06,
"loss": 0.0,
"step": 10150
},
{
"epoch": 2.4184717924303736,
"grad_norm": 0.0015724776312708855,
"learning_rate": 9.692136792827106e-06,
"loss": 0.0003,
"step": 10160
},
{
"epoch": 2.4208521780528445,
"grad_norm": 0.004237225744873285,
"learning_rate": 9.652463699119258e-06,
"loss": 0.0,
"step": 10170
},
{
"epoch": 2.4232325636753154,
"grad_norm": 0.00131317344494164,
"learning_rate": 9.61279060541141e-06,
"loss": 0.0,
"step": 10180
},
{
"epoch": 2.4256129492977863,
"grad_norm": 0.002073557348921895,
"learning_rate": 9.573117511703564e-06,
"loss": 0.0,
"step": 10190
},
{
"epoch": 2.427993334920257,
"grad_norm": 0.0045993453823029995,
"learning_rate": 9.533444417995716e-06,
"loss": 0.0002,
"step": 10200
},
{
"epoch": 2.430373720542728,
"grad_norm": 0.001618819311261177,
"learning_rate": 9.49377132428787e-06,
"loss": 0.0001,
"step": 10210
},
{
"epoch": 2.432754106165199,
"grad_norm": 0.00304215750657022,
"learning_rate": 9.45409823058002e-06,
"loss": 0.0,
"step": 10220
},
{
"epoch": 2.43513449178767,
"grad_norm": 0.0007059932686388493,
"learning_rate": 9.414425136872173e-06,
"loss": 0.0,
"step": 10230
},
{
"epoch": 2.4375148774101403,
"grad_norm": 0.0031899004243314266,
"learning_rate": 9.374752043164327e-06,
"loss": 0.0,
"step": 10240
},
{
"epoch": 2.439895263032611,
"grad_norm": 0.002477418165653944,
"learning_rate": 9.33507894945648e-06,
"loss": 0.0,
"step": 10250
},
{
"epoch": 2.442275648655082,
"grad_norm": 0.00046585980453528464,
"learning_rate": 9.295405855748631e-06,
"loss": 0.0,
"step": 10260
},
{
"epoch": 2.444656034277553,
"grad_norm": 0.0005838835495524108,
"learning_rate": 9.255732762040785e-06,
"loss": 0.0,
"step": 10270
},
{
"epoch": 2.447036419900024,
"grad_norm": 0.001370543148368597,
"learning_rate": 9.216059668332937e-06,
"loss": 0.0,
"step": 10280
},
{
"epoch": 2.4494168055224947,
"grad_norm": 0.0016045079100877047,
"learning_rate": 9.17638657462509e-06,
"loss": 0.0,
"step": 10290
},
{
"epoch": 2.4517971911449656,
"grad_norm": 0.0020401678048074245,
"learning_rate": 9.136713480917243e-06,
"loss": 0.0001,
"step": 10300
},
{
"epoch": 2.454177576767436,
"grad_norm": 0.00043605471728369594,
"learning_rate": 9.097040387209395e-06,
"loss": 0.0,
"step": 10310
},
{
"epoch": 2.456557962389907,
"grad_norm": 0.0005910994368605316,
"learning_rate": 9.057367293501548e-06,
"loss": 0.0,
"step": 10320
},
{
"epoch": 2.458938348012378,
"grad_norm": 0.0005397904315032065,
"learning_rate": 9.0176941997937e-06,
"loss": 0.0,
"step": 10330
},
{
"epoch": 2.4613187336348488,
"grad_norm": 0.014002328738570213,
"learning_rate": 8.978021106085852e-06,
"loss": 0.0,
"step": 10340
},
{
"epoch": 2.4636991192573197,
"grad_norm": 0.0011001590173691511,
"learning_rate": 8.938348012378006e-06,
"loss": 0.0001,
"step": 10350
},
{
"epoch": 2.4660795048797906,
"grad_norm": 0.0029695210978388786,
"learning_rate": 8.898674918670158e-06,
"loss": 0.0,
"step": 10360
},
{
"epoch": 2.4684598905022614,
"grad_norm": 0.00410072086378932,
"learning_rate": 8.859001824962312e-06,
"loss": 0.0,
"step": 10370
},
{
"epoch": 2.4708402761247323,
"grad_norm": 0.0005128366756252944,
"learning_rate": 8.819328731254464e-06,
"loss": 0.0,
"step": 10380
},
{
"epoch": 2.4732206617472032,
"grad_norm": 0.0021037100814282894,
"learning_rate": 8.779655637546616e-06,
"loss": 0.0,
"step": 10390
},
{
"epoch": 2.4756010473696737,
"grad_norm": 0.0005958130932413042,
"learning_rate": 8.73998254383877e-06,
"loss": 0.0,
"step": 10400
},
{
"epoch": 2.4779814329921446,
"grad_norm": 0.0021961687598377466,
"learning_rate": 8.700309450130921e-06,
"loss": 0.0001,
"step": 10410
},
{
"epoch": 2.4803618186146155,
"grad_norm": 0.0011290331603959203,
"learning_rate": 8.660636356423073e-06,
"loss": 0.0,
"step": 10420
},
{
"epoch": 2.4827422042370864,
"grad_norm": 0.003101737704128027,
"learning_rate": 8.620963262715227e-06,
"loss": 0.0,
"step": 10430
},
{
"epoch": 2.4851225898595573,
"grad_norm": 0.010269707068800926,
"learning_rate": 8.581290169007379e-06,
"loss": 0.0,
"step": 10440
},
{
"epoch": 2.487502975482028,
"grad_norm": 0.0006016406114213169,
"learning_rate": 8.541617075299533e-06,
"loss": 0.0,
"step": 10450
},
{
"epoch": 2.489883361104499,
"grad_norm": 0.012370145879685879,
"learning_rate": 8.501943981591685e-06,
"loss": 0.0,
"step": 10460
},
{
"epoch": 2.49226374672697,
"grad_norm": 0.002209730911999941,
"learning_rate": 8.462270887883837e-06,
"loss": 0.0,
"step": 10470
},
{
"epoch": 2.494644132349441,
"grad_norm": 0.0002978077973239124,
"learning_rate": 8.42259779417599e-06,
"loss": 0.0,
"step": 10480
},
{
"epoch": 2.4970245179719113,
"grad_norm": 0.0006728899315930903,
"learning_rate": 8.382924700468144e-06,
"loss": 0.0,
"step": 10490
},
{
"epoch": 2.499404903594382,
"grad_norm": 0.0008764348458498716,
"learning_rate": 8.343251606760296e-06,
"loss": 0.0,
"step": 10500
},
{
"epoch": 2.501785289216853,
"grad_norm": 0.001580104581080377,
"learning_rate": 8.303578513052448e-06,
"loss": 0.0,
"step": 10510
},
{
"epoch": 2.504165674839324,
"grad_norm": 0.0003571589768398553,
"learning_rate": 8.2639054193446e-06,
"loss": 0.0,
"step": 10520
},
{
"epoch": 2.506546060461795,
"grad_norm": 0.004758020397275686,
"learning_rate": 8.224232325636754e-06,
"loss": 0.0,
"step": 10530
},
{
"epoch": 2.5089264460842657,
"grad_norm": 0.0013680767733603716,
"learning_rate": 8.184559231928906e-06,
"loss": 0.0,
"step": 10540
},
{
"epoch": 2.5113068317067366,
"grad_norm": 0.0010658970568329096,
"learning_rate": 8.144886138221058e-06,
"loss": 0.0,
"step": 10550
},
{
"epoch": 2.513687217329207,
"grad_norm": 0.0007452235440723598,
"learning_rate": 8.105213044513212e-06,
"loss": 0.0002,
"step": 10560
},
{
"epoch": 2.5160676029516784,
"grad_norm": 0.0006281470414251089,
"learning_rate": 8.065539950805365e-06,
"loss": 0.0,
"step": 10570
},
{
"epoch": 2.518447988574149,
"grad_norm": 0.0007866009837016463,
"learning_rate": 8.025866857097517e-06,
"loss": 0.0,
"step": 10580
},
{
"epoch": 2.5208283741966198,
"grad_norm": 0.00039683215436525643,
"learning_rate": 7.98619376338967e-06,
"loss": 0.0,
"step": 10590
},
{
"epoch": 2.5232087598190907,
"grad_norm": 0.0009177124593406916,
"learning_rate": 7.946520669681823e-06,
"loss": 0.0,
"step": 10600
},
{
"epoch": 2.5255891454415615,
"grad_norm": 0.00038271176163107157,
"learning_rate": 7.906847575973975e-06,
"loss": 0.0,
"step": 10610
},
{
"epoch": 2.5279695310640324,
"grad_norm": 0.00041592001798562706,
"learning_rate": 7.867174482266127e-06,
"loss": 0.0,
"step": 10620
},
{
"epoch": 2.5303499166865033,
"grad_norm": 0.0009455361287109554,
"learning_rate": 7.827501388558279e-06,
"loss": 0.0,
"step": 10630
},
{
"epoch": 2.5327303023089742,
"grad_norm": 0.0005674211424775422,
"learning_rate": 7.787828294850433e-06,
"loss": 0.0001,
"step": 10640
},
{
"epoch": 2.5351106879314447,
"grad_norm": 0.008180541917681694,
"learning_rate": 7.748155201142586e-06,
"loss": 0.0001,
"step": 10650
},
{
"epoch": 2.537491073553916,
"grad_norm": 0.006044210400432348,
"learning_rate": 7.708482107434738e-06,
"loss": 0.0,
"step": 10660
},
{
"epoch": 2.5398714591763865,
"grad_norm": 0.00039350485894829035,
"learning_rate": 7.66880901372689e-06,
"loss": 0.0,
"step": 10670
},
{
"epoch": 2.5422518447988574,
"grad_norm": 0.0007660723640583456,
"learning_rate": 7.629135920019044e-06,
"loss": 0.0,
"step": 10680
},
{
"epoch": 2.5446322304213282,
"grad_norm": 0.001309241633862257,
"learning_rate": 7.589462826311196e-06,
"loss": 0.0,
"step": 10690
},
{
"epoch": 2.547012616043799,
"grad_norm": 0.023756977170705795,
"learning_rate": 7.549789732603349e-06,
"loss": 0.0002,
"step": 10700
},
{
"epoch": 2.54939300166627,
"grad_norm": 0.002046087756752968,
"learning_rate": 7.510116638895501e-06,
"loss": 0.0,
"step": 10710
},
{
"epoch": 2.551773387288741,
"grad_norm": 0.0047508729621768,
"learning_rate": 7.470443545187654e-06,
"loss": 0.0,
"step": 10720
},
{
"epoch": 2.554153772911212,
"grad_norm": 0.0010949558345600963,
"learning_rate": 7.4307704514798075e-06,
"loss": 0.0,
"step": 10730
},
{
"epoch": 2.5565341585336823,
"grad_norm": 0.010589073412120342,
"learning_rate": 7.391097357771959e-06,
"loss": 0.0,
"step": 10740
},
{
"epoch": 2.558914544156153,
"grad_norm": 0.0006332534248940647,
"learning_rate": 7.351424264064112e-06,
"loss": 0.0,
"step": 10750
},
{
"epoch": 2.561294929778624,
"grad_norm": 0.00027181513723917305,
"learning_rate": 7.311751170356265e-06,
"loss": 0.0,
"step": 10760
},
{
"epoch": 2.563675315401095,
"grad_norm": 0.0036267938558012247,
"learning_rate": 7.272078076648417e-06,
"loss": 0.0,
"step": 10770
},
{
"epoch": 2.566055701023566,
"grad_norm": 0.002974023576825857,
"learning_rate": 7.23240498294057e-06,
"loss": 0.0,
"step": 10780
},
{
"epoch": 2.5684360866460367,
"grad_norm": 0.0005654848064295948,
"learning_rate": 7.192731889232724e-06,
"loss": 0.0001,
"step": 10790
},
{
"epoch": 2.5708164722685076,
"grad_norm": 0.001776995835825801,
"learning_rate": 7.153058795524875e-06,
"loss": 0.0,
"step": 10800
},
{
"epoch": 2.573196857890978,
"grad_norm": 0.0031643370166420937,
"learning_rate": 7.1133857018170286e-06,
"loss": 0.0,
"step": 10810
},
{
"epoch": 2.5755772435134494,
"grad_norm": 0.0006117381271906197,
"learning_rate": 7.07371260810918e-06,
"loss": 0.0,
"step": 10820
},
{
"epoch": 2.57795762913592,
"grad_norm": 0.00013082509394735098,
"learning_rate": 7.034039514401333e-06,
"loss": 0.0,
"step": 10830
},
{
"epoch": 2.5803380147583908,
"grad_norm": 0.009411906823515892,
"learning_rate": 6.994366420693486e-06,
"loss": 0.0,
"step": 10840
},
{
"epoch": 2.5827184003808616,
"grad_norm": 0.007766501512378454,
"learning_rate": 6.954693326985638e-06,
"loss": 0.0,
"step": 10850
},
{
"epoch": 2.5850987860033325,
"grad_norm": 0.001036152825690806,
"learning_rate": 6.915020233277791e-06,
"loss": 0.0,
"step": 10860
},
{
"epoch": 2.5874791716258034,
"grad_norm": 0.0007062302902340889,
"learning_rate": 6.875347139569945e-06,
"loss": 0.0,
"step": 10870
},
{
"epoch": 2.5898595572482743,
"grad_norm": 0.004976709373295307,
"learning_rate": 6.835674045862096e-06,
"loss": 0.0,
"step": 10880
},
{
"epoch": 2.592239942870745,
"grad_norm": 0.0005074761575087905,
"learning_rate": 6.79600095215425e-06,
"loss": 0.0,
"step": 10890
},
{
"epoch": 2.5946203284932157,
"grad_norm": 0.0028977631591260433,
"learning_rate": 6.7563278584464025e-06,
"loss": 0.0,
"step": 10900
},
{
"epoch": 2.597000714115687,
"grad_norm": 0.004557565785944462,
"learning_rate": 6.7166547647385545e-06,
"loss": 0.0,
"step": 10910
},
{
"epoch": 2.5993810997381575,
"grad_norm": 0.0018358832458034158,
"learning_rate": 6.676981671030707e-06,
"loss": 0.0,
"step": 10920
},
{
"epoch": 2.6017614853606283,
"grad_norm": 0.0014729060931131244,
"learning_rate": 6.637308577322859e-06,
"loss": 0.0,
"step": 10930
},
{
"epoch": 2.6041418709830992,
"grad_norm": 0.0004332439857535064,
"learning_rate": 6.597635483615012e-06,
"loss": 0.0,
"step": 10940
},
{
"epoch": 2.60652225660557,
"grad_norm": 0.0009114540298469365,
"learning_rate": 6.557962389907166e-06,
"loss": 0.0,
"step": 10950
},
{
"epoch": 2.608902642228041,
"grad_norm": 0.010355968959629536,
"learning_rate": 6.518289296199318e-06,
"loss": 0.0,
"step": 10960
},
{
"epoch": 2.611283027850512,
"grad_norm": 0.054084401577711105,
"learning_rate": 6.478616202491471e-06,
"loss": 0.0,
"step": 10970
},
{
"epoch": 2.613663413472983,
"grad_norm": 0.0009903626050800085,
"learning_rate": 6.438943108783624e-06,
"loss": 0.0,
"step": 10980
},
{
"epoch": 2.6160437990954533,
"grad_norm": 0.00019378839351702482,
"learning_rate": 6.399270015075776e-06,
"loss": 0.0,
"step": 10990
},
{
"epoch": 2.618424184717924,
"grad_norm": 0.0006563541246578097,
"learning_rate": 6.3595969213679285e-06,
"loss": 0.0,
"step": 11000
},
{
"epoch": 2.620804570340395,
"grad_norm": 0.0006744746351614594,
"learning_rate": 6.3199238276600805e-06,
"loss": 0.0,
"step": 11010
},
{
"epoch": 2.623184955962866,
"grad_norm": 0.0011966971214860678,
"learning_rate": 6.280250733952233e-06,
"loss": 0.0,
"step": 11020
},
{
"epoch": 2.625565341585337,
"grad_norm": 0.0017309453105553985,
"learning_rate": 6.240577640244387e-06,
"loss": 0.0,
"step": 11030
},
{
"epoch": 2.6279457272078077,
"grad_norm": 0.0008661380270496011,
"learning_rate": 6.200904546536539e-06,
"loss": 0.0005,
"step": 11040
},
{
"epoch": 2.6303261128302786,
"grad_norm": 0.0003683891554828733,
"learning_rate": 6.161231452828692e-06,
"loss": 0.0,
"step": 11050
},
{
"epoch": 2.6327064984527495,
"grad_norm": 0.0005742148496210575,
"learning_rate": 6.121558359120845e-06,
"loss": 0.0,
"step": 11060
},
{
"epoch": 2.6350868840752204,
"grad_norm": 0.0010009456891566515,
"learning_rate": 6.0818852654129976e-06,
"loss": 0.0,
"step": 11070
},
{
"epoch": 2.637467269697691,
"grad_norm": 0.0008674330892972648,
"learning_rate": 6.0422121717051496e-06,
"loss": 0.0001,
"step": 11080
},
{
"epoch": 2.6398476553201617,
"grad_norm": 0.00011453252227511257,
"learning_rate": 6.002539077997302e-06,
"loss": 0.0,
"step": 11090
},
{
"epoch": 2.6422280409426326,
"grad_norm": 0.0014997412217780948,
"learning_rate": 5.962865984289455e-06,
"loss": 0.0,
"step": 11100
},
{
"epoch": 2.6446084265651035,
"grad_norm": 0.0013535526813939214,
"learning_rate": 5.923192890581608e-06,
"loss": 0.0,
"step": 11110
},
{
"epoch": 2.6469888121875744,
"grad_norm": 0.0010607549920678139,
"learning_rate": 5.883519796873761e-06,
"loss": 0.0,
"step": 11120
},
{
"epoch": 2.6493691978100453,
"grad_norm": 0.001384345581755042,
"learning_rate": 5.843846703165913e-06,
"loss": 0.0,
"step": 11130
},
{
"epoch": 2.651749583432516,
"grad_norm": 0.009620246477425098,
"learning_rate": 5.804173609458066e-06,
"loss": 0.0,
"step": 11140
},
{
"epoch": 2.6541299690549867,
"grad_norm": 0.004576113075017929,
"learning_rate": 5.764500515750219e-06,
"loss": 0.0,
"step": 11150
},
{
"epoch": 2.656510354677458,
"grad_norm": 0.0007963149109855294,
"learning_rate": 5.7248274220423715e-06,
"loss": 0.0,
"step": 11160
},
{
"epoch": 2.6588907402999284,
"grad_norm": 0.0005275904550217092,
"learning_rate": 5.6851543283345235e-06,
"loss": 0.0,
"step": 11170
},
{
"epoch": 2.6612711259223993,
"grad_norm": 0.0007748051430098712,
"learning_rate": 5.645481234626677e-06,
"loss": 0.0,
"step": 11180
},
{
"epoch": 2.6636515115448702,
"grad_norm": 0.0005676033324562013,
"learning_rate": 5.605808140918829e-06,
"loss": 0.0,
"step": 11190
},
{
"epoch": 2.666031897167341,
"grad_norm": 0.0009870273061096668,
"learning_rate": 5.566135047210982e-06,
"loss": 0.0,
"step": 11200
},
{
"epoch": 2.668412282789812,
"grad_norm": 0.0004960622172802687,
"learning_rate": 5.526461953503134e-06,
"loss": 0.0,
"step": 11210
},
{
"epoch": 2.670792668412283,
"grad_norm": 0.2789072096347809,
"learning_rate": 5.486788859795288e-06,
"loss": 0.0001,
"step": 11220
},
{
"epoch": 2.673173054034754,
"grad_norm": 0.004494486376643181,
"learning_rate": 5.44711576608744e-06,
"loss": 0.0001,
"step": 11230
},
{
"epoch": 2.6755534396572243,
"grad_norm": 0.0009736506035551429,
"learning_rate": 5.407442672379593e-06,
"loss": 0.0,
"step": 11240
},
{
"epoch": 2.677933825279695,
"grad_norm": 0.0027844863943755627,
"learning_rate": 5.367769578671745e-06,
"loss": 0.0,
"step": 11250
},
{
"epoch": 2.680314210902166,
"grad_norm": 0.013426104560494423,
"learning_rate": 5.328096484963898e-06,
"loss": 0.0,
"step": 11260
},
{
"epoch": 2.682694596524637,
"grad_norm": 0.0002785604156088084,
"learning_rate": 5.28842339125605e-06,
"loss": 0.0,
"step": 11270
},
{
"epoch": 2.685074982147108,
"grad_norm": 0.0007079096976667643,
"learning_rate": 5.248750297548203e-06,
"loss": 0.0,
"step": 11280
},
{
"epoch": 2.6874553677695787,
"grad_norm": 0.0004877845640294254,
"learning_rate": 5.209077203840355e-06,
"loss": 0.0001,
"step": 11290
},
{
"epoch": 2.6898357533920496,
"grad_norm": 0.029308408498764038,
"learning_rate": 5.169404110132509e-06,
"loss": 0.0,
"step": 11300
},
{
"epoch": 2.6922161390145205,
"grad_norm": 0.0011891064932569861,
"learning_rate": 5.129731016424661e-06,
"loss": 0.0,
"step": 11310
},
{
"epoch": 2.6945965246369914,
"grad_norm": 0.009328281506896019,
"learning_rate": 5.090057922716814e-06,
"loss": 0.0,
"step": 11320
},
{
"epoch": 2.696976910259462,
"grad_norm": 0.0010127691784873605,
"learning_rate": 5.0503848290089666e-06,
"loss": 0.0,
"step": 11330
},
{
"epoch": 2.6993572958819327,
"grad_norm": 0.0006704577244818211,
"learning_rate": 5.010711735301119e-06,
"loss": 0.0,
"step": 11340
},
{
"epoch": 2.7017376815044036,
"grad_norm": 0.0015914466930553317,
"learning_rate": 4.971038641593271e-06,
"loss": 0.0,
"step": 11350
},
{
"epoch": 2.7041180671268745,
"grad_norm": 0.00046926282811909914,
"learning_rate": 4.931365547885424e-06,
"loss": 0.0,
"step": 11360
},
{
"epoch": 2.7064984527493454,
"grad_norm": 0.0008572743972763419,
"learning_rate": 4.891692454177577e-06,
"loss": 0.0,
"step": 11370
},
{
"epoch": 2.7088788383718163,
"grad_norm": 0.001012885244563222,
"learning_rate": 4.85201936046973e-06,
"loss": 0.0,
"step": 11380
},
{
"epoch": 2.711259223994287,
"grad_norm": 0.000291361880954355,
"learning_rate": 4.812346266761882e-06,
"loss": 0.0,
"step": 11390
},
{
"epoch": 2.7136396096167577,
"grad_norm": 0.001445894013158977,
"learning_rate": 4.772673173054035e-06,
"loss": 0.0,
"step": 11400
},
{
"epoch": 2.716019995239229,
"grad_norm": 0.0007329813088290393,
"learning_rate": 4.733000079346188e-06,
"loss": 0.0,
"step": 11410
},
{
"epoch": 2.7184003808616994,
"grad_norm": 0.02237352356314659,
"learning_rate": 4.6933269856383405e-06,
"loss": 0.0,
"step": 11420
},
{
"epoch": 2.7207807664841703,
"grad_norm": 0.0004787015204783529,
"learning_rate": 4.6536538919304925e-06,
"loss": 0.0,
"step": 11430
},
{
"epoch": 2.7231611521066412,
"grad_norm": 0.0011766423704102635,
"learning_rate": 4.613980798222645e-06,
"loss": 0.0,
"step": 11440
},
{
"epoch": 2.725541537729112,
"grad_norm": 0.0003720026579685509,
"learning_rate": 4.574307704514798e-06,
"loss": 0.0,
"step": 11450
},
{
"epoch": 2.727921923351583,
"grad_norm": 0.0004271367215551436,
"learning_rate": 4.534634610806951e-06,
"loss": 0.0,
"step": 11460
},
{
"epoch": 2.730302308974054,
"grad_norm": 0.001319264993071556,
"learning_rate": 4.494961517099103e-06,
"loss": 0.0,
"step": 11470
},
{
"epoch": 2.732682694596525,
"grad_norm": 0.0012237573973834515,
"learning_rate": 4.455288423391257e-06,
"loss": 0.0,
"step": 11480
},
{
"epoch": 2.7350630802189952,
"grad_norm": 0.00044418079778552055,
"learning_rate": 4.415615329683409e-06,
"loss": 0.0,
"step": 11490
},
{
"epoch": 2.7374434658414666,
"grad_norm": 0.0009368477039970458,
"learning_rate": 4.375942235975562e-06,
"loss": 0.0,
"step": 11500
},
{
"epoch": 2.739823851463937,
"grad_norm": 0.0015390801709145308,
"learning_rate": 4.336269142267714e-06,
"loss": 0.0001,
"step": 11510
},
{
"epoch": 2.742204237086408,
"grad_norm": 0.00022943236399441957,
"learning_rate": 4.296596048559867e-06,
"loss": 0.0,
"step": 11520
},
{
"epoch": 2.744584622708879,
"grad_norm": 0.0031924904324114323,
"learning_rate": 4.256922954852019e-06,
"loss": 0.0,
"step": 11530
},
{
"epoch": 2.7469650083313497,
"grad_norm": 0.0011005508713424206,
"learning_rate": 4.217249861144172e-06,
"loss": 0.0,
"step": 11540
},
{
"epoch": 2.7493453939538206,
"grad_norm": 0.00039162219036370516,
"learning_rate": 4.177576767436325e-06,
"loss": 0.0,
"step": 11550
},
{
"epoch": 2.7517257795762915,
"grad_norm": 0.0011376795591786504,
"learning_rate": 4.137903673728478e-06,
"loss": 0.0,
"step": 11560
},
{
"epoch": 2.7541061651987624,
"grad_norm": 0.0005944286240264773,
"learning_rate": 4.09823058002063e-06,
"loss": 0.0,
"step": 11570
},
{
"epoch": 2.756486550821233,
"grad_norm": 0.0007298539276234806,
"learning_rate": 4.058557486312783e-06,
"loss": 0.0,
"step": 11580
},
{
"epoch": 2.7588669364437037,
"grad_norm": 0.00018211067072115839,
"learning_rate": 4.0188843926049356e-06,
"loss": 0.0,
"step": 11590
},
{
"epoch": 2.7612473220661746,
"grad_norm": 0.0034182893577963114,
"learning_rate": 3.9792112988970884e-06,
"loss": 0.0,
"step": 11600
},
{
"epoch": 2.7636277076886455,
"grad_norm": 0.000364614010322839,
"learning_rate": 3.939538205189241e-06,
"loss": 0.0,
"step": 11610
},
{
"epoch": 2.7660080933111164,
"grad_norm": 0.0021814818028360605,
"learning_rate": 3.899865111481393e-06,
"loss": 0.0,
"step": 11620
},
{
"epoch": 2.7683884789335873,
"grad_norm": 0.0014812530716881156,
"learning_rate": 3.860192017773546e-06,
"loss": 0.0001,
"step": 11630
},
{
"epoch": 2.770768864556058,
"grad_norm": 0.0005358079797588289,
"learning_rate": 3.820518924065699e-06,
"loss": 0.0,
"step": 11640
},
{
"epoch": 2.7731492501785286,
"grad_norm": 0.00028996021137572825,
"learning_rate": 3.7808458303578514e-06,
"loss": 0.0,
"step": 11650
},
{
"epoch": 2.775529635801,
"grad_norm": 0.001182155217975378,
"learning_rate": 3.741172736650004e-06,
"loss": 0.0,
"step": 11660
},
{
"epoch": 2.7779100214234704,
"grad_norm": 0.00023413899180013686,
"learning_rate": 3.701499642942157e-06,
"loss": 0.0,
"step": 11670
},
{
"epoch": 2.7802904070459413,
"grad_norm": 0.0006019670399837196,
"learning_rate": 3.6618265492343095e-06,
"loss": 0.0,
"step": 11680
},
{
"epoch": 2.782670792668412,
"grad_norm": 0.0004944771062582731,
"learning_rate": 3.622153455526462e-06,
"loss": 0.0,
"step": 11690
},
{
"epoch": 2.785051178290883,
"grad_norm": 6.98843869031407e-05,
"learning_rate": 3.5824803618186144e-06,
"loss": 0.0,
"step": 11700
},
{
"epoch": 2.787431563913354,
"grad_norm": 0.0005101510905660689,
"learning_rate": 3.5428072681107677e-06,
"loss": 0.0,
"step": 11710
},
{
"epoch": 2.789811949535825,
"grad_norm": 0.00034247711300849915,
"learning_rate": 3.50313417440292e-06,
"loss": 0.0,
"step": 11720
},
{
"epoch": 2.792192335158296,
"grad_norm": 0.00044277720735408366,
"learning_rate": 3.4634610806950725e-06,
"loss": 0.0,
"step": 11730
},
{
"epoch": 2.7945727207807662,
"grad_norm": 0.0005088172620162368,
"learning_rate": 3.423787986987225e-06,
"loss": 0.0,
"step": 11740
},
{
"epoch": 2.7969531064032376,
"grad_norm": 0.00021512300008907914,
"learning_rate": 3.384114893279378e-06,
"loss": 0.0,
"step": 11750
},
{
"epoch": 2.799333492025708,
"grad_norm": 0.0007052098517306149,
"learning_rate": 3.3444417995715306e-06,
"loss": 0.0,
"step": 11760
},
{
"epoch": 2.801713877648179,
"grad_norm": 0.036882251501083374,
"learning_rate": 3.304768705863683e-06,
"loss": 0.0,
"step": 11770
},
{
"epoch": 2.80409426327065,
"grad_norm": 0.00013749166100751609,
"learning_rate": 3.2650956121558363e-06,
"loss": 0.0,
"step": 11780
},
{
"epoch": 2.8064746488931207,
"grad_norm": 0.0006571552366949618,
"learning_rate": 3.2254225184479888e-06,
"loss": 0.0,
"step": 11790
},
{
"epoch": 2.8088550345155916,
"grad_norm": 0.0008290376281365752,
"learning_rate": 3.185749424740141e-06,
"loss": 0.0,
"step": 11800
},
{
"epoch": 2.8112354201380625,
"grad_norm": 8.49374700919725e-05,
"learning_rate": 3.146076331032294e-06,
"loss": 0.0,
"step": 11810
},
{
"epoch": 2.8136158057605334,
"grad_norm": 0.00033748464193195105,
"learning_rate": 3.1064032373244465e-06,
"loss": 0.0,
"step": 11820
},
{
"epoch": 2.815996191383004,
"grad_norm": 0.0003914514381904155,
"learning_rate": 3.0667301436165993e-06,
"loss": 0.0,
"step": 11830
},
{
"epoch": 2.8183765770054747,
"grad_norm": 0.00029730124515481293,
"learning_rate": 3.0270570499087517e-06,
"loss": 0.0,
"step": 11840
},
{
"epoch": 2.8207569626279456,
"grad_norm": 0.00035526990541256964,
"learning_rate": 2.9873839562009046e-06,
"loss": 0.0,
"step": 11850
},
{
"epoch": 2.8231373482504165,
"grad_norm": 0.0007370146340690553,
"learning_rate": 2.9477108624930574e-06,
"loss": 0.0,
"step": 11860
},
{
"epoch": 2.8255177338728874,
"grad_norm": 8.048515883274376e-05,
"learning_rate": 2.90803776878521e-06,
"loss": 0.0,
"step": 11870
},
{
"epoch": 2.8278981194953583,
"grad_norm": 0.00022186528076417744,
"learning_rate": 2.8683646750773627e-06,
"loss": 0.0,
"step": 11880
},
{
"epoch": 2.830278505117829,
"grad_norm": 0.0004252239887136966,
"learning_rate": 2.8286915813695156e-06,
"loss": 0.0,
"step": 11890
},
{
"epoch": 2.8326588907403,
"grad_norm": 0.00027670618146657944,
"learning_rate": 2.789018487661668e-06,
"loss": 0.0,
"step": 11900
},
{
"epoch": 2.835039276362771,
"grad_norm": 0.0020431778393685818,
"learning_rate": 2.749345393953821e-06,
"loss": 0.0,
"step": 11910
},
{
"epoch": 2.8374196619852414,
"grad_norm": 0.001547365915030241,
"learning_rate": 2.7096723002459737e-06,
"loss": 0.0,
"step": 11920
},
{
"epoch": 2.8398000476077123,
"grad_norm": 0.0013964555691927671,
"learning_rate": 2.669999206538126e-06,
"loss": 0.0,
"step": 11930
},
{
"epoch": 2.842180433230183,
"grad_norm": 0.00027170139946974814,
"learning_rate": 2.630326112830279e-06,
"loss": 0.0,
"step": 11940
},
{
"epoch": 2.844560818852654,
"grad_norm": 0.0008765398524701595,
"learning_rate": 2.5906530191224314e-06,
"loss": 0.0,
"step": 11950
},
{
"epoch": 2.846941204475125,
"grad_norm": 0.00015922258899081498,
"learning_rate": 2.5509799254145842e-06,
"loss": 0.0,
"step": 11960
},
{
"epoch": 2.849321590097596,
"grad_norm": 0.00011323492071824148,
"learning_rate": 2.5113068317067367e-06,
"loss": 0.0,
"step": 11970
},
{
"epoch": 2.851701975720067,
"grad_norm": 0.0008671206305734813,
"learning_rate": 2.4716337379988895e-06,
"loss": 0.0001,
"step": 11980
},
{
"epoch": 2.8540823613425372,
"grad_norm": 0.00013449507241602987,
"learning_rate": 2.431960644291042e-06,
"loss": 0.0,
"step": 11990
},
{
"epoch": 2.8564627469650086,
"grad_norm": 0.0008318678010255098,
"learning_rate": 2.3922875505831948e-06,
"loss": 0.0,
"step": 12000
},
{
"epoch": 2.858843132587479,
"grad_norm": 0.0012901159934699535,
"learning_rate": 2.352614456875347e-06,
"loss": 0.0,
"step": 12010
},
{
"epoch": 2.86122351820995,
"grad_norm": 0.00032769294921308756,
"learning_rate": 2.3129413631675e-06,
"loss": 0.0,
"step": 12020
},
{
"epoch": 2.863603903832421,
"grad_norm": 0.0022394724655896425,
"learning_rate": 2.2732682694596525e-06,
"loss": 0.0,
"step": 12030
},
{
"epoch": 2.8659842894548917,
"grad_norm": 0.0001916442415677011,
"learning_rate": 2.2335951757518053e-06,
"loss": 0.0,
"step": 12040
},
{
"epoch": 2.8683646750773626,
"grad_norm": 0.0008263205527327955,
"learning_rate": 2.1939220820439578e-06,
"loss": 0.0,
"step": 12050
},
{
"epoch": 2.8707450606998335,
"grad_norm": 0.01558750867843628,
"learning_rate": 2.1542489883361106e-06,
"loss": 0.0,
"step": 12060
},
{
"epoch": 2.8731254463223044,
"grad_norm": 0.0005802076193504035,
"learning_rate": 2.1145758946282635e-06,
"loss": 0.0002,
"step": 12070
},
{
"epoch": 2.875505831944775,
"grad_norm": 0.0006769265746697783,
"learning_rate": 2.074902800920416e-06,
"loss": 0.0,
"step": 12080
},
{
"epoch": 2.877886217567246,
"grad_norm": 0.00040787094621919096,
"learning_rate": 2.0352297072125687e-06,
"loss": 0.0,
"step": 12090
},
{
"epoch": 2.8802666031897166,
"grad_norm": 0.00034027136280201375,
"learning_rate": 1.995556613504721e-06,
"loss": 0.0,
"step": 12100
},
{
"epoch": 2.8826469888121875,
"grad_norm": 0.008367573842406273,
"learning_rate": 1.955883519796874e-06,
"loss": 0.0,
"step": 12110
},
{
"epoch": 2.8850273744346584,
"grad_norm": 0.0002640595193952322,
"learning_rate": 1.9162104260890264e-06,
"loss": 0.0,
"step": 12120
},
{
"epoch": 2.8874077600571293,
"grad_norm": 0.0006561621557921171,
"learning_rate": 1.8765373323811793e-06,
"loss": 0.0,
"step": 12130
},
{
"epoch": 2.8897881456796,
"grad_norm": 0.0008464111597277224,
"learning_rate": 1.8368642386733317e-06,
"loss": 0.0,
"step": 12140
},
{
"epoch": 2.892168531302071,
"grad_norm": 0.0003002223384100944,
"learning_rate": 1.7971911449654846e-06,
"loss": 0.0,
"step": 12150
},
{
"epoch": 2.894548916924542,
"grad_norm": 0.0003043843025807291,
"learning_rate": 1.757518051257637e-06,
"loss": 0.0,
"step": 12160
},
{
"epoch": 2.8969293025470124,
"grad_norm": 0.00041168101597577333,
"learning_rate": 1.7178449575497898e-06,
"loss": 0.0,
"step": 12170
},
{
"epoch": 2.8993096881694833,
"grad_norm": 0.002103559672832489,
"learning_rate": 1.6781718638419423e-06,
"loss": 0.0,
"step": 12180
},
{
"epoch": 2.901690073791954,
"grad_norm": 0.00029975874349474907,
"learning_rate": 1.6384987701340951e-06,
"loss": 0.0,
"step": 12190
},
{
"epoch": 2.904070459414425,
"grad_norm": 0.004904668778181076,
"learning_rate": 1.5988256764262475e-06,
"loss": 0.0001,
"step": 12200
},
{
"epoch": 2.906450845036896,
"grad_norm": 0.0009001428843475878,
"learning_rate": 1.5591525827184004e-06,
"loss": 0.0,
"step": 12210
},
{
"epoch": 2.908831230659367,
"grad_norm": 0.0004976601339876652,
"learning_rate": 1.519479489010553e-06,
"loss": 0.0,
"step": 12220
},
{
"epoch": 2.9112116162818378,
"grad_norm": 0.0002044235880021006,
"learning_rate": 1.4798063953027057e-06,
"loss": 0.0,
"step": 12230
},
{
"epoch": 2.9135920019043082,
"grad_norm": 0.0003118833410553634,
"learning_rate": 1.4401333015948583e-06,
"loss": 0.0,
"step": 12240
},
{
"epoch": 2.9159723875267796,
"grad_norm": 0.00038868881529197097,
"learning_rate": 1.4004602078870111e-06,
"loss": 0.0,
"step": 12250
},
{
"epoch": 2.91835277314925,
"grad_norm": 0.0005747165414504707,
"learning_rate": 1.3607871141791638e-06,
"loss": 0.0,
"step": 12260
},
{
"epoch": 2.920733158771721,
"grad_norm": 0.0013731828657910228,
"learning_rate": 1.3211140204713164e-06,
"loss": 0.0,
"step": 12270
},
{
"epoch": 2.923113544394192,
"grad_norm": 0.000688336614985019,
"learning_rate": 1.281440926763469e-06,
"loss": 0.0,
"step": 12280
},
{
"epoch": 2.9254939300166627,
"grad_norm": 0.00041094853077083826,
"learning_rate": 1.241767833055622e-06,
"loss": 0.0,
"step": 12290
},
{
"epoch": 2.9278743156391336,
"grad_norm": 0.00040040462044999003,
"learning_rate": 1.2020947393477745e-06,
"loss": 0.0,
"step": 12300
},
{
"epoch": 2.9302547012616045,
"grad_norm": 0.0027486933395266533,
"learning_rate": 1.1624216456399272e-06,
"loss": 0.0,
"step": 12310
},
{
"epoch": 2.9326350868840754,
"grad_norm": 0.000705558864865452,
"learning_rate": 1.1227485519320798e-06,
"loss": 0.0,
"step": 12320
},
{
"epoch": 2.935015472506546,
"grad_norm": 0.0013841954059898853,
"learning_rate": 1.0830754582242325e-06,
"loss": 0.0001,
"step": 12330
},
{
"epoch": 2.937395858129017,
"grad_norm": 0.0013595300260931253,
"learning_rate": 1.043402364516385e-06,
"loss": 0.0,
"step": 12340
},
{
"epoch": 2.9397762437514876,
"grad_norm": 0.0011891273315995932,
"learning_rate": 1.0037292708085377e-06,
"loss": 0.0,
"step": 12350
},
{
"epoch": 2.9421566293739585,
"grad_norm": 0.0009695956250652671,
"learning_rate": 9.640561771006904e-07,
"loss": 0.0,
"step": 12360
},
{
"epoch": 2.9445370149964294,
"grad_norm": 0.00034754411899484694,
"learning_rate": 9.24383083392843e-07,
"loss": 0.0001,
"step": 12370
},
{
"epoch": 2.9469174006189003,
"grad_norm": 0.00020417921768967062,
"learning_rate": 8.847099896849956e-07,
"loss": 0.0,
"step": 12380
},
{
"epoch": 2.949297786241371,
"grad_norm": 0.0010077544720843434,
"learning_rate": 8.450368959771483e-07,
"loss": 0.0,
"step": 12390
},
{
"epoch": 2.951678171863842,
"grad_norm": 0.0006951851537451148,
"learning_rate": 8.053638022693009e-07,
"loss": 0.0,
"step": 12400
},
{
"epoch": 2.954058557486313,
"grad_norm": 0.0005225545028224587,
"learning_rate": 7.656907085614537e-07,
"loss": 0.0,
"step": 12410
},
{
"epoch": 2.9564389431087834,
"grad_norm": 0.0004363077168818563,
"learning_rate": 7.260176148536063e-07,
"loss": 0.0,
"step": 12420
},
{
"epoch": 2.9588193287312543,
"grad_norm": 0.00024609945830889046,
"learning_rate": 6.863445211457589e-07,
"loss": 0.0,
"step": 12430
},
{
"epoch": 2.961199714353725,
"grad_norm": 0.06491145491600037,
"learning_rate": 6.466714274379116e-07,
"loss": 0.0,
"step": 12440
},
{
"epoch": 2.963580099976196,
"grad_norm": 0.0004482944495975971,
"learning_rate": 6.069983337300642e-07,
"loss": 0.0,
"step": 12450
},
{
"epoch": 2.965960485598667,
"grad_norm": 0.001836300129070878,
"learning_rate": 5.67325240022217e-07,
"loss": 0.0,
"step": 12460
},
{
"epoch": 2.968340871221138,
"grad_norm": 0.0004112005408387631,
"learning_rate": 5.276521463143697e-07,
"loss": 0.0,
"step": 12470
},
{
"epoch": 2.9707212568436088,
"grad_norm": 0.0020831027068197727,
"learning_rate": 4.879790526065223e-07,
"loss": 0.0,
"step": 12480
},
{
"epoch": 2.9731016424660797,
"grad_norm": 0.0012763678096234798,
"learning_rate": 4.4830595889867493e-07,
"loss": 0.0,
"step": 12490
},
{
"epoch": 2.9754820280885506,
"grad_norm": 0.0011779662454500794,
"learning_rate": 4.086328651908276e-07,
"loss": 0.0,
"step": 12500
},
{
"epoch": 2.977862413711021,
"grad_norm": 0.0005871544708497822,
"learning_rate": 3.6895977148298026e-07,
"loss": 0.0,
"step": 12510
},
{
"epoch": 2.980242799333492,
"grad_norm": 0.002057824982330203,
"learning_rate": 3.2928667777513295e-07,
"loss": 0.0,
"step": 12520
},
{
"epoch": 2.982623184955963,
"grad_norm": 0.00029588877805508673,
"learning_rate": 2.896135840672856e-07,
"loss": 0.0,
"step": 12530
},
{
"epoch": 2.9850035705784337,
"grad_norm": 0.0004726073530036956,
"learning_rate": 2.499404903594382e-07,
"loss": 0.0,
"step": 12540
},
{
"epoch": 2.9873839562009046,
"grad_norm": 0.0014838631032034755,
"learning_rate": 2.102673966515909e-07,
"loss": 0.0,
"step": 12550
},
{
"epoch": 2.9897643418233755,
"grad_norm": 0.0010778923751786351,
"learning_rate": 1.7059430294374355e-07,
"loss": 0.0,
"step": 12560
},
{
"epoch": 2.9921447274458464,
"grad_norm": 0.0007851801346987486,
"learning_rate": 1.3092120923589622e-07,
"loss": 0.0,
"step": 12570
},
{
"epoch": 2.994525113068317,
"grad_norm": 0.00047710456419736147,
"learning_rate": 9.124811552804888e-08,
"loss": 0.0,
"step": 12580
},
{
"epoch": 2.996905498690788,
"grad_norm": 0.003749624127522111,
"learning_rate": 5.1575021820201544e-08,
"loss": 0.0,
"step": 12590
},
{
"epoch": 2.9992858843132586,
"grad_norm": 0.0007799621089361608,
"learning_rate": 1.1901928112354202e-08,
"loss": 0.0001,
"step": 12600
},
{
"epoch": 3.0,
"eval_loss": 2.340411811019294e-07,
"eval_runtime": 52.9973,
"eval_samples_per_second": 35.247,
"eval_steps_per_second": 8.812,
"step": 12603
}
],
"logging_steps": 10,
"max_steps": 12603,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6586245895421952.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}