{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 12603, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00238038562247084, "grad_norm": 29.197416305541992, "learning_rate": 4.996032690629215e-05, "loss": 5.8295, "step": 10 }, { "epoch": 0.00476077124494168, "grad_norm": 2.8866491317749023, "learning_rate": 4.99206538125843e-05, "loss": 0.9476, "step": 20 }, { "epoch": 0.007141156867412521, "grad_norm": 2.2606563568115234, "learning_rate": 4.988098071887646e-05, "loss": 0.1466, "step": 30 }, { "epoch": 0.00952154248988336, "grad_norm": 2.5246834754943848, "learning_rate": 4.984130762516862e-05, "loss": 0.0596, "step": 40 }, { "epoch": 0.011901928112354201, "grad_norm": 1.10219144821167, "learning_rate": 4.980163453146077e-05, "loss": 0.0351, "step": 50 }, { "epoch": 0.014282313734825042, "grad_norm": 1.7988760471343994, "learning_rate": 4.976196143775292e-05, "loss": 0.0293, "step": 60 }, { "epoch": 0.016662699357295883, "grad_norm": 0.2419203370809555, "learning_rate": 4.972228834404507e-05, "loss": 0.024, "step": 70 }, { "epoch": 0.01904308497976672, "grad_norm": 0.992480993270874, "learning_rate": 4.9682615250337225e-05, "loss": 0.0191, "step": 80 }, { "epoch": 0.021423470602237562, "grad_norm": 1.2107903957366943, "learning_rate": 4.9642942156629376e-05, "loss": 0.0147, "step": 90 }, { "epoch": 0.023803856224708403, "grad_norm": 1.5667377710342407, "learning_rate": 4.960326906292153e-05, "loss": 0.0144, "step": 100 }, { "epoch": 0.026184241847179244, "grad_norm": 1.7987982034683228, "learning_rate": 4.956359596921368e-05, "loss": 0.0121, "step": 110 }, { "epoch": 0.028564627469650085, "grad_norm": 0.7142848968505859, "learning_rate": 4.952392287550583e-05, "loss": 0.0109, "step": 120 }, { "epoch": 0.030945013092120922, "grad_norm": 0.9309341311454773, "learning_rate": 4.9484249781797984e-05, "loss": 0.0087, "step": 130 }, { "epoch": 0.03332539871459177, "grad_norm": 0.2679256498813629, "learning_rate": 4.944457668809014e-05, "loss": 0.0065, "step": 140 }, { "epoch": 0.035705784337062604, "grad_norm": 0.36588725447654724, "learning_rate": 4.940490359438229e-05, "loss": 0.0075, "step": 150 }, { "epoch": 0.03808616995953344, "grad_norm": 0.6737563610076904, "learning_rate": 4.936523050067445e-05, "loss": 0.0092, "step": 160 }, { "epoch": 0.040466555582004286, "grad_norm": 0.3371886610984802, "learning_rate": 4.93255574069666e-05, "loss": 0.0067, "step": 170 }, { "epoch": 0.042846941204475124, "grad_norm": 1.0238951444625854, "learning_rate": 4.928588431325875e-05, "loss": 0.0084, "step": 180 }, { "epoch": 0.04522732682694597, "grad_norm": 1.0350103378295898, "learning_rate": 4.9246211219550906e-05, "loss": 0.0073, "step": 190 }, { "epoch": 0.047607712449416806, "grad_norm": 0.33256474137306213, "learning_rate": 4.9206538125843056e-05, "loss": 0.0082, "step": 200 }, { "epoch": 0.04998809807188764, "grad_norm": 0.0693468451499939, "learning_rate": 4.9166865032135206e-05, "loss": 0.0044, "step": 210 }, { "epoch": 0.05236848369435849, "grad_norm": 0.8809625506401062, "learning_rate": 4.912719193842736e-05, "loss": 0.0064, "step": 220 }, { "epoch": 0.054748869316829325, "grad_norm": 0.36927270889282227, "learning_rate": 4.9087518844719514e-05, "loss": 0.0066, "step": 230 }, { "epoch": 0.05712925493930017, "grad_norm": 0.8885632753372192, "learning_rate": 4.9047845751011664e-05, "loss": 0.0063, "step": 240 }, { "epoch": 0.05950964056177101, "grad_norm": 0.5330325365066528, "learning_rate": 4.900817265730382e-05, "loss": 0.0059, "step": 250 }, { "epoch": 0.061890026184241845, "grad_norm": 0.5747584700584412, "learning_rate": 4.896849956359597e-05, "loss": 0.0056, "step": 260 }, { "epoch": 0.06427041180671268, "grad_norm": 0.10936570912599564, "learning_rate": 4.892882646988812e-05, "loss": 0.0038, "step": 270 }, { "epoch": 0.06665079742918353, "grad_norm": 0.136638343334198, "learning_rate": 4.888915337618027e-05, "loss": 0.006, "step": 280 }, { "epoch": 0.06903118305165437, "grad_norm": 0.25448599457740784, "learning_rate": 4.884948028247243e-05, "loss": 0.0052, "step": 290 }, { "epoch": 0.07141156867412521, "grad_norm": 0.19224955141544342, "learning_rate": 4.8809807188764586e-05, "loss": 0.0041, "step": 300 }, { "epoch": 0.07379195429659605, "grad_norm": 0.9061737060546875, "learning_rate": 4.8770134095056736e-05, "loss": 0.0051, "step": 310 }, { "epoch": 0.07617233991906688, "grad_norm": 0.31071603298187256, "learning_rate": 4.873046100134889e-05, "loss": 0.0043, "step": 320 }, { "epoch": 0.07855272554153774, "grad_norm": 0.054100409150123596, "learning_rate": 4.869078790764104e-05, "loss": 0.004, "step": 330 }, { "epoch": 0.08093311116400857, "grad_norm": 0.11965326964855194, "learning_rate": 4.865111481393319e-05, "loss": 0.0039, "step": 340 }, { "epoch": 0.08331349678647941, "grad_norm": 0.16056092083454132, "learning_rate": 4.861144172022535e-05, "loss": 0.0036, "step": 350 }, { "epoch": 0.08569388240895025, "grad_norm": 0.08699148148298264, "learning_rate": 4.85717686265175e-05, "loss": 0.0032, "step": 360 }, { "epoch": 0.08807426803142109, "grad_norm": 0.16824030876159668, "learning_rate": 4.853209553280965e-05, "loss": 0.0033, "step": 370 }, { "epoch": 0.09045465365389194, "grad_norm": 0.07728957384824753, "learning_rate": 4.84924224391018e-05, "loss": 0.0023, "step": 380 }, { "epoch": 0.09283503927636277, "grad_norm": 0.2950897514820099, "learning_rate": 4.845274934539395e-05, "loss": 0.0039, "step": 390 }, { "epoch": 0.09521542489883361, "grad_norm": 0.6249143481254578, "learning_rate": 4.841307625168611e-05, "loss": 0.012, "step": 400 }, { "epoch": 0.09759581052130445, "grad_norm": 0.06545058637857437, "learning_rate": 4.837340315797826e-05, "loss": 0.0022, "step": 410 }, { "epoch": 0.09997619614377529, "grad_norm": 0.40417027473449707, "learning_rate": 4.833373006427042e-05, "loss": 0.003, "step": 420 }, { "epoch": 0.10235658176624614, "grad_norm": 0.38520482182502747, "learning_rate": 4.829405697056257e-05, "loss": 0.0037, "step": 430 }, { "epoch": 0.10473696738871698, "grad_norm": 0.9367744326591492, "learning_rate": 4.825438387685472e-05, "loss": 0.0029, "step": 440 }, { "epoch": 0.10711735301118781, "grad_norm": 0.09369224309921265, "learning_rate": 4.8214710783146875e-05, "loss": 0.0021, "step": 450 }, { "epoch": 0.10949773863365865, "grad_norm": 1.1114966869354248, "learning_rate": 4.8175037689439025e-05, "loss": 0.0024, "step": 460 }, { "epoch": 0.11187812425612949, "grad_norm": 0.15539304912090302, "learning_rate": 4.8135364595731175e-05, "loss": 0.0026, "step": 470 }, { "epoch": 0.11425850987860034, "grad_norm": 0.05451425537467003, "learning_rate": 4.809569150202333e-05, "loss": 0.0024, "step": 480 }, { "epoch": 0.11663889550107118, "grad_norm": 0.08954957127571106, "learning_rate": 4.805601840831548e-05, "loss": 0.0032, "step": 490 }, { "epoch": 0.11901928112354201, "grad_norm": 0.24188756942749023, "learning_rate": 4.801634531460763e-05, "loss": 0.0023, "step": 500 }, { "epoch": 0.12139966674601285, "grad_norm": 0.062233567237854004, "learning_rate": 4.797667222089979e-05, "loss": 0.002, "step": 510 }, { "epoch": 0.12378005236848369, "grad_norm": 0.605993926525116, "learning_rate": 4.793699912719194e-05, "loss": 0.0021, "step": 520 }, { "epoch": 0.12616043799095453, "grad_norm": 1.5091257095336914, "learning_rate": 4.789732603348409e-05, "loss": 0.0026, "step": 530 }, { "epoch": 0.12854082361342536, "grad_norm": 0.07300706952810287, "learning_rate": 4.785765293977625e-05, "loss": 0.0018, "step": 540 }, { "epoch": 0.1309212092358962, "grad_norm": 0.07547351717948914, "learning_rate": 4.78179798460684e-05, "loss": 0.0022, "step": 550 }, { "epoch": 0.13330159485836707, "grad_norm": 0.017345329746603966, "learning_rate": 4.7778306752360555e-05, "loss": 0.002, "step": 560 }, { "epoch": 0.1356819804808379, "grad_norm": 0.048248808830976486, "learning_rate": 4.7738633658652705e-05, "loss": 0.0018, "step": 570 }, { "epoch": 0.13806236610330874, "grad_norm": 0.04654766246676445, "learning_rate": 4.7698960564944856e-05, "loss": 0.0026, "step": 580 }, { "epoch": 0.14044275172577958, "grad_norm": 0.7228689193725586, "learning_rate": 4.7659287471237006e-05, "loss": 0.0033, "step": 590 }, { "epoch": 0.14282313734825042, "grad_norm": 0.01947982981801033, "learning_rate": 4.761961437752916e-05, "loss": 0.0024, "step": 600 }, { "epoch": 0.14520352297072125, "grad_norm": 0.03398985415697098, "learning_rate": 4.7579941283821314e-05, "loss": 0.0019, "step": 610 }, { "epoch": 0.1475839085931921, "grad_norm": 0.11993751674890518, "learning_rate": 4.754026819011347e-05, "loss": 0.0024, "step": 620 }, { "epoch": 0.14996429421566293, "grad_norm": 0.02739240974187851, "learning_rate": 4.750059509640562e-05, "loss": 0.0019, "step": 630 }, { "epoch": 0.15234467983813377, "grad_norm": 0.08998490869998932, "learning_rate": 4.746092200269777e-05, "loss": 0.0026, "step": 640 }, { "epoch": 0.1547250654606046, "grad_norm": 0.06008267030119896, "learning_rate": 4.742124890898992e-05, "loss": 0.0019, "step": 650 }, { "epoch": 0.15710545108307547, "grad_norm": 0.2969667911529541, "learning_rate": 4.738157581528208e-05, "loss": 0.0016, "step": 660 }, { "epoch": 0.1594858367055463, "grad_norm": 0.056759823113679886, "learning_rate": 4.7341902721574236e-05, "loss": 0.0026, "step": 670 }, { "epoch": 0.16186622232801715, "grad_norm": 0.36679673194885254, "learning_rate": 4.7302229627866386e-05, "loss": 0.0023, "step": 680 }, { "epoch": 0.16424660795048798, "grad_norm": 0.29111284017562866, "learning_rate": 4.7262556534158536e-05, "loss": 0.0028, "step": 690 }, { "epoch": 0.16662699357295882, "grad_norm": 0.48570939898490906, "learning_rate": 4.722288344045069e-05, "loss": 0.0015, "step": 700 }, { "epoch": 0.16900737919542966, "grad_norm": 0.06863627582788467, "learning_rate": 4.718321034674284e-05, "loss": 0.0016, "step": 710 }, { "epoch": 0.1713877648179005, "grad_norm": 0.18400460481643677, "learning_rate": 4.7143537253034994e-05, "loss": 0.0025, "step": 720 }, { "epoch": 0.17376815044037133, "grad_norm": 0.02043345756828785, "learning_rate": 4.710386415932715e-05, "loss": 0.0014, "step": 730 }, { "epoch": 0.17614853606284217, "grad_norm": 0.22026614844799042, "learning_rate": 4.70641910656193e-05, "loss": 0.0016, "step": 740 }, { "epoch": 0.178528921685313, "grad_norm": 0.033756159245967865, "learning_rate": 4.702451797191145e-05, "loss": 0.0015, "step": 750 }, { "epoch": 0.18090930730778387, "grad_norm": 0.03022690862417221, "learning_rate": 4.69848448782036e-05, "loss": 0.0016, "step": 760 }, { "epoch": 0.1832896929302547, "grad_norm": 0.32997235655784607, "learning_rate": 4.694517178449576e-05, "loss": 0.0017, "step": 770 }, { "epoch": 0.18567007855272555, "grad_norm": 0.6392120718955994, "learning_rate": 4.690549869078791e-05, "loss": 0.0015, "step": 780 }, { "epoch": 0.18805046417519639, "grad_norm": 0.12279071658849716, "learning_rate": 4.6865825597080066e-05, "loss": 0.0016, "step": 790 }, { "epoch": 0.19043084979766722, "grad_norm": 0.1228996068239212, "learning_rate": 4.682615250337222e-05, "loss": 0.0012, "step": 800 }, { "epoch": 0.19281123542013806, "grad_norm": 0.23846402764320374, "learning_rate": 4.678647940966437e-05, "loss": 0.0011, "step": 810 }, { "epoch": 0.1951916210426089, "grad_norm": 0.06786726415157318, "learning_rate": 4.674680631595652e-05, "loss": 0.0008, "step": 820 }, { "epoch": 0.19757200666507974, "grad_norm": 0.062252361327409744, "learning_rate": 4.6707133222248675e-05, "loss": 0.0009, "step": 830 }, { "epoch": 0.19995239228755057, "grad_norm": 0.10420612245798111, "learning_rate": 4.6667460128540825e-05, "loss": 0.0007, "step": 840 }, { "epoch": 0.2023327779100214, "grad_norm": 0.024685313925147057, "learning_rate": 4.6627787034832975e-05, "loss": 0.0012, "step": 850 }, { "epoch": 0.20471316353249228, "grad_norm": 0.07784374058246613, "learning_rate": 4.658811394112513e-05, "loss": 0.0011, "step": 860 }, { "epoch": 0.2070935491549631, "grad_norm": 0.1463196724653244, "learning_rate": 4.654844084741728e-05, "loss": 0.0017, "step": 870 }, { "epoch": 0.20947393477743395, "grad_norm": 0.04599474370479584, "learning_rate": 4.650876775370944e-05, "loss": 0.0013, "step": 880 }, { "epoch": 0.2118543203999048, "grad_norm": 0.44877147674560547, "learning_rate": 4.646909466000159e-05, "loss": 0.0012, "step": 890 }, { "epoch": 0.21423470602237563, "grad_norm": 1.3056105375289917, "learning_rate": 4.642942156629374e-05, "loss": 0.0018, "step": 900 }, { "epoch": 0.21661509164484646, "grad_norm": 0.5220457911491394, "learning_rate": 4.638974847258589e-05, "loss": 0.0011, "step": 910 }, { "epoch": 0.2189954772673173, "grad_norm": 0.5913621783256531, "learning_rate": 4.635007537887805e-05, "loss": 0.0013, "step": 920 }, { "epoch": 0.22137586288978814, "grad_norm": 0.150216206908226, "learning_rate": 4.63104022851702e-05, "loss": 0.001, "step": 930 }, { "epoch": 0.22375624851225898, "grad_norm": 0.022638270631432533, "learning_rate": 4.6270729191462355e-05, "loss": 0.0012, "step": 940 }, { "epoch": 0.2261366341347298, "grad_norm": 0.017948875203728676, "learning_rate": 4.6231056097754505e-05, "loss": 0.0008, "step": 950 }, { "epoch": 0.22851701975720068, "grad_norm": 0.25053608417510986, "learning_rate": 4.6191383004046656e-05, "loss": 0.0019, "step": 960 }, { "epoch": 0.23089740537967152, "grad_norm": 0.12757046520709991, "learning_rate": 4.6151709910338806e-05, "loss": 0.0019, "step": 970 }, { "epoch": 0.23327779100214235, "grad_norm": 0.185049369931221, "learning_rate": 4.611203681663096e-05, "loss": 0.0014, "step": 980 }, { "epoch": 0.2356581766246132, "grad_norm": 0.37812331318855286, "learning_rate": 4.607236372292312e-05, "loss": 0.0014, "step": 990 }, { "epoch": 0.23803856224708403, "grad_norm": 0.7450318336486816, "learning_rate": 4.603269062921527e-05, "loss": 0.0016, "step": 1000 }, { "epoch": 0.24041894786955487, "grad_norm": 0.03629771247506142, "learning_rate": 4.599301753550742e-05, "loss": 0.0012, "step": 1010 }, { "epoch": 0.2427993334920257, "grad_norm": 0.23223434388637543, "learning_rate": 4.595334444179957e-05, "loss": 0.0011, "step": 1020 }, { "epoch": 0.24517971911449654, "grad_norm": 0.08511273562908173, "learning_rate": 4.591367134809172e-05, "loss": 0.0006, "step": 1030 }, { "epoch": 0.24756010473696738, "grad_norm": 0.17114369571208954, "learning_rate": 4.587399825438388e-05, "loss": 0.001, "step": 1040 }, { "epoch": 0.24994049035943822, "grad_norm": 0.04517650604248047, "learning_rate": 4.5834325160676035e-05, "loss": 0.0008, "step": 1050 }, { "epoch": 0.25232087598190905, "grad_norm": 0.20234528183937073, "learning_rate": 4.5794652066968186e-05, "loss": 0.0014, "step": 1060 }, { "epoch": 0.2547012616043799, "grad_norm": 0.007534442003816366, "learning_rate": 4.5754978973260336e-05, "loss": 0.0008, "step": 1070 }, { "epoch": 0.25708164722685073, "grad_norm": 0.02520332857966423, "learning_rate": 4.5715305879552486e-05, "loss": 0.0008, "step": 1080 }, { "epoch": 0.25946203284932157, "grad_norm": 0.02674415148794651, "learning_rate": 4.5675632785844644e-05, "loss": 0.005, "step": 1090 }, { "epoch": 0.2618424184717924, "grad_norm": 0.0756726786494255, "learning_rate": 4.5635959692136794e-05, "loss": 0.0008, "step": 1100 }, { "epoch": 0.2642228040942633, "grad_norm": 0.18692266941070557, "learning_rate": 4.559628659842895e-05, "loss": 0.0021, "step": 1110 }, { "epoch": 0.26660318971673413, "grad_norm": 0.021881476044654846, "learning_rate": 4.55566135047211e-05, "loss": 0.0016, "step": 1120 }, { "epoch": 0.26898357533920497, "grad_norm": 0.16764195263385773, "learning_rate": 4.551694041101325e-05, "loss": 0.001, "step": 1130 }, { "epoch": 0.2713639609616758, "grad_norm": 0.6519142389297485, "learning_rate": 4.547726731730541e-05, "loss": 0.0015, "step": 1140 }, { "epoch": 0.27374434658414665, "grad_norm": 0.07793217897415161, "learning_rate": 4.543759422359756e-05, "loss": 0.0005, "step": 1150 }, { "epoch": 0.2761247322066175, "grad_norm": 0.04451458901166916, "learning_rate": 4.539792112988971e-05, "loss": 0.0009, "step": 1160 }, { "epoch": 0.2785051178290883, "grad_norm": 0.02606957219541073, "learning_rate": 4.5358248036181866e-05, "loss": 0.0013, "step": 1170 }, { "epoch": 0.28088550345155916, "grad_norm": 0.03642681613564491, "learning_rate": 4.531857494247402e-05, "loss": 0.0007, "step": 1180 }, { "epoch": 0.28326588907403, "grad_norm": 0.27240046858787537, "learning_rate": 4.527890184876617e-05, "loss": 0.0007, "step": 1190 }, { "epoch": 0.28564627469650083, "grad_norm": 0.01732662320137024, "learning_rate": 4.5239228755058324e-05, "loss": 0.0011, "step": 1200 }, { "epoch": 0.2880266603189717, "grad_norm": 0.10321195423603058, "learning_rate": 4.5199555661350474e-05, "loss": 0.0007, "step": 1210 }, { "epoch": 0.2904070459414425, "grad_norm": 0.060121580958366394, "learning_rate": 4.5159882567642625e-05, "loss": 0.0014, "step": 1220 }, { "epoch": 0.29278743156391335, "grad_norm": 0.028955884277820587, "learning_rate": 4.5120209473934775e-05, "loss": 0.0007, "step": 1230 }, { "epoch": 0.2951678171863842, "grad_norm": 0.0714436024427414, "learning_rate": 4.508053638022693e-05, "loss": 0.0007, "step": 1240 }, { "epoch": 0.297548202808855, "grad_norm": 0.052230022847652435, "learning_rate": 4.504086328651909e-05, "loss": 0.0008, "step": 1250 }, { "epoch": 0.29992858843132586, "grad_norm": 0.33476394414901733, "learning_rate": 4.500119019281124e-05, "loss": 0.0008, "step": 1260 }, { "epoch": 0.3023089740537967, "grad_norm": 0.07732009142637253, "learning_rate": 4.496151709910339e-05, "loss": 0.0012, "step": 1270 }, { "epoch": 0.30468935967626753, "grad_norm": 0.6843579411506653, "learning_rate": 4.492184400539554e-05, "loss": 0.0007, "step": 1280 }, { "epoch": 0.3070697452987384, "grad_norm": 0.08292358368635178, "learning_rate": 4.488217091168769e-05, "loss": 0.0005, "step": 1290 }, { "epoch": 0.3094501309212092, "grad_norm": 0.02598383277654648, "learning_rate": 4.484249781797985e-05, "loss": 0.001, "step": 1300 }, { "epoch": 0.3118305165436801, "grad_norm": 0.7855332493782043, "learning_rate": 4.4802824724272005e-05, "loss": 0.0007, "step": 1310 }, { "epoch": 0.31421090216615094, "grad_norm": 0.07066315412521362, "learning_rate": 4.4763151630564155e-05, "loss": 0.0005, "step": 1320 }, { "epoch": 0.3165912877886218, "grad_norm": 0.012595695443451405, "learning_rate": 4.4723478536856305e-05, "loss": 0.0005, "step": 1330 }, { "epoch": 0.3189716734110926, "grad_norm": 0.015364304184913635, "learning_rate": 4.4683805443148455e-05, "loss": 0.0005, "step": 1340 }, { "epoch": 0.32135205903356345, "grad_norm": 0.0556706003844738, "learning_rate": 4.464413234944061e-05, "loss": 0.0011, "step": 1350 }, { "epoch": 0.3237324446560343, "grad_norm": 0.22568030655384064, "learning_rate": 4.460445925573277e-05, "loss": 0.0023, "step": 1360 }, { "epoch": 0.32611283027850513, "grad_norm": 0.048404548317193985, "learning_rate": 4.456478616202492e-05, "loss": 0.0016, "step": 1370 }, { "epoch": 0.32849321590097597, "grad_norm": 0.0693359524011612, "learning_rate": 4.452511306831707e-05, "loss": 0.0038, "step": 1380 }, { "epoch": 0.3308736015234468, "grad_norm": 0.16493481397628784, "learning_rate": 4.448543997460922e-05, "loss": 0.0006, "step": 1390 }, { "epoch": 0.33325398714591764, "grad_norm": 1.200024962425232, "learning_rate": 4.444576688090137e-05, "loss": 0.0011, "step": 1400 }, { "epoch": 0.3356343727683885, "grad_norm": 0.23021258413791656, "learning_rate": 4.440609378719353e-05, "loss": 0.0009, "step": 1410 }, { "epoch": 0.3380147583908593, "grad_norm": 0.0196574367582798, "learning_rate": 4.436642069348568e-05, "loss": 0.0006, "step": 1420 }, { "epoch": 0.34039514401333015, "grad_norm": 0.3254101574420929, "learning_rate": 4.4326747599777835e-05, "loss": 0.0015, "step": 1430 }, { "epoch": 0.342775529635801, "grad_norm": 0.026332201436161995, "learning_rate": 4.4287074506069986e-05, "loss": 0.0017, "step": 1440 }, { "epoch": 0.34515591525827183, "grad_norm": 0.2679558992385864, "learning_rate": 4.4247401412362136e-05, "loss": 0.0012, "step": 1450 }, { "epoch": 0.34753630088074267, "grad_norm": 0.06991584599018097, "learning_rate": 4.420772831865429e-05, "loss": 0.0007, "step": 1460 }, { "epoch": 0.3499166865032135, "grad_norm": 0.036999981850385666, "learning_rate": 4.416805522494644e-05, "loss": 0.001, "step": 1470 }, { "epoch": 0.35229707212568434, "grad_norm": 0.042684607207775116, "learning_rate": 4.4128382131238594e-05, "loss": 0.0009, "step": 1480 }, { "epoch": 0.3546774577481552, "grad_norm": 0.013829515315592289, "learning_rate": 4.408870903753075e-05, "loss": 0.0008, "step": 1490 }, { "epoch": 0.357057843370626, "grad_norm": 0.0129277054220438, "learning_rate": 4.40490359438229e-05, "loss": 0.0007, "step": 1500 }, { "epoch": 0.3594382289930969, "grad_norm": 0.03553192317485809, "learning_rate": 4.400936285011505e-05, "loss": 0.0008, "step": 1510 }, { "epoch": 0.36181861461556775, "grad_norm": 0.01258548628538847, "learning_rate": 4.396968975640721e-05, "loss": 0.001, "step": 1520 }, { "epoch": 0.3641990002380386, "grad_norm": 0.021352197974920273, "learning_rate": 4.393001666269936e-05, "loss": 0.001, "step": 1530 }, { "epoch": 0.3665793858605094, "grad_norm": 0.035958483815193176, "learning_rate": 4.389034356899151e-05, "loss": 0.0007, "step": 1540 }, { "epoch": 0.36895977148298026, "grad_norm": 0.013187541626393795, "learning_rate": 4.3850670475283666e-05, "loss": 0.0009, "step": 1550 }, { "epoch": 0.3713401571054511, "grad_norm": 0.02294233813881874, "learning_rate": 4.3810997381575816e-05, "loss": 0.0008, "step": 1560 }, { "epoch": 0.37372054272792193, "grad_norm": 0.14476238191127777, "learning_rate": 4.3771324287867974e-05, "loss": 0.0005, "step": 1570 }, { "epoch": 0.37610092835039277, "grad_norm": 0.2275228053331375, "learning_rate": 4.3731651194160124e-05, "loss": 0.0006, "step": 1580 }, { "epoch": 0.3784813139728636, "grad_norm": 0.020434999838471413, "learning_rate": 4.3691978100452274e-05, "loss": 0.0004, "step": 1590 }, { "epoch": 0.38086169959533445, "grad_norm": 0.01040293462574482, "learning_rate": 4.3652305006744424e-05, "loss": 0.0003, "step": 1600 }, { "epoch": 0.3832420852178053, "grad_norm": 0.0240499097853899, "learning_rate": 4.3612631913036575e-05, "loss": 0.0008, "step": 1610 }, { "epoch": 0.3856224708402761, "grad_norm": 0.014826517552137375, "learning_rate": 4.357295881932874e-05, "loss": 0.0004, "step": 1620 }, { "epoch": 0.38800285646274696, "grad_norm": 0.011841246858239174, "learning_rate": 4.353328572562089e-05, "loss": 0.0007, "step": 1630 }, { "epoch": 0.3903832420852178, "grad_norm": 0.0156678706407547, "learning_rate": 4.349361263191304e-05, "loss": 0.0006, "step": 1640 }, { "epoch": 0.39276362770768863, "grad_norm": 0.06124578043818474, "learning_rate": 4.345393953820519e-05, "loss": 0.0005, "step": 1650 }, { "epoch": 0.39514401333015947, "grad_norm": 0.06753918528556824, "learning_rate": 4.341426644449734e-05, "loss": 0.0006, "step": 1660 }, { "epoch": 0.3975243989526303, "grad_norm": 0.08766347169876099, "learning_rate": 4.33745933507895e-05, "loss": 0.0003, "step": 1670 }, { "epoch": 0.39990478457510115, "grad_norm": 0.021080242469906807, "learning_rate": 4.3334920257081654e-05, "loss": 0.0008, "step": 1680 }, { "epoch": 0.402285170197572, "grad_norm": 0.11970046162605286, "learning_rate": 4.3295247163373804e-05, "loss": 0.0005, "step": 1690 }, { "epoch": 0.4046655558200428, "grad_norm": 0.027210582047700882, "learning_rate": 4.3255574069665955e-05, "loss": 0.0003, "step": 1700 }, { "epoch": 0.4070459414425137, "grad_norm": 0.021168386563658714, "learning_rate": 4.3215900975958105e-05, "loss": 0.0005, "step": 1710 }, { "epoch": 0.40942632706498455, "grad_norm": 0.012768070213496685, "learning_rate": 4.3176227882250255e-05, "loss": 0.0005, "step": 1720 }, { "epoch": 0.4118067126874554, "grad_norm": 0.1276211142539978, "learning_rate": 4.313655478854241e-05, "loss": 0.0005, "step": 1730 }, { "epoch": 0.4141870983099262, "grad_norm": 0.08978109806776047, "learning_rate": 4.309688169483457e-05, "loss": 0.0009, "step": 1740 }, { "epoch": 0.41656748393239706, "grad_norm": 0.3068161606788635, "learning_rate": 4.305720860112672e-05, "loss": 0.0007, "step": 1750 }, { "epoch": 0.4189478695548679, "grad_norm": 0.01211560145020485, "learning_rate": 4.301753550741887e-05, "loss": 0.0006, "step": 1760 }, { "epoch": 0.42132825517733874, "grad_norm": 0.02517927996814251, "learning_rate": 4.297786241371102e-05, "loss": 0.0006, "step": 1770 }, { "epoch": 0.4237086407998096, "grad_norm": 0.017450081184506416, "learning_rate": 4.293818932000318e-05, "loss": 0.0003, "step": 1780 }, { "epoch": 0.4260890264222804, "grad_norm": 0.014250938780605793, "learning_rate": 4.289851622629533e-05, "loss": 0.0005, "step": 1790 }, { "epoch": 0.42846941204475125, "grad_norm": 0.027526648715138435, "learning_rate": 4.2858843132587485e-05, "loss": 0.0005, "step": 1800 }, { "epoch": 0.4308497976672221, "grad_norm": 0.0071271262131631374, "learning_rate": 4.2819170038879635e-05, "loss": 0.0007, "step": 1810 }, { "epoch": 0.4332301832896929, "grad_norm": 0.11835234612226486, "learning_rate": 4.2779496945171785e-05, "loss": 0.0005, "step": 1820 }, { "epoch": 0.43561056891216376, "grad_norm": 0.016718665137887, "learning_rate": 4.273982385146394e-05, "loss": 0.0007, "step": 1830 }, { "epoch": 0.4379909545346346, "grad_norm": 0.04138866439461708, "learning_rate": 4.270015075775609e-05, "loss": 0.0005, "step": 1840 }, { "epoch": 0.44037134015710544, "grad_norm": 0.5920994281768799, "learning_rate": 4.266047766404824e-05, "loss": 0.0009, "step": 1850 }, { "epoch": 0.4427517257795763, "grad_norm": 0.010394711047410965, "learning_rate": 4.2620804570340393e-05, "loss": 0.0004, "step": 1860 }, { "epoch": 0.4451321114020471, "grad_norm": 0.031543031334877014, "learning_rate": 4.258113147663255e-05, "loss": 0.0005, "step": 1870 }, { "epoch": 0.44751249702451795, "grad_norm": 0.016665128991007805, "learning_rate": 4.25414583829247e-05, "loss": 0.0083, "step": 1880 }, { "epoch": 0.4498928826469888, "grad_norm": 0.03811788931488991, "learning_rate": 4.250178528921686e-05, "loss": 0.0014, "step": 1890 }, { "epoch": 0.4522732682694596, "grad_norm": 0.0656796246767044, "learning_rate": 4.246211219550901e-05, "loss": 0.0008, "step": 1900 }, { "epoch": 0.4546536538919305, "grad_norm": 0.011904909275472164, "learning_rate": 4.242243910180116e-05, "loss": 0.0006, "step": 1910 }, { "epoch": 0.45703403951440136, "grad_norm": 0.01850457303225994, "learning_rate": 4.238276600809331e-05, "loss": 0.0004, "step": 1920 }, { "epoch": 0.4594144251368722, "grad_norm": 0.10309766978025436, "learning_rate": 4.2343092914385466e-05, "loss": 0.0005, "step": 1930 }, { "epoch": 0.46179481075934303, "grad_norm": 0.13206863403320312, "learning_rate": 4.230341982067762e-05, "loss": 0.0004, "step": 1940 }, { "epoch": 0.46417519638181387, "grad_norm": 0.010924161411821842, "learning_rate": 4.226374672696977e-05, "loss": 0.0003, "step": 1950 }, { "epoch": 0.4665555820042847, "grad_norm": 0.013266120105981827, "learning_rate": 4.2224073633261924e-05, "loss": 0.0004, "step": 1960 }, { "epoch": 0.46893596762675555, "grad_norm": 0.008552256040275097, "learning_rate": 4.2184400539554074e-05, "loss": 0.0003, "step": 1970 }, { "epoch": 0.4713163532492264, "grad_norm": 0.0052538709715008736, "learning_rate": 4.2144727445846224e-05, "loss": 0.0005, "step": 1980 }, { "epoch": 0.4736967388716972, "grad_norm": 0.0074672214686870575, "learning_rate": 4.210505435213838e-05, "loss": 0.0003, "step": 1990 }, { "epoch": 0.47607712449416806, "grad_norm": 0.5743750929832458, "learning_rate": 4.206538125843054e-05, "loss": 0.0005, "step": 2000 }, { "epoch": 0.4784575101166389, "grad_norm": 0.0076432847417891026, "learning_rate": 4.202570816472269e-05, "loss": 0.0005, "step": 2010 }, { "epoch": 0.48083789573910973, "grad_norm": 0.09265641123056412, "learning_rate": 4.198603507101484e-05, "loss": 0.0003, "step": 2020 }, { "epoch": 0.48321828136158057, "grad_norm": 0.01519245095551014, "learning_rate": 4.194636197730699e-05, "loss": 0.0002, "step": 2030 }, { "epoch": 0.4855986669840514, "grad_norm": 0.04831220954656601, "learning_rate": 4.1906688883599146e-05, "loss": 0.0003, "step": 2040 }, { "epoch": 0.48797905260652225, "grad_norm": 0.024797851219773293, "learning_rate": 4.18670157898913e-05, "loss": 0.0004, "step": 2050 }, { "epoch": 0.4903594382289931, "grad_norm": 0.008994129486382008, "learning_rate": 4.1827342696183454e-05, "loss": 0.0002, "step": 2060 }, { "epoch": 0.4927398238514639, "grad_norm": 0.00806290004402399, "learning_rate": 4.1787669602475604e-05, "loss": 0.0004, "step": 2070 }, { "epoch": 0.49512020947393476, "grad_norm": 0.003900889540091157, "learning_rate": 4.1747996508767754e-05, "loss": 0.0002, "step": 2080 }, { "epoch": 0.4975005950964056, "grad_norm": 0.00262014614418149, "learning_rate": 4.1708323415059905e-05, "loss": 0.0002, "step": 2090 }, { "epoch": 0.49988098071887643, "grad_norm": 0.30837100744247437, "learning_rate": 4.166865032135206e-05, "loss": 0.0004, "step": 2100 }, { "epoch": 0.5022613663413473, "grad_norm": 0.5304675102233887, "learning_rate": 4.162897722764421e-05, "loss": 0.0003, "step": 2110 }, { "epoch": 0.5046417519638181, "grad_norm": 0.3627573847770691, "learning_rate": 4.158930413393637e-05, "loss": 0.0043, "step": 2120 }, { "epoch": 0.507022137586289, "grad_norm": 0.011327610351145267, "learning_rate": 4.154963104022852e-05, "loss": 0.0005, "step": 2130 }, { "epoch": 0.5094025232087598, "grad_norm": 0.055182114243507385, "learning_rate": 4.150995794652067e-05, "loss": 0.0005, "step": 2140 }, { "epoch": 0.5117829088312307, "grad_norm": 0.009911212138831615, "learning_rate": 4.147028485281283e-05, "loss": 0.0004, "step": 2150 }, { "epoch": 0.5141632944537015, "grad_norm": 0.028569847345352173, "learning_rate": 4.143061175910498e-05, "loss": 0.0003, "step": 2160 }, { "epoch": 0.5165436800761724, "grad_norm": 0.0070992144756019115, "learning_rate": 4.139093866539713e-05, "loss": 0.0006, "step": 2170 }, { "epoch": 0.5189240656986431, "grad_norm": 0.008213848806917667, "learning_rate": 4.1351265571689285e-05, "loss": 0.0002, "step": 2180 }, { "epoch": 0.521304451321114, "grad_norm": 0.018964022397994995, "learning_rate": 4.1311592477981435e-05, "loss": 0.0003, "step": 2190 }, { "epoch": 0.5236848369435848, "grad_norm": 0.004533541388809681, "learning_rate": 4.1271919384273585e-05, "loss": 0.0003, "step": 2200 }, { "epoch": 0.5260652225660557, "grad_norm": 0.12422726303339005, "learning_rate": 4.123224629056574e-05, "loss": 0.0003, "step": 2210 }, { "epoch": 0.5284456081885266, "grad_norm": 0.019521724432706833, "learning_rate": 4.119257319685789e-05, "loss": 0.0003, "step": 2220 }, { "epoch": 0.5308259938109974, "grad_norm": 0.03547817841172218, "learning_rate": 4.115290010315004e-05, "loss": 0.0004, "step": 2230 }, { "epoch": 0.5332063794334683, "grad_norm": 0.9750944375991821, "learning_rate": 4.111322700944219e-05, "loss": 0.0005, "step": 2240 }, { "epoch": 0.535586765055939, "grad_norm": 0.09758254885673523, "learning_rate": 4.107355391573435e-05, "loss": 0.0004, "step": 2250 }, { "epoch": 0.5379671506784099, "grad_norm": 0.20201332867145538, "learning_rate": 4.103388082202651e-05, "loss": 0.0008, "step": 2260 }, { "epoch": 0.5403475363008807, "grad_norm": 0.2006085067987442, "learning_rate": 4.099420772831866e-05, "loss": 0.0008, "step": 2270 }, { "epoch": 0.5427279219233516, "grad_norm": 0.0802696943283081, "learning_rate": 4.095453463461081e-05, "loss": 0.0007, "step": 2280 }, { "epoch": 0.5451083075458224, "grad_norm": 0.4039531350135803, "learning_rate": 4.091486154090296e-05, "loss": 0.0024, "step": 2290 }, { "epoch": 0.5474886931682933, "grad_norm": 0.006702470127493143, "learning_rate": 4.087518844719511e-05, "loss": 0.0007, "step": 2300 }, { "epoch": 0.5498690787907641, "grad_norm": 0.1001976877450943, "learning_rate": 4.083551535348727e-05, "loss": 0.0003, "step": 2310 }, { "epoch": 0.552249464413235, "grad_norm": 0.005626179743558168, "learning_rate": 4.079584225977942e-05, "loss": 0.0009, "step": 2320 }, { "epoch": 0.5546298500357058, "grad_norm": 0.009593102149665356, "learning_rate": 4.075616916607157e-05, "loss": 0.0003, "step": 2330 }, { "epoch": 0.5570102356581766, "grad_norm": 0.014003382995724678, "learning_rate": 4.0716496072363723e-05, "loss": 0.0003, "step": 2340 }, { "epoch": 0.5593906212806474, "grad_norm": 0.012953966856002808, "learning_rate": 4.0676822978655874e-05, "loss": 0.0004, "step": 2350 }, { "epoch": 0.5617710069031183, "grad_norm": 0.007770949974656105, "learning_rate": 4.063714988494803e-05, "loss": 0.0006, "step": 2360 }, { "epoch": 0.5641513925255891, "grad_norm": 0.01227940246462822, "learning_rate": 4.059747679124019e-05, "loss": 0.0003, "step": 2370 }, { "epoch": 0.56653177814806, "grad_norm": 0.2204684615135193, "learning_rate": 4.055780369753234e-05, "loss": 0.0003, "step": 2380 }, { "epoch": 0.5689121637705308, "grad_norm": 0.03364790603518486, "learning_rate": 4.051813060382449e-05, "loss": 0.0003, "step": 2390 }, { "epoch": 0.5712925493930017, "grad_norm": 0.049715492874383926, "learning_rate": 4.047845751011664e-05, "loss": 0.0003, "step": 2400 }, { "epoch": 0.5736729350154725, "grad_norm": 0.028070533648133278, "learning_rate": 4.0438784416408796e-05, "loss": 0.0007, "step": 2410 }, { "epoch": 0.5760533206379433, "grad_norm": 0.020421486347913742, "learning_rate": 4.0399111322700946e-05, "loss": 0.0009, "step": 2420 }, { "epoch": 0.5784337062604142, "grad_norm": 0.010064806789159775, "learning_rate": 4.0359438228993097e-05, "loss": 0.0008, "step": 2430 }, { "epoch": 0.580814091882885, "grad_norm": 0.3017018735408783, "learning_rate": 4.0319765135285254e-05, "loss": 0.001, "step": 2440 }, { "epoch": 0.5831944775053559, "grad_norm": 0.20759595930576324, "learning_rate": 4.0280092041577404e-05, "loss": 0.0003, "step": 2450 }, { "epoch": 0.5855748631278267, "grad_norm": 0.016160350292921066, "learning_rate": 4.0240418947869554e-05, "loss": 0.0006, "step": 2460 }, { "epoch": 0.5879552487502976, "grad_norm": 0.5293152332305908, "learning_rate": 4.020074585416171e-05, "loss": 0.0011, "step": 2470 }, { "epoch": 0.5903356343727684, "grad_norm": 0.007493559271097183, "learning_rate": 4.016107276045386e-05, "loss": 0.0004, "step": 2480 }, { "epoch": 0.5927160199952393, "grad_norm": 0.018649157136678696, "learning_rate": 4.012139966674601e-05, "loss": 0.0005, "step": 2490 }, { "epoch": 0.59509640561771, "grad_norm": 0.01135182660073042, "learning_rate": 4.008172657303817e-05, "loss": 0.0004, "step": 2500 }, { "epoch": 0.5974767912401809, "grad_norm": 0.0733335018157959, "learning_rate": 4.004205347933032e-05, "loss": 0.0005, "step": 2510 }, { "epoch": 0.5998571768626517, "grad_norm": 0.02785026654601097, "learning_rate": 4.0002380385622476e-05, "loss": 0.0003, "step": 2520 }, { "epoch": 0.6022375624851226, "grad_norm": 0.005258665420114994, "learning_rate": 3.996270729191463e-05, "loss": 0.0002, "step": 2530 }, { "epoch": 0.6046179481075934, "grad_norm": 0.006735061760991812, "learning_rate": 3.992303419820678e-05, "loss": 0.0003, "step": 2540 }, { "epoch": 0.6069983337300643, "grad_norm": 0.008341578766703606, "learning_rate": 3.988336110449893e-05, "loss": 0.0003, "step": 2550 }, { "epoch": 0.6093787193525351, "grad_norm": 0.0027205003425478935, "learning_rate": 3.9843688010791084e-05, "loss": 0.0003, "step": 2560 }, { "epoch": 0.611759104975006, "grad_norm": 0.01718416064977646, "learning_rate": 3.9804014917083235e-05, "loss": 0.0005, "step": 2570 }, { "epoch": 0.6141394905974767, "grad_norm": 0.06104213371872902, "learning_rate": 3.976434182337539e-05, "loss": 0.0002, "step": 2580 }, { "epoch": 0.6165198762199476, "grad_norm": 0.008454731665551662, "learning_rate": 3.972466872966754e-05, "loss": 0.0001, "step": 2590 }, { "epoch": 0.6189002618424184, "grad_norm": 0.006591182202100754, "learning_rate": 3.968499563595969e-05, "loss": 0.0002, "step": 2600 }, { "epoch": 0.6212806474648893, "grad_norm": 0.009718428365886211, "learning_rate": 3.964532254225184e-05, "loss": 0.0019, "step": 2610 }, { "epoch": 0.6236610330873602, "grad_norm": 0.0156183410435915, "learning_rate": 3.9605649448544e-05, "loss": 0.0002, "step": 2620 }, { "epoch": 0.626041418709831, "grad_norm": 0.012816215865314007, "learning_rate": 3.956597635483616e-05, "loss": 0.0008, "step": 2630 }, { "epoch": 0.6284218043323019, "grad_norm": 0.0211672130972147, "learning_rate": 3.952630326112831e-05, "loss": 0.0002, "step": 2640 }, { "epoch": 0.6308021899547727, "grad_norm": 0.012701870873570442, "learning_rate": 3.948663016742046e-05, "loss": 0.0003, "step": 2650 }, { "epoch": 0.6331825755772436, "grad_norm": 0.008668744005262852, "learning_rate": 3.944695707371261e-05, "loss": 0.0002, "step": 2660 }, { "epoch": 0.6355629611997143, "grad_norm": 0.020911380648612976, "learning_rate": 3.940728398000476e-05, "loss": 0.0004, "step": 2670 }, { "epoch": 0.6379433468221852, "grad_norm": 0.0015960232121869922, "learning_rate": 3.9367610886296915e-05, "loss": 0.0002, "step": 2680 }, { "epoch": 0.640323732444656, "grad_norm": 0.01783674582839012, "learning_rate": 3.932793779258907e-05, "loss": 0.0001, "step": 2690 }, { "epoch": 0.6427041180671269, "grad_norm": 0.006887937895953655, "learning_rate": 3.928826469888122e-05, "loss": 0.0002, "step": 2700 }, { "epoch": 0.6450845036895977, "grad_norm": 0.004555295687168837, "learning_rate": 3.924859160517337e-05, "loss": 0.0002, "step": 2710 }, { "epoch": 0.6474648893120686, "grad_norm": 0.00994735024869442, "learning_rate": 3.920891851146552e-05, "loss": 0.0003, "step": 2720 }, { "epoch": 0.6498452749345394, "grad_norm": 0.03482622653245926, "learning_rate": 3.916924541775768e-05, "loss": 0.0002, "step": 2730 }, { "epoch": 0.6522256605570103, "grad_norm": 0.06792888045310974, "learning_rate": 3.912957232404983e-05, "loss": 0.0002, "step": 2740 }, { "epoch": 0.654606046179481, "grad_norm": 0.02015574462711811, "learning_rate": 3.908989923034199e-05, "loss": 0.0008, "step": 2750 }, { "epoch": 0.6569864318019519, "grad_norm": 0.07359887659549713, "learning_rate": 3.905022613663414e-05, "loss": 0.0003, "step": 2760 }, { "epoch": 0.6593668174244227, "grad_norm": 0.006248469930142164, "learning_rate": 3.901055304292629e-05, "loss": 0.0002, "step": 2770 }, { "epoch": 0.6617472030468936, "grad_norm": 0.01739078015089035, "learning_rate": 3.897087994921844e-05, "loss": 0.0002, "step": 2780 }, { "epoch": 0.6641275886693644, "grad_norm": 0.008228071965277195, "learning_rate": 3.8931206855510596e-05, "loss": 0.0005, "step": 2790 }, { "epoch": 0.6665079742918353, "grad_norm": 0.012569721788167953, "learning_rate": 3.8891533761802746e-05, "loss": 0.0002, "step": 2800 }, { "epoch": 0.6688883599143061, "grad_norm": 0.003245885483920574, "learning_rate": 3.88518606680949e-05, "loss": 0.0001, "step": 2810 }, { "epoch": 0.671268745536777, "grad_norm": 0.010106906294822693, "learning_rate": 3.8812187574387053e-05, "loss": 0.0002, "step": 2820 }, { "epoch": 0.6736491311592478, "grad_norm": 0.0013821216998621821, "learning_rate": 3.8772514480679204e-05, "loss": 0.0002, "step": 2830 }, { "epoch": 0.6760295167817186, "grad_norm": 0.008525123819708824, "learning_rate": 3.873284138697136e-05, "loss": 0.0001, "step": 2840 }, { "epoch": 0.6784099024041895, "grad_norm": 0.0045269266702234745, "learning_rate": 3.869316829326351e-05, "loss": 0.0001, "step": 2850 }, { "epoch": 0.6807902880266603, "grad_norm": 0.005178367253392935, "learning_rate": 3.865349519955566e-05, "loss": 0.0002, "step": 2860 }, { "epoch": 0.6831706736491312, "grad_norm": 0.015604405663907528, "learning_rate": 3.861382210584781e-05, "loss": 0.0001, "step": 2870 }, { "epoch": 0.685551059271602, "grad_norm": 0.7911249399185181, "learning_rate": 3.857414901213997e-05, "loss": 0.0002, "step": 2880 }, { "epoch": 0.6879314448940729, "grad_norm": 0.005056778434664011, "learning_rate": 3.853447591843212e-05, "loss": 0.0003, "step": 2890 }, { "epoch": 0.6903118305165437, "grad_norm": 0.007354553788900375, "learning_rate": 3.8494802824724276e-05, "loss": 0.0002, "step": 2900 }, { "epoch": 0.6926922161390145, "grad_norm": 0.10069092363119125, "learning_rate": 3.8455129731016427e-05, "loss": 0.0003, "step": 2910 }, { "epoch": 0.6950726017614853, "grad_norm": 0.007913509383797646, "learning_rate": 3.841545663730858e-05, "loss": 0.0005, "step": 2920 }, { "epoch": 0.6974529873839562, "grad_norm": 0.04653599485754967, "learning_rate": 3.837578354360073e-05, "loss": 0.0005, "step": 2930 }, { "epoch": 0.699833373006427, "grad_norm": 0.007795447017997503, "learning_rate": 3.8336110449892884e-05, "loss": 0.0002, "step": 2940 }, { "epoch": 0.7022137586288979, "grad_norm": 0.0843840092420578, "learning_rate": 3.829643735618504e-05, "loss": 0.0008, "step": 2950 }, { "epoch": 0.7045941442513687, "grad_norm": 0.019790470600128174, "learning_rate": 3.825676426247719e-05, "loss": 0.0004, "step": 2960 }, { "epoch": 0.7069745298738396, "grad_norm": 0.04970049858093262, "learning_rate": 3.821709116876934e-05, "loss": 0.0008, "step": 2970 }, { "epoch": 0.7093549154963104, "grad_norm": 0.011334414593875408, "learning_rate": 3.817741807506149e-05, "loss": 0.0003, "step": 2980 }, { "epoch": 0.7117353011187812, "grad_norm": 0.12627428770065308, "learning_rate": 3.813774498135364e-05, "loss": 0.0006, "step": 2990 }, { "epoch": 0.714115686741252, "grad_norm": 0.03299270570278168, "learning_rate": 3.8098071887645806e-05, "loss": 0.0006, "step": 3000 }, { "epoch": 0.7164960723637229, "grad_norm": 0.014470428228378296, "learning_rate": 3.805839879393796e-05, "loss": 0.0002, "step": 3010 }, { "epoch": 0.7188764579861938, "grad_norm": 0.010081595741212368, "learning_rate": 3.801872570023011e-05, "loss": 0.0002, "step": 3020 }, { "epoch": 0.7212568436086646, "grad_norm": 0.006527799181640148, "learning_rate": 3.797905260652226e-05, "loss": 0.0005, "step": 3030 }, { "epoch": 0.7236372292311355, "grad_norm": 0.025967439636588097, "learning_rate": 3.793937951281441e-05, "loss": 0.0003, "step": 3040 }, { "epoch": 0.7260176148536063, "grad_norm": 0.012788872234523296, "learning_rate": 3.7899706419106565e-05, "loss": 0.0006, "step": 3050 }, { "epoch": 0.7283980004760772, "grad_norm": 0.05159073323011398, "learning_rate": 3.7860033325398715e-05, "loss": 0.0002, "step": 3060 }, { "epoch": 0.730778386098548, "grad_norm": 0.09669562429189682, "learning_rate": 3.782036023169087e-05, "loss": 0.0003, "step": 3070 }, { "epoch": 0.7331587717210188, "grad_norm": 0.0008232035324908793, "learning_rate": 3.778068713798302e-05, "loss": 0.0002, "step": 3080 }, { "epoch": 0.7355391573434896, "grad_norm": 0.0026904919650405645, "learning_rate": 3.774101404427517e-05, "loss": 0.0008, "step": 3090 }, { "epoch": 0.7379195429659605, "grad_norm": 0.22064454853534698, "learning_rate": 3.770134095056733e-05, "loss": 0.0001, "step": 3100 }, { "epoch": 0.7402999285884313, "grad_norm": 0.0037417325656861067, "learning_rate": 3.766166785685948e-05, "loss": 0.0002, "step": 3110 }, { "epoch": 0.7426803142109022, "grad_norm": 0.008903945796191692, "learning_rate": 3.762199476315163e-05, "loss": 0.0004, "step": 3120 }, { "epoch": 0.745060699833373, "grad_norm": 0.01190115325152874, "learning_rate": 3.758232166944379e-05, "loss": 0.0003, "step": 3130 }, { "epoch": 0.7474410854558439, "grad_norm": 0.005016674287617207, "learning_rate": 3.754264857573594e-05, "loss": 0.0002, "step": 3140 }, { "epoch": 0.7498214710783146, "grad_norm": 0.009286819957196712, "learning_rate": 3.750297548202809e-05, "loss": 0.0002, "step": 3150 }, { "epoch": 0.7522018567007855, "grad_norm": 0.06282204389572144, "learning_rate": 3.7463302388320245e-05, "loss": 0.0003, "step": 3160 }, { "epoch": 0.7545822423232563, "grad_norm": 0.008628441952168941, "learning_rate": 3.7423629294612396e-05, "loss": 0.0003, "step": 3170 }, { "epoch": 0.7569626279457272, "grad_norm": 0.03511732071638107, "learning_rate": 3.7383956200904546e-05, "loss": 0.0001, "step": 3180 }, { "epoch": 0.759343013568198, "grad_norm": 0.003294560592621565, "learning_rate": 3.73442831071967e-05, "loss": 0.0003, "step": 3190 }, { "epoch": 0.7617233991906689, "grad_norm": 0.032009340822696686, "learning_rate": 3.730461001348885e-05, "loss": 0.0002, "step": 3200 }, { "epoch": 0.7641037848131397, "grad_norm": 0.022615088149905205, "learning_rate": 3.726493691978101e-05, "loss": 0.0002, "step": 3210 }, { "epoch": 0.7664841704356106, "grad_norm": 0.0026582872960716486, "learning_rate": 3.722526382607316e-05, "loss": 0.0001, "step": 3220 }, { "epoch": 0.7688645560580815, "grad_norm": 0.3148833215236664, "learning_rate": 3.718559073236531e-05, "loss": 0.0002, "step": 3230 }, { "epoch": 0.7712449416805522, "grad_norm": 0.03451314941048622, "learning_rate": 3.714591763865746e-05, "loss": 0.0002, "step": 3240 }, { "epoch": 0.7736253273030231, "grad_norm": 0.008008177392184734, "learning_rate": 3.710624454494961e-05, "loss": 0.0001, "step": 3250 }, { "epoch": 0.7760057129254939, "grad_norm": 0.07701031118631363, "learning_rate": 3.706657145124177e-05, "loss": 0.0005, "step": 3260 }, { "epoch": 0.7783860985479648, "grad_norm": 0.010465078055858612, "learning_rate": 3.7026898357533926e-05, "loss": 0.0002, "step": 3270 }, { "epoch": 0.7807664841704356, "grad_norm": 0.00499736238270998, "learning_rate": 3.6987225263826076e-05, "loss": 0.0007, "step": 3280 }, { "epoch": 0.7831468697929065, "grad_norm": 0.6453936696052551, "learning_rate": 3.6947552170118226e-05, "loss": 0.0003, "step": 3290 }, { "epoch": 0.7855272554153773, "grad_norm": 0.016864465549588203, "learning_rate": 3.690787907641038e-05, "loss": 0.0003, "step": 3300 }, { "epoch": 0.7879076410378482, "grad_norm": 0.05074018985033035, "learning_rate": 3.6868205982702534e-05, "loss": 0.0002, "step": 3310 }, { "epoch": 0.7902880266603189, "grad_norm": 0.006529835984110832, "learning_rate": 3.682853288899469e-05, "loss": 0.0005, "step": 3320 }, { "epoch": 0.7926684122827898, "grad_norm": 0.041339557617902756, "learning_rate": 3.678885979528684e-05, "loss": 0.0004, "step": 3330 }, { "epoch": 0.7950487979052606, "grad_norm": 0.006891661789268255, "learning_rate": 3.674918670157899e-05, "loss": 0.0004, "step": 3340 }, { "epoch": 0.7974291835277315, "grad_norm": 0.01043302658945322, "learning_rate": 3.670951360787114e-05, "loss": 0.0003, "step": 3350 }, { "epoch": 0.7998095691502023, "grad_norm": 0.01914358325302601, "learning_rate": 3.666984051416329e-05, "loss": 0.0008, "step": 3360 }, { "epoch": 0.8021899547726732, "grad_norm": 0.016266925260424614, "learning_rate": 3.663016742045545e-05, "loss": 0.0004, "step": 3370 }, { "epoch": 0.804570340395144, "grad_norm": 0.005765034817159176, "learning_rate": 3.6590494326747606e-05, "loss": 0.0002, "step": 3380 }, { "epoch": 0.8069507260176149, "grad_norm": 0.007664472330361605, "learning_rate": 3.6550821233039757e-05, "loss": 0.0002, "step": 3390 }, { "epoch": 0.8093311116400856, "grad_norm": 0.00499699218198657, "learning_rate": 3.651114813933191e-05, "loss": 0.0001, "step": 3400 }, { "epoch": 0.8117114972625565, "grad_norm": 0.012575655244290829, "learning_rate": 3.647147504562406e-05, "loss": 0.0002, "step": 3410 }, { "epoch": 0.8140918828850274, "grad_norm": 0.010001065209507942, "learning_rate": 3.6431801951916214e-05, "loss": 0.0005, "step": 3420 }, { "epoch": 0.8164722685074982, "grad_norm": 0.06131220981478691, "learning_rate": 3.6392128858208365e-05, "loss": 0.0002, "step": 3430 }, { "epoch": 0.8188526541299691, "grad_norm": 0.037141721695661545, "learning_rate": 3.6352455764500515e-05, "loss": 0.0001, "step": 3440 }, { "epoch": 0.8212330397524399, "grad_norm": 0.05955801159143448, "learning_rate": 3.631278267079267e-05, "loss": 0.0005, "step": 3450 }, { "epoch": 0.8236134253749108, "grad_norm": 0.012499036267399788, "learning_rate": 3.627310957708482e-05, "loss": 0.0002, "step": 3460 }, { "epoch": 0.8259938109973816, "grad_norm": 0.007782169617712498, "learning_rate": 3.623343648337697e-05, "loss": 0.0004, "step": 3470 }, { "epoch": 0.8283741966198525, "grad_norm": 0.016740377992391586, "learning_rate": 3.619376338966913e-05, "loss": 0.0004, "step": 3480 }, { "epoch": 0.8307545822423232, "grad_norm": 0.05157579854130745, "learning_rate": 3.615409029596128e-05, "loss": 0.0003, "step": 3490 }, { "epoch": 0.8331349678647941, "grad_norm": 0.00816064327955246, "learning_rate": 3.611441720225343e-05, "loss": 0.0003, "step": 3500 }, { "epoch": 0.8355153534872649, "grad_norm": 0.02470710128545761, "learning_rate": 3.607474410854559e-05, "loss": 0.0002, "step": 3510 }, { "epoch": 0.8378957391097358, "grad_norm": 0.004836896900087595, "learning_rate": 3.603507101483774e-05, "loss": 0.0001, "step": 3520 }, { "epoch": 0.8402761247322066, "grad_norm": 0.003796802368015051, "learning_rate": 3.5995397921129895e-05, "loss": 0.0002, "step": 3530 }, { "epoch": 0.8426565103546775, "grad_norm": 0.006737705785781145, "learning_rate": 3.5955724827422045e-05, "loss": 0.0003, "step": 3540 }, { "epoch": 0.8450368959771483, "grad_norm": 0.0021388079039752483, "learning_rate": 3.5916051733714195e-05, "loss": 0.0001, "step": 3550 }, { "epoch": 0.8474172815996192, "grad_norm": 0.047663912177085876, "learning_rate": 3.5876378640006346e-05, "loss": 0.0001, "step": 3560 }, { "epoch": 0.8497976672220899, "grad_norm": 0.015320863574743271, "learning_rate": 3.58367055462985e-05, "loss": 0.0002, "step": 3570 }, { "epoch": 0.8521780528445608, "grad_norm": 0.008627827279269695, "learning_rate": 3.579703245259065e-05, "loss": 0.0003, "step": 3580 }, { "epoch": 0.8545584384670316, "grad_norm": 0.0034904240164905787, "learning_rate": 3.575735935888281e-05, "loss": 0.0001, "step": 3590 }, { "epoch": 0.8569388240895025, "grad_norm": 0.01078026182949543, "learning_rate": 3.571768626517496e-05, "loss": 0.0002, "step": 3600 }, { "epoch": 0.8593192097119733, "grad_norm": 0.011285877786576748, "learning_rate": 3.567801317146711e-05, "loss": 0.0007, "step": 3610 }, { "epoch": 0.8616995953344442, "grad_norm": 0.005885743070393801, "learning_rate": 3.563834007775926e-05, "loss": 0.0003, "step": 3620 }, { "epoch": 0.8640799809569151, "grad_norm": 0.1011798158288002, "learning_rate": 3.559866698405142e-05, "loss": 0.0002, "step": 3630 }, { "epoch": 0.8664603665793859, "grad_norm": 0.012861615046858788, "learning_rate": 3.5558993890343575e-05, "loss": 0.0002, "step": 3640 }, { "epoch": 0.8688407522018567, "grad_norm": 0.009324765764176846, "learning_rate": 3.5519320796635726e-05, "loss": 0.0007, "step": 3650 }, { "epoch": 0.8712211378243275, "grad_norm": 0.0035065708216279745, "learning_rate": 3.5479647702927876e-05, "loss": 0.0004, "step": 3660 }, { "epoch": 0.8736015234467984, "grad_norm": 0.010472165420651436, "learning_rate": 3.5439974609220026e-05, "loss": 0.0001, "step": 3670 }, { "epoch": 0.8759819090692692, "grad_norm": 0.009073158726096153, "learning_rate": 3.5400301515512176e-05, "loss": 0.0001, "step": 3680 }, { "epoch": 0.8783622946917401, "grad_norm": 0.0028665116988122463, "learning_rate": 3.5360628421804334e-05, "loss": 0.0001, "step": 3690 }, { "epoch": 0.8807426803142109, "grad_norm": 0.009178753942251205, "learning_rate": 3.532095532809649e-05, "loss": 0.0003, "step": 3700 }, { "epoch": 0.8831230659366818, "grad_norm": 0.007954353466629982, "learning_rate": 3.528128223438864e-05, "loss": 0.0006, "step": 3710 }, { "epoch": 0.8855034515591526, "grad_norm": 0.009399271570146084, "learning_rate": 3.524160914068079e-05, "loss": 0.0002, "step": 3720 }, { "epoch": 0.8878838371816234, "grad_norm": 0.0035749957896769047, "learning_rate": 3.520193604697294e-05, "loss": 0.0001, "step": 3730 }, { "epoch": 0.8902642228040942, "grad_norm": 0.007753758691251278, "learning_rate": 3.51622629532651e-05, "loss": 0.0001, "step": 3740 }, { "epoch": 0.8926446084265651, "grad_norm": 0.007471214048564434, "learning_rate": 3.512258985955725e-05, "loss": 0.0003, "step": 3750 }, { "epoch": 0.8950249940490359, "grad_norm": 0.016612932085990906, "learning_rate": 3.5082916765849406e-05, "loss": 0.0001, "step": 3760 }, { "epoch": 0.8974053796715068, "grad_norm": 0.008320000022649765, "learning_rate": 3.5043243672141556e-05, "loss": 0.0001, "step": 3770 }, { "epoch": 0.8997857652939776, "grad_norm": 0.010242090560495853, "learning_rate": 3.500357057843371e-05, "loss": 0.0001, "step": 3780 }, { "epoch": 0.9021661509164485, "grad_norm": 0.0036350861191749573, "learning_rate": 3.4963897484725864e-05, "loss": 0.0001, "step": 3790 }, { "epoch": 0.9045465365389193, "grad_norm": 0.002153201960027218, "learning_rate": 3.4924224391018014e-05, "loss": 0.0002, "step": 3800 }, { "epoch": 0.9069269221613901, "grad_norm": 0.003587006125599146, "learning_rate": 3.4884551297310164e-05, "loss": 0.0002, "step": 3810 }, { "epoch": 0.909307307783861, "grad_norm": 0.006511629093438387, "learning_rate": 3.4844878203602315e-05, "loss": 0.0002, "step": 3820 }, { "epoch": 0.9116876934063318, "grad_norm": 0.008945467881858349, "learning_rate": 3.480520510989447e-05, "loss": 0.0001, "step": 3830 }, { "epoch": 0.9140680790288027, "grad_norm": 0.006604051683098078, "learning_rate": 3.476553201618662e-05, "loss": 0.0001, "step": 3840 }, { "epoch": 0.9164484646512735, "grad_norm": 0.0031156474724411964, "learning_rate": 3.472585892247878e-05, "loss": 0.0003, "step": 3850 }, { "epoch": 0.9188288502737444, "grad_norm": 0.005195919424295425, "learning_rate": 3.468618582877093e-05, "loss": 0.0001, "step": 3860 }, { "epoch": 0.9212092358962152, "grad_norm": 0.008878687396645546, "learning_rate": 3.464651273506308e-05, "loss": 0.0001, "step": 3870 }, { "epoch": 0.9235896215186861, "grad_norm": 0.0020940713584423065, "learning_rate": 3.460683964135523e-05, "loss": 0.0001, "step": 3880 }, { "epoch": 0.9259700071411568, "grad_norm": 0.0066345930099487305, "learning_rate": 3.456716654764739e-05, "loss": 0.0001, "step": 3890 }, { "epoch": 0.9283503927636277, "grad_norm": 0.0018133444245904684, "learning_rate": 3.4527493453939544e-05, "loss": 0.0001, "step": 3900 }, { "epoch": 0.9307307783860985, "grad_norm": 0.000830967677757144, "learning_rate": 3.4487820360231695e-05, "loss": 0.0001, "step": 3910 }, { "epoch": 0.9331111640085694, "grad_norm": 0.0037288174498826265, "learning_rate": 3.4448147266523845e-05, "loss": 0.0001, "step": 3920 }, { "epoch": 0.9354915496310402, "grad_norm": 0.005838675890117884, "learning_rate": 3.4408474172815995e-05, "loss": 0.0003, "step": 3930 }, { "epoch": 0.9378719352535111, "grad_norm": 0.008044001646339893, "learning_rate": 3.4368801079108145e-05, "loss": 0.0002, "step": 3940 }, { "epoch": 0.9402523208759819, "grad_norm": 0.07016938179731369, "learning_rate": 3.43291279854003e-05, "loss": 0.0005, "step": 3950 }, { "epoch": 0.9426327064984528, "grad_norm": 0.11337173730134964, "learning_rate": 3.428945489169246e-05, "loss": 0.0002, "step": 3960 }, { "epoch": 0.9450130921209235, "grad_norm": 0.0017598132835701108, "learning_rate": 3.424978179798461e-05, "loss": 0.0003, "step": 3970 }, { "epoch": 0.9473934777433944, "grad_norm": 0.030149806290864944, "learning_rate": 3.421010870427676e-05, "loss": 0.0003, "step": 3980 }, { "epoch": 0.9497738633658652, "grad_norm": 0.11280670762062073, "learning_rate": 3.417043561056891e-05, "loss": 0.0003, "step": 3990 }, { "epoch": 0.9521542489883361, "grad_norm": 0.02797405980527401, "learning_rate": 3.413076251686107e-05, "loss": 0.0003, "step": 4000 }, { "epoch": 0.9545346346108069, "grad_norm": 0.009325963445007801, "learning_rate": 3.4091089423153225e-05, "loss": 0.0002, "step": 4010 }, { "epoch": 0.9569150202332778, "grad_norm": 0.015098505653440952, "learning_rate": 3.4051416329445375e-05, "loss": 0.0002, "step": 4020 }, { "epoch": 0.9592954058557487, "grad_norm": 0.0010631170589476824, "learning_rate": 3.4011743235737525e-05, "loss": 0.0002, "step": 4030 }, { "epoch": 0.9616757914782195, "grad_norm": 0.11537562310695648, "learning_rate": 3.3972070142029676e-05, "loss": 0.0004, "step": 4040 }, { "epoch": 0.9640561771006904, "grad_norm": 0.055657465010881424, "learning_rate": 3.3932397048321826e-05, "loss": 0.0002, "step": 4050 }, { "epoch": 0.9664365627231611, "grad_norm": 0.004681292921304703, "learning_rate": 3.389272395461398e-05, "loss": 0.0001, "step": 4060 }, { "epoch": 0.968816948345632, "grad_norm": 0.0036875929217785597, "learning_rate": 3.385305086090613e-05, "loss": 0.0003, "step": 4070 }, { "epoch": 0.9711973339681028, "grad_norm": 0.3181780278682709, "learning_rate": 3.381337776719829e-05, "loss": 0.0002, "step": 4080 }, { "epoch": 0.9735777195905737, "grad_norm": 0.008175074122846127, "learning_rate": 3.377370467349044e-05, "loss": 0.0001, "step": 4090 }, { "epoch": 0.9759581052130445, "grad_norm": 0.008897043764591217, "learning_rate": 3.373403157978259e-05, "loss": 0.0002, "step": 4100 }, { "epoch": 0.9783384908355154, "grad_norm": 0.005149902775883675, "learning_rate": 3.369435848607475e-05, "loss": 0.0001, "step": 4110 }, { "epoch": 0.9807188764579862, "grad_norm": 0.005102005321532488, "learning_rate": 3.36546853923669e-05, "loss": 0.0001, "step": 4120 }, { "epoch": 0.9830992620804571, "grad_norm": 0.003907215781509876, "learning_rate": 3.361501229865905e-05, "loss": 0.0001, "step": 4130 }, { "epoch": 0.9854796477029278, "grad_norm": 0.006176768336445093, "learning_rate": 3.3575339204951206e-05, "loss": 0.0001, "step": 4140 }, { "epoch": 0.9878600333253987, "grad_norm": 0.007574237417429686, "learning_rate": 3.3535666111243356e-05, "loss": 0.0001, "step": 4150 }, { "epoch": 0.9902404189478695, "grad_norm": 0.0036479670088738203, "learning_rate": 3.3495993017535506e-05, "loss": 0.0001, "step": 4160 }, { "epoch": 0.9926208045703404, "grad_norm": 0.0031234126072376966, "learning_rate": 3.3456319923827664e-05, "loss": 0.0001, "step": 4170 }, { "epoch": 0.9950011901928112, "grad_norm": 0.015276722609996796, "learning_rate": 3.3416646830119814e-05, "loss": 0.0004, "step": 4180 }, { "epoch": 0.9973815758152821, "grad_norm": 0.015308289788663387, "learning_rate": 3.3376973736411964e-05, "loss": 0.0002, "step": 4190 }, { "epoch": 0.9997619614377529, "grad_norm": 1.1039026975631714, "learning_rate": 3.333730064270412e-05, "loss": 0.0034, "step": 4200 }, { "epoch": 1.0, "eval_loss": 6.8751428443647455e-06, "eval_runtime": 52.0551, "eval_samples_per_second": 35.885, "eval_steps_per_second": 8.971, "step": 4201 }, { "epoch": 1.0021423470602238, "grad_norm": 0.4035731852054596, "learning_rate": 3.329762754899627e-05, "loss": 0.001, "step": 4210 }, { "epoch": 1.0045227326826947, "grad_norm": 0.058116745203733444, "learning_rate": 3.325795445528843e-05, "loss": 0.0006, "step": 4220 }, { "epoch": 1.0069031183051655, "grad_norm": 0.030079133808612823, "learning_rate": 3.321828136158058e-05, "loss": 0.0005, "step": 4230 }, { "epoch": 1.0092835039276362, "grad_norm": 0.03710814565420151, "learning_rate": 3.317860826787273e-05, "loss": 0.0008, "step": 4240 }, { "epoch": 1.011663889550107, "grad_norm": 0.25699111819267273, "learning_rate": 3.313893517416488e-05, "loss": 0.0003, "step": 4250 }, { "epoch": 1.014044275172578, "grad_norm": 0.01729218102991581, "learning_rate": 3.309926208045703e-05, "loss": 0.0004, "step": 4260 }, { "epoch": 1.016424660795049, "grad_norm": 0.004348506219685078, "learning_rate": 3.3059588986749194e-05, "loss": 0.0003, "step": 4270 }, { "epoch": 1.0188050464175196, "grad_norm": 0.008898822590708733, "learning_rate": 3.3019915893041344e-05, "loss": 0.0002, "step": 4280 }, { "epoch": 1.0211854320399905, "grad_norm": 0.021421125158667564, "learning_rate": 3.2980242799333494e-05, "loss": 0.0002, "step": 4290 }, { "epoch": 1.0235658176624614, "grad_norm": 0.09812607616186142, "learning_rate": 3.2940569705625645e-05, "loss": 0.0003, "step": 4300 }, { "epoch": 1.0259462032849322, "grad_norm": 0.00921029131859541, "learning_rate": 3.2900896611917795e-05, "loss": 0.0003, "step": 4310 }, { "epoch": 1.028326588907403, "grad_norm": 0.18005193769931793, "learning_rate": 3.286122351820995e-05, "loss": 0.0009, "step": 4320 }, { "epoch": 1.0307069745298738, "grad_norm": 0.022728268057107925, "learning_rate": 3.282155042450211e-05, "loss": 0.001, "step": 4330 }, { "epoch": 1.0330873601523447, "grad_norm": 0.002307797549292445, "learning_rate": 3.278187733079426e-05, "loss": 0.0001, "step": 4340 }, { "epoch": 1.0354677457748156, "grad_norm": 0.09381233900785446, "learning_rate": 3.274220423708641e-05, "loss": 0.0001, "step": 4350 }, { "epoch": 1.0378481313972863, "grad_norm": 0.30725282430648804, "learning_rate": 3.270253114337856e-05, "loss": 0.0006, "step": 4360 }, { "epoch": 1.0402285170197572, "grad_norm": 0.0028942192438989878, "learning_rate": 3.266285804967071e-05, "loss": 0.0002, "step": 4370 }, { "epoch": 1.042608902642228, "grad_norm": 0.07420436292886734, "learning_rate": 3.262318495596287e-05, "loss": 0.0001, "step": 4380 }, { "epoch": 1.044989288264699, "grad_norm": 0.0038873206358402967, "learning_rate": 3.2583511862255025e-05, "loss": 0.0004, "step": 4390 }, { "epoch": 1.0473696738871696, "grad_norm": 0.00487096281722188, "learning_rate": 3.2543838768547175e-05, "loss": 0.0001, "step": 4400 }, { "epoch": 1.0497500595096405, "grad_norm": 0.00458755437284708, "learning_rate": 3.2504165674839325e-05, "loss": 0.0001, "step": 4410 }, { "epoch": 1.0521304451321114, "grad_norm": 0.003127770032733679, "learning_rate": 3.2464492581131475e-05, "loss": 0.0001, "step": 4420 }, { "epoch": 1.0545108307545823, "grad_norm": 0.0036109236534684896, "learning_rate": 3.242481948742363e-05, "loss": 0.0003, "step": 4430 }, { "epoch": 1.0568912163770532, "grad_norm": 0.01696913130581379, "learning_rate": 3.238514639371578e-05, "loss": 0.0001, "step": 4440 }, { "epoch": 1.0592716019995239, "grad_norm": 0.0007638796814717352, "learning_rate": 3.234547330000793e-05, "loss": 0.0003, "step": 4450 }, { "epoch": 1.0616519876219948, "grad_norm": 0.005359685514122248, "learning_rate": 3.230580020630009e-05, "loss": 0.0001, "step": 4460 }, { "epoch": 1.0640323732444656, "grad_norm": 0.008990432135760784, "learning_rate": 3.226612711259224e-05, "loss": 0.0002, "step": 4470 }, { "epoch": 1.0664127588669365, "grad_norm": 0.004698805510997772, "learning_rate": 3.22264540188844e-05, "loss": 0.0001, "step": 4480 }, { "epoch": 1.0687931444894072, "grad_norm": 0.07380379736423492, "learning_rate": 3.218678092517655e-05, "loss": 0.0005, "step": 4490 }, { "epoch": 1.071173530111878, "grad_norm": 0.0072670914232730865, "learning_rate": 3.21471078314687e-05, "loss": 0.0001, "step": 4500 }, { "epoch": 1.073553915734349, "grad_norm": 0.003431397257372737, "learning_rate": 3.210743473776085e-05, "loss": 0.0001, "step": 4510 }, { "epoch": 1.0759343013568199, "grad_norm": 0.012710604816675186, "learning_rate": 3.2067761644053006e-05, "loss": 0.0001, "step": 4520 }, { "epoch": 1.0783146869792906, "grad_norm": 0.0029263871256262064, "learning_rate": 3.2028088550345156e-05, "loss": 0.0001, "step": 4530 }, { "epoch": 1.0806950726017615, "grad_norm": 0.0013361535966396332, "learning_rate": 3.198841545663731e-05, "loss": 0.0001, "step": 4540 }, { "epoch": 1.0830754582242323, "grad_norm": 0.0027455012314021587, "learning_rate": 3.194874236292946e-05, "loss": 0.0001, "step": 4550 }, { "epoch": 1.0854558438467032, "grad_norm": 0.0015189964324235916, "learning_rate": 3.1909069269221614e-05, "loss": 0.0002, "step": 4560 }, { "epoch": 1.087836229469174, "grad_norm": 0.019486431032419205, "learning_rate": 3.1869396175513764e-05, "loss": 0.0001, "step": 4570 }, { "epoch": 1.0902166150916448, "grad_norm": 0.009100046940147877, "learning_rate": 3.182972308180592e-05, "loss": 0.0002, "step": 4580 }, { "epoch": 1.0925970007141157, "grad_norm": 0.6804227828979492, "learning_rate": 3.179004998809808e-05, "loss": 0.0008, "step": 4590 }, { "epoch": 1.0949773863365866, "grad_norm": 0.004166084341704845, "learning_rate": 3.175037689439023e-05, "loss": 0.0004, "step": 4600 }, { "epoch": 1.0973577719590573, "grad_norm": 0.0014277161099016666, "learning_rate": 3.171070380068238e-05, "loss": 0.0005, "step": 4610 }, { "epoch": 1.0997381575815282, "grad_norm": 0.02292274497449398, "learning_rate": 3.167103070697453e-05, "loss": 0.0001, "step": 4620 }, { "epoch": 1.102118543203999, "grad_norm": 0.006580695044249296, "learning_rate": 3.163135761326668e-05, "loss": 0.0002, "step": 4630 }, { "epoch": 1.10449892882647, "grad_norm": 0.005075294058769941, "learning_rate": 3.1591684519558836e-05, "loss": 0.0002, "step": 4640 }, { "epoch": 1.1068793144489408, "grad_norm": 0.0034661772660911083, "learning_rate": 3.1552011425850994e-05, "loss": 0.0003, "step": 4650 }, { "epoch": 1.1092597000714115, "grad_norm": 0.0035978129599243402, "learning_rate": 3.1512338332143144e-05, "loss": 0.0002, "step": 4660 }, { "epoch": 1.1116400856938824, "grad_norm": 0.01178679708391428, "learning_rate": 3.1472665238435294e-05, "loss": 0.0001, "step": 4670 }, { "epoch": 1.1140204713163533, "grad_norm": 0.0021494280081242323, "learning_rate": 3.1432992144727444e-05, "loss": 0.0003, "step": 4680 }, { "epoch": 1.1164008569388242, "grad_norm": 0.00325006153434515, "learning_rate": 3.13933190510196e-05, "loss": 0.0001, "step": 4690 }, { "epoch": 1.1187812425612949, "grad_norm": 0.006323399022221565, "learning_rate": 3.135364595731175e-05, "loss": 0.0002, "step": 4700 }, { "epoch": 1.1211616281837657, "grad_norm": 0.006911338306963444, "learning_rate": 3.131397286360391e-05, "loss": 0.0001, "step": 4710 }, { "epoch": 1.1235420138062366, "grad_norm": 0.0032435038592666388, "learning_rate": 3.127429976989606e-05, "loss": 0.0001, "step": 4720 }, { "epoch": 1.1259223994287075, "grad_norm": 0.18325313925743103, "learning_rate": 3.123462667618821e-05, "loss": 0.0002, "step": 4730 }, { "epoch": 1.1283027850511782, "grad_norm": 0.12742838263511658, "learning_rate": 3.119495358248036e-05, "loss": 0.0004, "step": 4740 }, { "epoch": 1.130683170673649, "grad_norm": 0.001981141045689583, "learning_rate": 3.115528048877252e-05, "loss": 0.0002, "step": 4750 }, { "epoch": 1.13306355629612, "grad_norm": 0.0030578586738556623, "learning_rate": 3.111560739506467e-05, "loss": 0.0001, "step": 4760 }, { "epoch": 1.1354439419185909, "grad_norm": 0.00284597952850163, "learning_rate": 3.1075934301356824e-05, "loss": 0.0001, "step": 4770 }, { "epoch": 1.1378243275410616, "grad_norm": 0.023655202239751816, "learning_rate": 3.1036261207648975e-05, "loss": 0.0002, "step": 4780 }, { "epoch": 1.1402047131635324, "grad_norm": 0.008493321016430855, "learning_rate": 3.0996588113941125e-05, "loss": 0.0002, "step": 4790 }, { "epoch": 1.1425850987860033, "grad_norm": 0.0038551143370568752, "learning_rate": 3.095691502023328e-05, "loss": 0.0001, "step": 4800 }, { "epoch": 1.1449654844084742, "grad_norm": 0.0014539804542437196, "learning_rate": 3.091724192652543e-05, "loss": 0.0001, "step": 4810 }, { "epoch": 1.1473458700309451, "grad_norm": 0.0026364317163825035, "learning_rate": 3.087756883281758e-05, "loss": 0.0006, "step": 4820 }, { "epoch": 1.1497262556534158, "grad_norm": 0.0010660483967512846, "learning_rate": 3.083789573910973e-05, "loss": 0.0001, "step": 4830 }, { "epoch": 1.1521066412758867, "grad_norm": 0.005250291433185339, "learning_rate": 3.079822264540189e-05, "loss": 0.0013, "step": 4840 }, { "epoch": 1.1544870268983576, "grad_norm": 0.0824214443564415, "learning_rate": 3.075854955169404e-05, "loss": 0.0002, "step": 4850 }, { "epoch": 1.1568674125208283, "grad_norm": 0.003175609977915883, "learning_rate": 3.07188764579862e-05, "loss": 0.0006, "step": 4860 }, { "epoch": 1.1592477981432991, "grad_norm": 0.0015882077859714627, "learning_rate": 3.067920336427835e-05, "loss": 0.0001, "step": 4870 }, { "epoch": 1.16162818376577, "grad_norm": 0.003802343737334013, "learning_rate": 3.06395302705705e-05, "loss": 0.0001, "step": 4880 }, { "epoch": 1.164008569388241, "grad_norm": 0.002745629521086812, "learning_rate": 3.059985717686265e-05, "loss": 0.0004, "step": 4890 }, { "epoch": 1.1663889550107118, "grad_norm": 0.006173206493258476, "learning_rate": 3.0560184083154805e-05, "loss": 0.0001, "step": 4900 }, { "epoch": 1.1687693406331825, "grad_norm": 0.006407946813851595, "learning_rate": 3.052051098944696e-05, "loss": 0.0001, "step": 4910 }, { "epoch": 1.1711497262556534, "grad_norm": 0.017478201538324356, "learning_rate": 3.0480837895739113e-05, "loss": 0.0003, "step": 4920 }, { "epoch": 1.1735301118781243, "grad_norm": 0.0035310271196067333, "learning_rate": 3.0441164802031263e-05, "loss": 0.0001, "step": 4930 }, { "epoch": 1.1759104975005952, "grad_norm": 0.0057274349965155125, "learning_rate": 3.0401491708323417e-05, "loss": 0.0001, "step": 4940 }, { "epoch": 1.1782908831230658, "grad_norm": 0.013580716215074062, "learning_rate": 3.0361818614615567e-05, "loss": 0.0001, "step": 4950 }, { "epoch": 1.1806712687455367, "grad_norm": 0.005545695312321186, "learning_rate": 3.0322145520907724e-05, "loss": 0.0001, "step": 4960 }, { "epoch": 1.1830516543680076, "grad_norm": 0.001243342412635684, "learning_rate": 3.0282472427199875e-05, "loss": 0.0001, "step": 4970 }, { "epoch": 1.1854320399904785, "grad_norm": 0.004315751604735851, "learning_rate": 3.0242799333492028e-05, "loss": 0.0001, "step": 4980 }, { "epoch": 1.1878124256129494, "grad_norm": 0.0020015877671539783, "learning_rate": 3.020312623978418e-05, "loss": 0.0001, "step": 4990 }, { "epoch": 1.19019281123542, "grad_norm": 0.0013068486005067825, "learning_rate": 3.0163453146076332e-05, "loss": 0.0001, "step": 5000 }, { "epoch": 1.192573196857891, "grad_norm": 0.0020259215962141752, "learning_rate": 3.0123780052368483e-05, "loss": 0.0001, "step": 5010 }, { "epoch": 1.1949535824803619, "grad_norm": 0.00229440163820982, "learning_rate": 3.008410695866064e-05, "loss": 0.0001, "step": 5020 }, { "epoch": 1.1973339681028325, "grad_norm": 0.006487131118774414, "learning_rate": 3.0044433864952793e-05, "loss": 0.0, "step": 5030 }, { "epoch": 1.1997143537253034, "grad_norm": 0.0029580420814454556, "learning_rate": 3.0004760771244944e-05, "loss": 0.0001, "step": 5040 }, { "epoch": 1.2020947393477743, "grad_norm": 0.004215626046061516, "learning_rate": 2.9965087677537097e-05, "loss": 0.0002, "step": 5050 }, { "epoch": 1.2044751249702452, "grad_norm": 0.0045689307153224945, "learning_rate": 2.9925414583829248e-05, "loss": 0.0001, "step": 5060 }, { "epoch": 1.2068555105927161, "grad_norm": 0.0018343930132687092, "learning_rate": 2.9885741490121398e-05, "loss": 0.0001, "step": 5070 }, { "epoch": 1.2092358962151868, "grad_norm": 0.21103504300117493, "learning_rate": 2.984606839641355e-05, "loss": 0.0003, "step": 5080 }, { "epoch": 1.2116162818376577, "grad_norm": 0.04271009564399719, "learning_rate": 2.980639530270571e-05, "loss": 0.0003, "step": 5090 }, { "epoch": 1.2139966674601286, "grad_norm": 0.008761608973145485, "learning_rate": 2.976672220899786e-05, "loss": 0.0002, "step": 5100 }, { "epoch": 1.2163770530825995, "grad_norm": 0.002944928128272295, "learning_rate": 2.9727049115290013e-05, "loss": 0.0004, "step": 5110 }, { "epoch": 1.2187574387050701, "grad_norm": 0.0039098006673157215, "learning_rate": 2.9687376021582163e-05, "loss": 0.0001, "step": 5120 }, { "epoch": 1.221137824327541, "grad_norm": 0.007188912481069565, "learning_rate": 2.9647702927874317e-05, "loss": 0.0003, "step": 5130 }, { "epoch": 1.223518209950012, "grad_norm": 0.0020366155076771975, "learning_rate": 2.9608029834166467e-05, "loss": 0.0001, "step": 5140 }, { "epoch": 1.2258985955724828, "grad_norm": 0.0052825105376541615, "learning_rate": 2.9568356740458624e-05, "loss": 0.0001, "step": 5150 }, { "epoch": 1.2282789811949535, "grad_norm": 0.0322733074426651, "learning_rate": 2.9528683646750778e-05, "loss": 0.0002, "step": 5160 }, { "epoch": 1.2306593668174244, "grad_norm": 0.0030191782861948013, "learning_rate": 2.9489010553042928e-05, "loss": 0.0001, "step": 5170 }, { "epoch": 1.2330397524398953, "grad_norm": 0.0158090703189373, "learning_rate": 2.944933745933508e-05, "loss": 0.0001, "step": 5180 }, { "epoch": 1.2354201380623662, "grad_norm": 0.0023131452035158873, "learning_rate": 2.9409664365627232e-05, "loss": 0.0002, "step": 5190 }, { "epoch": 1.2378005236848368, "grad_norm": 0.0010957660852000117, "learning_rate": 2.9369991271919382e-05, "loss": 0.0001, "step": 5200 }, { "epoch": 1.2401809093073077, "grad_norm": 0.006251092534512281, "learning_rate": 2.933031817821154e-05, "loss": 0.0001, "step": 5210 }, { "epoch": 1.2425612949297786, "grad_norm": 0.002981637604534626, "learning_rate": 2.9290645084503693e-05, "loss": 0.0001, "step": 5220 }, { "epoch": 1.2449416805522495, "grad_norm": 0.0044878036715090275, "learning_rate": 2.9250971990795844e-05, "loss": 0.0009, "step": 5230 }, { "epoch": 1.2473220661747204, "grad_norm": 0.0026534402277320623, "learning_rate": 2.9211298897087997e-05, "loss": 0.0001, "step": 5240 }, { "epoch": 1.249702451797191, "grad_norm": 0.0017549542244523764, "learning_rate": 2.9171625803380148e-05, "loss": 0.0001, "step": 5250 }, { "epoch": 1.252082837419662, "grad_norm": 0.0030411062762141228, "learning_rate": 2.91319527096723e-05, "loss": 0.0001, "step": 5260 }, { "epoch": 1.2544632230421329, "grad_norm": 0.006810466758906841, "learning_rate": 2.909227961596445e-05, "loss": 0.0001, "step": 5270 }, { "epoch": 1.2568436086646035, "grad_norm": 0.008998183533549309, "learning_rate": 2.905260652225661e-05, "loss": 0.0001, "step": 5280 }, { "epoch": 1.2592239942870744, "grad_norm": 0.0006000595167279243, "learning_rate": 2.9012933428548762e-05, "loss": 0.0001, "step": 5290 }, { "epoch": 1.2616043799095453, "grad_norm": 0.0037659297231584787, "learning_rate": 2.8973260334840913e-05, "loss": 0.0001, "step": 5300 }, { "epoch": 1.2639847655320162, "grad_norm": 0.003123963950201869, "learning_rate": 2.8933587241133063e-05, "loss": 0.0001, "step": 5310 }, { "epoch": 1.2663651511544871, "grad_norm": 0.0024721056688576937, "learning_rate": 2.8893914147425217e-05, "loss": 0.0001, "step": 5320 }, { "epoch": 1.268745536776958, "grad_norm": 0.04851701855659485, "learning_rate": 2.8854241053717367e-05, "loss": 0.0002, "step": 5330 }, { "epoch": 1.2711259223994287, "grad_norm": 0.0003437872801441699, "learning_rate": 2.8814567960009524e-05, "loss": 0.0, "step": 5340 }, { "epoch": 1.2735063080218996, "grad_norm": 0.36953097581863403, "learning_rate": 2.8774894866301678e-05, "loss": 0.0002, "step": 5350 }, { "epoch": 1.2758866936443705, "grad_norm": 0.004762616939842701, "learning_rate": 2.8735221772593828e-05, "loss": 0.0, "step": 5360 }, { "epoch": 1.2782670792668411, "grad_norm": 0.0032022674567997456, "learning_rate": 2.8695548678885982e-05, "loss": 0.0001, "step": 5370 }, { "epoch": 1.280647464889312, "grad_norm": 0.112340047955513, "learning_rate": 2.8655875585178132e-05, "loss": 0.0001, "step": 5380 }, { "epoch": 1.283027850511783, "grad_norm": 0.0022161102388054132, "learning_rate": 2.8616202491470286e-05, "loss": 0.0001, "step": 5390 }, { "epoch": 1.2854082361342538, "grad_norm": 0.0012134364806115627, "learning_rate": 2.8576529397762443e-05, "loss": 0.0001, "step": 5400 }, { "epoch": 1.2877886217567247, "grad_norm": 0.003832167712971568, "learning_rate": 2.8536856304054593e-05, "loss": 0.0001, "step": 5410 }, { "epoch": 1.2901690073791954, "grad_norm": 0.001739076804369688, "learning_rate": 2.8497183210346743e-05, "loss": 0.0, "step": 5420 }, { "epoch": 1.2925493930016663, "grad_norm": 0.000749527825973928, "learning_rate": 2.8457510116638897e-05, "loss": 0.0001, "step": 5430 }, { "epoch": 1.2949297786241372, "grad_norm": 0.006486440543085337, "learning_rate": 2.8417837022931047e-05, "loss": 0.0001, "step": 5440 }, { "epoch": 1.2973101642466078, "grad_norm": 0.002875624457374215, "learning_rate": 2.83781639292232e-05, "loss": 0.0003, "step": 5450 }, { "epoch": 1.2996905498690787, "grad_norm": 0.011916677467525005, "learning_rate": 2.833849083551535e-05, "loss": 0.0002, "step": 5460 }, { "epoch": 1.3020709354915496, "grad_norm": 0.014456122182309628, "learning_rate": 2.829881774180751e-05, "loss": 0.0001, "step": 5470 }, { "epoch": 1.3044513211140205, "grad_norm": 0.00652431882917881, "learning_rate": 2.8259144648099662e-05, "loss": 0.0003, "step": 5480 }, { "epoch": 1.3068317067364914, "grad_norm": 0.004612395539879799, "learning_rate": 2.8219471554391813e-05, "loss": 0.0001, "step": 5490 }, { "epoch": 1.309212092358962, "grad_norm": 0.0016554853646084666, "learning_rate": 2.8179798460683966e-05, "loss": 0.0001, "step": 5500 }, { "epoch": 1.311592477981433, "grad_norm": 0.00955954473465681, "learning_rate": 2.8140125366976117e-05, "loss": 0.0003, "step": 5510 }, { "epoch": 1.3139728636039039, "grad_norm": 0.0014887260040268302, "learning_rate": 2.8100452273268267e-05, "loss": 0.0, "step": 5520 }, { "epoch": 1.3163532492263748, "grad_norm": 0.004022569395601749, "learning_rate": 2.8060779179560427e-05, "loss": 0.0001, "step": 5530 }, { "epoch": 1.3187336348488454, "grad_norm": 0.01300437469035387, "learning_rate": 2.8021106085852578e-05, "loss": 0.0001, "step": 5540 }, { "epoch": 1.3211140204713163, "grad_norm": 0.0033303312957286835, "learning_rate": 2.7981432992144728e-05, "loss": 0.0001, "step": 5550 }, { "epoch": 1.3234944060937872, "grad_norm": 0.00033377157524228096, "learning_rate": 2.794175989843688e-05, "loss": 0.0001, "step": 5560 }, { "epoch": 1.325874791716258, "grad_norm": 0.001646155840717256, "learning_rate": 2.7902086804729032e-05, "loss": 0.0, "step": 5570 }, { "epoch": 1.328255177338729, "grad_norm": 0.009458147920668125, "learning_rate": 2.7862413711021186e-05, "loss": 0.0, "step": 5580 }, { "epoch": 1.3306355629611997, "grad_norm": 0.044097207486629486, "learning_rate": 2.7822740617313343e-05, "loss": 0.0001, "step": 5590 }, { "epoch": 1.3330159485836706, "grad_norm": 0.3018762469291687, "learning_rate": 2.7783067523605493e-05, "loss": 0.0003, "step": 5600 }, { "epoch": 1.3353963342061415, "grad_norm": 0.00142444740049541, "learning_rate": 2.7743394429897647e-05, "loss": 0.0, "step": 5610 }, { "epoch": 1.3377767198286121, "grad_norm": 0.026065746322274208, "learning_rate": 2.7703721336189797e-05, "loss": 0.0001, "step": 5620 }, { "epoch": 1.340157105451083, "grad_norm": 0.002285444876179099, "learning_rate": 2.766404824248195e-05, "loss": 0.0004, "step": 5630 }, { "epoch": 1.342537491073554, "grad_norm": 0.0023544467985630035, "learning_rate": 2.76243751487741e-05, "loss": 0.0001, "step": 5640 }, { "epoch": 1.3449178766960248, "grad_norm": 0.005093382205814123, "learning_rate": 2.758470205506625e-05, "loss": 0.0008, "step": 5650 }, { "epoch": 1.3472982623184957, "grad_norm": 0.01395428366959095, "learning_rate": 2.754502896135841e-05, "loss": 0.0001, "step": 5660 }, { "epoch": 1.3496786479409664, "grad_norm": 0.0021814145147800446, "learning_rate": 2.7505355867650562e-05, "loss": 0.0001, "step": 5670 }, { "epoch": 1.3520590335634373, "grad_norm": 0.0020568270701915026, "learning_rate": 2.7465682773942712e-05, "loss": 0.0001, "step": 5680 }, { "epoch": 1.3544394191859082, "grad_norm": 0.001564579550176859, "learning_rate": 2.7426009680234866e-05, "loss": 0.0002, "step": 5690 }, { "epoch": 1.3568198048083788, "grad_norm": 0.0009057559072971344, "learning_rate": 2.7386336586527016e-05, "loss": 0.0001, "step": 5700 }, { "epoch": 1.3592001904308497, "grad_norm": 0.005018309690058231, "learning_rate": 2.734666349281917e-05, "loss": 0.0001, "step": 5710 }, { "epoch": 1.3615805760533206, "grad_norm": 0.0018629188416525722, "learning_rate": 2.7306990399111327e-05, "loss": 0.0003, "step": 5720 }, { "epoch": 1.3639609616757915, "grad_norm": 0.001482214662246406, "learning_rate": 2.7267317305403478e-05, "loss": 0.0001, "step": 5730 }, { "epoch": 1.3663413472982624, "grad_norm": 0.012405039742588997, "learning_rate": 2.722764421169563e-05, "loss": 0.0005, "step": 5740 }, { "epoch": 1.3687217329207333, "grad_norm": 0.0018485913751646876, "learning_rate": 2.718797111798778e-05, "loss": 0.0, "step": 5750 }, { "epoch": 1.371102118543204, "grad_norm": 0.0015681314980611205, "learning_rate": 2.7148298024279932e-05, "loss": 0.0001, "step": 5760 }, { "epoch": 1.3734825041656749, "grad_norm": 0.017725007608532906, "learning_rate": 2.7108624930572086e-05, "loss": 0.0001, "step": 5770 }, { "epoch": 1.3758628897881457, "grad_norm": 0.011187481693923473, "learning_rate": 2.7068951836864243e-05, "loss": 0.0001, "step": 5780 }, { "epoch": 1.3782432754106164, "grad_norm": 0.003125675953924656, "learning_rate": 2.7029278743156393e-05, "loss": 0.0001, "step": 5790 }, { "epoch": 1.3806236610330873, "grad_norm": 0.004620529245585203, "learning_rate": 2.6989605649448547e-05, "loss": 0.0001, "step": 5800 }, { "epoch": 1.3830040466555582, "grad_norm": 0.004881042055785656, "learning_rate": 2.6949932555740697e-05, "loss": 0.0001, "step": 5810 }, { "epoch": 1.385384432278029, "grad_norm": 0.015351341105997562, "learning_rate": 2.691025946203285e-05, "loss": 0.0001, "step": 5820 }, { "epoch": 1.3877648179005, "grad_norm": 0.06165415793657303, "learning_rate": 2.6870586368325e-05, "loss": 0.0001, "step": 5830 }, { "epoch": 1.3901452035229707, "grad_norm": 0.000691259338054806, "learning_rate": 2.6830913274617155e-05, "loss": 0.0001, "step": 5840 }, { "epoch": 1.3925255891454416, "grad_norm": 0.006264138966798782, "learning_rate": 2.6791240180909312e-05, "loss": 0.0, "step": 5850 }, { "epoch": 1.3949059747679124, "grad_norm": 0.0016265185549855232, "learning_rate": 2.6751567087201462e-05, "loss": 0.0001, "step": 5860 }, { "epoch": 1.3972863603903831, "grad_norm": 0.0036318551283329725, "learning_rate": 2.6711893993493616e-05, "loss": 0.0, "step": 5870 }, { "epoch": 1.399666746012854, "grad_norm": 0.0011168549535796046, "learning_rate": 2.6672220899785766e-05, "loss": 0.0001, "step": 5880 }, { "epoch": 1.402047131635325, "grad_norm": 0.011570369824767113, "learning_rate": 2.6632547806077916e-05, "loss": 0.0001, "step": 5890 }, { "epoch": 1.4044275172577958, "grad_norm": 0.004564432427287102, "learning_rate": 2.659287471237007e-05, "loss": 0.0001, "step": 5900 }, { "epoch": 1.4068079028802667, "grad_norm": 0.003310930449515581, "learning_rate": 2.6553201618662227e-05, "loss": 0.0001, "step": 5910 }, { "epoch": 1.4091882885027374, "grad_norm": 0.005474664270877838, "learning_rate": 2.6513528524954377e-05, "loss": 0.0, "step": 5920 }, { "epoch": 1.4115686741252083, "grad_norm": 0.003840883495286107, "learning_rate": 2.647385543124653e-05, "loss": 0.0, "step": 5930 }, { "epoch": 1.4139490597476791, "grad_norm": 0.0011354766320437193, "learning_rate": 2.643418233753868e-05, "loss": 0.0001, "step": 5940 }, { "epoch": 1.41632944537015, "grad_norm": 0.0011250395327806473, "learning_rate": 2.6394509243830835e-05, "loss": 0.0001, "step": 5950 }, { "epoch": 1.4187098309926207, "grad_norm": 0.0025986500550061464, "learning_rate": 2.6354836150122985e-05, "loss": 0.0, "step": 5960 }, { "epoch": 1.4210902166150916, "grad_norm": 0.0018986169015988708, "learning_rate": 2.6315163056415143e-05, "loss": 0.0001, "step": 5970 }, { "epoch": 1.4234706022375625, "grad_norm": 0.006072606425732374, "learning_rate": 2.6275489962707296e-05, "loss": 0.0001, "step": 5980 }, { "epoch": 1.4258509878600334, "grad_norm": 0.005382834933698177, "learning_rate": 2.6235816868999447e-05, "loss": 0.0001, "step": 5990 }, { "epoch": 1.4282313734825043, "grad_norm": 0.0069602313451468945, "learning_rate": 2.6196143775291597e-05, "loss": 0.0001, "step": 6000 }, { "epoch": 1.430611759104975, "grad_norm": 0.00503483647480607, "learning_rate": 2.615647068158375e-05, "loss": 0.0001, "step": 6010 }, { "epoch": 1.4329921447274458, "grad_norm": 0.009482208639383316, "learning_rate": 2.61167975878759e-05, "loss": 0.0001, "step": 6020 }, { "epoch": 1.4353725303499167, "grad_norm": 0.003071409650146961, "learning_rate": 2.6077124494168058e-05, "loss": 0.0003, "step": 6030 }, { "epoch": 1.4377529159723874, "grad_norm": 0.025201931595802307, "learning_rate": 2.603745140046021e-05, "loss": 0.0002, "step": 6040 }, { "epoch": 1.4401333015948583, "grad_norm": 0.029845217242836952, "learning_rate": 2.5997778306752362e-05, "loss": 0.0001, "step": 6050 }, { "epoch": 1.4425136872173292, "grad_norm": 0.002946893684566021, "learning_rate": 2.5958105213044516e-05, "loss": 0.0001, "step": 6060 }, { "epoch": 1.4448940728398, "grad_norm": 0.002334748860448599, "learning_rate": 2.5918432119336666e-05, "loss": 0.0001, "step": 6070 }, { "epoch": 1.447274458462271, "grad_norm": 0.0038676797412335873, "learning_rate": 2.587875902562882e-05, "loss": 0.0001, "step": 6080 }, { "epoch": 1.4496548440847417, "grad_norm": 0.39916858077049255, "learning_rate": 2.583908593192097e-05, "loss": 0.0005, "step": 6090 }, { "epoch": 1.4520352297072125, "grad_norm": 0.005464503075927496, "learning_rate": 2.5799412838213127e-05, "loss": 0.0, "step": 6100 }, { "epoch": 1.4544156153296834, "grad_norm": 0.002350292168557644, "learning_rate": 2.5759739744505277e-05, "loss": 0.0001, "step": 6110 }, { "epoch": 1.4567960009521541, "grad_norm": 0.02950800396502018, "learning_rate": 2.572006665079743e-05, "loss": 0.0001, "step": 6120 }, { "epoch": 1.459176386574625, "grad_norm": 0.0020270231179893017, "learning_rate": 2.568039355708958e-05, "loss": 0.0001, "step": 6130 }, { "epoch": 1.461556772197096, "grad_norm": 0.29163315892219543, "learning_rate": 2.5640720463381735e-05, "loss": 0.0004, "step": 6140 }, { "epoch": 1.4639371578195668, "grad_norm": 0.0028463418129831553, "learning_rate": 2.5601047369673885e-05, "loss": 0.0001, "step": 6150 }, { "epoch": 1.4663175434420377, "grad_norm": 0.007839919067919254, "learning_rate": 2.5561374275966042e-05, "loss": 0.0, "step": 6160 }, { "epoch": 1.4686979290645086, "grad_norm": 0.0009790142066776752, "learning_rate": 2.5521701182258196e-05, "loss": 0.0001, "step": 6170 }, { "epoch": 1.4710783146869792, "grad_norm": 0.019366919994354248, "learning_rate": 2.5482028088550346e-05, "loss": 0.0001, "step": 6180 }, { "epoch": 1.4734587003094501, "grad_norm": 0.002335514174774289, "learning_rate": 2.54423549948425e-05, "loss": 0.0001, "step": 6190 }, { "epoch": 1.475839085931921, "grad_norm": 0.004448035266250372, "learning_rate": 2.540268190113465e-05, "loss": 0.0, "step": 6200 }, { "epoch": 1.4782194715543917, "grad_norm": 0.0020590273197740316, "learning_rate": 2.53630088074268e-05, "loss": 0.0, "step": 6210 }, { "epoch": 1.4805998571768626, "grad_norm": 0.0015115641290321946, "learning_rate": 2.532333571371896e-05, "loss": 0.0001, "step": 6220 }, { "epoch": 1.4829802427993335, "grad_norm": 0.0024076756089925766, "learning_rate": 2.528366262001111e-05, "loss": 0.0003, "step": 6230 }, { "epoch": 1.4853606284218044, "grad_norm": 0.0048133935779333115, "learning_rate": 2.5243989526303262e-05, "loss": 0.0001, "step": 6240 }, { "epoch": 1.4877410140442753, "grad_norm": 0.015479459427297115, "learning_rate": 2.5204316432595416e-05, "loss": 0.0001, "step": 6250 }, { "epoch": 1.490121399666746, "grad_norm": 0.1010046973824501, "learning_rate": 2.5164643338887566e-05, "loss": 0.0001, "step": 6260 }, { "epoch": 1.4925017852892168, "grad_norm": 0.0011843384709209204, "learning_rate": 2.512497024517972e-05, "loss": 0.0002, "step": 6270 }, { "epoch": 1.4948821709116877, "grad_norm": 0.002041852567344904, "learning_rate": 2.508529715147187e-05, "loss": 0.0001, "step": 6280 }, { "epoch": 1.4972625565341584, "grad_norm": 0.002975156530737877, "learning_rate": 2.5045624057764027e-05, "loss": 0.0001, "step": 6290 }, { "epoch": 1.4996429421566293, "grad_norm": 0.005752989556640387, "learning_rate": 2.500595096405618e-05, "loss": 0.0001, "step": 6300 }, { "epoch": 1.5020233277791002, "grad_norm": 0.002325852634385228, "learning_rate": 2.496627787034833e-05, "loss": 0.0, "step": 6310 }, { "epoch": 1.504403713401571, "grad_norm": 0.006379146594554186, "learning_rate": 2.4926604776640485e-05, "loss": 0.0001, "step": 6320 }, { "epoch": 1.506784099024042, "grad_norm": 0.0011644313344731927, "learning_rate": 2.488693168293264e-05, "loss": 0.0, "step": 6330 }, { "epoch": 1.5091644846465129, "grad_norm": 0.06679144501686096, "learning_rate": 2.484725858922479e-05, "loss": 0.0001, "step": 6340 }, { "epoch": 1.5115448702689835, "grad_norm": 0.010065040551126003, "learning_rate": 2.4807585495516942e-05, "loss": 0.0003, "step": 6350 }, { "epoch": 1.5139252558914544, "grad_norm": 0.00404448714107275, "learning_rate": 2.4767912401809093e-05, "loss": 0.0001, "step": 6360 }, { "epoch": 1.516305641513925, "grad_norm": 0.005027102772146463, "learning_rate": 2.4728239308101246e-05, "loss": 0.0001, "step": 6370 }, { "epoch": 1.518686027136396, "grad_norm": 0.0007329948712140322, "learning_rate": 2.46885662143934e-05, "loss": 0.0001, "step": 6380 }, { "epoch": 1.521066412758867, "grad_norm": 0.008010495454072952, "learning_rate": 2.464889312068555e-05, "loss": 0.0001, "step": 6390 }, { "epoch": 1.5234467983813378, "grad_norm": 0.0004263845912646502, "learning_rate": 2.4609220026977704e-05, "loss": 0.0, "step": 6400 }, { "epoch": 1.5258271840038087, "grad_norm": 0.0008505060104653239, "learning_rate": 2.4569546933269858e-05, "loss": 0.0001, "step": 6410 }, { "epoch": 1.5282075696262796, "grad_norm": 0.005009577609598637, "learning_rate": 2.4529873839562008e-05, "loss": 0.0001, "step": 6420 }, { "epoch": 1.5305879552487502, "grad_norm": 0.0055831428617239, "learning_rate": 2.4490200745854165e-05, "loss": 0.0, "step": 6430 }, { "epoch": 1.5329683408712211, "grad_norm": 0.0025661292020231485, "learning_rate": 2.4450527652146315e-05, "loss": 0.0002, "step": 6440 }, { "epoch": 1.535348726493692, "grad_norm": 0.002652715193107724, "learning_rate": 2.4410854558438466e-05, "loss": 0.0, "step": 6450 }, { "epoch": 1.5377291121161627, "grad_norm": 0.0017773109721019864, "learning_rate": 2.4371181464730623e-05, "loss": 0.0001, "step": 6460 }, { "epoch": 1.5401094977386336, "grad_norm": 0.023734472692012787, "learning_rate": 2.4331508371022773e-05, "loss": 0.0001, "step": 6470 }, { "epoch": 1.5424898833611045, "grad_norm": 0.0018312609754502773, "learning_rate": 2.4291835277314927e-05, "loss": 0.0001, "step": 6480 }, { "epoch": 1.5448702689835754, "grad_norm": 0.004327055066823959, "learning_rate": 2.425216218360708e-05, "loss": 0.0001, "step": 6490 }, { "epoch": 1.5472506546060463, "grad_norm": 0.0021172019187361, "learning_rate": 2.421248908989923e-05, "loss": 0.0001, "step": 6500 }, { "epoch": 1.5496310402285172, "grad_norm": 0.001905101933516562, "learning_rate": 2.4172815996191385e-05, "loss": 0.0, "step": 6510 }, { "epoch": 1.5520114258509878, "grad_norm": 0.0016990803414955735, "learning_rate": 2.4133142902483538e-05, "loss": 0.0001, "step": 6520 }, { "epoch": 1.5543918114734587, "grad_norm": 0.0022508346009999514, "learning_rate": 2.409346980877569e-05, "loss": 0.0001, "step": 6530 }, { "epoch": 1.5567721970959294, "grad_norm": 0.0018837592797353864, "learning_rate": 2.4053796715067842e-05, "loss": 0.0001, "step": 6540 }, { "epoch": 1.5591525827184003, "grad_norm": 0.001968635246157646, "learning_rate": 2.4014123621359993e-05, "loss": 0.0002, "step": 6550 }, { "epoch": 1.5615329683408712, "grad_norm": 0.0019730927888303995, "learning_rate": 2.397445052765215e-05, "loss": 0.0001, "step": 6560 }, { "epoch": 1.563913353963342, "grad_norm": 0.0006384404841810465, "learning_rate": 2.39347774339443e-05, "loss": 0.0, "step": 6570 }, { "epoch": 1.566293739585813, "grad_norm": 0.05303851515054703, "learning_rate": 2.389510434023645e-05, "loss": 0.0002, "step": 6580 }, { "epoch": 1.5686741252082839, "grad_norm": 0.009338784962892532, "learning_rate": 2.3855431246528607e-05, "loss": 0.0, "step": 6590 }, { "epoch": 1.5710545108307545, "grad_norm": 0.001042340649291873, "learning_rate": 2.3815758152820758e-05, "loss": 0.0, "step": 6600 }, { "epoch": 1.5734348964532254, "grad_norm": 0.008856063708662987, "learning_rate": 2.377608505911291e-05, "loss": 0.0001, "step": 6610 }, { "epoch": 1.575815282075696, "grad_norm": 0.0010636444203555584, "learning_rate": 2.3736411965405065e-05, "loss": 0.0001, "step": 6620 }, { "epoch": 1.578195667698167, "grad_norm": 0.044303007423877716, "learning_rate": 2.3696738871697215e-05, "loss": 0.0001, "step": 6630 }, { "epoch": 1.5805760533206379, "grad_norm": 0.003368295030668378, "learning_rate": 2.365706577798937e-05, "loss": 0.0001, "step": 6640 }, { "epoch": 1.5829564389431088, "grad_norm": 0.0010406200308352709, "learning_rate": 2.3617392684281523e-05, "loss": 0.0001, "step": 6650 }, { "epoch": 1.5853368245655797, "grad_norm": 0.009850569069385529, "learning_rate": 2.3577719590573673e-05, "loss": 0.0, "step": 6660 }, { "epoch": 1.5877172101880506, "grad_norm": 0.00514467665925622, "learning_rate": 2.3538046496865827e-05, "loss": 0.0001, "step": 6670 }, { "epoch": 1.5900975958105215, "grad_norm": 0.00200643390417099, "learning_rate": 2.349837340315798e-05, "loss": 0.0002, "step": 6680 }, { "epoch": 1.5924779814329921, "grad_norm": 0.01371715497225523, "learning_rate": 2.345870030945013e-05, "loss": 0.0002, "step": 6690 }, { "epoch": 1.594858367055463, "grad_norm": 0.0005170275107957423, "learning_rate": 2.3419027215742284e-05, "loss": 0.0001, "step": 6700 }, { "epoch": 1.5972387526779337, "grad_norm": 0.0018967930227518082, "learning_rate": 2.3379354122034438e-05, "loss": 0.0, "step": 6710 }, { "epoch": 1.5996191383004046, "grad_norm": 0.002288557356223464, "learning_rate": 2.3339681028326592e-05, "loss": 0.0003, "step": 6720 }, { "epoch": 1.6019995239228755, "grad_norm": 0.0017687254585325718, "learning_rate": 2.3300007934618742e-05, "loss": 0.0001, "step": 6730 }, { "epoch": 1.6043799095453464, "grad_norm": 0.023880669847130775, "learning_rate": 2.3260334840910893e-05, "loss": 0.0001, "step": 6740 }, { "epoch": 1.6067602951678173, "grad_norm": 0.004767647013068199, "learning_rate": 2.322066174720305e-05, "loss": 0.0001, "step": 6750 }, { "epoch": 1.6091406807902882, "grad_norm": 0.0016061540227383375, "learning_rate": 2.31809886534952e-05, "loss": 0.0001, "step": 6760 }, { "epoch": 1.6115210664127588, "grad_norm": 0.009586431086063385, "learning_rate": 2.3141315559787354e-05, "loss": 0.0001, "step": 6770 }, { "epoch": 1.6139014520352297, "grad_norm": 0.003596968250349164, "learning_rate": 2.3101642466079507e-05, "loss": 0.0001, "step": 6780 }, { "epoch": 1.6162818376577004, "grad_norm": 0.003184641245752573, "learning_rate": 2.3061969372371658e-05, "loss": 0.0001, "step": 6790 }, { "epoch": 1.6186622232801713, "grad_norm": 0.02113034948706627, "learning_rate": 2.302229627866381e-05, "loss": 0.0, "step": 6800 }, { "epoch": 1.6210426089026422, "grad_norm": 0.0022694601211696863, "learning_rate": 2.2982623184955965e-05, "loss": 0.0001, "step": 6810 }, { "epoch": 1.623422994525113, "grad_norm": 0.0007104437099769711, "learning_rate": 2.2942950091248115e-05, "loss": 0.0, "step": 6820 }, { "epoch": 1.625803380147584, "grad_norm": 0.004562158603221178, "learning_rate": 2.290327699754027e-05, "loss": 0.0001, "step": 6830 }, { "epoch": 1.6281837657700549, "grad_norm": 0.0015846043825149536, "learning_rate": 2.2863603903832423e-05, "loss": 0.0001, "step": 6840 }, { "epoch": 1.6305641513925258, "grad_norm": 0.012255080044269562, "learning_rate": 2.2823930810124573e-05, "loss": 0.0001, "step": 6850 }, { "epoch": 1.6329445370149964, "grad_norm": 0.0012517154682427645, "learning_rate": 2.2784257716416727e-05, "loss": 0.0001, "step": 6860 }, { "epoch": 1.6353249226374673, "grad_norm": 0.0006557099404744804, "learning_rate": 2.274458462270888e-05, "loss": 0.0001, "step": 6870 }, { "epoch": 1.637705308259938, "grad_norm": 0.0007641498814336956, "learning_rate": 2.2704911529001034e-05, "loss": 0.0, "step": 6880 }, { "epoch": 1.6400856938824089, "grad_norm": 0.005642781965434551, "learning_rate": 2.2665238435293184e-05, "loss": 0.0, "step": 6890 }, { "epoch": 1.6424660795048798, "grad_norm": 0.0022149153519421816, "learning_rate": 2.2625565341585338e-05, "loss": 0.0004, "step": 6900 }, { "epoch": 1.6448464651273507, "grad_norm": 0.8982350826263428, "learning_rate": 2.2585892247877492e-05, "loss": 0.0003, "step": 6910 }, { "epoch": 1.6472268507498216, "grad_norm": 0.002032769611105323, "learning_rate": 2.2546219154169642e-05, "loss": 0.0001, "step": 6920 }, { "epoch": 1.6496072363722925, "grad_norm": 0.0021233465522527695, "learning_rate": 2.2506546060461796e-05, "loss": 0.0001, "step": 6930 }, { "epoch": 1.6519876219947631, "grad_norm": 0.019824443385004997, "learning_rate": 2.246687296675395e-05, "loss": 0.0001, "step": 6940 }, { "epoch": 1.654368007617234, "grad_norm": 0.002160045551136136, "learning_rate": 2.24271998730461e-05, "loss": 0.0001, "step": 6950 }, { "epoch": 1.6567483932397047, "grad_norm": 0.002742405980825424, "learning_rate": 2.2387526779338254e-05, "loss": 0.0003, "step": 6960 }, { "epoch": 1.6591287788621756, "grad_norm": 0.04358428716659546, "learning_rate": 2.2347853685630407e-05, "loss": 0.0003, "step": 6970 }, { "epoch": 1.6615091644846465, "grad_norm": 0.0023650035727769136, "learning_rate": 2.2308180591922558e-05, "loss": 0.0, "step": 6980 }, { "epoch": 1.6638895501071174, "grad_norm": 0.0027010326739400625, "learning_rate": 2.226850749821471e-05, "loss": 0.0001, "step": 6990 }, { "epoch": 1.6662699357295883, "grad_norm": 0.01885942928493023, "learning_rate": 2.2228834404506865e-05, "loss": 0.0001, "step": 7000 }, { "epoch": 1.6686503213520592, "grad_norm": 0.013014287687838078, "learning_rate": 2.218916131079902e-05, "loss": 0.0004, "step": 7010 }, { "epoch": 1.6710307069745298, "grad_norm": 0.0015542235923931003, "learning_rate": 2.214948821709117e-05, "loss": 0.0001, "step": 7020 }, { "epoch": 1.6734110925970007, "grad_norm": 0.011335782706737518, "learning_rate": 2.2109815123383323e-05, "loss": 0.0, "step": 7030 }, { "epoch": 1.6757914782194716, "grad_norm": 0.1068568155169487, "learning_rate": 2.2070142029675476e-05, "loss": 0.0001, "step": 7040 }, { "epoch": 1.6781718638419423, "grad_norm": 0.004407468251883984, "learning_rate": 2.2030468935967627e-05, "loss": 0.0001, "step": 7050 }, { "epoch": 1.6805522494644132, "grad_norm": 0.0026373250875622034, "learning_rate": 2.199079584225978e-05, "loss": 0.0, "step": 7060 }, { "epoch": 1.682932635086884, "grad_norm": 0.020453903824090958, "learning_rate": 2.1951122748551934e-05, "loss": 0.0001, "step": 7070 }, { "epoch": 1.685313020709355, "grad_norm": 0.009605340659618378, "learning_rate": 2.1911449654844084e-05, "loss": 0.0003, "step": 7080 }, { "epoch": 1.6876934063318259, "grad_norm": 0.0008563417941331863, "learning_rate": 2.1871776561136238e-05, "loss": 0.0, "step": 7090 }, { "epoch": 1.6900737919542967, "grad_norm": 0.0017095934599637985, "learning_rate": 2.1832103467428392e-05, "loss": 0.0001, "step": 7100 }, { "epoch": 1.6924541775767674, "grad_norm": 0.0017231311649084091, "learning_rate": 2.1792430373720542e-05, "loss": 0.0001, "step": 7110 }, { "epoch": 1.6948345631992383, "grad_norm": 0.0004322198801673949, "learning_rate": 2.17527572800127e-05, "loss": 0.0001, "step": 7120 }, { "epoch": 1.697214948821709, "grad_norm": 0.06828305870294571, "learning_rate": 2.171308418630485e-05, "loss": 0.0003, "step": 7130 }, { "epoch": 1.6995953344441799, "grad_norm": 0.012662236578762531, "learning_rate": 2.1673411092597e-05, "loss": 0.0, "step": 7140 }, { "epoch": 1.7019757200666508, "grad_norm": 0.0004414702707435936, "learning_rate": 2.1633737998889153e-05, "loss": 0.0, "step": 7150 }, { "epoch": 1.7043561056891217, "grad_norm": 0.0018225832609459758, "learning_rate": 2.1594064905181307e-05, "loss": 0.0001, "step": 7160 }, { "epoch": 1.7067364913115926, "grad_norm": 0.10008008033037186, "learning_rate": 2.155439181147346e-05, "loss": 0.0001, "step": 7170 }, { "epoch": 1.7091168769340634, "grad_norm": 0.0027361391112208366, "learning_rate": 2.151471871776561e-05, "loss": 0.0, "step": 7180 }, { "epoch": 1.7114972625565341, "grad_norm": 0.0021505611948668957, "learning_rate": 2.1475045624057765e-05, "loss": 0.0, "step": 7190 }, { "epoch": 1.713877648179005, "grad_norm": 0.00697895884513855, "learning_rate": 2.143537253034992e-05, "loss": 0.0, "step": 7200 }, { "epoch": 1.7162580338014757, "grad_norm": 0.002057724166661501, "learning_rate": 2.139569943664207e-05, "loss": 0.0, "step": 7210 }, { "epoch": 1.7186384194239466, "grad_norm": 0.002399923512712121, "learning_rate": 2.1356026342934223e-05, "loss": 0.0001, "step": 7220 }, { "epoch": 1.7210188050464175, "grad_norm": 0.3061892092227936, "learning_rate": 2.1316353249226376e-05, "loss": 0.0002, "step": 7230 }, { "epoch": 1.7233991906688884, "grad_norm": 0.004888875875622034, "learning_rate": 2.1276680155518527e-05, "loss": 0.0, "step": 7240 }, { "epoch": 1.7257795762913593, "grad_norm": 0.04453931376338005, "learning_rate": 2.1237007061810684e-05, "loss": 0.0001, "step": 7250 }, { "epoch": 1.7281599619138301, "grad_norm": 0.02463744953274727, "learning_rate": 2.1197333968102834e-05, "loss": 0.0001, "step": 7260 }, { "epoch": 1.730540347536301, "grad_norm": 0.002113641705363989, "learning_rate": 2.1157660874394984e-05, "loss": 0.0001, "step": 7270 }, { "epoch": 1.7329207331587717, "grad_norm": 0.0024889137130230665, "learning_rate": 2.111798778068714e-05, "loss": 0.0001, "step": 7280 }, { "epoch": 1.7353011187812426, "grad_norm": 0.10477261245250702, "learning_rate": 2.107831468697929e-05, "loss": 0.0001, "step": 7290 }, { "epoch": 1.7376815044037133, "grad_norm": 0.0008585329633206129, "learning_rate": 2.1038641593271445e-05, "loss": 0.0001, "step": 7300 }, { "epoch": 1.7400618900261842, "grad_norm": 0.17968738079071045, "learning_rate": 2.09989684995636e-05, "loss": 0.0001, "step": 7310 }, { "epoch": 1.742442275648655, "grad_norm": 0.0023223140742629766, "learning_rate": 2.095929540585575e-05, "loss": 0.0001, "step": 7320 }, { "epoch": 1.744822661271126, "grad_norm": 0.0016741958679631352, "learning_rate": 2.0919622312147903e-05, "loss": 0.0002, "step": 7330 }, { "epoch": 1.7472030468935968, "grad_norm": 0.009992700070142746, "learning_rate": 2.0879949218440053e-05, "loss": 0.0, "step": 7340 }, { "epoch": 1.7495834325160677, "grad_norm": 0.002163327531889081, "learning_rate": 2.0840276124732207e-05, "loss": 0.0, "step": 7350 }, { "epoch": 1.7519638181385384, "grad_norm": 0.15539680421352386, "learning_rate": 2.080060303102436e-05, "loss": 0.0003, "step": 7360 }, { "epoch": 1.7543442037610093, "grad_norm": 0.002331450814381242, "learning_rate": 2.076092993731651e-05, "loss": 0.0, "step": 7370 }, { "epoch": 1.75672458938348, "grad_norm": 0.0014541965210810304, "learning_rate": 2.0721256843608665e-05, "loss": 0.0, "step": 7380 }, { "epoch": 1.7591049750059509, "grad_norm": 0.002874292666092515, "learning_rate": 2.068158374990082e-05, "loss": 0.0, "step": 7390 }, { "epoch": 1.7614853606284218, "grad_norm": 0.046790674328804016, "learning_rate": 2.064191065619297e-05, "loss": 0.0001, "step": 7400 }, { "epoch": 1.7638657462508927, "grad_norm": 0.012541896663606167, "learning_rate": 2.0602237562485126e-05, "loss": 0.0001, "step": 7410 }, { "epoch": 1.7662461318733635, "grad_norm": 0.0005884987185709178, "learning_rate": 2.0562564468777276e-05, "loss": 0.0001, "step": 7420 }, { "epoch": 1.7686265174958344, "grad_norm": 0.0090475520119071, "learning_rate": 2.0522891375069426e-05, "loss": 0.0001, "step": 7430 }, { "epoch": 1.7710069031183053, "grad_norm": 0.04852410405874252, "learning_rate": 2.0483218281361584e-05, "loss": 0.0001, "step": 7440 }, { "epoch": 1.773387288740776, "grad_norm": 0.003311296459287405, "learning_rate": 2.0443545187653734e-05, "loss": 0.0001, "step": 7450 }, { "epoch": 1.775767674363247, "grad_norm": 0.03242022171616554, "learning_rate": 2.0403872093945888e-05, "loss": 0.0001, "step": 7460 }, { "epoch": 1.7781480599857176, "grad_norm": 0.010833712294697762, "learning_rate": 2.036419900023804e-05, "loss": 0.0002, "step": 7470 }, { "epoch": 1.7805284456081885, "grad_norm": 0.0031983698718249798, "learning_rate": 2.032452590653019e-05, "loss": 0.0001, "step": 7480 }, { "epoch": 1.7829088312306594, "grad_norm": 0.021590987220406532, "learning_rate": 2.0284852812822345e-05, "loss": 0.0001, "step": 7490 }, { "epoch": 1.7852892168531302, "grad_norm": 0.005147872492671013, "learning_rate": 2.02451797191145e-05, "loss": 0.0, "step": 7500 }, { "epoch": 1.7876696024756011, "grad_norm": 0.0012411813950166106, "learning_rate": 2.020550662540665e-05, "loss": 0.0001, "step": 7510 }, { "epoch": 1.790049988098072, "grad_norm": 0.0009874672396108508, "learning_rate": 2.0165833531698803e-05, "loss": 0.0001, "step": 7520 }, { "epoch": 1.7924303737205427, "grad_norm": 0.002135714516043663, "learning_rate": 2.0126160437990957e-05, "loss": 0.0003, "step": 7530 }, { "epoch": 1.7948107593430136, "grad_norm": 0.002928838599473238, "learning_rate": 2.008648734428311e-05, "loss": 0.0002, "step": 7540 }, { "epoch": 1.7971911449654843, "grad_norm": 0.002418682212010026, "learning_rate": 2.004681425057526e-05, "loss": 0.0001, "step": 7550 }, { "epoch": 1.7995715305879552, "grad_norm": 0.022359730675816536, "learning_rate": 2.000714115686741e-05, "loss": 0.0, "step": 7560 }, { "epoch": 1.801951916210426, "grad_norm": 0.0013171250466257334, "learning_rate": 1.9967468063159568e-05, "loss": 0.0001, "step": 7570 }, { "epoch": 1.804332301832897, "grad_norm": 0.005206149537116289, "learning_rate": 1.992779496945172e-05, "loss": 0.0001, "step": 7580 }, { "epoch": 1.8067126874553678, "grad_norm": 0.5035125613212585, "learning_rate": 1.988812187574387e-05, "loss": 0.0004, "step": 7590 }, { "epoch": 1.8090930730778387, "grad_norm": 0.0018090710509568453, "learning_rate": 1.9848448782036026e-05, "loss": 0.0, "step": 7600 }, { "epoch": 1.8114734587003094, "grad_norm": 0.0020274862181395292, "learning_rate": 1.9808775688328176e-05, "loss": 0.0, "step": 7610 }, { "epoch": 1.8138538443227803, "grad_norm": 0.008559592068195343, "learning_rate": 1.976910259462033e-05, "loss": 0.0001, "step": 7620 }, { "epoch": 1.816234229945251, "grad_norm": 0.002766631543636322, "learning_rate": 1.9729429500912483e-05, "loss": 0.0, "step": 7630 }, { "epoch": 1.8186146155677219, "grad_norm": 0.003933802247047424, "learning_rate": 1.9689756407204634e-05, "loss": 0.0001, "step": 7640 }, { "epoch": 1.8209950011901928, "grad_norm": 0.0502641461789608, "learning_rate": 1.9650083313496787e-05, "loss": 0.0001, "step": 7650 }, { "epoch": 1.8233753868126636, "grad_norm": 0.002705627353861928, "learning_rate": 1.961041021978894e-05, "loss": 0.0, "step": 7660 }, { "epoch": 1.8257557724351345, "grad_norm": 0.015057703480124474, "learning_rate": 1.957073712608109e-05, "loss": 0.0001, "step": 7670 }, { "epoch": 1.8281361580576054, "grad_norm": 0.0005775150493718684, "learning_rate": 1.9531064032373245e-05, "loss": 0.0001, "step": 7680 }, { "epoch": 1.8305165436800763, "grad_norm": 0.006392305716872215, "learning_rate": 1.94913909386654e-05, "loss": 0.0001, "step": 7690 }, { "epoch": 1.832896929302547, "grad_norm": 0.0014930195175111294, "learning_rate": 1.9451717844957553e-05, "loss": 0.0002, "step": 7700 }, { "epoch": 1.8352773149250179, "grad_norm": 0.0161952693015337, "learning_rate": 1.9412044751249703e-05, "loss": 0.0001, "step": 7710 }, { "epoch": 1.8376577005474886, "grad_norm": 0.0019109123386442661, "learning_rate": 1.9372371657541857e-05, "loss": 0.0001, "step": 7720 }, { "epoch": 1.8400380861699595, "grad_norm": 0.0026801279745996, "learning_rate": 1.933269856383401e-05, "loss": 0.0, "step": 7730 }, { "epoch": 1.8424184717924303, "grad_norm": 0.006187149789184332, "learning_rate": 1.929302547012616e-05, "loss": 0.0001, "step": 7740 }, { "epoch": 1.8447988574149012, "grad_norm": 0.002990028355270624, "learning_rate": 1.9253352376418314e-05, "loss": 0.0001, "step": 7750 }, { "epoch": 1.8471792430373721, "grad_norm": 0.0044268155470490456, "learning_rate": 1.9213679282710468e-05, "loss": 0.0, "step": 7760 }, { "epoch": 1.849559628659843, "grad_norm": 0.005206019151955843, "learning_rate": 1.9174006189002618e-05, "loss": 0.0001, "step": 7770 }, { "epoch": 1.8519400142823137, "grad_norm": 0.005415783729404211, "learning_rate": 1.9134333095294772e-05, "loss": 0.0001, "step": 7780 }, { "epoch": 1.8543203999047846, "grad_norm": 0.0016888550017029047, "learning_rate": 1.9094660001586926e-05, "loss": 0.0, "step": 7790 }, { "epoch": 1.8567007855272553, "grad_norm": 0.003122705966234207, "learning_rate": 1.9054986907879076e-05, "loss": 0.0, "step": 7800 }, { "epoch": 1.8590811711497262, "grad_norm": 0.021525248885154724, "learning_rate": 1.901531381417123e-05, "loss": 0.0, "step": 7810 }, { "epoch": 1.861461556772197, "grad_norm": 0.004836782813072205, "learning_rate": 1.8975640720463383e-05, "loss": 0.0, "step": 7820 }, { "epoch": 1.863841942394668, "grad_norm": 0.003003711812198162, "learning_rate": 1.8935967626755534e-05, "loss": 0.0001, "step": 7830 }, { "epoch": 1.8662223280171388, "grad_norm": 0.0034373151138424873, "learning_rate": 1.8896294533047687e-05, "loss": 0.0001, "step": 7840 }, { "epoch": 1.8686027136396097, "grad_norm": 0.061307862401008606, "learning_rate": 1.885662143933984e-05, "loss": 0.0001, "step": 7850 }, { "epoch": 1.8709830992620806, "grad_norm": 0.001207771128974855, "learning_rate": 1.8816948345631995e-05, "loss": 0.0, "step": 7860 }, { "epoch": 1.8733634848845513, "grad_norm": 0.007686016149818897, "learning_rate": 1.8777275251924145e-05, "loss": 0.0001, "step": 7870 }, { "epoch": 1.8757438705070222, "grad_norm": 0.0019049645634368062, "learning_rate": 1.87376021582163e-05, "loss": 0.0001, "step": 7880 }, { "epoch": 1.8781242561294929, "grad_norm": 0.00202633673325181, "learning_rate": 1.8697929064508452e-05, "loss": 0.0, "step": 7890 }, { "epoch": 1.8805046417519637, "grad_norm": 0.0011157892877236009, "learning_rate": 1.8658255970800603e-05, "loss": 0.0002, "step": 7900 }, { "epoch": 1.8828850273744346, "grad_norm": 0.001622357638552785, "learning_rate": 1.8618582877092756e-05, "loss": 0.0006, "step": 7910 }, { "epoch": 1.8852654129969055, "grad_norm": 0.04895901307463646, "learning_rate": 1.857890978338491e-05, "loss": 0.0002, "step": 7920 }, { "epoch": 1.8876457986193764, "grad_norm": 0.0012425240129232407, "learning_rate": 1.853923668967706e-05, "loss": 0.0001, "step": 7930 }, { "epoch": 1.8900261842418473, "grad_norm": 0.004690519999712706, "learning_rate": 1.8499563595969214e-05, "loss": 0.0001, "step": 7940 }, { "epoch": 1.892406569864318, "grad_norm": 0.0015794184291735291, "learning_rate": 1.8459890502261368e-05, "loss": 0.0, "step": 7950 }, { "epoch": 1.8947869554867889, "grad_norm": 0.01080586388707161, "learning_rate": 1.8420217408553518e-05, "loss": 0.0002, "step": 7960 }, { "epoch": 1.8971673411092596, "grad_norm": 0.0018335338681936264, "learning_rate": 1.8380544314845672e-05, "loss": 0.0, "step": 7970 }, { "epoch": 1.8995477267317304, "grad_norm": 0.003800921142101288, "learning_rate": 1.8340871221137826e-05, "loss": 0.0, "step": 7980 }, { "epoch": 1.9019281123542013, "grad_norm": 0.0035681715235114098, "learning_rate": 1.830119812742998e-05, "loss": 0.0001, "step": 7990 }, { "epoch": 1.9043084979766722, "grad_norm": 0.001115818158723414, "learning_rate": 1.826152503372213e-05, "loss": 0.0003, "step": 8000 }, { "epoch": 1.9066888835991431, "grad_norm": 0.004726150073111057, "learning_rate": 1.8221851940014283e-05, "loss": 0.0001, "step": 8010 }, { "epoch": 1.909069269221614, "grad_norm": 0.025985538959503174, "learning_rate": 1.8182178846306437e-05, "loss": 0.0, "step": 8020 }, { "epoch": 1.9114496548440847, "grad_norm": 0.002658289624378085, "learning_rate": 1.8142505752598587e-05, "loss": 0.0001, "step": 8030 }, { "epoch": 1.9138300404665556, "grad_norm": 0.010776730254292488, "learning_rate": 1.810283265889074e-05, "loss": 0.0001, "step": 8040 }, { "epoch": 1.9162104260890265, "grad_norm": 0.004742765333503485, "learning_rate": 1.8063159565182895e-05, "loss": 0.0001, "step": 8050 }, { "epoch": 1.9185908117114971, "grad_norm": 0.0017833469901233912, "learning_rate": 1.8023486471475045e-05, "loss": 0.0003, "step": 8060 }, { "epoch": 1.920971197333968, "grad_norm": 0.0015226156683638692, "learning_rate": 1.79838133777672e-05, "loss": 0.0, "step": 8070 }, { "epoch": 1.923351582956439, "grad_norm": 0.0021416472736746073, "learning_rate": 1.7944140284059352e-05, "loss": 0.0001, "step": 8080 }, { "epoch": 1.9257319685789098, "grad_norm": 0.0021594560239464045, "learning_rate": 1.7904467190351503e-05, "loss": 0.0, "step": 8090 }, { "epoch": 1.9281123542013807, "grad_norm": 0.0018359982641413808, "learning_rate": 1.786479409664366e-05, "loss": 0.0, "step": 8100 }, { "epoch": 1.9304927398238516, "grad_norm": 0.0036185849457979202, "learning_rate": 1.782512100293581e-05, "loss": 0.0001, "step": 8110 }, { "epoch": 1.9328731254463223, "grad_norm": 0.019637318328022957, "learning_rate": 1.778544790922796e-05, "loss": 0.0, "step": 8120 }, { "epoch": 1.9352535110687932, "grad_norm": 0.002496182220056653, "learning_rate": 1.7745774815520117e-05, "loss": 0.0001, "step": 8130 }, { "epoch": 1.9376338966912638, "grad_norm": 0.004374451469630003, "learning_rate": 1.7706101721812268e-05, "loss": 0.0005, "step": 8140 }, { "epoch": 1.9400142823137347, "grad_norm": 0.0006196928443387151, "learning_rate": 1.766642862810442e-05, "loss": 0.0, "step": 8150 }, { "epoch": 1.9423946679362056, "grad_norm": 0.0037022046744823456, "learning_rate": 1.7626755534396572e-05, "loss": 0.0, "step": 8160 }, { "epoch": 1.9447750535586765, "grad_norm": 0.004300027620047331, "learning_rate": 1.7587082440688725e-05, "loss": 0.0, "step": 8170 }, { "epoch": 1.9471554391811474, "grad_norm": 0.0019766122568398714, "learning_rate": 1.754740934698088e-05, "loss": 0.0001, "step": 8180 }, { "epoch": 1.9495358248036183, "grad_norm": 0.0018594982102513313, "learning_rate": 1.750773625327303e-05, "loss": 0.0001, "step": 8190 }, { "epoch": 1.951916210426089, "grad_norm": 0.0012102769687771797, "learning_rate": 1.7468063159565183e-05, "loss": 0.0001, "step": 8200 }, { "epoch": 1.9542965960485599, "grad_norm": 0.0012130772229284048, "learning_rate": 1.7428390065857337e-05, "loss": 0.0, "step": 8210 }, { "epoch": 1.9566769816710305, "grad_norm": 0.0006833472289144993, "learning_rate": 1.7388716972149487e-05, "loss": 0.0004, "step": 8220 }, { "epoch": 1.9590573672935014, "grad_norm": 0.0017617164412513375, "learning_rate": 1.7349043878441644e-05, "loss": 0.0001, "step": 8230 }, { "epoch": 1.9614377529159723, "grad_norm": 0.0013312195660546422, "learning_rate": 1.7309370784733795e-05, "loss": 0.0, "step": 8240 }, { "epoch": 1.9638181385384432, "grad_norm": 0.0018878667615354061, "learning_rate": 1.7269697691025945e-05, "loss": 0.0, "step": 8250 }, { "epoch": 1.9661985241609141, "grad_norm": 0.0019427284132689238, "learning_rate": 1.7230024597318102e-05, "loss": 0.0, "step": 8260 }, { "epoch": 1.968578909783385, "grad_norm": 0.004271362908184528, "learning_rate": 1.7190351503610252e-05, "loss": 0.0001, "step": 8270 }, { "epoch": 1.970959295405856, "grad_norm": 0.0027857243549078703, "learning_rate": 1.7150678409902406e-05, "loss": 0.0, "step": 8280 }, { "epoch": 1.9733396810283266, "grad_norm": 0.0018286170670762658, "learning_rate": 1.711100531619456e-05, "loss": 0.0001, "step": 8290 }, { "epoch": 1.9757200666507975, "grad_norm": 0.001666391035541892, "learning_rate": 1.707133222248671e-05, "loss": 0.0, "step": 8300 }, { "epoch": 1.9781004522732681, "grad_norm": 0.021936526522040367, "learning_rate": 1.7031659128778864e-05, "loss": 0.0001, "step": 8310 }, { "epoch": 1.980480837895739, "grad_norm": 0.00029301681206561625, "learning_rate": 1.6991986035071017e-05, "loss": 0.0, "step": 8320 }, { "epoch": 1.98286122351821, "grad_norm": 0.0009200606727972627, "learning_rate": 1.6952312941363168e-05, "loss": 0.0, "step": 8330 }, { "epoch": 1.9852416091406808, "grad_norm": 0.00579107366502285, "learning_rate": 1.691263984765532e-05, "loss": 0.0, "step": 8340 }, { "epoch": 1.9876219947631517, "grad_norm": 0.000620057515334338, "learning_rate": 1.687296675394747e-05, "loss": 0.0001, "step": 8350 }, { "epoch": 1.9900023803856226, "grad_norm": 0.0015694822650402784, "learning_rate": 1.6833293660239625e-05, "loss": 0.0, "step": 8360 }, { "epoch": 1.9923827660080933, "grad_norm": 0.0013426202349364758, "learning_rate": 1.679362056653178e-05, "loss": 0.0, "step": 8370 }, { "epoch": 1.9947631516305642, "grad_norm": 0.06455473601818085, "learning_rate": 1.675394747282393e-05, "loss": 0.0001, "step": 8380 }, { "epoch": 1.9971435372530348, "grad_norm": 0.0007938113994896412, "learning_rate": 1.6714274379116086e-05, "loss": 0.0001, "step": 8390 }, { "epoch": 1.9995239228755057, "grad_norm": 0.0030489168129861355, "learning_rate": 1.6674601285408237e-05, "loss": 0.0001, "step": 8400 }, { "epoch": 2.0, "eval_loss": 7.416475114041532e-07, "eval_runtime": 52.1219, "eval_samples_per_second": 35.839, "eval_steps_per_second": 8.96, "step": 8402 }, { "epoch": 2.0019043084979766, "grad_norm": 0.00039361350354738533, "learning_rate": 1.6634928191700387e-05, "loss": 0.0001, "step": 8410 }, { "epoch": 2.0042846941204475, "grad_norm": 0.007912525907158852, "learning_rate": 1.6595255097992544e-05, "loss": 0.0001, "step": 8420 }, { "epoch": 2.0066650797429184, "grad_norm": 0.003857001895084977, "learning_rate": 1.6555582004284694e-05, "loss": 0.0001, "step": 8430 }, { "epoch": 2.0090454653653893, "grad_norm": 0.002192788990214467, "learning_rate": 1.6515908910576848e-05, "loss": 0.0, "step": 8440 }, { "epoch": 2.01142585098786, "grad_norm": 0.00107199524063617, "learning_rate": 1.6476235816869002e-05, "loss": 0.0002, "step": 8450 }, { "epoch": 2.013806236610331, "grad_norm": 0.024036822840571404, "learning_rate": 1.6436562723161152e-05, "loss": 0.0001, "step": 8460 }, { "epoch": 2.0161866222328015, "grad_norm": 0.000551603501662612, "learning_rate": 1.6396889629453306e-05, "loss": 0.0, "step": 8470 }, { "epoch": 2.0185670078552724, "grad_norm": 0.001782495528459549, "learning_rate": 1.635721653574546e-05, "loss": 0.0001, "step": 8480 }, { "epoch": 2.0209473934777433, "grad_norm": 0.030838970094919205, "learning_rate": 1.631754344203761e-05, "loss": 0.0001, "step": 8490 }, { "epoch": 2.023327779100214, "grad_norm": 0.0005242625484243035, "learning_rate": 1.6277870348329764e-05, "loss": 0.0003, "step": 8500 }, { "epoch": 2.025708164722685, "grad_norm": 0.001871236483566463, "learning_rate": 1.6238197254621917e-05, "loss": 0.0, "step": 8510 }, { "epoch": 2.028088550345156, "grad_norm": 0.0005813137395307422, "learning_rate": 1.6198524160914068e-05, "loss": 0.0, "step": 8520 }, { "epoch": 2.030468935967627, "grad_norm": 0.0007783659384585917, "learning_rate": 1.615885106720622e-05, "loss": 0.0, "step": 8530 }, { "epoch": 2.032849321590098, "grad_norm": 0.002862844616174698, "learning_rate": 1.6119177973498375e-05, "loss": 0.0001, "step": 8540 }, { "epoch": 2.0352297072125682, "grad_norm": 0.0016766699263826013, "learning_rate": 1.607950487979053e-05, "loss": 0.0, "step": 8550 }, { "epoch": 2.037610092835039, "grad_norm": 0.06566356122493744, "learning_rate": 1.603983178608268e-05, "loss": 0.0002, "step": 8560 }, { "epoch": 2.03999047845751, "grad_norm": 0.0013121259398758411, "learning_rate": 1.600015869237483e-05, "loss": 0.0, "step": 8570 }, { "epoch": 2.042370864079981, "grad_norm": 0.0012001970317214727, "learning_rate": 1.5960485598666986e-05, "loss": 0.0001, "step": 8580 }, { "epoch": 2.044751249702452, "grad_norm": 0.008261552080512047, "learning_rate": 1.5920812504959137e-05, "loss": 0.0, "step": 8590 }, { "epoch": 2.0471316353249227, "grad_norm": 0.0006174147129058838, "learning_rate": 1.588113941125129e-05, "loss": 0.0, "step": 8600 }, { "epoch": 2.0495120209473936, "grad_norm": 0.005130809266120195, "learning_rate": 1.5841466317543444e-05, "loss": 0.0001, "step": 8610 }, { "epoch": 2.0518924065698645, "grad_norm": 0.0034670240711420774, "learning_rate": 1.5801793223835594e-05, "loss": 0.0004, "step": 8620 }, { "epoch": 2.054272792192335, "grad_norm": 0.0055514005944132805, "learning_rate": 1.5762120130127748e-05, "loss": 0.0001, "step": 8630 }, { "epoch": 2.056653177814806, "grad_norm": 0.0003135903971269727, "learning_rate": 1.5722447036419902e-05, "loss": 0.0, "step": 8640 }, { "epoch": 2.0590335634372767, "grad_norm": 0.002474389737471938, "learning_rate": 1.5682773942712052e-05, "loss": 0.0001, "step": 8650 }, { "epoch": 2.0614139490597476, "grad_norm": 0.004792024847120047, "learning_rate": 1.5643100849004206e-05, "loss": 0.0, "step": 8660 }, { "epoch": 2.0637943346822185, "grad_norm": 0.0030985362827777863, "learning_rate": 1.560342775529636e-05, "loss": 0.0001, "step": 8670 }, { "epoch": 2.0661747203046894, "grad_norm": 0.004058391321450472, "learning_rate": 1.5563754661588513e-05, "loss": 0.0, "step": 8680 }, { "epoch": 2.0685551059271603, "grad_norm": 0.00150771695189178, "learning_rate": 1.5524081567880663e-05, "loss": 0.0, "step": 8690 }, { "epoch": 2.070935491549631, "grad_norm": 0.001020533381961286, "learning_rate": 1.5484408474172817e-05, "loss": 0.0, "step": 8700 }, { "epoch": 2.073315877172102, "grad_norm": 0.00616106390953064, "learning_rate": 1.544473538046497e-05, "loss": 0.0, "step": 8710 }, { "epoch": 2.0756962627945725, "grad_norm": 0.0025589261204004288, "learning_rate": 1.540506228675712e-05, "loss": 0.0001, "step": 8720 }, { "epoch": 2.0780766484170434, "grad_norm": 0.0006466865306720138, "learning_rate": 1.5365389193049275e-05, "loss": 0.0, "step": 8730 }, { "epoch": 2.0804570340395143, "grad_norm": 0.002343350788578391, "learning_rate": 1.532571609934143e-05, "loss": 0.0001, "step": 8740 }, { "epoch": 2.082837419661985, "grad_norm": 0.0006717872456647456, "learning_rate": 1.528604300563358e-05, "loss": 0.0001, "step": 8750 }, { "epoch": 2.085217805284456, "grad_norm": 0.0009957224829122424, "learning_rate": 1.524636991192573e-05, "loss": 0.0002, "step": 8760 }, { "epoch": 2.087598190906927, "grad_norm": 0.0014106009621173143, "learning_rate": 1.5206696818217886e-05, "loss": 0.0, "step": 8770 }, { "epoch": 2.089978576529398, "grad_norm": 0.0011065505677834153, "learning_rate": 1.5167023724510038e-05, "loss": 0.0001, "step": 8780 }, { "epoch": 2.092358962151869, "grad_norm": 0.0027844165451824665, "learning_rate": 1.512735063080219e-05, "loss": 0.0001, "step": 8790 }, { "epoch": 2.0947393477743392, "grad_norm": 0.0006960778846405447, "learning_rate": 1.5087677537094344e-05, "loss": 0.0, "step": 8800 }, { "epoch": 2.09711973339681, "grad_norm": 0.0003423156449571252, "learning_rate": 1.5048004443386496e-05, "loss": 0.0, "step": 8810 }, { "epoch": 2.099500119019281, "grad_norm": 0.0011733579449355602, "learning_rate": 1.5008331349678648e-05, "loss": 0.0, "step": 8820 }, { "epoch": 2.101880504641752, "grad_norm": 0.004115458112210035, "learning_rate": 1.4968658255970802e-05, "loss": 0.0, "step": 8830 }, { "epoch": 2.104260890264223, "grad_norm": 0.072359099984169, "learning_rate": 1.4928985162262954e-05, "loss": 0.0, "step": 8840 }, { "epoch": 2.1066412758866937, "grad_norm": 0.003922273404896259, "learning_rate": 1.4889312068555106e-05, "loss": 0.0, "step": 8850 }, { "epoch": 2.1090216615091646, "grad_norm": 0.012736503966152668, "learning_rate": 1.4849638974847261e-05, "loss": 0.0, "step": 8860 }, { "epoch": 2.1114020471316355, "grad_norm": 0.0019338323036208749, "learning_rate": 1.4809965881139411e-05, "loss": 0.0001, "step": 8870 }, { "epoch": 2.1137824327541064, "grad_norm": 0.0015457593835890293, "learning_rate": 1.4770292787431563e-05, "loss": 0.0, "step": 8880 }, { "epoch": 2.116162818376577, "grad_norm": 0.0016716497484594584, "learning_rate": 1.4730619693723719e-05, "loss": 0.0, "step": 8890 }, { "epoch": 2.1185432039990477, "grad_norm": 0.001560089411213994, "learning_rate": 1.469094660001587e-05, "loss": 0.0, "step": 8900 }, { "epoch": 2.1209235896215186, "grad_norm": 0.0031743065919727087, "learning_rate": 1.4651273506308023e-05, "loss": 0.0, "step": 8910 }, { "epoch": 2.1233039752439895, "grad_norm": 0.0015614436706528068, "learning_rate": 1.4611600412600176e-05, "loss": 0.0, "step": 8920 }, { "epoch": 2.1256843608664604, "grad_norm": 0.0005399516085162759, "learning_rate": 1.4571927318892328e-05, "loss": 0.0, "step": 8930 }, { "epoch": 2.1280647464889313, "grad_norm": 0.0014794693561270833, "learning_rate": 1.453225422518448e-05, "loss": 0.0, "step": 8940 }, { "epoch": 2.130445132111402, "grad_norm": 0.0024672893341630697, "learning_rate": 1.4492581131476632e-05, "loss": 0.0, "step": 8950 }, { "epoch": 2.132825517733873, "grad_norm": 0.0013646584702655673, "learning_rate": 1.4452908037768786e-05, "loss": 0.0, "step": 8960 }, { "epoch": 2.1352059033563435, "grad_norm": 0.07290241867303848, "learning_rate": 1.4413234944060938e-05, "loss": 0.0001, "step": 8970 }, { "epoch": 2.1375862889788144, "grad_norm": 0.001859787036664784, "learning_rate": 1.437356185035309e-05, "loss": 0.0, "step": 8980 }, { "epoch": 2.1399666746012853, "grad_norm": 0.001754750614054501, "learning_rate": 1.4333888756645244e-05, "loss": 0.0, "step": 8990 }, { "epoch": 2.142347060223756, "grad_norm": 0.028476126492023468, "learning_rate": 1.4294215662937396e-05, "loss": 0.0, "step": 9000 }, { "epoch": 2.144727445846227, "grad_norm": 0.0005994876846671104, "learning_rate": 1.4254542569229548e-05, "loss": 0.0, "step": 9010 }, { "epoch": 2.147107831468698, "grad_norm": 0.0007879494805820286, "learning_rate": 1.4214869475521703e-05, "loss": 0.0, "step": 9020 }, { "epoch": 2.149488217091169, "grad_norm": 0.0012654970632866025, "learning_rate": 1.4175196381813855e-05, "loss": 0.0, "step": 9030 }, { "epoch": 2.1518686027136398, "grad_norm": 0.0018679037457332015, "learning_rate": 1.4135523288106006e-05, "loss": 0.0, "step": 9040 }, { "epoch": 2.1542489883361107, "grad_norm": 0.0017861429369077086, "learning_rate": 1.4095850194398161e-05, "loss": 0.0, "step": 9050 }, { "epoch": 2.156629373958581, "grad_norm": 0.006415149662643671, "learning_rate": 1.4056177100690313e-05, "loss": 0.0, "step": 9060 }, { "epoch": 2.159009759581052, "grad_norm": 0.002842891961336136, "learning_rate": 1.4016504006982465e-05, "loss": 0.0, "step": 9070 }, { "epoch": 2.161390145203523, "grad_norm": 0.0013869826216250658, "learning_rate": 1.3976830913274619e-05, "loss": 0.0, "step": 9080 }, { "epoch": 2.163770530825994, "grad_norm": 0.018388478085398674, "learning_rate": 1.393715781956677e-05, "loss": 0.0001, "step": 9090 }, { "epoch": 2.1661509164484647, "grad_norm": 0.0008245584322139621, "learning_rate": 1.3897484725858923e-05, "loss": 0.0, "step": 9100 }, { "epoch": 2.1685313020709356, "grad_norm": 0.36837905645370483, "learning_rate": 1.3857811632151076e-05, "loss": 0.0001, "step": 9110 }, { "epoch": 2.1709116876934065, "grad_norm": 0.002466343343257904, "learning_rate": 1.3818138538443228e-05, "loss": 0.0001, "step": 9120 }, { "epoch": 2.1732920733158774, "grad_norm": 0.0035982499830424786, "learning_rate": 1.377846544473538e-05, "loss": 0.0, "step": 9130 }, { "epoch": 2.175672458938348, "grad_norm": 0.13738982379436493, "learning_rate": 1.3738792351027536e-05, "loss": 0.0001, "step": 9140 }, { "epoch": 2.1780528445608187, "grad_norm": 0.00042806967394426465, "learning_rate": 1.3699119257319688e-05, "loss": 0.0, "step": 9150 }, { "epoch": 2.1804332301832896, "grad_norm": 0.002727969316765666, "learning_rate": 1.3659446163611838e-05, "loss": 0.0, "step": 9160 }, { "epoch": 2.1828136158057605, "grad_norm": 0.0010691905627027154, "learning_rate": 1.361977306990399e-05, "loss": 0.0, "step": 9170 }, { "epoch": 2.1851940014282314, "grad_norm": 0.020881984382867813, "learning_rate": 1.3580099976196145e-05, "loss": 0.0, "step": 9180 }, { "epoch": 2.1875743870507023, "grad_norm": 0.0019363940227776766, "learning_rate": 1.3540426882488297e-05, "loss": 0.0, "step": 9190 }, { "epoch": 2.189954772673173, "grad_norm": 0.001359110465273261, "learning_rate": 1.350075378878045e-05, "loss": 0.0, "step": 9200 }, { "epoch": 2.192335158295644, "grad_norm": 0.0024417322129011154, "learning_rate": 1.3461080695072603e-05, "loss": 0.0, "step": 9210 }, { "epoch": 2.1947155439181145, "grad_norm": 0.0006399775156751275, "learning_rate": 1.3421407601364755e-05, "loss": 0.0, "step": 9220 }, { "epoch": 2.1970959295405854, "grad_norm": 0.001347382552921772, "learning_rate": 1.3381734507656907e-05, "loss": 0.0001, "step": 9230 }, { "epoch": 2.1994763151630563, "grad_norm": 0.002276881132274866, "learning_rate": 1.334206141394906e-05, "loss": 0.0, "step": 9240 }, { "epoch": 2.201856700785527, "grad_norm": 0.0005205354536883533, "learning_rate": 1.3302388320241213e-05, "loss": 0.0, "step": 9250 }, { "epoch": 2.204237086407998, "grad_norm": 0.001351204700767994, "learning_rate": 1.3262715226533365e-05, "loss": 0.0, "step": 9260 }, { "epoch": 2.206617472030469, "grad_norm": 0.00529600540176034, "learning_rate": 1.322304213282552e-05, "loss": 0.0002, "step": 9270 }, { "epoch": 2.20899785765294, "grad_norm": 0.002000352367758751, "learning_rate": 1.318336903911767e-05, "loss": 0.0, "step": 9280 }, { "epoch": 2.2113782432754108, "grad_norm": 0.0011036837240681052, "learning_rate": 1.3143695945409823e-05, "loss": 0.0, "step": 9290 }, { "epoch": 2.2137586288978817, "grad_norm": 0.0023322845809161663, "learning_rate": 1.3104022851701978e-05, "loss": 0.0, "step": 9300 }, { "epoch": 2.216139014520352, "grad_norm": 0.0029122158885002136, "learning_rate": 1.306434975799413e-05, "loss": 0.0, "step": 9310 }, { "epoch": 2.218519400142823, "grad_norm": 0.00949085596948862, "learning_rate": 1.302467666428628e-05, "loss": 0.0, "step": 9320 }, { "epoch": 2.220899785765294, "grad_norm": 0.0013391702668741345, "learning_rate": 1.2985003570578436e-05, "loss": 0.0, "step": 9330 }, { "epoch": 2.223280171387765, "grad_norm": 0.00047678747796453536, "learning_rate": 1.2945330476870588e-05, "loss": 0.0, "step": 9340 }, { "epoch": 2.2256605570102357, "grad_norm": 0.0031029602978378534, "learning_rate": 1.290565738316274e-05, "loss": 0.0, "step": 9350 }, { "epoch": 2.2280409426327066, "grad_norm": 0.00046392931835725904, "learning_rate": 1.2865984289454892e-05, "loss": 0.0, "step": 9360 }, { "epoch": 2.2304213282551775, "grad_norm": 0.0008917547529563308, "learning_rate": 1.2826311195747045e-05, "loss": 0.0, "step": 9370 }, { "epoch": 2.2328017138776484, "grad_norm": 0.0039760940708220005, "learning_rate": 1.2786638102039197e-05, "loss": 0.0, "step": 9380 }, { "epoch": 2.235182099500119, "grad_norm": 0.0009416754473932087, "learning_rate": 1.274696500833135e-05, "loss": 0.0, "step": 9390 }, { "epoch": 2.2375624851225897, "grad_norm": 0.0008697324083186686, "learning_rate": 1.2707291914623503e-05, "loss": 0.0, "step": 9400 }, { "epoch": 2.2399428707450606, "grad_norm": 0.00044792311382479966, "learning_rate": 1.2667618820915655e-05, "loss": 0.0, "step": 9410 }, { "epoch": 2.2423232563675315, "grad_norm": 0.0014049585442990065, "learning_rate": 1.2627945727207807e-05, "loss": 0.0, "step": 9420 }, { "epoch": 2.2447036419900024, "grad_norm": 0.00259969150647521, "learning_rate": 1.2588272633499962e-05, "loss": 0.0, "step": 9430 }, { "epoch": 2.2470840276124733, "grad_norm": 0.0012579966569319367, "learning_rate": 1.2548599539792113e-05, "loss": 0.0, "step": 9440 }, { "epoch": 2.249464413234944, "grad_norm": 0.008475791662931442, "learning_rate": 1.2508926446084265e-05, "loss": 0.0, "step": 9450 }, { "epoch": 2.251844798857415, "grad_norm": 0.007055677939206362, "learning_rate": 1.2469253352376418e-05, "loss": 0.0, "step": 9460 }, { "epoch": 2.2542251844798855, "grad_norm": 0.00043771168566308916, "learning_rate": 1.2429580258668572e-05, "loss": 0.0, "step": 9470 }, { "epoch": 2.2566055701023564, "grad_norm": 0.0004315728147048503, "learning_rate": 1.2389907164960724e-05, "loss": 0.0, "step": 9480 }, { "epoch": 2.2589859557248273, "grad_norm": 0.0006574731087312102, "learning_rate": 1.2350234071252876e-05, "loss": 0.0, "step": 9490 }, { "epoch": 2.261366341347298, "grad_norm": 0.000502898299600929, "learning_rate": 1.231056097754503e-05, "loss": 0.0, "step": 9500 }, { "epoch": 2.263746726969769, "grad_norm": 0.0014464023988693953, "learning_rate": 1.2270887883837182e-05, "loss": 0.0, "step": 9510 }, { "epoch": 2.26612711259224, "grad_norm": 0.0007312349043786526, "learning_rate": 1.2231214790129336e-05, "loss": 0.0, "step": 9520 }, { "epoch": 2.268507498214711, "grad_norm": 0.0012411205098032951, "learning_rate": 1.2191541696421488e-05, "loss": 0.0, "step": 9530 }, { "epoch": 2.2708878838371818, "grad_norm": 0.003359739203006029, "learning_rate": 1.215186860271364e-05, "loss": 0.0, "step": 9540 }, { "epoch": 2.2732682694596527, "grad_norm": 0.0025401897728443146, "learning_rate": 1.2112195509005793e-05, "loss": 0.0, "step": 9550 }, { "epoch": 2.275648655082123, "grad_norm": 0.0009357984527014196, "learning_rate": 1.2072522415297945e-05, "loss": 0.0, "step": 9560 }, { "epoch": 2.278029040704594, "grad_norm": 0.015569353476166725, "learning_rate": 1.2032849321590097e-05, "loss": 0.0, "step": 9570 }, { "epoch": 2.280409426327065, "grad_norm": 0.0005228265072219074, "learning_rate": 1.1993176227882251e-05, "loss": 0.0, "step": 9580 }, { "epoch": 2.282789811949536, "grad_norm": 0.0006133327260613441, "learning_rate": 1.1953503134174405e-05, "loss": 0.0, "step": 9590 }, { "epoch": 2.2851701975720067, "grad_norm": 0.0006283469265326858, "learning_rate": 1.1913830040466557e-05, "loss": 0.0, "step": 9600 }, { "epoch": 2.2875505831944776, "grad_norm": 0.0017937012016773224, "learning_rate": 1.1874156946758709e-05, "loss": 0.0, "step": 9610 }, { "epoch": 2.2899309688169485, "grad_norm": 0.00227372907102108, "learning_rate": 1.183448385305086e-05, "loss": 0.0, "step": 9620 }, { "epoch": 2.2923113544394194, "grad_norm": 0.0007874960429035127, "learning_rate": 1.1794810759343014e-05, "loss": 0.0, "step": 9630 }, { "epoch": 2.2946917400618903, "grad_norm": 0.0012992926640436053, "learning_rate": 1.1755137665635168e-05, "loss": 0.0, "step": 9640 }, { "epoch": 2.2970721256843607, "grad_norm": 0.0026856097392737865, "learning_rate": 1.1715464571927318e-05, "loss": 0.0001, "step": 9650 }, { "epoch": 2.2994525113068316, "grad_norm": 0.027589144185185432, "learning_rate": 1.1675791478219472e-05, "loss": 0.0005, "step": 9660 }, { "epoch": 2.3018328969293025, "grad_norm": 0.00021341729734558612, "learning_rate": 1.1636118384511626e-05, "loss": 0.0, "step": 9670 }, { "epoch": 2.3042132825517734, "grad_norm": 0.0005525678861886263, "learning_rate": 1.1596445290803778e-05, "loss": 0.0, "step": 9680 }, { "epoch": 2.3065936681742443, "grad_norm": 0.0006510653183795512, "learning_rate": 1.155677219709593e-05, "loss": 0.0, "step": 9690 }, { "epoch": 2.308974053796715, "grad_norm": 0.0011141913710162044, "learning_rate": 1.1517099103388082e-05, "loss": 0.0, "step": 9700 }, { "epoch": 2.311354439419186, "grad_norm": 0.001998309977352619, "learning_rate": 1.1477426009680235e-05, "loss": 0.0, "step": 9710 }, { "epoch": 2.3137348250416565, "grad_norm": 0.008638182654976845, "learning_rate": 1.1437752915972389e-05, "loss": 0.0, "step": 9720 }, { "epoch": 2.3161152106641274, "grad_norm": 0.0004837829037569463, "learning_rate": 1.139807982226454e-05, "loss": 0.0, "step": 9730 }, { "epoch": 2.3184955962865983, "grad_norm": 0.008834806270897388, "learning_rate": 1.1358406728556693e-05, "loss": 0.0, "step": 9740 }, { "epoch": 2.320875981909069, "grad_norm": 0.017421774566173553, "learning_rate": 1.1318733634848847e-05, "loss": 0.0, "step": 9750 }, { "epoch": 2.32325636753154, "grad_norm": 0.0008695673895999789, "learning_rate": 1.1279060541140999e-05, "loss": 0.0, "step": 9760 }, { "epoch": 2.325636753154011, "grad_norm": 0.007985567674040794, "learning_rate": 1.1239387447433151e-05, "loss": 0.0001, "step": 9770 }, { "epoch": 2.328017138776482, "grad_norm": 0.0002991770743392408, "learning_rate": 1.1199714353725305e-05, "loss": 0.0, "step": 9780 }, { "epoch": 2.3303975243989528, "grad_norm": 0.0018964770715683699, "learning_rate": 1.1160041260017457e-05, "loss": 0.0, "step": 9790 }, { "epoch": 2.3327779100214237, "grad_norm": 0.0003782061976380646, "learning_rate": 1.112036816630961e-05, "loss": 0.0, "step": 9800 }, { "epoch": 2.335158295643894, "grad_norm": 0.0005394426407292485, "learning_rate": 1.108069507260176e-05, "loss": 0.0, "step": 9810 }, { "epoch": 2.337538681266365, "grad_norm": 0.0008728650282137096, "learning_rate": 1.1041021978893914e-05, "loss": 0.0, "step": 9820 }, { "epoch": 2.339919066888836, "grad_norm": 1.026079773902893, "learning_rate": 1.1001348885186068e-05, "loss": 0.0001, "step": 9830 }, { "epoch": 2.342299452511307, "grad_norm": 0.000987286795862019, "learning_rate": 1.096167579147822e-05, "loss": 0.0, "step": 9840 }, { "epoch": 2.3446798381337777, "grad_norm": 0.0015003952430561185, "learning_rate": 1.0922002697770372e-05, "loss": 0.0, "step": 9850 }, { "epoch": 2.3470602237562486, "grad_norm": 0.001296977628953755, "learning_rate": 1.0882329604062526e-05, "loss": 0.0, "step": 9860 }, { "epoch": 2.3494406093787195, "grad_norm": 0.000640163547359407, "learning_rate": 1.0842656510354678e-05, "loss": 0.0, "step": 9870 }, { "epoch": 2.3518209950011904, "grad_norm": 0.0009391361963935196, "learning_rate": 1.0802983416646831e-05, "loss": 0.0, "step": 9880 }, { "epoch": 2.3542013806236612, "grad_norm": 0.0006612977595068514, "learning_rate": 1.0763310322938983e-05, "loss": 0.0001, "step": 9890 }, { "epoch": 2.3565817662461317, "grad_norm": 0.0014715328579768538, "learning_rate": 1.0723637229231135e-05, "loss": 0.0, "step": 9900 }, { "epoch": 2.3589621518686026, "grad_norm": 0.0004139976226724684, "learning_rate": 1.0683964135523289e-05, "loss": 0.0, "step": 9910 }, { "epoch": 2.3613425374910735, "grad_norm": 0.001368595752865076, "learning_rate": 1.0644291041815441e-05, "loss": 0.0001, "step": 9920 }, { "epoch": 2.3637229231135444, "grad_norm": 0.0010275020031258464, "learning_rate": 1.0604617948107593e-05, "loss": 0.0, "step": 9930 }, { "epoch": 2.3661033087360153, "grad_norm": 0.0008476102957502007, "learning_rate": 1.0564944854399747e-05, "loss": 0.0, "step": 9940 }, { "epoch": 2.368483694358486, "grad_norm": 0.019286731258034706, "learning_rate": 1.0525271760691899e-05, "loss": 0.0, "step": 9950 }, { "epoch": 2.370864079980957, "grad_norm": 0.0007589785964228213, "learning_rate": 1.0485598666984052e-05, "loss": 0.0001, "step": 9960 }, { "epoch": 2.373244465603428, "grad_norm": 0.0007659016991965473, "learning_rate": 1.0445925573276204e-05, "loss": 0.0, "step": 9970 }, { "epoch": 2.375624851225899, "grad_norm": 0.0035345428623259068, "learning_rate": 1.0406252479568356e-05, "loss": 0.0, "step": 9980 }, { "epoch": 2.3780052368483693, "grad_norm": 0.0021891535725444555, "learning_rate": 1.036657938586051e-05, "loss": 0.0, "step": 9990 }, { "epoch": 2.38038562247084, "grad_norm": 0.2655426263809204, "learning_rate": 1.0326906292152662e-05, "loss": 0.001, "step": 10000 }, { "epoch": 2.382766008093311, "grad_norm": 0.0008121923892758787, "learning_rate": 1.0287233198444816e-05, "loss": 0.0001, "step": 10010 }, { "epoch": 2.385146393715782, "grad_norm": 0.006638567429035902, "learning_rate": 1.0247560104736968e-05, "loss": 0.0, "step": 10020 }, { "epoch": 2.387526779338253, "grad_norm": 0.0033031317871063948, "learning_rate": 1.020788701102912e-05, "loss": 0.0, "step": 10030 }, { "epoch": 2.3899071649607238, "grad_norm": 0.0004701575671788305, "learning_rate": 1.0168213917321274e-05, "loss": 0.0002, "step": 10040 }, { "epoch": 2.3922875505831946, "grad_norm": 0.007627520710229874, "learning_rate": 1.0128540823613426e-05, "loss": 0.0, "step": 10050 }, { "epoch": 2.394667936205665, "grad_norm": 0.0011233366094529629, "learning_rate": 1.0088867729905578e-05, "loss": 0.0, "step": 10060 }, { "epoch": 2.397048321828136, "grad_norm": 0.0003728682058863342, "learning_rate": 1.0049194636197731e-05, "loss": 0.0001, "step": 10070 }, { "epoch": 2.399428707450607, "grad_norm": 0.0018078387947753072, "learning_rate": 1.0009521542489885e-05, "loss": 0.0, "step": 10080 }, { "epoch": 2.4018090930730778, "grad_norm": 0.004032574128359556, "learning_rate": 9.969848448782037e-06, "loss": 0.0, "step": 10090 }, { "epoch": 2.4041894786955487, "grad_norm": 0.0010251044295728207, "learning_rate": 9.930175355074189e-06, "loss": 0.0, "step": 10100 }, { "epoch": 2.4065698643180196, "grad_norm": 0.0012369100004434586, "learning_rate": 9.890502261366341e-06, "loss": 0.0001, "step": 10110 }, { "epoch": 2.4089502499404905, "grad_norm": 0.0008841692470014095, "learning_rate": 9.850829167658495e-06, "loss": 0.0, "step": 10120 }, { "epoch": 2.4113306355629613, "grad_norm": 0.05967468023300171, "learning_rate": 9.811156073950648e-06, "loss": 0.0001, "step": 10130 }, { "epoch": 2.4137110211854322, "grad_norm": 0.002878790721297264, "learning_rate": 9.771482980242799e-06, "loss": 0.0, "step": 10140 }, { "epoch": 2.4160914068079027, "grad_norm": 0.0005018101655878127, "learning_rate": 9.731809886534952e-06, "loss": 0.0, "step": 10150 }, { "epoch": 2.4184717924303736, "grad_norm": 0.0015724776312708855, "learning_rate": 9.692136792827106e-06, "loss": 0.0003, "step": 10160 }, { "epoch": 2.4208521780528445, "grad_norm": 0.004237225744873285, "learning_rate": 9.652463699119258e-06, "loss": 0.0, "step": 10170 }, { "epoch": 2.4232325636753154, "grad_norm": 0.00131317344494164, "learning_rate": 9.61279060541141e-06, "loss": 0.0, "step": 10180 }, { "epoch": 2.4256129492977863, "grad_norm": 0.002073557348921895, "learning_rate": 9.573117511703564e-06, "loss": 0.0, "step": 10190 }, { "epoch": 2.427993334920257, "grad_norm": 0.0045993453823029995, "learning_rate": 9.533444417995716e-06, "loss": 0.0002, "step": 10200 }, { "epoch": 2.430373720542728, "grad_norm": 0.001618819311261177, "learning_rate": 9.49377132428787e-06, "loss": 0.0001, "step": 10210 }, { "epoch": 2.432754106165199, "grad_norm": 0.00304215750657022, "learning_rate": 9.45409823058002e-06, "loss": 0.0, "step": 10220 }, { "epoch": 2.43513449178767, "grad_norm": 0.0007059932686388493, "learning_rate": 9.414425136872173e-06, "loss": 0.0, "step": 10230 }, { "epoch": 2.4375148774101403, "grad_norm": 0.0031899004243314266, "learning_rate": 9.374752043164327e-06, "loss": 0.0, "step": 10240 }, { "epoch": 2.439895263032611, "grad_norm": 0.002477418165653944, "learning_rate": 9.33507894945648e-06, "loss": 0.0, "step": 10250 }, { "epoch": 2.442275648655082, "grad_norm": 0.00046585980453528464, "learning_rate": 9.295405855748631e-06, "loss": 0.0, "step": 10260 }, { "epoch": 2.444656034277553, "grad_norm": 0.0005838835495524108, "learning_rate": 9.255732762040785e-06, "loss": 0.0, "step": 10270 }, { "epoch": 2.447036419900024, "grad_norm": 0.001370543148368597, "learning_rate": 9.216059668332937e-06, "loss": 0.0, "step": 10280 }, { "epoch": 2.4494168055224947, "grad_norm": 0.0016045079100877047, "learning_rate": 9.17638657462509e-06, "loss": 0.0, "step": 10290 }, { "epoch": 2.4517971911449656, "grad_norm": 0.0020401678048074245, "learning_rate": 9.136713480917243e-06, "loss": 0.0001, "step": 10300 }, { "epoch": 2.454177576767436, "grad_norm": 0.00043605471728369594, "learning_rate": 9.097040387209395e-06, "loss": 0.0, "step": 10310 }, { "epoch": 2.456557962389907, "grad_norm": 0.0005910994368605316, "learning_rate": 9.057367293501548e-06, "loss": 0.0, "step": 10320 }, { "epoch": 2.458938348012378, "grad_norm": 0.0005397904315032065, "learning_rate": 9.0176941997937e-06, "loss": 0.0, "step": 10330 }, { "epoch": 2.4613187336348488, "grad_norm": 0.014002328738570213, "learning_rate": 8.978021106085852e-06, "loss": 0.0, "step": 10340 }, { "epoch": 2.4636991192573197, "grad_norm": 0.0011001590173691511, "learning_rate": 8.938348012378006e-06, "loss": 0.0001, "step": 10350 }, { "epoch": 2.4660795048797906, "grad_norm": 0.0029695210978388786, "learning_rate": 8.898674918670158e-06, "loss": 0.0, "step": 10360 }, { "epoch": 2.4684598905022614, "grad_norm": 0.00410072086378932, "learning_rate": 8.859001824962312e-06, "loss": 0.0, "step": 10370 }, { "epoch": 2.4708402761247323, "grad_norm": 0.0005128366756252944, "learning_rate": 8.819328731254464e-06, "loss": 0.0, "step": 10380 }, { "epoch": 2.4732206617472032, "grad_norm": 0.0021037100814282894, "learning_rate": 8.779655637546616e-06, "loss": 0.0, "step": 10390 }, { "epoch": 2.4756010473696737, "grad_norm": 0.0005958130932413042, "learning_rate": 8.73998254383877e-06, "loss": 0.0, "step": 10400 }, { "epoch": 2.4779814329921446, "grad_norm": 0.0021961687598377466, "learning_rate": 8.700309450130921e-06, "loss": 0.0001, "step": 10410 }, { "epoch": 2.4803618186146155, "grad_norm": 0.0011290331603959203, "learning_rate": 8.660636356423073e-06, "loss": 0.0, "step": 10420 }, { "epoch": 2.4827422042370864, "grad_norm": 0.003101737704128027, "learning_rate": 8.620963262715227e-06, "loss": 0.0, "step": 10430 }, { "epoch": 2.4851225898595573, "grad_norm": 0.010269707068800926, "learning_rate": 8.581290169007379e-06, "loss": 0.0, "step": 10440 }, { "epoch": 2.487502975482028, "grad_norm": 0.0006016406114213169, "learning_rate": 8.541617075299533e-06, "loss": 0.0, "step": 10450 }, { "epoch": 2.489883361104499, "grad_norm": 0.012370145879685879, "learning_rate": 8.501943981591685e-06, "loss": 0.0, "step": 10460 }, { "epoch": 2.49226374672697, "grad_norm": 0.002209730911999941, "learning_rate": 8.462270887883837e-06, "loss": 0.0, "step": 10470 }, { "epoch": 2.494644132349441, "grad_norm": 0.0002978077973239124, "learning_rate": 8.42259779417599e-06, "loss": 0.0, "step": 10480 }, { "epoch": 2.4970245179719113, "grad_norm": 0.0006728899315930903, "learning_rate": 8.382924700468144e-06, "loss": 0.0, "step": 10490 }, { "epoch": 2.499404903594382, "grad_norm": 0.0008764348458498716, "learning_rate": 8.343251606760296e-06, "loss": 0.0, "step": 10500 }, { "epoch": 2.501785289216853, "grad_norm": 0.001580104581080377, "learning_rate": 8.303578513052448e-06, "loss": 0.0, "step": 10510 }, { "epoch": 2.504165674839324, "grad_norm": 0.0003571589768398553, "learning_rate": 8.2639054193446e-06, "loss": 0.0, "step": 10520 }, { "epoch": 2.506546060461795, "grad_norm": 0.004758020397275686, "learning_rate": 8.224232325636754e-06, "loss": 0.0, "step": 10530 }, { "epoch": 2.5089264460842657, "grad_norm": 0.0013680767733603716, "learning_rate": 8.184559231928906e-06, "loss": 0.0, "step": 10540 }, { "epoch": 2.5113068317067366, "grad_norm": 0.0010658970568329096, "learning_rate": 8.144886138221058e-06, "loss": 0.0, "step": 10550 }, { "epoch": 2.513687217329207, "grad_norm": 0.0007452235440723598, "learning_rate": 8.105213044513212e-06, "loss": 0.0002, "step": 10560 }, { "epoch": 2.5160676029516784, "grad_norm": 0.0006281470414251089, "learning_rate": 8.065539950805365e-06, "loss": 0.0, "step": 10570 }, { "epoch": 2.518447988574149, "grad_norm": 0.0007866009837016463, "learning_rate": 8.025866857097517e-06, "loss": 0.0, "step": 10580 }, { "epoch": 2.5208283741966198, "grad_norm": 0.00039683215436525643, "learning_rate": 7.98619376338967e-06, "loss": 0.0, "step": 10590 }, { "epoch": 2.5232087598190907, "grad_norm": 0.0009177124593406916, "learning_rate": 7.946520669681823e-06, "loss": 0.0, "step": 10600 }, { "epoch": 2.5255891454415615, "grad_norm": 0.00038271176163107157, "learning_rate": 7.906847575973975e-06, "loss": 0.0, "step": 10610 }, { "epoch": 2.5279695310640324, "grad_norm": 0.00041592001798562706, "learning_rate": 7.867174482266127e-06, "loss": 0.0, "step": 10620 }, { "epoch": 2.5303499166865033, "grad_norm": 0.0009455361287109554, "learning_rate": 7.827501388558279e-06, "loss": 0.0, "step": 10630 }, { "epoch": 2.5327303023089742, "grad_norm": 0.0005674211424775422, "learning_rate": 7.787828294850433e-06, "loss": 0.0001, "step": 10640 }, { "epoch": 2.5351106879314447, "grad_norm": 0.008180541917681694, "learning_rate": 7.748155201142586e-06, "loss": 0.0001, "step": 10650 }, { "epoch": 2.537491073553916, "grad_norm": 0.006044210400432348, "learning_rate": 7.708482107434738e-06, "loss": 0.0, "step": 10660 }, { "epoch": 2.5398714591763865, "grad_norm": 0.00039350485894829035, "learning_rate": 7.66880901372689e-06, "loss": 0.0, "step": 10670 }, { "epoch": 2.5422518447988574, "grad_norm": 0.0007660723640583456, "learning_rate": 7.629135920019044e-06, "loss": 0.0, "step": 10680 }, { "epoch": 2.5446322304213282, "grad_norm": 0.001309241633862257, "learning_rate": 7.589462826311196e-06, "loss": 0.0, "step": 10690 }, { "epoch": 2.547012616043799, "grad_norm": 0.023756977170705795, "learning_rate": 7.549789732603349e-06, "loss": 0.0002, "step": 10700 }, { "epoch": 2.54939300166627, "grad_norm": 0.002046087756752968, "learning_rate": 7.510116638895501e-06, "loss": 0.0, "step": 10710 }, { "epoch": 2.551773387288741, "grad_norm": 0.0047508729621768, "learning_rate": 7.470443545187654e-06, "loss": 0.0, "step": 10720 }, { "epoch": 2.554153772911212, "grad_norm": 0.0010949558345600963, "learning_rate": 7.4307704514798075e-06, "loss": 0.0, "step": 10730 }, { "epoch": 2.5565341585336823, "grad_norm": 0.010589073412120342, "learning_rate": 7.391097357771959e-06, "loss": 0.0, "step": 10740 }, { "epoch": 2.558914544156153, "grad_norm": 0.0006332534248940647, "learning_rate": 7.351424264064112e-06, "loss": 0.0, "step": 10750 }, { "epoch": 2.561294929778624, "grad_norm": 0.00027181513723917305, "learning_rate": 7.311751170356265e-06, "loss": 0.0, "step": 10760 }, { "epoch": 2.563675315401095, "grad_norm": 0.0036267938558012247, "learning_rate": 7.272078076648417e-06, "loss": 0.0, "step": 10770 }, { "epoch": 2.566055701023566, "grad_norm": 0.002974023576825857, "learning_rate": 7.23240498294057e-06, "loss": 0.0, "step": 10780 }, { "epoch": 2.5684360866460367, "grad_norm": 0.0005654848064295948, "learning_rate": 7.192731889232724e-06, "loss": 0.0001, "step": 10790 }, { "epoch": 2.5708164722685076, "grad_norm": 0.001776995835825801, "learning_rate": 7.153058795524875e-06, "loss": 0.0, "step": 10800 }, { "epoch": 2.573196857890978, "grad_norm": 0.0031643370166420937, "learning_rate": 7.1133857018170286e-06, "loss": 0.0, "step": 10810 }, { "epoch": 2.5755772435134494, "grad_norm": 0.0006117381271906197, "learning_rate": 7.07371260810918e-06, "loss": 0.0, "step": 10820 }, { "epoch": 2.57795762913592, "grad_norm": 0.00013082509394735098, "learning_rate": 7.034039514401333e-06, "loss": 0.0, "step": 10830 }, { "epoch": 2.5803380147583908, "grad_norm": 0.009411906823515892, "learning_rate": 6.994366420693486e-06, "loss": 0.0, "step": 10840 }, { "epoch": 2.5827184003808616, "grad_norm": 0.007766501512378454, "learning_rate": 6.954693326985638e-06, "loss": 0.0, "step": 10850 }, { "epoch": 2.5850987860033325, "grad_norm": 0.001036152825690806, "learning_rate": 6.915020233277791e-06, "loss": 0.0, "step": 10860 }, { "epoch": 2.5874791716258034, "grad_norm": 0.0007062302902340889, "learning_rate": 6.875347139569945e-06, "loss": 0.0, "step": 10870 }, { "epoch": 2.5898595572482743, "grad_norm": 0.004976709373295307, "learning_rate": 6.835674045862096e-06, "loss": 0.0, "step": 10880 }, { "epoch": 2.592239942870745, "grad_norm": 0.0005074761575087905, "learning_rate": 6.79600095215425e-06, "loss": 0.0, "step": 10890 }, { "epoch": 2.5946203284932157, "grad_norm": 0.0028977631591260433, "learning_rate": 6.7563278584464025e-06, "loss": 0.0, "step": 10900 }, { "epoch": 2.597000714115687, "grad_norm": 0.004557565785944462, "learning_rate": 6.7166547647385545e-06, "loss": 0.0, "step": 10910 }, { "epoch": 2.5993810997381575, "grad_norm": 0.0018358832458034158, "learning_rate": 6.676981671030707e-06, "loss": 0.0, "step": 10920 }, { "epoch": 2.6017614853606283, "grad_norm": 0.0014729060931131244, "learning_rate": 6.637308577322859e-06, "loss": 0.0, "step": 10930 }, { "epoch": 2.6041418709830992, "grad_norm": 0.0004332439857535064, "learning_rate": 6.597635483615012e-06, "loss": 0.0, "step": 10940 }, { "epoch": 2.60652225660557, "grad_norm": 0.0009114540298469365, "learning_rate": 6.557962389907166e-06, "loss": 0.0, "step": 10950 }, { "epoch": 2.608902642228041, "grad_norm": 0.010355968959629536, "learning_rate": 6.518289296199318e-06, "loss": 0.0, "step": 10960 }, { "epoch": 2.611283027850512, "grad_norm": 0.054084401577711105, "learning_rate": 6.478616202491471e-06, "loss": 0.0, "step": 10970 }, { "epoch": 2.613663413472983, "grad_norm": 0.0009903626050800085, "learning_rate": 6.438943108783624e-06, "loss": 0.0, "step": 10980 }, { "epoch": 2.6160437990954533, "grad_norm": 0.00019378839351702482, "learning_rate": 6.399270015075776e-06, "loss": 0.0, "step": 10990 }, { "epoch": 2.618424184717924, "grad_norm": 0.0006563541246578097, "learning_rate": 6.3595969213679285e-06, "loss": 0.0, "step": 11000 }, { "epoch": 2.620804570340395, "grad_norm": 0.0006744746351614594, "learning_rate": 6.3199238276600805e-06, "loss": 0.0, "step": 11010 }, { "epoch": 2.623184955962866, "grad_norm": 0.0011966971214860678, "learning_rate": 6.280250733952233e-06, "loss": 0.0, "step": 11020 }, { "epoch": 2.625565341585337, "grad_norm": 0.0017309453105553985, "learning_rate": 6.240577640244387e-06, "loss": 0.0, "step": 11030 }, { "epoch": 2.6279457272078077, "grad_norm": 0.0008661380270496011, "learning_rate": 6.200904546536539e-06, "loss": 0.0005, "step": 11040 }, { "epoch": 2.6303261128302786, "grad_norm": 0.0003683891554828733, "learning_rate": 6.161231452828692e-06, "loss": 0.0, "step": 11050 }, { "epoch": 2.6327064984527495, "grad_norm": 0.0005742148496210575, "learning_rate": 6.121558359120845e-06, "loss": 0.0, "step": 11060 }, { "epoch": 2.6350868840752204, "grad_norm": 0.0010009456891566515, "learning_rate": 6.0818852654129976e-06, "loss": 0.0, "step": 11070 }, { "epoch": 2.637467269697691, "grad_norm": 0.0008674330892972648, "learning_rate": 6.0422121717051496e-06, "loss": 0.0001, "step": 11080 }, { "epoch": 2.6398476553201617, "grad_norm": 0.00011453252227511257, "learning_rate": 6.002539077997302e-06, "loss": 0.0, "step": 11090 }, { "epoch": 2.6422280409426326, "grad_norm": 0.0014997412217780948, "learning_rate": 5.962865984289455e-06, "loss": 0.0, "step": 11100 }, { "epoch": 2.6446084265651035, "grad_norm": 0.0013535526813939214, "learning_rate": 5.923192890581608e-06, "loss": 0.0, "step": 11110 }, { "epoch": 2.6469888121875744, "grad_norm": 0.0010607549920678139, "learning_rate": 5.883519796873761e-06, "loss": 0.0, "step": 11120 }, { "epoch": 2.6493691978100453, "grad_norm": 0.001384345581755042, "learning_rate": 5.843846703165913e-06, "loss": 0.0, "step": 11130 }, { "epoch": 2.651749583432516, "grad_norm": 0.009620246477425098, "learning_rate": 5.804173609458066e-06, "loss": 0.0, "step": 11140 }, { "epoch": 2.6541299690549867, "grad_norm": 0.004576113075017929, "learning_rate": 5.764500515750219e-06, "loss": 0.0, "step": 11150 }, { "epoch": 2.656510354677458, "grad_norm": 0.0007963149109855294, "learning_rate": 5.7248274220423715e-06, "loss": 0.0, "step": 11160 }, { "epoch": 2.6588907402999284, "grad_norm": 0.0005275904550217092, "learning_rate": 5.6851543283345235e-06, "loss": 0.0, "step": 11170 }, { "epoch": 2.6612711259223993, "grad_norm": 0.0007748051430098712, "learning_rate": 5.645481234626677e-06, "loss": 0.0, "step": 11180 }, { "epoch": 2.6636515115448702, "grad_norm": 0.0005676033324562013, "learning_rate": 5.605808140918829e-06, "loss": 0.0, "step": 11190 }, { "epoch": 2.666031897167341, "grad_norm": 0.0009870273061096668, "learning_rate": 5.566135047210982e-06, "loss": 0.0, "step": 11200 }, { "epoch": 2.668412282789812, "grad_norm": 0.0004960622172802687, "learning_rate": 5.526461953503134e-06, "loss": 0.0, "step": 11210 }, { "epoch": 2.670792668412283, "grad_norm": 0.2789072096347809, "learning_rate": 5.486788859795288e-06, "loss": 0.0001, "step": 11220 }, { "epoch": 2.673173054034754, "grad_norm": 0.004494486376643181, "learning_rate": 5.44711576608744e-06, "loss": 0.0001, "step": 11230 }, { "epoch": 2.6755534396572243, "grad_norm": 0.0009736506035551429, "learning_rate": 5.407442672379593e-06, "loss": 0.0, "step": 11240 }, { "epoch": 2.677933825279695, "grad_norm": 0.0027844863943755627, "learning_rate": 5.367769578671745e-06, "loss": 0.0, "step": 11250 }, { "epoch": 2.680314210902166, "grad_norm": 0.013426104560494423, "learning_rate": 5.328096484963898e-06, "loss": 0.0, "step": 11260 }, { "epoch": 2.682694596524637, "grad_norm": 0.0002785604156088084, "learning_rate": 5.28842339125605e-06, "loss": 0.0, "step": 11270 }, { "epoch": 2.685074982147108, "grad_norm": 0.0007079096976667643, "learning_rate": 5.248750297548203e-06, "loss": 0.0, "step": 11280 }, { "epoch": 2.6874553677695787, "grad_norm": 0.0004877845640294254, "learning_rate": 5.209077203840355e-06, "loss": 0.0001, "step": 11290 }, { "epoch": 2.6898357533920496, "grad_norm": 0.029308408498764038, "learning_rate": 5.169404110132509e-06, "loss": 0.0, "step": 11300 }, { "epoch": 2.6922161390145205, "grad_norm": 0.0011891064932569861, "learning_rate": 5.129731016424661e-06, "loss": 0.0, "step": 11310 }, { "epoch": 2.6945965246369914, "grad_norm": 0.009328281506896019, "learning_rate": 5.090057922716814e-06, "loss": 0.0, "step": 11320 }, { "epoch": 2.696976910259462, "grad_norm": 0.0010127691784873605, "learning_rate": 5.0503848290089666e-06, "loss": 0.0, "step": 11330 }, { "epoch": 2.6993572958819327, "grad_norm": 0.0006704577244818211, "learning_rate": 5.010711735301119e-06, "loss": 0.0, "step": 11340 }, { "epoch": 2.7017376815044036, "grad_norm": 0.0015914466930553317, "learning_rate": 4.971038641593271e-06, "loss": 0.0, "step": 11350 }, { "epoch": 2.7041180671268745, "grad_norm": 0.00046926282811909914, "learning_rate": 4.931365547885424e-06, "loss": 0.0, "step": 11360 }, { "epoch": 2.7064984527493454, "grad_norm": 0.0008572743972763419, "learning_rate": 4.891692454177577e-06, "loss": 0.0, "step": 11370 }, { "epoch": 2.7088788383718163, "grad_norm": 0.001012885244563222, "learning_rate": 4.85201936046973e-06, "loss": 0.0, "step": 11380 }, { "epoch": 2.711259223994287, "grad_norm": 0.000291361880954355, "learning_rate": 4.812346266761882e-06, "loss": 0.0, "step": 11390 }, { "epoch": 2.7136396096167577, "grad_norm": 0.001445894013158977, "learning_rate": 4.772673173054035e-06, "loss": 0.0, "step": 11400 }, { "epoch": 2.716019995239229, "grad_norm": 0.0007329813088290393, "learning_rate": 4.733000079346188e-06, "loss": 0.0, "step": 11410 }, { "epoch": 2.7184003808616994, "grad_norm": 0.02237352356314659, "learning_rate": 4.6933269856383405e-06, "loss": 0.0, "step": 11420 }, { "epoch": 2.7207807664841703, "grad_norm": 0.0004787015204783529, "learning_rate": 4.6536538919304925e-06, "loss": 0.0, "step": 11430 }, { "epoch": 2.7231611521066412, "grad_norm": 0.0011766423704102635, "learning_rate": 4.613980798222645e-06, "loss": 0.0, "step": 11440 }, { "epoch": 2.725541537729112, "grad_norm": 0.0003720026579685509, "learning_rate": 4.574307704514798e-06, "loss": 0.0, "step": 11450 }, { "epoch": 2.727921923351583, "grad_norm": 0.0004271367215551436, "learning_rate": 4.534634610806951e-06, "loss": 0.0, "step": 11460 }, { "epoch": 2.730302308974054, "grad_norm": 0.001319264993071556, "learning_rate": 4.494961517099103e-06, "loss": 0.0, "step": 11470 }, { "epoch": 2.732682694596525, "grad_norm": 0.0012237573973834515, "learning_rate": 4.455288423391257e-06, "loss": 0.0, "step": 11480 }, { "epoch": 2.7350630802189952, "grad_norm": 0.00044418079778552055, "learning_rate": 4.415615329683409e-06, "loss": 0.0, "step": 11490 }, { "epoch": 2.7374434658414666, "grad_norm": 0.0009368477039970458, "learning_rate": 4.375942235975562e-06, "loss": 0.0, "step": 11500 }, { "epoch": 2.739823851463937, "grad_norm": 0.0015390801709145308, "learning_rate": 4.336269142267714e-06, "loss": 0.0001, "step": 11510 }, { "epoch": 2.742204237086408, "grad_norm": 0.00022943236399441957, "learning_rate": 4.296596048559867e-06, "loss": 0.0, "step": 11520 }, { "epoch": 2.744584622708879, "grad_norm": 0.0031924904324114323, "learning_rate": 4.256922954852019e-06, "loss": 0.0, "step": 11530 }, { "epoch": 2.7469650083313497, "grad_norm": 0.0011005508713424206, "learning_rate": 4.217249861144172e-06, "loss": 0.0, "step": 11540 }, { "epoch": 2.7493453939538206, "grad_norm": 0.00039162219036370516, "learning_rate": 4.177576767436325e-06, "loss": 0.0, "step": 11550 }, { "epoch": 2.7517257795762915, "grad_norm": 0.0011376795591786504, "learning_rate": 4.137903673728478e-06, "loss": 0.0, "step": 11560 }, { "epoch": 2.7541061651987624, "grad_norm": 0.0005944286240264773, "learning_rate": 4.09823058002063e-06, "loss": 0.0, "step": 11570 }, { "epoch": 2.756486550821233, "grad_norm": 0.0007298539276234806, "learning_rate": 4.058557486312783e-06, "loss": 0.0, "step": 11580 }, { "epoch": 2.7588669364437037, "grad_norm": 0.00018211067072115839, "learning_rate": 4.0188843926049356e-06, "loss": 0.0, "step": 11590 }, { "epoch": 2.7612473220661746, "grad_norm": 0.0034182893577963114, "learning_rate": 3.9792112988970884e-06, "loss": 0.0, "step": 11600 }, { "epoch": 2.7636277076886455, "grad_norm": 0.000364614010322839, "learning_rate": 3.939538205189241e-06, "loss": 0.0, "step": 11610 }, { "epoch": 2.7660080933111164, "grad_norm": 0.0021814818028360605, "learning_rate": 3.899865111481393e-06, "loss": 0.0, "step": 11620 }, { "epoch": 2.7683884789335873, "grad_norm": 0.0014812530716881156, "learning_rate": 3.860192017773546e-06, "loss": 0.0001, "step": 11630 }, { "epoch": 2.770768864556058, "grad_norm": 0.0005358079797588289, "learning_rate": 3.820518924065699e-06, "loss": 0.0, "step": 11640 }, { "epoch": 2.7731492501785286, "grad_norm": 0.00028996021137572825, "learning_rate": 3.7808458303578514e-06, "loss": 0.0, "step": 11650 }, { "epoch": 2.775529635801, "grad_norm": 0.001182155217975378, "learning_rate": 3.741172736650004e-06, "loss": 0.0, "step": 11660 }, { "epoch": 2.7779100214234704, "grad_norm": 0.00023413899180013686, "learning_rate": 3.701499642942157e-06, "loss": 0.0, "step": 11670 }, { "epoch": 2.7802904070459413, "grad_norm": 0.0006019670399837196, "learning_rate": 3.6618265492343095e-06, "loss": 0.0, "step": 11680 }, { "epoch": 2.782670792668412, "grad_norm": 0.0004944771062582731, "learning_rate": 3.622153455526462e-06, "loss": 0.0, "step": 11690 }, { "epoch": 2.785051178290883, "grad_norm": 6.98843869031407e-05, "learning_rate": 3.5824803618186144e-06, "loss": 0.0, "step": 11700 }, { "epoch": 2.787431563913354, "grad_norm": 0.0005101510905660689, "learning_rate": 3.5428072681107677e-06, "loss": 0.0, "step": 11710 }, { "epoch": 2.789811949535825, "grad_norm": 0.00034247711300849915, "learning_rate": 3.50313417440292e-06, "loss": 0.0, "step": 11720 }, { "epoch": 2.792192335158296, "grad_norm": 0.00044277720735408366, "learning_rate": 3.4634610806950725e-06, "loss": 0.0, "step": 11730 }, { "epoch": 2.7945727207807662, "grad_norm": 0.0005088172620162368, "learning_rate": 3.423787986987225e-06, "loss": 0.0, "step": 11740 }, { "epoch": 2.7969531064032376, "grad_norm": 0.00021512300008907914, "learning_rate": 3.384114893279378e-06, "loss": 0.0, "step": 11750 }, { "epoch": 2.799333492025708, "grad_norm": 0.0007052098517306149, "learning_rate": 3.3444417995715306e-06, "loss": 0.0, "step": 11760 }, { "epoch": 2.801713877648179, "grad_norm": 0.036882251501083374, "learning_rate": 3.304768705863683e-06, "loss": 0.0, "step": 11770 }, { "epoch": 2.80409426327065, "grad_norm": 0.00013749166100751609, "learning_rate": 3.2650956121558363e-06, "loss": 0.0, "step": 11780 }, { "epoch": 2.8064746488931207, "grad_norm": 0.0006571552366949618, "learning_rate": 3.2254225184479888e-06, "loss": 0.0, "step": 11790 }, { "epoch": 2.8088550345155916, "grad_norm": 0.0008290376281365752, "learning_rate": 3.185749424740141e-06, "loss": 0.0, "step": 11800 }, { "epoch": 2.8112354201380625, "grad_norm": 8.49374700919725e-05, "learning_rate": 3.146076331032294e-06, "loss": 0.0, "step": 11810 }, { "epoch": 2.8136158057605334, "grad_norm": 0.00033748464193195105, "learning_rate": 3.1064032373244465e-06, "loss": 0.0, "step": 11820 }, { "epoch": 2.815996191383004, "grad_norm": 0.0003914514381904155, "learning_rate": 3.0667301436165993e-06, "loss": 0.0, "step": 11830 }, { "epoch": 2.8183765770054747, "grad_norm": 0.00029730124515481293, "learning_rate": 3.0270570499087517e-06, "loss": 0.0, "step": 11840 }, { "epoch": 2.8207569626279456, "grad_norm": 0.00035526990541256964, "learning_rate": 2.9873839562009046e-06, "loss": 0.0, "step": 11850 }, { "epoch": 2.8231373482504165, "grad_norm": 0.0007370146340690553, "learning_rate": 2.9477108624930574e-06, "loss": 0.0, "step": 11860 }, { "epoch": 2.8255177338728874, "grad_norm": 8.048515883274376e-05, "learning_rate": 2.90803776878521e-06, "loss": 0.0, "step": 11870 }, { "epoch": 2.8278981194953583, "grad_norm": 0.00022186528076417744, "learning_rate": 2.8683646750773627e-06, "loss": 0.0, "step": 11880 }, { "epoch": 2.830278505117829, "grad_norm": 0.0004252239887136966, "learning_rate": 2.8286915813695156e-06, "loss": 0.0, "step": 11890 }, { "epoch": 2.8326588907403, "grad_norm": 0.00027670618146657944, "learning_rate": 2.789018487661668e-06, "loss": 0.0, "step": 11900 }, { "epoch": 2.835039276362771, "grad_norm": 0.0020431778393685818, "learning_rate": 2.749345393953821e-06, "loss": 0.0, "step": 11910 }, { "epoch": 2.8374196619852414, "grad_norm": 0.001547365915030241, "learning_rate": 2.7096723002459737e-06, "loss": 0.0, "step": 11920 }, { "epoch": 2.8398000476077123, "grad_norm": 0.0013964555691927671, "learning_rate": 2.669999206538126e-06, "loss": 0.0, "step": 11930 }, { "epoch": 2.842180433230183, "grad_norm": 0.00027170139946974814, "learning_rate": 2.630326112830279e-06, "loss": 0.0, "step": 11940 }, { "epoch": 2.844560818852654, "grad_norm": 0.0008765398524701595, "learning_rate": 2.5906530191224314e-06, "loss": 0.0, "step": 11950 }, { "epoch": 2.846941204475125, "grad_norm": 0.00015922258899081498, "learning_rate": 2.5509799254145842e-06, "loss": 0.0, "step": 11960 }, { "epoch": 2.849321590097596, "grad_norm": 0.00011323492071824148, "learning_rate": 2.5113068317067367e-06, "loss": 0.0, "step": 11970 }, { "epoch": 2.851701975720067, "grad_norm": 0.0008671206305734813, "learning_rate": 2.4716337379988895e-06, "loss": 0.0001, "step": 11980 }, { "epoch": 2.8540823613425372, "grad_norm": 0.00013449507241602987, "learning_rate": 2.431960644291042e-06, "loss": 0.0, "step": 11990 }, { "epoch": 2.8564627469650086, "grad_norm": 0.0008318678010255098, "learning_rate": 2.3922875505831948e-06, "loss": 0.0, "step": 12000 }, { "epoch": 2.858843132587479, "grad_norm": 0.0012901159934699535, "learning_rate": 2.352614456875347e-06, "loss": 0.0, "step": 12010 }, { "epoch": 2.86122351820995, "grad_norm": 0.00032769294921308756, "learning_rate": 2.3129413631675e-06, "loss": 0.0, "step": 12020 }, { "epoch": 2.863603903832421, "grad_norm": 0.0022394724655896425, "learning_rate": 2.2732682694596525e-06, "loss": 0.0, "step": 12030 }, { "epoch": 2.8659842894548917, "grad_norm": 0.0001916442415677011, "learning_rate": 2.2335951757518053e-06, "loss": 0.0, "step": 12040 }, { "epoch": 2.8683646750773626, "grad_norm": 0.0008263205527327955, "learning_rate": 2.1939220820439578e-06, "loss": 0.0, "step": 12050 }, { "epoch": 2.8707450606998335, "grad_norm": 0.01558750867843628, "learning_rate": 2.1542489883361106e-06, "loss": 0.0, "step": 12060 }, { "epoch": 2.8731254463223044, "grad_norm": 0.0005802076193504035, "learning_rate": 2.1145758946282635e-06, "loss": 0.0002, "step": 12070 }, { "epoch": 2.875505831944775, "grad_norm": 0.0006769265746697783, "learning_rate": 2.074902800920416e-06, "loss": 0.0, "step": 12080 }, { "epoch": 2.877886217567246, "grad_norm": 0.00040787094621919096, "learning_rate": 2.0352297072125687e-06, "loss": 0.0, "step": 12090 }, { "epoch": 2.8802666031897166, "grad_norm": 0.00034027136280201375, "learning_rate": 1.995556613504721e-06, "loss": 0.0, "step": 12100 }, { "epoch": 2.8826469888121875, "grad_norm": 0.008367573842406273, "learning_rate": 1.955883519796874e-06, "loss": 0.0, "step": 12110 }, { "epoch": 2.8850273744346584, "grad_norm": 0.0002640595193952322, "learning_rate": 1.9162104260890264e-06, "loss": 0.0, "step": 12120 }, { "epoch": 2.8874077600571293, "grad_norm": 0.0006561621557921171, "learning_rate": 1.8765373323811793e-06, "loss": 0.0, "step": 12130 }, { "epoch": 2.8897881456796, "grad_norm": 0.0008464111597277224, "learning_rate": 1.8368642386733317e-06, "loss": 0.0, "step": 12140 }, { "epoch": 2.892168531302071, "grad_norm": 0.0003002223384100944, "learning_rate": 1.7971911449654846e-06, "loss": 0.0, "step": 12150 }, { "epoch": 2.894548916924542, "grad_norm": 0.0003043843025807291, "learning_rate": 1.757518051257637e-06, "loss": 0.0, "step": 12160 }, { "epoch": 2.8969293025470124, "grad_norm": 0.00041168101597577333, "learning_rate": 1.7178449575497898e-06, "loss": 0.0, "step": 12170 }, { "epoch": 2.8993096881694833, "grad_norm": 0.002103559672832489, "learning_rate": 1.6781718638419423e-06, "loss": 0.0, "step": 12180 }, { "epoch": 2.901690073791954, "grad_norm": 0.00029975874349474907, "learning_rate": 1.6384987701340951e-06, "loss": 0.0, "step": 12190 }, { "epoch": 2.904070459414425, "grad_norm": 0.004904668778181076, "learning_rate": 1.5988256764262475e-06, "loss": 0.0001, "step": 12200 }, { "epoch": 2.906450845036896, "grad_norm": 0.0009001428843475878, "learning_rate": 1.5591525827184004e-06, "loss": 0.0, "step": 12210 }, { "epoch": 2.908831230659367, "grad_norm": 0.0004976601339876652, "learning_rate": 1.519479489010553e-06, "loss": 0.0, "step": 12220 }, { "epoch": 2.9112116162818378, "grad_norm": 0.0002044235880021006, "learning_rate": 1.4798063953027057e-06, "loss": 0.0, "step": 12230 }, { "epoch": 2.9135920019043082, "grad_norm": 0.0003118833410553634, "learning_rate": 1.4401333015948583e-06, "loss": 0.0, "step": 12240 }, { "epoch": 2.9159723875267796, "grad_norm": 0.00038868881529197097, "learning_rate": 1.4004602078870111e-06, "loss": 0.0, "step": 12250 }, { "epoch": 2.91835277314925, "grad_norm": 0.0005747165414504707, "learning_rate": 1.3607871141791638e-06, "loss": 0.0, "step": 12260 }, { "epoch": 2.920733158771721, "grad_norm": 0.0013731828657910228, "learning_rate": 1.3211140204713164e-06, "loss": 0.0, "step": 12270 }, { "epoch": 2.923113544394192, "grad_norm": 0.000688336614985019, "learning_rate": 1.281440926763469e-06, "loss": 0.0, "step": 12280 }, { "epoch": 2.9254939300166627, "grad_norm": 0.00041094853077083826, "learning_rate": 1.241767833055622e-06, "loss": 0.0, "step": 12290 }, { "epoch": 2.9278743156391336, "grad_norm": 0.00040040462044999003, "learning_rate": 1.2020947393477745e-06, "loss": 0.0, "step": 12300 }, { "epoch": 2.9302547012616045, "grad_norm": 0.0027486933395266533, "learning_rate": 1.1624216456399272e-06, "loss": 0.0, "step": 12310 }, { "epoch": 2.9326350868840754, "grad_norm": 0.000705558864865452, "learning_rate": 1.1227485519320798e-06, "loss": 0.0, "step": 12320 }, { "epoch": 2.935015472506546, "grad_norm": 0.0013841954059898853, "learning_rate": 1.0830754582242325e-06, "loss": 0.0001, "step": 12330 }, { "epoch": 2.937395858129017, "grad_norm": 0.0013595300260931253, "learning_rate": 1.043402364516385e-06, "loss": 0.0, "step": 12340 }, { "epoch": 2.9397762437514876, "grad_norm": 0.0011891273315995932, "learning_rate": 1.0037292708085377e-06, "loss": 0.0, "step": 12350 }, { "epoch": 2.9421566293739585, "grad_norm": 0.0009695956250652671, "learning_rate": 9.640561771006904e-07, "loss": 0.0, "step": 12360 }, { "epoch": 2.9445370149964294, "grad_norm": 0.00034754411899484694, "learning_rate": 9.24383083392843e-07, "loss": 0.0001, "step": 12370 }, { "epoch": 2.9469174006189003, "grad_norm": 0.00020417921768967062, "learning_rate": 8.847099896849956e-07, "loss": 0.0, "step": 12380 }, { "epoch": 2.949297786241371, "grad_norm": 0.0010077544720843434, "learning_rate": 8.450368959771483e-07, "loss": 0.0, "step": 12390 }, { "epoch": 2.951678171863842, "grad_norm": 0.0006951851537451148, "learning_rate": 8.053638022693009e-07, "loss": 0.0, "step": 12400 }, { "epoch": 2.954058557486313, "grad_norm": 0.0005225545028224587, "learning_rate": 7.656907085614537e-07, "loss": 0.0, "step": 12410 }, { "epoch": 2.9564389431087834, "grad_norm": 0.0004363077168818563, "learning_rate": 7.260176148536063e-07, "loss": 0.0, "step": 12420 }, { "epoch": 2.9588193287312543, "grad_norm": 0.00024609945830889046, "learning_rate": 6.863445211457589e-07, "loss": 0.0, "step": 12430 }, { "epoch": 2.961199714353725, "grad_norm": 0.06491145491600037, "learning_rate": 6.466714274379116e-07, "loss": 0.0, "step": 12440 }, { "epoch": 2.963580099976196, "grad_norm": 0.0004482944495975971, "learning_rate": 6.069983337300642e-07, "loss": 0.0, "step": 12450 }, { "epoch": 2.965960485598667, "grad_norm": 0.001836300129070878, "learning_rate": 5.67325240022217e-07, "loss": 0.0, "step": 12460 }, { "epoch": 2.968340871221138, "grad_norm": 0.0004112005408387631, "learning_rate": 5.276521463143697e-07, "loss": 0.0, "step": 12470 }, { "epoch": 2.9707212568436088, "grad_norm": 0.0020831027068197727, "learning_rate": 4.879790526065223e-07, "loss": 0.0, "step": 12480 }, { "epoch": 2.9731016424660797, "grad_norm": 0.0012763678096234798, "learning_rate": 4.4830595889867493e-07, "loss": 0.0, "step": 12490 }, { "epoch": 2.9754820280885506, "grad_norm": 0.0011779662454500794, "learning_rate": 4.086328651908276e-07, "loss": 0.0, "step": 12500 }, { "epoch": 2.977862413711021, "grad_norm": 0.0005871544708497822, "learning_rate": 3.6895977148298026e-07, "loss": 0.0, "step": 12510 }, { "epoch": 2.980242799333492, "grad_norm": 0.002057824982330203, "learning_rate": 3.2928667777513295e-07, "loss": 0.0, "step": 12520 }, { "epoch": 2.982623184955963, "grad_norm": 0.00029588877805508673, "learning_rate": 2.896135840672856e-07, "loss": 0.0, "step": 12530 }, { "epoch": 2.9850035705784337, "grad_norm": 0.0004726073530036956, "learning_rate": 2.499404903594382e-07, "loss": 0.0, "step": 12540 }, { "epoch": 2.9873839562009046, "grad_norm": 0.0014838631032034755, "learning_rate": 2.102673966515909e-07, "loss": 0.0, "step": 12550 }, { "epoch": 2.9897643418233755, "grad_norm": 0.0010778923751786351, "learning_rate": 1.7059430294374355e-07, "loss": 0.0, "step": 12560 }, { "epoch": 2.9921447274458464, "grad_norm": 0.0007851801346987486, "learning_rate": 1.3092120923589622e-07, "loss": 0.0, "step": 12570 }, { "epoch": 2.994525113068317, "grad_norm": 0.00047710456419736147, "learning_rate": 9.124811552804888e-08, "loss": 0.0, "step": 12580 }, { "epoch": 2.996905498690788, "grad_norm": 0.003749624127522111, "learning_rate": 5.1575021820201544e-08, "loss": 0.0, "step": 12590 }, { "epoch": 2.9992858843132586, "grad_norm": 0.0007799621089361608, "learning_rate": 1.1901928112354202e-08, "loss": 0.0001, "step": 12600 }, { "epoch": 3.0, "eval_loss": 2.340411811019294e-07, "eval_runtime": 52.9973, "eval_samples_per_second": 35.247, "eval_steps_per_second": 8.812, "step": 12603 } ], "logging_steps": 10, "max_steps": 12603, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6586245895421952.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }