{ "best_metric": 0.812837658080022, "best_model_checkpoint": "/data/hungnm/unisentiment/modernBERT-large-sentiment/checkpoint-4611", "epoch": 5.0, "eval_steps": 500, "global_step": 7685, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032530904359141183, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.3733, "step": 5 }, { "epoch": 0.006506180871828237, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.3771, "step": 10 }, { "epoch": 0.009759271307742356, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.3846, "step": 15 }, { "epoch": 0.013012361743656473, "grad_norm": 6.415463447570801, "learning_rate": 1.2987012987012988e-06, "loss": 2.3719, "step": 20 }, { "epoch": 0.01626545217957059, "grad_norm": 6.333994388580322, "learning_rate": 4.5454545454545455e-06, "loss": 2.3437, "step": 25 }, { "epoch": 0.01951854261548471, "grad_norm": 8.240402221679688, "learning_rate": 7.792207792207792e-06, "loss": 2.2279, "step": 30 }, { "epoch": 0.02277163305139883, "grad_norm": 3.9838759899139404, "learning_rate": 1.103896103896104e-05, "loss": 2.079, "step": 35 }, { "epoch": 0.026024723487312947, "grad_norm": 4.472020626068115, "learning_rate": 1.4285714285714285e-05, "loss": 1.997, "step": 40 }, { "epoch": 0.029277813923227064, "grad_norm": 3.7005715370178223, "learning_rate": 1.7532467532467535e-05, "loss": 1.9341, "step": 45 }, { "epoch": 0.03253090435914118, "grad_norm": 13.776549339294434, "learning_rate": 2.012987012987013e-05, "loss": 1.8824, "step": 50 }, { "epoch": 0.035783994795055306, "grad_norm": 21.78818130493164, "learning_rate": 2.272727272727273e-05, "loss": 1.8142, "step": 55 }, { "epoch": 0.03903708523096942, "grad_norm": 5.216381549835205, "learning_rate": 2.5324675324675325e-05, "loss": 1.7531, "step": 60 }, { "epoch": 0.04229017566688354, "grad_norm": 1.8160972595214844, "learning_rate": 2.857142857142857e-05, "loss": 1.7342, "step": 65 }, { "epoch": 0.04554326610279766, "grad_norm": 6.6145339012146, "learning_rate": 3.181818181818182e-05, "loss": 1.719, "step": 70 }, { "epoch": 0.048796356538711776, "grad_norm": 2.8823325634002686, "learning_rate": 3.506493506493507e-05, "loss": 1.6266, "step": 75 }, { "epoch": 0.05204944697462589, "grad_norm": 7.7064433097839355, "learning_rate": 3.831168831168831e-05, "loss": 1.5322, "step": 80 }, { "epoch": 0.05530253741054001, "grad_norm": 2.749300241470337, "learning_rate": 4.155844155844156e-05, "loss": 1.5303, "step": 85 }, { "epoch": 0.05855562784645413, "grad_norm": 2.898153305053711, "learning_rate": 4.4805194805194805e-05, "loss": 1.4395, "step": 90 }, { "epoch": 0.06180871828236825, "grad_norm": 2.754175901412964, "learning_rate": 4.8051948051948054e-05, "loss": 1.3824, "step": 95 }, { "epoch": 0.06506180871828236, "grad_norm": 6.261497974395752, "learning_rate": 4.99999914743261e-05, "loss": 1.3976, "step": 100 }, { "epoch": 0.06831489915419649, "grad_norm": 1.882625699043274, "learning_rate": 4.9999895560561514e-05, "loss": 1.3925, "step": 105 }, { "epoch": 0.07156798959011061, "grad_norm": 1.972509503364563, "learning_rate": 4.9999693076350204e-05, "loss": 1.3423, "step": 110 }, { "epoch": 0.07482108002602472, "grad_norm": 3.2671749591827393, "learning_rate": 4.999938402255531e-05, "loss": 1.2724, "step": 115 }, { "epoch": 0.07807417046193885, "grad_norm": 2.316319465637207, "learning_rate": 4.9998968400494294e-05, "loss": 1.2621, "step": 120 }, { "epoch": 0.08132726089785296, "grad_norm": 2.291067361831665, "learning_rate": 4.9998446211938876e-05, "loss": 
1.2517, "step": 125 }, { "epoch": 0.08458035133376708, "grad_norm": 2.1083593368530273, "learning_rate": 4.999781745911506e-05, "loss": 1.2589, "step": 130 }, { "epoch": 0.08783344176968119, "grad_norm": 2.5555965900421143, "learning_rate": 4.9997082144703124e-05, "loss": 1.2316, "step": 135 }, { "epoch": 0.09108653220559532, "grad_norm": 1.8004491329193115, "learning_rate": 4.999624027183758e-05, "loss": 1.2197, "step": 140 }, { "epoch": 0.09433962264150944, "grad_norm": 1.7682017087936401, "learning_rate": 4.999529184410721e-05, "loss": 1.1816, "step": 145 }, { "epoch": 0.09759271307742355, "grad_norm": 2.857111692428589, "learning_rate": 4.999423686555498e-05, "loss": 1.1797, "step": 150 }, { "epoch": 0.10084580351333768, "grad_norm": 3.4952378273010254, "learning_rate": 4.999307534067812e-05, "loss": 1.1883, "step": 155 }, { "epoch": 0.10409889394925179, "grad_norm": 2.0805113315582275, "learning_rate": 4.999180727442799e-05, "loss": 1.2026, "step": 160 }, { "epoch": 0.10735198438516591, "grad_norm": 2.541172504425049, "learning_rate": 4.9990432672210174e-05, "loss": 1.1911, "step": 165 }, { "epoch": 0.11060507482108002, "grad_norm": 2.9252259731292725, "learning_rate": 4.9988951539884365e-05, "loss": 1.2091, "step": 170 }, { "epoch": 0.11385816525699415, "grad_norm": 1.487844705581665, "learning_rate": 4.9987363883764396e-05, "loss": 1.1773, "step": 175 }, { "epoch": 0.11711125569290826, "grad_norm": 1.4860082864761353, "learning_rate": 4.9985669710618156e-05, "loss": 1.1507, "step": 180 }, { "epoch": 0.12036434612882238, "grad_norm": 1.6352834701538086, "learning_rate": 4.9983869027667656e-05, "loss": 1.111, "step": 185 }, { "epoch": 0.1236174365647365, "grad_norm": 1.2237991094589233, "learning_rate": 4.99819618425889e-05, "loss": 1.1241, "step": 190 }, { "epoch": 0.12687052700065063, "grad_norm": 1.7817842960357666, "learning_rate": 4.997994816351191e-05, "loss": 1.1474, "step": 195 }, { "epoch": 0.13012361743656473, "grad_norm": 1.440572738647461, "learning_rate": 4.997782799902065e-05, "loss": 1.1401, "step": 200 }, { "epoch": 0.13337670787247885, "grad_norm": 1.1736341714859009, "learning_rate": 4.997560135815307e-05, "loss": 1.1317, "step": 205 }, { "epoch": 0.13662979830839297, "grad_norm": 1.5103353261947632, "learning_rate": 4.997326825040094e-05, "loss": 1.1289, "step": 210 }, { "epoch": 0.1398828887443071, "grad_norm": 2.190202236175537, "learning_rate": 4.997082868570993e-05, "loss": 1.1173, "step": 215 }, { "epoch": 0.14313597918022122, "grad_norm": 1.1619236469268799, "learning_rate": 4.9968282674479486e-05, "loss": 1.1033, "step": 220 }, { "epoch": 0.14638906961613532, "grad_norm": 1.4644910097122192, "learning_rate": 4.9965630227562866e-05, "loss": 1.0935, "step": 225 }, { "epoch": 0.14964216005204944, "grad_norm": 1.8269380331039429, "learning_rate": 4.9962871356266994e-05, "loss": 1.0968, "step": 230 }, { "epoch": 0.15289525048796357, "grad_norm": 1.0320109128952026, "learning_rate": 4.996000607235248e-05, "loss": 1.0718, "step": 235 }, { "epoch": 0.1561483409238777, "grad_norm": 2.302537202835083, "learning_rate": 4.995703438803359e-05, "loss": 1.1044, "step": 240 }, { "epoch": 0.1594014313597918, "grad_norm": 2.2435052394866943, "learning_rate": 4.995395631597809e-05, "loss": 1.0936, "step": 245 }, { "epoch": 0.16265452179570591, "grad_norm": 2.0816986560821533, "learning_rate": 4.995077186930731e-05, "loss": 1.1036, "step": 250 }, { "epoch": 0.16590761223162004, "grad_norm": 1.0913221836090088, "learning_rate": 4.994748106159602e-05, "loss": 1.0906, "step": 255 
}, { "epoch": 0.16916070266753416, "grad_norm": 1.5220658779144287, "learning_rate": 4.9944083906872405e-05, "loss": 1.0789, "step": 260 }, { "epoch": 0.1724137931034483, "grad_norm": 2.121184825897217, "learning_rate": 4.994058041961796e-05, "loss": 1.0718, "step": 265 }, { "epoch": 0.17566688353936238, "grad_norm": 1.4716627597808838, "learning_rate": 4.9936970614767485e-05, "loss": 1.0378, "step": 270 }, { "epoch": 0.1789199739752765, "grad_norm": 1.700437068939209, "learning_rate": 4.993325450770898e-05, "loss": 1.0586, "step": 275 }, { "epoch": 0.18217306441119063, "grad_norm": 1.87433660030365, "learning_rate": 4.9929432114283614e-05, "loss": 1.0486, "step": 280 }, { "epoch": 0.18542615484710476, "grad_norm": 1.4089707136154175, "learning_rate": 4.992550345078559e-05, "loss": 1.0547, "step": 285 }, { "epoch": 0.18867924528301888, "grad_norm": 1.6576772928237915, "learning_rate": 4.992146853396219e-05, "loss": 1.033, "step": 290 }, { "epoch": 0.19193233571893298, "grad_norm": 1.2037906646728516, "learning_rate": 4.9917327381013585e-05, "loss": 1.0335, "step": 295 }, { "epoch": 0.1951854261548471, "grad_norm": 1.2297475337982178, "learning_rate": 4.9913080009592824e-05, "loss": 1.0508, "step": 300 }, { "epoch": 0.19843851659076123, "grad_norm": 1.6239577531814575, "learning_rate": 4.990872643780577e-05, "loss": 1.0528, "step": 305 }, { "epoch": 0.20169160702667535, "grad_norm": 1.4611440896987915, "learning_rate": 4.9904266684210964e-05, "loss": 1.0437, "step": 310 }, { "epoch": 0.20494469746258945, "grad_norm": 1.5891550779342651, "learning_rate": 4.989970076781961e-05, "loss": 1.0262, "step": 315 }, { "epoch": 0.20819778789850357, "grad_norm": 1.6164276599884033, "learning_rate": 4.9895028708095474e-05, "loss": 1.0388, "step": 320 }, { "epoch": 0.2114508783344177, "grad_norm": 1.509124517440796, "learning_rate": 4.989025052495476e-05, "loss": 1.0332, "step": 325 }, { "epoch": 0.21470396877033182, "grad_norm": 1.2852742671966553, "learning_rate": 4.988536623876609e-05, "loss": 1.0306, "step": 330 }, { "epoch": 0.21795705920624595, "grad_norm": 1.3699005842208862, "learning_rate": 4.988037587035036e-05, "loss": 1.0266, "step": 335 }, { "epoch": 0.22121014964216004, "grad_norm": 1.5352535247802734, "learning_rate": 4.98752794409807e-05, "loss": 1.0281, "step": 340 }, { "epoch": 0.22446324007807417, "grad_norm": 1.1084707975387573, "learning_rate": 4.9870076972382354e-05, "loss": 0.9941, "step": 345 }, { "epoch": 0.2277163305139883, "grad_norm": 2.580686330795288, "learning_rate": 4.9864768486732585e-05, "loss": 1.022, "step": 350 }, { "epoch": 0.23096942094990242, "grad_norm": 22.110929489135742, "learning_rate": 4.98593540066606e-05, "loss": 1.0305, "step": 355 }, { "epoch": 0.2342225113858165, "grad_norm": 1.32615065574646, "learning_rate": 4.985383355524743e-05, "loss": 1.0218, "step": 360 }, { "epoch": 0.23747560182173064, "grad_norm": 1.7118041515350342, "learning_rate": 4.984820715602585e-05, "loss": 1.0216, "step": 365 }, { "epoch": 0.24072869225764476, "grad_norm": 0.9846453070640564, "learning_rate": 4.984247483298029e-05, "loss": 1.0244, "step": 370 }, { "epoch": 0.24398178269355889, "grad_norm": 1.1930924654006958, "learning_rate": 4.9836636610546697e-05, "loss": 1.0333, "step": 375 }, { "epoch": 0.247234873129473, "grad_norm": 1.5189563035964966, "learning_rate": 4.983069251361244e-05, "loss": 1.0261, "step": 380 }, { "epoch": 0.2504879635653871, "grad_norm": 1.648707389831543, "learning_rate": 4.982464256751624e-05, "loss": 1.032, "step": 385 }, { "epoch": 
0.25374105400130126, "grad_norm": 1.6429810523986816, "learning_rate": 4.981848679804803e-05, "loss": 1.0325, "step": 390 }, { "epoch": 0.25699414443721535, "grad_norm": 1.6516709327697754, "learning_rate": 4.981222523144882e-05, "loss": 1.0036, "step": 395 }, { "epoch": 0.26024723487312945, "grad_norm": 1.458806037902832, "learning_rate": 4.980585789441066e-05, "loss": 1.0104, "step": 400 }, { "epoch": 0.2635003253090436, "grad_norm": 1.695860505104065, "learning_rate": 4.979938481407645e-05, "loss": 1.0072, "step": 405 }, { "epoch": 0.2667534157449577, "grad_norm": 1.897962212562561, "learning_rate": 4.9792806018039876e-05, "loss": 1.0102, "step": 410 }, { "epoch": 0.27000650618087185, "grad_norm": 1.5890872478485107, "learning_rate": 4.9786121534345265e-05, "loss": 0.9945, "step": 415 }, { "epoch": 0.27325959661678595, "grad_norm": 1.090980887413025, "learning_rate": 4.977933139148746e-05, "loss": 0.9897, "step": 420 }, { "epoch": 0.27651268705270005, "grad_norm": 1.3130533695220947, "learning_rate": 4.977243561841174e-05, "loss": 1.0161, "step": 425 }, { "epoch": 0.2797657774886142, "grad_norm": 1.4305720329284668, "learning_rate": 4.976543424451365e-05, "loss": 0.9802, "step": 430 }, { "epoch": 0.2830188679245283, "grad_norm": 1.3603191375732422, "learning_rate": 4.975832729963888e-05, "loss": 0.9904, "step": 435 }, { "epoch": 0.28627195836044245, "grad_norm": 4.940343856811523, "learning_rate": 4.9751114814083186e-05, "loss": 0.9873, "step": 440 }, { "epoch": 0.28952504879635654, "grad_norm": 2.8304941654205322, "learning_rate": 4.974379681859221e-05, "loss": 0.9855, "step": 445 }, { "epoch": 0.29277813923227064, "grad_norm": 1.6684479713439941, "learning_rate": 4.973637334436135e-05, "loss": 1.0027, "step": 450 }, { "epoch": 0.2960312296681848, "grad_norm": 1.036837100982666, "learning_rate": 4.972884442303566e-05, "loss": 0.999, "step": 455 }, { "epoch": 0.2992843201040989, "grad_norm": 1.512636423110962, "learning_rate": 4.972121008670971e-05, "loss": 0.9988, "step": 460 }, { "epoch": 0.302537410540013, "grad_norm": 1.7532439231872559, "learning_rate": 4.97134703679274e-05, "loss": 0.984, "step": 465 }, { "epoch": 0.30579050097592714, "grad_norm": 1.0935814380645752, "learning_rate": 4.970562529968189e-05, "loss": 0.9725, "step": 470 }, { "epoch": 0.30904359141184123, "grad_norm": 0.9008370041847229, "learning_rate": 4.969767491541543e-05, "loss": 0.9788, "step": 475 }, { "epoch": 0.3122966818477554, "grad_norm": 0.9959591627120972, "learning_rate": 4.9689619249019174e-05, "loss": 0.9821, "step": 480 }, { "epoch": 0.3155497722836695, "grad_norm": 1.1844329833984375, "learning_rate": 4.9681458334833114e-05, "loss": 0.9588, "step": 485 }, { "epoch": 0.3188028627195836, "grad_norm": 1.136703372001648, "learning_rate": 4.9673192207645894e-05, "loss": 0.9948, "step": 490 }, { "epoch": 0.32205595315549773, "grad_norm": 1.3169392347335815, "learning_rate": 4.9664820902694654e-05, "loss": 0.9924, "step": 495 }, { "epoch": 0.32530904359141183, "grad_norm": 0.863590657711029, "learning_rate": 4.9656344455664885e-05, "loss": 0.9875, "step": 500 }, { "epoch": 0.328562134027326, "grad_norm": 1.396112084388733, "learning_rate": 4.9647762902690295e-05, "loss": 0.9765, "step": 505 }, { "epoch": 0.3318152244632401, "grad_norm": 2.4344260692596436, "learning_rate": 4.963907628035264e-05, "loss": 0.9848, "step": 510 }, { "epoch": 0.3350683148991542, "grad_norm": 2.1281023025512695, "learning_rate": 4.963028462568154e-05, "loss": 0.9926, "step": 515 }, { "epoch": 0.3383214053350683, "grad_norm": 
1.2993323802947998, "learning_rate": 4.9621387976154396e-05, "loss": 0.9782, "step": 520 }, { "epoch": 0.3415744957709824, "grad_norm": 1.3118032217025757, "learning_rate": 4.961238636969616e-05, "loss": 0.9885, "step": 525 }, { "epoch": 0.3448275862068966, "grad_norm": 3.1309878826141357, "learning_rate": 4.960327984467919e-05, "loss": 0.9786, "step": 530 }, { "epoch": 0.34808067664281067, "grad_norm": 0.9523271918296814, "learning_rate": 4.9594068439923115e-05, "loss": 0.9742, "step": 535 }, { "epoch": 0.35133376707872477, "grad_norm": 1.010175347328186, "learning_rate": 4.958475219469464e-05, "loss": 0.9569, "step": 540 }, { "epoch": 0.3545868575146389, "grad_norm": 0.8311709761619568, "learning_rate": 4.9575331148707385e-05, "loss": 0.9638, "step": 545 }, { "epoch": 0.357839947950553, "grad_norm": 2.4854001998901367, "learning_rate": 4.9565805342121716e-05, "loss": 0.9537, "step": 550 }, { "epoch": 0.36109303838646717, "grad_norm": 11.345256805419922, "learning_rate": 4.955617481554459e-05, "loss": 0.9738, "step": 555 }, { "epoch": 0.36434612882238127, "grad_norm": 1.5204081535339355, "learning_rate": 4.954643961002936e-05, "loss": 0.9907, "step": 560 }, { "epoch": 0.36759921925829536, "grad_norm": 1.0884438753128052, "learning_rate": 4.95365997670756e-05, "loss": 0.9653, "step": 565 }, { "epoch": 0.3708523096942095, "grad_norm": 0.7718302011489868, "learning_rate": 4.952665532862895e-05, "loss": 0.9573, "step": 570 }, { "epoch": 0.3741054001301236, "grad_norm": 1.0595511198043823, "learning_rate": 4.9516606337080904e-05, "loss": 0.9535, "step": 575 }, { "epoch": 0.37735849056603776, "grad_norm": 0.9665298461914062, "learning_rate": 4.950645283526868e-05, "loss": 0.9614, "step": 580 }, { "epoch": 0.38061158100195186, "grad_norm": 2.5156960487365723, "learning_rate": 4.949619486647497e-05, "loss": 0.9699, "step": 585 }, { "epoch": 0.38386467143786596, "grad_norm": 0.983572244644165, "learning_rate": 4.948583247442783e-05, "loss": 0.9419, "step": 590 }, { "epoch": 0.3871177618737801, "grad_norm": 1.5895793437957764, "learning_rate": 4.9475365703300416e-05, "loss": 0.9431, "step": 595 }, { "epoch": 0.3903708523096942, "grad_norm": 0.7313794493675232, "learning_rate": 4.9464794597710864e-05, "loss": 0.9517, "step": 600 }, { "epoch": 0.3936239427456083, "grad_norm": 0.8579323887825012, "learning_rate": 4.945411920272205e-05, "loss": 0.9623, "step": 605 }, { "epoch": 0.39687703318152245, "grad_norm": 1.7319488525390625, "learning_rate": 4.944333956384144e-05, "loss": 0.9652, "step": 610 }, { "epoch": 0.40013012361743655, "grad_norm": 0.8903234601020813, "learning_rate": 4.943245572702086e-05, "loss": 0.9601, "step": 615 }, { "epoch": 0.4033832140533507, "grad_norm": 0.8647845387458801, "learning_rate": 4.942146773865631e-05, "loss": 0.9503, "step": 620 }, { "epoch": 0.4066363044892648, "grad_norm": 1.0848610401153564, "learning_rate": 4.941037564558779e-05, "loss": 0.9482, "step": 625 }, { "epoch": 0.4098893949251789, "grad_norm": 1.0989733934402466, "learning_rate": 4.939917949509907e-05, "loss": 0.9418, "step": 630 }, { "epoch": 0.41314248536109305, "grad_norm": 0.8608380556106567, "learning_rate": 4.938787933491749e-05, "loss": 0.943, "step": 635 }, { "epoch": 0.41639557579700714, "grad_norm": 2.864649772644043, "learning_rate": 4.937647521321378e-05, "loss": 0.948, "step": 640 }, { "epoch": 0.4196486662329213, "grad_norm": 0.8896744847297668, "learning_rate": 4.936496717860184e-05, "loss": 0.9573, "step": 645 }, { "epoch": 0.4229017566688354, "grad_norm": 1.1625382900238037, 
"learning_rate": 4.935335528013853e-05, "loss": 0.9497, "step": 650 }, { "epoch": 0.4261548471047495, "grad_norm": 0.8717337846755981, "learning_rate": 4.934163956732345e-05, "loss": 0.9442, "step": 655 }, { "epoch": 0.42940793754066364, "grad_norm": 0.9472817778587341, "learning_rate": 4.932982009009879e-05, "loss": 0.941, "step": 660 }, { "epoch": 0.43266102797657774, "grad_norm": 1.1837284564971924, "learning_rate": 4.931789689884901e-05, "loss": 0.9521, "step": 665 }, { "epoch": 0.4359141184124919, "grad_norm": 0.7896617650985718, "learning_rate": 4.9305870044400725e-05, "loss": 0.9407, "step": 670 }, { "epoch": 0.439167208848406, "grad_norm": 1.0529261827468872, "learning_rate": 4.9293739578022444e-05, "loss": 0.9483, "step": 675 }, { "epoch": 0.4424202992843201, "grad_norm": 1.018526315689087, "learning_rate": 4.928150555142436e-05, "loss": 0.9474, "step": 680 }, { "epoch": 0.44567338972023424, "grad_norm": 1.2944765090942383, "learning_rate": 4.926916801675809e-05, "loss": 0.9548, "step": 685 }, { "epoch": 0.44892648015614833, "grad_norm": 1.1999421119689941, "learning_rate": 4.925672702661653e-05, "loss": 0.9313, "step": 690 }, { "epoch": 0.4521795705920625, "grad_norm": 1.117092490196228, "learning_rate": 4.92441826340336e-05, "loss": 0.9212, "step": 695 }, { "epoch": 0.4554326610279766, "grad_norm": 0.976388692855835, "learning_rate": 4.923153489248395e-05, "loss": 0.9258, "step": 700 }, { "epoch": 0.4586857514638907, "grad_norm": 1.206737756729126, "learning_rate": 4.921878385588284e-05, "loss": 0.897, "step": 705 }, { "epoch": 0.46193884189980483, "grad_norm": 0.9215301871299744, "learning_rate": 4.920592957858584e-05, "loss": 0.9646, "step": 710 }, { "epoch": 0.4651919323357189, "grad_norm": 2.4111385345458984, "learning_rate": 4.9192972115388634e-05, "loss": 0.9249, "step": 715 }, { "epoch": 0.468445022771633, "grad_norm": 1.1773241758346558, "learning_rate": 4.9179911521526734e-05, "loss": 0.9289, "step": 720 }, { "epoch": 0.4716981132075472, "grad_norm": 1.2238733768463135, "learning_rate": 4.9166747852675325e-05, "loss": 0.9672, "step": 725 }, { "epoch": 0.4749512036434613, "grad_norm": 0.8060822486877441, "learning_rate": 4.9153481164948964e-05, "loss": 0.9571, "step": 730 }, { "epoch": 0.4782042940793754, "grad_norm": 1.0834110975265503, "learning_rate": 4.914011151490135e-05, "loss": 0.925, "step": 735 }, { "epoch": 0.4814573845152895, "grad_norm": 2.0932958126068115, "learning_rate": 4.912663895952511e-05, "loss": 0.9022, "step": 740 }, { "epoch": 0.4847104749512036, "grad_norm": 0.9362598061561584, "learning_rate": 4.911306355625154e-05, "loss": 0.9569, "step": 745 }, { "epoch": 0.48796356538711777, "grad_norm": 0.8350614905357361, "learning_rate": 4.909938536295034e-05, "loss": 0.9283, "step": 750 }, { "epoch": 0.49121665582303187, "grad_norm": 0.7186315059661865, "learning_rate": 4.908560443792941e-05, "loss": 0.916, "step": 755 }, { "epoch": 0.494469746258946, "grad_norm": 0.8941408395767212, "learning_rate": 4.907172083993457e-05, "loss": 0.9132, "step": 760 }, { "epoch": 0.4977228366948601, "grad_norm": 0.6881601214408875, "learning_rate": 4.9057734628149296e-05, "loss": 0.9401, "step": 765 }, { "epoch": 0.5009759271307742, "grad_norm": 0.7907692790031433, "learning_rate": 4.904364586219454e-05, "loss": 0.9453, "step": 770 }, { "epoch": 0.5042290175666884, "grad_norm": 0.6884419322013855, "learning_rate": 4.902945460212839e-05, "loss": 0.9213, "step": 775 }, { "epoch": 0.5074821080026025, "grad_norm": 0.9783698916435242, "learning_rate": 4.9015160908445846e-05, 
"loss": 0.9324, "step": 780 }, { "epoch": 0.5107351984385166, "grad_norm": 0.670250415802002, "learning_rate": 4.900076484207857e-05, "loss": 0.9186, "step": 785 }, { "epoch": 0.5139882888744307, "grad_norm": 0.6589803695678711, "learning_rate": 4.8986266464394645e-05, "loss": 0.9245, "step": 790 }, { "epoch": 0.5172413793103449, "grad_norm": 0.8181180953979492, "learning_rate": 4.8971665837198266e-05, "loss": 0.9117, "step": 795 }, { "epoch": 0.5204944697462589, "grad_norm": 0.9122630953788757, "learning_rate": 4.89569630227295e-05, "loss": 0.916, "step": 800 }, { "epoch": 0.523747560182173, "grad_norm": 1.7749128341674805, "learning_rate": 4.894215808366404e-05, "loss": 0.927, "step": 805 }, { "epoch": 0.5270006506180872, "grad_norm": 0.708722710609436, "learning_rate": 4.892725108311289e-05, "loss": 0.9378, "step": 810 }, { "epoch": 0.5302537410540012, "grad_norm": 0.7337002754211426, "learning_rate": 4.891224208462217e-05, "loss": 0.9042, "step": 815 }, { "epoch": 0.5335068314899154, "grad_norm": 0.6495442986488342, "learning_rate": 4.889713115217276e-05, "loss": 0.9271, "step": 820 }, { "epoch": 0.5367599219258296, "grad_norm": 0.6206863522529602, "learning_rate": 4.8881918350180076e-05, "loss": 0.9292, "step": 825 }, { "epoch": 0.5400130123617437, "grad_norm": 1.1107088327407837, "learning_rate": 4.886660374349381e-05, "loss": 0.9002, "step": 830 }, { "epoch": 0.5432661027976577, "grad_norm": 0.7245462536811829, "learning_rate": 4.885118739739759e-05, "loss": 0.898, "step": 835 }, { "epoch": 0.5465191932335719, "grad_norm": 0.7149853110313416, "learning_rate": 4.883566937760879e-05, "loss": 0.9105, "step": 840 }, { "epoch": 0.549772283669486, "grad_norm": 0.8408038020133972, "learning_rate": 4.882004975027816e-05, "loss": 0.9059, "step": 845 }, { "epoch": 0.5530253741054001, "grad_norm": 0.8386579155921936, "learning_rate": 4.880432858198962e-05, "loss": 0.9148, "step": 850 }, { "epoch": 0.5562784645413142, "grad_norm": 0.6639278531074524, "learning_rate": 4.878850593975992e-05, "loss": 0.8995, "step": 855 }, { "epoch": 0.5595315549772284, "grad_norm": 0.8974155187606812, "learning_rate": 4.8772581891038385e-05, "loss": 0.9406, "step": 860 }, { "epoch": 0.5627846454131424, "grad_norm": 1.0255565643310547, "learning_rate": 4.875655650370662e-05, "loss": 0.9067, "step": 865 }, { "epoch": 0.5660377358490566, "grad_norm": 1.0870089530944824, "learning_rate": 4.87404298460782e-05, "loss": 0.9239, "step": 870 }, { "epoch": 0.5692908262849707, "grad_norm": 0.7773581743240356, "learning_rate": 4.872420198689845e-05, "loss": 0.9284, "step": 875 }, { "epoch": 0.5725439167208849, "grad_norm": 0.6731524467468262, "learning_rate": 4.870787299534404e-05, "loss": 0.9122, "step": 880 }, { "epoch": 0.5757970071567989, "grad_norm": 0.8352004885673523, "learning_rate": 4.869144294102279e-05, "loss": 0.906, "step": 885 }, { "epoch": 0.5790500975927131, "grad_norm": 0.7254254817962646, "learning_rate": 4.8674911893973305e-05, "loss": 0.899, "step": 890 }, { "epoch": 0.5823031880286272, "grad_norm": 0.6976432204246521, "learning_rate": 4.8658279924664754e-05, "loss": 0.9104, "step": 895 }, { "epoch": 0.5855562784645413, "grad_norm": 0.7411409616470337, "learning_rate": 4.8641547103996456e-05, "loss": 0.9216, "step": 900 }, { "epoch": 0.5888093689004554, "grad_norm": 0.8754427433013916, "learning_rate": 4.862471350329769e-05, "loss": 0.9156, "step": 905 }, { "epoch": 0.5920624593363696, "grad_norm": 0.7311625480651855, "learning_rate": 4.8607779194327344e-05, "loss": 0.9125, "step": 910 }, { "epoch": 
0.5953155497722836, "grad_norm": 0.7393909096717834, "learning_rate": 4.8590744249273566e-05, "loss": 0.9011, "step": 915 }, { "epoch": 0.5985686402081978, "grad_norm": 0.7025457620620728, "learning_rate": 4.857360874075355e-05, "loss": 0.9198, "step": 920 }, { "epoch": 0.6018217306441119, "grad_norm": 0.7867306470870972, "learning_rate": 4.855637274181314e-05, "loss": 0.8803, "step": 925 }, { "epoch": 0.605074821080026, "grad_norm": 0.6452615857124329, "learning_rate": 4.853903632592657e-05, "loss": 0.9104, "step": 930 }, { "epoch": 0.6083279115159401, "grad_norm": 0.8884091377258301, "learning_rate": 4.852159956699614e-05, "loss": 0.9119, "step": 935 }, { "epoch": 0.6115810019518543, "grad_norm": 0.5305378437042236, "learning_rate": 4.850406253935188e-05, "loss": 0.9296, "step": 940 }, { "epoch": 0.6148340923877684, "grad_norm": 3.5500142574310303, "learning_rate": 4.848642531775126e-05, "loss": 0.8996, "step": 945 }, { "epoch": 0.6180871828236825, "grad_norm": 0.7085503339767456, "learning_rate": 4.846868797737886e-05, "loss": 0.9045, "step": 950 }, { "epoch": 0.6213402732595966, "grad_norm": 0.9540202021598816, "learning_rate": 4.8450850593846035e-05, "loss": 0.9041, "step": 955 }, { "epoch": 0.6245933636955108, "grad_norm": 0.7558302879333496, "learning_rate": 4.843291324319064e-05, "loss": 0.8998, "step": 960 }, { "epoch": 0.6278464541314248, "grad_norm": 1.2235993146896362, "learning_rate": 4.8414876001876636e-05, "loss": 0.8977, "step": 965 }, { "epoch": 0.631099544567339, "grad_norm": 0.9432989954948425, "learning_rate": 4.839673894679383e-05, "loss": 0.9013, "step": 970 }, { "epoch": 0.6343526350032531, "grad_norm": 0.8516787886619568, "learning_rate": 4.83785021552575e-05, "loss": 0.9195, "step": 975 }, { "epoch": 0.6376057254391672, "grad_norm": 0.9901612401008606, "learning_rate": 4.836016570500809e-05, "loss": 0.8917, "step": 980 }, { "epoch": 0.6408588158750813, "grad_norm": 0.7319976687431335, "learning_rate": 4.834172967421088e-05, "loss": 0.8961, "step": 985 }, { "epoch": 0.6441119063109955, "grad_norm": 0.8301442265510559, "learning_rate": 4.832319414145565e-05, "loss": 0.9025, "step": 990 }, { "epoch": 0.6473649967469096, "grad_norm": 0.7469923496246338, "learning_rate": 4.8304559185756303e-05, "loss": 0.8908, "step": 995 }, { "epoch": 0.6506180871828237, "grad_norm": 0.7159551382064819, "learning_rate": 4.828582488655062e-05, "loss": 0.8948, "step": 1000 }, { "epoch": 0.6538711776187378, "grad_norm": 1.0059654712677002, "learning_rate": 4.826699132369983e-05, "loss": 0.9101, "step": 1005 }, { "epoch": 0.657124268054652, "grad_norm": 0.9471872448921204, "learning_rate": 4.824805857748831e-05, "loss": 0.9172, "step": 1010 }, { "epoch": 0.660377358490566, "grad_norm": 0.7876858711242676, "learning_rate": 4.822902672862325e-05, "loss": 0.8933, "step": 1015 }, { "epoch": 0.6636304489264802, "grad_norm": 0.7322363257408142, "learning_rate": 4.82098958582343e-05, "loss": 0.897, "step": 1020 }, { "epoch": 0.6668835393623943, "grad_norm": 0.7154924273490906, "learning_rate": 4.819066604787321e-05, "loss": 0.8931, "step": 1025 }, { "epoch": 0.6701366297983083, "grad_norm": 0.759896457195282, "learning_rate": 4.817133737951352e-05, "loss": 0.894, "step": 1030 }, { "epoch": 0.6733897202342225, "grad_norm": 0.9226410984992981, "learning_rate": 4.815190993555013e-05, "loss": 0.884, "step": 1035 }, { "epoch": 0.6766428106701367, "grad_norm": 0.7603817582130432, "learning_rate": 4.8132383798799077e-05, "loss": 0.901, "step": 1040 }, { "epoch": 0.6798959011060507, "grad_norm": 
0.5785139203071594, "learning_rate": 4.811275905249705e-05, "loss": 0.9105, "step": 1045 }, { "epoch": 0.6831489915419648, "grad_norm": 0.9068583846092224, "learning_rate": 4.8093035780301135e-05, "loss": 0.8941, "step": 1050 }, { "epoch": 0.686402081977879, "grad_norm": 0.6756225228309631, "learning_rate": 4.807321406628838e-05, "loss": 0.9318, "step": 1055 }, { "epoch": 0.6896551724137931, "grad_norm": 0.5247710943222046, "learning_rate": 4.805329399495552e-05, "loss": 0.8878, "step": 1060 }, { "epoch": 0.6929082628497072, "grad_norm": 0.7933164238929749, "learning_rate": 4.8033275651218525e-05, "loss": 0.8926, "step": 1065 }, { "epoch": 0.6961613532856213, "grad_norm": 0.7385444641113281, "learning_rate": 4.8013159120412324e-05, "loss": 0.9179, "step": 1070 }, { "epoch": 0.6994144437215355, "grad_norm": 0.5647626519203186, "learning_rate": 4.7992944488290357e-05, "loss": 0.8982, "step": 1075 }, { "epoch": 0.7026675341574495, "grad_norm": 0.7482002973556519, "learning_rate": 4.79726318410243e-05, "loss": 0.8903, "step": 1080 }, { "epoch": 0.7059206245933637, "grad_norm": 0.7001275420188904, "learning_rate": 4.7952221265203626e-05, "loss": 0.9044, "step": 1085 }, { "epoch": 0.7091737150292778, "grad_norm": 0.7466359734535217, "learning_rate": 4.793171284783525e-05, "loss": 0.8829, "step": 1090 }, { "epoch": 0.7124268054651919, "grad_norm": 0.938048779964447, "learning_rate": 4.791110667634321e-05, "loss": 0.9097, "step": 1095 }, { "epoch": 0.715679895901106, "grad_norm": 1.033058762550354, "learning_rate": 4.789040283856822e-05, "loss": 0.8829, "step": 1100 }, { "epoch": 0.7189329863370202, "grad_norm": 0.7184875011444092, "learning_rate": 4.7869601422767326e-05, "loss": 0.9007, "step": 1105 }, { "epoch": 0.7221860767729343, "grad_norm": 0.6718536615371704, "learning_rate": 4.784870251761357e-05, "loss": 0.8909, "step": 1110 }, { "epoch": 0.7254391672088484, "grad_norm": 0.854129433631897, "learning_rate": 4.782770621219552e-05, "loss": 0.9017, "step": 1115 }, { "epoch": 0.7286922576447625, "grad_norm": 0.6812130212783813, "learning_rate": 4.7806612596017e-05, "loss": 0.8995, "step": 1120 }, { "epoch": 0.7319453480806767, "grad_norm": 0.6341739892959595, "learning_rate": 4.778542175899662e-05, "loss": 0.865, "step": 1125 }, { "epoch": 0.7351984385165907, "grad_norm": 0.7255128026008606, "learning_rate": 4.776413379146743e-05, "loss": 0.8991, "step": 1130 }, { "epoch": 0.7384515289525049, "grad_norm": 0.6501480340957642, "learning_rate": 4.7742748784176554e-05, "loss": 0.896, "step": 1135 }, { "epoch": 0.741704619388419, "grad_norm": 1.5047345161437988, "learning_rate": 4.7721266828284754e-05, "loss": 0.9003, "step": 1140 }, { "epoch": 0.7449577098243331, "grad_norm": 0.9915900230407715, "learning_rate": 4.769968801536608e-05, "loss": 0.887, "step": 1145 }, { "epoch": 0.7482108002602472, "grad_norm": 1.0044691562652588, "learning_rate": 4.767801243740746e-05, "loss": 0.8908, "step": 1150 }, { "epoch": 0.7514638906961614, "grad_norm": 0.5450899004936218, "learning_rate": 4.765624018680833e-05, "loss": 0.9114, "step": 1155 }, { "epoch": 0.7547169811320755, "grad_norm": 0.6787106394767761, "learning_rate": 4.763437135638021e-05, "loss": 0.9027, "step": 1160 }, { "epoch": 0.7579700715679896, "grad_norm": 0.8962056636810303, "learning_rate": 4.761240603934633e-05, "loss": 0.9099, "step": 1165 }, { "epoch": 0.7612231620039037, "grad_norm": 0.7145211100578308, "learning_rate": 4.759034432934123e-05, "loss": 0.909, "step": 1170 }, { "epoch": 0.7644762524398179, "grad_norm": 0.5293973684310913, 
"learning_rate": 4.7568186320410356e-05, "loss": 0.8734, "step": 1175 }, { "epoch": 0.7677293428757319, "grad_norm": 0.741165041923523, "learning_rate": 4.754593210700966e-05, "loss": 0.866, "step": 1180 }, { "epoch": 0.7709824333116461, "grad_norm": 0.7917879819869995, "learning_rate": 4.7523581784005187e-05, "loss": 0.8871, "step": 1185 }, { "epoch": 0.7742355237475602, "grad_norm": 0.7449229955673218, "learning_rate": 4.750113544667271e-05, "loss": 0.8966, "step": 1190 }, { "epoch": 0.7774886141834743, "grad_norm": 0.6702096462249756, "learning_rate": 4.7478593190697254e-05, "loss": 0.8784, "step": 1195 }, { "epoch": 0.7807417046193884, "grad_norm": 0.6181638240814209, "learning_rate": 4.745595511217277e-05, "loss": 0.9148, "step": 1200 }, { "epoch": 0.7839947950553026, "grad_norm": 0.6590509414672852, "learning_rate": 4.743322130760166e-05, "loss": 0.8738, "step": 1205 }, { "epoch": 0.7872478854912166, "grad_norm": 0.6860212087631226, "learning_rate": 4.7410391873894386e-05, "loss": 0.8986, "step": 1210 }, { "epoch": 0.7905009759271308, "grad_norm": 0.8018529415130615, "learning_rate": 4.73874669083691e-05, "loss": 0.8894, "step": 1215 }, { "epoch": 0.7937540663630449, "grad_norm": 0.49539148807525635, "learning_rate": 4.736444650875114e-05, "loss": 0.8812, "step": 1220 }, { "epoch": 0.7970071567989591, "grad_norm": 0.5640125870704651, "learning_rate": 4.7341330773172686e-05, "loss": 0.8681, "step": 1225 }, { "epoch": 0.8002602472348731, "grad_norm": 0.7945250272750854, "learning_rate": 4.731811980017234e-05, "loss": 0.8608, "step": 1230 }, { "epoch": 0.8035133376707873, "grad_norm": 1.0091153383255005, "learning_rate": 4.729481368869465e-05, "loss": 0.8853, "step": 1235 }, { "epoch": 0.8067664281067014, "grad_norm": 0.6991880536079407, "learning_rate": 4.727141253808974e-05, "loss": 0.8783, "step": 1240 }, { "epoch": 0.8100195185426154, "grad_norm": 1.1925946474075317, "learning_rate": 4.724791644811287e-05, "loss": 0.8685, "step": 1245 }, { "epoch": 0.8132726089785296, "grad_norm": 0.5790075659751892, "learning_rate": 4.722432551892402e-05, "loss": 0.8715, "step": 1250 }, { "epoch": 0.8165256994144438, "grad_norm": 0.6202079653739929, "learning_rate": 4.720063985108743e-05, "loss": 0.8673, "step": 1255 }, { "epoch": 0.8197787898503578, "grad_norm": 0.6250977516174316, "learning_rate": 4.717685954557123e-05, "loss": 0.8636, "step": 1260 }, { "epoch": 0.8230318802862719, "grad_norm": 0.5988998413085938, "learning_rate": 4.715298470374694e-05, "loss": 0.8929, "step": 1265 }, { "epoch": 0.8262849707221861, "grad_norm": 0.6314589977264404, "learning_rate": 4.712901542738908e-05, "loss": 0.8759, "step": 1270 }, { "epoch": 0.8295380611581002, "grad_norm": 2.416761875152588, "learning_rate": 4.7104951818674755e-05, "loss": 0.8854, "step": 1275 }, { "epoch": 0.8327911515940143, "grad_norm": 0.6815042495727539, "learning_rate": 4.7080793980183165e-05, "loss": 0.8801, "step": 1280 }, { "epoch": 0.8360442420299284, "grad_norm": 0.763461709022522, "learning_rate": 4.7056542014895204e-05, "loss": 0.8805, "step": 1285 }, { "epoch": 0.8392973324658426, "grad_norm": 0.6846510767936707, "learning_rate": 4.703219602619302e-05, "loss": 0.8847, "step": 1290 }, { "epoch": 0.8425504229017566, "grad_norm": 0.7311460375785828, "learning_rate": 4.7007756117859566e-05, "loss": 0.8802, "step": 1295 }, { "epoch": 0.8458035133376708, "grad_norm": 0.6256663203239441, "learning_rate": 4.698322239407814e-05, "loss": 0.872, "step": 1300 }, { "epoch": 0.8490566037735849, "grad_norm": 0.7773202061653137, 
"learning_rate": 4.695859495943199e-05, "loss": 0.8902, "step": 1305 }, { "epoch": 0.852309694209499, "grad_norm": 0.6700133085250854, "learning_rate": 4.6933873918903816e-05, "loss": 0.8713, "step": 1310 }, { "epoch": 0.8555627846454131, "grad_norm": 1.7503775358200073, "learning_rate": 4.690905937787536e-05, "loss": 0.8763, "step": 1315 }, { "epoch": 0.8588158750813273, "grad_norm": 0.7265576720237732, "learning_rate": 4.688415144212692e-05, "loss": 0.8808, "step": 1320 }, { "epoch": 0.8620689655172413, "grad_norm": 0.9674301147460938, "learning_rate": 4.685915021783694e-05, "loss": 0.8953, "step": 1325 }, { "epoch": 0.8653220559531555, "grad_norm": 0.7183430194854736, "learning_rate": 4.683405581158153e-05, "loss": 0.8679, "step": 1330 }, { "epoch": 0.8685751463890696, "grad_norm": 0.6251899003982544, "learning_rate": 4.6808868330334024e-05, "loss": 0.9087, "step": 1335 }, { "epoch": 0.8718282368249838, "grad_norm": 0.6841866970062256, "learning_rate": 4.67835878814645e-05, "loss": 0.8935, "step": 1340 }, { "epoch": 0.8750813272608978, "grad_norm": 0.7957316637039185, "learning_rate": 4.675821457273938e-05, "loss": 0.8677, "step": 1345 }, { "epoch": 0.878334417696812, "grad_norm": 0.6032708287239075, "learning_rate": 4.67327485123209e-05, "loss": 0.8975, "step": 1350 }, { "epoch": 0.8815875081327261, "grad_norm": 0.7431797981262207, "learning_rate": 4.6707189808766684e-05, "loss": 0.8919, "step": 1355 }, { "epoch": 0.8848405985686402, "grad_norm": 0.6148476004600525, "learning_rate": 4.6681538571029295e-05, "loss": 0.8548, "step": 1360 }, { "epoch": 0.8880936890045543, "grad_norm": 0.6982467770576477, "learning_rate": 4.665579490845574e-05, "loss": 0.8858, "step": 1365 }, { "epoch": 0.8913467794404685, "grad_norm": 0.6519801616668701, "learning_rate": 4.662995893078702e-05, "loss": 0.8957, "step": 1370 }, { "epoch": 0.8945998698763825, "grad_norm": 0.6641463041305542, "learning_rate": 4.660403074815767e-05, "loss": 0.86, "step": 1375 }, { "epoch": 0.8978529603122967, "grad_norm": 0.7551462650299072, "learning_rate": 4.657801047109527e-05, "loss": 0.8709, "step": 1380 }, { "epoch": 0.9011060507482108, "grad_norm": 0.7857052087783813, "learning_rate": 4.655189821051998e-05, "loss": 0.8539, "step": 1385 }, { "epoch": 0.904359141184125, "grad_norm": 1.1861300468444824, "learning_rate": 4.6525694077744076e-05, "loss": 0.8855, "step": 1390 }, { "epoch": 0.907612231620039, "grad_norm": 0.8817470073699951, "learning_rate": 4.6499398184471476e-05, "loss": 0.8734, "step": 1395 }, { "epoch": 0.9108653220559532, "grad_norm": 0.6167863011360168, "learning_rate": 4.647301064279725e-05, "loss": 0.8765, "step": 1400 }, { "epoch": 0.9141184124918673, "grad_norm": 0.8413434624671936, "learning_rate": 4.644653156520715e-05, "loss": 0.8889, "step": 1405 }, { "epoch": 0.9173715029277814, "grad_norm": 0.5738908052444458, "learning_rate": 4.6419961064577134e-05, "loss": 0.8479, "step": 1410 }, { "epoch": 0.9206245933636955, "grad_norm": 0.9078507423400879, "learning_rate": 4.6393299254172875e-05, "loss": 0.881, "step": 1415 }, { "epoch": 0.9238776837996097, "grad_norm": 1.190238356590271, "learning_rate": 4.63665462476493e-05, "loss": 0.8692, "step": 1420 }, { "epoch": 0.9271307742355237, "grad_norm": 0.5501294136047363, "learning_rate": 4.633970215905007e-05, "loss": 0.8792, "step": 1425 }, { "epoch": 0.9303838646714379, "grad_norm": 0.6713528633117676, "learning_rate": 4.631276710280713e-05, "loss": 0.861, "step": 1430 }, { "epoch": 0.933636955107352, "grad_norm": 0.600857675075531, "learning_rate": 
4.6285741193740194e-05, "loss": 0.8657, "step": 1435 }, { "epoch": 0.936890045543266, "grad_norm": 1.3047159910202026, "learning_rate": 4.625862454705629e-05, "loss": 0.8716, "step": 1440 }, { "epoch": 0.9401431359791802, "grad_norm": 0.7485547065734863, "learning_rate": 4.623141727834919e-05, "loss": 0.8742, "step": 1445 }, { "epoch": 0.9433962264150944, "grad_norm": 0.7072353959083557, "learning_rate": 4.620411950359903e-05, "loss": 0.8659, "step": 1450 }, { "epoch": 0.9466493168510085, "grad_norm": 0.5867493748664856, "learning_rate": 4.617673133917175e-05, "loss": 0.8864, "step": 1455 }, { "epoch": 0.9499024072869225, "grad_norm": 0.6515786647796631, "learning_rate": 4.614925290181858e-05, "loss": 0.8841, "step": 1460 }, { "epoch": 0.9531554977228367, "grad_norm": 0.7220116853713989, "learning_rate": 4.612168430867559e-05, "loss": 0.88, "step": 1465 }, { "epoch": 0.9564085881587508, "grad_norm": 0.5353178381919861, "learning_rate": 4.6094025677263155e-05, "loss": 0.8578, "step": 1470 }, { "epoch": 0.9596616785946649, "grad_norm": 1.0675499439239502, "learning_rate": 4.606627712548548e-05, "loss": 0.8705, "step": 1475 }, { "epoch": 0.962914769030579, "grad_norm": 0.6946088671684265, "learning_rate": 4.6038438771630074e-05, "loss": 0.8707, "step": 1480 }, { "epoch": 0.9661678594664932, "grad_norm": 0.6132957339286804, "learning_rate": 4.601051073436728e-05, "loss": 0.872, "step": 1485 }, { "epoch": 0.9694209499024072, "grad_norm": 2.741361379623413, "learning_rate": 4.5982493132749724e-05, "loss": 0.8711, "step": 1490 }, { "epoch": 0.9726740403383214, "grad_norm": 0.6481953859329224, "learning_rate": 4.595438608621183e-05, "loss": 0.8804, "step": 1495 }, { "epoch": 0.9759271307742355, "grad_norm": 0.8871548771858215, "learning_rate": 4.592618971456933e-05, "loss": 0.863, "step": 1500 }, { "epoch": 0.9791802212101497, "grad_norm": 1.2673571109771729, "learning_rate": 4.5897904138018724e-05, "loss": 0.8781, "step": 1505 }, { "epoch": 0.9824333116460637, "grad_norm": 0.5219647288322449, "learning_rate": 4.586952947713677e-05, "loss": 0.8738, "step": 1510 }, { "epoch": 0.9856864020819779, "grad_norm": 0.7620292901992798, "learning_rate": 4.584106585287998e-05, "loss": 0.8602, "step": 1515 }, { "epoch": 0.988939492517892, "grad_norm": 5.0167717933654785, "learning_rate": 4.581251338658412e-05, "loss": 0.879, "step": 1520 }, { "epoch": 0.9921925829538061, "grad_norm": 0.6157656311988831, "learning_rate": 4.578387219996366e-05, "loss": 0.8645, "step": 1525 }, { "epoch": 0.9954456733897202, "grad_norm": 0.6330501437187195, "learning_rate": 4.5755142415111264e-05, "loss": 0.8549, "step": 1530 }, { "epoch": 0.9986987638256344, "grad_norm": 0.7185651063919067, "learning_rate": 4.572632415449729e-05, "loss": 0.8799, "step": 1535 }, { "epoch": 1.0, "eval_f1": 0.8050111210499576, "eval_loss": 0.432861328125, "eval_precision": 0.8078559249569004, "eval_recall": 0.8036506251146921, "eval_runtime": 475.6379, "eval_samples_per_second": 827.173, "eval_steps_per_second": 0.809, "step": 1537 }, { "epoch": 1.0019518542615484, "grad_norm": 1.2476260662078857, "learning_rate": 4.5697417540969234e-05, "loss": 0.8628, "step": 1540 }, { "epoch": 1.0052049446974627, "grad_norm": 1.6661646366119385, "learning_rate": 4.566842269775126e-05, "loss": 0.8106, "step": 1545 }, { "epoch": 1.0084580351333767, "grad_norm": 7.054599285125732, "learning_rate": 4.563933974844361e-05, "loss": 0.7696, "step": 1550 }, { "epoch": 1.0117111255692908, "grad_norm": 0.7829424142837524, "learning_rate": 4.561016881702212e-05, "loss": 
0.8057, "step": 1555 }, { "epoch": 1.014964216005205, "grad_norm": 0.5904113054275513, "learning_rate": 4.5580910027837673e-05, "loss": 0.8178, "step": 1560 }, { "epoch": 1.018217306441119, "grad_norm": 1.8633893728256226, "learning_rate": 4.555156350561569e-05, "loss": 0.8021, "step": 1565 }, { "epoch": 1.0214703968770331, "grad_norm": 0.935964047908783, "learning_rate": 4.5522129375455555e-05, "loss": 0.7791, "step": 1570 }, { "epoch": 1.0247234873129474, "grad_norm": 1.3689883947372437, "learning_rate": 4.5492607762830145e-05, "loss": 0.814, "step": 1575 }, { "epoch": 1.0279765777488614, "grad_norm": 0.8765047788619995, "learning_rate": 4.546299879358523e-05, "loss": 0.8149, "step": 1580 }, { "epoch": 1.0312296681847755, "grad_norm": 1.1618647575378418, "learning_rate": 4.5433302593939e-05, "loss": 0.7935, "step": 1585 }, { "epoch": 1.0344827586206897, "grad_norm": 0.7140945196151733, "learning_rate": 4.540351929048146e-05, "loss": 0.7859, "step": 1590 }, { "epoch": 1.0377358490566038, "grad_norm": 0.9278448820114136, "learning_rate": 4.537364901017393e-05, "loss": 0.8269, "step": 1595 }, { "epoch": 1.0409889394925178, "grad_norm": 0.7428409457206726, "learning_rate": 4.534369188034853e-05, "loss": 0.806, "step": 1600 }, { "epoch": 1.044242029928432, "grad_norm": 0.7308477759361267, "learning_rate": 4.5313648028707557e-05, "loss": 0.7991, "step": 1605 }, { "epoch": 1.047495120364346, "grad_norm": 0.7885825037956238, "learning_rate": 4.528351758332303e-05, "loss": 0.7896, "step": 1610 }, { "epoch": 1.0507482108002602, "grad_norm": 0.8900930285453796, "learning_rate": 4.525330067263608e-05, "loss": 0.791, "step": 1615 }, { "epoch": 1.0540013012361744, "grad_norm": 0.7243936061859131, "learning_rate": 4.5222997425456446e-05, "loss": 0.8118, "step": 1620 }, { "epoch": 1.0572543916720885, "grad_norm": 0.7627750039100647, "learning_rate": 4.519260797096187e-05, "loss": 0.7967, "step": 1625 }, { "epoch": 1.0605074821080025, "grad_norm": 0.7201557755470276, "learning_rate": 4.5162132438697615e-05, "loss": 0.8087, "step": 1630 }, { "epoch": 1.0637605725439168, "grad_norm": 1.3476982116699219, "learning_rate": 4.513157095857586e-05, "loss": 0.8152, "step": 1635 }, { "epoch": 1.0670136629798308, "grad_norm": 2.0568456649780273, "learning_rate": 4.510092366087518e-05, "loss": 0.7879, "step": 1640 }, { "epoch": 1.070266753415745, "grad_norm": 0.805178701877594, "learning_rate": 4.507019067623997e-05, "loss": 0.8083, "step": 1645 }, { "epoch": 1.073519843851659, "grad_norm": 0.8525136709213257, "learning_rate": 4.5039372135679883e-05, "loss": 0.8044, "step": 1650 }, { "epoch": 1.0767729342875731, "grad_norm": 0.7201101183891296, "learning_rate": 4.5008468170569295e-05, "loss": 0.798, "step": 1655 }, { "epoch": 1.0800260247234874, "grad_norm": 0.8228124976158142, "learning_rate": 4.497747891264675e-05, "loss": 0.7921, "step": 1660 }, { "epoch": 1.0832791151594015, "grad_norm": 0.8848757147789001, "learning_rate": 4.494640449401434e-05, "loss": 0.789, "step": 1665 }, { "epoch": 1.0865322055953155, "grad_norm": 0.7168120741844177, "learning_rate": 4.491524504713722e-05, "loss": 0.8081, "step": 1670 }, { "epoch": 1.0897852960312298, "grad_norm": 0.7164594531059265, "learning_rate": 4.4884000704842976e-05, "loss": 0.8004, "step": 1675 }, { "epoch": 1.0930383864671438, "grad_norm": 0.7822607159614563, "learning_rate": 4.485267160032112e-05, "loss": 0.811, "step": 1680 }, { "epoch": 1.0962914769030578, "grad_norm": 1.0780830383300781, "learning_rate": 4.4821257867122475e-05, "loss": 0.8068, "step": 1685 }, 
{ "epoch": 1.099544567338972, "grad_norm": 0.9002332091331482, "learning_rate": 4.478975963915861e-05, "loss": 0.7883, "step": 1690 }, { "epoch": 1.1027976577748861, "grad_norm": 0.8772884011268616, "learning_rate": 4.475817705070132e-05, "loss": 0.8103, "step": 1695 }, { "epoch": 1.1060507482108002, "grad_norm": 0.7703087329864502, "learning_rate": 4.472651023638196e-05, "loss": 0.7852, "step": 1700 }, { "epoch": 1.1093038386467144, "grad_norm": 0.6608516573905945, "learning_rate": 4.469475933119098e-05, "loss": 0.8177, "step": 1705 }, { "epoch": 1.1125569290826285, "grad_norm": 0.6942402720451355, "learning_rate": 4.4662924470477255e-05, "loss": 0.7958, "step": 1710 }, { "epoch": 1.1158100195185425, "grad_norm": 1.1164368391036987, "learning_rate": 4.4631005789947576e-05, "loss": 0.8265, "step": 1715 }, { "epoch": 1.1190631099544568, "grad_norm": 0.9671571850776672, "learning_rate": 4.4599003425666026e-05, "loss": 0.828, "step": 1720 }, { "epoch": 1.1223162003903708, "grad_norm": 0.7158306837081909, "learning_rate": 4.456691751405343e-05, "loss": 0.8068, "step": 1725 }, { "epoch": 1.1255692908262849, "grad_norm": 0.7458908557891846, "learning_rate": 4.453474819188675e-05, "loss": 0.8044, "step": 1730 }, { "epoch": 1.1288223812621991, "grad_norm": 0.832358181476593, "learning_rate": 4.450249559629853e-05, "loss": 0.8041, "step": 1735 }, { "epoch": 1.1320754716981132, "grad_norm": 1.0899025201797485, "learning_rate": 4.447015986477628e-05, "loss": 0.8171, "step": 1740 }, { "epoch": 1.1353285621340272, "grad_norm": 0.9264464378356934, "learning_rate": 4.443774113516192e-05, "loss": 0.7795, "step": 1745 }, { "epoch": 1.1385816525699415, "grad_norm": 1.837517261505127, "learning_rate": 4.440523954565114e-05, "loss": 0.8011, "step": 1750 }, { "epoch": 1.1418347430058555, "grad_norm": 0.7183268070220947, "learning_rate": 4.437265523479291e-05, "loss": 0.8071, "step": 1755 }, { "epoch": 1.1450878334417696, "grad_norm": 1.3461638689041138, "learning_rate": 4.433998834148877e-05, "loss": 0.8142, "step": 1760 }, { "epoch": 1.1483409238776838, "grad_norm": 1.060806155204773, "learning_rate": 4.430723900499232e-05, "loss": 0.7857, "step": 1765 }, { "epoch": 1.1515940143135979, "grad_norm": 0.9301265478134155, "learning_rate": 4.427440736490861e-05, "loss": 0.8009, "step": 1770 }, { "epoch": 1.1548471047495121, "grad_norm": 1.4558310508728027, "learning_rate": 4.4241493561193515e-05, "loss": 0.8203, "step": 1775 }, { "epoch": 1.1581001951854262, "grad_norm": 0.7694815993309021, "learning_rate": 4.4208497734153177e-05, "loss": 0.7825, "step": 1780 }, { "epoch": 1.1613532856213402, "grad_norm": 0.7246577739715576, "learning_rate": 4.417542002444339e-05, "loss": 0.8157, "step": 1785 }, { "epoch": 1.1646063760572545, "grad_norm": 1.0033169984817505, "learning_rate": 4.4142260573068993e-05, "loss": 0.8013, "step": 1790 }, { "epoch": 1.1678594664931685, "grad_norm": 1.7446528673171997, "learning_rate": 4.410901952138326e-05, "loss": 0.8004, "step": 1795 }, { "epoch": 1.1711125569290826, "grad_norm": 1.0588308572769165, "learning_rate": 4.407569701108737e-05, "loss": 0.8055, "step": 1800 }, { "epoch": 1.1743656473649968, "grad_norm": 0.7162896990776062, "learning_rate": 4.404229318422968e-05, "loss": 0.8091, "step": 1805 }, { "epoch": 1.1776187378009109, "grad_norm": 1.3299797773361206, "learning_rate": 4.400880818320521e-05, "loss": 0.8068, "step": 1810 }, { "epoch": 1.180871828236825, "grad_norm": 0.7723076939582825, "learning_rate": 4.397524215075504e-05, "loss": 0.8065, "step": 1815 }, { "epoch": 
1.1841249186727392, "grad_norm": 0.700183629989624, "learning_rate": 4.3941595229965636e-05, "loss": 0.8006, "step": 1820 }, { "epoch": 1.1873780091086532, "grad_norm": 0.7281947135925293, "learning_rate": 4.390786756426829e-05, "loss": 0.8026, "step": 1825 }, { "epoch": 1.1906310995445673, "grad_norm": 0.8081494569778442, "learning_rate": 4.3874059297438515e-05, "loss": 0.7887, "step": 1830 }, { "epoch": 1.1938841899804815, "grad_norm": 0.749001681804657, "learning_rate": 4.384017057359538e-05, "loss": 0.8007, "step": 1835 }, { "epoch": 1.1971372804163956, "grad_norm": 0.8392378091812134, "learning_rate": 4.380620153720095e-05, "loss": 0.8228, "step": 1840 }, { "epoch": 1.2003903708523098, "grad_norm": 0.9195884466171265, "learning_rate": 4.377215233305966e-05, "loss": 0.8009, "step": 1845 }, { "epoch": 1.2036434612882239, "grad_norm": 0.9343377947807312, "learning_rate": 4.373802310631765e-05, "loss": 0.7785, "step": 1850 }, { "epoch": 1.206896551724138, "grad_norm": 0.9939282536506653, "learning_rate": 4.370381400246221e-05, "loss": 0.8228, "step": 1855 }, { "epoch": 1.2101496421600522, "grad_norm": 4.1950907707214355, "learning_rate": 4.366952516732114e-05, "loss": 0.8051, "step": 1860 }, { "epoch": 1.2134027325959662, "grad_norm": 1.1269813776016235, "learning_rate": 4.3635156747062105e-05, "loss": 0.8059, "step": 1865 }, { "epoch": 1.2166558230318802, "grad_norm": 1.8155053853988647, "learning_rate": 4.360070888819203e-05, "loss": 0.8157, "step": 1870 }, { "epoch": 1.2199089134677945, "grad_norm": 0.7301884889602661, "learning_rate": 4.356618173755648e-05, "loss": 0.786, "step": 1875 }, { "epoch": 1.2231620039037086, "grad_norm": 0.6721681952476501, "learning_rate": 4.353157544233902e-05, "loss": 0.818, "step": 1880 }, { "epoch": 1.2264150943396226, "grad_norm": 0.9088836312294006, "learning_rate": 4.349689015006061e-05, "loss": 0.7883, "step": 1885 }, { "epoch": 1.2296681847755369, "grad_norm": 0.7416070103645325, "learning_rate": 4.3462126008578936e-05, "loss": 0.8033, "step": 1890 }, { "epoch": 1.232921275211451, "grad_norm": 0.6465050578117371, "learning_rate": 4.342728316608783e-05, "loss": 0.8111, "step": 1895 }, { "epoch": 1.236174365647365, "grad_norm": 0.870087206363678, "learning_rate": 4.3392361771116604e-05, "loss": 0.8307, "step": 1900 }, { "epoch": 1.2394274560832792, "grad_norm": 0.7588717937469482, "learning_rate": 4.335736197252942e-05, "loss": 0.8024, "step": 1905 }, { "epoch": 1.2426805465191932, "grad_norm": 24.7558650970459, "learning_rate": 4.332228391952469e-05, "loss": 0.8089, "step": 1910 }, { "epoch": 1.2459336369551073, "grad_norm": 0.7845138907432556, "learning_rate": 4.328712776163436e-05, "loss": 0.8092, "step": 1915 }, { "epoch": 1.2491867273910215, "grad_norm": 0.5970214605331421, "learning_rate": 4.325189364872337e-05, "loss": 0.768, "step": 1920 }, { "epoch": 1.2524398178269356, "grad_norm": 0.6692010164260864, "learning_rate": 4.321658173098895e-05, "loss": 0.7974, "step": 1925 }, { "epoch": 1.2556929082628496, "grad_norm": 0.6391417384147644, "learning_rate": 4.318119215896001e-05, "loss": 0.8151, "step": 1930 }, { "epoch": 1.258945998698764, "grad_norm": 1.4557222127914429, "learning_rate": 4.314572508349646e-05, "loss": 0.817, "step": 1935 }, { "epoch": 1.262199089134678, "grad_norm": 0.6106812357902527, "learning_rate": 4.311018065578864e-05, "loss": 0.8154, "step": 1940 }, { "epoch": 1.265452179570592, "grad_norm": 0.7062475681304932, "learning_rate": 4.307455902735659e-05, "loss": 0.8106, "step": 1945 }, { "epoch": 1.2687052700065062, 
"grad_norm": 0.6937923431396484, "learning_rate": 4.303886035004947e-05, "loss": 0.8193, "step": 1950 }, { "epoch": 1.2719583604424203, "grad_norm": 0.5820494294166565, "learning_rate": 4.3003084776044855e-05, "loss": 0.8166, "step": 1955 }, { "epoch": 1.2752114508783343, "grad_norm": 0.9809524416923523, "learning_rate": 4.2967232457848154e-05, "loss": 0.7983, "step": 1960 }, { "epoch": 1.2784645413142486, "grad_norm": 0.6147664785385132, "learning_rate": 4.293130354829191e-05, "loss": 0.8195, "step": 1965 }, { "epoch": 1.2817176317501626, "grad_norm": 0.632622241973877, "learning_rate": 4.289529820053515e-05, "loss": 0.7964, "step": 1970 }, { "epoch": 1.2849707221860767, "grad_norm": 0.5672839283943176, "learning_rate": 4.285921656806276e-05, "loss": 0.7854, "step": 1975 }, { "epoch": 1.288223812621991, "grad_norm": 0.7861791849136353, "learning_rate": 4.2823058804684815e-05, "loss": 0.7772, "step": 1980 }, { "epoch": 1.291476903057905, "grad_norm": 0.627170741558075, "learning_rate": 4.2786825064535905e-05, "loss": 0.8033, "step": 1985 }, { "epoch": 1.294729993493819, "grad_norm": 0.7018983960151672, "learning_rate": 4.275051550207453e-05, "loss": 0.8067, "step": 1990 }, { "epoch": 1.2979830839297333, "grad_norm": 0.6708381772041321, "learning_rate": 4.2714130272082365e-05, "loss": 0.8019, "step": 1995 }, { "epoch": 1.3012361743656473, "grad_norm": 1.4676740169525146, "learning_rate": 4.267766952966369e-05, "loss": 0.8366, "step": 2000 }, { "epoch": 1.3044892648015614, "grad_norm": 0.6410109996795654, "learning_rate": 4.2641133430244644e-05, "loss": 0.7767, "step": 2005 }, { "epoch": 1.3077423552374756, "grad_norm": 0.6521077752113342, "learning_rate": 4.2604522129572624e-05, "loss": 0.8134, "step": 2010 }, { "epoch": 1.3109954456733897, "grad_norm": 0.605689525604248, "learning_rate": 4.256783578371557e-05, "loss": 0.7803, "step": 2015 }, { "epoch": 1.3142485361093037, "grad_norm": 1.1350992918014526, "learning_rate": 4.253107454906137e-05, "loss": 0.787, "step": 2020 }, { "epoch": 1.317501626545218, "grad_norm": 0.7411300539970398, "learning_rate": 4.2494238582317114e-05, "loss": 0.8047, "step": 2025 }, { "epoch": 1.320754716981132, "grad_norm": 0.8492873311042786, "learning_rate": 4.2457328040508484e-05, "loss": 0.788, "step": 2030 }, { "epoch": 1.3240078074170463, "grad_norm": 0.7676826119422913, "learning_rate": 4.2420343080979035e-05, "loss": 0.808, "step": 2035 }, { "epoch": 1.3272608978529603, "grad_norm": 0.6066752076148987, "learning_rate": 4.238328386138959e-05, "loss": 0.7894, "step": 2040 }, { "epoch": 1.3305139882888743, "grad_norm": 0.9500682353973389, "learning_rate": 4.234615053971751e-05, "loss": 0.7933, "step": 2045 }, { "epoch": 1.3337670787247886, "grad_norm": 0.6935995221138, "learning_rate": 4.230894327425604e-05, "loss": 0.7949, "step": 2050 }, { "epoch": 1.3370201691607027, "grad_norm": 0.701280415058136, "learning_rate": 4.227166222361364e-05, "loss": 0.7879, "step": 2055 }, { "epoch": 1.340273259596617, "grad_norm": 0.6840693354606628, "learning_rate": 4.2234307546713305e-05, "loss": 0.8095, "step": 2060 }, { "epoch": 1.343526350032531, "grad_norm": 0.6785464286804199, "learning_rate": 4.219687940279188e-05, "loss": 0.7931, "step": 2065 }, { "epoch": 1.346779440468445, "grad_norm": 1.4129618406295776, "learning_rate": 4.2159377951399385e-05, "loss": 0.8222, "step": 2070 }, { "epoch": 1.3500325309043593, "grad_norm": 0.9987890124320984, "learning_rate": 4.212180335239836e-05, "loss": 0.8086, "step": 2075 }, { "epoch": 1.3532856213402733, "grad_norm": 
0.7902525067329407, "learning_rate": 4.208415576596315e-05, "loss": 0.808, "step": 2080 }, { "epoch": 1.3565387117761873, "grad_norm": 1.0341746807098389, "learning_rate": 4.2046435352579206e-05, "loss": 0.8218, "step": 2085 }, { "epoch": 1.3597918022121016, "grad_norm": 0.6914475560188293, "learning_rate": 4.200864227304247e-05, "loss": 0.8033, "step": 2090 }, { "epoch": 1.3630448926480156, "grad_norm": 0.8766419291496277, "learning_rate": 4.1970776688458624e-05, "loss": 0.7885, "step": 2095 }, { "epoch": 1.3662979830839297, "grad_norm": 0.7936528325080872, "learning_rate": 4.1932838760242445e-05, "loss": 0.8163, "step": 2100 }, { "epoch": 1.369551073519844, "grad_norm": 0.6358763575553894, "learning_rate": 4.189482865011706e-05, "loss": 0.8042, "step": 2105 }, { "epoch": 1.372804163955758, "grad_norm": 0.6236315965652466, "learning_rate": 4.1856746520113345e-05, "loss": 0.7926, "step": 2110 }, { "epoch": 1.376057254391672, "grad_norm": 0.6494086980819702, "learning_rate": 4.181859253256916e-05, "loss": 0.8001, "step": 2115 }, { "epoch": 1.3793103448275863, "grad_norm": 0.5840818881988525, "learning_rate": 4.178036685012868e-05, "loss": 0.7936, "step": 2120 }, { "epoch": 1.3825634352635003, "grad_norm": 0.6631473898887634, "learning_rate": 4.174206963574171e-05, "loss": 0.8022, "step": 2125 }, { "epoch": 1.3858165256994144, "grad_norm": 0.6331456303596497, "learning_rate": 4.1703701052662974e-05, "loss": 0.8023, "step": 2130 }, { "epoch": 1.3890696161353286, "grad_norm": 0.9553548693656921, "learning_rate": 4.166526126445145e-05, "loss": 0.7927, "step": 2135 }, { "epoch": 1.3923227065712427, "grad_norm": 0.5648055076599121, "learning_rate": 4.162675043496963e-05, "loss": 0.8094, "step": 2140 }, { "epoch": 1.3955757970071567, "grad_norm": 0.7545254826545715, "learning_rate": 4.158816872838285e-05, "loss": 0.7898, "step": 2145 }, { "epoch": 1.398828887443071, "grad_norm": 0.6327040791511536, "learning_rate": 4.1549516309158586e-05, "loss": 0.7933, "step": 2150 }, { "epoch": 1.402081977878985, "grad_norm": 0.897237241268158, "learning_rate": 4.151079334206577e-05, "loss": 0.8026, "step": 2155 }, { "epoch": 1.405335068314899, "grad_norm": 0.6637735962867737, "learning_rate": 4.147199999217402e-05, "loss": 0.8089, "step": 2160 }, { "epoch": 1.4085881587508133, "grad_norm": 0.7697905898094177, "learning_rate": 4.143313642485302e-05, "loss": 0.7986, "step": 2165 }, { "epoch": 1.4118412491867274, "grad_norm": 0.6807245016098022, "learning_rate": 4.139420280577177e-05, "loss": 0.8095, "step": 2170 }, { "epoch": 1.4150943396226414, "grad_norm": 0.6346644163131714, "learning_rate": 4.1355199300897894e-05, "loss": 0.7874, "step": 2175 }, { "epoch": 1.4183474300585557, "grad_norm": 1.3532865047454834, "learning_rate": 4.1316126076496935e-05, "loss": 0.8077, "step": 2180 }, { "epoch": 1.4216005204944697, "grad_norm": 0.7018231749534607, "learning_rate": 4.127698329913161e-05, "loss": 0.7826, "step": 2185 }, { "epoch": 1.4248536109303838, "grad_norm": 0.6970395445823669, "learning_rate": 4.1237771135661164e-05, "loss": 0.7959, "step": 2190 }, { "epoch": 1.428106701366298, "grad_norm": 0.802807092666626, "learning_rate": 4.119848975324059e-05, "loss": 0.8093, "step": 2195 }, { "epoch": 1.431359791802212, "grad_norm": 0.6809616684913635, "learning_rate": 4.115913931931997e-05, "loss": 0.7949, "step": 2200 }, { "epoch": 1.434612882238126, "grad_norm": 0.6927191615104675, "learning_rate": 4.1119720001643745e-05, "loss": 0.8072, "step": 2205 }, { "epoch": 1.4378659726740404, "grad_norm": 0.8336883187294006, 
"learning_rate": 4.108023196824998e-05, "loss": 0.8058, "step": 2210 }, { "epoch": 1.4411190631099544, "grad_norm": 0.9360595345497131, "learning_rate": 4.1040675387469685e-05, "loss": 0.796, "step": 2215 }, { "epoch": 1.4443721535458685, "grad_norm": 0.7317126989364624, "learning_rate": 4.1001050427926045e-05, "loss": 0.789, "step": 2220 }, { "epoch": 1.4476252439817827, "grad_norm": 0.7633559703826904, "learning_rate": 4.0961357258533774e-05, "loss": 0.7885, "step": 2225 }, { "epoch": 1.4508783344176968, "grad_norm": 0.6734256744384766, "learning_rate": 4.0921596048498315e-05, "loss": 0.7852, "step": 2230 }, { "epoch": 1.4541314248536108, "grad_norm": 1.5330891609191895, "learning_rate": 4.088176696731517e-05, "loss": 0.8067, "step": 2235 }, { "epoch": 1.457384515289525, "grad_norm": 1.4686920642852783, "learning_rate": 4.084187018476918e-05, "loss": 0.8, "step": 2240 }, { "epoch": 1.460637605725439, "grad_norm": 0.8620485067367554, "learning_rate": 4.0801905870933764e-05, "loss": 0.7865, "step": 2245 }, { "epoch": 1.4638906961613534, "grad_norm": 0.8238881826400757, "learning_rate": 4.076187419617024e-05, "loss": 0.8486, "step": 2250 }, { "epoch": 1.4671437865972674, "grad_norm": 0.5753189921379089, "learning_rate": 4.072177533112703e-05, "loss": 0.7975, "step": 2255 }, { "epoch": 1.4703968770331814, "grad_norm": 0.707665741443634, "learning_rate": 4.068160944673903e-05, "loss": 0.8067, "step": 2260 }, { "epoch": 1.4736499674690957, "grad_norm": 0.702339768409729, "learning_rate": 4.0641376714226795e-05, "loss": 0.7823, "step": 2265 }, { "epoch": 1.4769030579050098, "grad_norm": 1.500267505645752, "learning_rate": 4.060107730509587e-05, "loss": 0.8159, "step": 2270 }, { "epoch": 1.480156148340924, "grad_norm": 0.8150926232337952, "learning_rate": 4.0560711391135986e-05, "loss": 0.8227, "step": 2275 }, { "epoch": 1.483409238776838, "grad_norm": 0.8220822215080261, "learning_rate": 4.052027914442043e-05, "loss": 0.8067, "step": 2280 }, { "epoch": 1.486662329212752, "grad_norm": 0.6167823672294617, "learning_rate": 4.047978073730519e-05, "loss": 0.8158, "step": 2285 }, { "epoch": 1.4899154196486664, "grad_norm": 36.838592529296875, "learning_rate": 4.043921634242836e-05, "loss": 0.8178, "step": 2290 }, { "epoch": 1.4931685100845804, "grad_norm": 0.6391186714172363, "learning_rate": 4.039858613270927e-05, "loss": 0.7954, "step": 2295 }, { "epoch": 1.4964216005204944, "grad_norm": 0.7283660769462585, "learning_rate": 4.035789028134782e-05, "loss": 0.8045, "step": 2300 }, { "epoch": 1.4996746909564087, "grad_norm": 1.2124541997909546, "learning_rate": 4.031712896182376e-05, "loss": 0.7941, "step": 2305 }, { "epoch": 1.5029277813923227, "grad_norm": 0.6527173519134521, "learning_rate": 4.0276302347895864e-05, "loss": 0.808, "step": 2310 }, { "epoch": 1.5061808718282368, "grad_norm": 0.7583323121070862, "learning_rate": 4.023541061360131e-05, "loss": 0.8228, "step": 2315 }, { "epoch": 1.509433962264151, "grad_norm": 2.344557762145996, "learning_rate": 4.019445393325483e-05, "loss": 0.798, "step": 2320 }, { "epoch": 1.512687052700065, "grad_norm": 0.8352357149124146, "learning_rate": 4.0153432481448027e-05, "loss": 0.7912, "step": 2325 }, { "epoch": 1.5159401431359791, "grad_norm": 0.8066681623458862, "learning_rate": 4.01123464330486e-05, "loss": 0.8125, "step": 2330 }, { "epoch": 1.5191932335718934, "grad_norm": 0.7398434281349182, "learning_rate": 4.007119596319962e-05, "loss": 0.7997, "step": 2335 }, { "epoch": 1.5224463240078074, "grad_norm": 0.9586179852485657, "learning_rate": 
4.002998124731879e-05, "loss": 0.7994, "step": 2340 }, { "epoch": 1.5256994144437215, "grad_norm": 0.6949347853660583, "learning_rate": 3.998870246109767e-05, "loss": 0.8192, "step": 2345 }, { "epoch": 1.5289525048796357, "grad_norm": 1.5228863954544067, "learning_rate": 3.994735978050094e-05, "loss": 0.7902, "step": 2350 }, { "epoch": 1.5322055953155498, "grad_norm": 0.6021274328231812, "learning_rate": 3.990595338176564e-05, "loss": 0.7995, "step": 2355 }, { "epoch": 1.5354586857514638, "grad_norm": 0.6251325607299805, "learning_rate": 3.986448344140047e-05, "loss": 0.7943, "step": 2360 }, { "epoch": 1.538711776187378, "grad_norm": 0.6837208867073059, "learning_rate": 3.9822950136184946e-05, "loss": 0.8093, "step": 2365 }, { "epoch": 1.5419648666232921, "grad_norm": 1.162960171699524, "learning_rate": 3.978135364316874e-05, "loss": 0.8309, "step": 2370 }, { "epoch": 1.5452179570592062, "grad_norm": 0.6852900385856628, "learning_rate": 3.973969413967086e-05, "loss": 0.803, "step": 2375 }, { "epoch": 1.5484710474951204, "grad_norm": 0.8108793497085571, "learning_rate": 3.9697971803278924e-05, "loss": 0.8035, "step": 2380 }, { "epoch": 1.5517241379310345, "grad_norm": 0.7817934155464172, "learning_rate": 3.9656186811848395e-05, "loss": 0.7872, "step": 2385 }, { "epoch": 1.5549772283669485, "grad_norm": 0.6819909811019897, "learning_rate": 3.9614339343501836e-05, "loss": 0.7978, "step": 2390 }, { "epoch": 1.5582303188028628, "grad_norm": 1.547499179840088, "learning_rate": 3.9572429576628114e-05, "loss": 0.8042, "step": 2395 }, { "epoch": 1.5614834092387768, "grad_norm": 0.6589548587799072, "learning_rate": 3.9530457689881684e-05, "loss": 0.8112, "step": 2400 }, { "epoch": 1.5647364996746909, "grad_norm": 1.0360349416732788, "learning_rate": 3.94884238621818e-05, "loss": 0.7949, "step": 2405 }, { "epoch": 1.5679895901106051, "grad_norm": 0.8563544154167175, "learning_rate": 3.944632827271176e-05, "loss": 0.8193, "step": 2410 }, { "epoch": 1.5712426805465192, "grad_norm": 0.8043304681777954, "learning_rate": 3.940417110091816e-05, "loss": 0.7986, "step": 2415 }, { "epoch": 1.5744957709824332, "grad_norm": 0.7991480827331543, "learning_rate": 3.9361952526510085e-05, "loss": 0.791, "step": 2420 }, { "epoch": 1.5777488614183475, "grad_norm": 0.8276852369308472, "learning_rate": 3.9319672729458376e-05, "loss": 0.8065, "step": 2425 }, { "epoch": 1.5810019518542615, "grad_norm": 0.601204514503479, "learning_rate": 3.927733188999486e-05, "loss": 0.803, "step": 2430 }, { "epoch": 1.5842550422901756, "grad_norm": 0.77410888671875, "learning_rate": 3.92349301886116e-05, "loss": 0.8056, "step": 2435 }, { "epoch": 1.5875081327260898, "grad_norm": 1.0822545289993286, "learning_rate": 3.9192467806060044e-05, "loss": 0.7781, "step": 2440 }, { "epoch": 1.5907612231620039, "grad_norm": 0.602203905582428, "learning_rate": 3.914994492335038e-05, "loss": 0.7805, "step": 2445 }, { "epoch": 1.594014313597918, "grad_norm": 0.6420490741729736, "learning_rate": 3.910736172175066e-05, "loss": 0.8081, "step": 2450 }, { "epoch": 1.5972674040338322, "grad_norm": 1.321158766746521, "learning_rate": 3.9064718382786076e-05, "loss": 0.8251, "step": 2455 }, { "epoch": 1.6005204944697464, "grad_norm": 1.2666637897491455, "learning_rate": 3.9022015088238174e-05, "loss": 0.8017, "step": 2460 }, { "epoch": 1.6037735849056602, "grad_norm": 0.7312076687812805, "learning_rate": 3.897925202014409e-05, "loss": 0.8193, "step": 2465 }, { "epoch": 1.6070266753415745, "grad_norm": 0.6371551156044006, "learning_rate": 
3.8936429360795745e-05, "loss": 0.8028, "step": 2470 }, { "epoch": 1.6102797657774888, "grad_norm": 0.7501451373100281, "learning_rate": 3.88935472927391e-05, "loss": 0.809, "step": 2475 }, { "epoch": 1.6135328562134026, "grad_norm": 0.6462128162384033, "learning_rate": 3.885060599877337e-05, "loss": 0.7898, "step": 2480 }, { "epoch": 1.6167859466493169, "grad_norm": 0.6399087905883789, "learning_rate": 3.880760566195023e-05, "loss": 0.7848, "step": 2485 }, { "epoch": 1.6200390370852311, "grad_norm": 0.6313064694404602, "learning_rate": 3.876454646557305e-05, "loss": 0.7907, "step": 2490 }, { "epoch": 1.623292127521145, "grad_norm": 0.5535691976547241, "learning_rate": 3.872142859319612e-05, "loss": 0.8071, "step": 2495 }, { "epoch": 1.6265452179570592, "grad_norm": 0.7361935377120972, "learning_rate": 3.867825222862383e-05, "loss": 0.8034, "step": 2500 }, { "epoch": 1.6297983083929735, "grad_norm": 0.7017170190811157, "learning_rate": 3.863501755590994e-05, "loss": 0.8202, "step": 2505 }, { "epoch": 1.6330513988288873, "grad_norm": 2.15995717048645, "learning_rate": 3.8591724759356734e-05, "loss": 0.8064, "step": 2510 }, { "epoch": 1.6363044892648015, "grad_norm": 0.6328789591789246, "learning_rate": 3.854837402351431e-05, "loss": 0.7959, "step": 2515 }, { "epoch": 1.6395575797007158, "grad_norm": 0.7377115488052368, "learning_rate": 3.8504965533179724e-05, "loss": 0.7826, "step": 2520 }, { "epoch": 1.6428106701366298, "grad_norm": 0.7871087193489075, "learning_rate": 3.8461499473396246e-05, "loss": 0.7836, "step": 2525 }, { "epoch": 1.6460637605725439, "grad_norm": 0.7310390472412109, "learning_rate": 3.841797602945254e-05, "loss": 0.82, "step": 2530 }, { "epoch": 1.6493168510084582, "grad_norm": 0.6992024779319763, "learning_rate": 3.837439538688189e-05, "loss": 0.7865, "step": 2535 }, { "epoch": 1.6525699414443722, "grad_norm": 0.6299036741256714, "learning_rate": 3.833075773146142e-05, "loss": 0.8071, "step": 2540 }, { "epoch": 1.6558230318802862, "grad_norm": 0.7134023904800415, "learning_rate": 3.828706324921128e-05, "loss": 0.7931, "step": 2545 }, { "epoch": 1.6590761223162005, "grad_norm": 0.9033751487731934, "learning_rate": 3.824331212639388e-05, "loss": 0.8144, "step": 2550 }, { "epoch": 1.6623292127521145, "grad_norm": 0.6907141208648682, "learning_rate": 3.8199504549513055e-05, "loss": 0.7866, "step": 2555 }, { "epoch": 1.6655823031880286, "grad_norm": 0.9282029271125793, "learning_rate": 3.81556407053133e-05, "loss": 0.7935, "step": 2560 }, { "epoch": 1.6688353936239428, "grad_norm": 0.8306324481964111, "learning_rate": 3.811172078077899e-05, "loss": 0.7896, "step": 2565 }, { "epoch": 1.6720884840598569, "grad_norm": 0.8237498998641968, "learning_rate": 3.806774496313355e-05, "loss": 0.7894, "step": 2570 }, { "epoch": 1.675341574495771, "grad_norm": 1.046177625656128, "learning_rate": 3.802371343983865e-05, "loss": 0.7965, "step": 2575 }, { "epoch": 1.6785946649316852, "grad_norm": 0.8086977601051331, "learning_rate": 3.797962639859344e-05, "loss": 0.7962, "step": 2580 }, { "epoch": 1.6818477553675992, "grad_norm": 0.8723990321159363, "learning_rate": 3.7935484027333746e-05, "loss": 0.8, "step": 2585 }, { "epoch": 1.6851008458035133, "grad_norm": 0.6789833903312683, "learning_rate": 3.7891286514231225e-05, "loss": 0.801, "step": 2590 }, { "epoch": 1.6883539362394275, "grad_norm": 0.7277116179466248, "learning_rate": 3.784703404769263e-05, "loss": 0.7984, "step": 2595 }, { "epoch": 1.6916070266753416, "grad_norm": 1.03284752368927, "learning_rate": 3.780272681635894e-05, 
"loss": 0.8021, "step": 2600 }, { "epoch": 1.6948601171112556, "grad_norm": 0.9302812218666077, "learning_rate": 3.77583650091046e-05, "loss": 0.8046, "step": 2605 }, { "epoch": 1.6981132075471699, "grad_norm": 1.2493805885314941, "learning_rate": 3.771394881503673e-05, "loss": 0.805, "step": 2610 }, { "epoch": 1.701366297983084, "grad_norm": 1.7447010278701782, "learning_rate": 3.766947842349423e-05, "loss": 0.7968, "step": 2615 }, { "epoch": 1.704619388418998, "grad_norm": 0.5957579016685486, "learning_rate": 3.76249540240471e-05, "loss": 0.7958, "step": 2620 }, { "epoch": 1.7078724788549122, "grad_norm": 0.8952409625053406, "learning_rate": 3.7580375806495524e-05, "loss": 0.7974, "step": 2625 }, { "epoch": 1.7111255692908263, "grad_norm": 0.5696317553520203, "learning_rate": 3.753574396086913e-05, "loss": 0.8037, "step": 2630 }, { "epoch": 1.7143786597267403, "grad_norm": 1.1165753602981567, "learning_rate": 3.7491058677426135e-05, "loss": 0.8038, "step": 2635 }, { "epoch": 1.7176317501626546, "grad_norm": 0.7049902081489563, "learning_rate": 3.7446320146652556e-05, "loss": 0.7678, "step": 2640 }, { "epoch": 1.7208848405985686, "grad_norm": 0.6802716851234436, "learning_rate": 3.740152855926139e-05, "loss": 0.8163, "step": 2645 }, { "epoch": 1.7241379310344827, "grad_norm": 0.808645486831665, "learning_rate": 3.735668410619183e-05, "loss": 0.8045, "step": 2650 }, { "epoch": 1.727391021470397, "grad_norm": 0.6994810104370117, "learning_rate": 3.7311786978608415e-05, "loss": 0.794, "step": 2655 }, { "epoch": 1.730644111906311, "grad_norm": 0.9750849604606628, "learning_rate": 3.726683736790022e-05, "loss": 0.8192, "step": 2660 }, { "epoch": 1.733897202342225, "grad_norm": 0.8701574206352234, "learning_rate": 3.7221835465680024e-05, "loss": 0.8072, "step": 2665 }, { "epoch": 1.7371502927781393, "grad_norm": 0.7430034875869751, "learning_rate": 3.717678146378357e-05, "loss": 0.8147, "step": 2670 }, { "epoch": 1.7404033832140533, "grad_norm": 0.7101485729217529, "learning_rate": 3.7131675554268654e-05, "loss": 0.8174, "step": 2675 }, { "epoch": 1.7436564736499673, "grad_norm": 0.6719719767570496, "learning_rate": 3.7086517929414346e-05, "loss": 0.7912, "step": 2680 }, { "epoch": 1.7469095640858816, "grad_norm": 0.8749985098838806, "learning_rate": 3.70413087817202e-05, "loss": 0.7914, "step": 2685 }, { "epoch": 1.7501626545217959, "grad_norm": 0.6824137568473816, "learning_rate": 3.699604830390537e-05, "loss": 0.7974, "step": 2690 }, { "epoch": 1.7534157449577097, "grad_norm": 0.6715524196624756, "learning_rate": 3.695073668890785e-05, "loss": 0.8134, "step": 2695 }, { "epoch": 1.756668835393624, "grad_norm": 1.2192811965942383, "learning_rate": 3.690537412988359e-05, "loss": 0.8042, "step": 2700 }, { "epoch": 1.7599219258295382, "grad_norm": 0.5974992513656616, "learning_rate": 3.685996082020574e-05, "loss": 0.7871, "step": 2705 }, { "epoch": 1.763175016265452, "grad_norm": 0.7278191447257996, "learning_rate": 3.681449695346376e-05, "loss": 0.8104, "step": 2710 }, { "epoch": 1.7664281067013663, "grad_norm": 0.662208080291748, "learning_rate": 3.676898272346266e-05, "loss": 0.7831, "step": 2715 }, { "epoch": 1.7696811971372806, "grad_norm": 0.891823410987854, "learning_rate": 3.6723418324222126e-05, "loss": 0.8045, "step": 2720 }, { "epoch": 1.7729342875731944, "grad_norm": 0.6170246005058289, "learning_rate": 3.667780394997569e-05, "loss": 0.7881, "step": 2725 }, { "epoch": 1.7761873780091086, "grad_norm": 0.6385141015052795, "learning_rate": 3.663213979516994e-05, "loss": 0.7926, "step": 
2730 }, { "epoch": 1.779440468445023, "grad_norm": 0.827928900718689, "learning_rate": 3.658642605446367e-05, "loss": 0.7865, "step": 2735 }, { "epoch": 1.7826935588809367, "grad_norm": 0.8710260987281799, "learning_rate": 3.6540662922727034e-05, "loss": 0.8045, "step": 2740 }, { "epoch": 1.785946649316851, "grad_norm": 0.7606410980224609, "learning_rate": 3.6494850595040745e-05, "loss": 0.8041, "step": 2745 }, { "epoch": 1.7891997397527653, "grad_norm": 0.598753809928894, "learning_rate": 3.644898926669524e-05, "loss": 0.8023, "step": 2750 }, { "epoch": 1.7924528301886793, "grad_norm": 0.6432173848152161, "learning_rate": 3.640307913318982e-05, "loss": 0.8165, "step": 2755 }, { "epoch": 1.7957059206245933, "grad_norm": 0.6266681551933289, "learning_rate": 3.6357120390231825e-05, "loss": 0.8133, "step": 2760 }, { "epoch": 1.7989590110605076, "grad_norm": 1.0969923734664917, "learning_rate": 3.6311113233735836e-05, "loss": 0.8016, "step": 2765 }, { "epoch": 1.8022121014964216, "grad_norm": 1.2453726530075073, "learning_rate": 3.626505785982281e-05, "loss": 0.7391, "step": 2770 }, { "epoch": 1.8054651919323357, "grad_norm": 0.8763654828071594, "learning_rate": 3.6218954464819224e-05, "loss": 0.7857, "step": 2775 }, { "epoch": 1.80871828236825, "grad_norm": 0.7538381218910217, "learning_rate": 3.6172803245256284e-05, "loss": 0.7857, "step": 2780 }, { "epoch": 1.811971372804164, "grad_norm": 0.6540011763572693, "learning_rate": 3.612660439786904e-05, "loss": 0.7982, "step": 2785 }, { "epoch": 1.815224463240078, "grad_norm": 0.7887677550315857, "learning_rate": 3.608035811959561e-05, "loss": 0.8012, "step": 2790 }, { "epoch": 1.8184775536759923, "grad_norm": 0.6847140192985535, "learning_rate": 3.603406460757627e-05, "loss": 0.8011, "step": 2795 }, { "epoch": 1.8217306441119063, "grad_norm": 0.5890961289405823, "learning_rate": 3.598772405915264e-05, "loss": 0.7934, "step": 2800 }, { "epoch": 1.8249837345478204, "grad_norm": 0.5724996328353882, "learning_rate": 3.594133667186688e-05, "loss": 0.7992, "step": 2805 }, { "epoch": 1.8282368249837346, "grad_norm": 0.6108303070068359, "learning_rate": 3.58949026434608e-05, "loss": 0.7798, "step": 2810 }, { "epoch": 1.8314899154196487, "grad_norm": 1.2907392978668213, "learning_rate": 3.584842217187503e-05, "loss": 0.7767, "step": 2815 }, { "epoch": 1.8347430058555627, "grad_norm": 0.6735864877700806, "learning_rate": 3.580189545524818e-05, "loss": 0.7884, "step": 2820 }, { "epoch": 1.837996096291477, "grad_norm": 1.0391428470611572, "learning_rate": 3.575532269191599e-05, "loss": 0.7944, "step": 2825 }, { "epoch": 1.841249186727391, "grad_norm": 0.7969436645507812, "learning_rate": 3.57087040804105e-05, "loss": 0.7756, "step": 2830 }, { "epoch": 1.844502277163305, "grad_norm": 0.6109620332717896, "learning_rate": 3.566203981945921e-05, "loss": 0.802, "step": 2835 }, { "epoch": 1.8477553675992193, "grad_norm": 0.6684165000915527, "learning_rate": 3.561533010798418e-05, "loss": 0.7942, "step": 2840 }, { "epoch": 1.8510084580351334, "grad_norm": 0.6041696071624756, "learning_rate": 3.556857514510123e-05, "loss": 0.8121, "step": 2845 }, { "epoch": 1.8542615484710474, "grad_norm": 0.844451904296875, "learning_rate": 3.5521775130119095e-05, "loss": 0.7941, "step": 2850 }, { "epoch": 1.8575146389069617, "grad_norm": 0.8286609649658203, "learning_rate": 3.547493026253854e-05, "loss": 0.7955, "step": 2855 }, { "epoch": 1.8607677293428757, "grad_norm": 0.5692376494407654, "learning_rate": 3.542804074205155e-05, "loss": 0.7736, "step": 2860 }, { "epoch": 
1.8640208197787898, "grad_norm": 0.80513995885849, "learning_rate": 3.5381106768540426e-05, "loss": 0.819, "step": 2865 }, { "epoch": 1.867273910214704, "grad_norm": 0.748439371585846, "learning_rate": 3.5334128542077004e-05, "loss": 0.8042, "step": 2870 }, { "epoch": 1.870527000650618, "grad_norm": 0.5762147903442383, "learning_rate": 3.528710626292174e-05, "loss": 0.7935, "step": 2875 }, { "epoch": 1.873780091086532, "grad_norm": 0.8298909664154053, "learning_rate": 3.5240040131522876e-05, "loss": 0.796, "step": 2880 }, { "epoch": 1.8770331815224464, "grad_norm": 0.805218517780304, "learning_rate": 3.519293034851559e-05, "loss": 0.777, "step": 2885 }, { "epoch": 1.8802862719583604, "grad_norm": 0.617969810962677, "learning_rate": 3.514577711472117e-05, "loss": 0.7925, "step": 2890 }, { "epoch": 1.8835393623942744, "grad_norm": 0.6062343120574951, "learning_rate": 3.509858063114608e-05, "loss": 0.7853, "step": 2895 }, { "epoch": 1.8867924528301887, "grad_norm": 0.8101537823677063, "learning_rate": 3.505134109898118e-05, "loss": 0.7601, "step": 2900 }, { "epoch": 1.8900455432661027, "grad_norm": 0.8260220885276794, "learning_rate": 3.500405871960085e-05, "loss": 0.7946, "step": 2905 }, { "epoch": 1.8932986337020168, "grad_norm": 0.8925743699073792, "learning_rate": 3.495673369456207e-05, "loss": 0.799, "step": 2910 }, { "epoch": 1.896551724137931, "grad_norm": 0.5829617977142334, "learning_rate": 3.490936622560368e-05, "loss": 0.8058, "step": 2915 }, { "epoch": 1.8998048145738453, "grad_norm": 0.6162145137786865, "learning_rate": 3.4861956514645386e-05, "loss": 0.7715, "step": 2920 }, { "epoch": 1.9030579050097591, "grad_norm": 0.641473114490509, "learning_rate": 3.481450476378703e-05, "loss": 0.7854, "step": 2925 }, { "epoch": 1.9063109954456734, "grad_norm": 0.6128933429718018, "learning_rate": 3.4767011175307595e-05, "loss": 0.8211, "step": 2930 }, { "epoch": 1.9095640858815877, "grad_norm": 0.6579800248146057, "learning_rate": 3.4719475951664464e-05, "loss": 0.793, "step": 2935 }, { "epoch": 1.9128171763175015, "grad_norm": 0.7673735022544861, "learning_rate": 3.4671899295492485e-05, "loss": 0.7965, "step": 2940 }, { "epoch": 1.9160702667534157, "grad_norm": 0.6426681876182556, "learning_rate": 3.462428140960311e-05, "loss": 0.7817, "step": 2945 }, { "epoch": 1.91932335718933, "grad_norm": 1.0133545398712158, "learning_rate": 3.4576622496983575e-05, "loss": 0.806, "step": 2950 }, { "epoch": 1.9225764476252438, "grad_norm": 0.5959199666976929, "learning_rate": 3.452892276079599e-05, "loss": 0.7939, "step": 2955 }, { "epoch": 1.925829538061158, "grad_norm": 0.6069105267524719, "learning_rate": 3.4481182404376485e-05, "loss": 0.8011, "step": 2960 }, { "epoch": 1.9290826284970723, "grad_norm": 0.6653848886489868, "learning_rate": 3.443340163123437e-05, "loss": 0.7864, "step": 2965 }, { "epoch": 1.9323357189329864, "grad_norm": 0.8323044180870056, "learning_rate": 3.4385580645051216e-05, "loss": 0.8096, "step": 2970 }, { "epoch": 1.9355888093689004, "grad_norm": 0.7751704454421997, "learning_rate": 3.433771964968004e-05, "loss": 0.8101, "step": 2975 }, { "epoch": 1.9388418998048147, "grad_norm": 0.7757251262664795, "learning_rate": 3.4289818849144384e-05, "loss": 0.7871, "step": 2980 }, { "epoch": 1.9420949902407287, "grad_norm": 0.6133014559745789, "learning_rate": 3.424187844763751e-05, "loss": 0.7759, "step": 2985 }, { "epoch": 1.9453480806766428, "grad_norm": 0.6651121973991394, "learning_rate": 3.419389864952145e-05, "loss": 0.8079, "step": 2990 }, { "epoch": 1.948601171112557, 
"grad_norm": 0.6148653626441956, "learning_rate": 3.414587965932622e-05, "loss": 0.8075, "step": 2995 }, { "epoch": 1.951854261548471, "grad_norm": 0.7674732208251953, "learning_rate": 3.409782168174887e-05, "loss": 0.7836, "step": 3000 }, { "epoch": 1.9551073519843851, "grad_norm": 0.7626878619194031, "learning_rate": 3.404972492165267e-05, "loss": 0.787, "step": 3005 }, { "epoch": 1.9583604424202994, "grad_norm": 0.6008310914039612, "learning_rate": 3.40015895840662e-05, "loss": 0.8043, "step": 3010 }, { "epoch": 1.9616135328562134, "grad_norm": 0.6407262682914734, "learning_rate": 3.3953415874182495e-05, "loss": 0.7847, "step": 3015 }, { "epoch": 1.9648666232921275, "grad_norm": 0.8387401103973389, "learning_rate": 3.390520399735818e-05, "loss": 0.7885, "step": 3020 }, { "epoch": 1.9681197137280417, "grad_norm": 0.9059749245643616, "learning_rate": 3.385695415911253e-05, "loss": 0.7885, "step": 3025 }, { "epoch": 1.9713728041639558, "grad_norm": 0.7701427340507507, "learning_rate": 3.38086665651267e-05, "loss": 0.7951, "step": 3030 }, { "epoch": 1.9746258945998698, "grad_norm": 0.5787619352340698, "learning_rate": 3.376034142124277e-05, "loss": 0.7692, "step": 3035 }, { "epoch": 1.977878985035784, "grad_norm": 0.5660680532455444, "learning_rate": 3.371197893346288e-05, "loss": 0.7935, "step": 3040 }, { "epoch": 1.9811320754716981, "grad_norm": 0.6145839095115662, "learning_rate": 3.3663579307948365e-05, "loss": 0.774, "step": 3045 }, { "epoch": 1.9843851659076122, "grad_norm": 0.9014440178871155, "learning_rate": 3.3615142751018894e-05, "loss": 0.795, "step": 3050 }, { "epoch": 1.9876382563435264, "grad_norm": 1.5400235652923584, "learning_rate": 3.356666946915152e-05, "loss": 0.8015, "step": 3055 }, { "epoch": 1.9908913467794405, "grad_norm": 0.7585124373435974, "learning_rate": 3.35181596689799e-05, "loss": 0.7751, "step": 3060 }, { "epoch": 1.9941444372153545, "grad_norm": 0.675399124622345, "learning_rate": 3.3469613557293345e-05, "loss": 0.7552, "step": 3065 }, { "epoch": 1.9973975276512688, "grad_norm": 0.7362255454063416, "learning_rate": 3.342103134103593e-05, "loss": 0.7674, "step": 3070 }, { "epoch": 2.0, "eval_f1": 0.8125089732634697, "eval_loss": 0.419677734375, "eval_precision": 0.812527609369082, "eval_recall": 0.8124974017274719, "eval_runtime": 298.2094, "eval_samples_per_second": 1319.325, "eval_steps_per_second": 1.291, "step": 3074 }, { "epoch": 2.000650618087183, "grad_norm": 0.6836552023887634, "learning_rate": 3.3372413227305684e-05, "loss": 0.7623, "step": 3075 }, { "epoch": 2.003903708523097, "grad_norm": 0.7960084676742554, "learning_rate": 3.3323759423353615e-05, "loss": 0.6671, "step": 3080 }, { "epoch": 2.007156798959011, "grad_norm": 0.7370645999908447, "learning_rate": 3.327507013658291e-05, "loss": 0.6733, "step": 3085 }, { "epoch": 2.0104098893949254, "grad_norm": 1.088987112045288, "learning_rate": 3.3226345574548e-05, "loss": 0.6668, "step": 3090 }, { "epoch": 2.013662979830839, "grad_norm": 0.9499191045761108, "learning_rate": 3.317758594495367e-05, "loss": 0.6749, "step": 3095 }, { "epoch": 2.0169160702667535, "grad_norm": 0.8411104083061218, "learning_rate": 3.312879145565422e-05, "loss": 0.6643, "step": 3100 }, { "epoch": 2.0201691607026677, "grad_norm": 0.8027604818344116, "learning_rate": 3.307996231465254e-05, "loss": 0.6604, "step": 3105 }, { "epoch": 2.0234222511385815, "grad_norm": 0.8328303694725037, "learning_rate": 3.303109873009922e-05, "loss": 0.6921, "step": 3110 }, { "epoch": 2.026675341574496, "grad_norm": 0.8239777088165283, 
"learning_rate": 3.298220091029171e-05, "loss": 0.6619, "step": 3115 }, { "epoch": 2.02992843201041, "grad_norm": 0.8349932432174683, "learning_rate": 3.293326906367338e-05, "loss": 0.6289, "step": 3120 }, { "epoch": 2.033181522446324, "grad_norm": 1.271503210067749, "learning_rate": 3.2884303398832634e-05, "loss": 0.6643, "step": 3125 }, { "epoch": 2.036434612882238, "grad_norm": 0.8596307039260864, "learning_rate": 3.283530412450207e-05, "loss": 0.6434, "step": 3130 }, { "epoch": 2.0396877033181524, "grad_norm": 0.8263525366783142, "learning_rate": 3.278627144955754e-05, "loss": 0.6485, "step": 3135 }, { "epoch": 2.0429407937540662, "grad_norm": 0.8425394892692566, "learning_rate": 3.2737205583017286e-05, "loss": 0.6428, "step": 3140 }, { "epoch": 2.0461938841899805, "grad_norm": 1.253609299659729, "learning_rate": 3.268810673404102e-05, "loss": 0.6427, "step": 3145 }, { "epoch": 2.0494469746258948, "grad_norm": 0.8926024436950684, "learning_rate": 3.2638975111929084e-05, "loss": 0.6748, "step": 3150 }, { "epoch": 2.0527000650618086, "grad_norm": 0.959561288356781, "learning_rate": 3.25898109261215e-05, "loss": 0.6542, "step": 3155 }, { "epoch": 2.055953155497723, "grad_norm": 1.0102553367614746, "learning_rate": 3.254061438619711e-05, "loss": 0.6661, "step": 3160 }, { "epoch": 2.059206245933637, "grad_norm": 1.3455661535263062, "learning_rate": 3.249138570187268e-05, "loss": 0.6656, "step": 3165 }, { "epoch": 2.062459336369551, "grad_norm": 1.2887942790985107, "learning_rate": 3.244212508300201e-05, "loss": 0.6641, "step": 3170 }, { "epoch": 2.065712426805465, "grad_norm": 0.8388302326202393, "learning_rate": 3.239283273957502e-05, "loss": 0.6493, "step": 3175 }, { "epoch": 2.0689655172413794, "grad_norm": 1.096605658531189, "learning_rate": 3.2343508881716874e-05, "loss": 0.6664, "step": 3180 }, { "epoch": 2.0722186076772933, "grad_norm": 0.8951081037521362, "learning_rate": 3.229415371968706e-05, "loss": 0.6772, "step": 3185 }, { "epoch": 2.0754716981132075, "grad_norm": 0.8592993021011353, "learning_rate": 3.2244767463878525e-05, "loss": 0.6688, "step": 3190 }, { "epoch": 2.078724788549122, "grad_norm": 0.8971413969993591, "learning_rate": 3.219535032481679e-05, "loss": 0.6739, "step": 3195 }, { "epoch": 2.0819778789850356, "grad_norm": 2.3976173400878906, "learning_rate": 3.214590251315896e-05, "loss": 0.663, "step": 3200 }, { "epoch": 2.08523096942095, "grad_norm": 1.024121642112732, "learning_rate": 3.209642423969296e-05, "loss": 0.6618, "step": 3205 }, { "epoch": 2.088484059856864, "grad_norm": 0.8724255561828613, "learning_rate": 3.204691571533652e-05, "loss": 0.6459, "step": 3210 }, { "epoch": 2.091737150292778, "grad_norm": 0.9145408272743225, "learning_rate": 3.1997377151136356e-05, "loss": 0.6759, "step": 3215 }, { "epoch": 2.094990240728692, "grad_norm": 1.3194828033447266, "learning_rate": 3.194780875826723e-05, "loss": 0.6687, "step": 3220 }, { "epoch": 2.0982433311646065, "grad_norm": 1.0560972690582275, "learning_rate": 3.189821074803103e-05, "loss": 0.6554, "step": 3225 }, { "epoch": 2.1014964216005203, "grad_norm": 0.8973615765571594, "learning_rate": 3.1848583331855954e-05, "loss": 0.6554, "step": 3230 }, { "epoch": 2.1047495120364346, "grad_norm": 1.0747287273406982, "learning_rate": 3.1808860368799674e-05, "loss": 0.6729, "step": 3235 }, { "epoch": 2.108002602472349, "grad_norm": 1.3772774934768677, "learning_rate": 3.1759180555133126e-05, "loss": 0.6449, "step": 3240 }, { "epoch": 2.1112556929082626, "grad_norm": 0.8960999250411987, "learning_rate": 
3.170947192819057e-05, "loss": 0.6678, "step": 3245 }, { "epoch": 2.114508783344177, "grad_norm": 0.891995906829834, "learning_rate": 3.165973469987168e-05, "loss": 0.6542, "step": 3250 }, { "epoch": 2.117761873780091, "grad_norm": 0.8801698684692383, "learning_rate": 3.160996908219812e-05, "loss": 0.6588, "step": 3255 }, { "epoch": 2.121014964216005, "grad_norm": 1.0145418643951416, "learning_rate": 3.1560175287312534e-05, "loss": 0.6561, "step": 3260 }, { "epoch": 2.1242680546519193, "grad_norm": 1.2028182744979858, "learning_rate": 3.151035352747767e-05, "loss": 0.6689, "step": 3265 }, { "epoch": 2.1275211450878335, "grad_norm": 0.9466059803962708, "learning_rate": 3.1460504015075525e-05, "loss": 0.6518, "step": 3270 }, { "epoch": 2.130774235523748, "grad_norm": 0.938401460647583, "learning_rate": 3.141062696260636e-05, "loss": 0.6649, "step": 3275 }, { "epoch": 2.1340273259596616, "grad_norm": 2.2599849700927734, "learning_rate": 3.1360722582687876e-05, "loss": 0.6607, "step": 3280 }, { "epoch": 2.137280416395576, "grad_norm": 0.8594357371330261, "learning_rate": 3.1310791088054225e-05, "loss": 0.6572, "step": 3285 }, { "epoch": 2.14053350683149, "grad_norm": 1.1824400424957275, "learning_rate": 3.126083269155517e-05, "loss": 0.6812, "step": 3290 }, { "epoch": 2.143786597267404, "grad_norm": 1.1186939477920532, "learning_rate": 3.121084760615515e-05, "loss": 0.6653, "step": 3295 }, { "epoch": 2.147039687703318, "grad_norm": 0.9989317059516907, "learning_rate": 3.116083604493236e-05, "loss": 0.6934, "step": 3300 }, { "epoch": 2.1502927781392325, "grad_norm": 0.8350909948348999, "learning_rate": 3.111079822107788e-05, "loss": 0.6447, "step": 3305 }, { "epoch": 2.1535458685751463, "grad_norm": 0.9244917035102844, "learning_rate": 3.106073434789472e-05, "loss": 0.6492, "step": 3310 }, { "epoch": 2.1567989590110606, "grad_norm": 0.9779621362686157, "learning_rate": 3.1010644638796956e-05, "loss": 0.6679, "step": 3315 }, { "epoch": 2.160052049446975, "grad_norm": 1.2001359462738037, "learning_rate": 3.096052930730877e-05, "loss": 0.6752, "step": 3320 }, { "epoch": 2.1633051398828886, "grad_norm": 0.8512766361236572, "learning_rate": 3.091038856706361e-05, "loss": 0.6601, "step": 3325 }, { "epoch": 2.166558230318803, "grad_norm": 3.723388433456421, "learning_rate": 3.086022263180318e-05, "loss": 0.6976, "step": 3330 }, { "epoch": 2.169811320754717, "grad_norm": 0.9627018570899963, "learning_rate": 3.081003171537665e-05, "loss": 0.666, "step": 3335 }, { "epoch": 2.173064411190631, "grad_norm": 0.9980655312538147, "learning_rate": 3.075981603173963e-05, "loss": 0.6729, "step": 3340 }, { "epoch": 2.1763175016265452, "grad_norm": 1.0389659404754639, "learning_rate": 3.070957579495333e-05, "loss": 0.6772, "step": 3345 }, { "epoch": 2.1795705920624595, "grad_norm": 1.0569038391113281, "learning_rate": 3.065931121918364e-05, "loss": 0.6711, "step": 3350 }, { "epoch": 2.1828236824983733, "grad_norm": 1.2096091508865356, "learning_rate": 3.060902251870017e-05, "loss": 0.6699, "step": 3355 }, { "epoch": 2.1860767729342876, "grad_norm": 1.1020735502243042, "learning_rate": 3.0558709907875385e-05, "loss": 0.6426, "step": 3360 }, { "epoch": 2.189329863370202, "grad_norm": 0.9492762088775635, "learning_rate": 3.0508373601183695e-05, "loss": 0.6712, "step": 3365 }, { "epoch": 2.1925829538061157, "grad_norm": 0.9105232954025269, "learning_rate": 3.045801381320048e-05, "loss": 0.6722, "step": 3370 }, { "epoch": 2.19583604424203, "grad_norm": 0.8517335653305054, "learning_rate": 3.0407630758601256e-05, "loss": 
0.6727, "step": 3375 }, { "epoch": 2.199089134677944, "grad_norm": 0.9834687113761902, "learning_rate": 3.035722465216071e-05, "loss": 0.6581, "step": 3380 }, { "epoch": 2.202342225113858, "grad_norm": 1.1220489740371704, "learning_rate": 3.030679570875177e-05, "loss": 0.6648, "step": 3385 }, { "epoch": 2.2055953155497723, "grad_norm": 1.169081211090088, "learning_rate": 3.0256344143344765e-05, "loss": 0.6567, "step": 3390 }, { "epoch": 2.2088484059856865, "grad_norm": 1.303977131843567, "learning_rate": 3.02058701710064e-05, "loss": 0.6429, "step": 3395 }, { "epoch": 2.2121014964216004, "grad_norm": 1.1148353815078735, "learning_rate": 3.0155374006898946e-05, "loss": 0.686, "step": 3400 }, { "epoch": 2.2153545868575146, "grad_norm": 1.0681456327438354, "learning_rate": 3.010485586627924e-05, "loss": 0.6585, "step": 3405 }, { "epoch": 2.218607677293429, "grad_norm": 1.259251356124878, "learning_rate": 3.005431596449782e-05, "loss": 0.6525, "step": 3410 }, { "epoch": 2.2218607677293427, "grad_norm": 1.041143774986267, "learning_rate": 3.0003754516997984e-05, "loss": 0.6983, "step": 3415 }, { "epoch": 2.225113858165257, "grad_norm": 0.8610508441925049, "learning_rate": 2.9953171739314867e-05, "loss": 0.6491, "step": 3420 }, { "epoch": 2.2283669486011712, "grad_norm": 1.259366750717163, "learning_rate": 2.9902567847074537e-05, "loss": 0.6771, "step": 3425 }, { "epoch": 2.231620039037085, "grad_norm": 0.8394737243652344, "learning_rate": 2.9851943055993088e-05, "loss": 0.6659, "step": 3430 }, { "epoch": 2.2348731294729993, "grad_norm": 0.8857366442680359, "learning_rate": 2.980129758187567e-05, "loss": 0.6525, "step": 3435 }, { "epoch": 2.2381262199089136, "grad_norm": 1.0085824728012085, "learning_rate": 2.9750631640615617e-05, "loss": 0.6437, "step": 3440 }, { "epoch": 2.2413793103448274, "grad_norm": 0.8579273819923401, "learning_rate": 2.969994544819352e-05, "loss": 0.6583, "step": 3445 }, { "epoch": 2.2446324007807417, "grad_norm": 1.7075871229171753, "learning_rate": 2.9649239220676285e-05, "loss": 0.6572, "step": 3450 }, { "epoch": 2.247885491216656, "grad_norm": 1.0663272142410278, "learning_rate": 2.959851317421622e-05, "loss": 0.656, "step": 3455 }, { "epoch": 2.2511385816525697, "grad_norm": 1.072970986366272, "learning_rate": 2.9547767525050142e-05, "loss": 0.6809, "step": 3460 }, { "epoch": 2.254391672088484, "grad_norm": 0.9220450520515442, "learning_rate": 2.9497002489498393e-05, "loss": 0.6804, "step": 3465 }, { "epoch": 2.2576447625243983, "grad_norm": 1.0572975873947144, "learning_rate": 2.9446218283964e-05, "loss": 0.6561, "step": 3470 }, { "epoch": 2.260897852960312, "grad_norm": 1.0907764434814453, "learning_rate": 2.939541512493167e-05, "loss": 0.6555, "step": 3475 }, { "epoch": 2.2641509433962264, "grad_norm": 0.9391184449195862, "learning_rate": 2.9344593228966925e-05, "loss": 0.6512, "step": 3480 }, { "epoch": 2.2674040338321406, "grad_norm": 0.9960388541221619, "learning_rate": 2.929375281271517e-05, "loss": 0.6694, "step": 3485 }, { "epoch": 2.2706571242680544, "grad_norm": 0.9363919496536255, "learning_rate": 2.9242894092900725e-05, "loss": 0.6748, "step": 3490 }, { "epoch": 2.2739102147039687, "grad_norm": 0.9546090364456177, "learning_rate": 2.9192017286325973e-05, "loss": 0.6509, "step": 3495 }, { "epoch": 2.277163305139883, "grad_norm": 1.040223240852356, "learning_rate": 2.9141122609870364e-05, "loss": 0.6519, "step": 3500 }, { "epoch": 2.280416395575797, "grad_norm": 0.921008288860321, "learning_rate": 2.909021028048955e-05, "loss": 0.6645, "step": 3505 }, { 
"epoch": 2.283669486011711, "grad_norm": 1.0411720275878906, "learning_rate": 2.9039280515214428e-05, "loss": 0.6607, "step": 3510 }, { "epoch": 2.2869225764476253, "grad_norm": 0.9046562910079956, "learning_rate": 2.898833353115021e-05, "loss": 0.6588, "step": 3515 }, { "epoch": 2.290175666883539, "grad_norm": 0.932158350944519, "learning_rate": 2.8937369545475517e-05, "loss": 0.6718, "step": 3520 }, { "epoch": 2.2934287573194534, "grad_norm": 1.0239461660385132, "learning_rate": 2.8886388775441457e-05, "loss": 0.6896, "step": 3525 }, { "epoch": 2.2966818477553677, "grad_norm": 2.0023818016052246, "learning_rate": 2.8835391438370664e-05, "loss": 0.6653, "step": 3530 }, { "epoch": 2.2999349381912815, "grad_norm": 1.012035608291626, "learning_rate": 2.8784377751656416e-05, "loss": 0.657, "step": 3535 }, { "epoch": 2.3031880286271957, "grad_norm": 0.922524631023407, "learning_rate": 2.873334793276166e-05, "loss": 0.6805, "step": 3540 }, { "epoch": 2.30644111906311, "grad_norm": 1.0384365320205688, "learning_rate": 2.8682302199218148e-05, "loss": 0.6643, "step": 3545 }, { "epoch": 2.3096942094990243, "grad_norm": 2.3190078735351562, "learning_rate": 2.8631240768625446e-05, "loss": 0.6761, "step": 3550 }, { "epoch": 2.312947299934938, "grad_norm": 1.3237860202789307, "learning_rate": 2.8580163858650038e-05, "loss": 0.6532, "step": 3555 }, { "epoch": 2.3162003903708523, "grad_norm": 1.0602989196777344, "learning_rate": 2.85290716870244e-05, "loss": 0.6546, "step": 3560 }, { "epoch": 2.3194534808067666, "grad_norm": 1.1202350854873657, "learning_rate": 2.8477964471546077e-05, "loss": 0.6703, "step": 3565 }, { "epoch": 2.3227065712426804, "grad_norm": 0.7759214043617249, "learning_rate": 2.8426842430076712e-05, "loss": 0.6569, "step": 3570 }, { "epoch": 2.3259596616785947, "grad_norm": 1.013677716255188, "learning_rate": 2.8375705780541173e-05, "loss": 0.6719, "step": 3575 }, { "epoch": 2.329212752114509, "grad_norm": 0.9612709283828735, "learning_rate": 2.8324554740926594e-05, "loss": 0.6685, "step": 3580 }, { "epoch": 2.3324658425504228, "grad_norm": 0.9653801321983337, "learning_rate": 2.827338952928146e-05, "loss": 0.6578, "step": 3585 }, { "epoch": 2.335718932986337, "grad_norm": 0.9308706521987915, "learning_rate": 2.8222210363714653e-05, "loss": 0.6446, "step": 3590 }, { "epoch": 2.3389720234222513, "grad_norm": 0.9803590774536133, "learning_rate": 2.8171017462394546e-05, "loss": 0.6395, "step": 3595 }, { "epoch": 2.342225113858165, "grad_norm": 1.5392948389053345, "learning_rate": 2.8119811043548063e-05, "loss": 0.6452, "step": 3600 }, { "epoch": 2.3454782042940794, "grad_norm": 1.1688051223754883, "learning_rate": 2.806859132545975e-05, "loss": 0.6619, "step": 3605 }, { "epoch": 2.3487312947299936, "grad_norm": 2.6195671558380127, "learning_rate": 2.801735852647086e-05, "loss": 0.6603, "step": 3610 }, { "epoch": 2.3519843851659075, "grad_norm": 0.9085677266120911, "learning_rate": 2.79661128649784e-05, "loss": 0.6562, "step": 3615 }, { "epoch": 2.3552374756018217, "grad_norm": 1.1864854097366333, "learning_rate": 2.791485455943419e-05, "loss": 0.6566, "step": 3620 }, { "epoch": 2.358490566037736, "grad_norm": 1.0113550424575806, "learning_rate": 2.7863583828343964e-05, "loss": 0.6555, "step": 3625 }, { "epoch": 2.36174365647365, "grad_norm": 0.8967930674552917, "learning_rate": 2.7812300890266442e-05, "loss": 0.6351, "step": 3630 }, { "epoch": 2.364996746909564, "grad_norm": 0.9505155682563782, "learning_rate": 2.7761005963812337e-05, "loss": 0.6717, "step": 3635 }, { "epoch": 
2.3682498373454783, "grad_norm": 17.054515838623047, "learning_rate": 2.7709699267643503e-05, "loss": 0.6866, "step": 3640 }, { "epoch": 2.371502927781392, "grad_norm": 1.044188141822815, "learning_rate": 2.7658381020471964e-05, "loss": 0.6564, "step": 3645 }, { "epoch": 2.3747560182173064, "grad_norm": 1.0670133829116821, "learning_rate": 2.7607051441058958e-05, "loss": 0.677, "step": 3650 }, { "epoch": 2.3780091086532207, "grad_norm": 0.9037075638771057, "learning_rate": 2.7555710748214064e-05, "loss": 0.6675, "step": 3655 }, { "epoch": 2.3812621990891345, "grad_norm": 1.2479665279388428, "learning_rate": 2.75043591607942e-05, "loss": 0.6359, "step": 3660 }, { "epoch": 2.3845152895250488, "grad_norm": 1.097076654434204, "learning_rate": 2.7452996897702765e-05, "loss": 0.6477, "step": 3665 }, { "epoch": 2.387768379960963, "grad_norm": 0.9353396892547607, "learning_rate": 2.7401624177888636e-05, "loss": 0.6452, "step": 3670 }, { "epoch": 2.391021470396877, "grad_norm": 1.2035560607910156, "learning_rate": 2.7350241220345274e-05, "loss": 0.6522, "step": 3675 }, { "epoch": 2.394274560832791, "grad_norm": 1.0560545921325684, "learning_rate": 2.729884824410979e-05, "loss": 0.6655, "step": 3680 }, { "epoch": 2.3975276512687054, "grad_norm": 1.2608094215393066, "learning_rate": 2.724744546826199e-05, "loss": 0.6566, "step": 3685 }, { "epoch": 2.4007807417046196, "grad_norm": 1.0126628875732422, "learning_rate": 2.719603311192347e-05, "loss": 0.6596, "step": 3690 }, { "epoch": 2.4040338321405335, "grad_norm": 0.8379278182983398, "learning_rate": 2.7144611394256653e-05, "loss": 0.6581, "step": 3695 }, { "epoch": 2.4072869225764477, "grad_norm": 1.0668292045593262, "learning_rate": 2.7093180534463863e-05, "loss": 0.6623, "step": 3700 }, { "epoch": 2.410540013012362, "grad_norm": 2.791311264038086, "learning_rate": 2.7041740751786408e-05, "loss": 0.6364, "step": 3705 }, { "epoch": 2.413793103448276, "grad_norm": 1.5181926488876343, "learning_rate": 2.6990292265503646e-05, "loss": 0.6522, "step": 3710 }, { "epoch": 2.41704619388419, "grad_norm": 1.028696894645691, "learning_rate": 2.6938835294931996e-05, "loss": 0.6755, "step": 3715 }, { "epoch": 2.4202992843201043, "grad_norm": 2.595792531967163, "learning_rate": 2.6887370059424078e-05, "loss": 0.6704, "step": 3720 }, { "epoch": 2.423552374756018, "grad_norm": 1.041309118270874, "learning_rate": 2.6835896778367738e-05, "loss": 0.6489, "step": 3725 }, { "epoch": 2.4268054651919324, "grad_norm": 1.087664246559143, "learning_rate": 2.6784415671185104e-05, "loss": 0.6521, "step": 3730 }, { "epoch": 2.4300585556278467, "grad_norm": 1.149911880493164, "learning_rate": 2.6732926957331688e-05, "loss": 0.6461, "step": 3735 }, { "epoch": 2.4333116460637605, "grad_norm": 0.9765056371688843, "learning_rate": 2.668143085629541e-05, "loss": 0.6408, "step": 3740 }, { "epoch": 2.4365647364996748, "grad_norm": 1.053946614265442, "learning_rate": 2.6629927587595688e-05, "loss": 0.658, "step": 3745 }, { "epoch": 2.439817826935589, "grad_norm": 1.1712669134140015, "learning_rate": 2.65784173707825e-05, "loss": 0.6504, "step": 3750 }, { "epoch": 2.443070917371503, "grad_norm": 0.9881893396377563, "learning_rate": 2.6526900425435425e-05, "loss": 0.6709, "step": 3755 }, { "epoch": 2.446324007807417, "grad_norm": 0.8471539616584778, "learning_rate": 2.6475376971162734e-05, "loss": 0.6754, "step": 3760 }, { "epoch": 2.4495770982433314, "grad_norm": 0.9622436165809631, "learning_rate": 2.642384722760046e-05, "loss": 0.6597, "step": 3765 }, { "epoch": 2.452830188679245, 
"grad_norm": 0.7975105047225952, "learning_rate": 2.6372311414411427e-05, "loss": 0.662, "step": 3770 }, { "epoch": 2.4560832791151594, "grad_norm": 1.1026084423065186, "learning_rate": 2.6320769751284335e-05, "loss": 0.6651, "step": 3775 }, { "epoch": 2.4593363695510737, "grad_norm": 1.0752489566802979, "learning_rate": 2.6269222457932824e-05, "loss": 0.6366, "step": 3780 }, { "epoch": 2.4625894599869875, "grad_norm": 0.8716150522232056, "learning_rate": 2.621766975409453e-05, "loss": 0.6653, "step": 3785 }, { "epoch": 2.465842550422902, "grad_norm": 1.0649757385253906, "learning_rate": 2.616611185953018e-05, "loss": 0.6869, "step": 3790 }, { "epoch": 2.469095640858816, "grad_norm": 1.258510947227478, "learning_rate": 2.6114548994022576e-05, "loss": 0.6564, "step": 3795 }, { "epoch": 2.47234873129473, "grad_norm": 0.9622224569320679, "learning_rate": 2.6062981377375762e-05, "loss": 0.6615, "step": 3800 }, { "epoch": 2.475601821730644, "grad_norm": 2.6835622787475586, "learning_rate": 2.6011409229414003e-05, "loss": 0.6793, "step": 3805 }, { "epoch": 2.4788549121665584, "grad_norm": 1.3989510536193848, "learning_rate": 2.59598327699809e-05, "loss": 0.6462, "step": 3810 }, { "epoch": 2.482108002602472, "grad_norm": 0.9427128434181213, "learning_rate": 2.5908252218938423e-05, "loss": 0.6489, "step": 3815 }, { "epoch": 2.4853610930383865, "grad_norm": 1.0668915510177612, "learning_rate": 2.585666779616598e-05, "loss": 0.6633, "step": 3820 }, { "epoch": 2.4886141834743007, "grad_norm": 0.9310431480407715, "learning_rate": 2.5805079721559494e-05, "loss": 0.6798, "step": 3825 }, { "epoch": 2.4918672739102146, "grad_norm": 1.0935829877853394, "learning_rate": 2.5753488215030448e-05, "loss": 0.669, "step": 3830 }, { "epoch": 2.495120364346129, "grad_norm": 0.8651193380355835, "learning_rate": 2.5701893496504953e-05, "loss": 0.6789, "step": 3835 }, { "epoch": 2.498373454782043, "grad_norm": 1.0802959203720093, "learning_rate": 2.5650295785922817e-05, "loss": 0.6656, "step": 3840 }, { "epoch": 2.501626545217957, "grad_norm": 1.0248281955718994, "learning_rate": 2.5598695303236615e-05, "loss": 0.6506, "step": 3845 }, { "epoch": 2.504879635653871, "grad_norm": 1.0692211389541626, "learning_rate": 2.5547092268410703e-05, "loss": 0.6667, "step": 3850 }, { "epoch": 2.5081327260897854, "grad_norm": 1.2791413068771362, "learning_rate": 2.5495486901420362e-05, "loss": 0.6506, "step": 3855 }, { "epoch": 2.5113858165256993, "grad_norm": 1.1521552801132202, "learning_rate": 2.5443879422250767e-05, "loss": 0.6698, "step": 3860 }, { "epoch": 2.5146389069616135, "grad_norm": 0.9791963696479797, "learning_rate": 2.539227005089614e-05, "loss": 0.6732, "step": 3865 }, { "epoch": 2.517891997397528, "grad_norm": 0.9954161643981934, "learning_rate": 2.5340659007358742e-05, "loss": 0.6599, "step": 3870 }, { "epoch": 2.5211450878334416, "grad_norm": 1.1552543640136719, "learning_rate": 2.5289046511647972e-05, "loss": 0.6849, "step": 3875 }, { "epoch": 2.524398178269356, "grad_norm": 0.9102405905723572, "learning_rate": 2.523743278377943e-05, "loss": 0.6375, "step": 3880 }, { "epoch": 2.52765126870527, "grad_norm": 1.0647556781768799, "learning_rate": 2.518581804377394e-05, "loss": 0.6585, "step": 3885 }, { "epoch": 2.530904359141184, "grad_norm": 1.022453784942627, "learning_rate": 2.5134202511656658e-05, "loss": 0.6667, "step": 3890 }, { "epoch": 2.534157449577098, "grad_norm": 0.9109323024749756, "learning_rate": 2.5082586407456134e-05, "loss": 0.6754, "step": 3895 }, { "epoch": 2.5374105400130125, "grad_norm": 
0.8672389388084412, "learning_rate": 2.5030969951203316e-05, "loss": 0.6432, "step": 3900 }, { "epoch": 2.5406636304489263, "grad_norm": 1.022900938987732, "learning_rate": 2.4979353362930685e-05, "loss": 0.6512, "step": 3905 }, { "epoch": 2.5439167208848406, "grad_norm": 0.9560525417327881, "learning_rate": 2.492773686267128e-05, "loss": 0.6528, "step": 3910 }, { "epoch": 2.547169811320755, "grad_norm": 1.1724059581756592, "learning_rate": 2.4876120670457754e-05, "loss": 0.7026, "step": 3915 }, { "epoch": 2.5504229017566686, "grad_norm": 0.9830509424209595, "learning_rate": 2.482450500632145e-05, "loss": 0.6411, "step": 3920 }, { "epoch": 2.553675992192583, "grad_norm": 0.8978729844093323, "learning_rate": 2.477289009029147e-05, "loss": 0.6656, "step": 3925 }, { "epoch": 2.556929082628497, "grad_norm": 2.0030722618103027, "learning_rate": 2.4721276142393714e-05, "loss": 0.6554, "step": 3930 }, { "epoch": 2.560182173064411, "grad_norm": 14.189367294311523, "learning_rate": 2.4669663382649967e-05, "loss": 0.6196, "step": 3935 }, { "epoch": 2.5634352635003252, "grad_norm": 1.2099864482879639, "learning_rate": 2.4618052031076933e-05, "loss": 0.651, "step": 3940 }, { "epoch": 2.5666883539362395, "grad_norm": 0.9091722965240479, "learning_rate": 2.4566442307685325e-05, "loss": 0.6533, "step": 3945 }, { "epoch": 2.5699414443721533, "grad_norm": 1.1715190410614014, "learning_rate": 2.4514834432478927e-05, "loss": 0.6578, "step": 3950 }, { "epoch": 2.5731945348080676, "grad_norm": 0.9423794150352478, "learning_rate": 2.4463228625453607e-05, "loss": 0.665, "step": 3955 }, { "epoch": 2.576447625243982, "grad_norm": 0.8811748027801514, "learning_rate": 2.4411625106596457e-05, "loss": 0.6589, "step": 3960 }, { "epoch": 2.5797007156798957, "grad_norm": 0.9299075603485107, "learning_rate": 2.43600240958848e-05, "loss": 0.6568, "step": 3965 }, { "epoch": 2.58295380611581, "grad_norm": 0.9680789709091187, "learning_rate": 2.4308425813285255e-05, "loss": 0.654, "step": 3970 }, { "epoch": 2.586206896551724, "grad_norm": 1.6087170839309692, "learning_rate": 2.425683047875282e-05, "loss": 0.6812, "step": 3975 }, { "epoch": 2.589459986987638, "grad_norm": 0.9408166408538818, "learning_rate": 2.420523831222994e-05, "loss": 0.6292, "step": 3980 }, { "epoch": 2.5927130774235523, "grad_norm": 0.9801003336906433, "learning_rate": 2.4153649533645545e-05, "loss": 0.6536, "step": 3985 }, { "epoch": 2.5959661678594665, "grad_norm": 0.908278226852417, "learning_rate": 2.4102064362914108e-05, "loss": 0.6494, "step": 3990 }, { "epoch": 2.5992192582953804, "grad_norm": 2.5500690937042236, "learning_rate": 2.4050483019934737e-05, "loss": 0.6338, "step": 3995 }, { "epoch": 2.6024723487312946, "grad_norm": 1.1047178506851196, "learning_rate": 2.3998905724590237e-05, "loss": 0.656, "step": 4000 }, { "epoch": 2.605725439167209, "grad_norm": 1.0765790939331055, "learning_rate": 2.3947332696746122e-05, "loss": 0.6445, "step": 4005 }, { "epoch": 2.6089785296031227, "grad_norm": 0.9311931133270264, "learning_rate": 2.3895764156249746e-05, "loss": 0.6472, "step": 4010 }, { "epoch": 2.612231620039037, "grad_norm": 1.5003751516342163, "learning_rate": 2.3844200322929323e-05, "loss": 0.6713, "step": 4015 }, { "epoch": 2.6154847104749512, "grad_norm": 0.8282221555709839, "learning_rate": 2.3792641416592994e-05, "loss": 0.6709, "step": 4020 }, { "epoch": 2.618737800910865, "grad_norm": 1.0039994716644287, "learning_rate": 2.3741087657027912e-05, "loss": 0.6723, "step": 4025 }, { "epoch": 2.6219908913467793, "grad_norm": 
1.1094609498977661, "learning_rate": 2.3689539263999286e-05, "loss": 0.6519, "step": 4030 }, { "epoch": 2.6252439817826936, "grad_norm": 0.9982606768608093, "learning_rate": 2.3637996457249434e-05, "loss": 0.6444, "step": 4035 }, { "epoch": 2.6284970722186074, "grad_norm": 0.9692511558532715, "learning_rate": 2.3586459456496877e-05, "loss": 0.6525, "step": 4040 }, { "epoch": 2.6317501626545217, "grad_norm": 1.5806814432144165, "learning_rate": 2.3534928481435388e-05, "loss": 0.6756, "step": 4045 }, { "epoch": 2.635003253090436, "grad_norm": 0.9672693014144897, "learning_rate": 2.348340375173303e-05, "loss": 0.635, "step": 4050 }, { "epoch": 2.63825634352635, "grad_norm": 0.9741607904434204, "learning_rate": 2.3442188612229703e-05, "loss": 0.6786, "step": 4055 }, { "epoch": 2.641509433962264, "grad_norm": 1.4344401359558105, "learning_rate": 2.3390675677651777e-05, "loss": 0.6695, "step": 4060 }, { "epoch": 2.6447625243981783, "grad_norm": 0.9737870693206787, "learning_rate": 2.3339169603358997e-05, "loss": 0.6562, "step": 4065 }, { "epoch": 2.6480156148340925, "grad_norm": 1.122745156288147, "learning_rate": 2.328767060891328e-05, "loss": 0.6428, "step": 4070 }, { "epoch": 2.6512687052700064, "grad_norm": 1.1388943195343018, "learning_rate": 2.323617891384638e-05, "loss": 0.6592, "step": 4075 }, { "epoch": 2.6545217957059206, "grad_norm": 1.3511914014816284, "learning_rate": 2.3184694737658942e-05, "loss": 0.6811, "step": 4080 }, { "epoch": 2.657774886141835, "grad_norm": 0.9977470636367798, "learning_rate": 2.3133218299819536e-05, "loss": 0.6489, "step": 4085 }, { "epoch": 2.6610279765777487, "grad_norm": 6.271300792694092, "learning_rate": 2.308174981976377e-05, "loss": 0.6539, "step": 4090 }, { "epoch": 2.664281067013663, "grad_norm": 1.12664794921875, "learning_rate": 2.3030289516893306e-05, "loss": 0.6874, "step": 4095 }, { "epoch": 2.6675341574495772, "grad_norm": 1.1613657474517822, "learning_rate": 2.2978837610574964e-05, "loss": 0.649, "step": 4100 }, { "epoch": 2.6707872478854915, "grad_norm": 0.9490109086036682, "learning_rate": 2.2927394320139765e-05, "loss": 0.6316, "step": 4105 }, { "epoch": 2.6740403383214053, "grad_norm": 1.0231504440307617, "learning_rate": 2.2875959864882002e-05, "loss": 0.6496, "step": 4110 }, { "epoch": 2.6772934287573196, "grad_norm": 1.2208635807037354, "learning_rate": 2.2824534464058314e-05, "loss": 0.6763, "step": 4115 }, { "epoch": 2.680546519193234, "grad_norm": 1.1819181442260742, "learning_rate": 2.2773118336886724e-05, "loss": 0.6631, "step": 4120 }, { "epoch": 2.6837996096291477, "grad_norm": 0.9770584106445312, "learning_rate": 2.2721711702545735e-05, "loss": 0.6727, "step": 4125 }, { "epoch": 2.687052700065062, "grad_norm": 0.9494980573654175, "learning_rate": 2.267031478017339e-05, "loss": 0.6555, "step": 4130 }, { "epoch": 2.690305790500976, "grad_norm": 1.0622323751449585, "learning_rate": 2.2618927788866316e-05, "loss": 0.6688, "step": 4135 }, { "epoch": 2.69355888093689, "grad_norm": 1.1738066673278809, "learning_rate": 2.2567550947678812e-05, "loss": 0.6665, "step": 4140 }, { "epoch": 2.6968119713728043, "grad_norm": 0.9498983025550842, "learning_rate": 2.2516184475621915e-05, "loss": 0.6593, "step": 4145 }, { "epoch": 2.7000650618087185, "grad_norm": 0.9943968057632446, "learning_rate": 2.246482859166245e-05, "loss": 0.6394, "step": 4150 }, { "epoch": 2.7033181522446323, "grad_norm": 0.9715344905853271, "learning_rate": 2.2413483514722117e-05, "loss": 0.6451, "step": 4155 }, { "epoch": 2.7065712426805466, "grad_norm": 
0.9957199692726135, "learning_rate": 2.2362149463676536e-05, "loss": 0.654, "step": 4160 }, { "epoch": 2.709824333116461, "grad_norm": 1.3196548223495483, "learning_rate": 2.231082665735433e-05, "loss": 0.6377, "step": 4165 }, { "epoch": 2.7130774235523747, "grad_norm": 1.0311577320098877, "learning_rate": 2.22595153145362e-05, "loss": 0.6532, "step": 4170 }, { "epoch": 2.716330513988289, "grad_norm": 0.967720627784729, "learning_rate": 2.220821565395395e-05, "loss": 0.6723, "step": 4175 }, { "epoch": 2.719583604424203, "grad_norm": 0.9087415933609009, "learning_rate": 2.215692789428962e-05, "loss": 0.6738, "step": 4180 }, { "epoch": 2.722836694860117, "grad_norm": 0.9315296411514282, "learning_rate": 2.21056522541745e-05, "loss": 0.6727, "step": 4185 }, { "epoch": 2.7260897852960313, "grad_norm": 1.8225336074829102, "learning_rate": 2.2054388952188205e-05, "loss": 0.6787, "step": 4190 }, { "epoch": 2.7293428757319456, "grad_norm": 1.4423229694366455, "learning_rate": 2.2003138206857782e-05, "loss": 0.6549, "step": 4195 }, { "epoch": 2.7325959661678594, "grad_norm": 0.9800527691841125, "learning_rate": 2.1951900236656732e-05, "loss": 0.6622, "step": 4200 }, { "epoch": 2.7358490566037736, "grad_norm": 1.0587884187698364, "learning_rate": 2.1900675260004102e-05, "loss": 0.6575, "step": 4205 }, { "epoch": 2.739102147039688, "grad_norm": 0.9304030537605286, "learning_rate": 2.1849463495263546e-05, "loss": 0.6374, "step": 4210 }, { "epoch": 2.7423552374756017, "grad_norm": 1.906974196434021, "learning_rate": 2.1798265160742413e-05, "loss": 0.6516, "step": 4215 }, { "epoch": 2.745608327911516, "grad_norm": 1.2081124782562256, "learning_rate": 2.1747080474690778e-05, "loss": 0.6872, "step": 4220 }, { "epoch": 2.7488614183474303, "grad_norm": 0.8841726779937744, "learning_rate": 2.169590965530056e-05, "loss": 0.6585, "step": 4225 }, { "epoch": 2.752114508783344, "grad_norm": 1.3173036575317383, "learning_rate": 2.1644752920704534e-05, "loss": 0.6584, "step": 4230 }, { "epoch": 2.7553675992192583, "grad_norm": 1.285091519355774, "learning_rate": 2.1593610488975468e-05, "loss": 0.6578, "step": 4235 }, { "epoch": 2.7586206896551726, "grad_norm": 1.0926772356033325, "learning_rate": 2.1542482578125143e-05, "loss": 0.6489, "step": 4240 }, { "epoch": 2.7618737800910864, "grad_norm": 1.1347663402557373, "learning_rate": 2.149136940610343e-05, "loss": 0.6455, "step": 4245 }, { "epoch": 2.7651268705270007, "grad_norm": 1.2697584629058838, "learning_rate": 2.1440271190797403e-05, "loss": 0.6708, "step": 4250 }, { "epoch": 2.768379960962915, "grad_norm": 0.9204789996147156, "learning_rate": 2.1389188150030344e-05, "loss": 0.6889, "step": 4255 }, { "epoch": 2.7716330513988288, "grad_norm": 1.154973030090332, "learning_rate": 2.1338120501560862e-05, "loss": 0.6653, "step": 4260 }, { "epoch": 2.774886141834743, "grad_norm": 0.9574093222618103, "learning_rate": 2.128706846308196e-05, "loss": 0.6548, "step": 4265 }, { "epoch": 2.7781392322706573, "grad_norm": 0.9033051133155823, "learning_rate": 2.123603225222007e-05, "loss": 0.6521, "step": 4270 }, { "epoch": 2.781392322706571, "grad_norm": 0.9509397745132446, "learning_rate": 2.11850120865342e-05, "loss": 0.6542, "step": 4275 }, { "epoch": 2.7846454131424854, "grad_norm": 0.9007295370101929, "learning_rate": 2.1134008183514906e-05, "loss": 0.6354, "step": 4280 }, { "epoch": 2.7878985035783996, "grad_norm": 1.7208142280578613, "learning_rate": 2.108302076058346e-05, "loss": 0.6691, "step": 4285 }, { "epoch": 2.7911515940143135, "grad_norm": 0.8821661472320557, 
"learning_rate": 2.1032050035090865e-05, "loss": 0.6784, "step": 4290 }, { "epoch": 2.7944046844502277, "grad_norm": 0.9296684861183167, "learning_rate": 2.0981096224316944e-05, "loss": 0.6635, "step": 4295 }, { "epoch": 2.797657774886142, "grad_norm": 1.5931028127670288, "learning_rate": 2.093015954546942e-05, "loss": 0.6763, "step": 4300 }, { "epoch": 2.800910865322056, "grad_norm": 3.0784621238708496, "learning_rate": 2.0879240215683e-05, "loss": 0.6472, "step": 4305 }, { "epoch": 2.80416395575797, "grad_norm": 1.1016740798950195, "learning_rate": 2.0828338452018396e-05, "loss": 0.6822, "step": 4310 }, { "epoch": 2.8074170461938843, "grad_norm": 1.020324945449829, "learning_rate": 2.0777454471461476e-05, "loss": 0.6374, "step": 4315 }, { "epoch": 2.810670136629798, "grad_norm": 1.3198879957199097, "learning_rate": 2.0726588490922288e-05, "loss": 0.6525, "step": 4320 }, { "epoch": 2.8139232270657124, "grad_norm": 1.068981647491455, "learning_rate": 2.0675740727234142e-05, "loss": 0.6243, "step": 4325 }, { "epoch": 2.8171763175016267, "grad_norm": 1.2015876770019531, "learning_rate": 2.062491139715271e-05, "loss": 0.6779, "step": 4330 }, { "epoch": 2.8204294079375405, "grad_norm": 0.9587366580963135, "learning_rate": 2.057410071735506e-05, "loss": 0.6594, "step": 4335 }, { "epoch": 2.8236824983734548, "grad_norm": 0.9465182423591614, "learning_rate": 2.0523308904438775e-05, "loss": 0.6545, "step": 4340 }, { "epoch": 2.826935588809369, "grad_norm": 0.9994252324104309, "learning_rate": 2.0472536174921017e-05, "loss": 0.651, "step": 4345 }, { "epoch": 2.830188679245283, "grad_norm": 0.9640632271766663, "learning_rate": 2.0421782745237574e-05, "loss": 0.6617, "step": 4350 }, { "epoch": 2.833441769681197, "grad_norm": 0.8937436938285828, "learning_rate": 2.0371048831741987e-05, "loss": 0.6459, "step": 4355 }, { "epoch": 2.8366948601171114, "grad_norm": 1.2173950672149658, "learning_rate": 2.0320334650704594e-05, "loss": 0.658, "step": 4360 }, { "epoch": 2.839947950553025, "grad_norm": 1.0267751216888428, "learning_rate": 2.0269640418311608e-05, "loss": 0.6494, "step": 4365 }, { "epoch": 2.8432010409889394, "grad_norm": 3.8074207305908203, "learning_rate": 2.021896635066421e-05, "loss": 0.6454, "step": 4370 }, { "epoch": 2.8464541314248537, "grad_norm": 1.9503079652786255, "learning_rate": 2.0168312663777638e-05, "loss": 0.6446, "step": 4375 }, { "epoch": 2.8497072218607675, "grad_norm": 0.942142903804779, "learning_rate": 2.011767957358021e-05, "loss": 0.6374, "step": 4380 }, { "epoch": 2.852960312296682, "grad_norm": 1.3679735660552979, "learning_rate": 2.0067067295912494e-05, "loss": 0.6554, "step": 4385 }, { "epoch": 2.856213402732596, "grad_norm": 1.217850923538208, "learning_rate": 2.0016476046526305e-05, "loss": 0.6332, "step": 4390 }, { "epoch": 2.85946649316851, "grad_norm": 0.9636296033859253, "learning_rate": 1.996590604108383e-05, "loss": 0.6601, "step": 4395 }, { "epoch": 2.862719583604424, "grad_norm": 1.0870169401168823, "learning_rate": 1.991535749515668e-05, "loss": 0.6488, "step": 4400 }, { "epoch": 2.8659726740403384, "grad_norm": 0.9022223353385925, "learning_rate": 1.9864830624225005e-05, "loss": 0.6379, "step": 4405 }, { "epoch": 2.869225764476252, "grad_norm": 0.9370593428611755, "learning_rate": 1.981432564367657e-05, "loss": 0.6288, "step": 4410 }, { "epoch": 2.8724788549121665, "grad_norm": 1.368965744972229, "learning_rate": 1.976384276880578e-05, "loss": 0.6723, "step": 4415 }, { "epoch": 2.8757319453480807, "grad_norm": 1.0498238801956177, "learning_rate": 
1.971338221481285e-05, "loss": 0.6348, "step": 4420 }, { "epoch": 2.8789850357839946, "grad_norm": 11.295182228088379, "learning_rate": 1.966294419680283e-05, "loss": 0.6694, "step": 4425 }, { "epoch": 2.882238126219909, "grad_norm": 1.0129636526107788, "learning_rate": 1.96125289297847e-05, "loss": 0.6533, "step": 4430 }, { "epoch": 2.885491216655823, "grad_norm": 3.2188525199890137, "learning_rate": 1.9562136628670464e-05, "loss": 0.6661, "step": 4435 }, { "epoch": 2.888744307091737, "grad_norm": 1.130278468132019, "learning_rate": 1.9511767508274214e-05, "loss": 0.6489, "step": 4440 }, { "epoch": 2.891997397527651, "grad_norm": 1.2190179824829102, "learning_rate": 1.946142178331124e-05, "loss": 0.6323, "step": 4445 }, { "epoch": 2.8952504879635654, "grad_norm": 0.9841941595077515, "learning_rate": 1.9411099668397085e-05, "loss": 0.6623, "step": 4450 }, { "epoch": 2.8985035783994793, "grad_norm": 0.9542006850242615, "learning_rate": 1.9360801378046666e-05, "loss": 0.6389, "step": 4455 }, { "epoch": 2.9017566688353935, "grad_norm": 1.037227988243103, "learning_rate": 1.931052712667332e-05, "loss": 0.6504, "step": 4460 }, { "epoch": 2.905009759271308, "grad_norm": 1.146600365638733, "learning_rate": 1.9260277128587936e-05, "loss": 0.6558, "step": 4465 }, { "epoch": 2.9082628497072216, "grad_norm": 0.8806731104850769, "learning_rate": 1.921005159799798e-05, "loss": 0.6534, "step": 4470 }, { "epoch": 2.911515940143136, "grad_norm": 0.926228404045105, "learning_rate": 1.915985074900664e-05, "loss": 0.6416, "step": 4475 }, { "epoch": 2.91476903057905, "grad_norm": 3.157443046569824, "learning_rate": 1.9109674795611898e-05, "loss": 0.6614, "step": 4480 }, { "epoch": 2.918022121014964, "grad_norm": 0.9215693473815918, "learning_rate": 1.9059523951705585e-05, "loss": 0.6738, "step": 4485 }, { "epoch": 2.921275211450878, "grad_norm": 1.0931910276412964, "learning_rate": 1.900939843107251e-05, "loss": 0.6587, "step": 4490 }, { "epoch": 2.9245283018867925, "grad_norm": 0.8158690333366394, "learning_rate": 1.895929844738954e-05, "loss": 0.6562, "step": 4495 }, { "epoch": 2.9277813923227067, "grad_norm": 0.8992384672164917, "learning_rate": 1.8909224214224662e-05, "loss": 0.6552, "step": 4500 }, { "epoch": 2.9310344827586206, "grad_norm": 0.9259112477302551, "learning_rate": 1.885917594503611e-05, "loss": 0.6391, "step": 4505 }, { "epoch": 2.934287573194535, "grad_norm": 1.9078396558761597, "learning_rate": 1.8809153853171426e-05, "loss": 0.6419, "step": 4510 }, { "epoch": 2.937540663630449, "grad_norm": 1.015743374824524, "learning_rate": 1.875915815186657e-05, "loss": 0.6437, "step": 4515 }, { "epoch": 2.940793754066363, "grad_norm": 0.8620851039886475, "learning_rate": 1.8709189054244996e-05, "loss": 0.6496, "step": 4520 }, { "epoch": 2.944046844502277, "grad_norm": 0.9002664089202881, "learning_rate": 1.865924677331677e-05, "loss": 0.6468, "step": 4525 }, { "epoch": 2.9472999349381914, "grad_norm": 1.9784495830535889, "learning_rate": 1.8609331521977623e-05, "loss": 0.6707, "step": 4530 }, { "epoch": 2.9505530253741052, "grad_norm": 0.902123749256134, "learning_rate": 1.8559443513008067e-05, "loss": 0.6545, "step": 4535 }, { "epoch": 2.9538061158100195, "grad_norm": 1.239797592163086, "learning_rate": 1.8509582959072486e-05, "loss": 0.641, "step": 4540 }, { "epoch": 2.9570592062459338, "grad_norm": 1.3703653812408447, "learning_rate": 1.8459750072718235e-05, "loss": 0.6373, "step": 4545 }, { "epoch": 2.960312296681848, "grad_norm": 3.020883083343506, "learning_rate": 1.8409945066374706e-05, 
"loss": 0.6318, "step": 4550 }, { "epoch": 2.963565387117762, "grad_norm": 1.026063084602356, "learning_rate": 1.8360168152352472e-05, "loss": 0.6294, "step": 4555 }, { "epoch": 2.966818477553676, "grad_norm": 0.9441201090812683, "learning_rate": 1.8310419542842327e-05, "loss": 0.6543, "step": 4560 }, { "epoch": 2.9700715679895904, "grad_norm": 1.139441728591919, "learning_rate": 1.826069944991442e-05, "loss": 0.6433, "step": 4565 }, { "epoch": 2.973324658425504, "grad_norm": 1.0269153118133545, "learning_rate": 1.821100808551735e-05, "loss": 0.6669, "step": 4570 }, { "epoch": 2.9765777488614185, "grad_norm": 0.9721339344978333, "learning_rate": 1.8161345661477215e-05, "loss": 0.6481, "step": 4575 }, { "epoch": 2.9798308392973327, "grad_norm": 1.0770370960235596, "learning_rate": 1.811171238949679e-05, "loss": 0.6532, "step": 4580 }, { "epoch": 2.9830839297332465, "grad_norm": 0.8994714021682739, "learning_rate": 1.8062108481154545e-05, "loss": 0.6503, "step": 4585 }, { "epoch": 2.986337020169161, "grad_norm": 1.0329276323318481, "learning_rate": 1.801253414790379e-05, "loss": 0.6295, "step": 4590 }, { "epoch": 2.989590110605075, "grad_norm": 5.5768232345581055, "learning_rate": 1.796298960107177e-05, "loss": 0.6711, "step": 4595 }, { "epoch": 2.992843201040989, "grad_norm": 1.2496637105941772, "learning_rate": 1.7913475051858744e-05, "loss": 0.6501, "step": 4600 }, { "epoch": 2.996096291476903, "grad_norm": 1.1241545677185059, "learning_rate": 1.7863990711337093e-05, "loss": 0.6334, "step": 4605 }, { "epoch": 2.9993493819128174, "grad_norm": 1.352886438369751, "learning_rate": 1.7814536790450437e-05, "loss": 0.6427, "step": 4610 }, { "epoch": 3.0, "eval_f1": 0.812837658080022, "eval_loss": 0.43994140625, "eval_precision": 0.8124680334875379, "eval_recall": 0.8133431620386601, "eval_runtime": 406.4496, "eval_samples_per_second": 967.98, "eval_steps_per_second": 0.947, "step": 4611 }, { "epoch": 3.0026024723487312, "grad_norm": 1.0044734477996826, "learning_rate": 1.7765113500012706e-05, "loss": 0.5134, "step": 4615 }, { "epoch": 3.0058555627846455, "grad_norm": 1.0504697561264038, "learning_rate": 1.771572105070727e-05, "loss": 0.4684, "step": 4620 }, { "epoch": 3.0091086532205593, "grad_norm": 1.143985390663147, "learning_rate": 1.766635965308603e-05, "loss": 0.4485, "step": 4625 }, { "epoch": 3.0123617436564736, "grad_norm": 1.933708667755127, "learning_rate": 1.7617029517568502e-05, "loss": 0.469, "step": 4630 }, { "epoch": 3.015614834092388, "grad_norm": 1.3680862188339233, "learning_rate": 1.756773085444095e-05, "loss": 0.4556, "step": 4635 }, { "epoch": 3.018867924528302, "grad_norm": 1.3127816915512085, "learning_rate": 1.7518463873855486e-05, "loss": 0.4536, "step": 4640 }, { "epoch": 3.022121014964216, "grad_norm": 1.4003212451934814, "learning_rate": 1.746922878582914e-05, "loss": 0.4553, "step": 4645 }, { "epoch": 3.02537410540013, "grad_norm": 1.3466031551361084, "learning_rate": 1.7420025800243e-05, "loss": 0.4634, "step": 4650 }, { "epoch": 3.0286271958360445, "grad_norm": 1.3510533571243286, "learning_rate": 1.7370855126841314e-05, "loss": 0.4385, "step": 4655 }, { "epoch": 3.0318802862719583, "grad_norm": 1.4002180099487305, "learning_rate": 1.732171697523059e-05, "loss": 0.4455, "step": 4660 }, { "epoch": 3.0351333767078725, "grad_norm": 1.432400107383728, "learning_rate": 1.7272611554878678e-05, "loss": 0.443, "step": 4665 }, { "epoch": 3.038386467143787, "grad_norm": 3.641195774078369, "learning_rate": 1.722353907511393e-05, "loss": 0.4351, "step": 4670 }, { "epoch": 
3.0416395575797006, "grad_norm": 1.3562887907028198, "learning_rate": 1.717449974512426e-05, "loss": 0.4452, "step": 4675 }, { "epoch": 3.044892648015615, "grad_norm": 1.4275323152542114, "learning_rate": 1.7125493773956265e-05, "loss": 0.4424, "step": 4680 }, { "epoch": 3.048145738451529, "grad_norm": 1.6079832315444946, "learning_rate": 1.7076521370514355e-05, "loss": 0.4532, "step": 4685 }, { "epoch": 3.051398828887443, "grad_norm": 1.3562263250350952, "learning_rate": 1.7027582743559843e-05, "loss": 0.4417, "step": 4690 }, { "epoch": 3.0546519193233572, "grad_norm": 1.7932298183441162, "learning_rate": 1.6978678101710043e-05, "loss": 0.4375, "step": 4695 }, { "epoch": 3.0579050097592715, "grad_norm": 1.6265188455581665, "learning_rate": 1.6929807653437412e-05, "loss": 0.4307, "step": 4700 }, { "epoch": 3.0611581001951853, "grad_norm": 1.4614055156707764, "learning_rate": 1.6880971607068646e-05, "loss": 0.4275, "step": 4705 }, { "epoch": 3.0644111906310996, "grad_norm": 2.0057411193847656, "learning_rate": 1.6832170170783776e-05, "loss": 0.4359, "step": 4710 }, { "epoch": 3.067664281067014, "grad_norm": 1.3620643615722656, "learning_rate": 1.6783403552615314e-05, "loss": 0.4267, "step": 4715 }, { "epoch": 3.0709173715029277, "grad_norm": 1.4945169687271118, "learning_rate": 1.6734671960447333e-05, "loss": 0.4401, "step": 4720 }, { "epoch": 3.074170461938842, "grad_norm": 1.7050446271896362, "learning_rate": 1.6685975602014604e-05, "loss": 0.4553, "step": 4725 }, { "epoch": 3.077423552374756, "grad_norm": 2.7887024879455566, "learning_rate": 1.6637314684901713e-05, "loss": 0.4312, "step": 4730 }, { "epoch": 3.08067664281067, "grad_norm": 1.5501539707183838, "learning_rate": 1.658868941654213e-05, "loss": 0.4495, "step": 4735 }, { "epoch": 3.0839297332465843, "grad_norm": 2.2847344875335693, "learning_rate": 1.6540100004217402e-05, "loss": 0.4535, "step": 4740 }, { "epoch": 3.0871828236824985, "grad_norm": 1.3970599174499512, "learning_rate": 1.6491546655056208e-05, "loss": 0.4329, "step": 4745 }, { "epoch": 3.0904359141184123, "grad_norm": 1.7147552967071533, "learning_rate": 1.644302957603349e-05, "loss": 0.4465, "step": 4750 }, { "epoch": 3.0936890045543266, "grad_norm": 1.4633418321609497, "learning_rate": 1.6394548973969588e-05, "loss": 0.4163, "step": 4755 }, { "epoch": 3.096942094990241, "grad_norm": 2.099158763885498, "learning_rate": 1.634610505552934e-05, "loss": 0.4554, "step": 4760 }, { "epoch": 3.1001951854261547, "grad_norm": 1.6046409606933594, "learning_rate": 1.6297698027221216e-05, "loss": 0.4331, "step": 4765 }, { "epoch": 3.103448275862069, "grad_norm": 1.6139500141143799, "learning_rate": 1.6249328095396415e-05, "loss": 0.451, "step": 4770 }, { "epoch": 3.106701366297983, "grad_norm": 2.9759113788604736, "learning_rate": 1.6200995466248014e-05, "loss": 0.4275, "step": 4775 }, { "epoch": 3.109954456733897, "grad_norm": 1.3965107202529907, "learning_rate": 1.6152700345810063e-05, "loss": 0.4394, "step": 4780 }, { "epoch": 3.1132075471698113, "grad_norm": 1.3388948440551758, "learning_rate": 1.6104442939956733e-05, "loss": 0.4496, "step": 4785 }, { "epoch": 3.1164606376057256, "grad_norm": 1.3315989971160889, "learning_rate": 1.6056223454401396e-05, "loss": 0.4359, "step": 4790 }, { "epoch": 3.1197137280416394, "grad_norm": 1.7022862434387207, "learning_rate": 1.6008042094695825e-05, "loss": 0.4239, "step": 4795 }, { "epoch": 3.1229668184775536, "grad_norm": 4.944558620452881, "learning_rate": 1.5959899066229218e-05, "loss": 0.4322, "step": 4800 }, { "epoch": 
3.126219908913468, "grad_norm": 2.622772693634033, "learning_rate": 1.5911794574227402e-05, "loss": 0.4512, "step": 4805 }, { "epoch": 3.1294729993493817, "grad_norm": 1.4150971174240112, "learning_rate": 1.5863728823751923e-05, "loss": 0.4152, "step": 4810 }, { "epoch": 3.132726089785296, "grad_norm": 1.6928874254226685, "learning_rate": 1.5815702019699168e-05, "loss": 0.4388, "step": 4815 }, { "epoch": 3.1359791802212102, "grad_norm": 1.9819401502609253, "learning_rate": 1.576771436679952e-05, "loss": 0.4358, "step": 4820 }, { "epoch": 3.139232270657124, "grad_norm": 4.905421733856201, "learning_rate": 1.5719766069616457e-05, "loss": 0.4296, "step": 4825 }, { "epoch": 3.1424853610930383, "grad_norm": 1.4010788202285767, "learning_rate": 1.5671857332545685e-05, "loss": 0.4252, "step": 4830 }, { "epoch": 3.1457384515289526, "grad_norm": 1.8857386112213135, "learning_rate": 1.5623988359814285e-05, "loss": 0.4344, "step": 4835 }, { "epoch": 3.1489915419648664, "grad_norm": 1.8753392696380615, "learning_rate": 1.5576159355479812e-05, "loss": 0.4331, "step": 4840 }, { "epoch": 3.1522446324007807, "grad_norm": 1.5203431844711304, "learning_rate": 1.5528370523429465e-05, "loss": 0.4366, "step": 4845 }, { "epoch": 3.155497722836695, "grad_norm": 1.7210590839385986, "learning_rate": 1.5480622067379176e-05, "loss": 0.4161, "step": 4850 }, { "epoch": 3.1587508132726088, "grad_norm": 1.7494614124298096, "learning_rate": 1.5432914190872757e-05, "loss": 0.428, "step": 4855 }, { "epoch": 3.162003903708523, "grad_norm": 1.7109274864196777, "learning_rate": 1.538524709728106e-05, "loss": 0.423, "step": 4860 }, { "epoch": 3.1652569941444373, "grad_norm": 2.6436519622802734, "learning_rate": 1.533762098980107e-05, "loss": 0.4308, "step": 4865 }, { "epoch": 3.168510084580351, "grad_norm": 6.52752161026001, "learning_rate": 1.5290036071455055e-05, "loss": 0.4425, "step": 4870 }, { "epoch": 3.1717631750162654, "grad_norm": 1.7809419631958008, "learning_rate": 1.5242492545089698e-05, "loss": 0.4444, "step": 4875 }, { "epoch": 3.1750162654521796, "grad_norm": 4.83860969543457, "learning_rate": 1.5194990613375253e-05, "loss": 0.4315, "step": 4880 }, { "epoch": 3.178269355888094, "grad_norm": 1.6642948389053345, "learning_rate": 1.5147530478804634e-05, "loss": 0.4348, "step": 4885 }, { "epoch": 3.1815224463240077, "grad_norm": 1.948080062866211, "learning_rate": 1.5100112343692604e-05, "loss": 0.4334, "step": 4890 }, { "epoch": 3.184775536759922, "grad_norm": 1.5025594234466553, "learning_rate": 1.5052736410174877e-05, "loss": 0.4436, "step": 4895 }, { "epoch": 3.1880286271958362, "grad_norm": 1.6708216667175293, "learning_rate": 1.5005402880207273e-05, "loss": 0.446, "step": 4900 }, { "epoch": 3.19128171763175, "grad_norm": 3.007356643676758, "learning_rate": 1.495811195556486e-05, "loss": 0.42, "step": 4905 }, { "epoch": 3.1945348080676643, "grad_norm": 1.6949506998062134, "learning_rate": 1.4910863837841068e-05, "loss": 0.4226, "step": 4910 }, { "epoch": 3.1977878985035786, "grad_norm": 2.3278913497924805, "learning_rate": 1.4863658728446864e-05, "loss": 0.4101, "step": 4915 }, { "epoch": 3.2010409889394924, "grad_norm": 1.6102315187454224, "learning_rate": 1.4816496828609878e-05, "loss": 0.4139, "step": 4920 }, { "epoch": 3.2042940793754067, "grad_norm": 1.5101537704467773, "learning_rate": 1.476937833937352e-05, "loss": 0.4361, "step": 4925 }, { "epoch": 3.207547169811321, "grad_norm": 1.7118359804153442, "learning_rate": 1.472230346159619e-05, "loss": 0.4197, "step": 4930 }, { "epoch": 3.2108002602472347, 
"grad_norm": 2.325700044631958, "learning_rate": 1.4675272395950345e-05, "loss": 0.4322, "step": 4935 }, { "epoch": 3.214053350683149, "grad_norm": 2.856618642807007, "learning_rate": 1.46282853429217e-05, "loss": 0.439, "step": 4940 }, { "epoch": 3.2173064411190633, "grad_norm": 1.6420326232910156, "learning_rate": 1.4581342502808321e-05, "loss": 0.4078, "step": 4945 }, { "epoch": 3.220559531554977, "grad_norm": 1.6206941604614258, "learning_rate": 1.4534444075719839e-05, "loss": 0.4031, "step": 4950 }, { "epoch": 3.2238126219908914, "grad_norm": 2.0076100826263428, "learning_rate": 1.4487590261576542e-05, "loss": 0.428, "step": 4955 }, { "epoch": 3.2270657124268056, "grad_norm": 1.8135625123977661, "learning_rate": 1.4440781260108521e-05, "loss": 0.4177, "step": 4960 }, { "epoch": 3.2303188028627194, "grad_norm": 1.494215488433838, "learning_rate": 1.4394017270854887e-05, "loss": 0.4233, "step": 4965 }, { "epoch": 3.2335718932986337, "grad_norm": 1.5636515617370605, "learning_rate": 1.4347298493162823e-05, "loss": 0.4354, "step": 4970 }, { "epoch": 3.236824983734548, "grad_norm": 1.34303879737854, "learning_rate": 1.4300625126186806e-05, "loss": 0.4313, "step": 4975 }, { "epoch": 3.240078074170462, "grad_norm": 1.4629467725753784, "learning_rate": 1.4253997368887717e-05, "loss": 0.4272, "step": 4980 }, { "epoch": 3.243331164606376, "grad_norm": 1.7214086055755615, "learning_rate": 1.4207415420032044e-05, "loss": 0.435, "step": 4985 }, { "epoch": 3.2465842550422903, "grad_norm": 1.5725369453430176, "learning_rate": 1.4160879478190974e-05, "loss": 0.4239, "step": 4990 }, { "epoch": 3.249837345478204, "grad_norm": 2.499541997909546, "learning_rate": 1.411438974173957e-05, "loss": 0.438, "step": 4995 }, { "epoch": 3.2530904359141184, "grad_norm": 1.4437288045883179, "learning_rate": 1.4067946408855953e-05, "loss": 0.427, "step": 5000 }, { "epoch": 3.2563435263500327, "grad_norm": 1.851952075958252, "learning_rate": 1.4021549677520415e-05, "loss": 0.4419, "step": 5005 }, { "epoch": 3.2595966167859465, "grad_norm": 1.586517572402954, "learning_rate": 1.3975199745514587e-05, "loss": 0.4193, "step": 5010 }, { "epoch": 3.2628497072218607, "grad_norm": 1.516689419746399, "learning_rate": 1.392889681042063e-05, "loss": 0.4273, "step": 5015 }, { "epoch": 3.266102797657775, "grad_norm": 1.5535813570022583, "learning_rate": 1.3882641069620339e-05, "loss": 0.4326, "step": 5020 }, { "epoch": 3.269355888093689, "grad_norm": 2.197828531265259, "learning_rate": 1.3836432720294329e-05, "loss": 0.4381, "step": 5025 }, { "epoch": 3.272608978529603, "grad_norm": 1.452176570892334, "learning_rate": 1.3790271959421219e-05, "loss": 0.426, "step": 5030 }, { "epoch": 3.2758620689655173, "grad_norm": 1.550866723060608, "learning_rate": 1.3744158983776733e-05, "loss": 0.4284, "step": 5035 }, { "epoch": 3.279115159401431, "grad_norm": 1.6225146055221558, "learning_rate": 1.3698093989932904e-05, "loss": 0.4319, "step": 5040 }, { "epoch": 3.2823682498373454, "grad_norm": 1.6624497175216675, "learning_rate": 1.3652077174257249e-05, "loss": 0.4291, "step": 5045 }, { "epoch": 3.2856213402732597, "grad_norm": 2.7444093227386475, "learning_rate": 1.3606108732911882e-05, "loss": 0.4292, "step": 5050 }, { "epoch": 3.288874430709174, "grad_norm": 1.5146102905273438, "learning_rate": 1.3560188861852702e-05, "loss": 0.4466, "step": 5055 }, { "epoch": 3.2921275211450878, "grad_norm": 1.6835765838623047, "learning_rate": 1.3514317756828587e-05, "loss": 0.4188, "step": 5060 }, { "epoch": 3.295380611581002, "grad_norm": 
1.7336374521255493, "learning_rate": 1.3468495613380533e-05, "loss": 0.4331, "step": 5065 }, { "epoch": 3.2986337020169163, "grad_norm": 1.7082960605621338, "learning_rate": 1.3422722626840791e-05, "loss": 0.4161, "step": 5070 }, { "epoch": 3.30188679245283, "grad_norm": 2.2675247192382812, "learning_rate": 1.3376998992332076e-05, "loss": 0.4111, "step": 5075 }, { "epoch": 3.3051398828887444, "grad_norm": 1.4138269424438477, "learning_rate": 1.3331324904766745e-05, "loss": 0.4244, "step": 5080 }, { "epoch": 3.3083929733246586, "grad_norm": 2.2490038871765137, "learning_rate": 1.328570055884592e-05, "loss": 0.4208, "step": 5085 }, { "epoch": 3.3116460637605725, "grad_norm": 1.6177328824996948, "learning_rate": 1.3240126149058685e-05, "loss": 0.4227, "step": 5090 }, { "epoch": 3.3148991541964867, "grad_norm": 1.7124465703964233, "learning_rate": 1.3194601869681272e-05, "loss": 0.4202, "step": 5095 }, { "epoch": 3.318152244632401, "grad_norm": 1.7719941139221191, "learning_rate": 1.3149127914776196e-05, "loss": 0.417, "step": 5100 }, { "epoch": 3.321405335068315, "grad_norm": 1.6123124361038208, "learning_rate": 1.3103704478191448e-05, "loss": 0.4398, "step": 5105 }, { "epoch": 3.324658425504229, "grad_norm": 1.724311113357544, "learning_rate": 1.3058331753559688e-05, "loss": 0.4456, "step": 5110 }, { "epoch": 3.3279115159401433, "grad_norm": 1.976486086845398, "learning_rate": 1.301300993429738e-05, "loss": 0.4207, "step": 5115 }, { "epoch": 3.331164606376057, "grad_norm": 1.5178265571594238, "learning_rate": 1.296773921360398e-05, "loss": 0.4138, "step": 5120 }, { "epoch": 3.3344176968119714, "grad_norm": 2.115887403488159, "learning_rate": 1.2922519784461154e-05, "loss": 0.4237, "step": 5125 }, { "epoch": 3.3376707872478857, "grad_norm": 1.584922194480896, "learning_rate": 1.2877351839631884e-05, "loss": 0.4331, "step": 5130 }, { "epoch": 3.3409238776837995, "grad_norm": 1.9146180152893066, "learning_rate": 1.283223557165969e-05, "loss": 0.4213, "step": 5135 }, { "epoch": 3.3441769681197138, "grad_norm": 2.934779644012451, "learning_rate": 1.2787171172867826e-05, "loss": 0.4304, "step": 5140 }, { "epoch": 3.347430058555628, "grad_norm": 1.7485958337783813, "learning_rate": 1.2742158835358412e-05, "loss": 0.4402, "step": 5145 }, { "epoch": 3.350683148991542, "grad_norm": 1.497227668762207, "learning_rate": 1.2697198751011641e-05, "loss": 0.4235, "step": 5150 }, { "epoch": 3.353936239427456, "grad_norm": 1.3970587253570557, "learning_rate": 1.2652291111484962e-05, "loss": 0.41, "step": 5155 }, { "epoch": 3.3571893298633704, "grad_norm": 2.238862991333008, "learning_rate": 1.2607436108212278e-05, "loss": 0.4398, "step": 5160 }, { "epoch": 3.360442420299284, "grad_norm": 4.999053478240967, "learning_rate": 1.256263393240309e-05, "loss": 0.4316, "step": 5165 }, { "epoch": 3.3636955107351985, "grad_norm": 2.9387760162353516, "learning_rate": 1.25178847750417e-05, "loss": 0.4391, "step": 5170 }, { "epoch": 3.3669486011711127, "grad_norm": 7.236008167266846, "learning_rate": 1.2473188826886428e-05, "loss": 0.4247, "step": 5175 }, { "epoch": 3.3702016916070265, "grad_norm": 1.8067480325698853, "learning_rate": 1.2428546278468753e-05, "loss": 0.4363, "step": 5180 }, { "epoch": 3.373454782042941, "grad_norm": 2.2515077590942383, "learning_rate": 1.2383957320092512e-05, "loss": 0.4218, "step": 5185 }, { "epoch": 3.376707872478855, "grad_norm": 2.5399014949798584, "learning_rate": 1.2339422141833127e-05, "loss": 0.4344, "step": 5190 }, { "epoch": 3.379960962914769, "grad_norm": 3.233678102493286, 
"learning_rate": 1.2294940933536725e-05, "loss": 0.4214, "step": 5195 }, { "epoch": 3.383214053350683, "grad_norm": 1.5409492254257202, "learning_rate": 1.2250513884819403e-05, "loss": 0.4225, "step": 5200 }, { "epoch": 3.3864671437865974, "grad_norm": 1.2861021757125854, "learning_rate": 1.2206141185066359e-05, "loss": 0.4268, "step": 5205 }, { "epoch": 3.3897202342225112, "grad_norm": 1.417588472366333, "learning_rate": 1.2161823023431133e-05, "loss": 0.4188, "step": 5210 }, { "epoch": 3.3929733246584255, "grad_norm": 3.5109989643096924, "learning_rate": 1.2117559588834757e-05, "loss": 0.4243, "step": 5215 }, { "epoch": 3.3962264150943398, "grad_norm": 6.47197151184082, "learning_rate": 1.207335106996497e-05, "loss": 0.4229, "step": 5220 }, { "epoch": 3.3994795055302536, "grad_norm": 1.6867964267730713, "learning_rate": 1.2029197655275442e-05, "loss": 0.4287, "step": 5225 }, { "epoch": 3.402732595966168, "grad_norm": 1.575161099433899, "learning_rate": 1.1985099532984917e-05, "loss": 0.4304, "step": 5230 }, { "epoch": 3.405985686402082, "grad_norm": 1.444969654083252, "learning_rate": 1.1941056891076432e-05, "loss": 0.4325, "step": 5235 }, { "epoch": 3.409238776837996, "grad_norm": 1.6448862552642822, "learning_rate": 1.1897069917296555e-05, "loss": 0.4358, "step": 5240 }, { "epoch": 3.41249186727391, "grad_norm": 1.5217825174331665, "learning_rate": 1.1853138799154514e-05, "loss": 0.4404, "step": 5245 }, { "epoch": 3.4157449577098244, "grad_norm": 10.740922927856445, "learning_rate": 1.1809263723921438e-05, "loss": 0.4421, "step": 5250 }, { "epoch": 3.4189980481457383, "grad_norm": 2.3015806674957275, "learning_rate": 1.1765444878629583e-05, "loss": 0.4376, "step": 5255 }, { "epoch": 3.4222511385816525, "grad_norm": 1.4035224914550781, "learning_rate": 1.1721682450071476e-05, "loss": 0.453, "step": 5260 }, { "epoch": 3.425504229017567, "grad_norm": 1.4976873397827148, "learning_rate": 1.167797662479915e-05, "loss": 0.4411, "step": 5265 }, { "epoch": 3.4287573194534806, "grad_norm": 1.4919233322143555, "learning_rate": 1.1634327589123373e-05, "loss": 0.4357, "step": 5270 }, { "epoch": 3.432010409889395, "grad_norm": 1.5440754890441895, "learning_rate": 1.1590735529112806e-05, "loss": 0.4288, "step": 5275 }, { "epoch": 3.435263500325309, "grad_norm": 1.637131690979004, "learning_rate": 1.1547200630593224e-05, "loss": 0.4379, "step": 5280 }, { "epoch": 3.438516590761223, "grad_norm": 1.525527834892273, "learning_rate": 1.1503723079146766e-05, "loss": 0.4158, "step": 5285 }, { "epoch": 3.441769681197137, "grad_norm": 1.5343090295791626, "learning_rate": 1.1460303060111083e-05, "loss": 0.4128, "step": 5290 }, { "epoch": 3.4450227716330515, "grad_norm": 1.7940443754196167, "learning_rate": 1.1416940758578567e-05, "loss": 0.4389, "step": 5295 }, { "epoch": 3.4482758620689653, "grad_norm": 1.8386902809143066, "learning_rate": 1.137363635939561e-05, "loss": 0.4537, "step": 5300 }, { "epoch": 3.4515289525048796, "grad_norm": 1.498874306678772, "learning_rate": 1.1330390047161729e-05, "loss": 0.4297, "step": 5305 }, { "epoch": 3.454782042940794, "grad_norm": 1.4000130891799927, "learning_rate": 1.1287202006228858e-05, "loss": 0.4235, "step": 5310 }, { "epoch": 3.4580351333767076, "grad_norm": 6.055357933044434, "learning_rate": 1.1244072420700502e-05, "loss": 0.4282, "step": 5315 }, { "epoch": 3.461288223812622, "grad_norm": 1.7397581338882446, "learning_rate": 1.1201001474431022e-05, "loss": 0.4053, "step": 5320 }, { "epoch": 3.464541314248536, "grad_norm": 1.7397915124893188, "learning_rate": 
1.1157989351024767e-05, "loss": 0.4213, "step": 5325 }, { "epoch": 3.46779440468445, "grad_norm": 1.6159850358963013, "learning_rate": 1.1115036233835349e-05, "loss": 0.4241, "step": 5330 }, { "epoch": 3.4710474951203643, "grad_norm": 1.4938251972198486, "learning_rate": 1.1072142305964855e-05, "loss": 0.425, "step": 5335 }, { "epoch": 3.4743005855562785, "grad_norm": 2.087301015853882, "learning_rate": 1.102930775026306e-05, "loss": 0.4224, "step": 5340 }, { "epoch": 3.4775536759921923, "grad_norm": 1.5332581996917725, "learning_rate": 1.098653274932662e-05, "loss": 0.4281, "step": 5345 }, { "epoch": 3.4808067664281066, "grad_norm": 1.481587529182434, "learning_rate": 1.094381748549835e-05, "loss": 0.4083, "step": 5350 }, { "epoch": 3.484059856864021, "grad_norm": 1.8097538948059082, "learning_rate": 1.0901162140866395e-05, "loss": 0.4231, "step": 5355 }, { "epoch": 3.487312947299935, "grad_norm": 1.6291629076004028, "learning_rate": 1.0858566897263475e-05, "loss": 0.4298, "step": 5360 }, { "epoch": 3.490566037735849, "grad_norm": 1.3684316873550415, "learning_rate": 1.081603193626611e-05, "loss": 0.4045, "step": 5365 }, { "epoch": 3.493819128171763, "grad_norm": 1.9993515014648438, "learning_rate": 1.0773557439193865e-05, "loss": 0.4223, "step": 5370 }, { "epoch": 3.4970722186076775, "grad_norm": 1.364740014076233, "learning_rate": 1.0731143587108533e-05, "loss": 0.4472, "step": 5375 }, { "epoch": 3.5003253090435913, "grad_norm": 1.4979236125946045, "learning_rate": 1.0688790560813388e-05, "loss": 0.4232, "step": 5380 }, { "epoch": 3.5035783994795056, "grad_norm": 1.574204921722412, "learning_rate": 1.064649854085244e-05, "loss": 0.4228, "step": 5385 }, { "epoch": 3.5068314899154194, "grad_norm": 2.251814842224121, "learning_rate": 1.0604267707509608e-05, "loss": 0.4091, "step": 5390 }, { "epoch": 3.5100845803513336, "grad_norm": 2.445500135421753, "learning_rate": 1.0562098240807989e-05, "loss": 0.4257, "step": 5395 }, { "epoch": 3.513337670787248, "grad_norm": 1.3687506914138794, "learning_rate": 1.0519990320509104e-05, "loss": 0.4132, "step": 5400 }, { "epoch": 3.516590761223162, "grad_norm": 1.6088290214538574, "learning_rate": 1.0477944126112097e-05, "loss": 0.4151, "step": 5405 }, { "epoch": 3.519843851659076, "grad_norm": 1.3225458860397339, "learning_rate": 1.0435959836852967e-05, "loss": 0.4173, "step": 5410 }, { "epoch": 3.5230969420949902, "grad_norm": 2.5459144115448, "learning_rate": 1.0394037631703867e-05, "loss": 0.4344, "step": 5415 }, { "epoch": 3.5263500325309045, "grad_norm": 1.7091197967529297, "learning_rate": 1.0352177689372256e-05, "loss": 0.4328, "step": 5420 }, { "epoch": 3.5296031229668183, "grad_norm": 1.490488052368164, "learning_rate": 1.0310380188300178e-05, "loss": 0.4153, "step": 5425 }, { "epoch": 3.5328562134027326, "grad_norm": 8.786995887756348, "learning_rate": 1.0268645306663532e-05, "loss": 0.4466, "step": 5430 }, { "epoch": 3.536109303838647, "grad_norm": 2.5409798622131348, "learning_rate": 1.0226973222371253e-05, "loss": 0.4174, "step": 5435 }, { "epoch": 3.5393623942745607, "grad_norm": 2.2137346267700195, "learning_rate": 1.0185364113064577e-05, "loss": 0.4296, "step": 5440 }, { "epoch": 3.542615484710475, "grad_norm": 1.8226664066314697, "learning_rate": 1.0143818156116323e-05, "loss": 0.4076, "step": 5445 }, { "epoch": 3.545868575146389, "grad_norm": 1.7435983419418335, "learning_rate": 1.0102335528630061e-05, "loss": 0.4241, "step": 5450 }, { "epoch": 3.5491216655823035, "grad_norm": 2.1171414852142334, "learning_rate": 
1.0060916407439413e-05, "loss": 0.4094, "step": 5455 }, { "epoch": 3.5523747560182173, "grad_norm": 1.3854146003723145, "learning_rate": 1.0019560969107302e-05, "loss": 0.4165, "step": 5460 }, { "epoch": 3.5556278464541315, "grad_norm": 1.7370872497558594, "learning_rate": 9.978269389925157e-06, "loss": 0.4179, "step": 5465 }, { "epoch": 3.558880936890046, "grad_norm": 2.7952463626861572, "learning_rate": 9.937041845912188e-06, "loss": 0.4366, "step": 5470 }, { "epoch": 3.5621340273259596, "grad_norm": 2.453916311264038, "learning_rate": 9.895878512814647e-06, "loss": 0.4105, "step": 5475 }, { "epoch": 3.565387117761874, "grad_norm": 3.428210973739624, "learning_rate": 9.854779566105068e-06, "loss": 0.4235, "step": 5480 }, { "epoch": 3.568640208197788, "grad_norm": 1.5725935697555542, "learning_rate": 9.813745180981502e-06, "loss": 0.4165, "step": 5485 }, { "epoch": 3.571893298633702, "grad_norm": 1.8538109064102173, "learning_rate": 9.772775532366774e-06, "loss": 0.4427, "step": 5490 }, { "epoch": 3.5751463890696162, "grad_norm": 2.118178606033325, "learning_rate": 9.731870794907789e-06, "loss": 0.4299, "step": 5495 }, { "epoch": 3.5783994795055305, "grad_norm": 6.738338470458984, "learning_rate": 9.691031142974707e-06, "loss": 0.433, "step": 5500 }, { "epoch": 3.5816525699414443, "grad_norm": 2.768328905105591, "learning_rate": 9.65025675066025e-06, "loss": 0.4251, "step": 5505 }, { "epoch": 3.5849056603773586, "grad_norm": 1.6791824102401733, "learning_rate": 9.609547791778964e-06, "loss": 0.4195, "step": 5510 }, { "epoch": 3.588158750813273, "grad_norm": 1.43679940700531, "learning_rate": 9.568904439866444e-06, "loss": 0.4125, "step": 5515 }, { "epoch": 3.5914118412491867, "grad_norm": 1.60779869556427, "learning_rate": 9.528326868178616e-06, "loss": 0.4309, "step": 5520 }, { "epoch": 3.594664931685101, "grad_norm": 2.161658763885498, "learning_rate": 9.487815249691012e-06, "loss": 0.4391, "step": 5525 }, { "epoch": 3.597918022121015, "grad_norm": 1.7440755367279053, "learning_rate": 9.447369757098002e-06, "loss": 0.4046, "step": 5530 }, { "epoch": 3.601171112556929, "grad_norm": 1.9223883152008057, "learning_rate": 9.406990562812068e-06, "loss": 0.3904, "step": 5535 }, { "epoch": 3.6044242029928433, "grad_norm": 1.7541393041610718, "learning_rate": 9.366677838963078e-06, "loss": 0.4238, "step": 5540 }, { "epoch": 3.6076772934287575, "grad_norm": 1.4891860485076904, "learning_rate": 9.32643175739756e-06, "loss": 0.4173, "step": 5545 }, { "epoch": 3.6109303838646714, "grad_norm": 2.7674455642700195, "learning_rate": 9.286252489677944e-06, "loss": 0.4033, "step": 5550 }, { "epoch": 3.6141834743005856, "grad_norm": 2.088984727859497, "learning_rate": 9.246140207081833e-06, "loss": 0.4468, "step": 5555 }, { "epoch": 3.6174365647365, "grad_norm": 1.516174554824829, "learning_rate": 9.206095080601319e-06, "loss": 0.4194, "step": 5560 }, { "epoch": 3.6206896551724137, "grad_norm": 2.387362241744995, "learning_rate": 9.16611728094218e-06, "loss": 0.4142, "step": 5565 }, { "epoch": 3.623942745608328, "grad_norm": 1.8257601261138916, "learning_rate": 9.126206978523202e-06, "loss": 0.4221, "step": 5570 }, { "epoch": 3.6271958360442422, "grad_norm": 1.815144419670105, "learning_rate": 9.086364343475461e-06, "loss": 0.4356, "step": 5575 }, { "epoch": 3.630448926480156, "grad_norm": 1.5288091897964478, "learning_rate": 9.04658954564156e-06, "loss": 0.4077, "step": 5580 }, { "epoch": 3.6337020169160703, "grad_norm": 1.570156455039978, "learning_rate": 9.006882754574914e-06, "loss": 0.4136, "step": 
5585 }, { "epoch": 3.6369551073519846, "grad_norm": 2.161524772644043, "learning_rate": 8.967244139539064e-06, "loss": 0.4204, "step": 5590 }, { "epoch": 3.6402081977878984, "grad_norm": 1.590711236000061, "learning_rate": 8.927673869506905e-06, "loss": 0.4133, "step": 5595 }, { "epoch": 3.6434612882238127, "grad_norm": 1.5367971658706665, "learning_rate": 8.888172113159989e-06, "loss": 0.4152, "step": 5600 }, { "epoch": 3.646714378659727, "grad_norm": 1.5242828130722046, "learning_rate": 8.848739038887822e-06, "loss": 0.4128, "step": 5605 }, { "epoch": 3.6499674690956407, "grad_norm": 1.4691710472106934, "learning_rate": 8.809374814787124e-06, "loss": 0.4247, "step": 5610 }, { "epoch": 3.653220559531555, "grad_norm": 3.0192527770996094, "learning_rate": 8.770079608661108e-06, "loss": 0.4279, "step": 5615 }, { "epoch": 3.6564736499674693, "grad_norm": 1.5646493434906006, "learning_rate": 8.730853588018772e-06, "loss": 0.4055, "step": 5620 }, { "epoch": 3.659726740403383, "grad_norm": 1.502172589302063, "learning_rate": 8.691696920074214e-06, "loss": 0.4314, "step": 5625 }, { "epoch": 3.6629798308392973, "grad_norm": 1.4211641550064087, "learning_rate": 8.652609771745862e-06, "loss": 0.4218, "step": 5630 }, { "epoch": 3.6662329212752116, "grad_norm": 1.905414342880249, "learning_rate": 8.613592309655804e-06, "loss": 0.4178, "step": 5635 }, { "epoch": 3.6694860117111254, "grad_norm": 2.1291544437408447, "learning_rate": 8.574644700129087e-06, "loss": 0.4224, "step": 5640 }, { "epoch": 3.6727391021470397, "grad_norm": 1.6421247720718384, "learning_rate": 8.535767109192955e-06, "loss": 0.4378, "step": 5645 }, { "epoch": 3.675992192582954, "grad_norm": 1.5527359247207642, "learning_rate": 8.496959702576187e-06, "loss": 0.4242, "step": 5650 }, { "epoch": 3.6792452830188678, "grad_norm": 2.3626434803009033, "learning_rate": 8.458222645708394e-06, "loss": 0.4187, "step": 5655 }, { "epoch": 3.682498373454782, "grad_norm": 1.3949254751205444, "learning_rate": 8.419556103719279e-06, "loss": 0.4094, "step": 5660 }, { "epoch": 3.6857514638906963, "grad_norm": 1.7462977170944214, "learning_rate": 8.380960241437947e-06, "loss": 0.4228, "step": 5665 }, { "epoch": 3.68900455432661, "grad_norm": 1.5994423627853394, "learning_rate": 8.342435223392232e-06, "loss": 0.4216, "step": 5670 }, { "epoch": 3.6922576447625244, "grad_norm": 2.040391206741333, "learning_rate": 8.303981213807947e-06, "loss": 0.419, "step": 5675 }, { "epoch": 3.6955107351984386, "grad_norm": 1.5465586185455322, "learning_rate": 8.265598376608211e-06, "loss": 0.4105, "step": 5680 }, { "epoch": 3.6987638256343525, "grad_norm": 1.965409278869629, "learning_rate": 8.227286875412766e-06, "loss": 0.4056, "step": 5685 }, { "epoch": 3.7020169160702667, "grad_norm": 1.9103503227233887, "learning_rate": 8.189046873537237e-06, "loss": 0.4142, "step": 5690 }, { "epoch": 3.705270006506181, "grad_norm": 1.6853419542312622, "learning_rate": 8.150878533992458e-06, "loss": 0.4421, "step": 5695 }, { "epoch": 3.708523096942095, "grad_norm": 1.7083536386489868, "learning_rate": 8.112782019483813e-06, "loss": 0.4175, "step": 5700 }, { "epoch": 3.711776187378009, "grad_norm": 1.8322556018829346, "learning_rate": 8.074757492410471e-06, "loss": 0.4202, "step": 5705 }, { "epoch": 3.7150292778139233, "grad_norm": 1.8045029640197754, "learning_rate": 8.036805114864736e-06, "loss": 0.4232, "step": 5710 }, { "epoch": 3.718282368249837, "grad_norm": 1.9806746244430542, "learning_rate": 7.998925048631362e-06, "loss": 0.4176, "step": 5715 }, { "epoch": 
3.7215354586857514, "grad_norm": 1.4905263185501099, "learning_rate": 7.96111745518685e-06, "loss": 0.422, "step": 5720 }, { "epoch": 3.7247885491216657, "grad_norm": 1.4577668905258179, "learning_rate": 7.923382495698758e-06, "loss": 0.406, "step": 5725 }, { "epoch": 3.7280416395575795, "grad_norm": 2.971972942352295, "learning_rate": 7.88572033102501e-06, "loss": 0.4168, "step": 5730 }, { "epoch": 3.7312947299934938, "grad_norm": 1.7888929843902588, "learning_rate": 7.848131121713234e-06, "loss": 0.4397, "step": 5735 }, { "epoch": 3.734547820429408, "grad_norm": 1.5038715600967407, "learning_rate": 7.810615028000045e-06, "loss": 0.4017, "step": 5740 }, { "epoch": 3.737800910865322, "grad_norm": 1.679883599281311, "learning_rate": 7.773172209810397e-06, "loss": 0.4221, "step": 5745 }, { "epoch": 3.741054001301236, "grad_norm": 1.669687271118164, "learning_rate": 7.735802826756856e-06, "loss": 0.408, "step": 5750 }, { "epoch": 3.7443070917371504, "grad_norm": 2.8292086124420166, "learning_rate": 7.698507038138978e-06, "loss": 0.4408, "step": 5755 }, { "epoch": 3.747560182173064, "grad_norm": 1.6891485452651978, "learning_rate": 7.661285002942572e-06, "loss": 0.4202, "step": 5760 }, { "epoch": 3.7508132726089785, "grad_norm": 18.631166458129883, "learning_rate": 7.624136879839053e-06, "loss": 0.4368, "step": 5765 }, { "epoch": 3.7540663630448927, "grad_norm": 2.31217622756958, "learning_rate": 7.5870628271847765e-06, "loss": 0.4099, "step": 5770 }, { "epoch": 3.7573194534808065, "grad_norm": 1.7663567066192627, "learning_rate": 7.550063003020333e-06, "loss": 0.4242, "step": 5775 }, { "epoch": 3.760572543916721, "grad_norm": 1.651605486869812, "learning_rate": 7.5131375650698835e-06, "loss": 0.428, "step": 5780 }, { "epoch": 3.763825634352635, "grad_norm": 1.542211890220642, "learning_rate": 7.476286670740518e-06, "loss": 0.4163, "step": 5785 }, { "epoch": 3.767078724788549, "grad_norm": 1.7693531513214111, "learning_rate": 7.439510477121536e-06, "loss": 0.4192, "step": 5790 }, { "epoch": 3.770331815224463, "grad_norm": 2.135603666305542, "learning_rate": 7.402809140983799e-06, "loss": 0.4155, "step": 5795 }, { "epoch": 3.7735849056603774, "grad_norm": 2.1200711727142334, "learning_rate": 7.366182818779088e-06, "loss": 0.4056, "step": 5800 }, { "epoch": 3.7768379960962912, "grad_norm": 2.6680068969726562, "learning_rate": 7.329631666639392e-06, "loss": 0.4124, "step": 5805 }, { "epoch": 3.7800910865322055, "grad_norm": 2.2767679691314697, "learning_rate": 7.2931558403762535e-06, "loss": 0.3993, "step": 5810 }, { "epoch": 3.7833441769681198, "grad_norm": 1.7897844314575195, "learning_rate": 7.256755495480141e-06, "loss": 0.3827, "step": 5815 }, { "epoch": 3.7865972674040336, "grad_norm": 1.7870675325393677, "learning_rate": 7.220430787119742e-06, "loss": 0.4199, "step": 5820 }, { "epoch": 3.789850357839948, "grad_norm": 1.4809794425964355, "learning_rate": 7.184181870141307e-06, "loss": 0.4056, "step": 5825 }, { "epoch": 3.793103448275862, "grad_norm": 2.3490641117095947, "learning_rate": 7.148008899068029e-06, "loss": 0.4084, "step": 5830 }, { "epoch": 3.796356538711776, "grad_norm": 2.2586324214935303, "learning_rate": 7.1119120280993295e-06, "loss": 0.4125, "step": 5835 }, { "epoch": 3.79960962914769, "grad_norm": 3.2746353149414062, "learning_rate": 7.0758914111102335e-06, "loss": 0.3964, "step": 5840 }, { "epoch": 3.8028627195836044, "grad_norm": 1.9759531021118164, "learning_rate": 7.039947201650726e-06, "loss": 0.4151, "step": 5845 }, { "epoch": 3.8061158100195187, "grad_norm": 
1.395560622215271, "learning_rate": 7.004079552945062e-06, "loss": 0.415, "step": 5850 }, { "epoch": 3.8093689004554325, "grad_norm": 1.6155577898025513, "learning_rate": 6.968288617891116e-06, "loss": 0.4093, "step": 5855 }, { "epoch": 3.812621990891347, "grad_norm": 1.647705078125, "learning_rate": 6.932574549059789e-06, "loss": 0.4033, "step": 5860 }, { "epoch": 3.815875081327261, "grad_norm": 1.5718135833740234, "learning_rate": 6.8969374986942784e-06, "loss": 0.4281, "step": 5865 }, { "epoch": 3.819128171763175, "grad_norm": 6.746004581451416, "learning_rate": 6.861377618709466e-06, "loss": 0.4199, "step": 5870 }, { "epoch": 3.822381262199089, "grad_norm": 1.6057871580123901, "learning_rate": 6.825895060691273e-06, "loss": 0.4059, "step": 5875 }, { "epoch": 3.8256343526350034, "grad_norm": 1.5333271026611328, "learning_rate": 6.790489975896033e-06, "loss": 0.4311, "step": 5880 }, { "epoch": 3.828887443070917, "grad_norm": 4.261138439178467, "learning_rate": 6.755162515249799e-06, "loss": 0.3987, "step": 5885 }, { "epoch": 3.8321405335068315, "grad_norm": 1.475321650505066, "learning_rate": 6.719912829347716e-06, "loss": 0.424, "step": 5890 }, { "epoch": 3.8353936239427457, "grad_norm": 1.6494297981262207, "learning_rate": 6.6847410684534235e-06, "loss": 0.4367, "step": 5895 }, { "epoch": 3.8386467143786596, "grad_norm": 1.8216992616653442, "learning_rate": 6.649647382498345e-06, "loss": 0.4201, "step": 5900 }, { "epoch": 3.841899804814574, "grad_norm": 1.7172224521636963, "learning_rate": 6.6146319210810935e-06, "loss": 0.4279, "step": 5905 }, { "epoch": 3.845152895250488, "grad_norm": 2.8617303371429443, "learning_rate": 6.579694833466843e-06, "loss": 0.4219, "step": 5910 }, { "epoch": 3.8484059856864024, "grad_norm": 1.705623984336853, "learning_rate": 6.5448362685866485e-06, "loss": 0.4085, "step": 5915 }, { "epoch": 3.851659076122316, "grad_norm": 1.7146087884902954, "learning_rate": 6.510056375036841e-06, "loss": 0.4089, "step": 5920 }, { "epoch": 3.8549121665582304, "grad_norm": 1.4763227701187134, "learning_rate": 6.47535530107839e-06, "loss": 0.408, "step": 5925 }, { "epoch": 3.8581652569941447, "grad_norm": 2.6685636043548584, "learning_rate": 6.440733194636281e-06, "loss": 0.4216, "step": 5930 }, { "epoch": 3.8614183474300585, "grad_norm": 1.5533685684204102, "learning_rate": 6.406190203298859e-06, "loss": 0.4182, "step": 5935 }, { "epoch": 3.864671437865973, "grad_norm": 1.7280486822128296, "learning_rate": 6.3717264743172134e-06, "loss": 0.415, "step": 5940 }, { "epoch": 3.867924528301887, "grad_norm": 1.7348867654800415, "learning_rate": 6.337342154604573e-06, "loss": 0.4046, "step": 5945 }, { "epoch": 3.871177618737801, "grad_norm": 1.730487585067749, "learning_rate": 6.303037390735634e-06, "loss": 0.4186, "step": 5950 }, { "epoch": 3.874430709173715, "grad_norm": 1.4803285598754883, "learning_rate": 6.268812328945961e-06, "loss": 0.4071, "step": 5955 }, { "epoch": 3.8776837996096294, "grad_norm": 1.5369083881378174, "learning_rate": 6.234667115131382e-06, "loss": 0.4205, "step": 5960 }, { "epoch": 3.880936890045543, "grad_norm": 2.9555954933166504, "learning_rate": 6.200601894847324e-06, "loss": 0.4054, "step": 5965 }, { "epoch": 3.8841899804814575, "grad_norm": 1.5368090867996216, "learning_rate": 6.166616813308221e-06, "loss": 0.3972, "step": 5970 }, { "epoch": 3.8874430709173717, "grad_norm": 1.577239990234375, "learning_rate": 6.132712015386902e-06, "loss": 0.4253, "step": 5975 }, { "epoch": 3.8906961613532856, "grad_norm": 1.5454272031784058, "learning_rate": 
6.098887645613943e-06, "loss": 0.4081, "step": 5980 }, { "epoch": 3.8939492517892, "grad_norm": 1.506635069847107, "learning_rate": 6.065143848177066e-06, "loss": 0.4041, "step": 5985 }, { "epoch": 3.897202342225114, "grad_norm": 5.971263408660889, "learning_rate": 6.03148076692055e-06, "loss": 0.4031, "step": 5990 }, { "epoch": 3.900455432661028, "grad_norm": 2.4734601974487305, "learning_rate": 5.997898545344571e-06, "loss": 0.43, "step": 5995 }, { "epoch": 3.903708523096942, "grad_norm": 2.6702654361724854, "learning_rate": 5.9643973266046145e-06, "loss": 0.416, "step": 6000 }, { "epoch": 3.9069616135328564, "grad_norm": 1.439784288406372, "learning_rate": 5.930977253510886e-06, "loss": 0.4059, "step": 6005 }, { "epoch": 3.9102147039687702, "grad_norm": 1.8066974878311157, "learning_rate": 5.897638468527653e-06, "loss": 0.4147, "step": 6010 }, { "epoch": 3.9134677944046845, "grad_norm": 1.603287935256958, "learning_rate": 5.864381113772685e-06, "loss": 0.4168, "step": 6015 }, { "epoch": 3.9167208848405988, "grad_norm": 1.6566202640533447, "learning_rate": 5.831205331016612e-06, "loss": 0.4095, "step": 6020 }, { "epoch": 3.9199739752765126, "grad_norm": 2.173736095428467, "learning_rate": 5.798111261682357e-06, "loss": 0.3955, "step": 6025 }, { "epoch": 3.923227065712427, "grad_norm": 1.4487274885177612, "learning_rate": 5.765099046844491e-06, "loss": 0.4048, "step": 6030 }, { "epoch": 3.926480156148341, "grad_norm": 1.4893627166748047, "learning_rate": 5.7321688272286596e-06, "loss": 0.3908, "step": 6035 }, { "epoch": 3.929733246584255, "grad_norm": 1.9365359544754028, "learning_rate": 5.699320743210984e-06, "loss": 0.4141, "step": 6040 }, { "epoch": 3.932986337020169, "grad_norm": 1.6186310052871704, "learning_rate": 5.666554934817447e-06, "loss": 0.3887, "step": 6045 }, { "epoch": 3.9362394274560835, "grad_norm": 7.201216697692871, "learning_rate": 5.633871541723295e-06, "loss": 0.4115, "step": 6050 }, { "epoch": 3.9394925178919973, "grad_norm": 1.8057231903076172, "learning_rate": 5.601270703252481e-06, "loss": 0.3989, "step": 6055 }, { "epoch": 3.9427456083279115, "grad_norm": 2.119253158569336, "learning_rate": 5.5687525583770135e-06, "loss": 0.4295, "step": 6060 }, { "epoch": 3.945998698763826, "grad_norm": 1.8245245218276978, "learning_rate": 5.536317245716391e-06, "loss": 0.4213, "step": 6065 }, { "epoch": 3.9492517891997396, "grad_norm": 1.69149649143219, "learning_rate": 5.503964903537037e-06, "loss": 0.4128, "step": 6070 }, { "epoch": 3.952504879635654, "grad_norm": 1.5030890703201294, "learning_rate": 5.471695669751664e-06, "loss": 0.3899, "step": 6075 }, { "epoch": 3.955757970071568, "grad_norm": 1.764691710472107, "learning_rate": 5.439509681918703e-06, "loss": 0.4024, "step": 6080 }, { "epoch": 3.959011060507482, "grad_norm": 1.7484804391860962, "learning_rate": 5.407407077241749e-06, "loss": 0.4088, "step": 6085 }, { "epoch": 3.9622641509433962, "grad_norm": 1.4906771183013916, "learning_rate": 5.381785121346411e-06, "loss": 0.4077, "step": 6090 }, { "epoch": 3.9655172413793105, "grad_norm": 2.1942009925842285, "learning_rate": 5.349832950968298e-06, "loss": 0.4025, "step": 6095 }, { "epoch": 3.9687703318152243, "grad_norm": 1.9535316228866577, "learning_rate": 5.3179645460233574e-06, "loss": 0.4079, "step": 6100 }, { "epoch": 3.9720234222511386, "grad_norm": 1.6382977962493896, "learning_rate": 5.286180042361361e-06, "loss": 0.4072, "step": 6105 }, { "epoch": 3.975276512687053, "grad_norm": 2.194603204727173, "learning_rate": 5.254479575474411e-06, "loss": 0.4147, 
"step": 6110 }, { "epoch": 3.9785296031229667, "grad_norm": 2.237362861633301, "learning_rate": 5.222863280496406e-06, "loss": 0.4174, "step": 6115 }, { "epoch": 3.981782693558881, "grad_norm": 1.5425729751586914, "learning_rate": 5.191331292202409e-06, "loss": 0.3867, "step": 6120 }, { "epoch": 3.985035783994795, "grad_norm": 3.286719799041748, "learning_rate": 5.159883745008099e-06, "loss": 0.4134, "step": 6125 }, { "epoch": 3.988288874430709, "grad_norm": 1.8072408437728882, "learning_rate": 5.1285207729692146e-06, "loss": 0.409, "step": 6130 }, { "epoch": 3.9915419648666233, "grad_norm": 2.3535733222961426, "learning_rate": 5.097242509780945e-06, "loss": 0.3986, "step": 6135 }, { "epoch": 3.9947950553025375, "grad_norm": 2.018533706665039, "learning_rate": 5.06604908877738e-06, "loss": 0.4115, "step": 6140 }, { "epoch": 3.9980481457384514, "grad_norm": 1.9201387166976929, "learning_rate": 5.03494064293096e-06, "loss": 0.4206, "step": 6145 }, { "epoch": 4.0, "eval_f1": 0.8072647932831747, "eval_loss": 0.55810546875, "eval_precision": 0.8068495156761535, "eval_recall": 0.8078495084242743, "eval_runtime": 352.5087, "eval_samples_per_second": 1116.1, "eval_steps_per_second": 1.092, "step": 6148 }, { "epoch": 4.001301236174366, "grad_norm": 1.1841827630996704, "learning_rate": 5.003917304851868e-06, "loss": 0.3531, "step": 6150 }, { "epoch": 4.00455432661028, "grad_norm": 1.870800495147705, "learning_rate": 4.972979206787503e-06, "loss": 0.2658, "step": 6155 }, { "epoch": 4.007807417046194, "grad_norm": 6.779404163360596, "learning_rate": 4.9421264806218865e-06, "loss": 0.2545, "step": 6160 }, { "epoch": 4.011060507482108, "grad_norm": 1.2459709644317627, "learning_rate": 4.911359257875131e-06, "loss": 0.2436, "step": 6165 }, { "epoch": 4.014313597918022, "grad_norm": 1.5694559812545776, "learning_rate": 4.880677669702846e-06, "loss": 0.2433, "step": 6170 }, { "epoch": 4.017566688353936, "grad_norm": 1.46892249584198, "learning_rate": 4.850081846895591e-06, "loss": 0.2388, "step": 6175 }, { "epoch": 4.020819778789851, "grad_norm": 32.99214553833008, "learning_rate": 4.819571919878346e-06, "loss": 0.2353, "step": 6180 }, { "epoch": 4.024072869225765, "grad_norm": 1.623369574546814, "learning_rate": 4.78914801870991e-06, "loss": 0.2389, "step": 6185 }, { "epoch": 4.027325959661678, "grad_norm": 1.8111308813095093, "learning_rate": 4.7588102730823676e-06, "loss": 0.2362, "step": 6190 }, { "epoch": 4.030579050097593, "grad_norm": 1.494247555732727, "learning_rate": 4.7285588123205546e-06, "loss": 0.2411, "step": 6195 }, { "epoch": 4.033832140533507, "grad_norm": 1.8853955268859863, "learning_rate": 4.698393765381473e-06, "loss": 0.2434, "step": 6200 }, { "epoch": 4.037085230969421, "grad_norm": 2.163872241973877, "learning_rate": 4.668315260853753e-06, "loss": 0.2634, "step": 6205 }, { "epoch": 4.040338321405335, "grad_norm": 2.3891079425811768, "learning_rate": 4.6383234269571305e-06, "loss": 0.2346, "step": 6210 }, { "epoch": 4.043591411841249, "grad_norm": 3.2441794872283936, "learning_rate": 4.608418391541861e-06, "loss": 0.25, "step": 6215 }, { "epoch": 4.046844502277163, "grad_norm": 2.0283098220825195, "learning_rate": 4.578600282088186e-06, "loss": 0.241, "step": 6220 }, { "epoch": 4.050097592713078, "grad_norm": 1.6591633558273315, "learning_rate": 4.548869225705821e-06, "loss": 0.2366, "step": 6225 }, { "epoch": 4.053350683148992, "grad_norm": 1.9975348711013794, "learning_rate": 4.5192253491333656e-06, "loss": 0.2287, "step": 6230 }, { "epoch": 4.056603773584905, "grad_norm": 
1.8785626888275146, "learning_rate": 4.489668778737793e-06, "loss": 0.2329, "step": 6235 }, { "epoch": 4.05985686402082, "grad_norm": 1.7227510213851929, "learning_rate": 4.460199640513912e-06, "loss": 0.2456, "step": 6240 }, { "epoch": 4.063109954456734, "grad_norm": 2.2654736042022705, "learning_rate": 4.430818060083816e-06, "loss": 0.2271, "step": 6245 }, { "epoch": 4.066363044892648, "grad_norm": 1.5479950904846191, "learning_rate": 4.4015241626963436e-06, "loss": 0.2404, "step": 6250 }, { "epoch": 4.0696161353285625, "grad_norm": 2.179954767227173, "learning_rate": 4.372318073226583e-06, "loss": 0.2358, "step": 6255 }, { "epoch": 4.072869225764476, "grad_norm": 1.5730780363082886, "learning_rate": 4.343199916175284e-06, "loss": 0.2256, "step": 6260 }, { "epoch": 4.07612231620039, "grad_norm": 1.6954352855682373, "learning_rate": 4.3141698156683645e-06, "loss": 0.2302, "step": 6265 }, { "epoch": 4.079375406636305, "grad_norm": 2.26914644241333, "learning_rate": 4.285227895456373e-06, "loss": 0.2262, "step": 6270 }, { "epoch": 4.082628497072219, "grad_norm": 1.8111881017684937, "learning_rate": 4.2563742789139635e-06, "loss": 0.2368, "step": 6275 }, { "epoch": 4.0858815875081325, "grad_norm": 2.6787023544311523, "learning_rate": 4.227609089039361e-06, "loss": 0.2299, "step": 6280 }, { "epoch": 4.089134677944047, "grad_norm": 1.6558953523635864, "learning_rate": 4.198932448453832e-06, "loss": 0.2502, "step": 6285 }, { "epoch": 4.092387768379961, "grad_norm": 2.19753360748291, "learning_rate": 4.170344479401203e-06, "loss": 0.2229, "step": 6290 }, { "epoch": 4.095640858815875, "grad_norm": 1.825623631477356, "learning_rate": 4.14184530374728e-06, "loss": 0.2334, "step": 6295 }, { "epoch": 4.0988939492517895, "grad_norm": 5.066878795623779, "learning_rate": 4.113435042979357e-06, "loss": 0.2323, "step": 6300 }, { "epoch": 4.102147039687703, "grad_norm": 1.8815892934799194, "learning_rate": 4.085113818205724e-06, "loss": 0.2439, "step": 6305 }, { "epoch": 4.105400130123617, "grad_norm": 1.726470708847046, "learning_rate": 4.056881750155095e-06, "loss": 0.2469, "step": 6310 }, { "epoch": 4.108653220559532, "grad_norm": 2.11625075340271, "learning_rate": 4.028738959176143e-06, "loss": 0.234, "step": 6315 }, { "epoch": 4.111906310995446, "grad_norm": 1.8043195009231567, "learning_rate": 4.000685565236953e-06, "loss": 0.235, "step": 6320 }, { "epoch": 4.1151594014313595, "grad_norm": 1.5731197595596313, "learning_rate": 3.972721687924546e-06, "loss": 0.2369, "step": 6325 }, { "epoch": 4.118412491867274, "grad_norm": 2.3870668411254883, "learning_rate": 3.94484744644433e-06, "loss": 0.2354, "step": 6330 }, { "epoch": 4.121665582303188, "grad_norm": 2.9168872833251953, "learning_rate": 3.917062959619611e-06, "loss": 0.2391, "step": 6335 }, { "epoch": 4.124918672739102, "grad_norm": 2.253788709640503, "learning_rate": 3.889368345891101e-06, "loss": 0.2414, "step": 6340 }, { "epoch": 4.1281717631750166, "grad_norm": 1.9010400772094727, "learning_rate": 3.86176372331638e-06, "loss": 0.2385, "step": 6345 }, { "epoch": 4.13142485361093, "grad_norm": 2.5451037883758545, "learning_rate": 3.834249209569415e-06, "loss": 0.2434, "step": 6350 }, { "epoch": 4.134677944046844, "grad_norm": 1.9200527667999268, "learning_rate": 3.806824921940069e-06, "loss": 0.2327, "step": 6355 }, { "epoch": 4.137931034482759, "grad_norm": 1.741262435913086, "learning_rate": 3.7794909773335664e-06, "loss": 0.2218, "step": 6360 }, { "epoch": 4.141184124918673, "grad_norm": 2.2827839851379395, "learning_rate": 
3.752247492270017e-06, "loss": 0.238, "step": 6365 }, { "epoch": 4.1444372153545865, "grad_norm": 2.077956199645996, "learning_rate": 3.7250945828839286e-06, "loss": 0.2371, "step": 6370 }, { "epoch": 4.147690305790501, "grad_norm": 1.9408165216445923, "learning_rate": 3.6980323649236925e-06, "loss": 0.2241, "step": 6375 }, { "epoch": 4.150943396226415, "grad_norm": 2.407376766204834, "learning_rate": 3.671060953751085e-06, "loss": 0.2444, "step": 6380 }, { "epoch": 4.154196486662329, "grad_norm": 1.6587666273117065, "learning_rate": 3.6441804643408156e-06, "loss": 0.2388, "step": 6385 }, { "epoch": 4.157449577098244, "grad_norm": 1.826478362083435, "learning_rate": 3.617391011279986e-06, "loss": 0.2361, "step": 6390 }, { "epoch": 4.160702667534157, "grad_norm": 1.628882646560669, "learning_rate": 3.590692708767626e-06, "loss": 0.2246, "step": 6395 }, { "epoch": 4.163955757970071, "grad_norm": 1.8005129098892212, "learning_rate": 3.5640856706142283e-06, "loss": 0.2435, "step": 6400 }, { "epoch": 4.167208848405986, "grad_norm": 1.8075467348098755, "learning_rate": 3.5375700102412118e-06, "loss": 0.2283, "step": 6405 }, { "epoch": 4.1704619388419, "grad_norm": 1.7112232446670532, "learning_rate": 3.51114584068048e-06, "loss": 0.2428, "step": 6410 }, { "epoch": 4.173715029277814, "grad_norm": 3.4661080837249756, "learning_rate": 3.484813274573931e-06, "loss": 0.2221, "step": 6415 }, { "epoch": 4.176968119713728, "grad_norm": 1.970160961151123, "learning_rate": 3.458572424172962e-06, "loss": 0.2409, "step": 6420 }, { "epoch": 4.180221210149642, "grad_norm": 2.032346248626709, "learning_rate": 3.432423401338014e-06, "loss": 0.2259, "step": 6425 }, { "epoch": 4.183474300585556, "grad_norm": 2.329317808151245, "learning_rate": 3.4063663175380622e-06, "loss": 0.2277, "step": 6430 }, { "epoch": 4.186727391021471, "grad_norm": 2.001601457595825, "learning_rate": 3.3804012838501877e-06, "loss": 0.2392, "step": 6435 }, { "epoch": 4.189980481457384, "grad_norm": 2.017449140548706, "learning_rate": 3.354528410959054e-06, "loss": 0.2223, "step": 6440 }, { "epoch": 4.193233571893298, "grad_norm": 2.0057036876678467, "learning_rate": 3.3287478091564628e-06, "loss": 0.2282, "step": 6445 }, { "epoch": 4.196486662329213, "grad_norm": 1.7296228408813477, "learning_rate": 3.3030595883408953e-06, "loss": 0.2234, "step": 6450 }, { "epoch": 4.199739752765127, "grad_norm": 1.9821724891662598, "learning_rate": 3.2774638580170075e-06, "loss": 0.2491, "step": 6455 }, { "epoch": 4.202992843201041, "grad_norm": 1.769161581993103, "learning_rate": 3.2519607272951862e-06, "loss": 0.2342, "step": 6460 }, { "epoch": 4.206245933636955, "grad_norm": 1.7646567821502686, "learning_rate": 3.226550304891099e-06, "loss": 0.231, "step": 6465 }, { "epoch": 4.209499024072869, "grad_norm": 1.8771380186080933, "learning_rate": 3.201232699125198e-06, "loss": 0.2246, "step": 6470 }, { "epoch": 4.212752114508783, "grad_norm": 7.158396244049072, "learning_rate": 3.1760080179222663e-06, "loss": 0.2281, "step": 6475 }, { "epoch": 4.216005204944698, "grad_norm": 2.6318135261535645, "learning_rate": 3.15087636881099e-06, "loss": 0.2326, "step": 6480 }, { "epoch": 4.2192582953806115, "grad_norm": 1.8173600435256958, "learning_rate": 3.125837858923453e-06, "loss": 0.2236, "step": 6485 }, { "epoch": 4.222511385816525, "grad_norm": 1.9394645690917969, "learning_rate": 3.100892594994706e-06, "loss": 0.2356, "step": 6490 }, { "epoch": 4.22576447625244, "grad_norm": 2.29449725151062, "learning_rate": 3.076040683362308e-06, "loss": 0.2336, "step": 
6495 }, { "epoch": 4.229017566688354, "grad_norm": 1.8967347145080566, "learning_rate": 3.0512822299658824e-06, "loss": 0.2251, "step": 6500 }, { "epoch": 4.232270657124268, "grad_norm": 1.6016850471496582, "learning_rate": 3.0266173403466438e-06, "loss": 0.2119, "step": 6505 }, { "epoch": 4.235523747560182, "grad_norm": 1.7070870399475098, "learning_rate": 3.002046119646959e-06, "loss": 0.2228, "step": 6510 }, { "epoch": 4.238776837996096, "grad_norm": 2.9714415073394775, "learning_rate": 2.977568672609915e-06, "loss": 0.2317, "step": 6515 }, { "epoch": 4.24202992843201, "grad_norm": 1.7705700397491455, "learning_rate": 2.95318510357884e-06, "loss": 0.2135, "step": 6520 }, { "epoch": 4.245283018867925, "grad_norm": 1.5718425512313843, "learning_rate": 2.9288955164968766e-06, "loss": 0.2309, "step": 6525 }, { "epoch": 4.2485361093038385, "grad_norm": 6.268550872802734, "learning_rate": 2.904700014906553e-06, "loss": 0.2259, "step": 6530 }, { "epoch": 4.251789199739752, "grad_norm": 10.964041709899902, "learning_rate": 2.8805987019493137e-06, "loss": 0.2255, "step": 6535 }, { "epoch": 4.255042290175667, "grad_norm": 6.053678512573242, "learning_rate": 2.8565916803650866e-06, "loss": 0.2171, "step": 6540 }, { "epoch": 4.258295380611581, "grad_norm": 1.9141877889633179, "learning_rate": 2.8326790524918765e-06, "loss": 0.2229, "step": 6545 }, { "epoch": 4.261548471047496, "grad_norm": 1.7668460607528687, "learning_rate": 2.8088609202652742e-06, "loss": 0.2281, "step": 6550 }, { "epoch": 4.264801561483409, "grad_norm": 4.9379048347473145, "learning_rate": 2.7851373852180617e-06, "loss": 0.228, "step": 6555 }, { "epoch": 4.268054651919323, "grad_norm": 1.8191304206848145, "learning_rate": 2.761508548479777e-06, "loss": 0.2223, "step": 6560 }, { "epoch": 4.271307742355237, "grad_norm": 3.2812600135803223, "learning_rate": 2.7379745107762726e-06, "loss": 0.2254, "step": 6565 }, { "epoch": 4.274560832791152, "grad_norm": 1.9550873041152954, "learning_rate": 2.7145353724292776e-06, "loss": 0.2245, "step": 6570 }, { "epoch": 4.2778139232270656, "grad_norm": 1.6830195188522339, "learning_rate": 2.691191233355986e-06, "loss": 0.2343, "step": 6575 }, { "epoch": 4.28106701366298, "grad_norm": 1.9676400423049927, "learning_rate": 2.6679421930686317e-06, "loss": 0.2418, "step": 6580 }, { "epoch": 4.284320104098894, "grad_norm": 2.293408155441284, "learning_rate": 2.644788350674049e-06, "loss": 0.2352, "step": 6585 }, { "epoch": 4.287573194534808, "grad_norm": 2.1475830078125, "learning_rate": 2.6217298048732604e-06, "loss": 0.2263, "step": 6590 }, { "epoch": 4.290826284970722, "grad_norm": 1.7658709287643433, "learning_rate": 2.598766653961068e-06, "loss": 0.2369, "step": 6595 }, { "epoch": 4.294079375406636, "grad_norm": 1.8443782329559326, "learning_rate": 2.5758989958256043e-06, "loss": 0.2362, "step": 6600 }, { "epoch": 4.29733246584255, "grad_norm": 1.6136929988861084, "learning_rate": 2.5531269279479325e-06, "loss": 0.2475, "step": 6605 }, { "epoch": 4.300585556278465, "grad_norm": 1.7001075744628906, "learning_rate": 2.530450547401647e-06, "loss": 0.2257, "step": 6610 }, { "epoch": 4.303838646714379, "grad_norm": 1.7640526294708252, "learning_rate": 2.5078699508524288e-06, "loss": 0.2359, "step": 6615 }, { "epoch": 4.307091737150293, "grad_norm": 1.509757161140442, "learning_rate": 2.485385234557641e-06, "loss": 0.2163, "step": 6620 }, { "epoch": 4.310344827586207, "grad_norm": 1.9168065786361694, "learning_rate": 2.462996494365949e-06, "loss": 0.2209, "step": 6625 }, { "epoch": 4.313597918022121, 
"grad_norm": 1.701675295829773, "learning_rate": 2.440703825716867e-06, "loss": 0.2263, "step": 6630 }, { "epoch": 4.316851008458035, "grad_norm": 3.188270330429077, "learning_rate": 2.4185073236403707e-06, "loss": 0.2286, "step": 6635 }, { "epoch": 4.32010409889395, "grad_norm": 2.1177477836608887, "learning_rate": 2.396407082756513e-06, "loss": 0.2153, "step": 6640 }, { "epoch": 4.3233571893298635, "grad_norm": 1.7593662738800049, "learning_rate": 2.3744031972749826e-06, "loss": 0.2378, "step": 6645 }, { "epoch": 4.326610279765777, "grad_norm": 1.7856861352920532, "learning_rate": 2.352495760994733e-06, "loss": 0.2222, "step": 6650 }, { "epoch": 4.329863370201692, "grad_norm": 2.1246399879455566, "learning_rate": 2.3306848673035536e-06, "loss": 0.2222, "step": 6655 }, { "epoch": 4.333116460637606, "grad_norm": 1.8715474605560303, "learning_rate": 2.308970609177713e-06, "loss": 0.2283, "step": 6660 }, { "epoch": 4.33636955107352, "grad_norm": 1.6002229452133179, "learning_rate": 2.28735307918152e-06, "loss": 0.2311, "step": 6665 }, { "epoch": 4.339622641509434, "grad_norm": 1.71176016330719, "learning_rate": 2.2658323694669498e-06, "loss": 0.2261, "step": 6670 }, { "epoch": 4.342875731945348, "grad_norm": 4.760283946990967, "learning_rate": 2.24440857177326e-06, "loss": 0.2174, "step": 6675 }, { "epoch": 4.346128822381262, "grad_norm": 1.7307775020599365, "learning_rate": 2.2230817774265724e-06, "loss": 0.2235, "step": 6680 }, { "epoch": 4.349381912817177, "grad_norm": 2.089346170425415, "learning_rate": 2.201852077339506e-06, "loss": 0.2387, "step": 6685 }, { "epoch": 4.3526350032530905, "grad_norm": 1.815763235092163, "learning_rate": 2.1807195620107914e-06, "loss": 0.2406, "step": 6690 }, { "epoch": 4.355888093689004, "grad_norm": 1.9279545545578003, "learning_rate": 2.15968432152486e-06, "loss": 0.2385, "step": 6695 }, { "epoch": 4.359141184124919, "grad_norm": 1.6756705045700073, "learning_rate": 2.1387464455514928e-06, "loss": 0.2232, "step": 6700 }, { "epoch": 4.362394274560833, "grad_norm": 1.606360912322998, "learning_rate": 2.117906023345406e-06, "loss": 0.2275, "step": 6705 }, { "epoch": 4.365647364996747, "grad_norm": 1.7262459993362427, "learning_rate": 2.097163143745909e-06, "loss": 0.2232, "step": 6710 }, { "epoch": 4.368900455432661, "grad_norm": 1.5949680805206299, "learning_rate": 2.0765178951764774e-06, "loss": 0.2306, "step": 6715 }, { "epoch": 4.372153545868575, "grad_norm": 1.6405928134918213, "learning_rate": 2.0559703656444107e-06, "loss": 0.2286, "step": 6720 }, { "epoch": 4.375406636304489, "grad_norm": 1.8739897012710571, "learning_rate": 2.0355206427404626e-06, "loss": 0.2128, "step": 6725 }, { "epoch": 4.378659726740404, "grad_norm": 1.7278401851654053, "learning_rate": 2.015168813638435e-06, "loss": 0.2217, "step": 6730 }, { "epoch": 4.3819128171763175, "grad_norm": 1.776465654373169, "learning_rate": 1.9949149650948267e-06, "loss": 0.2492, "step": 6735 }, { "epoch": 4.385165907612231, "grad_norm": 3.8171842098236084, "learning_rate": 1.974759183448477e-06, "loss": 0.2262, "step": 6740 }, { "epoch": 4.388418998048146, "grad_norm": 1.7197240591049194, "learning_rate": 1.954701554620164e-06, "loss": 0.2287, "step": 6745 }, { "epoch": 4.39167208848406, "grad_norm": 3.6815900802612305, "learning_rate": 1.9347421641122576e-06, "loss": 0.2285, "step": 6750 }, { "epoch": 4.394925178919974, "grad_norm": 2.5846571922302246, "learning_rate": 1.9148810970083725e-06, "loss": 0.2228, "step": 6755 }, { "epoch": 4.398178269355888, "grad_norm": 1.7526190280914307, 
"learning_rate": 1.8951184379729674e-06, "loss": 0.2268, "step": 6760 }, { "epoch": 4.401431359791802, "grad_norm": 3.6665701866149902, "learning_rate": 1.8754542712510065e-06, "loss": 0.2295, "step": 6765 }, { "epoch": 4.404684450227716, "grad_norm": 2.0210907459259033, "learning_rate": 1.8558886806676112e-06, "loss": 0.2212, "step": 6770 }, { "epoch": 4.407937540663631, "grad_norm": 1.6685539484024048, "learning_rate": 1.8364217496276731e-06, "loss": 0.2336, "step": 6775 }, { "epoch": 4.411190631099545, "grad_norm": 1.7345695495605469, "learning_rate": 1.8170535611155143e-06, "loss": 0.2349, "step": 6780 }, { "epoch": 4.414443721535458, "grad_norm": 1.5429155826568604, "learning_rate": 1.797784197694552e-06, "loss": 0.2206, "step": 6785 }, { "epoch": 4.417696811971373, "grad_norm": 1.8358302116394043, "learning_rate": 1.7786137415069126e-06, "loss": 0.225, "step": 6790 }, { "epoch": 4.420949902407287, "grad_norm": 1.858405590057373, "learning_rate": 1.7595422742730905e-06, "loss": 0.2049, "step": 6795 }, { "epoch": 4.424202992843201, "grad_norm": 1.6918565034866333, "learning_rate": 1.7405698772916313e-06, "loss": 0.2224, "step": 6800 }, { "epoch": 4.427456083279115, "grad_norm": 1.7888590097427368, "learning_rate": 1.7216966314387378e-06, "loss": 0.2359, "step": 6805 }, { "epoch": 4.430709173715029, "grad_norm": 4.2224273681640625, "learning_rate": 1.7029226171679542e-06, "loss": 0.2278, "step": 6810 }, { "epoch": 4.433962264150943, "grad_norm": 3.129756450653076, "learning_rate": 1.684247914509826e-06, "loss": 0.2317, "step": 6815 }, { "epoch": 4.437215354586858, "grad_norm": 1.7897382974624634, "learning_rate": 1.6656726030715358e-06, "loss": 0.2305, "step": 6820 }, { "epoch": 4.440468445022772, "grad_norm": 2.1024298667907715, "learning_rate": 1.6471967620365846e-06, "loss": 0.2433, "step": 6825 }, { "epoch": 4.443721535458685, "grad_norm": 1.8943312168121338, "learning_rate": 1.6288204701644382e-06, "loss": 0.2301, "step": 6830 }, { "epoch": 4.4469746258946, "grad_norm": 1.8938087224960327, "learning_rate": 1.6105438057902295e-06, "loss": 0.2245, "step": 6835 }, { "epoch": 4.450227716330514, "grad_norm": 1.9831109046936035, "learning_rate": 1.592366846824364e-06, "loss": 0.2471, "step": 6840 }, { "epoch": 4.453480806766428, "grad_norm": 4.050066947937012, "learning_rate": 1.5742896707522242e-06, "loss": 0.2437, "step": 6845 }, { "epoch": 4.4567338972023425, "grad_norm": 1.9161465167999268, "learning_rate": 1.5563123546338572e-06, "loss": 0.2232, "step": 6850 }, { "epoch": 4.459986987638256, "grad_norm": 2.1633474826812744, "learning_rate": 1.5384349751035948e-06, "loss": 0.2387, "step": 6855 }, { "epoch": 4.46324007807417, "grad_norm": 1.729988694190979, "learning_rate": 1.5206576083697687e-06, "loss": 0.2387, "step": 6860 }, { "epoch": 4.466493168510085, "grad_norm": 3.217106342315674, "learning_rate": 1.502980330214379e-06, "loss": 0.222, "step": 6865 }, { "epoch": 4.469746258945999, "grad_norm": 3.8715853691101074, "learning_rate": 1.4854032159927562e-06, "loss": 0.2256, "step": 6870 }, { "epoch": 4.4729993493819125, "grad_norm": 1.8818020820617676, "learning_rate": 1.4679263406332467e-06, "loss": 0.2282, "step": 6875 }, { "epoch": 4.476252439817827, "grad_norm": 1.7817474603652954, "learning_rate": 1.450549778636895e-06, "loss": 0.2336, "step": 6880 }, { "epoch": 4.479505530253741, "grad_norm": 1.9050745964050293, "learning_rate": 1.43327360407714e-06, "loss": 0.2239, "step": 6885 }, { "epoch": 4.482758620689655, "grad_norm": 1.8388644456863403, "learning_rate": 
1.416097890599466e-06, "loss": 0.2231, "step": 6890 }, { "epoch": 4.4860117111255695, "grad_norm": 1.8944287300109863, "learning_rate": 1.3990227114211191e-06, "loss": 0.2252, "step": 6895 }, { "epoch": 4.489264801561483, "grad_norm": 1.994055986404419, "learning_rate": 1.3820481393307855e-06, "loss": 0.2351, "step": 6900 }, { "epoch": 4.492517891997397, "grad_norm": 1.8318955898284912, "learning_rate": 1.365174246688275e-06, "loss": 0.2328, "step": 6905 }, { "epoch": 4.495770982433312, "grad_norm": 1.8265342712402344, "learning_rate": 1.3484011054242157e-06, "loss": 0.2202, "step": 6910 }, { "epoch": 4.499024072869226, "grad_norm": 2.5501973628997803, "learning_rate": 1.3317287870397572e-06, "loss": 0.2283, "step": 6915 }, { "epoch": 4.5022771633051395, "grad_norm": 2.1136600971221924, "learning_rate": 1.3151573626062535e-06, "loss": 0.2362, "step": 6920 }, { "epoch": 4.505530253741054, "grad_norm": 2.1394035816192627, "learning_rate": 1.298686902764959e-06, "loss": 0.2262, "step": 6925 }, { "epoch": 4.508783344176968, "grad_norm": 2.5853805541992188, "learning_rate": 1.2823174777267439e-06, "loss": 0.2235, "step": 6930 }, { "epoch": 4.512036434612883, "grad_norm": 11.947797775268555, "learning_rate": 1.266049157271773e-06, "loss": 0.2135, "step": 6935 }, { "epoch": 4.5152895250487965, "grad_norm": 1.589674472808838, "learning_rate": 1.2498820107492204e-06, "loss": 0.232, "step": 6940 }, { "epoch": 4.51854261548471, "grad_norm": 2.020092248916626, "learning_rate": 1.233816107076985e-06, "loss": 0.2373, "step": 6945 }, { "epoch": 4.521795705920624, "grad_norm": 1.9216246604919434, "learning_rate": 1.2178515147413665e-06, "loss": 0.2275, "step": 6950 }, { "epoch": 4.525048796356539, "grad_norm": 2.0308332443237305, "learning_rate": 1.2019883017967943e-06, "loss": 0.225, "step": 6955 }, { "epoch": 4.528301886792453, "grad_norm": 1.7747068405151367, "learning_rate": 1.1862265358655505e-06, "loss": 0.2252, "step": 6960 }, { "epoch": 4.531554977228367, "grad_norm": 3.725299119949341, "learning_rate": 1.170566284137442e-06, "loss": 0.2343, "step": 6965 }, { "epoch": 4.534808067664281, "grad_norm": 2.894679069519043, "learning_rate": 1.1550076133695604e-06, "loss": 0.2436, "step": 6970 }, { "epoch": 4.538061158100195, "grad_norm": 1.8062316179275513, "learning_rate": 1.1395505898859487e-06, "loss": 0.237, "step": 6975 }, { "epoch": 4.541314248536109, "grad_norm": 1.745668888092041, "learning_rate": 1.1241952795773697e-06, "loss": 0.2324, "step": 6980 }, { "epoch": 4.544567338972024, "grad_norm": 1.908809781074524, "learning_rate": 1.108941747900985e-06, "loss": 0.2356, "step": 6985 }, { "epoch": 4.547820429407937, "grad_norm": 6.190516948699951, "learning_rate": 1.0937900598800872e-06, "loss": 0.23, "step": 6990 }, { "epoch": 4.551073519843852, "grad_norm": 2.0687854290008545, "learning_rate": 1.0787402801038405e-06, "loss": 0.2376, "step": 6995 }, { "epoch": 4.554326610279766, "grad_norm": 2.2060608863830566, "learning_rate": 1.0637924727269822e-06, "loss": 0.2179, "step": 7000 }, { "epoch": 4.55757970071568, "grad_norm": 1.8114817142486572, "learning_rate": 1.0489467014695526e-06, "loss": 0.2445, "step": 7005 }, { "epoch": 4.560832791151594, "grad_norm": 1.8267011642456055, "learning_rate": 1.0342030296166428e-06, "loss": 0.2249, "step": 7010 }, { "epoch": 4.564085881587508, "grad_norm": 1.8274999856948853, "learning_rate": 1.0195615200180974e-06, "loss": 0.2198, "step": 7015 }, { "epoch": 4.567338972023422, "grad_norm": 4.727597713470459, "learning_rate": 1.0050222350882682e-06, "loss": 0.23, 
"step": 7020 }, { "epoch": 4.570592062459337, "grad_norm": 17.39756965637207, "learning_rate": 9.905852368057383e-07, "loss": 0.2217, "step": 7025 }, { "epoch": 4.573845152895251, "grad_norm": 1.8855489492416382, "learning_rate": 9.762505867130594e-07, "loss": 0.2233, "step": 7030 }, { "epoch": 4.577098243331164, "grad_norm": 1.946487545967102, "learning_rate": 9.620183459164878e-07, "loss": 0.2398, "step": 7035 }, { "epoch": 4.580351333767078, "grad_norm": 5.050466060638428, "learning_rate": 9.478885750857285e-07, "loss": 0.2437, "step": 7040 }, { "epoch": 4.583604424202993, "grad_norm": 2.065145254135132, "learning_rate": 9.338613344536701e-07, "loss": 0.235, "step": 7045 }, { "epoch": 4.586857514638907, "grad_norm": 1.8699109554290771, "learning_rate": 9.199366838161389e-07, "loss": 0.2314, "step": 7050 }, { "epoch": 4.5901106050748215, "grad_norm": 1.8447792530059814, "learning_rate": 9.06114682531628e-07, "loss": 0.2261, "step": 7055 }, { "epoch": 4.593363695510735, "grad_norm": 7.244560241699219, "learning_rate": 8.923953895210612e-07, "loss": 0.2094, "step": 7060 }, { "epoch": 4.596616785946649, "grad_norm": 1.6492129564285278, "learning_rate": 8.787788632675293e-07, "loss": 0.2185, "step": 7065 }, { "epoch": 4.599869876382563, "grad_norm": 1.8190096616744995, "learning_rate": 8.652651618160424e-07, "loss": 0.2368, "step": 7070 }, { "epoch": 4.603122966818478, "grad_norm": 1.6560496091842651, "learning_rate": 8.51854342773295e-07, "loss": 0.2297, "step": 7075 }, { "epoch": 4.6063760572543915, "grad_norm": 1.6347981691360474, "learning_rate": 8.385464633074019e-07, "loss": 0.2228, "step": 7080 }, { "epoch": 4.609629147690306, "grad_norm": 2.087787628173828, "learning_rate": 8.25341580147665e-07, "loss": 0.2277, "step": 7085 }, { "epoch": 4.61288223812622, "grad_norm": 1.9914604425430298, "learning_rate": 8.122397495843343e-07, "loss": 0.2332, "step": 7090 }, { "epoch": 4.616135328562134, "grad_norm": 2.1197216510772705, "learning_rate": 7.992410274683615e-07, "loss": 0.2218, "step": 7095 }, { "epoch": 4.6193884189980485, "grad_norm": 3.4777309894561768, "learning_rate": 7.863454692111583e-07, "loss": 0.224, "step": 7100 }, { "epoch": 4.622641509433962, "grad_norm": 1.6859432458877563, "learning_rate": 7.735531297843713e-07, "loss": 0.2434, "step": 7105 }, { "epoch": 4.625894599869876, "grad_norm": 3.000272035598755, "learning_rate": 7.60864063719649e-07, "loss": 0.2296, "step": 7110 }, { "epoch": 4.629147690305791, "grad_norm": 1.8207471370697021, "learning_rate": 7.482783251083869e-07, "loss": 0.2252, "step": 7115 }, { "epoch": 4.632400780741705, "grad_norm": 2.1532914638519287, "learning_rate": 7.357959676015214e-07, "loss": 0.2275, "step": 7120 }, { "epoch": 4.6356538711776185, "grad_norm": 1.644968867301941, "learning_rate": 7.234170444092942e-07, "loss": 0.2384, "step": 7125 }, { "epoch": 4.638906961613533, "grad_norm": 1.7306429147720337, "learning_rate": 7.11141608301022e-07, "loss": 0.2202, "step": 7130 }, { "epoch": 4.642160052049447, "grad_norm": 2.0186638832092285, "learning_rate": 6.989697116048633e-07, "loss": 0.2242, "step": 7135 }, { "epoch": 4.645413142485361, "grad_norm": 1.7209802865982056, "learning_rate": 6.86901406207624e-07, "loss": 0.2152, "step": 7140 }, { "epoch": 4.648666232921276, "grad_norm": 1.6327604055404663, "learning_rate": 6.749367435545024e-07, "loss": 0.2384, "step": 7145 }, { "epoch": 4.651919323357189, "grad_norm": 2.552006721496582, "learning_rate": 6.630757746488886e-07, "loss": 0.2287, "step": 7150 }, { "epoch": 4.655172413793103, "grad_norm": 
1.8012171983718872, "learning_rate": 6.513185500521463e-07, "loss": 0.2148, "step": 7155 }, { "epoch": 4.658425504229018, "grad_norm": 1.9249675273895264, "learning_rate": 6.396651198833897e-07, "loss": 0.2249, "step": 7160 }, { "epoch": 4.661678594664932, "grad_norm": 1.640886664390564, "learning_rate": 6.281155338192762e-07, "loss": 0.2167, "step": 7165 }, { "epoch": 4.6649316851008455, "grad_norm": 1.9455459117889404, "learning_rate": 6.166698410937949e-07, "loss": 0.2264, "step": 7170 }, { "epoch": 4.66818477553676, "grad_norm": 2.079929828643799, "learning_rate": 6.053280904980557e-07, "loss": 0.2249, "step": 7175 }, { "epoch": 4.671437865972674, "grad_norm": 1.5457408428192139, "learning_rate": 5.940903303800705e-07, "loss": 0.218, "step": 7180 }, { "epoch": 4.674690956408588, "grad_norm": 2.4226558208465576, "learning_rate": 5.829566086445721e-07, "loss": 0.2229, "step": 7185 }, { "epoch": 4.677944046844503, "grad_norm": 2.8459372520446777, "learning_rate": 5.719269727527843e-07, "loss": 0.2206, "step": 7190 }, { "epoch": 4.681197137280416, "grad_norm": 1.8545024394989014, "learning_rate": 5.610014697222249e-07, "loss": 0.2157, "step": 7195 }, { "epoch": 4.68445022771633, "grad_norm": 2.315584421157837, "learning_rate": 5.501801461265304e-07, "loss": 0.2179, "step": 7200 }, { "epoch": 4.687703318152245, "grad_norm": 1.7075843811035156, "learning_rate": 5.394630480952178e-07, "loss": 0.2147, "step": 7205 }, { "epoch": 4.690956408588159, "grad_norm": 1.7120945453643799, "learning_rate": 5.288502213135149e-07, "loss": 0.2329, "step": 7210 }, { "epoch": 4.694209499024073, "grad_norm": 1.6173934936523438, "learning_rate": 5.183417110221606e-07, "loss": 0.2338, "step": 7215 }, { "epoch": 4.697462589459987, "grad_norm": 1.8131818771362305, "learning_rate": 5.07937562017205e-07, "loss": 0.231, "step": 7220 }, { "epoch": 4.700715679895901, "grad_norm": 2.225557327270508, "learning_rate": 4.976378186498293e-07, "loss": 0.2303, "step": 7225 }, { "epoch": 4.703968770331815, "grad_norm": 1.9588440656661987, "learning_rate": 4.874425248261428e-07, "loss": 0.2254, "step": 7230 }, { "epoch": 4.70722186076773, "grad_norm": 1.5929067134857178, "learning_rate": 4.773517240070108e-07, "loss": 0.2177, "step": 7235 }, { "epoch": 4.7104749512036435, "grad_norm": 1.794873833656311, "learning_rate": 4.67365459207858e-07, "loss": 0.2268, "step": 7240 }, { "epoch": 4.713728041639557, "grad_norm": 2.0999152660369873, "learning_rate": 4.5748377299849045e-07, "loss": 0.2201, "step": 7245 }, { "epoch": 4.716981132075472, "grad_norm": 1.7107945680618286, "learning_rate": 4.477067075029123e-07, "loss": 0.2272, "step": 7250 }, { "epoch": 4.720234222511386, "grad_norm": 1.8921961784362793, "learning_rate": 4.3803430439915137e-07, "loss": 0.2176, "step": 7255 }, { "epoch": 4.7234873129473, "grad_norm": 2.4301459789276123, "learning_rate": 4.284666049190644e-07, "loss": 0.2146, "step": 7260 }, { "epoch": 4.726740403383214, "grad_norm": 1.8662853240966797, "learning_rate": 4.1900364984818754e-07, "loss": 0.2483, "step": 7265 }, { "epoch": 4.729993493819128, "grad_norm": 1.5476515293121338, "learning_rate": 4.0964547952554443e-07, "loss": 0.2265, "step": 7270 }, { "epoch": 4.733246584255042, "grad_norm": 2.1670968532562256, "learning_rate": 4.0039213384347187e-07, "loss": 0.2384, "step": 7275 }, { "epoch": 4.736499674690957, "grad_norm": 1.7863905429840088, "learning_rate": 3.912436522474666e-07, "loss": 0.2287, "step": 7280 }, { "epoch": 4.7397527651268705, "grad_norm": 2.369838237762451, "learning_rate": 
3.822000737360026e-07, "loss": 0.2362, "step": 7285 }, { "epoch": 4.743005855562784, "grad_norm": 2.5364902019500732, "learning_rate": 3.7326143686036706e-07, "loss": 0.2211, "step": 7290 }, { "epoch": 4.746258945998699, "grad_norm": 2.2473394870758057, "learning_rate": 3.644277797244966e-07, "loss": 0.2349, "step": 7295 }, { "epoch": 4.749512036434613, "grad_norm": 2.3497040271759033, "learning_rate": 3.556991399848275e-07, "loss": 0.2254, "step": 7300 }, { "epoch": 4.752765126870527, "grad_norm": 1.744187831878662, "learning_rate": 3.4707555485011533e-07, "loss": 0.2231, "step": 7305 }, { "epoch": 4.756018217306441, "grad_norm": 2.6059043407440186, "learning_rate": 3.385570610812794e-07, "loss": 0.2141, "step": 7310 }, { "epoch": 4.759271307742355, "grad_norm": 2.001283645629883, "learning_rate": 3.3014369499126675e-07, "loss": 0.2219, "step": 7315 }, { "epoch": 4.762524398178269, "grad_norm": 2.1158323287963867, "learning_rate": 3.218354924448719e-07, "loss": 0.237, "step": 7320 }, { "epoch": 4.765777488614184, "grad_norm": 1.978641152381897, "learning_rate": 3.1363248885859506e-07, "loss": 0.2225, "step": 7325 }, { "epoch": 4.7690305790500975, "grad_norm": 2.592278003692627, "learning_rate": 3.055347192004954e-07, "loss": 0.2235, "step": 7330 }, { "epoch": 4.772283669486011, "grad_norm": 3.1187829971313477, "learning_rate": 2.9754221799003503e-07, "loss": 0.2324, "step": 7335 }, { "epoch": 4.775536759921926, "grad_norm": 1.6826578378677368, "learning_rate": 2.8965501929792695e-07, "loss": 0.2309, "step": 7340 }, { "epoch": 4.77878985035784, "grad_norm": 1.6795811653137207, "learning_rate": 2.818731567460098e-07, "loss": 0.2087, "step": 7345 }, { "epoch": 4.782042940793754, "grad_norm": 1.9984930753707886, "learning_rate": 2.741966635070842e-07, "loss": 0.22, "step": 7350 }, { "epoch": 4.785296031229668, "grad_norm": 2.397368907928467, "learning_rate": 2.6662557230477667e-07, "loss": 0.227, "step": 7355 }, { "epoch": 4.788549121665582, "grad_norm": 1.748075008392334, "learning_rate": 2.5915991541340667e-07, "loss": 0.2206, "step": 7360 }, { "epoch": 4.791802212101496, "grad_norm": 2.152040481567383, "learning_rate": 2.5179972465784186e-07, "loss": 0.2214, "step": 7365 }, { "epoch": 4.795055302537411, "grad_norm": 1.829852819442749, "learning_rate": 2.4454503141336513e-07, "loss": 0.213, "step": 7370 }, { "epoch": 4.798308392973325, "grad_norm": 2.7500312328338623, "learning_rate": 2.3739586660554148e-07, "loss": 0.2188, "step": 7375 }, { "epoch": 4.801561483409239, "grad_norm": 2.0260448455810547, "learning_rate": 2.3035226071008997e-07, "loss": 0.2264, "step": 7380 }, { "epoch": 4.804814573845153, "grad_norm": 2.0971879959106445, "learning_rate": 2.2341424375274256e-07, "loss": 0.2382, "step": 7385 }, { "epoch": 4.808067664281067, "grad_norm": 1.875805139541626, "learning_rate": 2.165818453091245e-07, "loss": 0.2181, "step": 7390 }, { "epoch": 4.811320754716981, "grad_norm": 2.7413530349731445, "learning_rate": 2.098550945046268e-07, "loss": 0.2342, "step": 7395 }, { "epoch": 4.814573845152895, "grad_norm": 1.8814730644226074, "learning_rate": 2.0323402001428682e-07, "loss": 0.2251, "step": 7400 }, { "epoch": 4.817826935588809, "grad_norm": 2.818944215774536, "learning_rate": 1.9671865006265223e-07, "loss": 0.2247, "step": 7405 }, { "epoch": 4.821080026024724, "grad_norm": 1.7889087200164795, "learning_rate": 1.9030901242367837e-07, "loss": 0.2401, "step": 7410 }, { "epoch": 4.824333116460638, "grad_norm": 2.2675249576568604, "learning_rate": 1.8400513442059786e-07, "loss": 0.2343, 
"step": 7415 }, { "epoch": 4.827586206896552, "grad_norm": 2.051952362060547, "learning_rate": 1.7780704292580107e-07, "loss": 0.2222, "step": 7420 }, { "epoch": 4.830839297332465, "grad_norm": 1.9150989055633545, "learning_rate": 1.7292475378629936e-07, "loss": 0.2197, "step": 7425 }, { "epoch": 4.83409238776838, "grad_norm": 2.003403902053833, "learning_rate": 1.6691714428535288e-07, "loss": 0.2274, "step": 7430 }, { "epoch": 4.837345478204294, "grad_norm": 1.729634165763855, "learning_rate": 1.6101539413598822e-07, "loss": 0.2313, "step": 7435 }, { "epoch": 4.840598568640209, "grad_norm": 2.2372894287109375, "learning_rate": 1.5521952849639476e-07, "loss": 0.2227, "step": 7440 }, { "epoch": 4.8438516590761225, "grad_norm": 1.8825215101242065, "learning_rate": 1.4952957207339802e-07, "loss": 0.2182, "step": 7445 }, { "epoch": 4.847104749512036, "grad_norm": 1.7914789915084839, "learning_rate": 1.439455491223457e-07, "loss": 0.2104, "step": 7450 }, { "epoch": 4.85035783994795, "grad_norm": 1.8011417388916016, "learning_rate": 1.3846748344701065e-07, "loss": 0.2243, "step": 7455 }, { "epoch": 4.853610930383865, "grad_norm": 1.737289547920227, "learning_rate": 1.3309539839948538e-07, "loss": 0.2272, "step": 7460 }, { "epoch": 4.856864020819779, "grad_norm": 2.1911211013793945, "learning_rate": 1.2782931688008482e-07, "loss": 0.2273, "step": 7465 }, { "epoch": 4.860117111255693, "grad_norm": 6.419521331787109, "learning_rate": 1.2266926133725487e-07, "loss": 0.2332, "step": 7470 }, { "epoch": 4.863370201691607, "grad_norm": 7.40968656539917, "learning_rate": 1.1761525376745575e-07, "loss": 0.2092, "step": 7475 }, { "epoch": 4.866623292127521, "grad_norm": 1.6772918701171875, "learning_rate": 1.1266731571509815e-07, "loss": 0.222, "step": 7480 }, { "epoch": 4.869876382563435, "grad_norm": 1.780863881111145, "learning_rate": 1.0782546827242667e-07, "loss": 0.2239, "step": 7485 }, { "epoch": 4.8731294729993495, "grad_norm": 1.984481930732727, "learning_rate": 1.0308973207944217e-07, "loss": 0.2212, "step": 7490 }, { "epoch": 4.876382563435263, "grad_norm": 1.720576286315918, "learning_rate": 9.846012732380727e-08, "loss": 0.2308, "step": 7495 }, { "epoch": 4.879635653871178, "grad_norm": 8.550189971923828, "learning_rate": 9.483287143148001e-08, "loss": 0.2275, "step": 7500 }, { "epoch": 4.882888744307092, "grad_norm": 2.3628833293914795, "learning_rate": 9.039435269181384e-08, "loss": 0.2137, "step": 7505 }, { "epoch": 4.886141834743006, "grad_norm": 2.029315233230591, "learning_rate": 8.606201950781267e-08, "loss": 0.2229, "step": 7510 }, { "epoch": 4.8893949251789195, "grad_norm": 4.765905857086182, "learning_rate": 8.183589034750639e-08, "loss": 0.2361, "step": 7515 }, { "epoch": 4.892648015614834, "grad_norm": 1.9303253889083862, "learning_rate": 7.771598322618422e-08, "loss": 0.2198, "step": 7520 }, { "epoch": 4.895901106050748, "grad_norm": 2.502507209777832, "learning_rate": 7.370231570633656e-08, "loss": 0.2263, "step": 7525 }, { "epoch": 4.899154196486663, "grad_norm": 2.3058369159698486, "learning_rate": 6.979490489756601e-08, "loss": 0.2224, "step": 7530 }, { "epoch": 4.9024072869225765, "grad_norm": 1.8423314094543457, "learning_rate": 6.599376745652641e-08, "loss": 0.2237, "step": 7535 }, { "epoch": 4.90566037735849, "grad_norm": 1.7330564260482788, "learning_rate": 6.229891958683675e-08, "loss": 0.2248, "step": 7540 }, { "epoch": 4.908913467794405, "grad_norm": 1.9083019495010376, "learning_rate": 5.8710377039031264e-08, "loss": 0.2174, "step": 7545 }, { "epoch": 
4.912166558230319, "grad_norm": 4.153858661651611, "learning_rate": 5.52281551104733e-08, "loss": 0.2279, "step": 7550 }, { "epoch": 4.915419648666233, "grad_norm": 1.6192256212234497, "learning_rate": 5.185226864530546e-08, "loss": 0.2263, "step": 7555 }, { "epoch": 4.918672739102147, "grad_norm": 1.8693500757217407, "learning_rate": 4.8582732034374575e-08, "loss": 0.2234, "step": 7560 }, { "epoch": 4.921925829538061, "grad_norm": 1.9380360841751099, "learning_rate": 4.541955921518182e-08, "loss": 0.2182, "step": 7565 }, { "epoch": 4.925178919973975, "grad_norm": 1.9134066104888916, "learning_rate": 4.236276367180769e-08, "loss": 0.2283, "step": 7570 }, { "epoch": 4.92843201040989, "grad_norm": 1.6409050226211548, "learning_rate": 3.9412358434876003e-08, "loss": 0.2325, "step": 7575 }, { "epoch": 4.931685100845804, "grad_norm": 1.7989153861999512, "learning_rate": 3.6568356081473354e-08, "loss": 0.2357, "step": 7580 }, { "epoch": 4.934938191281717, "grad_norm": 2.0496795177459717, "learning_rate": 3.383076873511859e-08, "loss": 0.2218, "step": 7585 }, { "epoch": 4.938191281717632, "grad_norm": 1.7194660902023315, "learning_rate": 3.119960806569344e-08, "loss": 0.2324, "step": 7590 }, { "epoch": 4.941444372153546, "grad_norm": 2.1285674571990967, "learning_rate": 2.867488528940643e-08, "loss": 0.2339, "step": 7595 }, { "epoch": 4.94469746258946, "grad_norm": 2.114654779434204, "learning_rate": 2.6256611168734568e-08, "loss": 0.2403, "step": 7600 }, { "epoch": 4.9479505530253745, "grad_norm": 7.625702857971191, "learning_rate": 2.3944796012381754e-08, "loss": 0.2255, "step": 7605 }, { "epoch": 4.951203643461288, "grad_norm": 1.6961826086044312, "learning_rate": 2.173944967523711e-08, "loss": 0.2197, "step": 7610 }, { "epoch": 4.954456733897202, "grad_norm": 3.5485808849334717, "learning_rate": 1.9640581558330594e-08, "loss": 0.2309, "step": 7615 }, { "epoch": 4.957709824333117, "grad_norm": 1.928830862045288, "learning_rate": 1.7648200608791353e-08, "loss": 0.2205, "step": 7620 }, { "epoch": 4.960962914769031, "grad_norm": 2.2189247608184814, "learning_rate": 1.5762315319814425e-08, "loss": 0.2159, "step": 7625 }, { "epoch": 4.964216005204944, "grad_norm": 1.728050947189331, "learning_rate": 1.3982933730613545e-08, "loss": 0.223, "step": 7630 }, { "epoch": 4.967469095640859, "grad_norm": 7.718484878540039, "learning_rate": 1.2310063426404506e-08, "loss": 0.2345, "step": 7635 }, { "epoch": 4.970722186076773, "grad_norm": 1.8739633560180664, "learning_rate": 1.0743711538357959e-08, "loss": 0.2354, "step": 7640 }, { "epoch": 4.973975276512687, "grad_norm": 2.505018711090088, "learning_rate": 9.28388474357167e-09, "loss": 0.2341, "step": 7645 }, { "epoch": 4.9772283669486015, "grad_norm": 2.42279314994812, "learning_rate": 7.930589265051081e-09, "loss": 0.2242, "step": 7650 }, { "epoch": 4.980481457384515, "grad_norm": 1.8265007734298706, "learning_rate": 6.683830871667685e-09, "loss": 0.2052, "step": 7655 }, { "epoch": 4.983734547820429, "grad_norm": 1.9440053701400757, "learning_rate": 5.543614878153469e-09, "loss": 0.2132, "step": 7660 }, { "epoch": 4.986987638256344, "grad_norm": 3.599294900894165, "learning_rate": 4.509946145059285e-09, "loss": 0.2236, "step": 7665 }, { "epoch": 4.990240728692258, "grad_norm": 6.0209126472473145, "learning_rate": 3.5828290787437436e-09, "loss": 0.2376, "step": 7670 }, { "epoch": 4.9934938191281715, "grad_norm": 1.537784218788147, "learning_rate": 2.762267631356563e-09, "loss": 0.2264, "step": 7675 }, { "epoch": 4.996746909564086, "grad_norm": 
2.2779605388641357, "learning_rate": 2.0482653008163654e-09, "loss": 0.2452, "step": 7680 }, { "epoch": 5.0, "grad_norm": 3.292484998703003, "learning_rate": 1.440825130796797e-09, "loss": 0.2322, "step": 7685 }, { "epoch": 5.0, "eval_f1": 0.801246098825816, "eval_loss": 0.7490234375, "eval_precision": 0.8011274883507161, "eval_recall": 0.8013850778258188, "eval_runtime": 255.3842, "eval_samples_per_second": 1540.561, "eval_steps_per_second": 1.508, "step": 7685 }, { "epoch": 5.0, "step": 7685, "total_flos": 1.664464427634026e+19, "train_loss": 0.6255086388770993, "train_runtime": 65052.9958, "train_samples_per_second": 241.916, "train_steps_per_second": 0.118 } ], "logging_steps": 5, "max_steps": 7685, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.664464427634026e+19, "train_batch_size": 512, "trial_name": null, "trial_params": null }
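
The state above is the standard `trainer_state.json` written by the Hugging Face `Trainer`: `log_history` holds the per-step training losses and the epoch-level evaluation entries, while the top-level keys record the best metric and checkpoint. As a minimal sketch of how this file can be consumed downstream, the snippet below parses it with the standard library and prints a short summary. The file path and the specific fields printed are illustrative assumptions, not part of the original state file.

```python
# Minimal sketch: summarize a Hugging Face trainer_state.json.
# Assumption: the JSON above is saved at the (hypothetical) path below.
import json

STATE_PATH = "modernBERT-large-sentiment/trainer_state.json"  # hypothetical path

with open(STATE_PATH) as f:
    state = json.load(f)

# Training entries carry a "loss" key; evaluation entries carry "eval_*" keys.
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best_metric           : {state['best_metric']:.4f}")
print(f"best_model_checkpoint : {state['best_model_checkpoint']}")
print(f"final training loss   : {train_logs[-1]['loss']:.4f} "
      f"at step {train_logs[-1]['step']}")

if eval_logs:
    last_eval = eval_logs[-1]
    print(f"final eval_f1         : {last_eval['eval_f1']:.4f}")
    print(f"final eval_loss       : {last_eval['eval_loss']:.4f}")
```

Note that appending prose or code to the JSON file itself would make it unparsable; the sketch is meant to live alongside the state file, not inside it.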